diff --git a/.github/workflows/tinybvh_sdk_ci.yml b/.github/workflows/tinybvh_sdk_ci.yml index 017fc97..1507e44 100644 --- a/.github/workflows/tinybvh_sdk_ci.yml +++ b/.github/workflows/tinybvh_sdk_ci.yml @@ -2,9 +2,9 @@ name: tinybvh CI on: push: - branches: [ main ] + branches: [ main, dev ] pull_request: - branches: [ main ] + branches: [ main, dev ] jobs: build: diff --git a/README.md b/README.md index 0fc5475..43da909 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,6 @@ +# dev +This is the **development branch** for tinybvh. Potentially unstable code, consider yourself warned. + # tinybvh Single-header BVH construction and traversal library written as "Sane C++" (or "C with classes"). The library has no dependencies. @@ -9,6 +12,8 @@ Single-header OpenCL library, which helps you select and initialize a device. It * Vendor and architecture detection and propagation to #defines in OpenCL code * ..And many other things. +![Rendered with tinybvh](images/test.png) + To use tinyocl, just include ````tiny_ocl.h````; this will automatically cause linking with ````OpenCL.lib```` in the 'external' folder, which in turn passes on work to vendor-specific driver code. But all that is not your problem! Note that the ````tiny_bvh.h```` library will work without ````tiny_ocl.h```` and remains dependency-free. The new ````tiny_ocl.h```` is only needed in projects that wish to trace rays _on the GPU_ using BVHs created by ````tiny_bvh.h````. @@ -31,6 +36,7 @@ A constructed BVH can be used to quickly intersect a ray with the geometry, usin The constructed BVH will have a layout suitable for construction ('````WALD_32BYTE````'). Several other layouts for the same data are available, which all serve one or more specific purposes. You can convert between layouts using ````BVH::Convert````. The available layouts are: * ````BVH::WALD_32BYTE```` : A compact format that stores the AABB for a node, along with child pointers and leaf information in a cross-platform-friendly way. 
The 32-byte size allows for cache-line alignment. +* ````BVH::ALT_SOA```` : This format stores bounding box information in a SIMD-friendly format, making the BVH faster to traverse. * ````BVH::WALD_DOUBLE```` : Double-precision version of ````BVH::WALD_32BYTE````. * ````BVH::VERBOSE```` : A format designed for modifying BVHs, e.g. for post-build optimizations using ````BVH::Optimize()````. * ````BVH::AILA_LAINE```` : This format uses 64 bytes per node and stores the AABBs of the two child nodes. This is the format presented in the [2009 Aila & Laine paper](https://research.nvidia.com/sites/default/files/pubs/2009-08_Understanding-the-Efficiency/aila2009hpg_paper.pdf) and recommended for basic GPU ray tracing. @@ -57,6 +63,10 @@ The cross-platform fenster-based single-source **bitmap renderer** can be compil ```g++ -std=c++20 -mavx -O3 -framework Cocoa tiny_bvh_fenster.cpp -o tiny_bvh_fenster``` (on macOS) +The multi-threaded **path tracing** demo can be compiled with + +````g++ -std=c++20 -mavx -mwindows -fopenmp -O3 tiny_bvh_pt.cpp -o tiny_bvh_pt```` (on windows) + The **performance measurement tool** uses OpenMP and can be compiled with: ````g++ -std=c++20 -mavx -Ofast -fopenmp tiny_bvh_speedtest.cpp -o tiny_bvh_speedtest```` diff --git a/external/fenster.h b/external/fenster.h index c823d66..89225ca 100644 --- a/external/fenster.h +++ b/external/fenster.h @@ -199,8 +199,8 @@ static LRESULT CALLBACK fenster_wndproc(HWND hwnd, UINT msg, WPARAM wParam, PAINTSTRUCT ps; HDC hdc = BeginPaint(hwnd, &ps); HDC memdc = CreateCompatibleDC(hdc); - auto hbmp = CreateCompatibleBitmap(hdc, f->width, f->height); - auto oldbmp = SelectObject(memdc, hbmp); + HBITMAP hbmp = CreateCompatibleBitmap(hdc, f->width, f->height); + HBITMAP oldbmp = static_cast<HBITMAP>(SelectObject(memdc, hbmp)); BITMAPINFO bi = {{sizeof(bi), f->width, -f->height, 1, 32, BI_BITFIELDS}}; bi.bmiColors[0].rgbRed = 0xff; bi.bmiColors[1].rgbGreen = 0xff; @@ -249,10 +249,13 @@ FENSTER_API int fenster_open(struct
fenster *f) { wc.hInstance = hInstance; wc.lpszClassName = f->title; RegisterClassEx(&wc); + RECT desiredRect = {0, 0, f->width, f->height}; + AdjustWindowRectEx(&desiredRect, WS_OVERLAPPEDWINDOW, FALSE, WS_EX_CLIENTEDGE); + int adjustedWidth = desiredRect.right - desiredRect.left; + int adjustedHeight = desiredRect.bottom - desiredRect.top; f->hwnd = CreateWindowEx(WS_EX_CLIENTEDGE, f->title, f->title, WS_OVERLAPPEDWINDOW, CW_USEDEFAULT, CW_USEDEFAULT, - f->width, f->height, NULL, NULL, hInstance, NULL); - + adjustedWidth, adjustedHeight, NULL, NULL, hInstance, NULL); if (f->hwnd == NULL) return -1; SetWindowLongPtr(f->hwnd, GWLP_USERDATA, (LONG_PTR)f); @@ -261,7 +264,10 @@ FENSTER_API int fenster_open(struct fenster *f) { return 0; } -FENSTER_API void fenster_close(struct fenster *f) { (void)f; } +FENSTER_API void fenster_close(struct fenster *f) { + PostMessage(f->hwnd, WM_CLOSE, 0, 0); + (void)f; +} FENSTER_API int fenster_loop(struct fenster *f) { MSG msg; diff --git a/images/test.png b/images/test.png new file mode 100644 index 0000000..b5487ad Binary files /dev/null and b/images/test.png differ diff --git a/testdata/armadillo.bin b/testdata/armadillo.bin new file mode 100644 index 0000000..3de5c32 Binary files /dev/null and b/testdata/armadillo.bin differ diff --git a/testdata/bunny.bin b/testdata/bunny.bin new file mode 100644 index 0000000..d9f584a Binary files /dev/null and b/testdata/bunny.bin differ diff --git a/testdata/cryteksponza.bin b/testdata/cryteksponza.bin index 7fa6e98..a96bd61 100644 Binary files a/testdata/cryteksponza.bin and b/testdata/cryteksponza.bin differ diff --git a/testdata/dragon.bin b/testdata/dragon.bin new file mode 100644 index 0000000..3ad2c33 Binary files /dev/null and b/testdata/dragon.bin differ diff --git a/testdata/happybuddha.bin b/testdata/happybuddha.bin index 9adea58..e63ba81 100644 Binary files a/testdata/happybuddha.bin and b/testdata/happybuddha.bin differ diff --git a/testdata/head.bin b/testdata/head.bin new file 
mode 100644 index 0000000..5f228d8 Binary files /dev/null and b/testdata/head.bin differ diff --git a/testdata/legocar.bin b/testdata/legocar.bin new file mode 100644 index 0000000..bf331ca Binary files /dev/null and b/testdata/legocar.bin differ diff --git a/testdata/lucy.bin b/testdata/lucy.bin new file mode 100644 index 0000000..b8c0122 Binary files /dev/null and b/testdata/lucy.bin differ diff --git a/testdata/suzanne.bin b/testdata/suzanne.bin new file mode 100644 index 0000000..e4d8269 Binary files /dev/null and b/testdata/suzanne.bin differ diff --git a/testdata/xyzrgb_dragon.bin b/testdata/xyzrgb_dragon.bin new file mode 100644 index 0000000..8349e43 Binary files /dev/null and b/testdata/xyzrgb_dragon.bin differ diff --git a/tiny_bvh.h b/tiny_bvh.h index 48001b8..be0c95d 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -23,6 +23,8 @@ THE SOFTWARE. */ // 2024 +// Dec 07: version 1.0.6 : Introduction of slices. +// Dec 05: version 1.0.5 : Bug fixes, path tracing example. // Dec 02: version 1.0.1 : Shadow ray validation, double precision. // Nov 28: version 1.0.0 : Validation in speedtest, faster tri test. // Nov 25: version 0.9.8 : FATAL_ERROR_IF interface. @@ -34,7 +36,7 @@ THE SOFTWARE. // Mov 16: version 0.9.0 : (external) OpenCL in speedtest. // Nov 15: version 0.8.3 : Incremental update / bugfixes. // Nov 14: version 0.8.0 : ARM/NEON support. -// Nov 13: version 0.7.5 : Support for WASM with EMSCRIPTEN. +// Nov 13: version 0.7.5 : Support for WASM with EMSCRIPTEN. // Nov 12: version 0.7.0 : CWBVH construction and traversal. // Nov 11: version 0.5.1 : SBVH builder, BVH4_GPU traversal. // Nov 10: version 0.4.2 : BVH4/8, gpu-friendly BVH4. @@ -90,11 +92,15 @@ THE SOFTWARE. // Jefferson Amstutz: CMake surpport // Christian Oliveros: WASM / EMSCRIPTEN support // Thierry Cantenot: user-defined alloc & free +// David Peicho: slices & Rust bindings #ifndef TINY_BVH_H_ #define TINY_BVH_H_ -// binned BVH building: bin count +// Run-time checks; disabled by default. 
+// #define PARANOID + +// Binned BVH building: bin count. #define BVHBINS 8 // SAH BVH building: Heuristic parameters @@ -127,12 +133,12 @@ THE SOFTWARE. // library version #define TINY_BVH_VERSION_MAJOR 1 #define TINY_BVH_VERSION_MINOR 0 -#define TINY_BVH_VERSION_SUB 5 +#define TINY_BVH_VERSION_SUB 6 // ============================================================================ // // P R E L I M I N A R I E S -// +// // ============================================================================ // needful includes @@ -148,6 +154,7 @@ THE SOFTWARE. #include #include #endif +#include // aligned memory allocation // note: formally size needs to be a multiple of 'alignment'. See: @@ -209,7 +216,7 @@ struct ALIGNED( 16 ) bvhvec4 bvhvec4( const float a ) : x( a ), y( a ), z( a ), w( a ) {} bvhvec4( const bvhvec3 & a ); bvhvec4( const bvhvec3 & a, float b ); - float& operator [] ( const int i ) { return cell[i]; } + float& operator [] ( const int32_t i ) { return cell[i]; } union { struct { float x, y, z, w; }; float cell[4]; }; }; @@ -219,7 +226,7 @@ struct ALIGNED( 8 ) bvhvec2 bvhvec2( const float a, const float b ) : x( a ), y( b ) {} bvhvec2( const float a ) : x( a ), y( a ) {} bvhvec2( const bvhvec4 a ) : x( a.x ), y( a.y ) {} - float& operator [] ( const int i ) { return cell[i]; } + float& operator [] ( const int32_t i ) { return cell[i]; } union { struct { float x, y; }; float cell[2]; }; }; @@ -230,34 +237,34 @@ struct bvhvec3 bvhvec3( const float a ) : x( a ), y( a ), z( a ) {} bvhvec3( const bvhvec4 a ) : x( a.x ), y( a.y ), z( a.z ) {} float halfArea() { return x < -BVH_FAR ? 
0 : (x * y + y * z + z * x); } // for SAH calculations - float& operator [] ( const int i ) { return cell[i]; } + float& operator [] ( const int32_t i ) { return cell[i]; } union { struct { float x, y, z; }; float cell[3]; }; }; struct bvhint3 { bvhint3() = default; - bvhint3( const int a, const int b, const int c ) : x( a ), y( b ), z( c ) {} - bvhint3( const int a ) : x( a ), y( a ), z( a ) {} - bvhint3( const bvhvec3& a ) { x = (int)a.x, y = (int)a.y, z = (int)a.z; } - int& operator [] ( const int i ) { return cell[i]; } - union { struct { int x, y, z; }; int cell[3]; }; + bvhint3( const int32_t a, const int32_t b, const int32_t c ) : x( a ), y( b ), z( c ) {} + bvhint3( const int32_t a ) : x( a ), y( a ), z( a ) {} + bvhint3( const bvhvec3& a ) { x = (int32_t)a.x, y = (int32_t)a.y, z = (int32_t)a.z; } + int32_t& operator [] ( const int32_t i ) { return cell[i]; } + union { struct { int32_t x, y, z; }; int32_t cell[3]; }; }; struct bvhint2 { bvhint2() = default; - bvhint2( const int a, const int b ) : x( a ), y( b ) {} - bvhint2( const int a ) : x( a ), y( a ) {} - int x, y; + bvhint2( const int32_t a, const int32_t b ) : x( a ), y( b ) {} + bvhint2( const int32_t a ) : x( a ), y( a ) {} + int32_t x, y; }; struct bvhuint2 { bvhuint2() = default; - bvhuint2( const unsigned a, const unsigned b ) : x( a ), y( b ) {} - bvhuint2( const unsigned a ) : x( a ), y( a ) {} - unsigned x, y; + bvhuint2( const uint32_t a, const uint32_t b ) : x( a ), y( b ) {} + bvhuint2( const uint32_t a ) : x( a ), y( a ) {} + uint32_t x, y; }; #ifdef TINYBVH_IMPLEMENTATION @@ -269,8 +276,26 @@ bvhvec4::bvhvec4( const bvhvec3& a, float b ) { x = a.x; y = a.y; z = a.z; w = b struct bvhaabb { - bvhvec3 minBounds; unsigned dummy1; - bvhvec3 maxBounds; unsigned dummy2; + bvhvec3 minBounds; uint32_t dummy1; + bvhvec3 maxBounds; uint32_t dummy2; +}; + +/** + * Strided slice of @ref bvhvec4 + */ +struct bvhvec4slice +{ + bvhvec4slice() = default; + /** + * @param data Pointer to the first element 
+ * @param count Number of @ref bvhvec4 in the slice, not **bytes** + * @param stride Byte stride between each @ref bvhvec4 element + */ + bvhvec4slice( const bvhvec4* data, uint32_t count, uint32_t stride = sizeof( bvhvec4 ) ); + operator bool() const { return !!data; } + const bvhvec4& operator [] ( size_t i ) const; + const int8_t* data = nullptr; + uint32_t count, stride; }; #ifdef _MSC_VER @@ -287,16 +312,16 @@ static inline float tinybvh_min( const float a, const float b ) { return a < b ? static inline float tinybvh_max( const float a, const float b ) { return a > b ? a : b; } static inline double tinybvh_min( const double a, const double b ) { return a < b ? a : b; } static inline double tinybvh_max( const double a, const double b ) { return a > b ? a : b; } -static inline int tinybvh_min( const int a, const int b ) { return a < b ? a : b; } -static inline int tinybvh_max( const int a, const int b ) { return a > b ? a : b; } -static inline unsigned tinybvh_min( const unsigned a, const unsigned b ) { return a < b ? a : b; } -static inline unsigned tinybvh_max( const unsigned a, const unsigned b ) { return a > b ? a : b; } +static inline int32_t tinybvh_min( const int32_t a, const int32_t b ) { return a < b ? a : b; } +static inline int32_t tinybvh_max( const int32_t a, const int32_t b ) { return a > b ? a : b; } +static inline uint32_t tinybvh_min( const uint32_t a, const uint32_t b ) { return a < b ? a : b; } +static inline uint32_t tinybvh_max( const uint32_t a, const uint32_t b ) { return a > b ? 
a : b; } static inline bvhvec3 tinybvh_min( const bvhvec3& a, const bvhvec3& b ) { return bvhvec3( tinybvh_min( a.x, b.x ), tinybvh_min( a.y, b.y ), tinybvh_min( a.z, b.z ) ); } static inline bvhvec4 tinybvh_min( const bvhvec4& a, const bvhvec4& b ) { return bvhvec4( tinybvh_min( a.x, b.x ), tinybvh_min( a.y, b.y ), tinybvh_min( a.z, b.z ), tinybvh_min( a.w, b.w ) ); } static inline bvhvec3 tinybvh_max( const bvhvec3& a, const bvhvec3& b ) { return bvhvec3( tinybvh_max( a.x, b.x ), tinybvh_max( a.y, b.y ), tinybvh_max( a.z, b.z ) ); } static inline bvhvec4 tinybvh_max( const bvhvec4& a, const bvhvec4& b ) { return bvhvec4( tinybvh_max( a.x, b.x ), tinybvh_max( a.y, b.y ), tinybvh_max( a.z, b.z ), tinybvh_max( a.w, b.w ) ); } static inline float tinybvh_clamp( const float x, const float a, const float b ) { return x < a ? a : (x > b ? b : x); } -static inline int tinybvh_clamp( const int x, const int a, const int b ) { return x < a ? a : (x > b ? b : x); } +static inline int32_t tinybvh_clamp( const int32_t x, const int32_t a, const int32_t b ) { return x < a ? a : (x > b ? b : x); } template <class T> inline static void tinybvh_swap( T& a, T& b ) { T t = a; a = b; b = t; } // Operator overloads. 
@@ -309,6 +334,7 @@ inline bvhvec4 operator-( const bvhvec4& a ) { return bvhvec4( -a.x, -a.y, -a.z, inline bvhvec2 operator+( const bvhvec2& a, const bvhvec2& b ) { return bvhvec2( a.x + b.x, a.y + b.y ); } inline bvhvec3 operator+( const bvhvec3& a, const bvhvec3& b ) { return bvhvec3( a.x + b.x, a.y + b.y, a.z + b.z ); } inline bvhvec4 operator+( const bvhvec4& a, const bvhvec4& b ) { return bvhvec4( a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w ); } +inline bvhvec4 operator+( const bvhvec4& a, const bvhvec3& b ) { return bvhvec4( a.x + b.x, a.y + b.y, a.z + b.z, a.w ); } inline bvhvec2 operator-( const bvhvec2& a, const bvhvec2& b ) { return bvhvec2( a.x - b.x, a.y - b.y ); } inline bvhvec3 operator-( const bvhvec3& a, const bvhvec3& b ) { return bvhvec3( a.x - b.x, a.y - b.y, a.z - b.z ); } inline bvhvec4 operator-( const bvhvec4& a, const bvhvec4& b ) { return bvhvec4( a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w ); } @@ -327,7 +353,7 @@ inline bvhvec4 operator*( float b, const bvhvec4& a ) { return bvhvec4( b * a.x, inline bvhvec2 operator/( float b, const bvhvec2& a ) { return bvhvec2( b / a.x, b / a.y ); } inline bvhvec3 operator/( float b, const bvhvec3& a ) { return bvhvec3( b / a.x, b / a.y, b / a.z ); } inline bvhvec4 operator/( float b, const bvhvec4& a ) { return bvhvec4( b / a.x, b / a.y, b / a.z, b / a.w ); } -inline bvhvec3 operator*=( bvhvec3& a, const float b ) { return bvhvec3( a.x * b, a.y * b, a.z * b ); } +inline void operator*=( bvhvec3& a, const float b ) { a.x *= b; a.y *= b; a.z *= b; } #endif // TINYBVH_USE_CUSTOM_VECTOR_TYPES @@ -360,7 +386,7 @@ struct bvhdbl3 bvhdbl3( const double a ) : x( a ), y( a ), z( a ) {} bvhdbl3( const bvhvec3 a ) : x( (double)a.x ), y( (double)a.y ), z( (double)a.z ) {} double halfArea() { return x < -BVH_FAR ? 
0 : (x * y + y * z + z * x); } // for SAH calculations - double& operator [] ( const int i ) { return cell[i]; } + double& operator [] ( const int32_t i ) { return cell[i]; } union { struct { double x, y, z; }; double cell[3]; }; }; @@ -376,10 +402,10 @@ inline bvhdbl3 operator+( const bvhdbl3& a, const bvhdbl3& b ) { return bvhdbl3( inline bvhdbl3 operator-( const bvhdbl3& a, const bvhdbl3& b ) { return bvhdbl3( a.x - b.x, a.y - b.y, a.z - b.z ); } inline void operator+=( bvhdbl3& a, const bvhdbl3& b ) { a.x += b.x; a.y += b.y; a.z += b.z; } inline bvhdbl3 operator*( const bvhdbl3& a, const bvhdbl3& b ) { return bvhdbl3( a.x * b.x, a.y * b.y, a.z * b.z ); } -inline bvhdbl3 operator*( const bvhdbl3& a, float b ) { return bvhdbl3( a.x * b, a.y * b, a.z * b ); } -inline bvhdbl3 operator*( float b, const bvhdbl3& a ) { return bvhdbl3( b * a.x, b * a.y, b * a.z ); } -inline bvhdbl3 operator/( float b, const bvhdbl3& a ) { return bvhdbl3( b / a.x, b / a.y, b / a.z ); } -inline bvhdbl3 operator*=( bvhdbl3& a, const float b ) { return bvhdbl3( a.x * b, a.y * b, a.z * b ); } +inline bvhdbl3 operator*( const bvhdbl3& a, double b ) { return bvhdbl3( a.x * b, a.y * b, a.z * b ); } +inline bvhdbl3 operator*( double b, const bvhdbl3& a ) { return bvhdbl3( b * a.x, b * a.y, b * a.z ); } +inline bvhdbl3 operator/( double b, const bvhdbl3& a ) { return bvhdbl3( b / a.x, b / a.y, b / a.z ); } +inline bvhdbl3 operator*=( bvhdbl3& a, const double b ) { return bvhdbl3( a.x * b, a.y * b, a.z * b ); } #endif // TINYBVH_USE_CUSTOM_VECTOR_TYPES @@ -408,12 +434,21 @@ inline float32x4_t SIMD_SETRVEC( float x, float y, float z, float w ) ALIGNED( 64 ) float data[4] = { x, y, z, w }; return vld1q_f32( data ); } + +inline uint32x4_t SIMD_SETRVECU( uint32_t x, uint32_t y, uint32_t z, uint32_t w ) +{ + ALIGNED( 64 ) uint32_t data[4] = { x, y, z, w }; + return vld1q_u32( data ); +} + #else typedef bvhvec4 SIMDVEC4; #define SIMD_SETVEC(a,b,c,d) bvhvec4( d, c, b, a ) #define SIMD_SETRVEC(a,b,c,d) 
bvhvec4( a, b, c, d ) #endif +#endif + // error handling #define FATAL_ERROR_IF(c,s) if (c) { fprintf( stderr, \ "Fatal error in tiny_bvh.h, line %i:\n%s\n", __LINE__, s ); exit( 1 ); } @@ -421,7 +456,7 @@ typedef bvhvec4 SIMDVEC4; // ============================================================================ // // T I N Y _ B V H I N T E R F A C E -// +// // ============================================================================ struct Intersection @@ -433,7 +468,7 @@ struct Intersection // Using this data and the original triangle data, all other info for // shading (such as normal, texture color etc.) can be reconstructed. float t, u, v; // distance along ray & barycentric coordinates of the intersection - unsigned prim; // primitive index + uint32_t prim; // primitive index }; struct Ray @@ -448,9 +483,9 @@ struct Ray O = origin, D = normalize( direction ), rD = tinybvh_safercp( D ); hit.t = t; } - ALIGNED( 16 ) bvhvec3 O; unsigned dummy1; - ALIGNED( 16 ) bvhvec3 D; unsigned dummy2; - ALIGNED( 16 ) bvhvec3 rD; unsigned dummy3; + ALIGNED( 16 ) bvhvec3 O; uint32_t dummy1; + ALIGNED( 16 ) bvhvec3 D; uint32_t dummy2; + ALIGNED( 16 ) bvhvec3 rD; uint32_t dummy3; ALIGNED( 16 ) Intersection hit; }; @@ -471,7 +506,7 @@ struct RayEx } bvhdbl3 O, D, rD; double t, u, v; - unsigned long long int primIdx; + uint64_t primIdx; }; #endif @@ -486,7 +521,7 @@ struct BVHContext class BVH { public: - enum BVHLayout { + enum BVHLayout : uint32_t { WALD_32BYTE = 1, // Default format, obtained using BVH::Build variants. WALD_DOUBLE, // Double-precision version of the default format. AILA_LAINE, // For GPU rendering. Obtained by converting WALD_32BYTE. @@ -495,14 +530,15 @@ class BVH BASIC_BVH4, // Input for BVH4_GPU conversion. Obtained by converting WALD_32BYTE. BVH4_GPU, // For fast GPU rendering. Obtained by converting BASIC_BVH4. BVH4_AFRA, // For fast CPU rendering. Obtained by converting BASIC_BVH4. + BVH4_WIVE, // For fast CPU rendering. 
Obtained by converting BASIC_BVH4. BASIC_BVH8, // Input for CWBVH. Obtained by converting WALD_32BYTE. CWBVH // Fastest GPU rendering. Obtained by converting BASIC_BVH8. }; - enum TraceDevice { + enum TraceDevice : uint32_t { USE_CPU = 1, USE_GPU }; - enum BuildFlags { + enum BuildFlags : uint32_t { NONE = 0, // Default building behavior (binned, SAH-driven). FULLSPLIT = 1 // Split as far as possible, even when SAH doesn't agree. }; @@ -510,8 +546,8 @@ class BVH { // 'Traditional' 32-byte BVH node layout, as proposed by Ingo Wald. // When aligned to a cache line boundary, two of these fit together. - bvhvec3 aabbMin; unsigned leftFirst; // 16 bytes - bvhvec3 aabbMax; unsigned triCount; // 16 bytes, total: 32 bytes + bvhvec3 aabbMin; uint32_t leftFirst; // 16 bytes + bvhvec3 aabbMax; uint32_t triCount; // 16 bytes, total: 32 bytes bool isLeaf() const { return triCount > 0; /* empty BVH leaves do not exist */ } float Intersect( const Ray& ray ) const { return BVH::IntersectAABB( ray, aabbMin, aabbMax ); } float SurfaceArea() const { return BVH::SA( aabbMin, aabbMax ); } @@ -523,8 +559,8 @@ class BVH // Compared to the default BVHNode, child node indices and triangle indices // are also expanded to 64bit values to support massive scenes. bvhdbl3 aabbMin, aabbMax; // 2x24 bytes - unsigned long long int leftFirst; // 8 bytes - unsigned long long int triCount; // 8 bytes, total: 64 bytes + uint64_t leftFirst; // 8 bytes + uint64_t triCount; // 8 bytes, total: 64 bytes bool isLeaf() const { return triCount > 0; /* empty BVH leaves do not exist */ } double Intersect( const RayEx& ray ) const; double SurfaceArea() const; @@ -533,7 +569,7 @@ class BVH { // Double-precision version of the fragment sruct. 
bvhdbl3 bmin, bmax; // AABB - unsigned long long int primIdx; // index of the original primitive + uint64_t primIdx; // index of the original primitive }; #endif struct BVHNodeAlt @@ -541,10 +577,10 @@ class BVH // Alternative 64-byte BVH node layout, which specifies the bounds of // the children rather than the node itself. This layout is used by // Aila and Laine in their seminal GPU ray tracing paper. - bvhvec3 lmin; unsigned left; - bvhvec3 lmax; unsigned right; - bvhvec3 rmin; unsigned triCount; - bvhvec3 rmax; unsigned firstTri; // total: 64 bytes + bvhvec3 lmin; uint32_t left; + bvhvec3 lmax; uint32_t right; + bvhvec3 rmin; uint32_t triCount; + bvhvec3 rmax; uint32_t firstTri; // total: 64 bytes bool isLeaf() const { return triCount > 0; } }; struct BVHNodeAlt2 @@ -552,7 +588,7 @@ class BVH // Second alternative 64-byte BVH node layout, same as BVHNodeAlt but // with child AABBs stored in SoA order. SIMDVEC4 xxxx, yyyy, zzzz; - unsigned left, right, triCount, firstTri; // total: 64 bytes + uint32_t left, right, triCount, firstTri; // total: 64 bytes bool isLeaf() const { return triCount > 0; } }; struct BVHNodeVerbose @@ -560,28 +596,28 @@ class BVH // This node layout has some extra data per node: It stores left and right // child node indices explicitly, and stores the index of the parent node. // This format exists primarily for the BVH optimizer. - bvhvec3 aabbMin; unsigned left; - bvhvec3 aabbMax; unsigned right; - unsigned triCount, firstTri, parent, dummy; + bvhvec3 aabbMin; uint32_t left; + bvhvec3 aabbMax; uint32_t right; + uint32_t triCount, firstTri, parent, dummy; bool isLeaf() const { return triCount > 0; } }; struct BVHNode4 { - // 4-wide (aka 'shallow') BVH layout. - bvhvec3 aabbMin; unsigned firstTri; - bvhvec3 aabbMax; unsigned triCount; - unsigned child[4]; - unsigned childCount, dummy1, dummy2, dummy3; // dummies are for alignment. + // 4-wide (aka 'shallow') BVH layout. 
+ bvhvec3 aabbMin; uint32_t firstTri; + bvhvec3 aabbMax; uint32_t triCount; + uint32_t child[4]; + uint32_t childCount, dummy1, dummy2, dummy3; // dummies are for alignment. bool isLeaf() const { return triCount > 0; } }; struct BVHNode4Alt { // 4-way BVH node, optimized for GPU rendering - struct aabb8 { unsigned char xmin, ymin, zmin, xmax, ymax, zmax; }; // quantized - bvhvec3 aabbMin; unsigned c0Info; // 16 - bvhvec3 aabbExt; unsigned c1Info; // 16 - aabb8 c0bounds, c1bounds; unsigned c2Info; // 16 - aabb8 c2bounds, c3bounds; unsigned c3Info; // 16; total: 64 bytes + struct aabb8 { uint8_t xmin, ymin, zmin, xmax, ymax, zmax; }; // quantized + bvhvec3 aabbMin; uint32_t c0Info; // 16 + bvhvec3 aabbExt; uint32_t c1Info; // 16 + aabb8 c0bounds, c1bounds; uint32_t c2Info; // 16 + aabb8 c2bounds, c3bounds; uint32_t c3Info; // 16; total: 64 bytes // childInfo, 32bit: // msb: 0=interior, 1=leaf // leaf: 16 bits: relative start of triangle data, 15 bits: triangle count. @@ -600,16 +636,29 @@ class BVH // Áfra, 2013. SIMDVEC4 xmin4, ymin4, zmin4; SIMDVEC4 xmax4, ymax4, zmax4; - unsigned childFirst[4]; - unsigned triCount[4]; + uint32_t childFirst[4]; + uint32_t triCount[4]; + }; + struct BVHNode4WiVe + { + // 4-way BVH node, optimized for CPU rendering. + // Based on: "Accelerated Single Ray Tracing for Wide Vector Units", + // Fuetterling1 et al., 2017. + union { SIMDVEC4 xmin4; float xmin[4]; }; + union { SIMDVEC4 xmax4; float xmax[4]; }; + union { SIMDVEC4 ymin4; float ymin[4]; }; + union { SIMDVEC4 ymax4; float ymax[4]; }; + union { SIMDVEC4 zmin4; float zmin[4]; }; + union { SIMDVEC4 zmax4; float zmax[4]; }; + // ORSTRec rec[4]; }; struct BVHNode8 { - // 8-wide (aka 'shallow') BVH layout. - bvhvec3 aabbMin; unsigned firstTri; - bvhvec3 aabbMax; unsigned triCount; - unsigned child[8]; - unsigned childCount, dummy1, dummy2, dummy3; // dummies are for alignment. + // 8-wide (aka 'shallow') BVH layout. 
+ bvhvec3 aabbMin; uint32_t firstTri; + bvhvec3 aabbMax; uint32_t triCount; + uint32_t child[8]; + uint32_t childCount, dummy1, dummy2, dummy3; // dummies are for alignment. bool isLeaf() const { return triCount > 0; } }; struct Fragment @@ -618,14 +667,14 @@ class BVH // "Parallel Spatial Splits in Bounding Volume Hierarchies", 2016, Fuetterling et al., // and refers to the potential splitting of these boxes for SBVH construction. bvhvec3 bmin; // AABB min x, y and z - unsigned primIdx; // index of the original primitive + uint32_t primIdx; // index of the original primitive bvhvec3 bmax; // AABB max x, y and z - unsigned clipped = 0; // Fragment is the result of clipping if > 0. + uint32_t clipped = 0; // Fragment is the result of clipping if > 0. bool validBox() { return bmin.x < BVH_FAR; } }; // BLASInstance: A TLAS is built over BLAS instances, where a single BLAS can be // used with multiple transforms, and multiple BLASses can be combined in a complex - // scene. The TLAS is built over the world-space AABBs of the BLAS root nodes. + // scene. The TLAS is built over the world-space AABBs of the BLAS root nodes. class BLASInstance { public: @@ -658,7 +707,7 @@ class BVH allocatedAlt4aBlocks = 0; allocatedBVH8Nodes = 0; } - float SAHCost( const unsigned nodeIdx = 0 ) const + float SAHCost( const uint32_t nodeIdx = 0 ) const { // Determine the SAH cost of the tree. This provides an indication // of the quality of the BVH: Lower is better. @@ -667,31 +716,33 @@ class BVH float cost = C_TRAV * n.SurfaceArea() + SAHCost( n.leftFirst ) + SAHCost( n.leftFirst + 1 ); return nodeIdx == 0 ? (cost / n.SurfaceArea()) : cost; } - int NodeCount( const BVHLayout layout ) const; - int PrimCount( const unsigned nodeIdx = 0 ) const + int32_t NodeCount( const BVHLayout layout ) const; + int32_t PrimCount( const uint32_t nodeIdx = 0 ) const { // Determine the total number of primitives / fragments in leaf nodes. const BVHNode& n = bvhNode[nodeIdx]; return n.isLeaf() ? 
n.triCount : (PrimCount( n.leftFirst ) + PrimCount( n.leftFirst + 1 )); } void Compact( const BVHLayout layout /* must be WALD_32BYTE or VERBOSE */ ); - void BuildQuick( const bvhvec4* vertices, const unsigned primCount ); - void Build( const bvhvec4* vertices, const unsigned primCount ); - void BuildHQ( const bvhvec4* vertices, const unsigned primCount ); + void BuildQuick( const bvhvec4* vertices, const uint32_t primCount ); + void Build( const bvhvec4* vertices, const uint32_t primCount ); + void BuildHQ( const bvhvec4slice& vertices ); + void Build( const bvhvec4slice& vertices ); + void BuildHQ( const bvhvec4* vertices, const uint32_t primCount ); #ifdef BVH_USEAVX - void BuildAVX( const bvhvec4* vertices, const unsigned primCount ); + void BuildAVX( const bvhvec4* vertices, const uint32_t primCount ); #elif defined BVH_USENEON - void BuildNEON( const bvhvec4* vertices, const unsigned primCount ); + void BuildNEON( const bvhvec4* vertices, const uint32_t primCount ); #endif - void BuildTLAS( const bvhaabb* aabbs, const unsigned aabbCount ); - void BuildTLAS( const BLASInstance* bvhs, const unsigned instCount ); + void BuildTLAS( const bvhaabb* aabbs, const uint32_t aabbCount ); + void BuildTLAS( const BLASInstance* bvhs, const uint32_t instCount ); void Convert( const BVHLayout from, const BVHLayout to, const bool deleteOriginal = false ); - void SplitLeafs( const unsigned maxPrims = 1 ); // operates on VERBOSE layout - void SplitBVH8Leaf( const unsigned nodeIdx, const unsigned maxPrims = 1 ); // operates on BVH8 layout + void SplitLeafs( const uint32_t maxPrims = 1 ); // operates on VERBOSE layout + void SplitBVH8Leaf( const uint32_t nodeIdx, const uint32_t maxPrims = 1 ); // operates on BVH8 layout void MergeLeafs(); // operates on VERBOSE layout - void Optimize( const unsigned iterations ); // operates on VERBOSE - void Refit( const BVHLayout layout = WALD_32BYTE, const unsigned nodeIdx = 0 ); - int Intersect( Ray& ray, const BVHLayout layout = WALD_32BYTE 
) const; + void Optimize( const uint32_t iterations ); // operates on VERBOSE + void Refit( const BVHLayout layout = WALD_32BYTE, const uint32_t nodeIdx = 0 ); + int32_t Intersect( Ray& ray, const BVHLayout layout = WALD_32BYTE ) const; // IntersectTLAS: Interface is under construction. Current plan: // * application constructs one or more BVHs (BLAS) using a layout of choice; // * application instantiates one or more BVHInstances using the blasses; @@ -700,48 +751,48 @@ class BVH // In the leafs of the TLAS, tiny_bvh finds a BVHInstance index. Using the index, // the BLAS is found, the ray is transformed, and the appropriate Intersect_xxx // method is called. - int IntersectTLAS( Ray& ray ) const; + int32_t IntersectTLAS( Ray& ray ) const; bool IsOccluded( const Ray& ray, const BVHLayout layout = WALD_32BYTE ) const; - void BatchIntersect( Ray* rayBatch, const unsigned N, + void BatchIntersect( Ray* rayBatch, const uint32_t N, const BVHLayout layout = WALD_32BYTE, const TraceDevice device = USE_CPU ) const; - void BatchIsOccluded( Ray* rayBatch, const unsigned N, unsigned* result, + void BatchIsOccluded( Ray* rayBatch, const uint32_t N, uint32_t* result, const BVHLayout layout = WALD_32BYTE, const TraceDevice device = USE_CPU ) const; void Intersect256Rays( Ray* first ) const; void Intersect256RaysSSE( Ray* packet ) const; // requires BVH_USEAVX private: void* AlignedAlloc( size_t size ); void AlignedFree( void* ptr ); - int Intersect_Wald32Byte( Ray& ray ) const; - int Intersect_AilaLaine( Ray& ray ) const; - int Intersect_Afra( Ray& ray ) const; - int Intersect_AltSoA( Ray& ray ) const; // requires BVH_USEAVX or BVH_USENEON. - int Intersect_BasicBVH4( Ray& ray ) const; // only for testing, not efficient. - int Intersect_BasicBVH8( Ray& ray ) const; // only for testing, not efficient. - int Intersect_Alt4BVH( Ray& ray ) const; // only for testing, not efficient. - int Intersect_CWBVH( Ray& ray ) const; // only for testing, not efficient. 
+ int32_t Intersect_Wald32Byte( Ray& ray ) const; + int32_t Intersect_AilaLaine( Ray& ray ) const; + int32_t Intersect_Afra( Ray& ray ) const; + int32_t Intersect_AltSoA( Ray& ray ) const; // requires BVH_USEAVX or BVH_USENEON. + int32_t Intersect_BasicBVH4( Ray& ray ) const; // only for testing, not efficient. + int32_t Intersect_BasicBVH8( Ray& ray ) const; // only for testing, not efficient. + int32_t Intersect_Alt4BVH( Ray& ray ) const; // only for testing, not efficient. + int32_t Intersect_CWBVH( Ray& ray ) const; // only for testing, not efficient. bool IsOccluded_Wald32Byte( const Ray& ray ) const; bool IsOccluded_AilaLaine( const Ray& ray ) const; bool IsOccluded_AltSoA( const Ray& ray ) const; bool IsOccluded_Afra( const Ray& ray ) const; - void IntersectTri( Ray& ray, const unsigned triIdx ) const; + void IntersectTri( Ray& ray, const uint32_t triIdx ) const; static float IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax ); static float SA( const bvhvec3& aabbMin, const bvhvec3& aabbMax ) { bvhvec3 e = aabbMax - aabbMin; // extent of the node return e.x * e.y + e.y * e.z + e.z * e.x; } - void PrecomputeTriangle( const bvhvec4* const vert, float* T ); + void PrecomputeTriangle( const bvhvec4slice& vert, uint32_t triIndex, float* T ); bool ClipFrag( const Fragment& orig, Fragment& newFrag, bvhvec3 bmin, bvhvec3 bmax, bvhvec3 minDim ); - void RefitUpVerbose( unsigned nodeIdx ); - unsigned FindBestNewPosition( const unsigned Lid ); - void ReinsertNodeVerbose( const unsigned Lid, const unsigned Nid, const unsigned origin ); - unsigned CountSubtreeTris( const unsigned nodeIdx, unsigned* counters ); - void MergeSubtree( const unsigned nodeIdx, unsigned* newIdx, unsigned& newIdxPtr ); + void RefitUpVerbose( uint32_t nodeIdx ); + uint32_t FindBestNewPosition( const uint32_t Lid ); + void ReinsertNodeVerbose( const uint32_t Lid, const uint32_t Nid, const uint32_t origin ); + uint32_t CountSubtreeTris( const uint32_t nodeIdx, uint32_t* 
counters ); + void MergeSubtree( const uint32_t nodeIdx, uint32_t* newIdx, uint32_t& newIdxPtr ); #ifdef DOUBLE_PRECISION_SUPPORT // double-precision things public: - void BuildEx( const bvhdbl3* vertices, const unsigned primCount ); - double SAHCostEx( const unsigned long long int nodeIdx = 0 ) const + void BuildEx( const bvhdbl3* vertices, const uint32_t primCount ); + double SAHCostEx( const uint64_t nodeIdx = 0 ) const { // Determine the SAH cost of a double-precision tree. const BVHNodeEx& n = bvhNodeEx[nodeIdx]; @@ -749,24 +800,24 @@ class BVH double cost = C_TRAV * n.SurfaceArea() + SAHCostEx( n.leftFirst ) + SAHCostEx( n.leftFirst + 1 ); return nodeIdx == 0 ? (cost / n.SurfaceArea()) : cost; } - int IntersectEx( RayEx& ray, const BVHLayout layout = WALD_DOUBLE ) const; - int IntersectEx_WaldDouble( RayEx& ray ) const; + int32_t IntersectEx( RayEx& ray, const BVHLayout layout = WALD_DOUBLE ) const; + int32_t IntersectEx_WaldDouble( RayEx& ray ) const; FragmentEx* fragEx = 0; // input primitive bounding boxes, double-precision. - unsigned long long int* triIdxEx; // primitive index array for double-precision bvh. - unsigned allocatedBVHExNodes = 0; - unsigned usedBVHExNodes = 0; + uint64_t* triIdxEx; // primitive index array for double-precision bvh. + uint32_t allocatedBVHExNodes = 0; + uint32_t usedBVHExNodes = 0; private: bvhdbl3* vertsEx = 0; // pointer to input primitive array, double-precision, 3x24 bytes per tri. BVHNodeEx* bvhNodeEx = 0; // BVH node, double precision format. #endif public: // Basic BVH data (WALD_32BYTE layout). - bvhvec4* verts = 0; // pointer to input primitive array: 3x16 bytes per tri. - unsigned triCount = 0; // number of primitives in tris. - unsigned* triIdx = 0; // primitive index array. + bvhvec4slice verts = {}; // pointer to input primitive array: 3x16 bytes per tri. + uint32_t triCount = 0; // number of primitives in tris. + uint32_t* triIdx = 0; // primitive index array. 
BVHNode* bvhNode = 0; // BVH node pool, Wald 32-byte format. Root is always in node 0. Fragment* fragment = 0; // input primitive bounding boxes. - unsigned idxCount = 0; // number of indices in triIdx. May exceed triCount * 3 for SBVH. + uint32_t idxCount = 0; // number of indices in triIdx. May exceed triCount * 3 for SBVH. BuildFlags buildFlag = NONE; // hint to the builder: currently, NONE or FULLSPLIT. // Data for various other layuouts. BVHNodeAlt* altNode = 0; // BVH node in Aila & Laine format. @@ -774,7 +825,8 @@ class BVH BVHNodeVerbose* verbose = 0; // BVH node with additional info, for BVH optimizer. BVHNode4* bvh4Node = 0; // BVH node for 4-wide BVH. bvhvec4* bvh4Alt = 0; // 64-byte 4-wide BVH node for efficient GPU rendering. - BVHNode4Alt2* bvh4Alt2 = 0; // 64-byte 4-wide BVH node for efficient CPU rendering. + BVHNode4Alt2* bvh4Alt2 = 0; // 128-byte 4-wide BVH node for efficient CPU rendering. + BVHNode4WiVe* bvh4WiVe = 0; // 128-byte 4-wide BVH node for efficient CPU rendering. bvhvec4* bvh4Tris = 0; // triangle data for BVHNode4Alt2 nodes. BVHNode8* bvh8Node = 0; // BVH node for 8-wide BVH. bvhvec4* bvh8Compact = 0; // nodes in CWBVH format. @@ -787,24 +839,26 @@ class BVH bool bvh_over_aabbs = false; // a BVH over AABBs is useful for e.g. TLAS traversal. BVHContext context; // context used to provide user-defined allocation functions // Keep track of allocated buffer size to avoid repeated allocation during layout conversion. 
- unsigned allocatedBVHNodes = 0; - unsigned allocatedAltNodes = 0; - unsigned allocatedAlt2Nodes = 0; - unsigned allocatedVerbose = 0; - unsigned allocatedBVH4Nodes = 0; - unsigned allocatedAlt4aBlocks = 0; - unsigned allocatedAlt4bNodes = 0; - unsigned allocatedBVH8Nodes = 0; - unsigned allocatedCWBVHBlocks = 0; - unsigned usedBVHNodes = 0; - unsigned usedAltNodes = 0; - unsigned usedAlt2Nodes = 0; - unsigned usedVerboseNodes = 0; - unsigned usedBVH4Nodes = 0; - unsigned usedAlt4aBlocks = 0; - unsigned usedAlt4bNodes = 0; - unsigned usedBVH8Nodes = 0; - unsigned usedCWBVHBlocks = 0; + uint32_t allocatedBVHNodes = 0; + uint32_t allocatedAltNodes = 0; + uint32_t allocatedAlt2Nodes = 0; + uint32_t allocatedVerbose = 0; + uint32_t allocatedBVH4Nodes = 0; + uint32_t allocatedAlt4aBlocks = 0; + uint32_t allocatedAlt4bNodes = 0; + uint32_t allocatedWiVeNodes = 0; + uint32_t allocatedBVH8Nodes = 0; + uint32_t allocatedCWBVHBlocks = 0; + uint32_t usedBVHNodes = 0; + uint32_t usedAltNodes = 0; + uint32_t usedAlt2Nodes = 0; + uint32_t usedVerboseNodes = 0; + uint32_t usedBVH4Nodes = 0; + uint32_t usedAlt4aBlocks = 0; + uint32_t usedAlt4bNodes = 0; + uint32_t usedWiVeNodes = 0; + uint32_t usedBVH8Nodes = 0; + uint32_t usedCWBVHBlocks = 0; }; } // namespace tinybvh @@ -812,7 +866,7 @@ class BVH // ============================================================================ // // I M P L E M E N T A T I O N -// +// // ============================================================================ #ifdef TINYBVH_IMPLEMENTATION @@ -822,6 +876,8 @@ class BVH #include <intrin.h> // for __lzcnt #endif +// We need quite a bit of type reinterpretation, so we'll +// turn off the gcc warning here until the end of the file.
#ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" @@ -829,6 +885,40 @@ class BVH namespace tinybvh { +#if defined BVH_USEAVX || defined BVH_USENEON + +static uint32_t __bfind( uint32_t x ) // https://github.com/mackron/refcode/blob/master/lzcnt.c +{ +#if defined(_MSC_VER) && !defined(__clang__) + return 31 - __lzcnt( x ); +#elif defined(__EMSCRIPTEN__) + return 31 - __builtin_clz( x ); +#elif defined(__GNUC__) || defined(__clang__) +#ifndef __APPLE__ + uint32_t r; + __asm__ __volatile__( "lzcnt{l %1, %0| %0, %1}" : "=r"(r) : "r"(x) : "cc" ); + return 31 - r; +#else + return 31 - __builtin_clz( x ); // TODO: unverified. +#endif +#endif +} + +bvhvec4::bvhvec4( const bvhvec3& a ) { x = a.x; y = a.y; z = a.z; w = 0; } +bvhvec4::bvhvec4( const bvhvec3& a, float b ) { x = a.x; y = a.y; z = a.z; w = b; } + +bvhvec4slice::bvhvec4slice( const bvhvec4* data, uint32_t count, uint32_t stride ) : + data{ reinterpret_cast<const int8_t*>(data) }, + count{ count }, stride{ stride } {} + +const bvhvec4& bvhvec4slice::operator[]( size_t i ) const +{ +#ifdef PARANOID + FATAL_ERROR_IF( i >= count, "bvhvec4slice::[..], Reading outside slice." ); +#endif + return *reinterpret_cast<const bvhvec4*>(data + stride * i); +} + void* BVH::AlignedAlloc( size_t size ) { return context.malloc ? context.malloc( size, context.userdata ) : nullptr; @@ -846,7 +936,7 @@ void BVH::BLASInstance::Update() // transform and calculate the worldspace aabb over these. worldBounds.minBounds = bvhvec3( BVH_FAR ), worldBounds.maxBounds = bvhvec3( -BVH_FAR ); bvhvec3 bmin = blas->bvhNode[0].aabbMin, bmax = blas->bvhNode[0].aabbMax; - for (int i = 0; i < 8; i++) + for (int32_t i = 0; i < 8; i++) { const bvhvec3 p( i & 1 ? bmax.x : bmin.x, i & 2 ? bmax.y : bmin.y, i & 4 ? bmax.z : bmin.z ); const bvhvec3 t = TransformPoint( p ); @@ -856,9 +946,10 @@ void BVH::BLASInstance::Update() } // BVH builder entry point for arrays of aabbs.
-void BVH::BuildTLAS( const bvhaabb* aabbs, const unsigned aabbCount ) +void BVH::BuildTLAS( const bvhaabb* aabbs, const uint32_t aabbCount ) { // the aabb array must be cacheline aligned. + FATAL_ERROR_IF( aabbCount == 0, "BVH::BuildTLAS( .. ), aabbCount == 0." ); FATAL_ERROR_IF( ((long long)(void*)aabbs & 31) != 0, "BVH::Build( bvhaabb* ), array not cacheline aligned." ); // take the array and process it fragment = (Fragment*)aabbs; @@ -867,13 +958,14 @@ void BVH::BuildTLAS( const bvhaabb* aabbs, const unsigned aabbCount ) Build( (bvhvec4*)0, aabbCount ); // TODO: for very large scenes, use BuildAVX. Mind fragment sign flip! } -void BVH::BuildTLAS( const BLASInstance* bvhs, const unsigned instCount ) +void BVH::BuildTLAS( const BLASInstance* bvhs, const uint32_t instCount ) { + FATAL_ERROR_IF( instCount == 0, "BVH::BuildTLAS( .. ), instCount == 0." ); if (!fragment) fragment = (Fragment*)AlignedAlloc( instCount ); else FATAL_ERROR_IF( instCount != triCount, "BVH::BuildTLAS( .. ), blas count changed." ); // copy relevant data from instance array triCount = instCount; - for (unsigned i = 0; i < instCount; i++) + for (uint32_t i = 0; i < instCount; i++) { fragment[i].bmin = bvhs[i].worldBounds.minBounds; fragment[i].primIdx = i; @@ -886,10 +978,11 @@ void BVH::BuildTLAS( const BLASInstance* bvhs, const unsigned instCount ) // This builder yields a correct BVH in little time, but the quality of the // structure will be low. Use this only if build time is the bottleneck in // your application (e.g., when you need to trace few rays). -void BVH::BuildQuick( const bvhvec4* vertices, const unsigned primCount ) +void BVH::BuildQuick( const bvhvec4* vertices, const uint32_t primCount ) { + FATAL_ERROR_IF( primCount == 0, "BVH::BuildQuick( .. ), primCount == 0." 
); // allocate on first build - const unsigned spaceNeeded = primCount * 2; // upper limit + const uint32_t spaceNeeded = primCount * 2; // upper limit if (allocatedBVHNodes < spaceNeeded) { AlignedFree( bvhNode ); @@ -898,19 +991,19 @@ void BVH::BuildQuick( const bvhvec4* vertices, const unsigned primCount ) bvhNode = (BVHNode*)AlignedAlloc( spaceNeeded * sizeof( BVHNode ) ); allocatedBVHNodes = spaceNeeded; memset( &bvhNode[1], 0, 32 ); // node 1 remains unused, for cache line alignment. - triIdx = (unsigned*)AlignedAlloc( primCount * sizeof( unsigned ) ); + triIdx = (uint32_t*)AlignedAlloc( primCount * sizeof( uint32_t ) ); fragment = (Fragment*)AlignedAlloc( primCount * sizeof( Fragment ) ); } else FATAL_ERROR_IF( !rebuildable, "BVH::BuildQuick( .. ), bvh not rebuildable." ); - verts = (bvhvec4*)vertices; // note: we're not copying this data; don't delete. + verts = bvhvec4slice{ vertices, primCount * 3 }; // note: we're not copying this data; don't delete. idxCount = triCount = primCount; // reset node pool - unsigned newNodePtr = 2; + uint32_t newNodePtr = 2; // assign all triangles to the root node BVHNode& root = bvhNode[0]; root.leftFirst = 0, root.triCount = triCount, root.aabbMin = bvhvec3( BVH_FAR ), root.aabbMax = bvhvec3( -BVH_FAR ); // initialize fragments and initialize root node bounds - for (unsigned i = 0; i < triCount; i++) + for (uint32_t i = 0; i < triCount; i++) { fragment[i].bmin = tinybvh_min( tinybvh_min( verts[i * 3], verts[i * 3 + 1] ), verts[i * 3 + 2] ); fragment[i].bmax = tinybvh_max( tinybvh_max( verts[i * 3], verts[i * 3 + 1] ), verts[i * 3 + 2] ); @@ -918,21 +1011,21 @@ void BVH::BuildQuick( const bvhvec4* vertices, const unsigned primCount ) root.aabbMax = tinybvh_max( root.aabbMax, fragment[i].bmax ), triIdx[i] = i; } // subdivide recursively - unsigned task[256], taskCount = 0, nodeIdx = 0; + uint32_t task[256], taskCount = 0, nodeIdx = 0; while (1) { while (1) { BVHNode& node = bvhNode[nodeIdx]; // in-place partition against 
midpoint on longest axis - unsigned j = node.leftFirst + node.triCount, src = node.leftFirst; + uint32_t j = node.leftFirst + node.triCount, src = node.leftFirst; bvhvec3 extent = node.aabbMax - node.aabbMin; - unsigned axis = 0; + uint32_t axis = 0; if (extent.y > extent.x && extent.y > extent.z) axis = 1; if (extent.z > extent.x && extent.z > extent.y) axis = 2; float splitPos = node.aabbMin[axis] + extent[axis] * 0.5f, centroid; bvhvec3 lbmin( BVH_FAR ), lbmax( -BVH_FAR ), rbmin( BVH_FAR ), rbmax( -BVH_FAR ), fmin, fmax; - for (unsigned fi, i = 0; i < node.triCount; i++) + for (uint32_t fi, i = 0; i < node.triCount; i++) { fi = triIdx[src], fmin = fragment[fi].bmin, fmax = fragment[fi].bmax; centroid = (fmin[axis] + fmax[axis]) * 0.5f; @@ -945,9 +1038,9 @@ void BVH::BuildQuick( const bvhvec4* vertices, const unsigned primCount ) } } // create child nodes - const unsigned leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; + const uint32_t leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; if (leftCount == 0 || rightCount == 0) break; // split did not work out. - const int lci = newNodePtr++, rci = newNodePtr++; + const int32_t lci = newNodePtr++, rci = newNodePtr++; bvhNode[lci].aabbMin = lbmin, bvhNode[lci].aabbMax = lbmax; bvhNode[lci].leftFirst = node.leftFirst, bvhNode[lci].triCount = leftCount; bvhNode[rci].aabbMin = rbmin, bvhNode[rci].aabbMax = rbmax; @@ -966,16 +1059,18 @@ void BVH::BuildQuick( const bvhvec4* vertices, const unsigned primCount ) usedBVHNodes = newNodePtr; } -// Basic single-function binned-SAH-builder. -// This is the reference builder; it yields a decent tree suitable for ray -// tracing on the CPU. This code uses no SIMD instructions. +// Basic single-function binned-SAH-builder. +// This is the reference builder; it yields a decent tree suitable for ray +// tracing on the CPU. This code uses no SIMD instructions. // Faster code, using SSE/AVX, is available for x64 CPUs. 
// For GPU rendering: The resulting BVH should be converted to a more optimal // format after construction. -void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) +void BVH::Build( const bvhvec4slice& vertices ) { + FATAL_ERROR_IF( vertices.count == 0, "BVH::Build( .. ), primCount == 0." ); // allocate on first build - const unsigned spaceNeeded = primCount * 2; // upper limit + const uint32_t primCount = vertices.count / 3; + const uint32_t spaceNeeded = primCount * 2; // upper limit if (allocatedBVHNodes < spaceNeeded) { AlignedFree( bvhNode ); @@ -984,15 +1079,15 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) bvhNode = (BVHNode*)AlignedAlloc( spaceNeeded * sizeof( BVHNode ) ); allocatedBVHNodes = spaceNeeded; memset( &bvhNode[1], 0, 32 ); // node 1 remains unused, for cache line alignment. - triIdx = (unsigned*)AlignedAlloc( primCount * sizeof( unsigned ) ); + triIdx = (uint32_t*)AlignedAlloc( primCount * sizeof( uint32_t ) ); if (vertices) fragment = (Fragment*)AlignedAlloc( primCount * sizeof( Fragment ) ); else FATAL_ERROR_IF( fragment == 0, "BVH::Build( 0, .. ), not called from ::Build( aabb )." ); } else FATAL_ERROR_IF( !rebuildable, "BVH::Build( .. ), bvh not rebuildable." ); - verts = (bvhvec4*)vertices; // note: we're not copying this data; don't delete. + verts = vertices; idxCount = triCount = primCount; // reset node pool - unsigned newNodePtr = 2; + uint32_t newNodePtr = 2; // assign all triangles to the root node BVHNode& root = bvhNode[0]; root.leftFirst = 0, root.triCount = triCount, root.aabbMin = bvhvec3( BVH_FAR ), root.aabbMax = bvhvec3( -BVH_FAR ); @@ -1000,7 +1095,7 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) if (verts) { // building a BVH over triangles specified as three 16-byte vertices each. 
- for (unsigned i = 0; i < triCount; i++) + for (uint32_t i = 0; i < triCount; i++) { fragment[i].bmin = tinybvh_min( tinybvh_min( verts[i * 3], verts[i * 3 + 1] ), verts[i * 3 + 2] ); fragment[i].bmax = tinybvh_max( tinybvh_max( verts[i * 3], verts[i * 3 + 1] ), verts[i * 3 + 2] ); @@ -1011,14 +1106,14 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) else { // we are building the BVH over aabbs we received from ::Build( tinyaabb* ): vertices == 0. - for (unsigned i = 0; i < triCount; i++) + for (uint32_t i = 0; i < triCount; i++) { root.aabbMin = tinybvh_min( root.aabbMin, fragment[i].bmin ); root.aabbMax = tinybvh_max( root.aabbMax, fragment[i].bmax ), triIdx[i] = i; // here: aabb index. } } // subdivide recursively - unsigned task[256], taskCount = 0, nodeIdx = 0; + uint32_t task[256], taskCount = 0, nodeIdx = 0; bvhvec3 minDim = (root.aabbMax - root.aabbMin) * 1e-20f, bestLMin = 0, bestLMax = 0, bestRMin = 0, bestRMax = 0; while (1) { @@ -1027,13 +1122,13 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) BVHNode& node = bvhNode[nodeIdx]; // find optimal object split bvhvec3 binMin[3][BVHBINS], binMax[3][BVHBINS]; - for (unsigned a = 0; a < 3; a++) for (unsigned i = 0; i < BVHBINS; i++) binMin[a][i] = BVH_FAR, binMax[a][i] = -BVH_FAR; - unsigned count[3][BVHBINS]; - memset( count, 0, BVHBINS * 3 * sizeof( unsigned ) ); + for (uint32_t a = 0; a < 3; a++) for (uint32_t i = 0; i < BVHBINS; i++) binMin[a][i] = BVH_FAR, binMax[a][i] = -BVH_FAR; + uint32_t count[3][BVHBINS]; + memset( count, 0, BVHBINS * 3 * sizeof( uint32_t ) ); const bvhvec3 rpd3 = bvhvec3( BVHBINS / (node.aabbMax - node.aabbMin) ), nmin3 = node.aabbMin; - for (unsigned i = 0; i < node.triCount; i++) // process all tris for x,y and z at once + for (uint32_t i = 0; i < node.triCount; i++) // process all tris for x,y and z at once { - const unsigned fi = triIdx[node.leftFirst + i]; + const uint32_t fi = triIdx[node.leftFirst + i]; bvhint3 bi = bvhint3( 
((fragment[fi].bmin + fragment[fi].bmax) * 0.5f - nmin3) * rpd3 ); bi.x = tinybvh_clamp( bi.x, 0, BVHBINS - 1 ); bi.y = tinybvh_clamp( bi.y, 0, BVHBINS - 1 ); @@ -1047,13 +1142,13 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) } // calculate per-split totals float splitCost = BVH_FAR, rSAV = 1.0f / node.SurfaceArea(); - unsigned bestAxis = 0, bestPos = 0; - for (int a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim[a]) + uint32_t bestAxis = 0, bestPos = 0; + for (int32_t a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim[a]) { bvhvec3 lBMin[BVHBINS - 1], rBMin[BVHBINS - 1], l1 = BVH_FAR, l2 = -BVH_FAR; bvhvec3 lBMax[BVHBINS - 1], rBMax[BVHBINS - 1], r1 = BVH_FAR, r2 = -BVH_FAR; float ANL[BVHBINS - 1], ANR[BVHBINS - 1]; - for (unsigned lN = 0, rN = 0, i = 0; i < BVHBINS - 1; i++) + for (uint32_t lN = 0, rN = 0, i = 0; i < BVHBINS - 1; i++) { lBMin[i] = l1 = tinybvh_min( l1, binMin[a][i] ); rBMin[BVHBINS - 2 - i] = r1 = tinybvh_min( r1, binMin[a][BVHBINS - 1 - i] ); @@ -1064,7 +1159,7 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) ANR[BVHBINS - 2 - i] = rN == 0 ? BVH_FAR : ((r2 - r1).halfArea() * (float)rN); } // evaluate bin totals to find best position for object split - for (unsigned i = 0; i < BVHBINS - 1; i++) + for (uint32_t i = 0; i < BVHBINS - 1; i++) { const float C = C_TRAV + rSAV * C_INT * (ANL[i] + ANR[i]); if (C < splitCost) @@ -1077,19 +1172,19 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) float noSplitCost = (float)node.triCount * C_INT; if (splitCost >= noSplitCost) break; // not splitting is better. 
// in-place partition - unsigned j = node.leftFirst + node.triCount, src = node.leftFirst; + uint32_t j = node.leftFirst + node.triCount, src = node.leftFirst; const float rpd = rpd3.cell[bestAxis], nmin = nmin3.cell[bestAxis]; - for (unsigned i = 0; i < node.triCount; i++) + for (uint32_t i = 0; i < node.triCount; i++) { - const unsigned fi = triIdx[src]; - int bi = (unsigned)(((fragment[fi].bmin[bestAxis] + fragment[fi].bmax[bestAxis]) * 0.5f - nmin) * rpd); + const uint32_t fi = triIdx[src]; + int32_t bi = (uint32_t)(((fragment[fi].bmin[bestAxis] + fragment[fi].bmax[bestAxis]) * 0.5f - nmin) * rpd); bi = tinybvh_clamp( bi, 0, BVHBINS - 1 ); - if ((unsigned)bi <= bestPos) src++; else tinybvh_swap( triIdx[src], triIdx[--j] ); + if ((uint32_t)bi <= bestPos) src++; else tinybvh_swap( triIdx[src], triIdx[--j] ); } // create child nodes - unsigned leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; + uint32_t leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; if (leftCount == 0 || rightCount == 0) break; // should not happen. - const int lci = newNodePtr++, rci = newNodePtr++; + const int32_t lci = newNodePtr++, rci = newNodePtr++; bvhNode[lci].aabbMin = bestLMin, bvhNode[lci].aabbMax = bestLMax; bvhNode[lci].leftFirst = node.leftFirst, bvhNode[lci].triCount = leftCount; bvhNode[rci].aabbMin = bestRMin, bvhNode[rci].aabbMax = bestRMax; @@ -1109,19 +1204,26 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) usedBVHNodes = newNodePtr; } +void BVH::Build( const bvhvec4* vertices, const uint32_t primCount ) +{ + Build( bvhvec4slice{ vertices, primCount * 3 } ); +} + // SBVH builder. // Besides the regular object splits used in the reference builder, the SBVH // algorithm also considers spatial splits, where primitives may be cut in // multiple parts. This increases primitive count but may reduce overlap of -// BVH nodes. The cost of each option is considered per split. 
-// For typical geometry, SBVH yields a tree that can be traversed 25% faster. -// This comes at greatly increased construction cost, making the SBVH +// BVH nodes. The cost of each option is considered per split. +// For typical geometry, SBVH yields a tree that can be traversed 25% faster. +// This comes at greatly increased construction cost, making the SBVH // primarily useful for static geometry. -void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) +void BVH::BuildHQ( const bvhvec4slice& vertices ) { + FATAL_ERROR_IF( vertices.count == 0, "BVH::BuildHQ( .. ), primCount == 0." ); // allocate on first build - const unsigned slack = primCount >> 2; // for split prims - const unsigned spaceNeeded = primCount * 3; + const uint32_t primCount = vertices.count / 3; + const uint32_t slack = primCount >> 2; // for split prims + const uint32_t spaceNeeded = primCount * 3; if (allocatedBVHNodes < spaceNeeded) { AlignedFree( bvhNode ); @@ -1130,23 +1232,23 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) bvhNode = (BVHNode*)AlignedAlloc( spaceNeeded * sizeof( BVHNode ) ); allocatedBVHNodes = spaceNeeded; memset( &bvhNode[1], 0, 32 ); // node 1 remains unused, for cache line alignment. - triIdx = (unsigned*)AlignedAlloc( (primCount + slack) * sizeof( unsigned ) ); + triIdx = (uint32_t*)AlignedAlloc( (primCount + slack) * sizeof( uint32_t ) ); fragment = (Fragment*)AlignedAlloc( (primCount + slack) * sizeof( Fragment ) ); } else FATAL_ERROR_IF( !rebuildable, "BVH::BuildHQ( .. ), bvh not rebuildable." ); - verts = (bvhvec4*)vertices; // note: we're not copying this data; don't delete. + verts = vertices; // note: we're not copying this data; don't delete. 
idxCount = primCount + slack; triCount = primCount; - unsigned* triIdxA = triIdx, * triIdxB = new unsigned[triCount + slack]; + uint32_t* triIdxA = triIdx, * triIdxB = new uint32_t[triCount + slack]; memset( triIdxA, 0, (triCount + slack) * 4 ); memset( triIdxB, 0, (triCount + slack) * 4 ); // reset node pool - unsigned newNodePtr = 2, nextFrag = triCount; + uint32_t newNodePtr = 2, nextFrag = triCount; // assign all triangles to the root node BVHNode& root = bvhNode[0]; root.leftFirst = 0, root.triCount = triCount, root.aabbMin = bvhvec3( BVH_FAR ), root.aabbMax = bvhvec3( -BVH_FAR ); // initialize fragments and initialize root node bounds - for (unsigned i = 0; i < triCount; i++) + for (uint32_t i = 0; i < triCount; i++) { fragment[i].bmin = tinybvh_min( tinybvh_min( verts[i * 3], verts[i * 3 + 1] ), verts[i * 3 + 2] ); fragment[i].bmax = tinybvh_max( tinybvh_max( verts[i * 3], verts[i * 3 + 1] ), verts[i * 3 + 2] ); @@ -1155,9 +1257,9 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) } const float rootArea = (root.aabbMax - root.aabbMin).halfArea(); // subdivide recursively - struct Task { unsigned node, sliceStart, sliceEnd, dummy; }; + struct Task { uint32_t node, sliceStart, sliceEnd, dummy; }; ALIGNED( 64 ) Task task[256]; - unsigned taskCount = 0, nodeIdx = 0, sliceStart = 0, sliceEnd = triCount + slack; + uint32_t taskCount = 0, nodeIdx = 0, sliceStart = 0, sliceEnd = triCount + slack; const bvhvec3 minDim = (root.aabbMax - root.aabbMin) * 1e-7f /* don't touch, carefully picked */; bvhvec3 bestLMin = 0, bestLMax = 0, bestRMin = 0, bestRMax = 0; while (1) @@ -1167,13 +1269,13 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) BVHNode& node = bvhNode[nodeIdx]; // find optimal object split bvhvec3 binMin[3][BVHBINS], binMax[3][BVHBINS]; - for (unsigned a = 0; a < 3; a++) for (unsigned i = 0; i < BVHBINS; i++) binMin[a][i] = BVH_FAR, binMax[a][i] = -BVH_FAR; - unsigned count[3][BVHBINS]; - memset( count, 0, 
BVHBINS * 3 * sizeof( unsigned ) ); + for (uint32_t a = 0; a < 3; a++) for (uint32_t i = 0; i < BVHBINS; i++) binMin[a][i] = BVH_FAR, binMax[a][i] = -BVH_FAR; + uint32_t count[3][BVHBINS]; + memset( count, 0, BVHBINS * 3 * sizeof( uint32_t ) ); const bvhvec3 rpd3 = bvhvec3( BVHBINS / (node.aabbMax - node.aabbMin) ), nmin3 = node.aabbMin; - for (unsigned i = 0; i < node.triCount; i++) // process all tris for x,y and z at once + for (uint32_t i = 0; i < node.triCount; i++) // process all tris for x,y and z at once { - const unsigned fi = triIdx[node.leftFirst + i]; + const uint32_t fi = triIdx[node.leftFirst + i]; bvhint3 bi = bvhint3( ((fragment[fi].bmin + fragment[fi].bmax) * 0.5f - nmin3) * rpd3 ); bi.x = tinybvh_clamp( bi.x, 0, BVHBINS - 1 ); bi.y = tinybvh_clamp( bi.y, 0, BVHBINS - 1 ); @@ -1187,13 +1289,13 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) } // calculate per-split totals float splitCost = BVH_FAR, rSAV = 1.0f / node.SurfaceArea(); - unsigned bestAxis = 0, bestPos = 0; - for (int a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) + uint32_t bestAxis = 0, bestPos = 0; + for (int32_t a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) { bvhvec3 lBMin[BVHBINS - 1], rBMin[BVHBINS - 1], l1 = BVH_FAR, l2 = -BVH_FAR; bvhvec3 lBMax[BVHBINS - 1], rBMax[BVHBINS - 1], r1 = BVH_FAR, r2 = -BVH_FAR; float ANL[BVHBINS - 1], ANR[BVHBINS - 1]; - for (unsigned lN = 0, rN = 0, i = 0; i < BVHBINS - 1; i++) + for (uint32_t lN = 0, rN = 0, i = 0; i < BVHBINS - 1; i++) { lBMin[i] = l1 = tinybvh_min( l1, binMin[a][i] ); rBMin[BVHBINS - 2 - i] = r1 = tinybvh_min( r1, binMin[a][BVHBINS - 1 - i] ); @@ -1204,7 +1306,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) ANR[BVHBINS - 2 - i] = rN == 0 ? 
BVH_FAR : ((r2 - r1).halfArea() * (float)rN); } // evaluate bin totals to find best position for object split - for (unsigned i = 0; i < BVHBINS - 1; i++) + for (uint32_t i = 0; i < BVHBINS - 1; i++) { const float C = C_TRAV + C_INT * rSAV * (ANL[i] + ANR[i]); if (C < splitCost) @@ -1216,25 +1318,25 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) } // consider a spatial split bool spatial = false; - unsigned NL[BVHBINS - 1], NR[BVHBINS - 1], budget = sliceEnd - sliceStart; + uint32_t NL[BVHBINS - 1], NR[BVHBINS - 1], budget = sliceEnd - sliceStart; bvhvec3 spatialUnion = bestLMax - bestRMin; float spatialOverlap = (spatialUnion.halfArea()) / rootArea; if (budget > node.triCount && splitCost < BVH_FAR && spatialOverlap > 1e-5f) { - for (unsigned a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) + for (uint32_t a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) { // setup bins bvhvec3 binMin[BVHBINS], binMax[BVHBINS]; - for (unsigned i = 0; i < BVHBINS; i++) binMin[i] = BVH_FAR, binMax[i] = -BVH_FAR; - unsigned countIn[BVHBINS] = { 0 }, countOut[BVHBINS] = { 0 }; + for (uint32_t i = 0; i < BVHBINS; i++) binMin[i] = BVH_FAR, binMax[i] = -BVH_FAR; + uint32_t countIn[BVHBINS] = { 0 }, countOut[BVHBINS] = { 0 }; // populate bins with clipped fragments const float planeDist = (node.aabbMax[a] - node.aabbMin[a]) / (BVHBINS * 0.9999f); const float rPlaneDist = 1.0f / planeDist, nodeMin = node.aabbMin[a]; - for (unsigned i = 0; i < node.triCount; i++) + for (uint32_t i = 0; i < node.triCount; i++) { - const unsigned fragIdx = triIdxA[node.leftFirst + i]; - const int bin1 = tinybvh_clamp( (int)((fragment[fragIdx].bmin[a] - nodeMin) * rPlaneDist), 0, BVHBINS - 1 ); - const int bin2 = tinybvh_clamp( (int)((fragment[fragIdx].bmax[a] - nodeMin) * rPlaneDist), 0, BVHBINS - 1 ); + const uint32_t fragIdx = triIdxA[node.leftFirst + i]; + const int32_t bin1 = tinybvh_clamp( 
(int32_t)((fragment[fragIdx].bmin[a] - nodeMin) * rPlaneDist), 0, BVHBINS - 1 ); + const int32_t bin2 = tinybvh_clamp( (int32_t)((fragment[fragIdx].bmax[a] - nodeMin) * rPlaneDist), 0, BVHBINS - 1 ); countIn[bin1]++, countOut[bin2]++; if (bin2 == bin1) { @@ -1242,7 +1344,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) binMin[bin1] = tinybvh_min( binMin[bin1], fragment[fragIdx].bmin ); binMax[bin1] = tinybvh_max( binMax[bin1], fragment[fragIdx].bmax ); } - else for (int j = bin1; j <= bin2; j++) + else for (int32_t j = bin1; j <= bin2; j++) { // clip fragment to each bin it overlaps bvhvec3 bmin = node.aabbMin, bmax = node.aabbMax; @@ -1259,7 +1361,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) bvhvec3 lBMin[BVHBINS - 1], rBMin[BVHBINS - 1], l1 = BVH_FAR, l2 = -BVH_FAR; bvhvec3 lBMax[BVHBINS - 1], rBMax[BVHBINS - 1], r1 = BVH_FAR, r2 = -BVH_FAR; float ANL[BVHBINS], ANR[BVHBINS]; - for (unsigned lN = 0, rN = 0, i = 0; i < BVHBINS - 1; i++) + for (uint32_t lN = 0, rN = 0, i = 0; i < BVHBINS - 1; i++) { lBMin[i] = l1 = tinybvh_min( l1, binMin[i] ), rBMin[BVHBINS - 2 - i] = r1 = tinybvh_min( r1, binMin[BVHBINS - 1 - i] ); lBMax[i] = l2 = tinybvh_max( l2, binMax[i] ), rBMax[BVHBINS - 2 - i] = r2 = tinybvh_max( r2, binMax[BVHBINS - 1 - i] ); @@ -1268,7 +1370,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) ANR[BVHBINS - 2 - i] = rN == 0 ? BVH_FAR : ((r2 - r1).halfArea() * (float)rN); } // find best position for spatial split - for (unsigned i = 0; i < BVHBINS - 1; i++) + for (uint32_t i = 0; i < BVHBINS - 1; i++) { const float Cspatial = C_TRAV + C_INT * rSAV * (ANL[i] + ANR[i]); if (Cspatial < splitCost && NL[i] + NR[i] < budget) @@ -1284,16 +1386,16 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) float noSplitCost = (float)node.triCount * C_INT; if (splitCost >= noSplitCost) break; // not splitting is better. 
// double-buffered partition - unsigned A = sliceStart, B = sliceEnd, src = node.leftFirst; + uint32_t A = sliceStart, B = sliceEnd, src = node.leftFirst; if (spatial) { const float planeDist = (node.aabbMax[bestAxis] - node.aabbMin[bestAxis]) / (BVHBINS * 0.9999f); const float rPlaneDist = 1.0f / planeDist, nodeMin = node.aabbMin[bestAxis]; - for (unsigned i = 0; i < node.triCount; i++) + for (uint32_t i = 0; i < node.triCount; i++) { - const unsigned fragIdx = triIdxA[src++]; - const unsigned bin1 = (unsigned)((fragment[fragIdx].bmin[bestAxis] - nodeMin) * rPlaneDist); - const unsigned bin2 = (unsigned)((fragment[fragIdx].bmax[bestAxis] - nodeMin) * rPlaneDist); + const uint32_t fragIdx = triIdxA[src++]; + const uint32_t bin1 = (uint32_t)((fragment[fragIdx].bmin[bestAxis] - nodeMin) * rPlaneDist); + const uint32_t bin2 = (uint32_t)((fragment[fragIdx].bmax[bestAxis] - nodeMin) * rPlaneDist); if (bin2 <= bestPos) triIdxB[A++] = fragIdx; else if (bin1 > bestPos) triIdxB[--B] = fragIdx; else { // split straddler @@ -1310,20 +1412,20 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) { // object partitioning const float rpd = rpd3.cell[bestAxis], nmin = nmin3.cell[bestAxis]; - for (unsigned i = 0; i < node.triCount; i++) + for (uint32_t i = 0; i < node.triCount; i++) { - const unsigned fr = triIdx[src + i]; - int bi = (int)(((fragment[fr].bmin[bestAxis] + fragment[fr].bmax[bestAxis]) * 0.5f - nmin) * rpd); + const uint32_t fr = triIdx[src + i]; + int32_t bi = (int32_t)(((fragment[fr].bmin[bestAxis] + fragment[fr].bmax[bestAxis]) * 0.5f - nmin) * rpd); bi = tinybvh_clamp( bi, 0, BVHBINS - 1 ); - if (bi <= (int)bestPos) triIdxB[A++] = fr; else triIdxB[--B] = fr; + if (bi <= (int32_t)bestPos) triIdxB[A++] = fr; else triIdxB[--B] = fr; } } // copy back slice data memcpy( triIdxA + sliceStart, triIdxB + sliceStart, (sliceEnd - sliceStart) * 4 ); // create child nodes - unsigned leftCount = A - sliceStart, rightCount = sliceEnd - B; + uint32_t 
leftCount = A - sliceStart, rightCount = sliceEnd - B; if (leftCount == 0 || rightCount == 0) break; - int leftChildIdx = newNodePtr++, rightChildIdx = newNodePtr++; + int32_t leftChildIdx = newNodePtr++, rightChildIdx = newNodePtr++; bvhNode[leftChildIdx].aabbMin = bestLMin, bvhNode[leftChildIdx].aabbMax = bestLMax; bvhNode[leftChildIdx].leftFirst = sliceStart, bvhNode[leftChildIdx].triCount = leftCount; bvhNode[rightChildIdx].aabbMin = bestRMin, bvhNode[rightChildIdx].aabbMax = bestRMax; @@ -1342,7 +1444,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) sliceEnd = task[taskCount].sliceEnd; } // clean up - for (unsigned i = 0; i < triCount + slack; i++) triIdx[i] = fragment[triIdx[i]].primIdx; + for (uint32_t i = 0; i < triCount + slack; i++) triIdx[i] = fragment[triIdx[i]].primIdx; // Compact(); - TODO // all done. refittable = false; // can't refit an SBVH @@ -1351,13 +1453,18 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) usedBVHNodes = newNodePtr; } +void BVH::BuildHQ( const bvhvec4* vertices, const uint32_t primCount ) +{ + BuildHQ( bvhvec4slice{ vertices, primCount * 3 } ); +} + // Convert: Change the BVH layout from one format into another. void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* deleteOriginal */ ) { if (from == WALD_32BYTE && to == AILA_LAINE) { // allocate space - const unsigned spaceNeeded = usedBVHNodes; + const uint32_t spaceNeeded = usedBVHNodes; if (allocatedAltNodes < spaceNeeded) { FATAL_ERROR_IF( bvhNode == 0, "BVH::Convert( WALD_32BYTE, AILA_LAINE ), bvhNode == 0." 
); @@ -1367,18 +1474,18 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } memset( altNode, 0, sizeof( BVHNodeAlt ) * spaceNeeded ); // recursively convert nodes - unsigned newAltNode = 0, nodeIdx = 0, stack[128], stackPtr = 0; + uint32_t newAltNode = 0, nodeIdx = 0, stack[128], stackPtr = 0; while (1) { const BVHNode& node = bvhNode[nodeIdx]; - const unsigned idx = newAltNode++; + const uint32_t idx = newAltNode++; if (node.isLeaf()) { altNode[idx].triCount = node.triCount; altNode[idx].firstTri = node.leftFirst; if (!stackPtr) break; nodeIdx = stack[--stackPtr]; - unsigned newNodeParent = stack[--stackPtr]; + uint32_t newNodeParent = stack[--stackPtr]; altNode[newNodeParent].right = newAltNode; } else @@ -1398,7 +1505,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet else if (from == WALD_32BYTE && to == ALT_SOA) { // allocate space - const unsigned spaceNeeded = usedBVHNodes; + const uint32_t spaceNeeded = usedBVHNodes; if (allocatedAlt2Nodes < spaceNeeded) { FATAL_ERROR_IF( bvhNode == 0, "BVH::Convert( WALD_32BYTE, ALT_SOA ), bvhNode == 0." 
); @@ -1408,18 +1515,18 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } memset( alt2Node, 0, sizeof( BVHNodeAlt2 ) * spaceNeeded ); // recursively convert nodes - unsigned newAlt2Node = 0, nodeIdx = 0, stack[128], stackPtr = 0; + uint32_t newAlt2Node = 0, nodeIdx = 0, stack[128], stackPtr = 0; while (1) { const BVHNode& node = bvhNode[nodeIdx]; - const unsigned idx = newAlt2Node++; + const uint32_t idx = newAlt2Node++; if (node.isLeaf()) { alt2Node[idx].triCount = node.triCount; alt2Node[idx].firstTri = node.leftFirst; if (!stackPtr) break; nodeIdx = stack[--stackPtr]; - unsigned newNodeParent = stack[--stackPtr]; + uint32_t newNodeParent = stack[--stackPtr]; alt2Node[newNodeParent].right = newAlt2Node; } else @@ -1442,7 +1549,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet else if (from == WALD_32BYTE && to == VERBOSE) { // allocate space - unsigned spaceNeeded = triCount * (refittable ? 2 : 3); // this one needs space to grow to 2N + uint32_t spaceNeeded = triCount * (refittable ? 2 : 3); // this one needs space to grow to 2N if (allocatedVerbose < spaceNeeded) { FATAL_ERROR_IF( bvhNode == 0, "BVH::Convert( WALD_32BYTE, VERBOSE ), bvhNode == 0." 
); @@ -1453,7 +1560,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet memset( verbose, 0, sizeof( BVHNodeVerbose ) * spaceNeeded ); verbose[0].parent = 0xffffffff; // root sentinel // convert - unsigned nodeIdx = 0, parent = 0xffffffff, stack[128], stackPtr = 0; + uint32_t nodeIdx = 0, parent = 0xffffffff, stack[128], stackPtr = 0; while (1) { const BVHNode& node = bvhNode[nodeIdx]; @@ -1481,7 +1588,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet else if (from == WALD_32BYTE && to == BASIC_BVH4) { // allocate space - const unsigned spaceNeeded = usedBVHNodes; + const uint32_t spaceNeeded = usedBVHNodes; if (allocatedBVH4Nodes < spaceNeeded) { FATAL_ERROR_IF( bvhNode == 0, "BVH::Convert( WALD_32BYTE, BASIC_BVH4 ), bvhNode == 0." ); @@ -1491,7 +1598,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } memset( bvh4Node, 0, sizeof( BVHNode4 ) * spaceNeeded ); // create an mbvh node for each bvh2 node - for (unsigned i = 0; i < usedBVHNodes; i++) if (i != 1) + for (uint32_t i = 0; i < usedBVHNodes; i++) if (i != 1) { BVHNode& orig = bvhNode[i]; BVHNode4& node4 = bvh4Node[i]; @@ -1500,15 +1607,15 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet else node4.child[0] = orig.leftFirst, node4.child[1] = orig.leftFirst + 1, node4.childCount = 2; } // collapse - unsigned stack[128], stackPtr = 1, nodeIdx = stack[0] = 0; // i.e., root node + uint32_t stack[128], stackPtr = 1, nodeIdx = stack[0] = 0; // i.e., root node while (1) { BVHNode4& node = bvh4Node[nodeIdx]; while (node.childCount < 4) { - int bestChild = -1; + int32_t bestChild = -1; float bestChildSA = 0; - for (unsigned i = 0; i < node.childCount; i++) + for (uint32_t i = 0; i < node.childCount; i++) { // see if we can adopt child i const BVHNode4& child = bvh4Node[node.child[i]]; @@ -1521,13 +1628,13 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* 
delet if (bestChild == -1) break; // could not adopt const BVHNode4& child = bvh4Node[node.child[bestChild]]; node.child[bestChild] = child.child[0]; - for (unsigned i = 1; i < child.childCount; i++) + for (uint32_t i = 1; i < child.childCount; i++) node.child[node.childCount++] = child.child[i]; } // we're done with the node; proceed with the children - for (unsigned i = 0; i < node.childCount; i++) + for (uint32_t i = 0; i < node.childCount; i++) { - const unsigned childIdx = node.child[i]; + const uint32_t childIdx = node.child[i]; const BVHNode4& child = bvh4Node[childIdx]; if (!child.isLeaf()) stack[stackPtr++] = childIdx; } @@ -1546,7 +1653,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet // Leaf: 15 bits for tri count, 16 for offset // Interior: 32 bits for position of child node. // Triangle data ('by value') immediately follows each leaf node. - unsigned blocksNeeded = usedBVH4Nodes * 4; // here, 'block' is 16 bytes. + uint32_t blocksNeeded = usedBVH4Nodes * 4; // here, 'block' is 16 bytes. blocksNeeded += 6 * triCount; // this layout stores tris in the same buffer. if (allocatedAlt4aBlocks < blocksNeeded) { @@ -1557,14 +1664,14 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } memset( bvh4Alt, 0, 16 * blocksNeeded ); // start conversion - unsigned nodeIdx = 0, newAlt4Ptr = 0, stack[128], stackPtr = 0, retValPos = 0; + uint32_t nodeIdx = 0, newAlt4Ptr = 0, stack[128], stackPtr = 0, retValPos = 0; while (1) { const BVHNode4& node = bvh4Node[nodeIdx]; // convert BVH4 node - must be an interior node. 
assert( !bvh4Node[nodeIdx].isLeaf() ); bvhvec4* nodeBase = bvh4Alt + newAlt4Ptr; - unsigned baseAlt4Ptr = newAlt4Ptr; + uint32_t baseAlt4Ptr = newAlt4Ptr; newAlt4Ptr += 4; nodeBase[0] = bvhvec4( node.aabbMin, 0 ); nodeBase[1] = bvhvec4( (node.aabbMax - node.aabbMin) * (1.0f / 255.0f), 0 ); @@ -1573,17 +1680,17 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet &bvh4Node[node.child[2]], &bvh4Node[node.child[3]] }; // start with leaf child node conversion - unsigned childInfo[4] = { 0, 0, 0, 0 }; // will store in final fields later - for (int i = 0; i < 4; i++) if (childNode[i]->isLeaf()) + uint32_t childInfo[4] = { 0, 0, 0, 0 }; // will store in final fields later + for (int32_t i = 0; i < 4; i++) if (childNode[i]->isLeaf()) { childInfo[i] = newAlt4Ptr - baseAlt4Ptr; childInfo[i] |= childNode[i]->triCount << 16; childInfo[i] |= 0x80000000; - for (unsigned j = 0; j < childNode[i]->triCount; j++) + for (uint32_t j = 0; j < childNode[i]->triCount; j++) { - unsigned t = triIdx[childNode[i]->firstTri + j]; + uint32_t t = triIdx[childNode[i]->firstTri + j]; #ifdef BVH4_GPU_COMPRESSED_TRIS - PrecomputeTriangle( verts + t * 3, (float*)&bvh4Alt[newAlt4Ptr] ); + PrecomputeTriangle( verts, t * 3, (float*)&bvh4Alt[newAlt4Ptr] ); bvh4Alt[newAlt4Ptr + 3] = bvhvec4( 0, 0, 0, *(float*)&t ); newAlt4Ptr += 4; #else @@ -1597,12 +1704,12 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } } // process interior nodes - for (int i = 0; i < 4; i++) if (!childNode[i]->isLeaf()) + for (int32_t i = 0; i < 4; i++) if (!childNode[i]->isLeaf()) { // childInfo[i] = node.child[i] == 0 ? 
0 : GPUFormatBVH4( node.child[i] ); if (node.child[i] == 0) childInfo[i] = 0; else { - stack[stackPtr++] = (unsigned)(((float*)&nodeBase[3] + i) - (float*)bvh4Alt); + stack[stackPtr++] = (uint32_t)(((float*)&nodeBase[3] + i) - (float*)bvh4Alt); stack[stackPtr++] = node.child[i]; } } @@ -1612,36 +1719,36 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet scale.x = extent.x > 1e-10f ? (254.999f / extent.x) : 0; scale.y = extent.y > 1e-10f ? (254.999f / extent.y) : 0; scale.z = extent.z > 1e-10f ? (254.999f / extent.z) : 0; - unsigned char* slot0 = (unsigned char*)&nodeBase[0] + 12; // 4 chars - unsigned char* slot1 = (unsigned char*)&nodeBase[1] + 12; // 4 chars - unsigned char* slot2 = (unsigned char*)&nodeBase[2]; // 16 chars + uint8_t* slot0 = (uint8_t*)&nodeBase[0] + 12; // 4 chars + uint8_t* slot1 = (uint8_t*)&nodeBase[1] + 12; // 4 chars + uint8_t* slot2 = (uint8_t*)&nodeBase[2]; // 16 chars if (node.child[0]) { const bvhvec3 relBMin = childNode[0]->aabbMin - node.aabbMin, relBMax = childNode[0]->aabbMax - node.aabbMin; - slot0[0] = (unsigned char)floorf( relBMin.x * scale.x ), slot1[0] = (unsigned char)ceilf( relBMax.x * scale.x ); - slot2[0] = (unsigned char)floorf( relBMin.y * scale.y ), slot2[4] = (unsigned char)ceilf( relBMax.y * scale.y ); - slot2[8] = (unsigned char)floorf( relBMin.z * scale.z ), slot2[12] = (unsigned char)ceilf( relBMax.z * scale.z ); + slot0[0] = (uint8_t)floorf( relBMin.x * scale.x ), slot1[0] = (uint8_t)ceilf( relBMax.x * scale.x ); + slot2[0] = (uint8_t)floorf( relBMin.y * scale.y ), slot2[4] = (uint8_t)ceilf( relBMax.y * scale.y ); + slot2[8] = (uint8_t)floorf( relBMin.z * scale.z ), slot2[12] = (uint8_t)ceilf( relBMax.z * scale.z ); } if (node.child[1]) { const bvhvec3 relBMin = childNode[1]->aabbMin - node.aabbMin, relBMax = childNode[1]->aabbMax - node.aabbMin; - slot0[1] = (unsigned char)floorf( relBMin.x * scale.x ), slot1[1] = (unsigned char)ceilf( relBMax.x * scale.x ); - slot2[1] = (unsigned 
char)floorf( relBMin.y * scale.y ), slot2[5] = (unsigned char)ceilf( relBMax.y * scale.y ); - slot2[9] = (unsigned char)floorf( relBMin.z * scale.z ), slot2[13] = (unsigned char)ceilf( relBMax.z * scale.z ); + slot0[1] = (uint8_t)floorf( relBMin.x * scale.x ), slot1[1] = (uint8_t)ceilf( relBMax.x * scale.x ); + slot2[1] = (uint8_t)floorf( relBMin.y * scale.y ), slot2[5] = (uint8_t)ceilf( relBMax.y * scale.y ); + slot2[9] = (uint8_t)floorf( relBMin.z * scale.z ), slot2[13] = (uint8_t)ceilf( relBMax.z * scale.z ); } if (node.child[2]) { const bvhvec3 relBMin = childNode[2]->aabbMin - node.aabbMin, relBMax = childNode[2]->aabbMax - node.aabbMin; - slot0[2] = (unsigned char)floorf( relBMin.x * scale.x ), slot1[2] = (unsigned char)ceilf( relBMax.x * scale.x ); - slot2[2] = (unsigned char)floorf( relBMin.y * scale.y ), slot2[6] = (unsigned char)ceilf( relBMax.y * scale.y ); - slot2[10] = (unsigned char)floorf( relBMin.z * scale.z ), slot2[14] = (unsigned char)ceilf( relBMax.z * scale.z ); + slot0[2] = (uint8_t)floorf( relBMin.x * scale.x ), slot1[2] = (uint8_t)ceilf( relBMax.x * scale.x ); + slot2[2] = (uint8_t)floorf( relBMin.y * scale.y ), slot2[6] = (uint8_t)ceilf( relBMax.y * scale.y ); + slot2[10] = (uint8_t)floorf( relBMin.z * scale.z ), slot2[14] = (uint8_t)ceilf( relBMax.z * scale.z ); } if (node.child[3]) { const bvhvec3 relBMin = childNode[3]->aabbMin - node.aabbMin, relBMax = childNode[3]->aabbMax - node.aabbMin; - slot0[3] = (unsigned char)floorf( relBMin.x * scale.x ), slot1[3] = (unsigned char)ceilf( relBMax.x * scale.x ); - slot2[3] = (unsigned char)floorf( relBMin.y * scale.y ), slot2[7] = (unsigned char)ceilf( relBMax.y * scale.y ); - slot2[11] = (unsigned char)floorf( relBMin.z * scale.z ), slot2[15] = (unsigned char)ceilf( relBMax.z * scale.z ); + slot0[3] = (uint8_t)floorf( relBMin.x * scale.x ), slot1[3] = (uint8_t)ceilf( relBMax.x * scale.x ); + slot2[3] = (uint8_t)floorf( relBMin.y * scale.y ), slot2[7] = (uint8_t)ceilf( relBMax.y * scale.y ); + 
slot2[11] = (uint8_t)floorf( relBMin.z * scale.z ), slot2[15] = (uint8_t)ceilf( relBMax.z * scale.z ); } // finalize node nodeBase[3] = bvhvec4( @@ -1649,7 +1756,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet *(float*)&childInfo[2], *(float*)&childInfo[3] ); // pop new work from the stack - if (retValPos > 0) ((unsigned*)bvh4Alt)[retValPos] = baseAlt4Ptr; + if (retValPos > 0) ((uint32_t*)bvh4Alt)[retValPos] = baseAlt4Ptr; if (stackPtr == 0) break; nodeIdx = stack[--stackPtr]; retValPos = stack[--stackPtr]; @@ -1661,7 +1768,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet // Convert a 4-wide BVH to a format suitable for CPU traversal. // See Faster Incoherent Ray Traversal Using 8-Wide AVX InstructionsLayout, // Atilla T. Áfra, 2013. - unsigned spaceNeeded = usedBVH4Nodes; + uint32_t spaceNeeded = usedBVH4Nodes; if (allocatedAlt4bNodes < spaceNeeded) { FATAL_ERROR_IF( bvh4Node == 0, "BVH::Convert( BASIC_BVH4, BVH4_AFRA ), bvh4Node == 0." 
); @@ -1673,13 +1780,13 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } memset( bvh4Alt2, 0, spaceNeeded * sizeof( BVHNode4Alt2 ) ); // start conversion - unsigned newAlt4Ptr = 0, nodeIdx = 0, stack[128], stackPtr = 0; + uint32_t newAlt4Ptr = 0, nodeIdx = 0, stack[128], stackPtr = 0; while (1) { const BVHNode4& orig = bvh4Node[nodeIdx]; BVHNode4Alt2& newNode = bvh4Alt2[newAlt4Ptr++]; - int cidx = 0; - for (int i = 0; i < 4; i++) if (orig.child[i]) + int32_t cidx = 0; + for (int32_t i = 0; i < 4; i++) if (orig.child[i]) { const BVHNode4& child = bvh4Node[orig.child[i]]; ((float*)&newNode.xmin4)[cidx] = child.aabbMin.x; @@ -1692,7 +1799,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet newNode.childFirst[cidx] = child.firstTri, newNode.triCount[cidx] = child.triCount; else - stack[stackPtr++] = (unsigned)((unsigned*)&newNode.childFirst[cidx] - (unsigned*)bvh4Alt2), + stack[stackPtr++] = (uint32_t)((uint32_t*)&newNode.childFirst[cidx] - (uint32_t*)bvh4Alt2), stack[stackPtr++] = orig.child[i]; cidx++; } @@ -1705,28 +1812,28 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet // pop next task if (!stackPtr) break; nodeIdx = stack[--stackPtr]; - unsigned offset = stack[--stackPtr]; - ((unsigned*)bvh4Alt2)[offset] = newAlt4Ptr; + uint32_t offset = stack[--stackPtr]; + ((uint32_t*)bvh4Alt2)[offset] = newAlt4Ptr; } // Convert index list: store primitives 'by value'. // This also allows us to compact and reorder them for best performance. 
stackPtr = 0, nodeIdx = 0; - unsigned triPtr = 0; + uint32_t triPtr = 0; while (1) { BVHNode4Alt2& node = bvh4Alt2[nodeIdx]; - for (int i = 0; i < 4; i++) if (node.triCount[i] + node.childFirst[i] > 0) + for (int32_t i = 0; i < 4; i++) if (node.triCount[i] + node.childFirst[i] > 0) { if (!node.triCount[i]) stack[stackPtr++] = node.childFirst[i]; else { - unsigned first = node.childFirst[i]; - unsigned count = node.triCount[i]; + uint32_t first = node.childFirst[i]; + uint32_t count = node.triCount[i]; node.childFirst[i] = triPtr; // assign vertex data - for (unsigned j = 0; j < count; j++) + for (uint32_t j = 0; j < count; j++) { - unsigned fi = triIdx[first + j]; - PrecomputeTriangle( verts + fi * 3, (float*)&bvh4Tris[triPtr] ); + uint32_t fi = triIdx[first + j]; + PrecomputeTriangle( verts, fi * 3, (float*)&bvh4Tris[triPtr] ); bvh4Tris[triPtr + 3] = bvhvec4( 0, 0, 0, *(float*)&fi ); triPtr += 4; } @@ -1737,13 +1844,35 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } usedAlt4bNodes = newAlt4Ptr; } + else if (from == BASIC_BVH4 && to == BVH4_WIVE) + { + // Convert a 4-wide BVH to a format suitable for CPU traversal. + // See "Accelerated Single Ray Tracing for Wide Vector Units", + // Fuetterling et al., 2017. + uint32_t spaceNeeded = usedBVH4Nodes; + if (allocatedAlt4bNodes < spaceNeeded) + { + FATAL_ERROR_IF( bvh4Node == 0, "BVH::Convert( BASIC_BVH4, BVH4_WIVE ), bvh4Node == 0." ); + AlignedFree( bvh4WiVe ); + bvh4WiVe = (BVHNode4WiVe*)AlignedAlloc( spaceNeeded * sizeof( BVHNode4WiVe ) ); + allocatedWiVeNodes = spaceNeeded; + } + memset( bvh4WiVe, 0, spaceNeeded * sizeof( BVHNode4WiVe ) ); + // start conversion + // uint32_t newNodePtr = 0, nodeIdx = 0, stack[128], stackPtr = 0; + // while (1) + // { + // const BVHNode4& orig = bvh4Node[nodeIdx]; + // .. TODO - under construction. 
+ // } + } else if (from == WALD_32BYTE && to == BASIC_BVH8) { // allocate space // Note: The safe upper bound here is usedBVHNodes when converting an existing // BVH2, but we need triCount * 2 to be safe in later conversions, e.g. to // CWBVH, which may further split some leaf nodes. - const unsigned spaceNeeded = triCount * 2; + const uint32_t spaceNeeded = triCount * 2; if (allocatedBVH8Nodes < spaceNeeded) { FATAL_ERROR_IF( bvhNode == 0, "BVH::Convert( WALD_32BYTE, BASIC_BVH8 ), bvhNode == 0." ); @@ -1753,7 +1882,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } memset( bvh8Node, 0, sizeof( BVHNode8 ) * spaceNeeded ); // create an mbvh node for each bvh2 node - for (unsigned i = 0; i < usedBVHNodes; i++) if (i != 1) + for (uint32_t i = 0; i < usedBVHNodes; i++) if (i != 1) { BVHNode& orig = bvhNode[i]; BVHNode8& node8 = bvh8Node[i]; @@ -1762,15 +1891,15 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet else node8.child[0] = orig.leftFirst, node8.child[1] = orig.leftFirst + 1, node8.childCount = 2; } // collapse - unsigned stack[128], stackPtr = 1, nodeIdx = stack[0] = 0; // i.e., root node + uint32_t stack[128], stackPtr = 1, nodeIdx = stack[0] = 0; // i.e., root node while (1) { BVHNode8& node = bvh8Node[nodeIdx]; while (node.childCount < 8) { - int bestChild = -1; + int32_t bestChild = -1; float bestChildSA = 0; - for (unsigned i = 0; i < node.childCount; i++) + for (uint32_t i = 0; i < node.childCount; i++) { // see if we can adopt child i const BVHNode8& child = bvh8Node[node.child[i]]; @@ -1783,13 +1912,13 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet if (bestChild == -1) break; // could not adopt const BVHNode8& child = bvh8Node[node.child[bestChild]]; node.child[bestChild] = child.child[0]; - for (unsigned i = 1; i < child.childCount; i++) + for (uint32_t i = 1; i < child.childCount; i++) node.child[node.childCount++] = child.child[i]; } // we're 
done with the node; proceed with the children - for (unsigned i = 0; i < node.childCount; i++) + for (uint32_t i = 0; i < node.childCount; i++) { - const unsigned childIdx = node.child[i]; + const uint32_t childIdx = node.child[i]; const BVHNode8& child = bvh8Node[childIdx]; if (!child.isLeaf()) stack[stackPtr++] = childIdx; } @@ -1806,10 +1935,10 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet FATAL_ERROR_IF( bvh8Node[0].isLeaf(), "BVH::Convert( BASIC_BVH8, CWBVH ), collapsing single-node bvh." ); // allocate memory // Note: This can be far lower (specifically: usedBVH8Nodes) if we know that - // none of the BVH8 leafs has more than three primitives. - // Without this guarantee, the only safe upper limit is triCount * 2, since + // none of the BVH8 leafs has more than three primitives. + // Without this guarantee, the only safe upper limit is triCount * 2, since // we will be splitting fat BVH8 leafs to as we go. - unsigned spaceNeeded = triCount * 2 * 5; // CWBVH nodes use 80 bytes each. + uint32_t spaceNeeded = triCount * 2 * 5; // CWBVH nodes use 80 bytes each. if (spaceNeeded > allocatedCWBVHBlocks) { FATAL_ERROR_IF( bvh8Node == 0, "BVH::Convert( BASIC_BVH8, CWBVH ), bvh8Node == 0." 
); @@ -1820,20 +1949,20 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet memset( bvh8Compact, 0, spaceNeeded * 16 ); memset( bvh8Tris, 0, idxCount * 3 * 16 ); BVHNode8* stackNodePtr[256]; - unsigned stackNodeAddr[256], stackPtr = 1, nodeDataPtr = 5, triDataPtr = 0; + uint32_t stackNodeAddr[256], stackPtr = 1, nodeDataPtr = 5, triDataPtr = 0; stackNodePtr[0] = &bvh8Node[0], stackNodeAddr[0] = 0; // start conversion while (stackPtr > 0) { BVHNode8* node = stackNodePtr[--stackPtr]; - const int currentNodeAddr = stackNodeAddr[stackPtr]; + const int32_t currentNodeAddr = stackNodeAddr[stackPtr]; bvhvec3 nodeLo = node->aabbMin, nodeHi = node->aabbMax; // greedy child node ordering const bvhvec3 nodeCentroid = (nodeLo + nodeHi) * 0.5f; float cost[8][8]; - int assignment[8]; + int32_t assignment[8]; bool isSlotEmpty[8]; - for (int s = 0; s < 8; s++) + for (int32_t s = 0; s < 8; s++) { isSlotEmpty[s] = true, assignment[s] = -1; bvhvec3 ds( @@ -1841,7 +1970,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet (((s >> 1) & 1) == 1) ? -1.0f : 1.0f, (((s >> 0) & 1) == 1) ? 
-1.0f : 1.0f ); - for (int i = 0; i < 8; i++) if (node->child[i] == 0) cost[s][i] = BVH_FAR; else + for (int32_t i = 0; i < 8; i++) if (node->child[i] == 0) cost[s][i] = BVH_FAR; else { BVHNode8* const child = &bvh8Node[node->child[i]]; if (child->triCount > 3 /* must be leaf */) SplitBVH8Leaf( node->child[i], 1 ); @@ -1852,67 +1981,67 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet while (1) { float minCost = BVH_FAR; - int minEntryx = -1, minEntryy = -1; - for (int s = 0; s < 8; s++) for (int i = 0; i < 8; i++) + int32_t minEntryx = -1, minEntryy = -1; + for (int32_t s = 0; s < 8; s++) for (int32_t i = 0; i < 8; i++) if (assignment[i] == -1 && isSlotEmpty[s] && cost[s][i] < minCost) minCost = cost[s][i], minEntryx = s, minEntryy = i; if (minEntryx == -1 && minEntryy == -1) break; isSlotEmpty[minEntryx] = false, assignment[minEntryy] = minEntryx; } - for (int i = 0; i < 8; i++) if (assignment[i] == -1) for (int s = 0; s < 8; s++) if (isSlotEmpty[s]) + for (int32_t i = 0; i < 8; i++) if (assignment[i] == -1) for (int32_t s = 0; s < 8; s++) if (isSlotEmpty[s]) { isSlotEmpty[s] = false, assignment[i] = s; break; } const BVHNode8 oldNode = *node; - for (int i = 0; i < 8; i++) node->child[assignment[i]] = oldNode.child[i]; + for (int32_t i = 0; i < 8; i++) node->child[assignment[i]] = oldNode.child[i]; // calculate quantization parameters for each axis - const int ex = (int)((char)ceilf( log2f( (nodeHi.x - nodeLo.x) / 255.0f ) )); - const int ey = (int)((char)ceilf( log2f( (nodeHi.y - nodeLo.y) / 255.0f ) )); - const int ez = (int)((char)ceilf( log2f( (nodeHi.z - nodeLo.z) / 255.0f ) )); + const int32_t ex = (int32_t)((int8_t)ceilf( log2f( (nodeHi.x - nodeLo.x) / 255.0f ) )); + const int32_t ey = (int32_t)((int8_t)ceilf( log2f( (nodeHi.y - nodeLo.y) / 255.0f ) )); + const int32_t ez = (int32_t)((int8_t)ceilf( log2f( (nodeHi.z - nodeLo.z) / 255.0f ) )); // encode output - int internalChildCount = 0, leafChildTriCount = 0, 
childBaseIndex = 0, triangleBaseIndex = 0; - unsigned char imask = 0; - for (int i = 0; i < 8; i++) + int32_t internalChildCount = 0, leafChildTriCount = 0, childBaseIndex = 0, triangleBaseIndex = 0; + uint8_t imask = 0; + for (int32_t i = 0; i < 8; i++) { if (node->child[i] == 0) continue; BVHNode8* const child = &bvh8Node[node->child[i]]; - const int qlox = (int)floorf( (child->aabbMin.x - nodeLo.x) / powf( 2, (float)ex ) ); - const int qloy = (int)floorf( (child->aabbMin.y - nodeLo.y) / powf( 2, (float)ey ) ); - const int qloz = (int)floorf( (child->aabbMin.z - nodeLo.z) / powf( 2, (float)ez ) ); - const int qhix = (int)ceilf( (child->aabbMax.x - nodeLo.x) / powf( 2, (float)ex ) ); - const int qhiy = (int)ceilf( (child->aabbMax.y - nodeLo.y) / powf( 2, (float)ey ) ); - const int qhiz = (int)ceilf( (child->aabbMax.z - nodeLo.z) / powf( 2, (float)ez ) ); - unsigned char* const baseAddr = (unsigned char*)&bvh8Compact[currentNodeAddr + 2]; - baseAddr[i + 0] = (unsigned char)qlox, baseAddr[i + 24] = (unsigned char)qhix; - baseAddr[i + 8] = (unsigned char)qloy, baseAddr[i + 32] = (unsigned char)qhiy; - baseAddr[i + 16] = (unsigned char)qloz, baseAddr[i + 40] = (unsigned char)qhiz; + const int32_t qlox = (int32_t)floorf( (child->aabbMin.x - nodeLo.x) / powf( 2, (float)ex ) ); + const int32_t qloy = (int32_t)floorf( (child->aabbMin.y - nodeLo.y) / powf( 2, (float)ey ) ); + const int32_t qloz = (int32_t)floorf( (child->aabbMin.z - nodeLo.z) / powf( 2, (float)ez ) ); + const int32_t qhix = (int32_t)ceilf( (child->aabbMax.x - nodeLo.x) / powf( 2, (float)ex ) ); + const int32_t qhiy = (int32_t)ceilf( (child->aabbMax.y - nodeLo.y) / powf( 2, (float)ey ) ); + const int32_t qhiz = (int32_t)ceilf( (child->aabbMax.z - nodeLo.z) / powf( 2, (float)ez ) ); + uint8_t* const baseAddr = (uint8_t*)&bvh8Compact[currentNodeAddr + 2]; + baseAddr[i + 0] = (uint8_t)qlox, baseAddr[i + 24] = (uint8_t)qhix; + baseAddr[i + 8] = (uint8_t)qloy, baseAddr[i + 32] = (uint8_t)qhiy; + baseAddr[i + 16] 
= (uint8_t)qloz, baseAddr[i + 40] = (uint8_t)qhiz; if (!child->isLeaf()) { // interior node, set params and push onto stack - const int childNodeAddr = nodeDataPtr; + const int32_t childNodeAddr = nodeDataPtr; if (internalChildCount++ == 0) childBaseIndex = childNodeAddr / 5; nodeDataPtr += 5, imask |= 1 << i; // set the meta field - This calculation assumes children are stored contiguously. - unsigned char* const childMetaField = ((unsigned char*)&bvh8Compact[currentNodeAddr + 1]) + 8; - childMetaField[i] = (1 << 5) | (24 + (unsigned char)i); // I don't see how this accounts for empty children? + uint8_t* const childMetaField = ((uint8_t*)&bvh8Compact[currentNodeAddr + 1]) + 8; + childMetaField[i] = (1 << 5) | (24 + (uint8_t)i); // I don't see how this accounts for empty children? stackNodePtr[stackPtr] = child, stackNodeAddr[stackPtr++] = childNodeAddr; // counted in float4s internalChildCount++; continue; } // leaf node - const unsigned tcount = tinybvh_min( child->triCount, 3u ); // TODO: ensure that's the case; clamping for now. + const uint32_t tcount = tinybvh_min( child->triCount, 3u ); // TODO: ensure that's the case; clamping for now. if (leafChildTriCount == 0) triangleBaseIndex = triDataPtr; - int unaryEncodedTriCount = tcount == 1 ? 0b001 : tcount == 2 ? 0b011 : 0b111; + int32_t unaryEncodedTriCount = tcount == 1 ? 0b001 : tcount == 2 ? 0b011 : 0b111; // set the meta field - This calculation assumes children are stored contiguously. 
- unsigned char* const childMetaField = ((unsigned char*)&bvh8Compact[currentNodeAddr + 1]) + 8; - childMetaField[i] = (unsigned char)((unaryEncodedTriCount << 5) | leafChildTriCount); + uint8_t* const childMetaField = ((uint8_t*)&bvh8Compact[currentNodeAddr + 1]) + 8; + childMetaField[i] = (uint8_t)((unaryEncodedTriCount << 5) | leafChildTriCount); leafChildTriCount += tcount; - for (unsigned j = 0; j < tcount; j++) + for (uint32_t j = 0; j < tcount; j++) { - int primitiveIndex = triIdx[child->firstTri + j]; + int32_t primitiveIndex = triIdx[child->firstTri + j]; #ifdef CWBVH_COMPRESSED_TRIS - PrecomputeTriangle( verts + primitiveIndex * 3, (float*)&bvh8Tris[triDataPtr] ); + PrecomputeTriangle( verts, +primitiveIndex * 3, (float*)&bvh8Tris[triDataPtr] ); bvh8Tris[triDataPtr + 3] = bvhvec4( 0, 0, 0, *(float*)&primitiveIndex ); triDataPtr += 4; #else @@ -1924,7 +2053,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet #endif } } - unsigned char exyzAndimask[4] = { *(unsigned char*)&ex, *(unsigned char*)&ey, *(unsigned char*)&ez, imask }; + uint8_t exyzAndimask[4] = { *(uint8_t*)&ex, *(uint8_t*)&ey, *(uint8_t*)&ez, imask }; bvh8Compact[currentNodeAddr + 0] = bvhvec4( nodeLo, *(float*)&exyzAndimask ); bvh8Compact[currentNodeAddr + 1].x = *(float*)&childBaseIndex; bvh8Compact[currentNodeAddr + 1].y = *(float*)&triangleBaseIndex; @@ -1934,7 +2063,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet else if (from == VERBOSE && to == WALD_32BYTE) { // allocate space - const unsigned spaceNeeded = usedVerboseNodes; + const uint32_t spaceNeeded = usedVerboseNodes; if (allocatedBVHNodes < spaceNeeded) { FATAL_ERROR_IF( verbose == 0, "BVH::Convert( VERBOSE, WALD_32BYTE ), verbose == 0." 
); @@ -1944,8 +2073,8 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } memset( bvhNode, 0, sizeof( BVHNode ) * spaceNeeded ); // start conversion - unsigned srcNodeIdx = 0, dstNodeIdx = 0, newNodePtr = 2; - unsigned srcStack[64], dstStack[64], stackPtr = 0; + uint32_t srcNodeIdx = 0, dstNodeIdx = 0, newNodePtr = 2; + uint32_t srcStack[64], dstStack[64], stackPtr = 0; while (1) { const BVHNodeVerbose& srcNode = verbose[srcNodeIdx]; @@ -1962,7 +2091,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet else { bvhNode[dstNodeIdx].leftFirst = newNodePtr; - unsigned srcRightIdx = srcNode.right; + uint32_t srcRightIdx = srcNode.right; srcNodeIdx = srcNode.left, dstNodeIdx = newNodePtr++; srcStack[stackPtr] = srcRightIdx; dstStack[stackPtr++] = newNodePtr++; @@ -1977,12 +2106,12 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } } -int BVH::NodeCount( const BVHLayout layout ) const +int32_t BVH::NodeCount( const BVHLayout layout ) const { // Determine the number of nodes in the tree. Typically the result should - // be usedBVHNodes - 1 (second node is always unused), but some builders may + // be usedBVHNodes - 1 (second node is always unused), but some builders may // have unused nodes besides node 1. TODO: Support more layouts. - unsigned retVal = 0, nodeIdx = 0, stack[64], stackPtr = 0; + uint32_t retVal = 0, nodeIdx = 0, stack[64], stackPtr = 0; if (layout == WALD_32BYTE) { FATAL_ERROR_IF( bvhNode == 0, "BVH::NodeCount( WALD_32BYTE ), bvhNode == 0." ); @@ -2012,22 +2141,22 @@ int BVH::NodeCount( const BVHLayout layout ) const // includes trees waving in the wind, or subsequent frames for skinned // animations. Repeated refitting tends to lead to deteriorated BVHs and // slower ray tracing. Rebuild when this happens. 
-void BVH::Refit( const BVHLayout layout, const unsigned nodeIdx ) +void BVH::Refit( const BVHLayout layout, const uint32_t nodeIdx ) { FATAL_ERROR_IF( !refittable, "BVH::Refit( .. ), refitting an SBVH." ); if (layout == WALD_32BYTE) { FATAL_ERROR_IF( bvhNode == 0, "BVH::Refit( WALD_32BYTE ), bvhNode == 0." ); FATAL_ERROR_IF( may_have_holes, "BVH::Refit( WALD_32BYTE ), bvh may have holes." ); - for (int i = usedBVHNodes - 1; i >= 0; i--) + for (int32_t i = usedBVHNodes - 1; i >= 0; i--) { BVHNode& node = bvhNode[i]; if (node.isLeaf()) // leaf: adjust to current triangle vertex positions { bvhvec4 aabbMin( BVH_FAR ), aabbMax( -BVH_FAR ); - for (unsigned first = node.leftFirst, j = 0; j < node.triCount; j++) + for (uint32_t first = node.leftFirst, j = 0; j < node.triCount; j++) { - const unsigned vertIdx = triIdx[first + j] * 3; + const uint32_t vertIdx = triIdx[first + j] * 3; aabbMin = tinybvh_min( aabbMin, verts[vertIdx] ), aabbMax = tinybvh_max( aabbMax, verts[vertIdx] ); aabbMin = tinybvh_min( aabbMin, verts[vertIdx + 1] ), aabbMax = tinybvh_max( aabbMax, verts[vertIdx + 1] ); aabbMin = tinybvh_min( aabbMin, verts[vertIdx + 2] ), aabbMax = tinybvh_max( aabbMax, verts[vertIdx + 2] ); @@ -2049,9 +2178,9 @@ void BVH::Refit( const BVHLayout layout, const unsigned nodeIdx ) if (node.isLeaf()) // leaf: adjust to current triangle vertex positions { bvhvec4 aabbMin( BVH_FAR ), aabbMax( -BVH_FAR ); - for (unsigned first = node.firstTri, j = 0; j < node.triCount; j++) + for (uint32_t first = node.firstTri, j = 0; j < node.triCount; j++) { - const unsigned vertIdx = triIdx[first + j] * 3; + const uint32_t vertIdx = triIdx[first + j] * 3; aabbMin = tinybvh_min( aabbMin, verts[vertIdx] ), aabbMax = tinybvh_max( aabbMax, verts[vertIdx] ); aabbMin = tinybvh_min( aabbMin, verts[vertIdx + 1] ), aabbMax = tinybvh_max( aabbMax, verts[vertIdx + 1] ); aabbMin = tinybvh_min( aabbMin, verts[vertIdx + 2] ), aabbMax = tinybvh_max( aabbMax, verts[vertIdx + 2] ); @@ -2084,14 +2213,14 @@ 
void BVH::Compact( const BVHLayout layout ) FATAL_ERROR_IF( bvhNode == 0, "BVH::Compact( WALD_32BYTE ), bvhNode == 0." ); BVHNode* tmp = (BVHNode*)AlignedAlloc( sizeof( BVHNode ) * usedBVHNodes ); memcpy( tmp, bvhNode, 2 * sizeof( BVHNode ) ); - unsigned newNodePtr = 2, nodeIdx = 0, stack[64], stackPtr = 0; + uint32_t newNodePtr = 2, nodeIdx = 0, stack[64], stackPtr = 0; while (1) { BVHNode& node = tmp[nodeIdx]; const BVHNode& left = bvhNode[node.leftFirst]; const BVHNode& right = bvhNode[node.leftFirst + 1]; tmp[newNodePtr] = left, tmp[newNodePtr + 1] = right; - const unsigned todo1 = newNodePtr, todo2 = newNodePtr + 1; + const uint32_t todo1 = newNodePtr, todo2 = newNodePtr + 1; node.leftFirst = newNodePtr, newNodePtr += 2; if (!left.isLeaf()) stack[stackPtr++] = todo1; if (!right.isLeaf()) stack[stackPtr++] = todo2; @@ -2110,14 +2239,14 @@ void BVH::Compact( const BVHLayout layout ) FATAL_ERROR_IF( verbose == 0, "BVH::Compact( VERBOSE ), verbose == 0." ); BVHNodeVerbose* tmp = (BVHNodeVerbose*)AlignedAlloc( sizeof( BVHNodeVerbose ) * usedVerboseNodes ); memcpy( tmp, verbose, 2 * sizeof( BVHNodeVerbose ) ); - unsigned newNodePtr = 2, nodeIdx = 0, stack[64], stackPtr = 0; + uint32_t newNodePtr = 2, nodeIdx = 0, stack[64], stackPtr = 0; while (1) { BVHNodeVerbose& node = tmp[nodeIdx]; const BVHNodeVerbose& left = verbose[node.left]; const BVHNodeVerbose& right = verbose[node.right]; tmp[newNodePtr] = left, tmp[newNodePtr + 1] = right; - const unsigned todo1 = newNodePtr, todo2 = newNodePtr + 1; + const uint32_t todo1 = newNodePtr, todo2 = newNodePtr + 1; node.left = newNodePtr++, node.right = newNodePtr++; if (!left.isLeaf()) stack[stackPtr++] = todo1; if (!right.isLeaf()) stack[stackPtr++] = todo2; @@ -2138,10 +2267,10 @@ void BVH::Compact( const BVHLayout layout ) // necessary to have a single primitive per leaf, it will yield a slightly better // optimized BVH. The leafs of the optimized BVH should be collapsed ('MergeLeafs') // to obtain the final tree. 
-void BVH::SplitLeafs( const unsigned maxPrims ) +void BVH::SplitLeafs( const uint32_t maxPrims ) { FATAL_ERROR_IF( verbose == 0, "BVH::SplitLeafs(), requires VERBOSE bvh." ); - unsigned nodeIdx = 0, stack[64], stackPtr = 0; + uint32_t nodeIdx = 0, stack[64], stackPtr = 0; float fragMinFix = frag_min_flipped ? -1.0f : 1.0f; while (1) { @@ -2151,8 +2280,8 @@ void BVH::SplitLeafs( const unsigned maxPrims ) // split this leaf if (node.triCount > maxPrims) { - const unsigned newIdx1 = usedVerboseNodes++; - const unsigned newIdx2 = usedVerboseNodes++; + const uint32_t newIdx1 = usedVerboseNodes++; + const uint32_t newIdx2 = usedVerboseNodes++; BVHNodeVerbose& new1 = verbose[newIdx1], & new2 = verbose[newIdx2]; new1.firstTri = node.firstTri; new1.triCount = node.triCount / 2; @@ -2163,11 +2292,11 @@ void BVH::SplitLeafs( const unsigned maxPrims ) new2.left = new2.right = 0; node.left = newIdx1, node.right = newIdx2, node.triCount = 0; new1.aabbMin = new2.aabbMin = BVH_FAR, new1.aabbMax = new2.aabbMax = -BVH_FAR; - for (unsigned fi, i = 0; i < new1.triCount; i++) + for (uint32_t fi, i = 0; i < new1.triCount; i++) fi = triIdx[new1.firstTri + i], new1.aabbMin = tinybvh_min( new1.aabbMin, fragment[fi].bmin * fragMinFix ), new1.aabbMax = tinybvh_max( new1.aabbMax, fragment[fi].bmax ); - for (unsigned fi, i = 0; i < new2.triCount; i++) + for (uint32_t fi, i = 0; i < new2.triCount; i++) fi = triIdx[new2.firstTri + i], new2.aabbMin = tinybvh_min( new2.aabbMin, fragment[fi].bmin * fragMinFix ), new2.aabbMax = tinybvh_max( new2.aabbMax, fragment[fi].bmax ); @@ -2183,7 +2312,7 @@ void BVH::SplitLeafs( const unsigned maxPrims ) // SplitBVH8Leaf: CWBVH requires that a leaf has no more than 3 primitives, // but regular BVH construction does not guarantee this. So, here we split // busy leafs recursively in multiple leaves, until the requirement is met. 
-void BVH::SplitBVH8Leaf( const unsigned nodeIdx, const unsigned maxPrims ) +void BVH::SplitBVH8Leaf( const uint32_t nodeIdx, const uint32_t maxPrims ) { float fragMinFix = frag_min_flipped ? -1.0f : 1.0f; BVHNode8& node = bvh8Node[nodeIdx]; @@ -2192,7 +2321,7 @@ void BVH::SplitBVH8Leaf( const unsigned nodeIdx, const unsigned maxPrims ) BVHNode8& firstChild = bvh8Node[node.child[0] = usedBVH8Nodes++]; firstChild.triCount = node.triCount; firstChild.firstTri = node.firstTri; - unsigned nextChild = 1; + uint32_t nextChild = 1; // share with new sibling nodes while (firstChild.triCount > maxPrims && nextChild < 8) { @@ -2201,14 +2330,14 @@ void BVH::SplitBVH8Leaf( const unsigned nodeIdx, const unsigned maxPrims ) child.firstTri = firstChild.firstTri + firstChild.triCount; nextChild++; } - for (unsigned i = 0; i < nextChild; i++) + for (uint32_t i = 0; i < nextChild; i++) { BVHNode8& child = bvh8Node[node.child[i]]; if (!refittable) child.aabbMin = node.aabbMin, child.aabbMax = node.aabbMax; else { // TODO: why is this producing wrong aabbs for SBVH? child.aabbMin = bvhvec3( BVH_FAR ), child.aabbMax = bvhvec3( -BVH_FAR ); - for (unsigned fi, j = 0; j < child.triCount; j++) + for (uint32_t fi, j = 0; j < child.triCount; j++) fi = triIdx[child.firstTri + j], child.aabbMin = tinybvh_min( child.aabbMin, fragment[fi].bmin * fragMinFix ), child.aabbMax = tinybvh_max( child.aabbMax, fragment[fi].bmax ); @@ -2225,17 +2354,17 @@ void BVH::MergeLeafs() { // allocate some working space FATAL_ERROR_IF( verbose == 0, "BVH::MergeLeafs(), requires VERBOSE bvh." 
); - unsigned* subtreeTriCount = (unsigned*)AlignedAlloc( usedVerboseNodes * 4 ); - unsigned* newIdx = (unsigned*)AlignedAlloc( idxCount * 4 ); + uint32_t* subtreeTriCount = (uint32_t*)AlignedAlloc( usedVerboseNodes * 4 ); + uint32_t* newIdx = (uint32_t*)AlignedAlloc( idxCount * 4 ); memset( subtreeTriCount, 0, usedVerboseNodes * 4 ); CountSubtreeTris( 0, subtreeTriCount ); - unsigned stack[64], stackPtr = 0, nodeIdx = 0, newIdxPtr = 0; + uint32_t stack[64], stackPtr = 0, nodeIdx = 0, newIdxPtr = 0; while (1) { BVHNodeVerbose& node = verbose[nodeIdx]; if (node.isLeaf()) { - unsigned start = newIdxPtr; + uint32_t start = newIdxPtr; MergeSubtree( nodeIdx, newIdx, newIdxPtr ); node.firstTri = start; // pop new task @@ -2244,9 +2373,9 @@ void BVH::MergeLeafs() } else { - const unsigned leftCount = subtreeTriCount[node.left]; - const unsigned rightCount = subtreeTriCount[node.right]; - const unsigned mergedCount = leftCount + rightCount; + const uint32_t leftCount = subtreeTriCount[node.left]; + const uint32_t rightCount = subtreeTriCount[node.right]; + const uint32_t mergedCount = leftCount + rightCount; // cost of unsplit float Cunsplit = SA( node.aabbMin, node.aabbMax ) * mergedCount * C_INT; // cost of leaving things as they are @@ -2259,7 +2388,7 @@ void BVH::MergeLeafs() if (Cunsplit <= Ckeepsplit) { // collapse the subtree - unsigned start = newIdxPtr; + uint32_t start = newIdxPtr; MergeSubtree( nodeIdx, newIdx, newIdxPtr ); node.firstTri = start; node.triCount = mergedCount; @@ -2285,18 +2414,18 @@ void BVH::MergeLeafs() // Optimizing a BVH: BVH must be in 'verbose' format. // Implements "Fast Insertion-Based Optimization of Bounding Volume Hierarchies", -void BVH::Optimize( const unsigned iterations ) +void BVH::Optimize( const uint32_t iterations ) { // Optimize by reinserting a random subtree. // Suggested iteration count: ~1M for best results. // TODO: Implement Section 3.4 of the paper to speed up the process. 
FATAL_ERROR_IF( verbose == 0, "BVH::Optimize(), requires VERBOSE bvh." ); - for (unsigned i = 0; i < iterations; i++) + for (uint32_t i = 0; i < iterations; i++) { - unsigned Nid, valid = 0; + uint32_t Nid, valid = 0; do { - static unsigned seed = 0x12345678; + static uint32_t seed = 0x12345678; seed ^= seed << 13, seed ^= seed >> 17, seed ^= seed << 5; // xor32 valid = 1, Nid = 2 + seed % (usedVerboseNodes - 2); if (verbose[Nid].parent == 0 || verbose[Nid].isLeaf()) valid = 0; @@ -2304,12 +2433,12 @@ void BVH::Optimize( const unsigned iterations ) } while (valid == 0); // snip it loose const BVHNodeVerbose& N = verbose[Nid], & P = verbose[N.parent]; - const unsigned Pid = N.parent, X1 = P.parent; - const unsigned X2 = P.left == Nid ? P.right : P.left; + const uint32_t Pid = N.parent, X1 = P.parent; + const uint32_t X2 = P.left == Nid ? P.right : P.left; if (verbose[X1].left == Pid) verbose[X1].left = X2; else /* verbose[X1].right == Pid */ verbose[X1].right = X2; verbose[X2].parent = X1; - unsigned L = N.left, R = N.right; + uint32_t L = N.left, R = N.right; // fix affected node bounds RefitUpVerbose( X1 ); ReinsertNodeVerbose( L, Pid, X1 ); @@ -2321,7 +2450,7 @@ void BVH::Optimize( const unsigned iterations ) // This function returns the intersection details in Ray::hit. Additionally, // the number of steps through the BVH is returned. Visualize this to get a // visual impression of the structure of the BVH. -int BVH::Intersect( Ray& ray, const BVHLayout layout ) const +int32_t BVH::Intersect( Ray& ray, const BVHLayout layout ) const { switch (layout) { @@ -2356,11 +2485,13 @@ int BVH::Intersect( Ray& ray, const BVHLayout layout ) const FATAL_ERROR_IF( bvh4Alt2 == 0, "BVH::Intersect( .. , BVH4_AFRA ), bvh not available." ); return Intersect_Afra( ray ); break; + #if defined BVH_USEAVX case CWBVH: FATAL_ERROR_IF( bvh8Compact == 0, "BVH::Intersect( .. , CWBVH ), bvh not available." 
); return Intersect_CWBVH( ray ); break; #endif + #endif default: FATAL_ERROR_IF( true, "BVH::Intersect( .. , ? ), unsupported bvh layout." ); break; @@ -2368,15 +2499,15 @@ int BVH::Intersect( Ray& ray, const BVHLayout layout ) const return 0; } -void BVH::BatchIntersect( Ray* rayBatch, const unsigned N, const BVHLayout layout, const TraceDevice /* device */ ) const +void BVH::BatchIntersect( Ray* rayBatch, const uint32_t N, const BVHLayout layout, const TraceDevice /* device */ ) const { - for (unsigned i = 0; i < N; i++) Intersect( rayBatch[i], layout ); + for (uint32_t i = 0; i < N; i++) Intersect( rayBatch[i], layout ); } // Detect if a ray is occluded / shadow ray query. -// Unlike Intersect, this function only returns a yes/no answer: Yes if any -// geometry blocks it (taking into account ray length); no if the ray can -// travel the specified distance without encountering anything. +// Unlike Intersect, this function only returns a yes/no answer: Yes if any +// geometry blocks it (taking into account ray length); no if the ray can +// travel the specified distance without encountering anything. bool BVH::IsOccluded( const Ray& ray, const BVHLayout layout ) const { switch (layout) @@ -2404,25 +2535,25 @@ bool BVH::IsOccluded( const Ray& ray, const BVHLayout layout ) const // For now this exists only to establish the interface. // A future implementation will exploit the batch to trace the rays faster. // BatchIsOccluded returns the hits as a bit array in result: -// Each unsigned integer in this array stores 32 hits. -void BVH::BatchIsOccluded( Ray* rayBatch, const unsigned N, unsigned* result, const BVHLayout layout, const TraceDevice /* device */ ) const +// Each uint32_t integer in this array stores 32 hits. 
+void BVH::BatchIsOccluded( Ray* rayBatch, const uint32_t N, uint32_t* result, const BVHLayout layout, const TraceDevice /* device */ ) const { - unsigned words = (N + 31 /* round up */) / 32; + uint32_t words = (N + 31 /* round up */) / 32; memset( result, 0, words * 4 ); - for (unsigned i = 0; i < N; i++) if (IsOccluded( rayBatch[i], layout )) result[i >> 5] |= 1 << (i & 31); + for (uint32_t i = 0; i < N; i++) if (IsOccluded( rayBatch[i], layout )) result[i >> 5] |= 1 << (i & 31); } // Traverse the default BVH layout (WALD_32BYTE). -int BVH::Intersect_Wald32Byte( Ray& ray ) const +int32_t BVH::Intersect_Wald32Byte( Ray& ray ) const { BVHNode* node = &bvhNode[0], * stack[64]; - unsigned stackPtr = 0, steps = 0; + uint32_t stackPtr = 0, steps = 0; while (1) { steps++; if (node->isLeaf()) { - for (unsigned i = 0; i < node->triCount; i++) IntersectTri( ray, triIdx[node->leftFirst + i] ); + for (uint32_t i = 0; i < node->triCount; i++) IntersectTri( ray, triIdx[node->leftFirst + i] ); if (stackPtr == 0) break; else node = stack[--stackPtr]; continue; } @@ -2446,15 +2577,15 @@ int BVH::Intersect_Wald32Byte( Ray& ray ) const bool BVH::IsOccluded_Wald32Byte( const Ray& ray ) const { BVHNode* node = &bvhNode[0], * stack[64]; - unsigned stackPtr = 0; + uint32_t stackPtr = 0; while (1) { if (node->isLeaf()) { - for (unsigned i = 0; i < node->triCount; i++) + for (uint32_t i = 0; i < node->triCount; i++) { // Moeller-Trumbore ray/triangle intersection algorithm - const unsigned vertIdx = triIdx[node->leftFirst + i] * 3; + const uint32_t vertIdx = triIdx[node->leftFirst + i] * 3; const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; const bvhvec3 h = cross( ray.D, edge2 ); @@ -2491,16 +2622,16 @@ bool BVH::IsOccluded_Wald32Byte( const Ray& ray ) const } // Traverse the alternative BVH layout (AILA_LAINE). 
-int BVH::Intersect_AilaLaine( Ray& ray ) const +int32_t BVH::Intersect_AilaLaine( Ray& ray ) const { BVHNodeAlt* node = &altNode[0], * stack[64]; - unsigned stackPtr = 0, steps = 0; + uint32_t stackPtr = 0, steps = 0; while (1) { steps++; if (node->isLeaf()) { - for (unsigned i = 0; i < node->triCount; i++) IntersectTri( ray, triIdx[node->firstTri + i] ); + for (uint32_t i = 0; i < node->triCount; i++) IntersectTri( ray, triIdx[node->firstTri + i] ); if (stackPtr == 0) break; else node = stack[--stackPtr]; continue; } @@ -2515,11 +2646,11 @@ int BVH::Intersect_AilaLaine( Ray& ray ) const const float tmaxb = tinybvh_min( tinybvh_min( tinybvh_max( t1b.x, t2b.x ), tinybvh_max( t1b.y, t2b.y ) ), tinybvh_max( t1b.z, t2b.z ) ); if (tmaxa >= tmina && tmina < ray.hit.t && tmaxa >= 0) dist1 = tmina; if (tmaxb >= tminb && tminb < ray.hit.t && tmaxb >= 0) dist2 = tminb; - unsigned lidx = node->left, ridx = node->right; + uint32_t lidx = node->left, ridx = node->right; if (dist1 > dist2) { float t = dist1; dist1 = dist2; dist2 = t; - unsigned i = lidx; lidx = ridx; ridx = i; + uint32_t i = lidx; lidx = ridx; ridx = i; } if (dist1 == BVH_FAR) { @@ -2538,14 +2669,14 @@ int BVH::Intersect_AilaLaine( Ray& ray ) const bool BVH::IsOccluded_AilaLaine( const Ray& ray ) const { BVHNodeAlt* node = &altNode[0], * stack[64]; - unsigned stackPtr = 0; + uint32_t stackPtr = 0; while (1) { if (node->isLeaf()) { - for (unsigned i = 0; i < node->triCount; i++) + for (uint32_t i = 0; i < node->triCount; i++) { - const unsigned vertIdx = triIdx[node->firstTri + i] * 3; + const uint32_t vertIdx = triIdx[node->firstTri + i] * 3; const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; const bvhvec3 h = cross( ray.D, edge2 ); @@ -2575,11 +2706,11 @@ bool BVH::IsOccluded_AilaLaine( const Ray& ray ) const const float tmaxb = tinybvh_min( tinybvh_min( tinybvh_max( t1b.x, t2b.x ), tinybvh_max( t1b.y, t2b.y ) ), tinybvh_max( t1b.z, t2b.z ) ); 
if (tmaxa >= tmina && tmina < ray.hit.t && tmaxa >= 0) dist1 = tmina; if (tmaxb >= tminb && tminb < ray.hit.t && tmaxb >= 0) dist2 = tminb; - unsigned lidx = node->left, ridx = node->right; + uint32_t lidx = node->left, ridx = node->right; if (dist1 > dist2) { float t = dist1; dist1 = dist2; dist2 = t; - unsigned i = lidx; lidx = ridx; ridx = i; + uint32_t i = lidx; lidx = ridx; ridx = i; } if (dist1 == BVH_FAR) { @@ -2595,16 +2726,16 @@ bool BVH::IsOccluded_AilaLaine( const Ray& ray ) const } // Intersect_BasicBVH4. For testing the converted data only; not efficient. -int BVH::Intersect_BasicBVH4( Ray& ray ) const +int32_t BVH::Intersect_BasicBVH4( Ray& ray ) const { BVHNode4* node = &bvh4Node[0], * stack[64]; - unsigned stackPtr = 0, steps = 0; + uint32_t stackPtr = 0, steps = 0; while (1) { steps++; - if (node->isLeaf()) for (unsigned i = 0; i < node->triCount; i++) + if (node->isLeaf()) for (uint32_t i = 0; i < node->triCount; i++) IntersectTri( ray, triIdx[node->firstTri + i] ); - else for (unsigned i = 0; i < node->childCount; i++) + else for (uint32_t i = 0; i < node->childCount; i++) { BVHNode4* child = bvh4Node + node->child[i]; float dist = IntersectAABB( ray, child->aabbMin, child->aabbMax ); @@ -2616,16 +2747,16 @@ int BVH::Intersect_BasicBVH4( Ray& ray ) const } // Intersect_BasicBVH8. For testing the converted data only; not efficient. 
-int BVH::Intersect_BasicBVH8( Ray& ray ) const +int32_t BVH::Intersect_BasicBVH8( Ray& ray ) const { BVHNode8* node = &bvh8Node[0], * stack[512]; - unsigned stackPtr = 0, steps = 0; + uint32_t stackPtr = 0, steps = 0; while (1) { steps++; - if (node->isLeaf()) for (unsigned i = 0; i < node->triCount; i++) + if (node->isLeaf()) for (uint32_t i = 0; i < node->triCount; i++) IntersectTri( ray, triIdx[node->firstTri + i] ); - else for (unsigned i = 0; i < 8; i++) if (node->child[i]) + else for (uint32_t i = 0; i < 8; i++) if (node->child[i]) { BVHNode8* child = bvh8Node + node->child[i]; float dist = IntersectAABB( ray, child->aabbMin, child->aabbMax ); @@ -2639,14 +2770,14 @@ int BVH::Intersect_BasicBVH8( Ray& ray ) const // IntersectAlt4Nodes. For testing the converted data only; not efficient. // This code replicates how traversal on GPU happens. #define SWAP(A,B,C,D) t=A,A=B,B=t,t2=C,C=D,D=t2; -struct uchar4 { unsigned char x, y, z, w; }; +struct uchar4 { uint8_t x, y, z, w; }; static uchar4 as_uchar4( const float v ) { union { float t; uchar4 t4; }; t = v; return t4; } -static unsigned as_uint( const float v ) { return *(unsigned*)&v; } -int BVH::Intersect_Alt4BVH( Ray& ray ) const +static uint32_t as_uint( const float v ) { return *(uint32_t*)&v; } +int32_t BVH::Intersect_Alt4BVH( Ray& ray ) const { // traverse a blas - unsigned offset = 0, stack[128], stackPtr = 0, t2 /* for SWAP macro */; - unsigned steps = 0; + uint32_t offset = 0, stack[128], stackPtr = 0, t2 /* for SWAP macro */; + uint32_t steps = 0; while (1) { steps++; @@ -2682,16 +2813,16 @@ int BVH::Intersect_Alt4BVH( Ray& ray ) const float dist0 = tmina > tmaxa ? BVH_FAR : tmina, dist1 = tminb > tmaxb ? BVH_FAR : tminb; float dist2 = tminc > tmaxc ? BVH_FAR : tminc, dist3 = tmind > tmaxd ? 
BVH_FAR : tmind, t; // get child node info fields - unsigned c0info = as_uint( data3.x ), c1info = as_uint( data3.y ); - unsigned c2info = as_uint( data3.z ), c3info = as_uint( data3.w ); + uint32_t c0info = as_uint( data3.x ), c1info = as_uint( data3.y ); + uint32_t c2info = as_uint( data3.z ), c3info = as_uint( data3.w ); if (dist0 < dist2) SWAP( dist0, dist2, c0info, c2info ); if (dist1 < dist3) SWAP( dist1, dist3, c1info, c3info ); if (dist0 < dist1) SWAP( dist0, dist1, c0info, c1info ); if (dist2 < dist3) SWAP( dist2, dist3, c2info, c3info ); if (dist1 < dist2) SWAP( dist1, dist2, c1info, c2info ); // process results, starting with farthest child, so nearest ends on top of stack - unsigned nextNode = 0; - unsigned leaf[4] = { 0, 0, 0, 0 }, leafs = 0; + uint32_t nextNode = 0; + uint32_t leaf[4] = { 0, 0, 0, 0 }, leafs = 0; if (dist0 < BVH_FAR) { if (c0info & 0x80000000) leaf[leafs++] = c0info; else if (c0info) stack[stackPtr++] = c0info; @@ -2709,11 +2840,11 @@ int BVH::Intersect_Alt4BVH( Ray& ray ) const if (c3info & 0x80000000) leaf[leafs++] = c3info; else if (c3info) stack[stackPtr++] = c3info; } // process encountered leafs, if any - for (unsigned i = 0; i < leafs; i++) + for (uint32_t i = 0; i < leafs; i++) { - const unsigned N = (leaf[i] >> 16) & 0x7fff; - unsigned triStart = offset + (leaf[i] & 0xffff); - for (unsigned j = 0; j < N; j++, triStart += 3) + const uint32_t N = (leaf[i] >> 16) & 0x7fff; + uint32_t triStart = offset + (leaf[i] & 0xffff); + for (uint32_t j = 0; j < N; j++, triStart += 3) { const bvhvec3 edge2 = bvhvec3( bvh4Alt[triStart + 2] ); const bvhvec3 edge1 = bvhvec3( bvh4Alt[triStart + 1] ); @@ -2745,7 +2876,7 @@ int BVH::Intersect_Alt4BVH( Ray& ray ) const } // Intersect a WALD_32BYTE BVH with a ray packet. -// The 256 rays travel together to better utilize the caches and to amortize the cost +// The 256 rays travel together to better utilize the caches and to amortize the cost // of memory transfers over the rays in the bundle. 
// Note that this basic implementation assumes a specific layout of the rays. Provided // as 'proof of concept', should not be used in production code. @@ -2768,27 +2899,27 @@ void BVH::Intersect256Rays( Ray* packet ) const const bvhvec3 plane1 = normalize( cross( p3 - O, p3 - p1 ) ); // right plane const bvhvec3 plane2 = normalize( cross( p1 - O, p1 - p0 ) ); // top plane const bvhvec3 plane3 = normalize( cross( p2 - O, p2 - p3 ) ); // bottom plane - const int sign0x = plane0.x < 0 ? 4 : 0, sign0y = plane0.y < 0 ? 5 : 1, sign0z = plane0.z < 0 ? 6 : 2; - const int sign1x = plane1.x < 0 ? 4 : 0, sign1y = plane1.y < 0 ? 5 : 1, sign1z = plane1.z < 0 ? 6 : 2; - const int sign2x = plane2.x < 0 ? 4 : 0, sign2y = plane2.y < 0 ? 5 : 1, sign2z = plane2.z < 0 ? 6 : 2; - const int sign3x = plane3.x < 0 ? 4 : 0, sign3y = plane3.y < 0 ? 5 : 1, sign3z = plane3.z < 0 ? 6 : 2; + const int32_t sign0x = plane0.x < 0 ? 4 : 0, sign0y = plane0.y < 0 ? 5 : 1, sign0z = plane0.z < 0 ? 6 : 2; + const int32_t sign1x = plane1.x < 0 ? 4 : 0, sign1y = plane1.y < 0 ? 5 : 1, sign1z = plane1.z < 0 ? 6 : 2; + const int32_t sign2x = plane2.x < 0 ? 4 : 0, sign2y = plane2.y < 0 ? 5 : 1, sign2z = plane2.z < 0 ? 6 : 2; + const int32_t sign3x = plane3.x < 0 ? 4 : 0, sign3y = plane3.y < 0 ? 5 : 1, sign3z = plane3.z < 0 ? 
6 : 2; const float d0 = dot( O, plane0 ), d1 = dot( O, plane1 ); const float d2 = dot( O, plane2 ), d3 = dot( O, plane3 ); // Traverse the tree with the packet - int first = 0, last = 255; // first and last active ray in the packet + int32_t first = 0, last = 255; // first and last active ray in the packet const BVHNode* node = &bvhNode[0]; - ALIGNED( 64 ) unsigned stack[64], stackPtr = 0; + ALIGNED( 64 ) uint32_t stack[64], stackPtr = 0; while (1) { if (node->isLeaf()) { // handle leaf node - for (unsigned j = 0; j < node->triCount; j++) + for (uint32_t j = 0; j < node->triCount; j++) { - const unsigned idx = triIdx[node->leftFirst + j], vid = idx * 3; + const uint32_t idx = triIdx[node->leftFirst + j], vid = idx * 3; const bvhvec3 edge1 = verts[vid + 1] - verts[vid], edge2 = verts[vid + 2] - verts[vid]; const bvhvec3 s = O - bvhvec3( verts[vid] ); - for (int i = first; i <= last; i++) + for (int32_t i = first; i <= last; i++) { Ray& ray = packet[i]; const bvhvec3 h = cross( ray.D, edge2 ); @@ -2814,7 +2945,7 @@ void BVH::Intersect256Rays( Ray* packet ) const const BVHNode* left = bvhNode + node->leftFirst; const BVHNode* right = bvhNode + node->leftFirst + 1; bool visitLeft = true, visitRight = true; - int leftFirst = first, leftLast = last, rightFirst = first, rightLast = last; + int32_t leftFirst = first, leftLast = last, rightFirst = first, rightLast = last; float distLeft, distRight; { // see if we want to intersect the left child @@ -2918,7 +3049,7 @@ void BVH::Intersect256Rays( Ray* packet ) const // ============================================================================ // // I M P L E M E N T A T I O N - A V X / S S E C O D E -// +// // ============================================================================ #ifdef BVH_USEAVX @@ -2966,52 +3097,54 @@ inline float halfArea( const __m256& a /* a contains aabb itself, with min.xyz n #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif -void BVH::BuildAVX( const 
bvhvec4* vertices, const unsigned primCount ) +void BVH::BuildAVX( const bvhvec4* vertices, const uint32_t primCount ) { - int test = BVHBINS; + FATAL_ERROR_IF( primCount == 0, "BVH::BuildAVX( .. ), primCount == 0." ); + int32_t test = BVHBINS; if (test != 8) assert( false ); // AVX builders require BVHBINS == 8. assert( ((long long)vertices & 63) == 0 ); // buffer must be cacheline-aligned // aligned data ALIGNED( 64 ) __m256 binbox[3 * BVHBINS]; // 768 bytes ALIGNED( 64 ) __m256 binboxOrig[3 * BVHBINS]; // 768 bytes - ALIGNED( 64 ) unsigned count[3][BVHBINS]{}; // 96 bytes + ALIGNED( 64 ) uint32_t count[3][BVHBINS]{}; // 96 bytes ALIGNED( 64 ) __m256 bestLBox, bestRBox; // 64 bytes // some constants static const __m128 max4 = _mm_set1_ps( -BVH_FAR ), half4 = _mm_set1_ps( 0.5f ); static const __m128 two4 = _mm_set1_ps( 2.0f ), min1 = _mm_set1_ps( -1 ); - static const __m256 max8 = _mm256_set1_ps( -BVH_FAR ); - static const __m256 signFlip8 = _mm256_setr_ps( -0.0f, -0.0f, -0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f ); + static const __m128i maxbin4 = _mm_set1_epi32( 7 ); static const __m128 signFlip4 = _mm_setr_ps( -0.0f, -0.0f, -0.0f, 0.0f ); static const __m128 mask3 = _mm_cmpeq_ps( _mm_setr_ps( 0, 0, 0, 1 ), _mm_setzero_ps() ); static const __m128 binmul3 = _mm_set1_ps( BVHBINS * 0.49999f ); - for (unsigned i = 0; i < 3 * BVHBINS; i++) binboxOrig[i] = max8; // binbox initialization template + static const __m256 max8 = _mm256_set1_ps( -BVH_FAR ); + static const __m256 signFlip8 = _mm256_setr_ps( -0.0f, -0.0f, -0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f ); + for (uint32_t i = 0; i < 3 * BVHBINS; i++) binboxOrig[i] = max8; // binbox initialization template // reset node pool - const unsigned spaceNeeded = primCount * 2; + const uint32_t spaceNeeded = primCount * 2; if (allocatedBVHNodes < spaceNeeded) { AlignedFree( bvhNode ); AlignedFree( triIdx ); AlignedFree( fragment ); - triIdx = (unsigned*)AlignedAlloc( primCount * sizeof( unsigned ) ); + triIdx = (uint32_t*)AlignedAlloc( 
primCount * sizeof( uint32_t ) ); bvhNode = (BVHNode*)AlignedAlloc( spaceNeeded * sizeof( BVHNode ) ); allocatedBVHNodes = spaceNeeded; memset( &bvhNode[1], 0, 32 ); // avoid crash in refit. fragment = (Fragment*)AlignedAlloc( primCount * sizeof( Fragment ) ); } else FATAL_ERROR_IF( !rebuildable, "BVH::BuildAVX( .. ), bvh not rebuildable." ); - verts = (bvhvec4*)vertices; // note: we're not copying this data; don't delete. + verts = bvhvec4slice{ vertices, primCount * 3 }; // note: we're not copying this data; don't delete. triCount = idxCount = primCount; - unsigned newNodePtr = 2; + uint32_t newNodePtr = 2; struct FragSSE { __m128 bmin4, bmax4; }; FragSSE* frag4 = (FragSSE*)fragment; __m256* frag8 = (__m256*)fragment; - const __m128* verts4 = (__m128*)verts; + const __m128* verts4 = (__m128*)vertices; // assign all triangles to the root node BVHNode& root = bvhNode[0]; root.leftFirst = 0, root.triCount = triCount; // initialize fragments and update root bounds __m128 rootMin = max4, rootMax = max4; - for (unsigned i = 0; i < triCount; i++) + for (uint32_t i = 0; i < triCount; i++) { const __m128 v1 = _mm_xor_ps( signFlip4, _mm_min_ps( _mm_min_ps( verts4[i * 3], verts4[i * 3 + 1] ), verts4[i * 3 + 2] ) ); const __m128 v2 = _mm_max_ps( _mm_max_ps( verts4[i * 3], verts4[i * 3 + 1] ), verts4[i * 3 + 2] ); @@ -3020,7 +3153,7 @@ void BVH::BuildAVX( const bvhvec4* vertices, const unsigned primCount ) rootMin = _mm_xor_ps( rootMin, signFlip4 ); root.aabbMin = *(bvhvec3*)&rootMin, root.aabbMax = *(bvhvec3*)&rootMax; // subdivide recursively - ALIGNED( 64 ) unsigned task[128], taskCount = 0, nodeIdx = 0; + ALIGNED( 64 ) uint32_t task[128], taskCount = 0, nodeIdx = 0; const bvhvec3 minDim = (root.aabbMax - root.aabbMin) * 1e-7f; while (1) { @@ -3034,26 +3167,28 @@ void BVH::BuildAVX( const bvhvec4* vertices, const unsigned primCount ) const __m128 rpd4 = _mm_and_ps( _mm_div_ps( binmul3, d4 ), _mm_cmpneq_ps( d4, _mm_setzero_ps() ) ); // implementation of Section 4.1 of 
"Parallel Spatial Splits in Bounding Volume Hierarchies": // main loop operates on two fragments to minimize dependencies and maximize ILP. - unsigned fi = triIdx[node.leftFirst]; + uint32_t fi = triIdx[node.leftFirst]; memset( count, 0, sizeof( count ) ); __m256 r0, r1, r2, f = frag8[fi]; - __m128i bi4 = _mm_cvtps_epi32( _mm_sub_ps( _mm_mul_ps( _mm_sub_ps( _mm_sub_ps( frag4[fi].bmax4, frag4[fi].bmin4 ), nmin4 ), rpd4 ), half4 ) ); + const __m128i bi4 = _mm_cvtps_epi32( _mm_sub_ps( _mm_mul_ps( _mm_sub_ps( _mm_sub_ps( frag4[fi].bmax4, frag4[fi].bmin4 ), nmin4 ), rpd4 ), half4 ) ); + const __m128i b4c = _mm_max_epi32( _mm_min_epi32( bi4, maxbin4 ), _mm_setzero_si128() ); // clamp needed after all memcpy( binbox, binboxOrig, sizeof( binbox ) ); - unsigned i0 = ILANE( bi4, 0 ), i1 = ILANE( bi4, 1 ), i2 = ILANE( bi4, 2 ), * ti = triIdx + node.leftFirst + 1; - for (unsigned i = 0; i < node.triCount - 1; i++) + uint32_t i0 = ILANE( b4c, 0 ), i1 = ILANE( b4c, 1 ), i2 = ILANE( b4c, 2 ), * ti = triIdx + node.leftFirst + 1; + for (uint32_t i = 0; i < node.triCount - 1; i++) { - unsigned fid = *ti++; + uint32_t fid = *ti++; #if defined __GNUC__ || _MSC_VER < 1920 if (fid > triCount) fid = triCount - 1; // never happens but g++ *and* vs2017 need this to not crash... 
#endif const __m256 b0 = binbox[i0], b1 = binbox[BVHBINS + i1], b2 = binbox[2 * BVHBINS + i2]; const __m128 fmin = frag4[fid].bmin4, fmax = frag4[fid].bmax4; r0 = _mm256_max_ps( b0, f ), r1 = _mm256_max_ps( b1, f ), r2 = _mm256_max_ps( b2, f ); - const __m128i b4 = _mm_cvtps_epi32( _mm_sub_ps( _mm_mul_ps( _mm_sub_ps( _mm_sub_ps( fmax, fmin ), nmin4 ), rpd4 ), half4 ) ); + const __m128i bi4 = _mm_cvtps_epi32( _mm_sub_ps( _mm_mul_ps( _mm_sub_ps( _mm_sub_ps( fmax, fmin ), nmin4 ), rpd4 ), half4 ) ); + const __m128i b4c = _mm_max_epi32( _mm_min_epi32( bi4, maxbin4 ), _mm_setzero_si128() ); // clamp needed after all f = frag8[fid], count[0][i0]++, count[1][i1]++, count[2][i2]++; - binbox[i0] = r0, i0 = ILANE( b4, 0 ); - binbox[BVHBINS + i1] = r1, i1 = ILANE( b4, 1 ); - binbox[2 * BVHBINS + i2] = r2, i2 = ILANE( b4, 2 ); + binbox[i0] = r0, i0 = ILANE( b4c, 0 ); + binbox[BVHBINS + i1] = r1, i1 = ILANE( b4c, 1 ); + binbox[2 * BVHBINS + i2] = r2, i2 = ILANE( b4c, 2 ); } // final business for final fragment const __m256 b0 = binbox[i0], b1 = binbox[BVHBINS + i1], b2 = binbox[2 * BVHBINS + i2]; @@ -3062,21 +3197,21 @@ void BVH::BuildAVX( const bvhvec4* vertices, const unsigned primCount ) binbox[i0] = r0, binbox[BVHBINS + i1] = r1, binbox[2 * BVHBINS + i2] = r2; // calculate per-split totals float splitCost = BVH_FAR, rSAV = 1.0f / node.SurfaceArea(); - unsigned bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst; + uint32_t bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst; const __m256* bb = binbox; - for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) + for (int32_t a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) { // hardcoded bin processing for BVHBINS == 8 assert( BVHBINS == 8 ); - const unsigned lN0 = count[a][0], rN0 = count[a][7]; + const uint32_t lN0 = count[a][0], rN0 = 
count[a][7]; const __m256 lb0 = bb[0], rb0 = bb[7]; - const unsigned lN1 = lN0 + count[a][1], rN1 = rN0 + count[a][6], lN2 = lN1 + count[a][2]; - const unsigned rN2 = rN1 + count[a][5], lN3 = lN2 + count[a][3], rN3 = rN2 + count[a][4]; + const uint32_t lN1 = lN0 + count[a][1], rN1 = rN0 + count[a][6], lN2 = lN1 + count[a][2]; + const uint32_t rN2 = rN1 + count[a][5], lN3 = lN2 + count[a][3], rN3 = rN2 + count[a][4]; const __m256 lb1 = _mm256_max_ps( lb0, bb[1] ), rb1 = _mm256_max_ps( rb0, bb[6] ); const __m256 lb2 = _mm256_max_ps( lb1, bb[2] ), rb2 = _mm256_max_ps( rb1, bb[5] ); const __m256 lb3 = _mm256_max_ps( lb2, bb[3] ), rb3 = _mm256_max_ps( rb2, bb[4] ); - const unsigned lN4 = lN3 + count[a][4], rN4 = rN3 + count[a][3], lN5 = lN4 + count[a][5]; - const unsigned rN5 = rN4 + count[a][2], lN6 = lN5 + count[a][6], rN6 = rN5 + count[a][1]; + const uint32_t lN4 = lN3 + count[a][4], rN4 = rN3 + count[a][3], lN5 = lN4 + count[a][5]; + const uint32_t rN5 = rN4 + count[a][2], lN6 = lN5 + count[a][6], rN6 = rN5 + count[a][1]; const __m256 lb4 = _mm256_max_ps( lb3, bb[4] ), rb4 = _mm256_max_ps( rb3, bb[3] ); const __m256 lb5 = _mm256_max_ps( lb4, bb[5] ), rb5 = _mm256_max_ps( rb4, bb[2] ); const __m256 lb6 = _mm256_max_ps( lb5, bb[6] ), rb6 = _mm256_max_ps( rb5, bb[1] ); @@ -3092,14 +3227,14 @@ void BVH::BuildAVX( const bvhvec4* vertices, const unsigned primCount ) if (splitCost >= noSplitCost) break; // not splitting is better. 
// in-place partition const float rpd = (*(bvhvec3*)&rpd4)[bestAxis], nmin = (*(bvhvec3*)&nmin4)[bestAxis]; - unsigned t, fr = triIdx[src]; - for (unsigned i = 0; i < node.triCount; i++) + uint32_t t, fr = triIdx[src]; + for (uint32_t i = 0; i < node.triCount; i++) { - const unsigned bi = (unsigned)((fragment[fr].bmax[bestAxis] - fragment[fr].bmin[bestAxis] - nmin) * rpd); + const uint32_t bi = (uint32_t)((fragment[fr].bmax[bestAxis] - fragment[fr].bmin[bestAxis] - nmin) * rpd); if (bi <= bestPos) fr = triIdx[++src]; else t = fr, fr = triIdx[src] = triIdx[--j], triIdx[j] = t; } // create child nodes and recurse - const unsigned leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; + const uint32_t leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; if (leftCount == 0 || rightCount == 0) break; // should not happen. *(__m256*)& bvhNode[n] = _mm256_xor_ps( bestLBox, signFlip8 ); bvhNode[n].leftFirst = node.leftFirst, bvhNode[n].triCount = leftCount; @@ -3142,27 +3277,27 @@ void BVH::Intersect256RaysSSE( Ray* packet ) const bvhvec3 plane1 = normalize( cross( p3 - O, p3 - p1 ) ); // right plane bvhvec3 plane2 = normalize( cross( p1 - O, p1 - p0 ) ); // top plane bvhvec3 plane3 = normalize( cross( p2 - O, p2 - p3 ) ); // bottom plane - int sign0x = plane0.x < 0 ? 4 : 0, sign0y = plane0.y < 0 ? 5 : 1, sign0z = plane0.z < 0 ? 6 : 2; - int sign1x = plane1.x < 0 ? 4 : 0, sign1y = plane1.y < 0 ? 5 : 1, sign1z = plane1.z < 0 ? 6 : 2; - int sign2x = plane2.x < 0 ? 4 : 0, sign2y = plane2.y < 0 ? 5 : 1, sign2z = plane2.z < 0 ? 6 : 2; - int sign3x = plane3.x < 0 ? 4 : 0, sign3y = plane3.y < 0 ? 5 : 1, sign3z = plane3.z < 0 ? 6 : 2; + int32_t sign0x = plane0.x < 0 ? 4 : 0, sign0y = plane0.y < 0 ? 5 : 1, sign0z = plane0.z < 0 ? 6 : 2; + int32_t sign1x = plane1.x < 0 ? 4 : 0, sign1y = plane1.y < 0 ? 5 : 1, sign1z = plane1.z < 0 ? 6 : 2; + int32_t sign2x = plane2.x < 0 ? 4 : 0, sign2y = plane2.y < 0 ? 5 : 1, sign2z = plane2.z < 0 ? 
6 : 2; + int32_t sign3x = plane3.x < 0 ? 4 : 0, sign3y = plane3.y < 0 ? 5 : 1, sign3z = plane3.z < 0 ? 6 : 2; float t0 = dot( O, plane0 ), t1 = dot( O, plane1 ); float t2 = dot( O, plane2 ), t3 = dot( O, plane3 ); // Traverse the tree with the packet - int first = 0, last = 255; // first and last active ray in the packet + int32_t first = 0, last = 255; // first and last active ray in the packet BVHNode* node = &bvhNode[0]; - ALIGNED( 64 ) unsigned stack[64], stackPtr = 0; + ALIGNED( 64 ) uint32_t stack[64], stackPtr = 0; while (1) { if (node->isLeaf()) { // handle leaf node - for (unsigned j = 0; j < node->triCount; j++) + for (uint32_t j = 0; j < node->triCount; j++) { - const unsigned idx = triIdx[node->leftFirst + j], vid = idx * 3; + const uint32_t idx = triIdx[node->leftFirst + j], vid = idx * 3; const bvhvec3 edge1 = verts[vid + 1] - verts[vid], edge2 = verts[vid + 2] - verts[vid]; const bvhvec3 s = O - bvhvec3( verts[vid] ); - for (int i = first; i <= last; i++) + for (int32_t i = first; i <= last; i++) { Ray& ray = packet[i]; const bvhvec3 h = cross( ray.D, edge2 ); @@ -3188,7 +3323,7 @@ void BVH::Intersect256RaysSSE( Ray* packet ) const BVHNode* left = bvhNode + node->leftFirst; BVHNode* right = bvhNode + node->leftFirst + 1; bool visitLeft = true, visitRight = true; - int leftFirst = first, leftLast = last, rightFirst = first, rightLast = last; + int32_t leftFirst = first, leftLast = last, rightFirst = first, rightLast = last; float distLeft, distRight; { // see if we want to intersect the left child @@ -3319,10 +3454,10 @@ void BVH::Intersect256RaysSSE( Ray* packet ) const } // Traverse the second alternative BVH layout (ALT_SOA). 
-int BVH::Intersect_AltSoA( Ray& ray ) const +int32_t BVH::Intersect_AltSoA( Ray& ray ) const { BVHNodeAlt2* node = &alt2Node[0], * stack[64]; - unsigned stackPtr = 0, steps = 0; + uint32_t stackPtr = 0, steps = 0; const __m128 Ox4 = _mm_set1_ps( ray.O.x ), rDx4 = _mm_set1_ps( ray.rD.x ); const __m128 Oy4 = _mm_set1_ps( ray.O.y ), rDy4 = _mm_set1_ps( ray.rD.y ); const __m128 Oz4 = _mm_set1_ps( ray.O.z ), rDz4 = _mm_set1_ps( ray.rD.z ); @@ -3331,9 +3466,9 @@ int BVH::Intersect_AltSoA( Ray& ray ) const steps++; if (node->isLeaf()) { - for (unsigned i = 0; i < node->triCount; i++) + for (uint32_t i = 0; i < node->triCount; i++) { - const unsigned tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3; + const uint32_t tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3; const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; const bvhvec3 h = cross( ray.D, edge2 ); @@ -3372,7 +3507,7 @@ int BVH::Intersect_AltSoA( Ray& ray ) const x4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); y4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); z4 = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); - unsigned lidx = node->left, ridx = node->right; + uint32_t lidx = node->left, ridx = node->right; const __m128 min4 = _mm_max_ps( _mm_max_ps( _mm_max_ps( x4, y4 ), z4 ), _mm_setzero_ps() ); const __m128 max4 = _mm_min_ps( _mm_min_ps( _mm_min_ps( x4, y4 ), z4 ), _mm_set1_ps( ray.hit.t ) ); #if 0 @@ -3392,7 +3527,7 @@ int BVH::Intersect_AltSoA( Ray& ray ) const if (dist1 > dist2) { float t = dist1; dist1 = dist2; dist2 = t; - unsigned i = lidx; lidx = ridx; ridx = i; + uint32_t i = lidx; lidx = ridx; ridx = i; } if (dist1 == BVH_FAR) { @@ -3411,7 +3546,7 @@ int BVH::Intersect_AltSoA( Ray& ray ) const bool BVH::IsOccluded_AltSoA( const Ray& ray ) const { BVHNodeAlt2* node = &alt2Node[0], * stack[64]; - unsigned stackPtr = 0; + uint32_t stackPtr = 0; const __m128 Ox4 = _mm_set1_ps( ray.O.x ), rDx4 = _mm_set1_ps( 
ray.rD.x ); const __m128 Oy4 = _mm_set1_ps( ray.O.y ), rDy4 = _mm_set1_ps( ray.rD.y ); const __m128 Oz4 = _mm_set1_ps( ray.O.z ), rDz4 = _mm_set1_ps( ray.rD.z ); @@ -3419,9 +3554,9 @@ bool BVH::IsOccluded_AltSoA( const Ray& ray ) const { if (node->isLeaf()) { - for (unsigned i = 0; i < node->triCount; i++) + for (uint32_t i = 0; i < node->triCount; i++) { - const unsigned tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3; + const uint32_t tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3; const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; const bvhvec3 h = cross( ray.D, edge2 ); @@ -3459,7 +3594,7 @@ bool BVH::IsOccluded_AltSoA( const Ray& ray ) const x4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); y4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); z4 = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); - unsigned lidx = node->left, ridx = node->right; + uint32_t lidx = node->left, ridx = node->right; const __m128 min4 = _mm_max_ps( _mm_max_ps( _mm_max_ps( x4, y4 ), z4 ), _mm_setzero_ps() ); const __m128 max4 = _mm_min_ps( _mm_min_ps( _mm_min_ps( x4, y4 ), z4 ), _mm_set1_ps( ray.hit.t ) ); #if 0 @@ -3479,7 +3614,7 @@ bool BVH::IsOccluded_AltSoA( const Ray& ray ) const if (dist1 > dist2) { float t = dist1; dist1 = dist2; dist2 = t; - unsigned i = lidx; lidx = ridx; ridx = i; + uint32_t i = lidx; lidx = ridx; ridx = i; } if (dist1 == BVH_FAR) { @@ -3499,19 +3634,7 @@ bool BVH::IsOccluded_AltSoA( const Ray& ray ) const // Not technically limited to BVH_USEAVX, but __lzcnt and __popcnt will require // exotic compiler flags (in combination with __builtin_ia32_lzcnt_u32), so... Since // this is just here to test data before it goes to the GPU: MSVC-only for now. 
-static unsigned __bfind( unsigned x ) // https://github.com/mackron/refcode/blob/master/lzcnt.c -{ -#if defined(_MSC_VER) && !defined(__clang__) - return 31 - __lzcnt( x ); -#elif defined(__EMSCRIPTEN__) - return 31 - __builtin_clz( x ); -#elif defined(__GNUC__) || defined(__clang__) - unsigned r; - __asm__ __volatile__( "lzcnt{l %1, %0| %0, %1}" : "=r"(r) : "r"(x) : "cc" ); - return 31 - r; -#endif -} -static unsigned __popc( unsigned x ) +static uint32_t __popc( uint32_t x ) { #if defined(_MSC_VER) && !defined(__clang__) return __popcnt( x ); @@ -3521,61 +3644,61 @@ static unsigned __popc( unsigned x ) } #define STACK_POP() { ngroup = traversalStack[--stackPtr]; } #define STACK_PUSH() { traversalStack[stackPtr++] = ngroup; } -static inline unsigned extract_byte( const unsigned i, const unsigned n ) { return (i >> (n * 8)) & 0xFF; } -static inline unsigned sign_extend_s8x4( const unsigned i ) +static inline uint32_t extract_byte( const uint32_t i, const uint32_t n ) { return (i >> (n * 8)) & 0xFF; } +static inline uint32_t sign_extend_s8x4( const uint32_t i ) { - // asm("prmt.b32 %0, %1, 0x0, 0x0000BA98;" : "=r"(v) : "r"(i)); // BA98: 1011`1010`1001`1000 - // with the given parameters, prmt will extend the sign to all bits in a byte. - unsigned b0 = (i & 0b10000000000000000000000000000000) ? 0xff000000 : 0; - unsigned b1 = (i & 0b00000000100000000000000000000000) ? 0x00ff0000 : 0; - unsigned b2 = (i & 0b00000000000000001000000000000000) ? 0x0000ff00 : 0; - unsigned b3 = (i & 0b00000000000000000000000010000000) ? 0x000000ff : 0; + // asm("prmt.b32 %0, %1, 0x0, 0x0000BA98;" : "=r"(v) : "r"(i)); // BA98: 1011`1010`1001`1000 + // with the given parameters, prmt will extend the sign to all bits in a byte. + uint32_t b0 = (i & 0b10000000000000000000000000000000) ? 0xff000000 : 0; + uint32_t b1 = (i & 0b00000000100000000000000000000000) ? 0x00ff0000 : 0; + uint32_t b2 = (i & 0b00000000000000001000000000000000) ? 
0x0000ff00 : 0; + uint32_t b3 = (i & 0b00000000000000000000000010000000) ? 0x000000ff : 0; return b0 + b1 + b2 + b3; // probably can do better than this. } -int BVH::Intersect_CWBVH( Ray& ray ) const +int32_t BVH::Intersect_CWBVH( Ray& ray ) const { bvhuint2 traversalStack[128]; - unsigned hitAddr = 0, stackPtr = 0; + uint32_t hitAddr = 0, stackPtr = 0; bvhvec2 triangleuv( 0, 0 ); const bvhvec4* blasNodes = bvh8Compact; const bvhvec4* blasTris = bvh8Tris; float tmin = 0, tmax = ray.hit.t; - const unsigned octinv = (7 - ((ray.D.x < 0 ? 4 : 0) | (ray.D.y < 0 ? 2 : 0) | (ray.D.z < 0 ? 1 : 0))) * 0x1010101; + const uint32_t octinv = (7 - ((ray.D.x < 0 ? 4 : 0) | (ray.D.y < 0 ? 2 : 0) | (ray.D.z < 0 ? 1 : 0))) * 0x1010101; bvhuint2 ngroup = bvhuint2( 0, 0b10000000000000000000000000000000 ), tgroup = bvhuint2( 0 ); do { if (ngroup.y > 0x00FFFFFF) { - const unsigned hits = ngroup.y, imask = ngroup.y; - const unsigned child_bit_index = __bfind( hits ); - const unsigned child_node_base_index = ngroup.x; + const uint32_t hits = ngroup.y, imask = ngroup.y; + const uint32_t child_bit_index = __bfind( hits ); + const uint32_t child_node_base_index = ngroup.x; ngroup.y &= ~(1 << child_bit_index); if (ngroup.y > 0x00FFFFFF) { STACK_PUSH( /* nodeGroup */ ); } { - const unsigned slot_index = (child_bit_index - 24) ^ (octinv & 255); - const unsigned relative_index = __popc( imask & ~(0xFFFFFFFF << slot_index) ); - const unsigned child_node_index = child_node_base_index + relative_index; + const uint32_t slot_index = (child_bit_index - 24) ^ (octinv & 255); + const uint32_t relative_index = __popc( imask & ~(0xFFFFFFFF << slot_index) ); + const uint32_t child_node_index = child_node_base_index + relative_index; const bvhvec4 n0 = blasNodes[child_node_index * 5 + 0], n1 = blasNodes[child_node_index * 5 + 1]; const bvhvec4 n2 = blasNodes[child_node_index * 5 + 2], n3 = blasNodes[child_node_index * 5 + 3]; const bvhvec4 n4 = blasNodes[child_node_index * 5 + 4], p = n0; bvhint3 e; - e.x 
= (int)*((char*)&n0.w + 0), e.y = (int)*((char*)&n0.w + 1), e.z = (int)*((char*)&n0.w + 2); + e.x = (int32_t)*((int8_t*)&n0.w + 0), e.y = (int32_t)*((int8_t*)&n0.w + 1), e.z = (int32_t)*((int8_t*)&n0.w + 2); ngroup.x = as_uint( n1.x ), tgroup.x = as_uint( n1.y ), tgroup.y = 0; - unsigned hitmask = 0; - const unsigned vx = (e.x + 127) << 23u; const float adjusted_idirx = *(float*)&vx * ray.rD.x; - const unsigned vy = (e.y + 127) << 23u; const float adjusted_idiry = *(float*)&vy * ray.rD.y; - const unsigned vz = (e.z + 127) << 23u; const float adjusted_idirz = *(float*)&vz * ray.rD.z; + uint32_t hitmask = 0; + const uint32_t vx = (e.x + 127) << 23u; const float adjusted_idirx = *(float*)&vx * ray.rD.x; + const uint32_t vy = (e.y + 127) << 23u; const float adjusted_idiry = *(float*)&vy * ray.rD.y; + const uint32_t vz = (e.z + 127) << 23u; const float adjusted_idirz = *(float*)&vz * ray.rD.z; const float origx = -(ray.O.x - p.x) * ray.rD.x; const float origy = -(ray.O.y - p.y) * ray.rD.y; const float origz = -(ray.O.z - p.z) * ray.rD.z; { // First 4 - const unsigned meta4 = *(unsigned*)&n1.z, is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010; - const unsigned inner_mask4 = sign_extend_s8x4( is_inner4 << 3 ); - const unsigned bit_index4 = (meta4 ^ (octinv & inner_mask4)) & 0x1F1F1F1F; - const unsigned child_bits4 = (meta4 >> 5) & 0x07070707; - unsigned swizzledLox = (ray.rD.x < 0) ? *(unsigned*)&n3.z : *(unsigned*)&n2.x, swizzledHix = (ray.rD.x < 0) ? *(unsigned*)&n2.x : *(unsigned*)&n3.z; - unsigned swizzledLoy = (ray.rD.y < 0) ? *(unsigned*)&n4.x : *(unsigned*)&n2.z, swizzledHiy = (ray.rD.y < 0) ? *(unsigned*)&n2.z : *(unsigned*)&n4.x; - unsigned swizzledLoz = (ray.rD.z < 0) ? *(unsigned*)&n4.z : *(unsigned*)&n3.x, swizzledHiz = (ray.rD.z < 0) ? 
*(unsigned*)&n3.x : *(unsigned*)&n4.z; + const uint32_t meta4 = *(uint32_t*)&n1.z, is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010; + const uint32_t inner_mask4 = sign_extend_s8x4( is_inner4 << 3 ); + const uint32_t bit_index4 = (meta4 ^ (octinv & inner_mask4)) & 0x1F1F1F1F; + const uint32_t child_bits4 = (meta4 >> 5) & 0x07070707; + uint32_t swizzledLox = (ray.rD.x < 0) ? *(uint32_t*)&n3.z : *(uint32_t*)&n2.x, swizzledHix = (ray.rD.x < 0) ? *(uint32_t*)&n2.x : *(uint32_t*)&n3.z; + uint32_t swizzledLoy = (ray.rD.y < 0) ? *(uint32_t*)&n4.x : *(uint32_t*)&n2.z, swizzledHiy = (ray.rD.y < 0) ? *(uint32_t*)&n2.z : *(uint32_t*)&n4.x; + uint32_t swizzledLoz = (ray.rD.z < 0) ? *(uint32_t*)&n4.z : *(uint32_t*)&n3.x, swizzledHiz = (ray.rD.z < 0) ? *(uint32_t*)&n3.x : *(uint32_t*)&n4.z; float tminx[4], tminy[4], tminz[4], tmaxx[4], tmaxy[4], tmaxz[4]; tminx[0] = ((swizzledLox >> 0) & 0xFF) * adjusted_idirx + origx, tminx[1] = ((swizzledLox >> 8) & 0xFF) * adjusted_idirx + origx, tminx[2] = ((swizzledLox >> 16) & 0xFF) * adjusted_idirx + origx; tminx[3] = ((swizzledLox >> 24) & 0xFF) * adjusted_idirx + origx, tminy[0] = ((swizzledLoy >> 0) & 0xFF) * adjusted_idiry + origy, tminy[1] = ((swizzledLoy >> 8) & 0xFF) * adjusted_idiry + origy; @@ -3585,7 +3708,7 @@ int BVH::Intersect_CWBVH( Ray& ray ) const tmaxx[3] = ((swizzledHix >> 24) & 0xFF) * adjusted_idirx + origx, tmaxy[0] = ((swizzledHiy >> 0) & 0xFF) * adjusted_idiry + origy, tmaxy[1] = ((swizzledHiy >> 8) & 0xFF) * adjusted_idiry + origy; tmaxy[2] = ((swizzledHiy >> 16) & 0xFF) * adjusted_idiry + origy, tmaxy[3] = ((swizzledHiy >> 24) & 0xFF) * adjusted_idiry + origy, tmaxz[0] = ((swizzledHiz >> 0) & 0xFF) * adjusted_idirz + origz; tmaxz[1] = ((swizzledHiz >> 8) & 0xFF) * adjusted_idirz + origz, tmaxz[2] = ((swizzledHiz >> 16) & 0xFF) * adjusted_idirz + origz, tmaxz[3] = ((swizzledHiz >> 24) & 0xFF) * adjusted_idirz + origz; - for (int i = 0; i < 4; i++) + for (int32_t i = 0; i < 4; i++) { // Use VMIN, VMAX to compute the 
slabs const float cmin = tinybvh_max( tinybvh_max( tinybvh_max( tminx[i], tminy[i] ), tminz[i] ), tmin ); @@ -3594,13 +3717,13 @@ int BVH::Intersect_CWBVH( Ray& ray ) const } } { // Second 4 - const unsigned meta4 = *(unsigned*)&n1.w, is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010; - const unsigned inner_mask4 = sign_extend_s8x4( is_inner4 << 3 ); - const unsigned bit_index4 = (meta4 ^ (octinv & inner_mask4)) & 0x1F1F1F1F; - const unsigned child_bits4 = (meta4 >> 5) & 0x07070707; - unsigned swizzledLox = (ray.rD.x < 0) ? *(unsigned*)&n3.w : *(unsigned*)&n2.y, swizzledHix = (ray.rD.x < 0) ? *(unsigned*)&n2.y : *(unsigned*)&n3.w; - unsigned swizzledLoy = (ray.rD.y < 0) ? *(unsigned*)&n4.y : *(unsigned*)&n2.w, swizzledHiy = (ray.rD.y < 0) ? *(unsigned*)&n2.w : *(unsigned*)&n4.y; - unsigned swizzledLoz = (ray.rD.z < 0) ? *(unsigned*)&n4.w : *(unsigned*)&n3.y, swizzledHiz = (ray.rD.z < 0) ? *(unsigned*)&n3.y : *(unsigned*)&n4.w; + const uint32_t meta4 = *(uint32_t*)&n1.w, is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010; + const uint32_t inner_mask4 = sign_extend_s8x4( is_inner4 << 3 ); + const uint32_t bit_index4 = (meta4 ^ (octinv & inner_mask4)) & 0x1F1F1F1F; + const uint32_t child_bits4 = (meta4 >> 5) & 0x07070707; + uint32_t swizzledLox = (ray.rD.x < 0) ? *(uint32_t*)&n3.w : *(uint32_t*)&n2.y, swizzledHix = (ray.rD.x < 0) ? *(uint32_t*)&n2.y : *(uint32_t*)&n3.w; + uint32_t swizzledLoy = (ray.rD.y < 0) ? *(uint32_t*)&n4.y : *(uint32_t*)&n2.w, swizzledHiy = (ray.rD.y < 0) ? *(uint32_t*)&n2.w : *(uint32_t*)&n4.y; + uint32_t swizzledLoz = (ray.rD.z < 0) ? *(uint32_t*)&n4.w : *(uint32_t*)&n3.y, swizzledHiz = (ray.rD.z < 0) ? 
*(uint32_t*)&n3.y : *(uint32_t*)&n4.w; float tminx[4], tminy[4], tminz[4], tmaxx[4], tmaxy[4], tmaxz[4]; tminx[0] = ((swizzledLox >> 0) & 0xFF) * adjusted_idirx + origx, tminx[1] = ((swizzledLox >> 8) & 0xFF) * adjusted_idirx + origx, tminx[2] = ((swizzledLox >> 16) & 0xFF) * adjusted_idirx + origx; tminx[3] = ((swizzledLox >> 24) & 0xFF) * adjusted_idirx + origx, tminy[0] = ((swizzledLoy >> 0) & 0xFF) * adjusted_idiry + origy, tminy[1] = ((swizzledLoy >> 8) & 0xFF) * adjusted_idiry + origy; @@ -3610,7 +3733,7 @@ int BVH::Intersect_CWBVH( Ray& ray ) const tmaxx[3] = ((swizzledHix >> 24) & 0xFF) * adjusted_idirx + origx, tmaxy[0] = ((swizzledHiy >> 0) & 0xFF) * adjusted_idiry + origy, tmaxy[1] = ((swizzledHiy >> 8) & 0xFF) * adjusted_idiry + origy; tmaxy[2] = ((swizzledHiy >> 16) & 0xFF) * adjusted_idiry + origy, tmaxy[3] = ((swizzledHiy >> 24) & 0xFF) * adjusted_idiry + origy, tmaxz[0] = ((swizzledHiz >> 0) & 0xFF) * adjusted_idirz + origz; tmaxz[1] = ((swizzledHiz >> 8) & 0xFF) * adjusted_idirz + origz, tmaxz[2] = ((swizzledHiz >> 16) & 0xFF) * adjusted_idirz + origz, tmaxz[3] = ((swizzledHiz >> 24) & 0xFF) * adjusted_idirz + origz; - for (int i = 0; i < 4; i++) + for (int32_t i = 0; i < 4; i++) { const float cmin = tinybvh_max( tinybvh_max( tinybvh_max( tminx[i], tminy[i] ), tminz[i] ), tmin ); const float cmax = tinybvh_min( tinybvh_min( tinybvh_min( tmaxx[i], tmaxy[i] ), tmaxz[i] ), tmax ); @@ -3623,7 +3746,7 @@ int BVH::Intersect_CWBVH( Ray& ray ) const else tgroup = ngroup, ngroup = bvhuint2( 0 ); while (tgroup.y != 0) { - unsigned triangleIndex = __bfind( tgroup.y ); + uint32_t triangleIndex = __bfind( tgroup.y ); #ifdef CWBVH_COMPRESSED_TRIS const float* T = (float*)&blasTris[tgroup.x + triangleIndex * 4]; const float transS = T[8] * ray.O.x + T[9] * ray.O.y + T[10] * ray.O.z + T[11]; @@ -3635,10 +3758,10 @@ int BVH::Intersect_CWBVH( Ray& ray ) const const float u = T[0] * wr.x + T[1] * wr.y + T[2] * wr.z + T[3]; const float v = T[4] * wr.x + T[5] * wr.y + 
T[6] * wr.z + T[7]; const bool hit = u >= 0 && v >= 0 && u + v < 1; - if (hit) triangleuv = bvhvec2( u, v ), tmax = ta, hitAddr = *(unsigned*)&T[15]; + if (hit) triangleuv = bvhvec2( u, v ), tmax = ta, hitAddr = *(uint32_t*)&T[15]; } #else - int triAddr = tgroup.x + triangleIndex * 3; + int32_t triAddr = tgroup.x + triangleIndex * 3; const bvhvec3 v0 = blasTris[triAddr]; const bvhvec3 edge1 = bvhvec3( blasTris[triAddr + 1] ) - v0; const bvhvec3 edge2 = bvhvec3( blasTris[triAddr + 2] ) - v0; @@ -3694,11 +3817,11 @@ inline void IntersectCompactTri( Ray& r, __m128& t4, const float* T ) const float u = T[0] * wr.x + T[1] * wr.y + T[2] * wr.z + T[3]; const float v = T[4] * wr.x + T[5] * wr.y + T[6] * wr.z + T[7]; const bool hit = u >= 0 && v >= 0 && u + v < 1; - if (hit) r.hit = { ta, u, v, *(unsigned*)&T[15] }, t4 = _mm_set1_ps( ta ); + if (hit) r.hit = { ta, u, v, *(uint32_t*)&T[15] }, t4 = _mm_set1_ps( ta ); } -int BVH::Intersect_Afra( Ray& ray ) const +int32_t BVH::Intersect_Afra( Ray& ray ) const { - unsigned nodeIdx = 0, stack[1024], stackPtr = 0, steps = 0; + uint32_t nodeIdx = 0, stack[1024], stackPtr = 0, steps = 0; const __m128 ox4 = _mm_set1_ps( ray.O.x ), rdx4 = _mm_set1_ps( ray.rD.x ); const __m128 oy4 = _mm_set1_ps( ray.O.y ), rdy4 = _mm_set1_ps( ray.rD.y ); const __m128 oz4 = _mm_set1_ps( ray.O.z ), rdz4 = _mm_set1_ps( ray.rD.z ); @@ -3725,15 +3848,15 @@ int BVH::Intersect_Afra( Ray& ray ) const const __m128 tmin = _mm_max_ps( _mm_max_ps( txmin, tymin ), tzmin ); const __m128 tmax = _mm_min_ps( _mm_min_ps( txmax, tymax ), tzmax ); const __m128 hit = _mm_and_ps( _mm_and_ps( _mm_cmpge_ps( tmax, tmin ), _mm_cmplt_ps( tmin, t4 ) ), _mm_cmpge_ps( tmax, zero4 ) ); - const int hitBits = _mm_movemask_ps( hit ), hits = __popc( hitBits ); + const int32_t hitBits = _mm_movemask_ps( hit ), hits = __popc( hitBits ); if (hits == 1 /* 43% */) { // just one node was hit - no sorting needed. 
- const unsigned lane = __bfind( hitBits ), count = node.triCount[lane]; + const uint32_t lane = __bfind( hitBits ), count = node.triCount[lane]; if (count == 0) nodeIdx = node.childFirst[lane]; else { - const unsigned first = node.childFirst[lane]; - for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); if (stackPtr == 0) break; nodeIdx = stack[--stackPtr]; @@ -3749,19 +3872,19 @@ int BVH::Intersect_Afra( Ray& ray ) const if (hits == 2 /* 16% */) { // two nodes hit - unsigned lane0 = __bfind( hitBits ), lane1 = __bfind( hitBits - (1 << lane0) ); + uint32_t lane0 = __bfind( hitBits ), lane1 = __bfind( hitBits - (1 << lane0) ); float dist0 = ((float*)&tmin)[lane0], dist1 = ((float*)&tmin)[lane1]; if (dist1 < dist0) { - unsigned t = lane0; lane0 = lane1; lane1 = t; + uint32_t t = lane0; lane0 = lane1; lane1 = t; float ft = dist0; dist0 = dist1; dist1 = ft; } - const unsigned triCount0 = node.triCount[lane0], triCount1 = node.triCount[lane1]; + const uint32_t triCount0 = node.triCount[lane0], triCount1 = node.triCount[lane1]; // process first lane if (triCount0 == 0) nodeIdx = node.childFirst[lane0]; else { - const unsigned first = node.childFirst[lane0]; - for (unsigned j = 0; j < triCount0; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane0]; + for (uint32_t j = 0; j < triCount0; j++) // TODO: aim for 4 prims per leaf IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); nodeIdx = 0; } @@ -3773,8 +3896,8 @@ int BVH::Intersect_Afra( Ray& ray ) const } else { - const unsigned first = node.childFirst[lane1]; - for (unsigned j = 0; j < triCount1; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane1]; + for (uint32_t j = 0; j < triCount1; j++) // TODO: aim for 4 prims per leaf 
IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); } } @@ -3792,18 +3915,18 @@ int BVH::Intersect_Afra( Ray& ray ) const // process hits float d[4] = { d0, d1, d2, d3 }; nodeIdx = 0; - for (int i = 1; i < 4; i++) + for (int32_t i = 1; i < 4; i++) { - unsigned lane = *(unsigned*)&d[i] & 3; + uint32_t lane = *(uint32_t*)&d[i] & 3; if (node.triCount[lane] == 0) { - const unsigned childIdx = node.childFirst[lane]; + const uint32_t childIdx = node.childFirst[lane]; if (nodeIdx) stack[stackPtr++] = nodeIdx; nodeIdx = childIdx; continue; } - const unsigned first = node.childFirst[lane], count = node.triCount[lane]; - for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); } } @@ -3821,19 +3944,19 @@ int BVH::Intersect_Afra( Ray& ray ) const // process hits float d[4] = { d0, d1, d2, d3 }; nodeIdx = 0; - for (int i = 0; i < 4; i++) + for (int32_t i = 0; i < 4; i++) { - unsigned lane = *(unsigned*)&d[i] & 3; + uint32_t lane = *(uint32_t*)&d[i] & 3; if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? 
if (node.triCount[lane] == 0) { - const unsigned childIdx = node.childFirst[lane]; + const uint32_t childIdx = node.childFirst[lane]; if (nodeIdx) stack[stackPtr++] = nodeIdx; nodeIdx = childIdx; continue; } - const unsigned first = node.childFirst[lane], count = node.triCount[lane]; - for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); } } @@ -3862,7 +3985,7 @@ inline bool OccludedCompactTri( const Ray& r, const float* T ) #endif bool BVH::IsOccluded_Afra( const Ray& ray ) const { - unsigned nodeIdx = 0, stack[1024], stackPtr = 0; + uint32_t nodeIdx = 0, stack[1024], stackPtr = 0; const __m128 ox4 = _mm_set1_ps( ray.O.x ), rdx4 = _mm_set1_ps( ray.rD.x ); const __m128 oy4 = _mm_set1_ps( ray.O.y ), rdy4 = _mm_set1_ps( ray.rD.y ); const __m128 oz4 = _mm_set1_ps( ray.O.z ), rdz4 = _mm_set1_ps( ray.rD.z ); @@ -3888,15 +4011,15 @@ bool BVH::IsOccluded_Afra( const Ray& ray ) const const __m128 tmin = _mm_max_ps( _mm_max_ps( txmin, tymin ), tzmin ); const __m128 tmax = _mm_min_ps( _mm_min_ps( txmax, tymax ), tzmax ); const __m128 hit = _mm_and_ps( _mm_and_ps( _mm_cmpge_ps( tmax, tmin ), _mm_cmplt_ps( tmin, t4 ) ), _mm_cmpge_ps( tmax, zero4 ) ); - const int hitBits = _mm_movemask_ps( hit ), hits = __popc( hitBits ); + const int32_t hitBits = _mm_movemask_ps( hit ), hits = __popc( hitBits ); if (hits == 1 /* 43% */) { // just one node was hit - no sorting needed. 
- const unsigned lane = __bfind( hitBits ), count = node.triCount[lane]; + const uint32_t lane = __bfind( hitBits ), count = node.triCount[lane]; if (count == 0) nodeIdx = node.childFirst[lane]; else { - const unsigned first = node.childFirst[lane]; - for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; if (stackPtr == 0) break; nodeIdx = stack[--stackPtr]; @@ -3912,19 +4035,19 @@ bool BVH::IsOccluded_Afra( const Ray& ray ) const if (hits == 2 /* 16% */) { // two nodes hit - unsigned lane0 = __bfind( hitBits ), lane1 = __bfind( hitBits - (1 << lane0) ); + uint32_t lane0 = __bfind( hitBits ), lane1 = __bfind( hitBits - (1 << lane0) ); float dist0 = ((float*)&tmin)[lane0], dist1 = ((float*)&tmin)[lane1]; if (dist1 < dist0) { - unsigned t = lane0; lane0 = lane1; lane1 = t; + uint32_t t = lane0; lane0 = lane1; lane1 = t; float ft = dist0; dist0 = dist1; dist1 = ft; } - const unsigned triCount0 = node.triCount[lane0], triCount1 = node.triCount[lane1]; + const uint32_t triCount0 = node.triCount[lane0], triCount1 = node.triCount[lane1]; // process first lane if (triCount0 == 0) nodeIdx = node.childFirst[lane0]; else { - const unsigned first = node.childFirst[lane0]; - for (unsigned j = 0; j < triCount0; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane0]; + for (uint32_t j = 0; j < triCount0; j++) // TODO: aim for 4 prims per leaf if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; nodeIdx = 0; } @@ -3936,8 +4059,8 @@ bool BVH::IsOccluded_Afra( const Ray& ray ) const } else { - const unsigned first = node.childFirst[lane1]; - for (unsigned j = 0; j < triCount1; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane1]; + for (uint32_t j = 0; j < triCount1; j++) // 
TODO: aim for 4 prims per leaf if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; } } @@ -3955,18 +4078,18 @@ bool BVH::IsOccluded_Afra( const Ray& ray ) const // process hits float d[4] = { d0, d1, d2, d3 }; nodeIdx = 0; - for (int i = 1; i < 4; i++) + for (int32_t i = 1; i < 4; i++) { - unsigned lane = *(unsigned*)&d[i] & 3; + uint32_t lane = *(uint32_t*)&d[i] & 3; if (node.triCount[lane] == 0) { - const unsigned childIdx = node.childFirst[lane]; + const uint32_t childIdx = node.childFirst[lane]; if (nodeIdx) stack[stackPtr++] = nodeIdx; nodeIdx = childIdx; continue; } - const unsigned first = node.childFirst[lane], count = node.triCount[lane]; - for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; } } @@ -3984,19 +4107,19 @@ bool BVH::IsOccluded_Afra( const Ray& ray ) const // process hits float d[4] = { d0, d1, d2, d3 }; nodeIdx = 0; - for (int i = 0; i < 4; i++) + for (int32_t i = 0; i < 4; i++) { - unsigned lane = *(unsigned*)&d[i] & 3; + uint32_t lane = *(uint32_t*)&d[i] & 3; if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? 
if (node.triCount[lane] == 0) { - const unsigned childIdx = node.childFirst[lane]; + const uint32_t childIdx = node.childFirst[lane]; if (nodeIdx) stack[stackPtr++] = nodeIdx; nodeIdx = childIdx; continue; } - const unsigned first = node.childFirst[lane], count = node.triCount[lane]; - for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; } } @@ -4015,7 +4138,7 @@ bool BVH::IsOccluded_Afra( const Ray& ray ) const // ============================================================================ // // I M P L E M E N T A T I O N - A R M / N E O N C O D E -// +// // ============================================================================ #ifdef BVH_USENEON @@ -4044,20 +4167,34 @@ inline float halfArea( const float32x4x2_t& a /* a contains aabb itself, with mi float ex = c[4] + c[0], ey = c[5] + c[1], ez = c[6] + c[2]; return ex * ey + ey * ez + ez * ex; } + +#if defined(__ARM_FEATURE_NEON) && defined(__ARM_NEON) && __ARM_ARCH >= 85 +// Use the native vrnd32xq_f32 if NEON 8.5 is available +#else +// Custom implementation of vrnd32xq_f32 +static inline int32x4_t vrnd32xq_f32( float32x4_t a ) { + const float32x4_t half = vdupq_n_f32( 0.5f ); + uint32x4_t isNegative = vcltq_f32( a, vdupq_n_f32( 0.0f ) ); // Mask for negative numbers + float32x4_t adjustment = vbslq_f32( isNegative, vnegq_f32( half ), half ); + return vcvtq_s32_f32( vaddq_f32( a, adjustment ) ); +} +#endif + #define PROCESS_PLANE( a, pos, ANLR, lN, rN, lb, rb ) if (lN * rN != 0) { \ ANLR = halfArea( lb ) * (float)lN + halfArea( rb ) * (float)rN; \ const float C = C_TRAV + C_INT * rSAV * ANLR; if (C < splitCost) \ splitCost = C, bestAxis = a, bestPos = pos, bestLBox = lb, bestRBox = rb; } -void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) 
+void BVH::BuildNEON( const bvhvec4* vertices, const uint32_t primCount ) { - int test = BVHBINS; + FATAL_ERROR_IF( primCount == 0, "BVH::BuildNEON( .. ), primCount == 0." ); + int32_t test = BVHBINS; if (test != 8) assert( false ); // AVX builders require BVHBINS == 8. assert( ((long long)vertices & 63) == 0 ); // buffer must be cacheline-aligned // aligned data ALIGNED( 64 ) float32x4x2_t binbox[3 * BVHBINS]; // 768 bytes ALIGNED( 64 ) float32x4x2_t binboxOrig[3 * BVHBINS]; // 768 bytes - ALIGNED( 64 ) unsigned count[3][BVHBINS]{}; // 96 bytes + ALIGNED( 64 ) uint32_t count[3][BVHBINS]{}; // 96 bytes ALIGNED( 64 ) float32x4x2_t bestLBox, bestRBox; // 64 bytes // some constants static const float32x4_t max4 = vdupq_n_f32( -BVH_FAR ), half4 = vdupq_n_f32( 0.5f ); @@ -4067,34 +4204,34 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) static const float32x4x2_t signFlip8 = { signFlip4, vdupq_n_f32( 0 ) }; // TODO: Check me static const float32x4_t mask3 = vceqq_f32( SIMD_SETRVEC( 0, 0, 0, 1 ), vdupq_n_f32( 0 ) ); static const float32x4_t binmul3 = vdupq_n_f32( BVHBINS * 0.49999f ); - for (unsigned i = 0; i < 3 * BVHBINS; i++) binboxOrig[i] = max8; // binbox initialization template + for (uint32_t i = 0; i < 3 * BVHBINS; i++) binboxOrig[i] = max8; // binbox initialization template // reset node pool - const unsigned spaceNeeded = primCount * 2; + const uint32_t spaceNeeded = primCount * 2; if (allocatedBVHNodes < spaceNeeded) { AlignedFree( bvhNode ); AlignedFree( triIdx ); AlignedFree( fragment ); - triIdx = (unsigned*)AlignedAlloc( primCount * sizeof( unsigned ) ); + triIdx = (uint32_t*)AlignedAlloc( primCount * sizeof( uint32_t ) ); bvhNode = (BVHNode*)AlignedAlloc( spaceNeeded * sizeof( BVHNode ) ); allocatedBVHNodes = spaceNeeded; memset( &bvhNode[1], 0, 32 ); // avoid crash in refit. fragment = (Fragment*)AlignedAlloc( primCount * sizeof( Fragment ) ); } else FATAL_ERROR_IF( !rebuildable, "BVH::BuildNEON( .. ), bvh not rebuildable." 
); - verts = (bvhvec4*)vertices; // note: we're not copying this data; don't delete. + verts = bvhvec4slice{ vertices, primCount * 3 }; // note: we're not copying this data; don't delete. triCount = idxCount = primCount; - unsigned newNodePtr = 2; + uint32_t newNodePtr = 2; struct FragSSE { float32x4_t bmin4, bmax4; }; FragSSE* frag4 = (FragSSE*)fragment; float32x4x2_t* frag8 = (float32x4x2_t*)fragment; - const float32x4_t* verts4 = (float32x4_t*)verts; + const float32x4_t* verts4 = (float32x4_t*)vertices; // assign all triangles to the root node BVHNode& root = bvhNode[0]; root.leftFirst = 0, root.triCount = triCount; // initialize fragments and update root bounds float32x4_t rootMin = max4, rootMax = max4; - for (unsigned i = 0; i < triCount; i++) + for (uint32_t i = 0; i < triCount; i++) { const float32x4_t v1 = veorq_s32( signFlip4, vminq_f32( vminq_f32( verts4[i * 3], verts4[i * 3 + 1] ), verts4[i * 3 + 2] ) ); const float32x4_t v2 = vmaxq_f32( vmaxq_f32( verts4[i * 3], verts4[i * 3 + 1] ), verts4[i * 3 + 2] ); @@ -4103,7 +4240,7 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) rootMin = veorq_s32( rootMin, signFlip4 ); root.aabbMin = *(bvhvec3*)&rootMin, root.aabbMax = *(bvhvec3*)&rootMax; // subdivide recursively - ALIGNED( 64 ) unsigned task[128], taskCount = 0, nodeIdx = 0; + ALIGNED( 64 ) uint32_t task[128], taskCount = 0, nodeIdx = 0; const bvhvec3 minDim = (root.aabbMax - root.aabbMin) * 1e-7f; while (1) { @@ -4117,15 +4254,15 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) const float32x4_t rpd4 = vandq_s32( vdivq_f32( binmul3, d4 ), vmvnq_u32( vceqq_f32( d4, vdupq_n_f32( 0 ) ) ) ); // implementation of Section 4.1 of "Parallel Spatial Splits in Bounding Volume Hierarchies": // main loop operates on two fragments to minimize dependencies and maximize ILP. 
- unsigned fi = triIdx[node.leftFirst]; + uint32_t fi = triIdx[node.leftFirst]; memset( count, 0, sizeof( count ) ); float32x4x2_t r0, r1, r2, f = frag8[fi]; int32x4_t bi4 = vcvtq_s32_f32( vrnd32xq_f32( vsubq_f32( vmulq_f32( vsubq_f32( vsubq_f32( frag4[fi].bmax4, frag4[fi].bmin4 ), nmin4 ), rpd4 ), half4 ) ) ); memcpy( binbox, binboxOrig, sizeof( binbox ) ); - unsigned i0 = ILANE( bi4, 0 ), i1 = ILANE( bi4, 1 ), i2 = ILANE( bi4, 2 ), * ti = triIdx + node.leftFirst + 1; - for (unsigned i = 0; i < node.triCount - 1; i++) + uint32_t i0 = ILANE( bi4, 0 ), i1 = ILANE( bi4, 1 ), i2 = ILANE( bi4, 2 ), * ti = triIdx + node.leftFirst + 1; + for (uint32_t i = 0; i < node.triCount - 1; i++) { - unsigned fid = *ti++; + uint32_t fid = *ti++; const float32x4x2_t b0 = binbox[i0]; const float32x4x2_t b1 = binbox[BVHBINS + i1]; const float32x4x2_t b2 = binbox[2 * BVHBINS + i2]; @@ -4149,21 +4286,21 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) binbox[i0] = r0, binbox[BVHBINS + i1] = r1, binbox[2 * BVHBINS + i2] = r2; // calculate per-split totals float splitCost = BVH_FAR, rSAV = 1.0f / node.SurfaceArea(); - unsigned bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst; + uint32_t bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst; const float32x4x2_t* bb = binbox; - for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) + for (int32_t a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) { // hardcoded bin processing for BVHBINS == 8 assert( BVHBINS == 8 ); - const unsigned lN0 = count[a][0], rN0 = count[a][7]; + const uint32_t lN0 = count[a][0], rN0 = count[a][7]; const float32x4x2_t lb0 = bb[0], rb0 = bb[7]; - const unsigned lN1 = lN0 + count[a][1], rN1 = rN0 + count[a][6], lN2 = lN1 + count[a][2]; - const unsigned rN2 = rN1 + count[a][5], lN3 = lN2 + count[a][3], rN3 = 
rN2 + count[a][4]; + const uint32_t lN1 = lN0 + count[a][1], rN1 = rN0 + count[a][6], lN2 = lN1 + count[a][2]; + const uint32_t rN2 = rN1 + count[a][5], lN3 = lN2 + count[a][3], rN3 = rN2 + count[a][4]; const float32x4x2_t lb1 = vmaxq_f32x2( lb0, bb[1] ), rb1 = vmaxq_f32x2( rb0, bb[6] ); const float32x4x2_t lb2 = vmaxq_f32x2( lb1, bb[2] ), rb2 = vmaxq_f32x2( rb1, bb[5] ); const float32x4x2_t lb3 = vmaxq_f32x2( lb2, bb[3] ), rb3 = vmaxq_f32x2( rb2, bb[4] ); - const unsigned lN4 = lN3 + count[a][4], rN4 = rN3 + count[a][3], lN5 = lN4 + count[a][5]; - const unsigned rN5 = rN4 + count[a][2], lN6 = lN5 + count[a][6], rN6 = rN5 + count[a][1]; + const uint32_t lN4 = lN3 + count[a][4], rN4 = rN3 + count[a][3], lN5 = lN4 + count[a][5]; + const uint32_t rN5 = rN4 + count[a][2], lN6 = lN5 + count[a][6], rN6 = rN5 + count[a][1]; const float32x4x2_t lb4 = vmaxq_f32x2( lb3, bb[4] ), rb4 = vmaxq_f32x2( rb3, bb[3] ); const float32x4x2_t lb5 = vmaxq_f32x2( lb4, bb[5] ), rb5 = vmaxq_f32x2( rb4, bb[2] ); const float32x4x2_t lb6 = vmaxq_f32x2( lb5, bb[6] ), rb6 = vmaxq_f32x2( rb5, bb[1] ); @@ -4179,14 +4316,14 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) if (splitCost >= noSplitCost) break; // not splitting is better. 
// in-place partition const float rpd = (*(bvhvec3*)&rpd4)[bestAxis], nmin = (*(bvhvec3*)&nmin4)[bestAxis]; - unsigned t, fr = triIdx[src]; - for (unsigned i = 0; i < node.triCount; i++) + uint32_t t, fr = triIdx[src]; + for (uint32_t i = 0; i < node.triCount; i++) { - const unsigned bi = (unsigned)((fragment[fr].bmax[bestAxis] - fragment[fr].bmin[bestAxis] - nmin) * rpd); + const uint32_t bi = (uint32_t)((fragment[fr].bmax[bestAxis] - fragment[fr].bmin[bestAxis] - nmin) * rpd); if (bi <= bestPos) fr = triIdx[++src]; else t = fr, fr = triIdx[src] = triIdx[--j], triIdx[j] = t; } // create child nodes and recurse - const unsigned leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; + const uint32_t leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; if (leftCount == 0 || rightCount == 0) break; // should not happen. (*(float32x4x2_t*)&bvhNode[n]).val[0] = veorq_s32( bestLBox.val[0], signFlip8.val[0] ); (*(float32x4x2_t*)&bvhNode[n]).val[1] = veorq_s32( bestLBox.val[1], signFlip8.val[1] ); @@ -4208,10 +4345,10 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) } // Traverse the second alternative BVH layout (ALT_SOA). 
-int BVH::Intersect_AltSoA( Ray& ray ) const +int32_t BVH::Intersect_AltSoA( Ray& ray ) const { BVHNodeAlt2* node = &alt2Node[0], * stack[64]; - unsigned stackPtr = 0, steps = 0; + uint32_t stackPtr = 0, steps = 0; const float32x4_t Ox4 = vdupq_n_f32( ray.O.x ), rDx4 = vdupq_n_f32( ray.rD.x ); const float32x4_t Oy4 = vdupq_n_f32( ray.O.y ), rDy4 = vdupq_n_f32( ray.rD.y ); const float32x4_t Oz4 = vdupq_n_f32( ray.O.z ), rDz4 = vdupq_n_f32( ray.rD.z ); @@ -4221,9 +4358,9 @@ int BVH::Intersect_AltSoA( Ray& ray ) const steps++; if (node->isLeaf()) { - for (unsigned i = 0; i < node->triCount; i++) + for (uint32_t i = 0; i < node->triCount; i++) { - const unsigned tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3; + const uint32_t tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3; const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; const bvhvec3 h = cross( ray.D, edge2 ); @@ -4262,7 +4399,7 @@ int BVH::Intersect_AltSoA( Ray& ray ) const x4 = vcombine_f32( vget_low_f32( t0 ), vget_low_f32( t2 ) ); y4 = vcombine_f32( vget_high_f32( t0 ), vget_high_f32( t2 ) ); z4 = vcombine_f32( vget_low_f32( t1 ), vget_low_f32( t3 ) ); - unsigned lidx = node->left, ridx = node->right; + uint32_t lidx = node->left, ridx = node->right; const float32x4_t min4 = vmaxq_f32( vmaxq_f32( vmaxq_f32( x4, y4 ), z4 ), vdupq_n_f32( 0 ) ); const float32x4_t max4 = vminq_f32( vminq_f32( vminq_f32( x4, y4 ), z4 ), vdupq_n_f32( ray.hit.t ) ); #if 0 @@ -4282,7 +4419,7 @@ int BVH::Intersect_AltSoA( Ray& ray ) const if (dist1 > dist2) { float t = dist1; dist1 = dist2; dist2 = t; - unsigned i = lidx; lidx = ridx; ridx = i; + uint32_t i = lidx; lidx = ridx; ridx = i; } if (dist1 == BVH_FAR) { @@ -4297,21 +4434,356 @@ int BVH::Intersect_AltSoA( Ray& ray ) const return steps; } +// Traverse a 4-way BVH stored in 'Atilla Áfra' layout. 
+inline void IntersectCompactTri( Ray& r, float32x4_t& t4, const float* T ) +{ + const float transS = T[8] * r.O.x + T[9] * r.O.y + T[10] * r.O.z + T[11]; + const float transD = T[8] * r.D.x + T[9] * r.D.y + T[10] * r.D.z; + const float ta = -transS / transD; + if (ta <= 0 || ta >= r.hit.t) return; + const bvhvec3 wr = r.O + ta * r.D; + const float u = T[0] * wr.x + T[1] * wr.y + T[2] * wr.z + T[3]; + const float v = T[4] * wr.x + T[5] * wr.y + T[6] * wr.z + T[7]; + const bool hit = u >= 0 && v >= 0 && u + v < 1; + if (hit) r.hit = { ta, u, v, *(uint32_t*)&T[15] }, t4 = vdupq_n_f32( ta ); +} + +inline int32_t ARMVecMovemask( uint32x4_t v ) { + const int32_t shiftArr[4] = { 0, 1, 2, 3 }; + int32x4_t shift = vld1q_s32( shiftArr ); + return vaddvq_u32( vshlq_u32( vshrq_n_u32( v, 31 ), shift ) ); +} + +int32_t BVH::Intersect_Afra( Ray& ray ) const +{ + uint32_t nodeIdx = 0, stack[1024], stackPtr = 0, steps = 0; + const float32x4_t ox4 = vdupq_n_f32( ray.O.x ), rdx4 = vdupq_n_f32( ray.rD.x ); + const float32x4_t oy4 = vdupq_n_f32( ray.O.y ), rdy4 = vdupq_n_f32( ray.rD.y ); + const float32x4_t oz4 = vdupq_n_f32( ray.O.z ), rdz4 = vdupq_n_f32( ray.rD.z ); + float32x4_t t4 = vdupq_n_f32( ray.hit.t ), zero4 = vdupq_n_f32( 0.0f ); + const uint32x4_t idx4 = SIMD_SETRVECU( 0, 1, 2, 3 ); + const uint32x4_t idxMask = vdupq_n_u32( 0xfffffffc ); + const float32x4_t inf4 = vdupq_n_f32( BVH_FAR ); + while (1) + { + steps++; + const BVHNode4Alt2& node = bvh4Alt2[nodeIdx]; + // intersect the ray with four AABBs + const float32x4_t xmin4 = node.xmin4, xmax4 = node.xmax4; + const float32x4_t ymin4 = node.ymin4, ymax4 = node.ymax4; + const float32x4_t zmin4 = node.zmin4, zmax4 = node.zmax4; + const float32x4_t x0 = vsubq_f32( xmin4, ox4 ), x1 = vsubq_f32( xmax4, ox4 ); + const float32x4_t y0 = vsubq_f32( ymin4, oy4 ), y1 = vsubq_f32( ymax4, oy4 ); + const float32x4_t z0 = vsubq_f32( zmin4, oz4 ), z1 = vsubq_f32( zmax4, oz4 ); + const float32x4_t tx1 = vmulq_f32( x0, rdx4 ), tx2 = 
vmulq_f32( x1, rdx4 ); + const float32x4_t ty1 = vmulq_f32( y0, rdy4 ), ty2 = vmulq_f32( y1, rdy4 ); + const float32x4_t tz1 = vmulq_f32( z0, rdz4 ), tz2 = vmulq_f32( z1, rdz4 ); + const float32x4_t txmin = vminq_f32( tx1, tx2 ), tymin = vminq_f32( ty1, ty2 ), tzmin = vminq_f32( tz1, tz2 ); + const float32x4_t txmax = vmaxq_f32( tx1, tx2 ), tymax = vmaxq_f32( ty1, ty2 ), tzmax = vmaxq_f32( tz1, tz2 ); + const float32x4_t tmin = vmaxq_f32( vmaxq_f32( txmin, tymin ), tzmin ); + const float32x4_t tmax = vminq_f32( vminq_f32( txmax, tymax ), tzmax ); + + uint32x4_t hit = vandq_u32( vandq_u32( vcgeq_f32( tmax, tmin ), vcltq_f32( tmin, t4 ) ), vcgeq_f32( tmax, zero4 ) ); + int32_t hitBits = ARMVecMovemask( hit ), hits = vcnt_s8( vreinterpret_s8_s32( vcreate_u32( hitBits ) ) )[0]; + + if (hits == 1 /* 43% */) + { + // just one node was hit - no sorting needed. + const uint32_t lane = __bfind( hitBits ), count = node.triCount[lane]; + if (count == 0) nodeIdx = node.childFirst[lane]; else + { + const uint32_t first = node.childFirst[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); + if (stackPtr == 0) break; + nodeIdx = stack[--stackPtr]; + } + continue; + } + if (hits == 0 /* 29% */) + { + if (stackPtr == 0) break; + nodeIdx = stack[--stackPtr]; + continue; + } + if (hits == 2 /* 16% */) + { + // two nodes hit + uint32_t lane0 = __bfind( hitBits ), lane1 = __bfind( hitBits - (1 << lane0) ); + float dist0 = ((float*)&tmin)[lane0], dist1 = ((float*)&tmin)[lane1]; + if (dist1 < dist0) + { + uint32_t t = lane0; lane0 = lane1; lane1 = t; + float ft = dist0; dist0 = dist1; dist1 = ft; + } + const uint32_t triCount0 = node.triCount[lane0], triCount1 = node.triCount[lane1]; + // process first lane + if (triCount0 == 0) nodeIdx = node.childFirst[lane0]; else + { + const uint32_t first = node.childFirst[lane0]; + for (uint32_t j = 0; j < triCount0; j++) // TODO: aim for 4 prims per 
leaf + IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); + nodeIdx = 0; + } + // process second lane + if (triCount1 == 0) + { + if (nodeIdx) stack[stackPtr++] = nodeIdx; + nodeIdx = node.childFirst[lane1]; + } + else + { + const uint32_t first = node.childFirst[lane1]; + for (uint32_t j = 0; j < triCount1; j++) // TODO: aim for 4 prims per leaf + IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); + } + } + else if (hits == 3 /* 8% */) + { + // blend in lane indices + float32x4_t tm = vreinterpretq_f32_u32( vorrq_u32( vandq_u32( vreinterpretq_u32_f32( vbslq_f32( hit, tmin, inf4 ) ), idxMask ), idx4 ) ); + + // sort + float tmp, d0 = tm[0], d1 = tm[1], d2 = tm[2], d3 = tm[3]; + if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp; + if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp; + if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp; + if (d2 < d3) tmp = d2, d2 = d3, d3 = tmp; + if (d1 < d2) tmp = d1, d1 = d2, d2 = tmp; + // process hits + float d[4] = { d0, d1, d2, d3 }; + nodeIdx = 0; + for (int32_t i = 1; i < 4; i++) + { + uint32_t lane = *(uint32_t*)&d[i] & 3; + if (node.triCount[lane] == 0) + { + const uint32_t childIdx = node.childFirst[lane]; + if (nodeIdx) stack[stackPtr++] = nodeIdx; + nodeIdx = childIdx; + continue; + } + const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); + } + } + else /* hits == 4, 2%: rare */ + { + // blend in lane indices + float32x4_t tm = vreinterpretq_f32_u32( vorrq_u32( vandq_u32( vreinterpretq_u32_f32( vbslq_f32( hit, tmin, inf4 ) ), idxMask ), idx4 ) ); + // sort + float tmp, d0 = tm[0], d1 = tm[1], d2 = tm[2], d3 = tm[3]; + if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp; + if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp; + if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp; + if (d2 < d3) tmp = d2, d2 = d3, d3 = tmp; + if (d1 < d2) tmp = d1, d1 = d2, d2 = tmp; + // process hits + 
float d[4] = { d0, d1, d2, d3 }; + nodeIdx = 0; + for (int32_t i = 0; i < 4; i++) + { + uint32_t lane = *(uint32_t*)&d[i] & 3; + if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? + if (node.triCount[lane] == 0) + { + const uint32_t childIdx = node.childFirst[lane]; + if (nodeIdx) stack[stackPtr++] = nodeIdx; + nodeIdx = childIdx; + continue; + } + const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); + } + } + // get next task + if (nodeIdx) continue; + if (stackPtr == 0) break; else nodeIdx = stack[--stackPtr]; + } + return steps; +} + +// Find occlusions in a 4-way BVH stored in 'Atilla Áfra' layout. +inline bool OccludedCompactTri( const Ray& r, const float* T ) +{ + const float transS = T[8] * r.O.x + T[9] * r.O.y + T[10] * r.O.z + T[11]; + const float transD = T[8] * r.D.x + T[9] * r.D.y + T[10] * r.D.z; + const float ta = -transS / transD; + if (ta <= 0 || ta >= r.hit.t) return false; + const bvhvec3 wr = r.O + ta * r.D; + const float u = T[0] * wr.x + T[1] * wr.y + T[2] * wr.z + T[3]; + const float v = T[4] * wr.x + T[5] * wr.y + T[6] * wr.z + T[7]; + return u >= 0 && v >= 0 && u + v < 1; +} + +bool BVH::IsOccluded_Afra( const Ray& ray ) const +{ + uint32_t nodeIdx = 0, stack[1024], stackPtr = 0; + const float32x4_t ox4 = vdupq_n_f32( ray.O.x ), rdx4 = vdupq_n_f32( ray.rD.x ); + const float32x4_t oy4 = vdupq_n_f32( ray.O.y ), rdy4 = vdupq_n_f32( ray.rD.y ); + const float32x4_t oz4 = vdupq_n_f32( ray.O.z ), rdz4 = vdupq_n_f32( ray.rD.z ); + float32x4_t t4 = vdupq_n_f32( ray.hit.t ), zero4 = vdupq_n_f32( 0.0f ); + const uint32x4_t idx4 = SIMD_SETRVECU( 0, 1, 2, 3 ); + const uint32x4_t idxMask = vdupq_n_u32( 0xfffffffc ); + const float32x4_t inf4 = vdupq_n_f32( BVH_FAR ); + + while (1) + { + const BVHNode4Alt2& node = bvh4Alt2[nodeIdx]; + // intersect the ray 
with four AABBs + const float32x4_t xmin4 = node.xmin4, xmax4 = node.xmax4; + const float32x4_t ymin4 = node.ymin4, ymax4 = node.ymax4; + const float32x4_t zmin4 = node.zmin4, zmax4 = node.zmax4; + const float32x4_t x0 = vsubq_f32( xmin4, ox4 ), x1 = vsubq_f32( xmax4, ox4 ); + const float32x4_t y0 = vsubq_f32( ymin4, oy4 ), y1 = vsubq_f32( ymax4, oy4 ); + const float32x4_t z0 = vsubq_f32( zmin4, oz4 ), z1 = vsubq_f32( zmax4, oz4 ); + const float32x4_t tx1 = vmulq_f32( x0, rdx4 ), tx2 = vmulq_f32( x1, rdx4 ); + const float32x4_t ty1 = vmulq_f32( y0, rdy4 ), ty2 = vmulq_f32( y1, rdy4 ); + const float32x4_t tz1 = vmulq_f32( z0, rdz4 ), tz2 = vmulq_f32( z1, rdz4 ); + const float32x4_t txmin = vminq_f32( tx1, tx2 ), tymin = vminq_f32( ty1, ty2 ), tzmin = vminq_f32( tz1, tz2 ); + const float32x4_t txmax = vmaxq_f32( tx1, tx2 ), tymax = vmaxq_f32( ty1, ty2 ), tzmax = vmaxq_f32( tz1, tz2 ); + const float32x4_t tmin = vmaxq_f32( vmaxq_f32( txmin, tymin ), tzmin ); + const float32x4_t tmax = vminq_f32( vminq_f32( txmax, tymax ), tzmax ); + + uint32x4_t hit = vandq_u32( vandq_u32( vcgeq_f32( tmax, tmin ), vcltq_f32( tmin, t4 ) ), vcgeq_f32( tmax, zero4 ) ); + int32_t hitBits = ARMVecMovemask( hit ), hits = vcnt_s8( vreinterpret_s8_s32( vcreate_u32( hitBits ) ) )[0]; + + if (hits == 1 /* 43% */) + { + // just one node was hit - no sorting needed. 
+ const uint32_t lane = __bfind( hitBits ), count = node.triCount[lane]; + if (count == 0) nodeIdx = node.childFirst[lane]; else + { + const uint32_t first = node.childFirst[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; + if (stackPtr == 0) break; + nodeIdx = stack[--stackPtr]; + } + continue; + } + if (hits == 0 /* 29% */) + { + if (stackPtr == 0) break; + nodeIdx = stack[--stackPtr]; + continue; + } + if (hits == 2 /* 16% */) + { + // two nodes hit + uint32_t lane0 = __bfind( hitBits ), lane1 = __bfind( hitBits - (1 << lane0) ); + float dist0 = ((float*)&tmin)[lane0], dist1 = ((float*)&tmin)[lane1]; + if (dist1 < dist0) + { + uint32_t t = lane0; lane0 = lane1; lane1 = t; + float ft = dist0; dist0 = dist1; dist1 = ft; + } + const uint32_t triCount0 = node.triCount[lane0], triCount1 = node.triCount[lane1]; + // process first lane + if (triCount0 == 0) nodeIdx = node.childFirst[lane0]; else + { + const uint32_t first = node.childFirst[lane0]; + for (uint32_t j = 0; j < triCount0; j++) // TODO: aim for 4 prims per leaf + if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; + nodeIdx = 0; + } + // process second lane + if (triCount1 == 0) + { + if (nodeIdx) stack[stackPtr++] = nodeIdx; + nodeIdx = node.childFirst[lane1]; + } + else + { + const uint32_t first = node.childFirst[lane1]; + for (uint32_t j = 0; j < triCount1; j++) // TODO: aim for 4 prims per leaf + if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; + } + } + else if (hits == 3 /* 8% */) + { + // blend in lane indices + float32x4_t tm = vreinterpretq_f32_u32( vorrq_u32( vandq_u32( vreinterpretq_u32_f32( vbslq_f32( hit, tmin, inf4 ) ), idxMask ), idx4 ) ); + // sort + float tmp, d0 = tm[0], d1 = tm[1], d2 = tm[2], d3 = tm[3]; + if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp; + if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp; + if (d0 < d1) tmp = d0, 
d0 = d1, d1 = tmp; + if (d2 < d3) tmp = d2, d2 = d3, d3 = tmp; + if (d1 < d2) tmp = d1, d1 = d2, d2 = tmp; + // process hits + float d[4] = { d0, d1, d2, d3 }; + nodeIdx = 0; + for (int32_t i = 1; i < 4; i++) + { + uint32_t lane = *(uint32_t*)&d[i] & 3; + if (node.triCount[lane] == 0) + { + const uint32_t childIdx = node.childFirst[lane]; + if (nodeIdx) stack[stackPtr++] = nodeIdx; + nodeIdx = childIdx; + continue; + } + const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; + } + } + else /* hits == 4, 2%: rare */ + { + // blend in lane indices + float32x4_t tm = vreinterpretq_f32_u32( vorrq_u32( vandq_u32( vreinterpretq_u32_f32( vbslq_f32( hit, tmin, inf4 ) ), idxMask ), idx4 ) ); + // sort + float tmp, d0 = tm[0], d1 = tm[1], d2 = tm[2], d3 = tm[3]; + if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp; + if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp; + if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp; + if (d2 < d3) tmp = d2, d2 = d3, d3 = tmp; + if (d1 < d2) tmp = d1, d1 = d2, d2 = tmp; + // process hits + float d[4] = { d0, d1, d2, d3 }; + nodeIdx = 0; + for (int32_t i = 0; i < 4; i++) + { + uint32_t lane = *(uint32_t*)&d[i] & 3; + if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? 
+ if (node.triCount[lane] == 0) + { + const uint32_t childIdx = node.childFirst[lane]; + if (nodeIdx) stack[stackPtr++] = nodeIdx; + nodeIdx = childIdx; + continue; + } + const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; + } + } + // get next task + if (nodeIdx) continue; + if (stackPtr == 0) break; else nodeIdx = stack[--stackPtr]; + } + return false; +} + #endif // BVH_USENEON // ============================================================================ // // D O U B L E P R E C I S I O N S U P P O R T -// +// // ============================================================================ #ifdef DOUBLE_PRECISION_SUPPORT -// Basic single-function binned-SAH-builder, double-precision version. -void BVH::BuildEx( const bvhdbl3* vertices, const unsigned primCount ) +// Basic single-function binned-SAH-builder, double-precision version. +void BVH::BuildEx( const bvhdbl3* vertices, const uint32_t primCount ) { // allocate on first build - const unsigned spaceNeeded = primCount * 2; // upper limit + FATAL_ERROR_IF( primCount == 0, "BVH::BuildEx( .. ), primCount == 0." ); + const uint32_t spaceNeeded = primCount * 2; // upper limit if (allocatedBVHExNodes < spaceNeeded) { AlignedFree( bvhNodeEx ); @@ -4319,14 +4791,14 @@ void BVH::BuildEx( const bvhdbl3* vertices, const unsigned primCount ) AlignedFree( fragEx ); bvhNodeEx = (BVHNodeEx*)AlignedAlloc( spaceNeeded * sizeof( BVHNodeEx ) ); allocatedBVHExNodes = spaceNeeded; - triIdxEx = (unsigned long long int*)AlignedAlloc( primCount * sizeof( unsigned long long int ) ); + triIdxEx = (uint64_t*)AlignedAlloc( primCount * sizeof( uint64_t ) ); fragEx = (FragmentEx*)AlignedAlloc( primCount * sizeof( FragmentEx ) ); } else FATAL_ERROR_IF( !rebuildable, "BVH::BuildEx( .. ), bvh not rebuildable." 
); vertsEx = (bvhdbl3*)vertices; // note: we're not copying this data; don't delete. idxCount = triCount = primCount; // reset node pool - unsigned newNodePtr = 2; + uint32_t newNodePtr = 2; // assign all triangles to the root node BVHNodeEx& root = bvhNodeEx[0]; root.leftFirst = 0, root.triCount = triCount, root.aabbMin = bvhdbl3( BVH_DBL_FAR ), root.aabbMax = bvhdbl3( -BVH_DBL_FAR ); @@ -4334,7 +4806,7 @@ void BVH::BuildEx( const bvhdbl3* vertices, const unsigned primCount ) if (vertsEx) { // building a BVH over triangles specified as three 16-byte vertices each. - for (unsigned i = 0; i < triCount; i++) + for (uint32_t i = 0; i < triCount; i++) { fragEx[i].bmin = tinybvh_min( tinybvh_min( vertsEx[i * 3], vertsEx[i * 3 + 1] ), vertsEx[i * 3 + 2] ); fragEx[i].bmax = tinybvh_max( tinybvh_max( vertsEx[i * 3], vertsEx[i * 3 + 1] ), vertsEx[i * 3 + 2] ); @@ -4345,15 +4817,16 @@ void BVH::BuildEx( const bvhdbl3* vertices, const unsigned primCount ) else { // we are building the BVH over aabbs we received from ::BuildEx( tinyaabb* ): vertices == 0. - for (unsigned i = 0; i < triCount; i++) + for (uint32_t i = 0; i < triCount; i++) { root.aabbMin = tinybvh_min( root.aabbMin, fragEx[i].bmin ); root.aabbMax = tinybvh_max( root.aabbMax, fragEx[i].bmax ), triIdxEx[i] = i; // here: aabb index. 
} } // subdivide recursively - unsigned task[256], taskCount = 0, nodeIdx = 0; - bvhdbl3 minDim = (root.aabbMax - root.aabbMin) * 1e-20, bestLMin = 0, bestLMax = 0, bestRMin = 0, bestRMax = 0; + uint32_t task[256], taskCount = 0, nodeIdx = 0; + bvhdbl3 minDim = (root.aabbMax - root.aabbMin) * 1e-20; + bvhdbl3 bestLMin = 0, bestLMax = 0, bestRMin = 0, bestRMax = 0; while (1) { while (1) @@ -4361,15 +4834,15 @@ void BVH::BuildEx( const bvhdbl3* vertices, const unsigned primCount ) BVHNodeEx& node = bvhNodeEx[nodeIdx]; // find optimal object split bvhdbl3 binMin[3][BVHBINS], binMax[3][BVHBINS]; - for (unsigned a = 0; a < 3; a++) for (unsigned i = 0; i < BVHBINS; i++) binMin[a][i] = BVH_DBL_FAR, binMax[a][i] = -BVH_DBL_FAR; - unsigned count[3][BVHBINS]; - memset( count, 0, BVHBINS * 3 * sizeof( unsigned ) ); + for (uint32_t a = 0; a < 3; a++) for (uint32_t i = 0; i < BVHBINS; i++) binMin[a][i] = BVH_DBL_FAR, binMax[a][i] = -BVH_DBL_FAR; + uint32_t count[3][BVHBINS]; + memset( count, 0, BVHBINS * 3 * sizeof( uint32_t ) ); const bvhdbl3 rpd3 = bvhdbl3( BVHBINS / (node.aabbMax - node.aabbMin) ), nmin3 = node.aabbMin; - for (unsigned i = 0; i < node.triCount; i++) // process all tris for x,y and z at once + for (uint32_t i = 0; i < node.triCount; i++) // process all tris for x,y and z at once { - const unsigned long long int fi = triIdxEx[node.leftFirst + i]; + const uint64_t fi = triIdxEx[node.leftFirst + i]; const bvhdbl3 fbi = ((fragEx[fi].bmin + fragEx[fi].bmax) * 0.5 - nmin3) * rpd3; - bvhint3 bi( (int)fbi.x, (int)fbi.y, (int)fbi.z ); + bvhint3 bi( (int32_t)fbi.x, (int32_t)fbi.y, (int32_t)fbi.z ); bi.x = tinybvh_clamp( bi.x, 0, BVHBINS - 1 ); bi.y = tinybvh_clamp( bi.y, 0, BVHBINS - 1 ); bi.z = tinybvh_clamp( bi.z, 0, BVHBINS - 1 ); @@ -4382,13 +4855,13 @@ void BVH::BuildEx( const bvhdbl3* vertices, const unsigned primCount ) } // calculate per-split totals double splitCost = BVH_DBL_FAR, rSAV = 1.0 / node.SurfaceArea(); - unsigned bestAxis = 0, bestPos = 0; - for 
(int a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim[a]) + uint32_t bestAxis = 0, bestPos = 0; + for (int32_t a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim[a]) { bvhdbl3 lBMin[BVHBINS - 1], rBMin[BVHBINS - 1], l1 = BVH_DBL_FAR, l2 = -BVH_DBL_FAR; bvhdbl3 lBMax[BVHBINS - 1], rBMax[BVHBINS - 1], r1 = BVH_DBL_FAR, r2 = -BVH_DBL_FAR; double ANL[BVHBINS - 1], ANR[BVHBINS - 1]; - for (unsigned lN = 0, rN = 0, i = 0; i < BVHBINS - 1; i++) + for (uint32_t lN = 0, rN = 0, i = 0; i < BVHBINS - 1; i++) { lBMin[i] = l1 = tinybvh_min( l1, binMin[a][i] ); rBMin[BVHBINS - 2 - i] = r1 = tinybvh_min( r1, binMin[a][BVHBINS - 1 - i] ); @@ -4399,7 +4872,7 @@ void BVH::BuildEx( const bvhdbl3* vertices, const unsigned primCount ) ANR[BVHBINS - 2 - i] = rN == 0 ? BVH_DBL_FAR : ((r2 - r1).halfArea() * (double)rN); } // evaluate bin totals to find best position for object split - for (unsigned i = 0; i < BVHBINS - 1; i++) + for (uint32_t i = 0; i < BVHBINS - 1; i++) { const double C = C_TRAV + rSAV * C_INT * (ANL[i] + ANR[i]); if (C < splitCost) @@ -4412,19 +4885,19 @@ void BVH::BuildEx( const bvhdbl3* vertices, const unsigned primCount ) double noSplitCost = (double)node.triCount * C_INT; if (splitCost >= noSplitCost) break; // not splitting is better. 
// in-place partition - unsigned long long int j = node.leftFirst + node.triCount, src = node.leftFirst; + uint64_t j = node.leftFirst + node.triCount, src = node.leftFirst; const double rpd = rpd3.cell[bestAxis], nmin = nmin3.cell[bestAxis]; - for (unsigned long long int i = 0; i < node.triCount; i++) + for (uint64_t i = 0; i < node.triCount; i++) { - const unsigned long long int fi = triIdxEx[src]; - int bi = (unsigned)(((fragEx[fi].bmin[bestAxis] + fragEx[fi].bmax[bestAxis]) * 0.5 - nmin) * rpd); + const uint64_t fi = triIdxEx[src]; + int32_t bi = (uint32_t)(((fragEx[fi].bmin[bestAxis] + fragEx[fi].bmax[bestAxis]) * 0.5 - nmin) * rpd); bi = tinybvh_clamp( bi, 0, BVHBINS - 1 ); - if ((unsigned)bi <= bestPos) src++; else tinybvh_swap( triIdxEx[src], triIdxEx[--j] ); + if ((uint32_t)bi <= bestPos) src++; else tinybvh_swap( triIdxEx[src], triIdxEx[--j] ); } // create child nodes - unsigned long long int leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; + uint64_t leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; if (leftCount == 0 || rightCount == 0) break; // should not happen. - const int lci = newNodePtr++, rci = newNodePtr++; + const int32_t lci = newNodePtr++, rci = newNodePtr++; bvhNodeEx[lci].aabbMin = bestLMin, bvhNodeEx[lci].aabbMax = bestLMax; bvhNodeEx[lci].leftFirst = node.leftFirst, bvhNodeEx[lci].triCount = leftCount; bvhNodeEx[rci].aabbMin = bestRMin, bvhNodeEx[rci].aabbMax = bestRMax; @@ -4450,7 +4923,7 @@ double BVH::BVHNodeEx::SurfaceArea() const return e.x * e.y + e.y * e.z + e.z * e.x; } -int BVH::IntersectEx( RayEx& ray, const BVHLayout layout ) const +int32_t BVH::IntersectEx( RayEx& ray, const BVHLayout layout ) const { switch (layout) { @@ -4461,23 +4934,23 @@ int BVH::IntersectEx( RayEx& ray, const BVHLayout layout ) const FATAL_ERROR_IF( true, "BVH::IntersectEx( .. , ? ), unsupported double-precision bvh layout." 
); break; } - return 0; + return 0; } // Traverse the default BVH layout, double-precision. -int BVH::IntersectEx_WaldDouble( RayEx& ray ) const +int32_t BVH::IntersectEx_WaldDouble( RayEx& ray ) const { BVHNodeEx* node = &bvhNodeEx[0], * stack[64]; - unsigned stackPtr = 0, steps = 0; + uint32_t stackPtr = 0, steps = 0; while (1) { steps++; if (node->isLeaf()) { - for (unsigned i = 0; i < node->triCount; i++) + for (uint32_t i = 0; i < node->triCount; i++) { - const unsigned long long int idx = triIdxEx[node->leftFirst + i]; - const unsigned long long int vertIdx = idx * 3; + const uint64_t idx = triIdxEx[node->leftFirst + i]; + const uint64_t vertIdx = idx * 3; const bvhdbl3 edge1 = vertsEx[vertIdx + 1] - vertsEx[vertIdx]; const bvhdbl3 edge2 = vertsEx[vertIdx + 2] - vertsEx[vertIdx]; const bvhdbl3 h = cross( ray.D, edge2 ); @@ -4537,7 +5010,7 @@ double BVH::BVHNodeEx::Intersect( const RayEx& ray ) const // ============================================================================ // // H E L P E R S -// +// // ============================================================================ // TransformPoint @@ -4560,10 +5033,10 @@ bvhvec3 BVH::BLASInstance::TransformVector( const bvhvec3& v ) const } // IntersectTri -void BVH::IntersectTri( Ray& ray, const unsigned idx ) const +void BVH::IntersectTri( Ray& ray, const uint32_t idx ) const { // Moeller-Trumbore ray/triangle intersection algorithm - const unsigned vertIdx = idx * 3; + const uint32_t vertIdx = idx * 3; const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; const bvhvec3 h = cross( ray.D, edge2 ); @@ -4600,10 +5073,10 @@ float BVH::IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& } // PrecomputeTriangle (helper), transforms a triangle to the format used in: -// Fast Ray-Triangle Intersections by Coordinate Transformation. Baldwin & Weber, 2016. 
-void BVH::PrecomputeTriangle( const bvhvec4* const vert, float* T ) +// Fast Ray-Triangle Intersections by Coordinate Transformation. Baldwin & Weber, 2016. +void BVH::PrecomputeTriangle( const bvhvec4slice& vert, uint32_t triIndex, float* T ) { - bvhvec3 v0 = vert[0], v1 = vert[1], v2 = vert[2]; + bvhvec3 v0 = vert[triIndex], v1 = vert[triIndex + 1], v2 = vert[triIndex + 2]; bvhvec3 e1 = v1 - v0, e2 = v2 - v0, N = cross( e1, e2 ); float x1, x2, n = dot( v0, N ), rN; if (fabs( N[0] ) > fabs( N[1] ) && fabs( N[0] ) > fabs( N[2] )) @@ -4639,16 +5112,16 @@ bool BVH::ClipFrag( const Fragment& orig, Fragment& newFrag, bvhvec3 bmin, bvhve bmax = tinybvh_min( bmax, orig.bmax ); const bvhvec3 extent = bmax - bmin; // Sutherland-Hodgeman against six bounding planes - unsigned Nin = 3, vidx = orig.primIdx * 3; + uint32_t Nin = 3, vidx = orig.primIdx * 3; bvhvec3 vin[10] = { verts[vidx], verts[vidx + 1], verts[vidx + 2] }, vout[10]; - for (unsigned a = 0; a < 3; a++) + for (uint32_t a = 0; a < 3; a++) { const float eps = minDim.cell[a]; if (extent.cell[a] > eps) { - unsigned Nout = 0; + uint32_t Nout = 0; const float l = bmin[a], r = bmax[a]; - for (unsigned v = 0; v < Nin; v++) + for (uint32_t v = 0; v < Nin; v++) { bvhvec3 v0 = vin[v], v1 = vin[(v + 1) % Nin]; const bool v0in = v0[a] >= l - eps, v1in = v1[a] >= l - eps; @@ -4660,7 +5133,7 @@ bool BVH::ClipFrag( const Fragment& orig, Fragment& newFrag, bvhvec3 bmin, bvhve if (v1in) vout[Nout++] = v1; } Nin = 0; - for (unsigned v = 0; v < Nout; v++) + for (uint32_t v = 0; v < Nout; v++) { bvhvec3 v0 = vout[v], v1 = vout[(v + 1) % Nout]; const bool v0in = v0[a] <= r + eps, v1in = v1[a] <= r + eps; @@ -4674,7 +5147,7 @@ bool BVH::ClipFrag( const Fragment& orig, Fragment& newFrag, bvhvec3 bmin, bvhve } } bvhvec3 mn( BVH_FAR ), mx( -BVH_FAR ); - for (unsigned i = 0; i < Nin; i++) mn = tinybvh_min( mn, vin[i] ), mx = tinybvh_max( mx, vin[i] ); + for (uint32_t i = 0; i < Nin; i++) mn = tinybvh_min( mn, vin[i] ), mx = tinybvh_max( 
mx, vin[i] ); newFrag.primIdx = orig.primIdx; newFrag.bmin = tinybvh_max( mn, bmin ), newFrag.bmax = tinybvh_min( mx, bmax ); newFrag.clipped = 1; @@ -4682,7 +5155,7 @@ bool BVH::ClipFrag( const Fragment& orig, Fragment& newFrag, bvhvec3 bmin, bvhve } // RefitUpVerbose: Update bounding boxes of ancestors of the specified node. -void BVH::RefitUpVerbose( unsigned nodeIdx ) +void BVH::RefitUpVerbose( uint32_t nodeIdx ) { while (nodeIdx != 0xffffffff) { @@ -4697,21 +5170,21 @@ void BVH::RefitUpVerbose( unsigned nodeIdx ) // FindBestNewPosition // Part of "Fast Insertion-Based Optimization of Bounding Volume Hierarchies" -unsigned BVH::FindBestNewPosition( const unsigned Lid ) +uint32_t BVH::FindBestNewPosition( const uint32_t Lid ) { const BVHNodeVerbose& L = verbose[Lid]; const float SA_L = SA( L.aabbMin, L.aabbMax ); // reinsert L into BVH - unsigned taskNode[512], tasks = 1, Xbest = 0; + uint32_t taskNode[512], tasks = 1, Xbest = 0; float taskCi[512], taskInvCi[512], Cbest = BVH_FAR, epsilon = 1e-10f; taskNode[0] = 0 /* root */, taskCi[0] = 0, taskInvCi[0] = 1 / epsilon; while (tasks > 0) { // 'pop' task with createst taskInvCi float maxInvCi = 0; - unsigned bestTask = 0; - for (unsigned j = 0; j < tasks; j++) if (taskInvCi[j] > maxInvCi) maxInvCi = taskInvCi[j], bestTask = j; - const unsigned Xid = taskNode[bestTask]; + uint32_t bestTask = 0; + for (uint32_t j = 0; j < tasks; j++) if (taskInvCi[j] > maxInvCi) maxInvCi = taskInvCi[j], bestTask = j; + const uint32_t Xid = taskNode[bestTask]; const float CiLX = taskCi[bestTask]; taskNode[bestTask] = taskNode[--tasks], taskCi[bestTask] = taskCi[tasks], taskInvCi[bestTask] = taskInvCi[tasks]; // execute task @@ -4732,11 +5205,11 @@ unsigned BVH::FindBestNewPosition( const unsigned Lid ) // ReinsertNodeVerbose // Part of "Fast Insertion-Based Optimization of Bounding Volume Hierarchies" -void BVH::ReinsertNodeVerbose( const unsigned Lid, const unsigned Nid, const unsigned origin ) +void BVH::ReinsertNodeVerbose( const 
uint32_t Lid, const uint32_t Nid, const uint32_t origin ) { - unsigned Xbest = FindBestNewPosition( Lid ); + uint32_t Xbest = FindBestNewPosition( Lid ); if (Xbest == 0 || verbose[Xbest].parent == 0) Xbest = origin; - const unsigned X1 = verbose[Xbest].parent; + const uint32_t X1 = verbose[Xbest].parent; BVHNodeVerbose& N = verbose[Nid]; N.left = Xbest, N.right = Lid; N.aabbMin = tinybvh_min( verbose[Xbest].aabbMin, verbose[Lid].aabbMin ); @@ -4749,10 +5222,10 @@ void BVH::ReinsertNodeVerbose( const unsigned Lid, const unsigned Nid, const uns // Determine for each node in the tree the number of primitives // stored in that subtree. Helper function for MergeLeafs. -unsigned BVH::CountSubtreeTris( const unsigned nodeIdx, unsigned* counters ) +uint32_t BVH::CountSubtreeTris( const uint32_t nodeIdx, uint32_t* counters ) { BVHNodeVerbose& node = verbose[nodeIdx]; - unsigned result = node.triCount; + uint32_t result = node.triCount; if (!result) result = CountSubtreeTris( node.left, counters ) + CountSubtreeTris( node.right, counters ); counters[nodeIdx] = result; @@ -4761,7 +5234,7 @@ unsigned BVH::CountSubtreeTris( const unsigned nodeIdx, unsigned* counters ) // Write the triangle indices stored in a subtree to a continuous // slice in the 'newIdx' array. Helper function for MergeLeafs. -void BVH::MergeSubtree( const unsigned nodeIdx, unsigned* newIdx, unsigned& newIdxPtr ) +void BVH::MergeSubtree( const uint32_t nodeIdx, uint32_t* newIdx, uint32_t& newIdxPtr ) { BVHNodeVerbose& node = verbose[nodeIdx]; if (node.isLeaf()) diff --git a/tiny_bvh_fenster.cpp b/tiny_bvh_fenster.cpp index 1823a21..8325009 100644 --- a/tiny_bvh_fenster.cpp +++ b/tiny_bvh_fenster.cpp @@ -3,8 +3,6 @@ #define SCRHEIGHT 600 #include "external/fenster.h" // https://github.com/zserge/fenster -// #define USE_EMBREE // enable to verify correct implementation, win64 only for now. -#define TEST_DOUBLE // enable to verify correct implementation of double-precision path. 
#define LOADSCENE #define TINYBVH_IMPLEMENTATION @@ -13,20 +11,10 @@ using namespace tinybvh; -#if defined(USE_EMBREE) -#include "embree4/rtcore.h" -static RTCScene embreeScene; -void embreeError( void* userPtr, enum RTCError error, const char* str ) -{ - printf( "error %d: %s\n", error, str ); -} -#else BVH bvh; -#endif #ifdef LOADSCENE bvhvec4* triangles = 0; -bvhdbl3* triEx = 0; const char scene[] = "cryteksponza.bin"; #else ALIGNED( 16 ) bvhvec4 triangles[259 /* level 3 */ * 6 * 2 * 49 * 3]{}; @@ -83,35 +71,6 @@ void Init() sphere_flake( 0, 0, 0, 1.5f ); #endif -#if defined USE_EMBREE - - RTCDevice embreeDevice = rtcNewDevice( NULL ); - rtcSetDeviceErrorFunction( embreeDevice, embreeError, NULL ); - embreeScene = rtcNewScene( embreeDevice ); - RTCGeometry embreeGeom = rtcNewGeometry( embreeDevice, RTC_GEOMETRY_TYPE_TRIANGLE ); - float* vertices = (float*)rtcSetNewGeometryBuffer( embreeGeom, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT3, 3 * sizeof( float ), verts ); - unsigned* indices = (unsigned*)rtcSetNewGeometryBuffer( embreeGeom, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT3, 3 * sizeof( unsigned ), verts / 3 ); - for (int i = 0; i < verts; i++) - { - vertices[i * 3 + 0] = triangles[i].x, vertices[i * 3 + 1] = triangles[i].y; - vertices[i * 3 + 2] = triangles[i].z, indices[i] = i; // Note: not using shared vertices. 
- } - rtcCommitGeometry( embreeGeom ); - rtcAttachGeometry( embreeScene, embreeGeom ); - rtcReleaseGeometry( embreeGeom ); - rtcCommitScene( embreeScene ); - -#elif defined TEST_DOUBLE - - triEx = (tinybvh::bvhdbl3*)malloc64( verts * sizeof( tinybvh::bvhdbl3 )); - for( int i = 0; i < verts; i++ ) - triEx[i].x = (double)triangles[i].x, - triEx[i].y = (double)triangles[i].y, - triEx[i].z = (double)triangles[i].z; - bvh.BuildEx( triEx, verts / 3 ); - -#else - // build a BVH over the scene #if defined(BVH_USEAVX) bvh.BuildAVX( triangles, verts / 3 ); @@ -119,8 +78,6 @@ void Init() bvh.BuildNEON( triangles, verts / 3 ); #else bvh.Build( triangles, verts / 3 ); -#endif - #endif // load camera position / direction from file @@ -132,15 +89,12 @@ void Init() t.close(); } - void UpdateCamera(float delta_time_s, fenster& f) { bvhvec3 right = normalize( cross( bvhvec3( 0, 1, 0 ), view ) ); bvhvec3 up = 0.8f * cross( view, right ); - int64_t new_fenster_time = fenster_time(); // get camera controls. 
- if (f.keys['A']) eye += right * -1.0f * delta_time_s * 10; if (f.keys['D']) eye += right * delta_time_s * 10; if (f.keys['W']) eye += view * delta_time_s * 10; @@ -179,33 +133,13 @@ void Tick(float delta_time_s, fenster & f, uint32_t* buf) } // trace primary rays -#if defined TEST_DOUBLE - for (int i = 0; i < N; i++) - { - RayEx r( rays[i].O, rays[i].D ); - depths[i] = bvh.IntersectEx( r ) & 127; - } -#elif !defined USE_EMBREE for (int i = 0; i < N; i++) depths[i] = bvh.Intersect( rays[i] ); -#else - struct RTCRayHit rayhit; - for (int i = 0; i < N; i++) - { - rayhit.ray.org_x = rays[i].O.x, rayhit.ray.org_y = rays[i].O.y, rayhit.ray.org_z = rays[i].O.z; - rayhit.ray.dir_x = rays[i].D.x, rayhit.ray.dir_y = rays[i].D.y, rayhit.ray.dir_z = rays[i].D.z; - rayhit.ray.tnear = 0, rayhit.ray.tfar = rays[i].hit.t, rayhit.ray.mask = -1, rayhit.ray.flags = 0; - rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID, rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID; - rtcIntersect1( embreeScene, &rayhit ); - rays[i].hit.u = rayhit.hit.u, rays[i].hit.u = rayhit.hit.v; - rays[i].hit.prim = rayhit.hit.primID, rays[i].hit.t = rayhit.ray.tfar; - } -#endif // visualize result const bvhvec3 L = normalize( bvhvec3( 1, 2, 3 ) ); for (int i = 0, ty = 0; ty < SCRHEIGHT / 4; ty++) for (int tx = 0; tx < SCRWIDTH / 4; tx++) { - for (int y = 0; y < 4; y++) for (int x = 0; x < 4; x++, i++) // if (rays[i].hit.t < 10000) + for (int y = 0; y < 4; y++) for (int x = 0; x < 4; x++, i++) if (rays[i].hit.t < 10000) { int pixel_x = tx * 4 + x, pixel_y = ty * 4 + y, primIdx = rays[i].hit.prim; bvhvec3 v0 = triangles[primIdx * 3 + 0]; @@ -213,9 +147,9 @@ void Tick(float delta_time_s, fenster & f, uint32_t* buf) bvhvec3 v2 = triangles[primIdx * 3 + 2]; bvhvec3 N = normalize( cross( v1 - v0, v2 - v0 ) ); int c = (int)(255.9f * fabs( dot( N, L ) )); - // buf[pixel_x + pixel_y * SCRWIDTH] = c + (c << 8) + (c << 16); + buf[pixel_x + pixel_y * SCRWIDTH] = c + (c << 8) + (c << 16); // buf[pixel_x + pixel_y * SCRWIDTH] = 
(primIdx * 0xdeece66d + 0xb) & 0xFFFFFF; // color is hashed primitive index - buf[pixel_x + pixel_y * SCRWIDTH] = depths[i] << 17; // render depth as red + // buf[pixel_x + pixel_y * SCRWIDTH] = depths[i] << 17; // render depth as red } } tinybvh::free64( rays ); diff --git a/tiny_bvh_pt.cpp b/tiny_bvh_pt.cpp new file mode 100644 index 0000000..a9095ca --- /dev/null +++ b/tiny_bvh_pt.cpp @@ -0,0 +1,199 @@ +#define FENSTER_APP_IMPLEMENTATION +#define SCRWIDTH 800 +#define SCRHEIGHT 600 +#define TILESIZE 8 +#include "external/fenster.h" // https://github.com/zserge/fenster + +#define TINYBVH_IMPLEMENTATION +#include "tiny_bvh.h" +#include + +using namespace tinybvh; + +// Application variables +static BVH bvh; +static bvhvec4* tris = 0; +static int triCount = 0, frameIdx = 0, spp = 0; +static bvhvec3 accumulator[SCRWIDTH * SCRHEIGHT]; +static BVH::BVHLayout layout = BVH::WALD_32BYTE; + +// Setup view pyramid for a pinhole camera: +// eye, p1 (top-left), p2 (top-right) and p3 (bottom-left) +static bvhvec3 eye( 0, 30, 0 ), p1, p2, p3; +static bvhvec3 view = normalize( bvhvec3( -1, 0, 0 ) ); + +// Xor32 RNG +static unsigned RandomUInt( unsigned& seed ) { seed ^= seed << 13, seed ^= seed >> 17, seed ^= seed << 5; return seed; } +static float RandomFloat( unsigned& seed ) { return RandomUInt( seed ) * 2.3283064365387e-10f; } + +// Ray tracing math +bvhvec3 DiffuseReflection( const bvhvec3 N, unsigned& seed ) +{ + bvhvec3 R; + do + { + R = bvhvec3( RandomFloat( seed ) * 2 - 1, RandomFloat( seed ) * 2 - 1, RandomFloat( seed ) * 2 - 1 ); + } while (dot( R, R ) > 1); + return normalize( dot( R, N ) < 0 ? 
R : -R ); +} +bvhvec3 CosWeightedDiffReflection( const bvhvec3 N, unsigned& seed ) +{ + bvhvec3 R = DiffuseReflection( N, seed ); + return normalize( N + R ); +} + +// Color conversion +bvhvec3 rgb32_to_vec3( const unsigned c ) +{ + return bvhvec3( (float)(c >> 16), (float)((c >> 8) & 255), (float)(c & 255) ) * (1 / 255.f); +} + +// Geometry access +bvhvec3 TriangleColor( const unsigned idx ) { return rgb32_to_vec3( *(unsigned*)&tris[idx * 3].w ); } +bvhvec3 TriangleNormal( const unsigned idx ) +{ + bvhvec3 a = tris[idx * 3], b = tris[idx * 3 + 1], c = tris[idx * 3 + 2]; + return normalize( cross( b - a, a - c ) ); +} + +// Scene management - Append a file, with optional position, scale and color override, tinyfied +void AddMesh( const char* file, float scale = 1, bvhvec3 pos = {}, int c = 0, int N = 0 ) +{ + std::fstream s{ file, s.binary | s.in }; + s.read( (char*)&N, 4 ); + bvhvec4* data = (bvhvec4*)malloc64( (N + triCount) * 48 ); + if (tris) memcpy( data, tris, triCount * 48 ), free64( tris ); + tris = data, s.read( (char*)tris + triCount * 48, N * 48 ), triCount += N; + for (int* b = (int*)tris + (triCount - N) * 12, i = 0; i < N * 3; i++) + *(bvhvec3*)b = *(bvhvec3*)b * scale + pos, b[3] = c ? 
c : b[3], b += 4; +} + +// Application init +void Init() +{ + // load raw vertex data + AddMesh( "./testdata/cryteksponza.bin", 1, bvhvec3( 0 ), 0xffffff ); + AddMesh( "./testdata/dragon.bin", 1.1f, bvhvec3( 29, 3.01f, 0 ), 0xffbb88 ); + AddMesh( "./testdata/lucy.bin", 1.1f, bvhvec3( -2, 4.1f, -3 ), 0xaaaaff ); + AddMesh( "./testdata/bunny.bin", 0.2f, bvhvec3( -7, 0.13f, 0 ), 0x333333 ); + AddMesh( "./testdata/legocar.bin", 0.3f, bvhvec3( -12, 0.8f, -5 ) ); + AddMesh( "./testdata/armadillo.bin", 0.3f, bvhvec3( 7, 1, 3 ), 0xff2020 ); + AddMesh( "./testdata/xyzrgb_dragon.bin", 0.5f, bvhvec3( -22, 0.95f, 0 ), 0xffffaa ); + AddMesh( "./testdata/suzanne.bin", 0.2f, bvhvec3( -18, 0.95f, -16 ), 0x90ff90 ); + AddMesh( "./testdata/head.bin", 0.5f, bvhvec3( 0, 3, 9 ) ); + // build bvh + bvh.BuildAVX( tris, triCount ); +#if defined BVH_USEAVX || defined BVH_USENEON + bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH4 ); + bvh.Convert( BVH::BASIC_BVH4, BVH::BVH4_AFRA ); + layout = BVH::BVH4_AFRA; +#endif + // load camera position / direction from file + std::fstream t = std::fstream{ "camera.bin", t.binary | t.in }; + if (!t.is_open()) return; + t.read( (char*)&eye, sizeof( eye ) ); + t.read( (char*)&view, sizeof( view ) ); + t.close(); +} + +// Keyboard handling +bool UpdateCamera( float delta_time_s, fenster& f ) +{ + bvhvec3 right = normalize( cross( bvhvec3( 0, 1, 0 ), view ) ), up = 0.8f * cross( view, right ); + // get camera controls. 
+ bool moved = false; + if (f.keys['A']) eye += right * -1.0f * delta_time_s * 10, moved = true; + if (f.keys['D']) eye += right * delta_time_s * 10, moved = true; + if (f.keys['W']) eye += view * delta_time_s * 10, moved = true; + if (f.keys['S']) eye += view * -1.0f * delta_time_s * 10, moved = true; + if (f.keys['R']) eye += up * delta_time_s * 20, moved = true; + if (f.keys['F']) eye += up * -1.0f * delta_time_s * 20, moved = true; + if (f.keys[20]) view = normalize( view + right * -1.0f * delta_time_s ), moved = true; + if (f.keys[19]) view = normalize( view + right * delta_time_s ), moved = true; + if (f.keys[17]) view = normalize( view + up * -1.0f * delta_time_s ), moved = true; + if (f.keys[18]) view = normalize( view + up * delta_time_s ), moved = true; + // recalculate right, up + right = normalize( cross( bvhvec3( 0, 1, 0 ), view ) ), up = 0.8f * cross( view, right ); + bvhvec3 C = eye + 1.2f * view; + p1 = C - right + up, p2 = C + right + up, p3 = C - right - up; + return moved; +} + +// Light transport calculation - Basic recursive Path Tracer with IS and Next Event Estimation +bvhvec3 Trace( Ray ray, unsigned& seed, unsigned depth = 0 ) +{ + // find primary intersection + bvh.Intersect( ray, layout ); + // shade + if (ray.hit.t == 1e30f) return bvhvec3( 0.6f, 0.7f, 1 ); // hit nothing + bvhvec3 I = ray.O + ray.hit.t * ray.D; + bvhvec3 N = TriangleNormal( ray.hit.prim ); + if (dot( N, ray.D ) > 0) N = -N; + bvhvec3 BRDF = TriangleColor( ray.hit.prim ) * (1.0f / 3.14159f); + bvhvec3 Lpos( RandomFloat( seed ) * 30 - 15, 40, RandomFloat( seed ) * 6 - 3 ); // virtual + float dist = length( Lpos - I ); + bvhvec3 L = (Lpos - I) * (1.0f / dist); // normalize + bvhvec3 direct = {}, indirect = {}; + float NdotL = dot( N, L ), NLdotL = fabs( dot( L, bvhvec3( 0, 1, 0 ) ) ); + if (NdotL > 0) + if (!bvh.IsOccluded( Ray( I + L * 0.001f, L, dist ), layout ) ) + direct = BRDF * NdotL * NLdotL * bvhvec3( 9, 9, 8 ) * 500 * (1.0f / (dist * dist)); + // random bounce + 
if (depth < 4) + { + bvhvec3 R = CosWeightedDiffReflection( N, seed ); + float pdf = 1.0f / dot( N, R ); + bvhvec3 irradiance = Trace( Ray( I + R * 0.001f, R ), seed, depth + 1 ); + indirect = BRDF * irradiance * (1.0f / pdf); + } + // finalize + return direct + indirect; +} + +// Application Tick +void Tick( float delta_time_s, fenster& f, uint32_t* buf ) +{ + // handle user input and update camera + if (UpdateCamera( delta_time_s, f ) || frameIdx++ == 0 ) + { + memset( accumulator, 0, SCRWIDTH * SCRHEIGHT * sizeof( bvhvec3 ) ); + spp = 1; + } + + // render tiles + const int xtiles = SCRWIDTH / TILESIZE, ytiles = SCRHEIGHT / TILESIZE; + const int tiles = xtiles * ytiles; + const float scale = 1.0f / spp++; +#pragma omp parallel for schedule(dynamic) + for (int tile = 0; tile < tiles; tile++) + { + const int tx = tile % xtiles, ty = tile / xtiles; + unsigned seed = (tile + 17) * 171717 + frameIdx * 1023; + for (int y = 0; y < TILESIZE; y++) for (int x = 0; x < TILESIZE; x++) + { + const int pixel_x = tx * TILESIZE + x, pixel_y = ty * TILESIZE + y; + const int pixelIdx = pixel_x + pixel_y * SCRWIDTH; + // setup primary ray + const float u = (float)pixel_x / SCRWIDTH, v = (float)pixel_y / SCRHEIGHT; + const bvhvec3 D = normalize( p1 + u * (p2 - p1) + v * (p3 - p1) - eye ); + // trace + accumulator[pixelIdx] += Trace( Ray( eye, D ), seed ); + const bvhvec3 E = accumulator[pixelIdx] * scale; + // visualize, with a poor man's gamma correct + const int r = (int)tinybvh_min( 255.0f, sqrtf( E.x ) * 255.0f ); + const int g = (int)tinybvh_min( 255.0f, sqrtf( E.y ) * 255.0f ); + const int b = (int)tinybvh_min( 255.0f, sqrtf( E.z ) * 255.0f ); + buf[pixelIdx] = b + (g << 8) + (r << 16); + } + } +} + +// Application Shutdown +void Shutdown() +{ + // save camera position / direction to file + std::fstream s = std::fstream{ "camera.bin", s.binary | s.out }; + s.write( (char*)&eye, sizeof( eye ) ); + s.write( (char*)&view, sizeof( view ) ); + s.close(); +} \ No newline at end of 
file diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp index b4ae620..1c6eea6 100644 --- a/tiny_bvh_speedtest.cpp +++ b/tiny_bvh_speedtest.cpp @@ -68,7 +68,7 @@ ALIGNED( 64 ) bvhvec4 triangles[259 /* level 3 */ * 6 * 2 * 49 * 3]{}; int verts = 0; BVH bvh; float traceTime, buildTime, * refDist = 0, * refDistFull = 0; -unsigned refOccluded = 0, *refOccl = 0; +unsigned refOccluded = 0, * refOccl = 0; #if defined EMBREE_BUILD || defined EMBREE_TRAVERSE #include "embree4/rtcore.h" @@ -183,7 +183,7 @@ float TestShadowRays( BVH::BVHLayout layout, Ray* batch, unsigned N, unsigned pa // kernels will lead to some diverging results. We check if no more than about // 1/1000 checks differ. Shadow rays also use an origin offset, based on scene // extend, to account for limited floating point accuracy. - if (abs( (int)occluded - (int)refOccluded) > 500) // allow some slack, we're using various tri intersectors + if (abs( (int)occluded - (int)refOccluded ) > 500) // allow some slack, we're using various tri intersectors { fprintf( stderr, "\nValidation for shadow rays failed (%i != %i).\n", (int)occluded, (int)refOccluded ); exit( 1 ); @@ -288,7 +288,7 @@ int main() // setup view pyramid for a pinhole camera: // eye, p1 (top-left), p2 (top-right) and p3 (bottom-left) #ifdef LOADSPONZA - bvhvec3 eye( 0, 30, 0 ), view = normalize( bvhvec3( -8, 2, -1.7f ) ); + bvhvec3 eye( -15.24f, 21.5f, 2.54f ), view = normalize( bvhvec3( 0.826f, -0.438f, -0.356f ) ); #else bvhvec3 eye( -3.5f, -1.5f, -6.5f ), view = normalize( bvhvec3( 3, 1.5f, 5 ) ); #endif @@ -316,7 +316,7 @@ int main() float v = (float)(pixel_y * 4 + (s >> 2)) / (SCRHEIGHT * 4); bvhvec3 P = p1 + u * (p2 - p1) + v * (p3 - p1); fullBatch[Nfull++] = Ray( eye, normalize( P - eye ) ); - if ((s & 7) == 0) + if ((s & 7) == 0) { smallBatch[Nsmall] = fullBatch[Nfull - 1]; #ifdef DOUBLE_PRECISION_SUPPORT @@ -369,7 +369,7 @@ int main() printf( "- 'double' builder: " ); t.reset(); tinybvh::bvhdbl3* triEx = 
(tinybvh::bvhdbl3*)tinybvh::malloc64( verts * sizeof( tinybvh::bvhdbl3 ) ); - for( int i = 0; i < verts; i++ ) + for (int i = 0; i < verts; i++) triEx[i].x = (double)triangles[i].x, triEx[i].y = (double)triangles[i].y, triEx[i].z = (double)triangles[i].z; @@ -452,13 +452,13 @@ int main() // estimate correct shadow ray epsilon based on scene extends tinybvh::bvhvec4 bmin( 1e30f ), bmax( -1e30f ); - for( int i = 0; i < verts; i++ ) + for (int i = 0; i < verts; i++) bmin = tinybvh::tinybvh_min( bmin, triangles[i] ), bmax = tinybvh::tinybvh_max( bmax, triangles[i] ); tinybvh::bvhvec3 e = bmax - bmin; float maxExtent = tinybvh::tinybvh_max( tinybvh::tinybvh_max( e.x, e.y ), e.z ); float shadowEpsilon = maxExtent * 5e-7f; - + // setup proper shadow ray batch traceTime = TestPrimaryRays( BVH::WALD_32BYTE, smallBatch, Nsmall, 1 ); // just to generate intersection points Ray* shadowBatch = (Ray*)tinybvh::malloc64( sizeof( Ray ) * Nsmall ); @@ -472,7 +472,7 @@ int main() } // get reference shadow ray query result refOccluded = 0, refOccl = new unsigned[Nsmall]; - for (int i = 0; i < Nsmall; i++) + for (int i = 0; i < Nsmall; i++) refOccluded += (refOccl[i] = bvh.IsOccluded( shadowBatch[i], BVH::WALD_32BYTE ) ? 
1 : 0); #ifdef TRAVERSE_2WAY_ST @@ -643,7 +643,7 @@ int main() // create OpenCL buffers for the BVH data calculated by tiny_bvh.h tinyocl::Buffer gpuNodes( bvh.usedAltNodes * sizeof( BVH::BVHNodeAlt ), bvh.altNode ); tinyocl::Buffer idxData( bvh.idxCount * sizeof( unsigned ), bvh.triIdx ); - tinyocl::Buffer triData( bvh.triCount * 3 * sizeof( tinybvh::bvhvec4 ), bvh.verts ); + tinyocl::Buffer triData( bvh.triCount * 3 * sizeof( tinybvh::bvhvec4 ), triangles ); // synchronize the host-side data to the gpu side gpuNodes.CopyToDevice(); idxData.CopyToDevice(); diff --git a/tiny_bvh_test.sln b/tiny_bvh_test.sln index e799e62..023e562 100644 --- a/tiny_bvh_test.sln +++ b/tiny_bvh_test.sln @@ -11,6 +11,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tiny_bvh_speedtest", "vcpro EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tiny_bvh_minimal", "vcproj\tiny_bvh_minimal.vcxproj", "{0B5C86B2-9438-49E3-BF1A-4E1593BB436D}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tiny_bvh_pt", "vcproj\tiny_bvh_pt.vcxproj", "{430FCECF-7685-40A7-A415-4A0DE487158F}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -51,6 +53,14 @@ Global {0B5C86B2-9438-49E3-BF1A-4E1593BB436D}.Release|x64.Build.0 = Release|x64 {0B5C86B2-9438-49E3-BF1A-4E1593BB436D}.Release|x86.ActiveCfg = Release|Win32 {0B5C86B2-9438-49E3-BF1A-4E1593BB436D}.Release|x86.Build.0 = Release|Win32 + {430FCECF-7685-40A7-A415-4A0DE487158F}.Debug|x64.ActiveCfg = Debug|x64 + {430FCECF-7685-40A7-A415-4A0DE487158F}.Debug|x64.Build.0 = Debug|x64 + {430FCECF-7685-40A7-A415-4A0DE487158F}.Debug|x86.ActiveCfg = Debug|Win32 + {430FCECF-7685-40A7-A415-4A0DE487158F}.Debug|x86.Build.0 = Debug|Win32 + {430FCECF-7685-40A7-A415-4A0DE487158F}.Release|x64.ActiveCfg = Release|x64 + {430FCECF-7685-40A7-A415-4A0DE487158F}.Release|x64.Build.0 = Release|x64 + {430FCECF-7685-40A7-A415-4A0DE487158F}.Release|x86.ActiveCfg = Release|Win32 + 
{430FCECF-7685-40A7-A415-4A0DE487158F}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/vcproj/tiny_bvh_fenster.vcxproj.user b/vcproj/tiny_bvh_fenster.vcxproj.user new file mode 100644 index 0000000..58dbbfa --- /dev/null +++ b/vcproj/tiny_bvh_fenster.vcxproj.user @@ -0,0 +1,11 @@ + + + + $(SolutionDir) + WindowsLocalDebugger + + + $(SolutionDir) + WindowsLocalDebugger + + \ No newline at end of file diff --git a/vcproj/tiny_bvh_minimal.vcxproj.user b/vcproj/tiny_bvh_minimal.vcxproj.user new file mode 100644 index 0000000..58dbbfa --- /dev/null +++ b/vcproj/tiny_bvh_minimal.vcxproj.user @@ -0,0 +1,11 @@ + + + + $(SolutionDir) + WindowsLocalDebugger + + + $(SolutionDir) + WindowsLocalDebugger + + \ No newline at end of file diff --git a/vcproj/tiny_bvh_pt.vcxproj b/vcproj/tiny_bvh_pt.vcxproj new file mode 100644 index 0000000..b668694 --- /dev/null +++ b/vcproj/tiny_bvh_pt.vcxproj @@ -0,0 +1,154 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 17.0 + Win32Proj + {430FCECF-7685-40A7-A415-4A0DE487158F} + tinybvhpt + 10.0 + + + + Application + true + v143 + Unicode + + + Application + false + v143 + true + Unicode + + + Application + true + v143 + NotSet + + + Application + false + v143 + true + NotSet + + + + + + + + + + + + + + + + + + + + + $(SolutionDir) + + + $(SolutionDir) + + + + Level3 + true + WIN32;_DEBUG;_WINDOWS;%(PreprocessorDefinitions) + true + + + Windows + true + + + + + Level3 + true + true + true + WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions) + true + + + Windows + true + true + true + + + + + Level3 + true + _DEBUG;_WINDOWS;%(PreprocessorDefinitions) + true + stdcpp20 + ..\external\embree\include + false + + + Windows + true + ../external/embree/lib + $(CoreLibraryDependencies);%(AdditionalDependencies);embree4.lib;tbb12.lib + + + + + Level3 + true + true + true + 
NDEBUG;_WINDOWS;%(PreprocessorDefinitions); _CRT_SECURE_NO_WARNINGS + true + stdcpp20 + ..\external\embree\include + true + + + Windows + true + true + true + ../external/embree/lib + $(CoreLibraryDependencies);%(AdditionalDependencies);embree4.lib;tbb12.lib + + + + + + \ No newline at end of file diff --git a/vcproj/tiny_bvh_pt.vcxproj.user b/vcproj/tiny_bvh_pt.vcxproj.user new file mode 100644 index 0000000..58dbbfa --- /dev/null +++ b/vcproj/tiny_bvh_pt.vcxproj.user @@ -0,0 +1,11 @@ + + + + $(SolutionDir) + WindowsLocalDebugger + + + $(SolutionDir) + WindowsLocalDebugger + + \ No newline at end of file diff --git a/vcproj/tiny_bvh_renderer.vcxproj.user b/vcproj/tiny_bvh_renderer.vcxproj.user new file mode 100644 index 0000000..58dbbfa --- /dev/null +++ b/vcproj/tiny_bvh_renderer.vcxproj.user @@ -0,0 +1,11 @@ + + + + $(SolutionDir) + WindowsLocalDebugger + + + $(SolutionDir) + WindowsLocalDebugger + + \ No newline at end of file diff --git a/vcproj/tiny_bvh_speedtest.vcxproj.user b/vcproj/tiny_bvh_speedtest.vcxproj.user new file mode 100644 index 0000000..58dbbfa --- /dev/null +++ b/vcproj/tiny_bvh_speedtest.vcxproj.user @@ -0,0 +1,11 @@ + + + + $(SolutionDir) + WindowsLocalDebugger + + + $(SolutionDir) + WindowsLocalDebugger + + \ No newline at end of file