diff --git a/.github/workflows/tinybvh_sdk_ci.yml b/.github/workflows/tinybvh_sdk_ci.yml index 017fc97..1507e44 100644 --- a/.github/workflows/tinybvh_sdk_ci.yml +++ b/.github/workflows/tinybvh_sdk_ci.yml @@ -2,9 +2,9 @@ name: tinybvh CI on: push: - branches: [ main ] + branches: [ main, dev ] pull_request: - branches: [ main ] + branches: [ main, dev ] jobs: build: diff --git a/README.md b/README.md index 0fc5475..43da909 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,6 @@ +# dev +This is the **development branch** for tinybvh. Potentially unstable code, consider yourself warned. + # tinybvh Single-header BVH construction and traversal library written as "Sane C++" (or "C with classes"). The library has no dependencies. @@ -9,6 +12,8 @@ Single-header OpenCL library, which helps you select and initialize a device. It * Vendor and architecture detection and propagation to #defines in OpenCL code * ..And many other things. +![Rendered with tinybvh](images/test.png) + To use tinyocl, just include ````tiny_ocl.h````; this will automatically cause linking with ````OpenCL.lib```` in the 'external' folder, which in turn passes on work to vendor-specific driver code. But all that is not your problem! Note that the ````tiny_bvh.h```` library will work without ````tiny_ocl.h```` and remains dependency-free. The new ````tiny_ocl.h```` is only needed in projects that wish to trace rays _on the GPU_ using BVHs created by ````tiny_bvh.h````. @@ -31,6 +36,7 @@ A constructed BVH can be used to quickly intersect a ray with the geometry, usin The constructed BVH will have a layout suitable for construction ('````WALD_32BYTE````'). Several other layouts for the same data are available, which all serve one or more specific purposes. You can convert between layouts using ````BVH::Convert````. The available layouts are: * ````BVH::WALD_32BYTE```` : A compact format that stores the AABB for a node, along with child pointers and leaf information in a cross-platform-friendly way. 
The 32-byte size allows for cache-line alignment. +* ````BVH::ALT_SOA```` : This format stores bounding box information in a SIMD-friendly format, making the BVH faster to traverse. * ````BVH::WALD_DOUBLE```` : Double-precision version of ````BVH::WALD_32BYTE````. * ````BVH::VERBOSE```` : A format designed for modifying BVHs, e.g. for post-build optimizations using ````BVH::Optimize()````. * ````BVH::AILA_LAINE```` : This format uses 64 bytes per node and stores the AABBs of the two child nodes. This is the format presented in the [2009 Aila & Laine paper](https://research.nvidia.com/sites/default/files/pubs/2009-08_Understanding-the-Efficiency/aila2009hpg_paper.pdf) and recommended for basic GPU ray tracing. @@ -57,6 +63,10 @@ The cross-platform fenster-based single-source **bitmap renderer** can be compil ```g++ -std=c++20 -mavx -O3 -framework Cocoa tiny_bvh_fenster.cpp -o tiny_bvh_fenster``` (on macOS) +The multi-threaded **path tracing** demo can be compiled with + +````g++ -std=c++20 -mavx -mwindows -fopenmp -O3 tiny_bvh_pt.cpp -o tiny_bvh_pt```` (on windows) + The **performance measurement tool** uses OpenMP and can be compiled with: ````g++ -std=c++20 -mavx -Ofast -fopenmp tiny_bvh_speedtest.cpp -o tiny_bvh_speedtest```` diff --git a/external/fenster.h b/external/fenster.h index c823d66..89225ca 100644 --- a/external/fenster.h +++ b/external/fenster.h @@ -199,8 +199,8 @@ static LRESULT CALLBACK fenster_wndproc(HWND hwnd, UINT msg, WPARAM wParam, PAINTSTRUCT ps; HDC hdc = BeginPaint(hwnd, &ps); HDC memdc = CreateCompatibleDC(hdc); - auto hbmp = CreateCompatibleBitmap(hdc, f->width, f->height); - auto oldbmp = SelectObject(memdc, hbmp); + HBITMAP hbmp = CreateCompatibleBitmap(hdc, f->width, f->height); + HBITMAP oldbmp = static_cast<HBITMAP>(SelectObject(memdc, hbmp)); BITMAPINFO bi = {{sizeof(bi), f->width, -f->height, 1, 32, BI_BITFIELDS}}; bi.bmiColors[0].rgbRed = 0xff; bi.bmiColors[1].rgbGreen = 0xff; @@ -249,10 +249,13 @@ FENSTER_API int fenster_open(struct
fenster *f) { wc.hInstance = hInstance; wc.lpszClassName = f->title; RegisterClassEx(&wc); + RECT desiredRect = {0, 0, f->width, f->height}; + AdjustWindowRectEx(&desiredRect, WS_OVERLAPPEDWINDOW, FALSE, WS_EX_CLIENTEDGE); + int adjustedWidth = desiredRect.right - desiredRect.left; + int adjustedHeight = desiredRect.bottom - desiredRect.top; f->hwnd = CreateWindowEx(WS_EX_CLIENTEDGE, f->title, f->title, WS_OVERLAPPEDWINDOW, CW_USEDEFAULT, CW_USEDEFAULT, - f->width, f->height, NULL, NULL, hInstance, NULL); - + adjustedWidth, adjustedHeight, NULL, NULL, hInstance, NULL); if (f->hwnd == NULL) return -1; SetWindowLongPtr(f->hwnd, GWLP_USERDATA, (LONG_PTR)f); @@ -261,7 +264,10 @@ FENSTER_API int fenster_open(struct fenster *f) { return 0; } -FENSTER_API void fenster_close(struct fenster *f) { (void)f; } +FENSTER_API void fenster_close(struct fenster *f) { + PostMessage(f->hwnd, WM_CLOSE, 0, 0); + (void)f; +} FENSTER_API int fenster_loop(struct fenster *f) { MSG msg; diff --git a/images/test.png b/images/test.png new file mode 100644 index 0000000..b5487ad Binary files /dev/null and b/images/test.png differ diff --git a/testdata/armadillo.bin b/testdata/armadillo.bin new file mode 100644 index 0000000..3de5c32 Binary files /dev/null and b/testdata/armadillo.bin differ diff --git a/testdata/bunny.bin b/testdata/bunny.bin new file mode 100644 index 0000000..d9f584a Binary files /dev/null and b/testdata/bunny.bin differ diff --git a/testdata/cryteksponza.bin b/testdata/cryteksponza.bin index 7fa6e98..a96bd61 100644 Binary files a/testdata/cryteksponza.bin and b/testdata/cryteksponza.bin differ diff --git a/testdata/dragon.bin b/testdata/dragon.bin new file mode 100644 index 0000000..3ad2c33 Binary files /dev/null and b/testdata/dragon.bin differ diff --git a/testdata/happybuddha.bin b/testdata/happybuddha.bin index 9adea58..e63ba81 100644 Binary files a/testdata/happybuddha.bin and b/testdata/happybuddha.bin differ diff --git a/testdata/head.bin b/testdata/head.bin new file 
mode 100644 index 0000000..5f228d8 Binary files /dev/null and b/testdata/head.bin differ diff --git a/testdata/legocar.bin b/testdata/legocar.bin new file mode 100644 index 0000000..bf331ca Binary files /dev/null and b/testdata/legocar.bin differ diff --git a/testdata/lucy.bin b/testdata/lucy.bin new file mode 100644 index 0000000..b8c0122 Binary files /dev/null and b/testdata/lucy.bin differ diff --git a/testdata/suzanne.bin b/testdata/suzanne.bin new file mode 100644 index 0000000..e4d8269 Binary files /dev/null and b/testdata/suzanne.bin differ diff --git a/testdata/xyzrgb_dragon.bin b/testdata/xyzrgb_dragon.bin new file mode 100644 index 0000000..8349e43 Binary files /dev/null and b/testdata/xyzrgb_dragon.bin differ diff --git a/tiny_bvh.h b/tiny_bvh.h index 48001b8..be0c95d 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -23,6 +23,8 @@ THE SOFTWARE. */ // 2024 +// Dec 07: version 1.0.6 : Introduction of slices. +// Dec 05: version 1.0.5 : Bug fixes, path tracing example. // Dec 02: version 1.0.1 : Shadow ray validation, double precision. // Nov 28: version 1.0.0 : Validation in speedtest, faster tri test. // Nov 25: version 0.9.8 : FATAL_ERROR_IF interface. @@ -34,7 +36,7 @@ THE SOFTWARE. // Mov 16: version 0.9.0 : (external) OpenCL in speedtest. // Nov 15: version 0.8.3 : Incremental update / bugfixes. // Nov 14: version 0.8.0 : ARM/NEON support. -// Nov 13: version 0.7.5 : Support for WASM with EMSCRIPTEN. +// Nov 13: version 0.7.5 : Support for WASM with EMSCRIPTEN. // Nov 12: version 0.7.0 : CWBVH construction and traversal. // Nov 11: version 0.5.1 : SBVH builder, BVH4_GPU traversal. // Nov 10: version 0.4.2 : BVH4/8, gpu-friendly BVH4. @@ -90,11 +92,15 @@ THE SOFTWARE. // Jefferson Amstutz: CMake surpport // Christian Oliveros: WASM / EMSCRIPTEN support // Thierry Cantenot: user-defined alloc & free +// David Peicho: slices & Rust bindings #ifndef TINY_BVH_H_ #define TINY_BVH_H_ -// binned BVH building: bin count +// Run-time checks; disabled by default. 
+// #define PARANOID + +// Binned BVH building: bin count. #define BVHBINS 8 // SAH BVH building: Heuristic parameters @@ -127,12 +133,12 @@ THE SOFTWARE. // library version #define TINY_BVH_VERSION_MAJOR 1 #define TINY_BVH_VERSION_MINOR 0 -#define TINY_BVH_VERSION_SUB 5 +#define TINY_BVH_VERSION_SUB 6 // ============================================================================ // // P R E L I M I N A R I E S -// +// // ============================================================================ // needful includes @@ -148,6 +154,7 @@ THE SOFTWARE. #include #include #endif +#include // aligned memory allocation // note: formally size needs to be a multiple of 'alignment'. See: @@ -209,7 +216,7 @@ struct ALIGNED( 16 ) bvhvec4 bvhvec4( const float a ) : x( a ), y( a ), z( a ), w( a ) {} bvhvec4( const bvhvec3 & a ); bvhvec4( const bvhvec3 & a, float b ); - float& operator [] ( const int i ) { return cell[i]; } + float& operator [] ( const int32_t i ) { return cell[i]; } union { struct { float x, y, z, w; }; float cell[4]; }; }; @@ -219,7 +226,7 @@ struct ALIGNED( 8 ) bvhvec2 bvhvec2( const float a, const float b ) : x( a ), y( b ) {} bvhvec2( const float a ) : x( a ), y( a ) {} bvhvec2( const bvhvec4 a ) : x( a.x ), y( a.y ) {} - float& operator [] ( const int i ) { return cell[i]; } + float& operator [] ( const int32_t i ) { return cell[i]; } union { struct { float x, y; }; float cell[2]; }; }; @@ -230,34 +237,34 @@ struct bvhvec3 bvhvec3( const float a ) : x( a ), y( a ), z( a ) {} bvhvec3( const bvhvec4 a ) : x( a.x ), y( a.y ), z( a.z ) {} float halfArea() { return x < -BVH_FAR ? 
0 : (x * y + y * z + z * x); } // for SAH calculations - float& operator [] ( const int i ) { return cell[i]; } + float& operator [] ( const int32_t i ) { return cell[i]; } union { struct { float x, y, z; }; float cell[3]; }; }; struct bvhint3 { bvhint3() = default; - bvhint3( const int a, const int b, const int c ) : x( a ), y( b ), z( c ) {} - bvhint3( const int a ) : x( a ), y( a ), z( a ) {} - bvhint3( const bvhvec3& a ) { x = (int)a.x, y = (int)a.y, z = (int)a.z; } - int& operator [] ( const int i ) { return cell[i]; } - union { struct { int x, y, z; }; int cell[3]; }; + bvhint3( const int32_t a, const int32_t b, const int32_t c ) : x( a ), y( b ), z( c ) {} + bvhint3( const int32_t a ) : x( a ), y( a ), z( a ) {} + bvhint3( const bvhvec3& a ) { x = (int32_t)a.x, y = (int32_t)a.y, z = (int32_t)a.z; } + int32_t& operator [] ( const int32_t i ) { return cell[i]; } + union { struct { int32_t x, y, z; }; int32_t cell[3]; }; }; struct bvhint2 { bvhint2() = default; - bvhint2( const int a, const int b ) : x( a ), y( b ) {} - bvhint2( const int a ) : x( a ), y( a ) {} - int x, y; + bvhint2( const int32_t a, const int32_t b ) : x( a ), y( b ) {} + bvhint2( const int32_t a ) : x( a ), y( a ) {} + int32_t x, y; }; struct bvhuint2 { bvhuint2() = default; - bvhuint2( const unsigned a, const unsigned b ) : x( a ), y( b ) {} - bvhuint2( const unsigned a ) : x( a ), y( a ) {} - unsigned x, y; + bvhuint2( const uint32_t a, const uint32_t b ) : x( a ), y( b ) {} + bvhuint2( const uint32_t a ) : x( a ), y( a ) {} + uint32_t x, y; }; #ifdef TINYBVH_IMPLEMENTATION @@ -269,8 +276,26 @@ bvhvec4::bvhvec4( const bvhvec3& a, float b ) { x = a.x; y = a.y; z = a.z; w = b struct bvhaabb { - bvhvec3 minBounds; unsigned dummy1; - bvhvec3 maxBounds; unsigned dummy2; + bvhvec3 minBounds; uint32_t dummy1; + bvhvec3 maxBounds; uint32_t dummy2; +}; + +/** + * Strided slice of @ref bvhvec4 + */ +struct bvhvec4slice +{ + bvhvec4slice() = default; + /** + * @param data Pointer to the first element 
+ * @param count Number of @ref bvhvec4 in the slice, not **bytes** + * @param stride Byte stride between each @ref bvhvec4 element + */ + bvhvec4slice( const bvhvec4* data, uint32_t count, uint32_t stride = sizeof( bvhvec4 ) ); + operator bool() const { return !!data; } + const bvhvec4& operator [] ( size_t i ) const; + const int8_t* data = nullptr; + uint32_t count, stride; }; #ifdef _MSC_VER @@ -287,16 +312,16 @@ static inline float tinybvh_min( const float a, const float b ) { return a < b ? static inline float tinybvh_max( const float a, const float b ) { return a > b ? a : b; } static inline double tinybvh_min( const double a, const double b ) { return a < b ? a : b; } static inline double tinybvh_max( const double a, const double b ) { return a > b ? a : b; } -static inline int tinybvh_min( const int a, const int b ) { return a < b ? a : b; } -static inline int tinybvh_max( const int a, const int b ) { return a > b ? a : b; } -static inline unsigned tinybvh_min( const unsigned a, const unsigned b ) { return a < b ? a : b; } -static inline unsigned tinybvh_max( const unsigned a, const unsigned b ) { return a > b ? a : b; } +static inline int32_t tinybvh_min( const int32_t a, const int32_t b ) { return a < b ? a : b; } +static inline int32_t tinybvh_max( const int32_t a, const int32_t b ) { return a > b ? a : b; } +static inline uint32_t tinybvh_min( const uint32_t a, const uint32_t b ) { return a < b ? a : b; } +static inline uint32_t tinybvh_max( const uint32_t a, const uint32_t b ) { return a > b ? 
a : b; } static inline bvhvec3 tinybvh_min( const bvhvec3& a, const bvhvec3& b ) { return bvhvec3( tinybvh_min( a.x, b.x ), tinybvh_min( a.y, b.y ), tinybvh_min( a.z, b.z ) ); } static inline bvhvec4 tinybvh_min( const bvhvec4& a, const bvhvec4& b ) { return bvhvec4( tinybvh_min( a.x, b.x ), tinybvh_min( a.y, b.y ), tinybvh_min( a.z, b.z ), tinybvh_min( a.w, b.w ) ); } static inline bvhvec3 tinybvh_max( const bvhvec3& a, const bvhvec3& b ) { return bvhvec3( tinybvh_max( a.x, b.x ), tinybvh_max( a.y, b.y ), tinybvh_max( a.z, b.z ) ); } static inline bvhvec4 tinybvh_max( const bvhvec4& a, const bvhvec4& b ) { return bvhvec4( tinybvh_max( a.x, b.x ), tinybvh_max( a.y, b.y ), tinybvh_max( a.z, b.z ), tinybvh_max( a.w, b.w ) ); } static inline float tinybvh_clamp( const float x, const float a, const float b ) { return x < a ? a : (x > b ? b : x); } -static inline int tinybvh_clamp( const int x, const int a, const int b ) { return x < a ? a : (x > b ? b : x); } +static inline int32_t tinybvh_clamp( const int32_t x, const int32_t a, const int32_t b ) { return x < a ? a : (x > b ? b : x); } template <class T> inline static void tinybvh_swap( T& a, T& b ) { T t = a; a = b; b = t; } // Operator overloads. 
@@ -309,6 +334,7 @@ inline bvhvec4 operator-( const bvhvec4& a ) { return bvhvec4( -a.x, -a.y, -a.z, inline bvhvec2 operator+( const bvhvec2& a, const bvhvec2& b ) { return bvhvec2( a.x + b.x, a.y + b.y ); } inline bvhvec3 operator+( const bvhvec3& a, const bvhvec3& b ) { return bvhvec3( a.x + b.x, a.y + b.y, a.z + b.z ); } inline bvhvec4 operator+( const bvhvec4& a, const bvhvec4& b ) { return bvhvec4( a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w ); } +inline bvhvec4 operator+( const bvhvec4& a, const bvhvec3& b ) { return bvhvec4( a.x + b.x, a.y + b.y, a.z + b.z, a.w ); } inline bvhvec2 operator-( const bvhvec2& a, const bvhvec2& b ) { return bvhvec2( a.x - b.x, a.y - b.y ); } inline bvhvec3 operator-( const bvhvec3& a, const bvhvec3& b ) { return bvhvec3( a.x - b.x, a.y - b.y, a.z - b.z ); } inline bvhvec4 operator-( const bvhvec4& a, const bvhvec4& b ) { return bvhvec4( a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w ); } @@ -327,7 +353,7 @@ inline bvhvec4 operator*( float b, const bvhvec4& a ) { return bvhvec4( b * a.x, inline bvhvec2 operator/( float b, const bvhvec2& a ) { return bvhvec2( b / a.x, b / a.y ); } inline bvhvec3 operator/( float b, const bvhvec3& a ) { return bvhvec3( b / a.x, b / a.y, b / a.z ); } inline bvhvec4 operator/( float b, const bvhvec4& a ) { return bvhvec4( b / a.x, b / a.y, b / a.z, b / a.w ); } -inline bvhvec3 operator*=( bvhvec3& a, const float b ) { return bvhvec3( a.x * b, a.y * b, a.z * b ); } +inline void operator*=( bvhvec3& a, const float b ) { a.x *= b; a.y *= b; a.z *= b; } #endif // TINYBVH_USE_CUSTOM_VECTOR_TYPES @@ -360,7 +386,7 @@ struct bvhdbl3 bvhdbl3( const double a ) : x( a ), y( a ), z( a ) {} bvhdbl3( const bvhvec3 a ) : x( (double)a.x ), y( (double)a.y ), z( (double)a.z ) {} double halfArea() { return x < -BVH_FAR ? 
0 : (x * y + y * z + z * x); } // for SAH calculations - double& operator [] ( const int i ) { return cell[i]; } + double& operator [] ( const int32_t i ) { return cell[i]; } union { struct { double x, y, z; }; double cell[3]; }; }; @@ -376,10 +402,10 @@ inline bvhdbl3 operator+( const bvhdbl3& a, const bvhdbl3& b ) { return bvhdbl3( inline bvhdbl3 operator-( const bvhdbl3& a, const bvhdbl3& b ) { return bvhdbl3( a.x - b.x, a.y - b.y, a.z - b.z ); } inline void operator+=( bvhdbl3& a, const bvhdbl3& b ) { a.x += b.x; a.y += b.y; a.z += b.z; } inline bvhdbl3 operator*( const bvhdbl3& a, const bvhdbl3& b ) { return bvhdbl3( a.x * b.x, a.y * b.y, a.z * b.z ); } -inline bvhdbl3 operator*( const bvhdbl3& a, float b ) { return bvhdbl3( a.x * b, a.y * b, a.z * b ); } -inline bvhdbl3 operator*( float b, const bvhdbl3& a ) { return bvhdbl3( b * a.x, b * a.y, b * a.z ); } -inline bvhdbl3 operator/( float b, const bvhdbl3& a ) { return bvhdbl3( b / a.x, b / a.y, b / a.z ); } -inline bvhdbl3 operator*=( bvhdbl3& a, const float b ) { return bvhdbl3( a.x * b, a.y * b, a.z * b ); } +inline bvhdbl3 operator*( const bvhdbl3& a, double b ) { return bvhdbl3( a.x * b, a.y * b, a.z * b ); } +inline bvhdbl3 operator*( double b, const bvhdbl3& a ) { return bvhdbl3( b * a.x, b * a.y, b * a.z ); } +inline bvhdbl3 operator/( double b, const bvhdbl3& a ) { return bvhdbl3( b / a.x, b / a.y, b / a.z ); } +inline bvhdbl3 operator*=( bvhdbl3& a, const double b ) { return bvhdbl3( a.x * b, a.y * b, a.z * b ); } #endif // TINYBVH_USE_CUSTOM_VECTOR_TYPES @@ -408,12 +434,21 @@ inline float32x4_t SIMD_SETRVEC( float x, float y, float z, float w ) ALIGNED( 64 ) float data[4] = { x, y, z, w }; return vld1q_f32( data ); } + +inline uint32x4_t SIMD_SETRVECU( uint32_t x, uint32_t y, uint32_t z, uint32_t w ) +{ + ALIGNED( 64 ) uint32_t data[4] = { x, y, z, w }; + return vld1q_u32( data ); +} + #else typedef bvhvec4 SIMDVEC4; #define SIMD_SETVEC(a,b,c,d) bvhvec4( d, c, b, a ) #define SIMD_SETRVEC(a,b,c,d) 
bvhvec4( a, b, c, d ) #endif +#endif + // error handling #define FATAL_ERROR_IF(c,s) if (c) { fprintf( stderr, \ "Fatal error in tiny_bvh.h, line %i:\n%s\n", __LINE__, s ); exit( 1 ); } @@ -421,7 +456,7 @@ typedef bvhvec4 SIMDVEC4; // ============================================================================ // // T I N Y _ B V H I N T E R F A C E -// +// // ============================================================================ struct Intersection @@ -433,7 +468,7 @@ struct Intersection // Using this data and the original triangle data, all other info for // shading (such as normal, texture color etc.) can be reconstructed. float t, u, v; // distance along ray & barycentric coordinates of the intersection - unsigned prim; // primitive index + uint32_t prim; // primitive index }; struct Ray @@ -448,9 +483,9 @@ struct Ray O = origin, D = normalize( direction ), rD = tinybvh_safercp( D ); hit.t = t; } - ALIGNED( 16 ) bvhvec3 O; unsigned dummy1; - ALIGNED( 16 ) bvhvec3 D; unsigned dummy2; - ALIGNED( 16 ) bvhvec3 rD; unsigned dummy3; + ALIGNED( 16 ) bvhvec3 O; uint32_t dummy1; + ALIGNED( 16 ) bvhvec3 D; uint32_t dummy2; + ALIGNED( 16 ) bvhvec3 rD; uint32_t dummy3; ALIGNED( 16 ) Intersection hit; }; @@ -471,7 +506,7 @@ struct RayEx } bvhdbl3 O, D, rD; double t, u, v; - unsigned long long int primIdx; + uint64_t primIdx; }; #endif @@ -486,7 +521,7 @@ struct BVHContext class BVH { public: - enum BVHLayout { + enum BVHLayout : uint32_t { WALD_32BYTE = 1, // Default format, obtained using BVH::Build variants. WALD_DOUBLE, // Double-precision version of the default format. AILA_LAINE, // For GPU rendering. Obtained by converting WALD_32BYTE. @@ -495,14 +530,15 @@ class BVH BASIC_BVH4, // Input for BVH4_GPU conversion. Obtained by converting WALD_32BYTE. BVH4_GPU, // For fast GPU rendering. Obtained by converting BASIC_BVH4. BVH4_AFRA, // For fast CPU rendering. Obtained by converting BASIC_BVH4. + BVH4_WIVE, // For fast CPU rendering. 
Obtained by converting BASIC_BVH4. BASIC_BVH8, // Input for CWBVH. Obtained by converting WALD_32BYTE. CWBVH // Fastest GPU rendering. Obtained by converting BASIC_BVH8. }; - enum TraceDevice { + enum TraceDevice : uint32_t { USE_CPU = 1, USE_GPU }; - enum BuildFlags { + enum BuildFlags : uint32_t { NONE = 0, // Default building behavior (binned, SAH-driven). FULLSPLIT = 1 // Split as far as possible, even when SAH doesn't agree. }; @@ -510,8 +546,8 @@ class BVH { // 'Traditional' 32-byte BVH node layout, as proposed by Ingo Wald. // When aligned to a cache line boundary, two of these fit together. - bvhvec3 aabbMin; unsigned leftFirst; // 16 bytes - bvhvec3 aabbMax; unsigned triCount; // 16 bytes, total: 32 bytes + bvhvec3 aabbMin; uint32_t leftFirst; // 16 bytes + bvhvec3 aabbMax; uint32_t triCount; // 16 bytes, total: 32 bytes bool isLeaf() const { return triCount > 0; /* empty BVH leaves do not exist */ } float Intersect( const Ray& ray ) const { return BVH::IntersectAABB( ray, aabbMin, aabbMax ); } float SurfaceArea() const { return BVH::SA( aabbMin, aabbMax ); } @@ -523,8 +559,8 @@ class BVH // Compared to the default BVHNode, child node indices and triangle indices // are also expanded to 64bit values to support massive scenes. bvhdbl3 aabbMin, aabbMax; // 2x24 bytes - unsigned long long int leftFirst; // 8 bytes - unsigned long long int triCount; // 8 bytes, total: 64 bytes + uint64_t leftFirst; // 8 bytes + uint64_t triCount; // 8 bytes, total: 64 bytes bool isLeaf() const { return triCount > 0; /* empty BVH leaves do not exist */ } double Intersect( const RayEx& ray ) const; double SurfaceArea() const; @@ -533,7 +569,7 @@ class BVH { // Double-precision version of the fragment sruct. 
bvhdbl3 bmin, bmax; // AABB - unsigned long long int primIdx; // index of the original primitive + uint64_t primIdx; // index of the original primitive }; #endif struct BVHNodeAlt @@ -541,10 +577,10 @@ class BVH // Alternative 64-byte BVH node layout, which specifies the bounds of // the children rather than the node itself. This layout is used by // Aila and Laine in their seminal GPU ray tracing paper. - bvhvec3 lmin; unsigned left; - bvhvec3 lmax; unsigned right; - bvhvec3 rmin; unsigned triCount; - bvhvec3 rmax; unsigned firstTri; // total: 64 bytes + bvhvec3 lmin; uint32_t left; + bvhvec3 lmax; uint32_t right; + bvhvec3 rmin; uint32_t triCount; + bvhvec3 rmax; uint32_t firstTri; // total: 64 bytes bool isLeaf() const { return triCount > 0; } }; struct BVHNodeAlt2 @@ -552,7 +588,7 @@ class BVH // Second alternative 64-byte BVH node layout, same as BVHNodeAlt but // with child AABBs stored in SoA order. SIMDVEC4 xxxx, yyyy, zzzz; - unsigned left, right, triCount, firstTri; // total: 64 bytes + uint32_t left, right, triCount, firstTri; // total: 64 bytes bool isLeaf() const { return triCount > 0; } }; struct BVHNodeVerbose @@ -560,28 +596,28 @@ class BVH // This node layout has some extra data per node: It stores left and right // child node indices explicitly, and stores the index of the parent node. // This format exists primarily for the BVH optimizer. - bvhvec3 aabbMin; unsigned left; - bvhvec3 aabbMax; unsigned right; - unsigned triCount, firstTri, parent, dummy; + bvhvec3 aabbMin; uint32_t left; + bvhvec3 aabbMax; uint32_t right; + uint32_t triCount, firstTri, parent, dummy; bool isLeaf() const { return triCount > 0; } }; struct BVHNode4 { - // 4-wide (aka 'shallow') BVH layout. - bvhvec3 aabbMin; unsigned firstTri; - bvhvec3 aabbMax; unsigned triCount; - unsigned child[4]; - unsigned childCount, dummy1, dummy2, dummy3; // dummies are for alignment. + // 4-wide (aka 'shallow') BVH layout. 
+ bvhvec3 aabbMin; uint32_t firstTri; + bvhvec3 aabbMax; uint32_t triCount; + uint32_t child[4]; + uint32_t childCount, dummy1, dummy2, dummy3; // dummies are for alignment. bool isLeaf() const { return triCount > 0; } }; struct BVHNode4Alt { // 4-way BVH node, optimized for GPU rendering - struct aabb8 { unsigned char xmin, ymin, zmin, xmax, ymax, zmax; }; // quantized - bvhvec3 aabbMin; unsigned c0Info; // 16 - bvhvec3 aabbExt; unsigned c1Info; // 16 - aabb8 c0bounds, c1bounds; unsigned c2Info; // 16 - aabb8 c2bounds, c3bounds; unsigned c3Info; // 16; total: 64 bytes + struct aabb8 { uint8_t xmin, ymin, zmin, xmax, ymax, zmax; }; // quantized + bvhvec3 aabbMin; uint32_t c0Info; // 16 + bvhvec3 aabbExt; uint32_t c1Info; // 16 + aabb8 c0bounds, c1bounds; uint32_t c2Info; // 16 + aabb8 c2bounds, c3bounds; uint32_t c3Info; // 16; total: 64 bytes // childInfo, 32bit: // msb: 0=interior, 1=leaf // leaf: 16 bits: relative start of triangle data, 15 bits: triangle count. @@ -600,16 +636,29 @@ class BVH // Áfra, 2013. SIMDVEC4 xmin4, ymin4, zmin4; SIMDVEC4 xmax4, ymax4, zmax4; - unsigned childFirst[4]; - unsigned triCount[4]; + uint32_t childFirst[4]; + uint32_t triCount[4]; + }; + struct BVHNode4WiVe + { + // 4-way BVH node, optimized for CPU rendering. + // Based on: "Accelerated Single Ray Tracing for Wide Vector Units", + // Fuetterling1 et al., 2017. + union { SIMDVEC4 xmin4; float xmin[4]; }; + union { SIMDVEC4 xmax4; float xmax[4]; }; + union { SIMDVEC4 ymin4; float ymin[4]; }; + union { SIMDVEC4 ymax4; float ymax[4]; }; + union { SIMDVEC4 zmin4; float zmin[4]; }; + union { SIMDVEC4 zmax4; float zmax[4]; }; + // ORSTRec rec[4]; }; struct BVHNode8 { - // 8-wide (aka 'shallow') BVH layout. - bvhvec3 aabbMin; unsigned firstTri; - bvhvec3 aabbMax; unsigned triCount; - unsigned child[8]; - unsigned childCount, dummy1, dummy2, dummy3; // dummies are for alignment. + // 8-wide (aka 'shallow') BVH layout. 
+ bvhvec3 aabbMin; uint32_t firstTri; + bvhvec3 aabbMax; uint32_t triCount; + uint32_t child[8]; + uint32_t childCount, dummy1, dummy2, dummy3; // dummies are for alignment. bool isLeaf() const { return triCount > 0; } }; struct Fragment @@ -618,14 +667,14 @@ class BVH // "Parallel Spatial Splits in Bounding Volume Hierarchies", 2016, Fuetterling et al., // and refers to the potential splitting of these boxes for SBVH construction. bvhvec3 bmin; // AABB min x, y and z - unsigned primIdx; // index of the original primitive + uint32_t primIdx; // index of the original primitive bvhvec3 bmax; // AABB max x, y and z - unsigned clipped = 0; // Fragment is the result of clipping if > 0. + uint32_t clipped = 0; // Fragment is the result of clipping if > 0. bool validBox() { return bmin.x < BVH_FAR; } }; // BLASInstance: A TLAS is built over BLAS instances, where a single BLAS can be // used with multiple transforms, and multiple BLASses can be combined in a complex - // scene. The TLAS is built over the world-space AABBs of the BLAS root nodes. + // scene. The TLAS is built over the world-space AABBs of the BLAS root nodes. class BLASInstance { public: @@ -658,7 +707,7 @@ class BVH allocatedAlt4aBlocks = 0; allocatedBVH8Nodes = 0; } - float SAHCost( const unsigned nodeIdx = 0 ) const + float SAHCost( const uint32_t nodeIdx = 0 ) const { // Determine the SAH cost of the tree. This provides an indication // of the quality of the BVH: Lower is better. @@ -667,31 +716,33 @@ class BVH float cost = C_TRAV * n.SurfaceArea() + SAHCost( n.leftFirst ) + SAHCost( n.leftFirst + 1 ); return nodeIdx == 0 ? (cost / n.SurfaceArea()) : cost; } - int NodeCount( const BVHLayout layout ) const; - int PrimCount( const unsigned nodeIdx = 0 ) const + int32_t NodeCount( const BVHLayout layout ) const; + int32_t PrimCount( const uint32_t nodeIdx = 0 ) const { // Determine the total number of primitives / fragments in leaf nodes. const BVHNode& n = bvhNode[nodeIdx]; return n.isLeaf() ? 
n.triCount : (PrimCount( n.leftFirst ) + PrimCount( n.leftFirst + 1 )); } void Compact( const BVHLayout layout /* must be WALD_32BYTE or VERBOSE */ ); - void BuildQuick( const bvhvec4* vertices, const unsigned primCount ); - void Build( const bvhvec4* vertices, const unsigned primCount ); - void BuildHQ( const bvhvec4* vertices, const unsigned primCount ); + void BuildQuick( const bvhvec4* vertices, const uint32_t primCount ); + void Build( const bvhvec4* vertices, const uint32_t primCount ); + void BuildHQ( const bvhvec4slice& vertices ); + void Build( const bvhvec4slice& vertices ); + void BuildHQ( const bvhvec4* vertices, const uint32_t primCount ); #ifdef BVH_USEAVX - void BuildAVX( const bvhvec4* vertices, const unsigned primCount ); + void BuildAVX( const bvhvec4* vertices, const uint32_t primCount ); #elif defined BVH_USENEON - void BuildNEON( const bvhvec4* vertices, const unsigned primCount ); + void BuildNEON( const bvhvec4* vertices, const uint32_t primCount ); #endif - void BuildTLAS( const bvhaabb* aabbs, const unsigned aabbCount ); - void BuildTLAS( const BLASInstance* bvhs, const unsigned instCount ); + void BuildTLAS( const bvhaabb* aabbs, const uint32_t aabbCount ); + void BuildTLAS( const BLASInstance* bvhs, const uint32_t instCount ); void Convert( const BVHLayout from, const BVHLayout to, const bool deleteOriginal = false ); - void SplitLeafs( const unsigned maxPrims = 1 ); // operates on VERBOSE layout - void SplitBVH8Leaf( const unsigned nodeIdx, const unsigned maxPrims = 1 ); // operates on BVH8 layout + void SplitLeafs( const uint32_t maxPrims = 1 ); // operates on VERBOSE layout + void SplitBVH8Leaf( const uint32_t nodeIdx, const uint32_t maxPrims = 1 ); // operates on BVH8 layout void MergeLeafs(); // operates on VERBOSE layout - void Optimize( const unsigned iterations ); // operates on VERBOSE - void Refit( const BVHLayout layout = WALD_32BYTE, const unsigned nodeIdx = 0 ); - int Intersect( Ray& ray, const BVHLayout layout = WALD_32BYTE 
) const; + void Optimize( const uint32_t iterations ); // operates on VERBOSE + void Refit( const BVHLayout layout = WALD_32BYTE, const uint32_t nodeIdx = 0 ); + int32_t Intersect( Ray& ray, const BVHLayout layout = WALD_32BYTE ) const; // IntersectTLAS: Interface is under construction. Current plan: // * application constructs one or more BVHs (BLAS) using a layout of choice; // * application instantiates one or more BVHInstances using the blasses; @@ -700,48 +751,48 @@ class BVH // In the leafs of the TLAS, tiny_bvh finds a BVHInstance index. Using the index, // the BLAS is found, the ray is transformed, and the appropriate Intersect_xxx // method is called. - int IntersectTLAS( Ray& ray ) const; + int32_t IntersectTLAS( Ray& ray ) const; bool IsOccluded( const Ray& ray, const BVHLayout layout = WALD_32BYTE ) const; - void BatchIntersect( Ray* rayBatch, const unsigned N, + void BatchIntersect( Ray* rayBatch, const uint32_t N, const BVHLayout layout = WALD_32BYTE, const TraceDevice device = USE_CPU ) const; - void BatchIsOccluded( Ray* rayBatch, const unsigned N, unsigned* result, + void BatchIsOccluded( Ray* rayBatch, const uint32_t N, uint32_t* result, const BVHLayout layout = WALD_32BYTE, const TraceDevice device = USE_CPU ) const; void Intersect256Rays( Ray* first ) const; void Intersect256RaysSSE( Ray* packet ) const; // requires BVH_USEAVX private: void* AlignedAlloc( size_t size ); void AlignedFree( void* ptr ); - int Intersect_Wald32Byte( Ray& ray ) const; - int Intersect_AilaLaine( Ray& ray ) const; - int Intersect_Afra( Ray& ray ) const; - int Intersect_AltSoA( Ray& ray ) const; // requires BVH_USEAVX or BVH_USENEON. - int Intersect_BasicBVH4( Ray& ray ) const; // only for testing, not efficient. - int Intersect_BasicBVH8( Ray& ray ) const; // only for testing, not efficient. - int Intersect_Alt4BVH( Ray& ray ) const; // only for testing, not efficient. - int Intersect_CWBVH( Ray& ray ) const; // only for testing, not efficient. 
+ int32_t Intersect_Wald32Byte( Ray& ray ) const; + int32_t Intersect_AilaLaine( Ray& ray ) const; + int32_t Intersect_Afra( Ray& ray ) const; + int32_t Intersect_AltSoA( Ray& ray ) const; // requires BVH_USEAVX or BVH_USENEON. + int32_t Intersect_BasicBVH4( Ray& ray ) const; // only for testing, not efficient. + int32_t Intersect_BasicBVH8( Ray& ray ) const; // only for testing, not efficient. + int32_t Intersect_Alt4BVH( Ray& ray ) const; // only for testing, not efficient. + int32_t Intersect_CWBVH( Ray& ray ) const; // only for testing, not efficient. bool IsOccluded_Wald32Byte( const Ray& ray ) const; bool IsOccluded_AilaLaine( const Ray& ray ) const; bool IsOccluded_AltSoA( const Ray& ray ) const; bool IsOccluded_Afra( const Ray& ray ) const; - void IntersectTri( Ray& ray, const unsigned triIdx ) const; + void IntersectTri( Ray& ray, const uint32_t triIdx ) const; static float IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax ); static float SA( const bvhvec3& aabbMin, const bvhvec3& aabbMax ) { bvhvec3 e = aabbMax - aabbMin; // extent of the node return e.x * e.y + e.y * e.z + e.z * e.x; } - void PrecomputeTriangle( const bvhvec4* const vert, float* T ); + void PrecomputeTriangle( const bvhvec4slice& vert, uint32_t triIndex, float* T ); bool ClipFrag( const Fragment& orig, Fragment& newFrag, bvhvec3 bmin, bvhvec3 bmax, bvhvec3 minDim ); - void RefitUpVerbose( unsigned nodeIdx ); - unsigned FindBestNewPosition( const unsigned Lid ); - void ReinsertNodeVerbose( const unsigned Lid, const unsigned Nid, const unsigned origin ); - unsigned CountSubtreeTris( const unsigned nodeIdx, unsigned* counters ); - void MergeSubtree( const unsigned nodeIdx, unsigned* newIdx, unsigned& newIdxPtr ); + void RefitUpVerbose( uint32_t nodeIdx ); + uint32_t FindBestNewPosition( const uint32_t Lid ); + void ReinsertNodeVerbose( const uint32_t Lid, const uint32_t Nid, const uint32_t origin ); + uint32_t CountSubtreeTris( const uint32_t nodeIdx, uint32_t* 
counters ); + void MergeSubtree( const uint32_t nodeIdx, uint32_t* newIdx, uint32_t& newIdxPtr ); #ifdef DOUBLE_PRECISION_SUPPORT // double-precision things public: - void BuildEx( const bvhdbl3* vertices, const unsigned primCount ); - double SAHCostEx( const unsigned long long int nodeIdx = 0 ) const + void BuildEx( const bvhdbl3* vertices, const uint32_t primCount ); + double SAHCostEx( const uint64_t nodeIdx = 0 ) const { // Determine the SAH cost of a double-precision tree. const BVHNodeEx& n = bvhNodeEx[nodeIdx]; @@ -749,24 +800,24 @@ class BVH double cost = C_TRAV * n.SurfaceArea() + SAHCostEx( n.leftFirst ) + SAHCostEx( n.leftFirst + 1 ); return nodeIdx == 0 ? (cost / n.SurfaceArea()) : cost; } - int IntersectEx( RayEx& ray, const BVHLayout layout = WALD_DOUBLE ) const; - int IntersectEx_WaldDouble( RayEx& ray ) const; + int32_t IntersectEx( RayEx& ray, const BVHLayout layout = WALD_DOUBLE ) const; + int32_t IntersectEx_WaldDouble( RayEx& ray ) const; FragmentEx* fragEx = 0; // input primitive bounding boxes, double-precision. - unsigned long long int* triIdxEx; // primitive index array for double-precision bvh. - unsigned allocatedBVHExNodes = 0; - unsigned usedBVHExNodes = 0; + uint64_t* triIdxEx; // primitive index array for double-precision bvh. + uint32_t allocatedBVHExNodes = 0; + uint32_t usedBVHExNodes = 0; private: bvhdbl3* vertsEx = 0; // pointer to input primitive array, double-precision, 3x24 bytes per tri. BVHNodeEx* bvhNodeEx = 0; // BVH node, double precision format. #endif public: // Basic BVH data (WALD_32BYTE layout). - bvhvec4* verts = 0; // pointer to input primitive array: 3x16 bytes per tri. - unsigned triCount = 0; // number of primitives in tris. - unsigned* triIdx = 0; // primitive index array. + bvhvec4slice verts = {}; // pointer to input primitive array: 3x16 bytes per tri. + uint32_t triCount = 0; // number of primitives in tris. + uint32_t* triIdx = 0; // primitive index array. 
BVHNode* bvhNode = 0; // BVH node pool, Wald 32-byte format. Root is always in node 0. Fragment* fragment = 0; // input primitive bounding boxes. - unsigned idxCount = 0; // number of indices in triIdx. May exceed triCount * 3 for SBVH. + uint32_t idxCount = 0; // number of indices in triIdx. May exceed triCount * 3 for SBVH. BuildFlags buildFlag = NONE; // hint to the builder: currently, NONE or FULLSPLIT. // Data for various other layuouts. BVHNodeAlt* altNode = 0; // BVH node in Aila & Laine format. @@ -774,7 +825,8 @@ class BVH BVHNodeVerbose* verbose = 0; // BVH node with additional info, for BVH optimizer. BVHNode4* bvh4Node = 0; // BVH node for 4-wide BVH. bvhvec4* bvh4Alt = 0; // 64-byte 4-wide BVH node for efficient GPU rendering. - BVHNode4Alt2* bvh4Alt2 = 0; // 64-byte 4-wide BVH node for efficient CPU rendering. + BVHNode4Alt2* bvh4Alt2 = 0; // 128-byte 4-wide BVH node for efficient CPU rendering. + BVHNode4WiVe* bvh4WiVe = 0; // 128-byte 4-wide BVH node for efficient CPU rendering. bvhvec4* bvh4Tris = 0; // triangle data for BVHNode4Alt2 nodes. BVHNode8* bvh8Node = 0; // BVH node for 8-wide BVH. bvhvec4* bvh8Compact = 0; // nodes in CWBVH format. @@ -787,24 +839,26 @@ class BVH bool bvh_over_aabbs = false; // a BVH over AABBs is useful for e.g. TLAS traversal. BVHContext context; // context used to provide user-defined allocation functions // Keep track of allocated buffer size to avoid repeated allocation during layout conversion. 
- unsigned allocatedBVHNodes = 0; - unsigned allocatedAltNodes = 0; - unsigned allocatedAlt2Nodes = 0; - unsigned allocatedVerbose = 0; - unsigned allocatedBVH4Nodes = 0; - unsigned allocatedAlt4aBlocks = 0; - unsigned allocatedAlt4bNodes = 0; - unsigned allocatedBVH8Nodes = 0; - unsigned allocatedCWBVHBlocks = 0; - unsigned usedBVHNodes = 0; - unsigned usedAltNodes = 0; - unsigned usedAlt2Nodes = 0; - unsigned usedVerboseNodes = 0; - unsigned usedBVH4Nodes = 0; - unsigned usedAlt4aBlocks = 0; - unsigned usedAlt4bNodes = 0; - unsigned usedBVH8Nodes = 0; - unsigned usedCWBVHBlocks = 0; + uint32_t allocatedBVHNodes = 0; + uint32_t allocatedAltNodes = 0; + uint32_t allocatedAlt2Nodes = 0; + uint32_t allocatedVerbose = 0; + uint32_t allocatedBVH4Nodes = 0; + uint32_t allocatedAlt4aBlocks = 0; + uint32_t allocatedAlt4bNodes = 0; + uint32_t allocatedWiVeNodes = 0; + uint32_t allocatedBVH8Nodes = 0; + uint32_t allocatedCWBVHBlocks = 0; + uint32_t usedBVHNodes = 0; + uint32_t usedAltNodes = 0; + uint32_t usedAlt2Nodes = 0; + uint32_t usedVerboseNodes = 0; + uint32_t usedBVH4Nodes = 0; + uint32_t usedAlt4aBlocks = 0; + uint32_t usedAlt4bNodes = 0; + uint32_t usedWiVeNodes = 0; + uint32_t usedBVH8Nodes = 0; + uint32_t usedCWBVHBlocks = 0; }; } // namespace tinybvh @@ -812,7 +866,7 @@ class BVH // ============================================================================ // // I M P L E M E N T A T I O N -// +// // ============================================================================ #ifdef TINYBVH_IMPLEMENTATION @@ -822,6 +876,8 @@ class BVH #include <intrin.h> // for __lzcnt #endif +// We need quite a bit of type reinterpretation, so we'll +// turn off the gcc warning here until the end of the file.
#ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" @@ -829,6 +885,40 @@ class BVH namespace tinybvh { +#if defined BVH_USEAVX || defined BVH_USENEON + +static uint32_t __bfind( uint32_t x ) // https://github.com/mackron/refcode/blob/master/lzcnt.c +{ +#if defined(_MSC_VER) && !defined(__clang__) + return 31 - __lzcnt( x ); +#elif defined(__EMSCRIPTEN__) + return 31 - __builtin_clz( x ); +#elif defined(__GNUC__) || defined(__clang__) +#ifndef __APPLE__ + uint32_t r; + __asm__ __volatile__( "lzcnt{l %1, %0| %0, %1}" : "=r"(r) : "r"(x) : "cc" ); + return 31 - r; +#else + return 31 - __builtin_clz( x ); // TODO: unverified. +#endif +#endif +} + +bvhvec4::bvhvec4( const bvhvec3& a ) { x = a.x; y = a.y; z = a.z; w = 0; } +bvhvec4::bvhvec4( const bvhvec3& a, float b ) { x = a.x; y = a.y; z = a.z; w = b; } + +bvhvec4slice::bvhvec4slice( const bvhvec4* data, uint32_t count, uint32_t stride ) : + data{ reinterpret_cast<const int8_t*>(data) }, + count{ count }, stride{ stride } {} + +const bvhvec4& bvhvec4slice::operator[]( size_t i ) const +{ +#ifdef PARANOID + FATAL_ERROR_IF( i >= count, "bvhvec4slice::[..], Reading outside slice." ); +#endif + return *reinterpret_cast<const bvhvec4*>(data + stride * i); +} + void* BVH::AlignedAlloc( size_t size ) { return context.malloc ? context.malloc( size, context.userdata ) : nullptr; @@ -846,7 +936,7 @@ void BVH::BLASInstance::Update() // transform and calculate the worldspace aabb over these. worldBounds.minBounds = bvhvec3( BVH_FAR ), worldBounds.maxBounds = bvhvec3( -BVH_FAR ); bvhvec3 bmin = blas->bvhNode[0].aabbMin, bmax = blas->bvhNode[0].aabbMax; - for (int i = 0; i < 8; i++) + for (int32_t i = 0; i < 8; i++) { const bvhvec3 p( i & 1 ? bmax.x : bmin.x, i & 2 ? bmax.y : bmin.y, i & 4 ? bmax.z : bmin.z ); const bvhvec3 t = TransformPoint( p ); @@ -856,9 +946,10 @@ void BVH::BLASInstance::Update() } // BVH builder entry point for arrays of aabbs.
-void BVH::BuildTLAS( const bvhaabb* aabbs, const unsigned aabbCount ) +void BVH::BuildTLAS( const bvhaabb* aabbs, const uint32_t aabbCount ) { // the aabb array must be cacheline aligned. + FATAL_ERROR_IF( aabbCount == 0, "BVH::BuildTLAS( .. ), aabbCount == 0." ); FATAL_ERROR_IF( ((long long)(void*)aabbs & 31) != 0, "BVH::Build( bvhaabb* ), array not cacheline aligned." ); // take the array and process it fragment = (Fragment*)aabbs; @@ -867,13 +958,14 @@ void BVH::BuildTLAS( const bvhaabb* aabbs, const unsigned aabbCount ) Build( (bvhvec4*)0, aabbCount ); // TODO: for very large scenes, use BuildAVX. Mind fragment sign flip! } -void BVH::BuildTLAS( const BLASInstance* bvhs, const unsigned instCount ) +void BVH::BuildTLAS( const BLASInstance* bvhs, const uint32_t instCount ) { + FATAL_ERROR_IF( instCount == 0, "BVH::BuildTLAS( .. ), instCount == 0." ); if (!fragment) fragment = (Fragment*)AlignedAlloc( instCount ); else FATAL_ERROR_IF( instCount != triCount, "BVH::BuildTLAS( .. ), blas count changed." ); // copy relevant data from instance array triCount = instCount; - for (unsigned i = 0; i < instCount; i++) + for (uint32_t i = 0; i < instCount; i++) { fragment[i].bmin = bvhs[i].worldBounds.minBounds; fragment[i].primIdx = i; @@ -886,10 +978,11 @@ void BVH::BuildTLAS( const BLASInstance* bvhs, const unsigned instCount ) // This builder yields a correct BVH in little time, but the quality of the // structure will be low. Use this only if build time is the bottleneck in // your application (e.g., when you need to trace few rays). -void BVH::BuildQuick( const bvhvec4* vertices, const unsigned primCount ) +void BVH::BuildQuick( const bvhvec4* vertices, const uint32_t primCount ) { + FATAL_ERROR_IF( primCount == 0, "BVH::BuildQuick( .. ), primCount == 0." 
); // allocate on first build - const unsigned spaceNeeded = primCount * 2; // upper limit + const uint32_t spaceNeeded = primCount * 2; // upper limit if (allocatedBVHNodes < spaceNeeded) { AlignedFree( bvhNode ); @@ -898,19 +991,19 @@ void BVH::BuildQuick( const bvhvec4* vertices, const unsigned primCount ) bvhNode = (BVHNode*)AlignedAlloc( spaceNeeded * sizeof( BVHNode ) ); allocatedBVHNodes = spaceNeeded; memset( &bvhNode[1], 0, 32 ); // node 1 remains unused, for cache line alignment. - triIdx = (unsigned*)AlignedAlloc( primCount * sizeof( unsigned ) ); + triIdx = (uint32_t*)AlignedAlloc( primCount * sizeof( uint32_t ) ); fragment = (Fragment*)AlignedAlloc( primCount * sizeof( Fragment ) ); } else FATAL_ERROR_IF( !rebuildable, "BVH::BuildQuick( .. ), bvh not rebuildable." ); - verts = (bvhvec4*)vertices; // note: we're not copying this data; don't delete. + verts = bvhvec4slice{ vertices, primCount * 3 }; // note: we're not copying this data; don't delete. idxCount = triCount = primCount; // reset node pool - unsigned newNodePtr = 2; + uint32_t newNodePtr = 2; // assign all triangles to the root node BVHNode& root = bvhNode[0]; root.leftFirst = 0, root.triCount = triCount, root.aabbMin = bvhvec3( BVH_FAR ), root.aabbMax = bvhvec3( -BVH_FAR ); // initialize fragments and initialize root node bounds - for (unsigned i = 0; i < triCount; i++) + for (uint32_t i = 0; i < triCount; i++) { fragment[i].bmin = tinybvh_min( tinybvh_min( verts[i * 3], verts[i * 3 + 1] ), verts[i * 3 + 2] ); fragment[i].bmax = tinybvh_max( tinybvh_max( verts[i * 3], verts[i * 3 + 1] ), verts[i * 3 + 2] ); @@ -918,21 +1011,21 @@ void BVH::BuildQuick( const bvhvec4* vertices, const unsigned primCount ) root.aabbMax = tinybvh_max( root.aabbMax, fragment[i].bmax ), triIdx[i] = i; } // subdivide recursively - unsigned task[256], taskCount = 0, nodeIdx = 0; + uint32_t task[256], taskCount = 0, nodeIdx = 0; while (1) { while (1) { BVHNode& node = bvhNode[nodeIdx]; // in-place partition against 
midpoint on longest axis - unsigned j = node.leftFirst + node.triCount, src = node.leftFirst; + uint32_t j = node.leftFirst + node.triCount, src = node.leftFirst; bvhvec3 extent = node.aabbMax - node.aabbMin; - unsigned axis = 0; + uint32_t axis = 0; if (extent.y > extent.x && extent.y > extent.z) axis = 1; if (extent.z > extent.x && extent.z > extent.y) axis = 2; float splitPos = node.aabbMin[axis] + extent[axis] * 0.5f, centroid; bvhvec3 lbmin( BVH_FAR ), lbmax( -BVH_FAR ), rbmin( BVH_FAR ), rbmax( -BVH_FAR ), fmin, fmax; - for (unsigned fi, i = 0; i < node.triCount; i++) + for (uint32_t fi, i = 0; i < node.triCount; i++) { fi = triIdx[src], fmin = fragment[fi].bmin, fmax = fragment[fi].bmax; centroid = (fmin[axis] + fmax[axis]) * 0.5f; @@ -945,9 +1038,9 @@ void BVH::BuildQuick( const bvhvec4* vertices, const unsigned primCount ) } } // create child nodes - const unsigned leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; + const uint32_t leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; if (leftCount == 0 || rightCount == 0) break; // split did not work out. - const int lci = newNodePtr++, rci = newNodePtr++; + const int32_t lci = newNodePtr++, rci = newNodePtr++; bvhNode[lci].aabbMin = lbmin, bvhNode[lci].aabbMax = lbmax; bvhNode[lci].leftFirst = node.leftFirst, bvhNode[lci].triCount = leftCount; bvhNode[rci].aabbMin = rbmin, bvhNode[rci].aabbMax = rbmax; @@ -966,16 +1059,18 @@ void BVH::BuildQuick( const bvhvec4* vertices, const unsigned primCount ) usedBVHNodes = newNodePtr; } -// Basic single-function binned-SAH-builder. -// This is the reference builder; it yields a decent tree suitable for ray -// tracing on the CPU. This code uses no SIMD instructions. +// Basic single-function binned-SAH-builder. +// This is the reference builder; it yields a decent tree suitable for ray +// tracing on the CPU. This code uses no SIMD instructions. // Faster code, using SSE/AVX, is available for x64 CPUs. 
// For GPU rendering: The resulting BVH should be converted to a more optimal // format after construction. -void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) +void BVH::Build( const bvhvec4slice& vertices ) { + FATAL_ERROR_IF( vertices.count == 0, "BVH::Build( .. ), primCount == 0." ); // allocate on first build - const unsigned spaceNeeded = primCount * 2; // upper limit + const uint32_t primCount = vertices.count / 3; + const uint32_t spaceNeeded = primCount * 2; // upper limit if (allocatedBVHNodes < spaceNeeded) { AlignedFree( bvhNode ); @@ -984,15 +1079,15 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) bvhNode = (BVHNode*)AlignedAlloc( spaceNeeded * sizeof( BVHNode ) ); allocatedBVHNodes = spaceNeeded; memset( &bvhNode[1], 0, 32 ); // node 1 remains unused, for cache line alignment. - triIdx = (unsigned*)AlignedAlloc( primCount * sizeof( unsigned ) ); + triIdx = (uint32_t*)AlignedAlloc( primCount * sizeof( uint32_t ) ); if (vertices) fragment = (Fragment*)AlignedAlloc( primCount * sizeof( Fragment ) ); else FATAL_ERROR_IF( fragment == 0, "BVH::Build( 0, .. ), not called from ::Build( aabb )." ); } else FATAL_ERROR_IF( !rebuildable, "BVH::Build( .. ), bvh not rebuildable." ); - verts = (bvhvec4*)vertices; // note: we're not copying this data; don't delete. + verts = vertices; idxCount = triCount = primCount; // reset node pool - unsigned newNodePtr = 2; + uint32_t newNodePtr = 2; // assign all triangles to the root node BVHNode& root = bvhNode[0]; root.leftFirst = 0, root.triCount = triCount, root.aabbMin = bvhvec3( BVH_FAR ), root.aabbMax = bvhvec3( -BVH_FAR ); @@ -1000,7 +1095,7 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) if (verts) { // building a BVH over triangles specified as three 16-byte vertices each. 
- for (unsigned i = 0; i < triCount; i++) + for (uint32_t i = 0; i < triCount; i++) { fragment[i].bmin = tinybvh_min( tinybvh_min( verts[i * 3], verts[i * 3 + 1] ), verts[i * 3 + 2] ); fragment[i].bmax = tinybvh_max( tinybvh_max( verts[i * 3], verts[i * 3 + 1] ), verts[i * 3 + 2] ); @@ -1011,14 +1106,14 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) else { // we are building the BVH over aabbs we received from ::Build( tinyaabb* ): vertices == 0. - for (unsigned i = 0; i < triCount; i++) + for (uint32_t i = 0; i < triCount; i++) { root.aabbMin = tinybvh_min( root.aabbMin, fragment[i].bmin ); root.aabbMax = tinybvh_max( root.aabbMax, fragment[i].bmax ), triIdx[i] = i; // here: aabb index. } } // subdivide recursively - unsigned task[256], taskCount = 0, nodeIdx = 0; + uint32_t task[256], taskCount = 0, nodeIdx = 0; bvhvec3 minDim = (root.aabbMax - root.aabbMin) * 1e-20f, bestLMin = 0, bestLMax = 0, bestRMin = 0, bestRMax = 0; while (1) { @@ -1027,13 +1122,13 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) BVHNode& node = bvhNode[nodeIdx]; // find optimal object split bvhvec3 binMin[3][BVHBINS], binMax[3][BVHBINS]; - for (unsigned a = 0; a < 3; a++) for (unsigned i = 0; i < BVHBINS; i++) binMin[a][i] = BVH_FAR, binMax[a][i] = -BVH_FAR; - unsigned count[3][BVHBINS]; - memset( count, 0, BVHBINS * 3 * sizeof( unsigned ) ); + for (uint32_t a = 0; a < 3; a++) for (uint32_t i = 0; i < BVHBINS; i++) binMin[a][i] = BVH_FAR, binMax[a][i] = -BVH_FAR; + uint32_t count[3][BVHBINS]; + memset( count, 0, BVHBINS * 3 * sizeof( uint32_t ) ); const bvhvec3 rpd3 = bvhvec3( BVHBINS / (node.aabbMax - node.aabbMin) ), nmin3 = node.aabbMin; - for (unsigned i = 0; i < node.triCount; i++) // process all tris for x,y and z at once + for (uint32_t i = 0; i < node.triCount; i++) // process all tris for x,y and z at once { - const unsigned fi = triIdx[node.leftFirst + i]; + const uint32_t fi = triIdx[node.leftFirst + i]; bvhint3 bi = bvhint3( 
((fragment[fi].bmin + fragment[fi].bmax) * 0.5f - nmin3) * rpd3 ); bi.x = tinybvh_clamp( bi.x, 0, BVHBINS - 1 ); bi.y = tinybvh_clamp( bi.y, 0, BVHBINS - 1 ); @@ -1047,13 +1142,13 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) } // calculate per-split totals float splitCost = BVH_FAR, rSAV = 1.0f / node.SurfaceArea(); - unsigned bestAxis = 0, bestPos = 0; - for (int a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim[a]) + uint32_t bestAxis = 0, bestPos = 0; + for (int32_t a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim[a]) { bvhvec3 lBMin[BVHBINS - 1], rBMin[BVHBINS - 1], l1 = BVH_FAR, l2 = -BVH_FAR; bvhvec3 lBMax[BVHBINS - 1], rBMax[BVHBINS - 1], r1 = BVH_FAR, r2 = -BVH_FAR; float ANL[BVHBINS - 1], ANR[BVHBINS - 1]; - for (unsigned lN = 0, rN = 0, i = 0; i < BVHBINS - 1; i++) + for (uint32_t lN = 0, rN = 0, i = 0; i < BVHBINS - 1; i++) { lBMin[i] = l1 = tinybvh_min( l1, binMin[a][i] ); rBMin[BVHBINS - 2 - i] = r1 = tinybvh_min( r1, binMin[a][BVHBINS - 1 - i] ); @@ -1064,7 +1159,7 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) ANR[BVHBINS - 2 - i] = rN == 0 ? BVH_FAR : ((r2 - r1).halfArea() * (float)rN); } // evaluate bin totals to find best position for object split - for (unsigned i = 0; i < BVHBINS - 1; i++) + for (uint32_t i = 0; i < BVHBINS - 1; i++) { const float C = C_TRAV + rSAV * C_INT * (ANL[i] + ANR[i]); if (C < splitCost) @@ -1077,19 +1172,19 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) float noSplitCost = (float)node.triCount * C_INT; if (splitCost >= noSplitCost) break; // not splitting is better. 
// in-place partition - unsigned j = node.leftFirst + node.triCount, src = node.leftFirst; + uint32_t j = node.leftFirst + node.triCount, src = node.leftFirst; const float rpd = rpd3.cell[bestAxis], nmin = nmin3.cell[bestAxis]; - for (unsigned i = 0; i < node.triCount; i++) + for (uint32_t i = 0; i < node.triCount; i++) { - const unsigned fi = triIdx[src]; - int bi = (unsigned)(((fragment[fi].bmin[bestAxis] + fragment[fi].bmax[bestAxis]) * 0.5f - nmin) * rpd); + const uint32_t fi = triIdx[src]; + int32_t bi = (uint32_t)(((fragment[fi].bmin[bestAxis] + fragment[fi].bmax[bestAxis]) * 0.5f - nmin) * rpd); bi = tinybvh_clamp( bi, 0, BVHBINS - 1 ); - if ((unsigned)bi <= bestPos) src++; else tinybvh_swap( triIdx[src], triIdx[--j] ); + if ((uint32_t)bi <= bestPos) src++; else tinybvh_swap( triIdx[src], triIdx[--j] ); } // create child nodes - unsigned leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; + uint32_t leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; if (leftCount == 0 || rightCount == 0) break; // should not happen. - const int lci = newNodePtr++, rci = newNodePtr++; + const int32_t lci = newNodePtr++, rci = newNodePtr++; bvhNode[lci].aabbMin = bestLMin, bvhNode[lci].aabbMax = bestLMax; bvhNode[lci].leftFirst = node.leftFirst, bvhNode[lci].triCount = leftCount; bvhNode[rci].aabbMin = bestRMin, bvhNode[rci].aabbMax = bestRMax; @@ -1109,19 +1204,26 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) usedBVHNodes = newNodePtr; } +void BVH::Build( const bvhvec4* vertices, const uint32_t primCount ) +{ + Build( bvhvec4slice{ vertices, primCount * 3 } ); +} + // SBVH builder. // Besides the regular object splits used in the reference builder, the SBVH // algorithm also considers spatial splits, where primitives may be cut in // multiple parts. This increases primitive count but may reduce overlap of -// BVH nodes. The cost of each option is considered per split. 
-// For typical geometry, SBVH yields a tree that can be traversed 25% faster. -// This comes at greatly increased construction cost, making the SBVH +// BVH nodes. The cost of each option is considered per split. +// For typical geometry, SBVH yields a tree that can be traversed 25% faster. +// This comes at greatly increased construction cost, making the SBVH // primarily useful for static geometry. -void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) +void BVH::BuildHQ( const bvhvec4slice& vertices ) { + FATAL_ERROR_IF( vertices.count == 0, "BVH::BuildHQ( .. ), primCount == 0." ); // allocate on first build - const unsigned slack = primCount >> 2; // for split prims - const unsigned spaceNeeded = primCount * 3; + const uint32_t primCount = vertices.count / 3; + const uint32_t slack = primCount >> 2; // for split prims + const uint32_t spaceNeeded = primCount * 3; if (allocatedBVHNodes < spaceNeeded) { AlignedFree( bvhNode ); @@ -1130,23 +1232,23 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) bvhNode = (BVHNode*)AlignedAlloc( spaceNeeded * sizeof( BVHNode ) ); allocatedBVHNodes = spaceNeeded; memset( &bvhNode[1], 0, 32 ); // node 1 remains unused, for cache line alignment. - triIdx = (unsigned*)AlignedAlloc( (primCount + slack) * sizeof( unsigned ) ); + triIdx = (uint32_t*)AlignedAlloc( (primCount + slack) * sizeof( uint32_t ) ); fragment = (Fragment*)AlignedAlloc( (primCount + slack) * sizeof( Fragment ) ); } else FATAL_ERROR_IF( !rebuildable, "BVH::BuildHQ( .. ), bvh not rebuildable." ); - verts = (bvhvec4*)vertices; // note: we're not copying this data; don't delete. + verts = vertices; // note: we're not copying this data; don't delete. 
idxCount = primCount + slack; triCount = primCount; - unsigned* triIdxA = triIdx, * triIdxB = new unsigned[triCount + slack]; + uint32_t* triIdxA = triIdx, * triIdxB = new uint32_t[triCount + slack]; memset( triIdxA, 0, (triCount + slack) * 4 ); memset( triIdxB, 0, (triCount + slack) * 4 ); // reset node pool - unsigned newNodePtr = 2, nextFrag = triCount; + uint32_t newNodePtr = 2, nextFrag = triCount; // assign all triangles to the root node BVHNode& root = bvhNode[0]; root.leftFirst = 0, root.triCount = triCount, root.aabbMin = bvhvec3( BVH_FAR ), root.aabbMax = bvhvec3( -BVH_FAR ); // initialize fragments and initialize root node bounds - for (unsigned i = 0; i < triCount; i++) + for (uint32_t i = 0; i < triCount; i++) { fragment[i].bmin = tinybvh_min( tinybvh_min( verts[i * 3], verts[i * 3 + 1] ), verts[i * 3 + 2] ); fragment[i].bmax = tinybvh_max( tinybvh_max( verts[i * 3], verts[i * 3 + 1] ), verts[i * 3 + 2] ); @@ -1155,9 +1257,9 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) } const float rootArea = (root.aabbMax - root.aabbMin).halfArea(); // subdivide recursively - struct Task { unsigned node, sliceStart, sliceEnd, dummy; }; + struct Task { uint32_t node, sliceStart, sliceEnd, dummy; }; ALIGNED( 64 ) Task task[256]; - unsigned taskCount = 0, nodeIdx = 0, sliceStart = 0, sliceEnd = triCount + slack; + uint32_t taskCount = 0, nodeIdx = 0, sliceStart = 0, sliceEnd = triCount + slack; const bvhvec3 minDim = (root.aabbMax - root.aabbMin) * 1e-7f /* don't touch, carefully picked */; bvhvec3 bestLMin = 0, bestLMax = 0, bestRMin = 0, bestRMax = 0; while (1) @@ -1167,13 +1269,13 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) BVHNode& node = bvhNode[nodeIdx]; // find optimal object split bvhvec3 binMin[3][BVHBINS], binMax[3][BVHBINS]; - for (unsigned a = 0; a < 3; a++) for (unsigned i = 0; i < BVHBINS; i++) binMin[a][i] = BVH_FAR, binMax[a][i] = -BVH_FAR; - unsigned count[3][BVHBINS]; - memset( count, 0, 
BVHBINS * 3 * sizeof( unsigned ) ); + for (uint32_t a = 0; a < 3; a++) for (uint32_t i = 0; i < BVHBINS; i++) binMin[a][i] = BVH_FAR, binMax[a][i] = -BVH_FAR; + uint32_t count[3][BVHBINS]; + memset( count, 0, BVHBINS * 3 * sizeof( uint32_t ) ); const bvhvec3 rpd3 = bvhvec3( BVHBINS / (node.aabbMax - node.aabbMin) ), nmin3 = node.aabbMin; - for (unsigned i = 0; i < node.triCount; i++) // process all tris for x,y and z at once + for (uint32_t i = 0; i < node.triCount; i++) // process all tris for x,y and z at once { - const unsigned fi = triIdx[node.leftFirst + i]; + const uint32_t fi = triIdx[node.leftFirst + i]; bvhint3 bi = bvhint3( ((fragment[fi].bmin + fragment[fi].bmax) * 0.5f - nmin3) * rpd3 ); bi.x = tinybvh_clamp( bi.x, 0, BVHBINS - 1 ); bi.y = tinybvh_clamp( bi.y, 0, BVHBINS - 1 ); @@ -1187,13 +1289,13 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) } // calculate per-split totals float splitCost = BVH_FAR, rSAV = 1.0f / node.SurfaceArea(); - unsigned bestAxis = 0, bestPos = 0; - for (int a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) + uint32_t bestAxis = 0, bestPos = 0; + for (int32_t a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) { bvhvec3 lBMin[BVHBINS - 1], rBMin[BVHBINS - 1], l1 = BVH_FAR, l2 = -BVH_FAR; bvhvec3 lBMax[BVHBINS - 1], rBMax[BVHBINS - 1], r1 = BVH_FAR, r2 = -BVH_FAR; float ANL[BVHBINS - 1], ANR[BVHBINS - 1]; - for (unsigned lN = 0, rN = 0, i = 0; i < BVHBINS - 1; i++) + for (uint32_t lN = 0, rN = 0, i = 0; i < BVHBINS - 1; i++) { lBMin[i] = l1 = tinybvh_min( l1, binMin[a][i] ); rBMin[BVHBINS - 2 - i] = r1 = tinybvh_min( r1, binMin[a][BVHBINS - 1 - i] ); @@ -1204,7 +1306,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) ANR[BVHBINS - 2 - i] = rN == 0 ? 
BVH_FAR : ((r2 - r1).halfArea() * (float)rN); } // evaluate bin totals to find best position for object split - for (unsigned i = 0; i < BVHBINS - 1; i++) + for (uint32_t i = 0; i < BVHBINS - 1; i++) { const float C = C_TRAV + C_INT * rSAV * (ANL[i] + ANR[i]); if (C < splitCost) @@ -1216,25 +1318,25 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) } // consider a spatial split bool spatial = false; - unsigned NL[BVHBINS - 1], NR[BVHBINS - 1], budget = sliceEnd - sliceStart; + uint32_t NL[BVHBINS - 1], NR[BVHBINS - 1], budget = sliceEnd - sliceStart; bvhvec3 spatialUnion = bestLMax - bestRMin; float spatialOverlap = (spatialUnion.halfArea()) / rootArea; if (budget > node.triCount && splitCost < BVH_FAR && spatialOverlap > 1e-5f) { - for (unsigned a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) + for (uint32_t a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) { // setup bins bvhvec3 binMin[BVHBINS], binMax[BVHBINS]; - for (unsigned i = 0; i < BVHBINS; i++) binMin[i] = BVH_FAR, binMax[i] = -BVH_FAR; - unsigned countIn[BVHBINS] = { 0 }, countOut[BVHBINS] = { 0 }; + for (uint32_t i = 0; i < BVHBINS; i++) binMin[i] = BVH_FAR, binMax[i] = -BVH_FAR; + uint32_t countIn[BVHBINS] = { 0 }, countOut[BVHBINS] = { 0 }; // populate bins with clipped fragments const float planeDist = (node.aabbMax[a] - node.aabbMin[a]) / (BVHBINS * 0.9999f); const float rPlaneDist = 1.0f / planeDist, nodeMin = node.aabbMin[a]; - for (unsigned i = 0; i < node.triCount; i++) + for (uint32_t i = 0; i < node.triCount; i++) { - const unsigned fragIdx = triIdxA[node.leftFirst + i]; - const int bin1 = tinybvh_clamp( (int)((fragment[fragIdx].bmin[a] - nodeMin) * rPlaneDist), 0, BVHBINS - 1 ); - const int bin2 = tinybvh_clamp( (int)((fragment[fragIdx].bmax[a] - nodeMin) * rPlaneDist), 0, BVHBINS - 1 ); + const uint32_t fragIdx = triIdxA[node.leftFirst + i]; + const int32_t bin1 = tinybvh_clamp( 
(int32_t)((fragment[fragIdx].bmin[a] - nodeMin) * rPlaneDist), 0, BVHBINS - 1 ); + const int32_t bin2 = tinybvh_clamp( (int32_t)((fragment[fragIdx].bmax[a] - nodeMin) * rPlaneDist), 0, BVHBINS - 1 ); countIn[bin1]++, countOut[bin2]++; if (bin2 == bin1) { @@ -1242,7 +1344,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) binMin[bin1] = tinybvh_min( binMin[bin1], fragment[fragIdx].bmin ); binMax[bin1] = tinybvh_max( binMax[bin1], fragment[fragIdx].bmax ); } - else for (int j = bin1; j <= bin2; j++) + else for (int32_t j = bin1; j <= bin2; j++) { // clip fragment to each bin it overlaps bvhvec3 bmin = node.aabbMin, bmax = node.aabbMax; @@ -1259,7 +1361,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) bvhvec3 lBMin[BVHBINS - 1], rBMin[BVHBINS - 1], l1 = BVH_FAR, l2 = -BVH_FAR; bvhvec3 lBMax[BVHBINS - 1], rBMax[BVHBINS - 1], r1 = BVH_FAR, r2 = -BVH_FAR; float ANL[BVHBINS], ANR[BVHBINS]; - for (unsigned lN = 0, rN = 0, i = 0; i < BVHBINS - 1; i++) + for (uint32_t lN = 0, rN = 0, i = 0; i < BVHBINS - 1; i++) { lBMin[i] = l1 = tinybvh_min( l1, binMin[i] ), rBMin[BVHBINS - 2 - i] = r1 = tinybvh_min( r1, binMin[BVHBINS - 1 - i] ); lBMax[i] = l2 = tinybvh_max( l2, binMax[i] ), rBMax[BVHBINS - 2 - i] = r2 = tinybvh_max( r2, binMax[BVHBINS - 1 - i] ); @@ -1268,7 +1370,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) ANR[BVHBINS - 2 - i] = rN == 0 ? BVH_FAR : ((r2 - r1).halfArea() * (float)rN); } // find best position for spatial split - for (unsigned i = 0; i < BVHBINS - 1; i++) + for (uint32_t i = 0; i < BVHBINS - 1; i++) { const float Cspatial = C_TRAV + C_INT * rSAV * (ANL[i] + ANR[i]); if (Cspatial < splitCost && NL[i] + NR[i] < budget) @@ -1284,16 +1386,16 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) float noSplitCost = (float)node.triCount * C_INT; if (splitCost >= noSplitCost) break; // not splitting is better. 
// double-buffered partition - unsigned A = sliceStart, B = sliceEnd, src = node.leftFirst; + uint32_t A = sliceStart, B = sliceEnd, src = node.leftFirst; if (spatial) { const float planeDist = (node.aabbMax[bestAxis] - node.aabbMin[bestAxis]) / (BVHBINS * 0.9999f); const float rPlaneDist = 1.0f / planeDist, nodeMin = node.aabbMin[bestAxis]; - for (unsigned i = 0; i < node.triCount; i++) + for (uint32_t i = 0; i < node.triCount; i++) { - const unsigned fragIdx = triIdxA[src++]; - const unsigned bin1 = (unsigned)((fragment[fragIdx].bmin[bestAxis] - nodeMin) * rPlaneDist); - const unsigned bin2 = (unsigned)((fragment[fragIdx].bmax[bestAxis] - nodeMin) * rPlaneDist); + const uint32_t fragIdx = triIdxA[src++]; + const uint32_t bin1 = (uint32_t)((fragment[fragIdx].bmin[bestAxis] - nodeMin) * rPlaneDist); + const uint32_t bin2 = (uint32_t)((fragment[fragIdx].bmax[bestAxis] - nodeMin) * rPlaneDist); if (bin2 <= bestPos) triIdxB[A++] = fragIdx; else if (bin1 > bestPos) triIdxB[--B] = fragIdx; else { // split straddler @@ -1310,20 +1412,20 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) { // object partitioning const float rpd = rpd3.cell[bestAxis], nmin = nmin3.cell[bestAxis]; - for (unsigned i = 0; i < node.triCount; i++) + for (uint32_t i = 0; i < node.triCount; i++) { - const unsigned fr = triIdx[src + i]; - int bi = (int)(((fragment[fr].bmin[bestAxis] + fragment[fr].bmax[bestAxis]) * 0.5f - nmin) * rpd); + const uint32_t fr = triIdx[src + i]; + int32_t bi = (int32_t)(((fragment[fr].bmin[bestAxis] + fragment[fr].bmax[bestAxis]) * 0.5f - nmin) * rpd); bi = tinybvh_clamp( bi, 0, BVHBINS - 1 ); - if (bi <= (int)bestPos) triIdxB[A++] = fr; else triIdxB[--B] = fr; + if (bi <= (int32_t)bestPos) triIdxB[A++] = fr; else triIdxB[--B] = fr; } } // copy back slice data memcpy( triIdxA + sliceStart, triIdxB + sliceStart, (sliceEnd - sliceStart) * 4 ); // create child nodes - unsigned leftCount = A - sliceStart, rightCount = sliceEnd - B; + uint32_t 
leftCount = A - sliceStart, rightCount = sliceEnd - B; if (leftCount == 0 || rightCount == 0) break; - int leftChildIdx = newNodePtr++, rightChildIdx = newNodePtr++; + int32_t leftChildIdx = newNodePtr++, rightChildIdx = newNodePtr++; bvhNode[leftChildIdx].aabbMin = bestLMin, bvhNode[leftChildIdx].aabbMax = bestLMax; bvhNode[leftChildIdx].leftFirst = sliceStart, bvhNode[leftChildIdx].triCount = leftCount; bvhNode[rightChildIdx].aabbMin = bestRMin, bvhNode[rightChildIdx].aabbMax = bestRMax; @@ -1342,7 +1444,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) sliceEnd = task[taskCount].sliceEnd; } // clean up - for (unsigned i = 0; i < triCount + slack; i++) triIdx[i] = fragment[triIdx[i]].primIdx; + for (uint32_t i = 0; i < triCount + slack; i++) triIdx[i] = fragment[triIdx[i]].primIdx; // Compact(); - TODO // all done. refittable = false; // can't refit an SBVH @@ -1351,13 +1453,18 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) usedBVHNodes = newNodePtr; } +void BVH::BuildHQ( const bvhvec4* vertices, const uint32_t primCount ) +{ + BuildHQ( bvhvec4slice{ vertices, primCount * 3 } ); +} + // Convert: Change the BVH layout from one format into another. void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* deleteOriginal */ ) { if (from == WALD_32BYTE && to == AILA_LAINE) { // allocate space - const unsigned spaceNeeded = usedBVHNodes; + const uint32_t spaceNeeded = usedBVHNodes; if (allocatedAltNodes < spaceNeeded) { FATAL_ERROR_IF( bvhNode == 0, "BVH::Convert( WALD_32BYTE, AILA_LAINE ), bvhNode == 0." 
); @@ -1367,18 +1474,18 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } memset( altNode, 0, sizeof( BVHNodeAlt ) * spaceNeeded ); // recursively convert nodes - unsigned newAltNode = 0, nodeIdx = 0, stack[128], stackPtr = 0; + uint32_t newAltNode = 0, nodeIdx = 0, stack[128], stackPtr = 0; while (1) { const BVHNode& node = bvhNode[nodeIdx]; - const unsigned idx = newAltNode++; + const uint32_t idx = newAltNode++; if (node.isLeaf()) { altNode[idx].triCount = node.triCount; altNode[idx].firstTri = node.leftFirst; if (!stackPtr) break; nodeIdx = stack[--stackPtr]; - unsigned newNodeParent = stack[--stackPtr]; + uint32_t newNodeParent = stack[--stackPtr]; altNode[newNodeParent].right = newAltNode; } else @@ -1398,7 +1505,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet else if (from == WALD_32BYTE && to == ALT_SOA) { // allocate space - const unsigned spaceNeeded = usedBVHNodes; + const uint32_t spaceNeeded = usedBVHNodes; if (allocatedAlt2Nodes < spaceNeeded) { FATAL_ERROR_IF( bvhNode == 0, "BVH::Convert( WALD_32BYTE, ALT_SOA ), bvhNode == 0." 
); @@ -1408,18 +1515,18 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } memset( alt2Node, 0, sizeof( BVHNodeAlt2 ) * spaceNeeded ); // recursively convert nodes - unsigned newAlt2Node = 0, nodeIdx = 0, stack[128], stackPtr = 0; + uint32_t newAlt2Node = 0, nodeIdx = 0, stack[128], stackPtr = 0; while (1) { const BVHNode& node = bvhNode[nodeIdx]; - const unsigned idx = newAlt2Node++; + const uint32_t idx = newAlt2Node++; if (node.isLeaf()) { alt2Node[idx].triCount = node.triCount; alt2Node[idx].firstTri = node.leftFirst; if (!stackPtr) break; nodeIdx = stack[--stackPtr]; - unsigned newNodeParent = stack[--stackPtr]; + uint32_t newNodeParent = stack[--stackPtr]; alt2Node[newNodeParent].right = newAlt2Node; } else @@ -1442,7 +1549,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet else if (from == WALD_32BYTE && to == VERBOSE) { // allocate space - unsigned spaceNeeded = triCount * (refittable ? 2 : 3); // this one needs space to grow to 2N + uint32_t spaceNeeded = triCount * (refittable ? 2 : 3); // this one needs space to grow to 2N if (allocatedVerbose < spaceNeeded) { FATAL_ERROR_IF( bvhNode == 0, "BVH::Convert( WALD_32BYTE, VERBOSE ), bvhNode == 0." 
); @@ -1453,7 +1560,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet memset( verbose, 0, sizeof( BVHNodeVerbose ) * spaceNeeded ); verbose[0].parent = 0xffffffff; // root sentinel // convert - unsigned nodeIdx = 0, parent = 0xffffffff, stack[128], stackPtr = 0; + uint32_t nodeIdx = 0, parent = 0xffffffff, stack[128], stackPtr = 0; while (1) { const BVHNode& node = bvhNode[nodeIdx]; @@ -1481,7 +1588,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet else if (from == WALD_32BYTE && to == BASIC_BVH4) { // allocate space - const unsigned spaceNeeded = usedBVHNodes; + const uint32_t spaceNeeded = usedBVHNodes; if (allocatedBVH4Nodes < spaceNeeded) { FATAL_ERROR_IF( bvhNode == 0, "BVH::Convert( WALD_32BYTE, BASIC_BVH4 ), bvhNode == 0." ); @@ -1491,7 +1598,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } memset( bvh4Node, 0, sizeof( BVHNode4 ) * spaceNeeded ); // create an mbvh node for each bvh2 node - for (unsigned i = 0; i < usedBVHNodes; i++) if (i != 1) + for (uint32_t i = 0; i < usedBVHNodes; i++) if (i != 1) { BVHNode& orig = bvhNode[i]; BVHNode4& node4 = bvh4Node[i]; @@ -1500,15 +1607,15 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet else node4.child[0] = orig.leftFirst, node4.child[1] = orig.leftFirst + 1, node4.childCount = 2; } // collapse - unsigned stack[128], stackPtr = 1, nodeIdx = stack[0] = 0; // i.e., root node + uint32_t stack[128], stackPtr = 1, nodeIdx = stack[0] = 0; // i.e., root node while (1) { BVHNode4& node = bvh4Node[nodeIdx]; while (node.childCount < 4) { - int bestChild = -1; + int32_t bestChild = -1; float bestChildSA = 0; - for (unsigned i = 0; i < node.childCount; i++) + for (uint32_t i = 0; i < node.childCount; i++) { // see if we can adopt child i const BVHNode4& child = bvh4Node[node.child[i]]; @@ -1521,13 +1628,13 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* 
delet if (bestChild == -1) break; // could not adopt const BVHNode4& child = bvh4Node[node.child[bestChild]]; node.child[bestChild] = child.child[0]; - for (unsigned i = 1; i < child.childCount; i++) + for (uint32_t i = 1; i < child.childCount; i++) node.child[node.childCount++] = child.child[i]; } // we're done with the node; proceed with the children - for (unsigned i = 0; i < node.childCount; i++) + for (uint32_t i = 0; i < node.childCount; i++) { - const unsigned childIdx = node.child[i]; + const uint32_t childIdx = node.child[i]; const BVHNode4& child = bvh4Node[childIdx]; if (!child.isLeaf()) stack[stackPtr++] = childIdx; } @@ -1546,7 +1653,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet // Leaf: 15 bits for tri count, 16 for offset // Interior: 32 bits for position of child node. // Triangle data ('by value') immediately follows each leaf node. - unsigned blocksNeeded = usedBVH4Nodes * 4; // here, 'block' is 16 bytes. + uint32_t blocksNeeded = usedBVH4Nodes * 4; // here, 'block' is 16 bytes. blocksNeeded += 6 * triCount; // this layout stores tris in the same buffer. if (allocatedAlt4aBlocks < blocksNeeded) { @@ -1557,14 +1664,14 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } memset( bvh4Alt, 0, 16 * blocksNeeded ); // start conversion - unsigned nodeIdx = 0, newAlt4Ptr = 0, stack[128], stackPtr = 0, retValPos = 0; + uint32_t nodeIdx = 0, newAlt4Ptr = 0, stack[128], stackPtr = 0, retValPos = 0; while (1) { const BVHNode4& node = bvh4Node[nodeIdx]; // convert BVH4 node - must be an interior node. 
assert( !bvh4Node[nodeIdx].isLeaf() ); bvhvec4* nodeBase = bvh4Alt + newAlt4Ptr; - unsigned baseAlt4Ptr = newAlt4Ptr; + uint32_t baseAlt4Ptr = newAlt4Ptr; newAlt4Ptr += 4; nodeBase[0] = bvhvec4( node.aabbMin, 0 ); nodeBase[1] = bvhvec4( (node.aabbMax - node.aabbMin) * (1.0f / 255.0f), 0 ); @@ -1573,17 +1680,17 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet &bvh4Node[node.child[2]], &bvh4Node[node.child[3]] }; // start with leaf child node conversion - unsigned childInfo[4] = { 0, 0, 0, 0 }; // will store in final fields later - for (int i = 0; i < 4; i++) if (childNode[i]->isLeaf()) + uint32_t childInfo[4] = { 0, 0, 0, 0 }; // will store in final fields later + for (int32_t i = 0; i < 4; i++) if (childNode[i]->isLeaf()) { childInfo[i] = newAlt4Ptr - baseAlt4Ptr; childInfo[i] |= childNode[i]->triCount << 16; childInfo[i] |= 0x80000000; - for (unsigned j = 0; j < childNode[i]->triCount; j++) + for (uint32_t j = 0; j < childNode[i]->triCount; j++) { - unsigned t = triIdx[childNode[i]->firstTri + j]; + uint32_t t = triIdx[childNode[i]->firstTri + j]; #ifdef BVH4_GPU_COMPRESSED_TRIS - PrecomputeTriangle( verts + t * 3, (float*)&bvh4Alt[newAlt4Ptr] ); + PrecomputeTriangle( verts, t * 3, (float*)&bvh4Alt[newAlt4Ptr] ); bvh4Alt[newAlt4Ptr + 3] = bvhvec4( 0, 0, 0, *(float*)&t ); newAlt4Ptr += 4; #else @@ -1597,12 +1704,12 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } } // process interior nodes - for (int i = 0; i < 4; i++) if (!childNode[i]->isLeaf()) + for (int32_t i = 0; i < 4; i++) if (!childNode[i]->isLeaf()) { // childInfo[i] = node.child[i] == 0 ? 
0 : GPUFormatBVH4( node.child[i] ); if (node.child[i] == 0) childInfo[i] = 0; else { - stack[stackPtr++] = (unsigned)(((float*)&nodeBase[3] + i) - (float*)bvh4Alt); + stack[stackPtr++] = (uint32_t)(((float*)&nodeBase[3] + i) - (float*)bvh4Alt); stack[stackPtr++] = node.child[i]; } } @@ -1612,36 +1719,36 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet scale.x = extent.x > 1e-10f ? (254.999f / extent.x) : 0; scale.y = extent.y > 1e-10f ? (254.999f / extent.y) : 0; scale.z = extent.z > 1e-10f ? (254.999f / extent.z) : 0; - unsigned char* slot0 = (unsigned char*)&nodeBase[0] + 12; // 4 chars - unsigned char* slot1 = (unsigned char*)&nodeBase[1] + 12; // 4 chars - unsigned char* slot2 = (unsigned char*)&nodeBase[2]; // 16 chars + uint8_t* slot0 = (uint8_t*)&nodeBase[0] + 12; // 4 chars + uint8_t* slot1 = (uint8_t*)&nodeBase[1] + 12; // 4 chars + uint8_t* slot2 = (uint8_t*)&nodeBase[2]; // 16 chars if (node.child[0]) { const bvhvec3 relBMin = childNode[0]->aabbMin - node.aabbMin, relBMax = childNode[0]->aabbMax - node.aabbMin; - slot0[0] = (unsigned char)floorf( relBMin.x * scale.x ), slot1[0] = (unsigned char)ceilf( relBMax.x * scale.x ); - slot2[0] = (unsigned char)floorf( relBMin.y * scale.y ), slot2[4] = (unsigned char)ceilf( relBMax.y * scale.y ); - slot2[8] = (unsigned char)floorf( relBMin.z * scale.z ), slot2[12] = (unsigned char)ceilf( relBMax.z * scale.z ); + slot0[0] = (uint8_t)floorf( relBMin.x * scale.x ), slot1[0] = (uint8_t)ceilf( relBMax.x * scale.x ); + slot2[0] = (uint8_t)floorf( relBMin.y * scale.y ), slot2[4] = (uint8_t)ceilf( relBMax.y * scale.y ); + slot2[8] = (uint8_t)floorf( relBMin.z * scale.z ), slot2[12] = (uint8_t)ceilf( relBMax.z * scale.z ); } if (node.child[1]) { const bvhvec3 relBMin = childNode[1]->aabbMin - node.aabbMin, relBMax = childNode[1]->aabbMax - node.aabbMin; - slot0[1] = (unsigned char)floorf( relBMin.x * scale.x ), slot1[1] = (unsigned char)ceilf( relBMax.x * scale.x ); - slot2[1] = (unsigned 
char)floorf( relBMin.y * scale.y ), slot2[5] = (unsigned char)ceilf( relBMax.y * scale.y ); - slot2[9] = (unsigned char)floorf( relBMin.z * scale.z ), slot2[13] = (unsigned char)ceilf( relBMax.z * scale.z ); + slot0[1] = (uint8_t)floorf( relBMin.x * scale.x ), slot1[1] = (uint8_t)ceilf( relBMax.x * scale.x ); + slot2[1] = (uint8_t)floorf( relBMin.y * scale.y ), slot2[5] = (uint8_t)ceilf( relBMax.y * scale.y ); + slot2[9] = (uint8_t)floorf( relBMin.z * scale.z ), slot2[13] = (uint8_t)ceilf( relBMax.z * scale.z ); } if (node.child[2]) { const bvhvec3 relBMin = childNode[2]->aabbMin - node.aabbMin, relBMax = childNode[2]->aabbMax - node.aabbMin; - slot0[2] = (unsigned char)floorf( relBMin.x * scale.x ), slot1[2] = (unsigned char)ceilf( relBMax.x * scale.x ); - slot2[2] = (unsigned char)floorf( relBMin.y * scale.y ), slot2[6] = (unsigned char)ceilf( relBMax.y * scale.y ); - slot2[10] = (unsigned char)floorf( relBMin.z * scale.z ), slot2[14] = (unsigned char)ceilf( relBMax.z * scale.z ); + slot0[2] = (uint8_t)floorf( relBMin.x * scale.x ), slot1[2] = (uint8_t)ceilf( relBMax.x * scale.x ); + slot2[2] = (uint8_t)floorf( relBMin.y * scale.y ), slot2[6] = (uint8_t)ceilf( relBMax.y * scale.y ); + slot2[10] = (uint8_t)floorf( relBMin.z * scale.z ), slot2[14] = (uint8_t)ceilf( relBMax.z * scale.z ); } if (node.child[3]) { const bvhvec3 relBMin = childNode[3]->aabbMin - node.aabbMin, relBMax = childNode[3]->aabbMax - node.aabbMin; - slot0[3] = (unsigned char)floorf( relBMin.x * scale.x ), slot1[3] = (unsigned char)ceilf( relBMax.x * scale.x ); - slot2[3] = (unsigned char)floorf( relBMin.y * scale.y ), slot2[7] = (unsigned char)ceilf( relBMax.y * scale.y ); - slot2[11] = (unsigned char)floorf( relBMin.z * scale.z ), slot2[15] = (unsigned char)ceilf( relBMax.z * scale.z ); + slot0[3] = (uint8_t)floorf( relBMin.x * scale.x ), slot1[3] = (uint8_t)ceilf( relBMax.x * scale.x ); + slot2[3] = (uint8_t)floorf( relBMin.y * scale.y ), slot2[7] = (uint8_t)ceilf( relBMax.y * scale.y ); + 
slot2[11] = (uint8_t)floorf( relBMin.z * scale.z ), slot2[15] = (uint8_t)ceilf( relBMax.z * scale.z ); } // finalize node nodeBase[3] = bvhvec4( @@ -1649,7 +1756,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet *(float*)&childInfo[2], *(float*)&childInfo[3] ); // pop new work from the stack - if (retValPos > 0) ((unsigned*)bvh4Alt)[retValPos] = baseAlt4Ptr; + if (retValPos > 0) ((uint32_t*)bvh4Alt)[retValPos] = baseAlt4Ptr; if (stackPtr == 0) break; nodeIdx = stack[--stackPtr]; retValPos = stack[--stackPtr]; @@ -1661,7 +1768,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet // Convert a 4-wide BVH to a format suitable for CPU traversal. // See Faster Incoherent Ray Traversal Using 8-Wide AVX InstructionsLayout, // Atilla T. Áfra, 2013. - unsigned spaceNeeded = usedBVH4Nodes; + uint32_t spaceNeeded = usedBVH4Nodes; if (allocatedAlt4bNodes < spaceNeeded) { FATAL_ERROR_IF( bvh4Node == 0, "BVH::Convert( BASIC_BVH4, BVH4_AFRA ), bvh4Node == 0." 
); @@ -1673,13 +1780,13 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } memset( bvh4Alt2, 0, spaceNeeded * sizeof( BVHNode4Alt2 ) ); // start conversion - unsigned newAlt4Ptr = 0, nodeIdx = 0, stack[128], stackPtr = 0; + uint32_t newAlt4Ptr = 0, nodeIdx = 0, stack[128], stackPtr = 0; while (1) { const BVHNode4& orig = bvh4Node[nodeIdx]; BVHNode4Alt2& newNode = bvh4Alt2[newAlt4Ptr++]; - int cidx = 0; - for (int i = 0; i < 4; i++) if (orig.child[i]) + int32_t cidx = 0; + for (int32_t i = 0; i < 4; i++) if (orig.child[i]) { const BVHNode4& child = bvh4Node[orig.child[i]]; ((float*)&newNode.xmin4)[cidx] = child.aabbMin.x; @@ -1692,7 +1799,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet newNode.childFirst[cidx] = child.firstTri, newNode.triCount[cidx] = child.triCount; else - stack[stackPtr++] = (unsigned)((unsigned*)&newNode.childFirst[cidx] - (unsigned*)bvh4Alt2), + stack[stackPtr++] = (uint32_t)((uint32_t*)&newNode.childFirst[cidx] - (uint32_t*)bvh4Alt2), stack[stackPtr++] = orig.child[i]; cidx++; } @@ -1705,28 +1812,28 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet // pop next task if (!stackPtr) break; nodeIdx = stack[--stackPtr]; - unsigned offset = stack[--stackPtr]; - ((unsigned*)bvh4Alt2)[offset] = newAlt4Ptr; + uint32_t offset = stack[--stackPtr]; + ((uint32_t*)bvh4Alt2)[offset] = newAlt4Ptr; } // Convert index list: store primitives 'by value'. // This also allows us to compact and reorder them for best performance. 
stackPtr = 0, nodeIdx = 0; - unsigned triPtr = 0; + uint32_t triPtr = 0; while (1) { BVHNode4Alt2& node = bvh4Alt2[nodeIdx]; - for (int i = 0; i < 4; i++) if (node.triCount[i] + node.childFirst[i] > 0) + for (int32_t i = 0; i < 4; i++) if (node.triCount[i] + node.childFirst[i] > 0) { if (!node.triCount[i]) stack[stackPtr++] = node.childFirst[i]; else { - unsigned first = node.childFirst[i]; - unsigned count = node.triCount[i]; + uint32_t first = node.childFirst[i]; + uint32_t count = node.triCount[i]; node.childFirst[i] = triPtr; // assign vertex data - for (unsigned j = 0; j < count; j++) + for (uint32_t j = 0; j < count; j++) { - unsigned fi = triIdx[first + j]; - PrecomputeTriangle( verts + fi * 3, (float*)&bvh4Tris[triPtr] ); + uint32_t fi = triIdx[first + j]; + PrecomputeTriangle( verts, fi * 3, (float*)&bvh4Tris[triPtr] ); bvh4Tris[triPtr + 3] = bvhvec4( 0, 0, 0, *(float*)&fi ); triPtr += 4; } @@ -1737,13 +1844,35 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } usedAlt4bNodes = newAlt4Ptr; } + else if (from == BASIC_BVH4 && to == BVH4_WIVE) + { + // Convert a 4-wide BVH to a format suitable for CPU traversal. + // See "Accelerated Single Ray Tracing for Wide Vector Units", + // Fuetterling et al., 2017. + uint32_t spaceNeeded = usedBVH4Nodes; + if (allocatedAlt4bNodes < spaceNeeded) + { + FATAL_ERROR_IF( bvh4Node == 0, "BVH::Convert( BASIC_BVH4, BVH4_WIVE ), bvh4Node == 0." ); + AlignedFree( bvh4WiVe ); + bvh4WiVe = (BVHNode4WiVe*)AlignedAlloc( spaceNeeded * sizeof( BVHNode4WiVe ) ); + allocatedWiVeNodes = spaceNeeded; + } + memset( bvh4WiVe, 0, spaceNeeded * sizeof( BVHNode4WiVe ) ); + // start conversion + // uint32_t newNodePtr = 0, nodeIdx = 0, stack[128], stackPtr = 0; + // while (1) + // { + // const BVHNode4& orig = bvh4Node[nodeIdx]; + // .. TODO - under construction. 
+ // } + } else if (from == WALD_32BYTE && to == BASIC_BVH8) { // allocate space // Note: The safe upper bound here is usedBVHNodes when converting an existing // BVH2, but we need triCount * 2 to be safe in later conversions, e.g. to // CWBVH, which may further split some leaf nodes. - const unsigned spaceNeeded = triCount * 2; + const uint32_t spaceNeeded = triCount * 2; if (allocatedBVH8Nodes < spaceNeeded) { FATAL_ERROR_IF( bvhNode == 0, "BVH::Convert( WALD_32BYTE, BASIC_BVH8 ), bvhNode == 0." ); @@ -1753,7 +1882,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } memset( bvh8Node, 0, sizeof( BVHNode8 ) * spaceNeeded ); // create an mbvh node for each bvh2 node - for (unsigned i = 0; i < usedBVHNodes; i++) if (i != 1) + for (uint32_t i = 0; i < usedBVHNodes; i++) if (i != 1) { BVHNode& orig = bvhNode[i]; BVHNode8& node8 = bvh8Node[i]; @@ -1762,15 +1891,15 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet else node8.child[0] = orig.leftFirst, node8.child[1] = orig.leftFirst + 1, node8.childCount = 2; } // collapse - unsigned stack[128], stackPtr = 1, nodeIdx = stack[0] = 0; // i.e., root node + uint32_t stack[128], stackPtr = 1, nodeIdx = stack[0] = 0; // i.e., root node while (1) { BVHNode8& node = bvh8Node[nodeIdx]; while (node.childCount < 8) { - int bestChild = -1; + int32_t bestChild = -1; float bestChildSA = 0; - for (unsigned i = 0; i < node.childCount; i++) + for (uint32_t i = 0; i < node.childCount; i++) { // see if we can adopt child i const BVHNode8& child = bvh8Node[node.child[i]]; @@ -1783,13 +1912,13 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet if (bestChild == -1) break; // could not adopt const BVHNode8& child = bvh8Node[node.child[bestChild]]; node.child[bestChild] = child.child[0]; - for (unsigned i = 1; i < child.childCount; i++) + for (uint32_t i = 1; i < child.childCount; i++) node.child[node.childCount++] = child.child[i]; } // we're 
done with the node; proceed with the children - for (unsigned i = 0; i < node.childCount; i++) + for (uint32_t i = 0; i < node.childCount; i++) { - const unsigned childIdx = node.child[i]; + const uint32_t childIdx = node.child[i]; const BVHNode8& child = bvh8Node[childIdx]; if (!child.isLeaf()) stack[stackPtr++] = childIdx; } @@ -1806,10 +1935,10 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet FATAL_ERROR_IF( bvh8Node[0].isLeaf(), "BVH::Convert( BASIC_BVH8, CWBVH ), collapsing single-node bvh." ); // allocate memory // Note: This can be far lower (specifically: usedBVH8Nodes) if we know that - // none of the BVH8 leafs has more than three primitives. - // Without this guarantee, the only safe upper limit is triCount * 2, since + // none of the BVH8 leafs has more than three primitives. + // Without this guarantee, the only safe upper limit is triCount * 2, since // we will be splitting fat BVH8 leafs to as we go. - unsigned spaceNeeded = triCount * 2 * 5; // CWBVH nodes use 80 bytes each. + uint32_t spaceNeeded = triCount * 2 * 5; // CWBVH nodes use 80 bytes each. if (spaceNeeded > allocatedCWBVHBlocks) { FATAL_ERROR_IF( bvh8Node == 0, "BVH::Convert( BASIC_BVH8, CWBVH ), bvh8Node == 0." 
); @@ -1820,20 +1949,20 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet memset( bvh8Compact, 0, spaceNeeded * 16 ); memset( bvh8Tris, 0, idxCount * 3 * 16 ); BVHNode8* stackNodePtr[256]; - unsigned stackNodeAddr[256], stackPtr = 1, nodeDataPtr = 5, triDataPtr = 0; + uint32_t stackNodeAddr[256], stackPtr = 1, nodeDataPtr = 5, triDataPtr = 0; stackNodePtr[0] = &bvh8Node[0], stackNodeAddr[0] = 0; // start conversion while (stackPtr > 0) { BVHNode8* node = stackNodePtr[--stackPtr]; - const int currentNodeAddr = stackNodeAddr[stackPtr]; + const int32_t currentNodeAddr = stackNodeAddr[stackPtr]; bvhvec3 nodeLo = node->aabbMin, nodeHi = node->aabbMax; // greedy child node ordering const bvhvec3 nodeCentroid = (nodeLo + nodeHi) * 0.5f; float cost[8][8]; - int assignment[8]; + int32_t assignment[8]; bool isSlotEmpty[8]; - for (int s = 0; s < 8; s++) + for (int32_t s = 0; s < 8; s++) { isSlotEmpty[s] = true, assignment[s] = -1; bvhvec3 ds( @@ -1841,7 +1970,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet (((s >> 1) & 1) == 1) ? -1.0f : 1.0f, (((s >> 0) & 1) == 1) ? 
-1.0f : 1.0f ); - for (int i = 0; i < 8; i++) if (node->child[i] == 0) cost[s][i] = BVH_FAR; else + for (int32_t i = 0; i < 8; i++) if (node->child[i] == 0) cost[s][i] = BVH_FAR; else { BVHNode8* const child = &bvh8Node[node->child[i]]; if (child->triCount > 3 /* must be leaf */) SplitBVH8Leaf( node->child[i], 1 ); @@ -1852,67 +1981,67 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet while (1) { float minCost = BVH_FAR; - int minEntryx = -1, minEntryy = -1; - for (int s = 0; s < 8; s++) for (int i = 0; i < 8; i++) + int32_t minEntryx = -1, minEntryy = -1; + for (int32_t s = 0; s < 8; s++) for (int32_t i = 0; i < 8; i++) if (assignment[i] == -1 && isSlotEmpty[s] && cost[s][i] < minCost) minCost = cost[s][i], minEntryx = s, minEntryy = i; if (minEntryx == -1 && minEntryy == -1) break; isSlotEmpty[minEntryx] = false, assignment[minEntryy] = minEntryx; } - for (int i = 0; i < 8; i++) if (assignment[i] == -1) for (int s = 0; s < 8; s++) if (isSlotEmpty[s]) + for (int32_t i = 0; i < 8; i++) if (assignment[i] == -1) for (int32_t s = 0; s < 8; s++) if (isSlotEmpty[s]) { isSlotEmpty[s] = false, assignment[i] = s; break; } const BVHNode8 oldNode = *node; - for (int i = 0; i < 8; i++) node->child[assignment[i]] = oldNode.child[i]; + for (int32_t i = 0; i < 8; i++) node->child[assignment[i]] = oldNode.child[i]; // calculate quantization parameters for each axis - const int ex = (int)((char)ceilf( log2f( (nodeHi.x - nodeLo.x) / 255.0f ) )); - const int ey = (int)((char)ceilf( log2f( (nodeHi.y - nodeLo.y) / 255.0f ) )); - const int ez = (int)((char)ceilf( log2f( (nodeHi.z - nodeLo.z) / 255.0f ) )); + const int32_t ex = (int32_t)((int8_t)ceilf( log2f( (nodeHi.x - nodeLo.x) / 255.0f ) )); + const int32_t ey = (int32_t)((int8_t)ceilf( log2f( (nodeHi.y - nodeLo.y) / 255.0f ) )); + const int32_t ez = (int32_t)((int8_t)ceilf( log2f( (nodeHi.z - nodeLo.z) / 255.0f ) )); // encode output - int internalChildCount = 0, leafChildTriCount = 0, 
childBaseIndex = 0, triangleBaseIndex = 0; - unsigned char imask = 0; - for (int i = 0; i < 8; i++) + int32_t internalChildCount = 0, leafChildTriCount = 0, childBaseIndex = 0, triangleBaseIndex = 0; + uint8_t imask = 0; + for (int32_t i = 0; i < 8; i++) { if (node->child[i] == 0) continue; BVHNode8* const child = &bvh8Node[node->child[i]]; - const int qlox = (int)floorf( (child->aabbMin.x - nodeLo.x) / powf( 2, (float)ex ) ); - const int qloy = (int)floorf( (child->aabbMin.y - nodeLo.y) / powf( 2, (float)ey ) ); - const int qloz = (int)floorf( (child->aabbMin.z - nodeLo.z) / powf( 2, (float)ez ) ); - const int qhix = (int)ceilf( (child->aabbMax.x - nodeLo.x) / powf( 2, (float)ex ) ); - const int qhiy = (int)ceilf( (child->aabbMax.y - nodeLo.y) / powf( 2, (float)ey ) ); - const int qhiz = (int)ceilf( (child->aabbMax.z - nodeLo.z) / powf( 2, (float)ez ) ); - unsigned char* const baseAddr = (unsigned char*)&bvh8Compact[currentNodeAddr + 2]; - baseAddr[i + 0] = (unsigned char)qlox, baseAddr[i + 24] = (unsigned char)qhix; - baseAddr[i + 8] = (unsigned char)qloy, baseAddr[i + 32] = (unsigned char)qhiy; - baseAddr[i + 16] = (unsigned char)qloz, baseAddr[i + 40] = (unsigned char)qhiz; + const int32_t qlox = (int32_t)floorf( (child->aabbMin.x - nodeLo.x) / powf( 2, (float)ex ) ); + const int32_t qloy = (int32_t)floorf( (child->aabbMin.y - nodeLo.y) / powf( 2, (float)ey ) ); + const int32_t qloz = (int32_t)floorf( (child->aabbMin.z - nodeLo.z) / powf( 2, (float)ez ) ); + const int32_t qhix = (int32_t)ceilf( (child->aabbMax.x - nodeLo.x) / powf( 2, (float)ex ) ); + const int32_t qhiy = (int32_t)ceilf( (child->aabbMax.y - nodeLo.y) / powf( 2, (float)ey ) ); + const int32_t qhiz = (int32_t)ceilf( (child->aabbMax.z - nodeLo.z) / powf( 2, (float)ez ) ); + uint8_t* const baseAddr = (uint8_t*)&bvh8Compact[currentNodeAddr + 2]; + baseAddr[i + 0] = (uint8_t)qlox, baseAddr[i + 24] = (uint8_t)qhix; + baseAddr[i + 8] = (uint8_t)qloy, baseAddr[i + 32] = (uint8_t)qhiy; + baseAddr[i + 16] 
= (uint8_t)qloz, baseAddr[i + 40] = (uint8_t)qhiz; if (!child->isLeaf()) { // interior node, set params and push onto stack - const int childNodeAddr = nodeDataPtr; + const int32_t childNodeAddr = nodeDataPtr; if (internalChildCount++ == 0) childBaseIndex = childNodeAddr / 5; nodeDataPtr += 5, imask |= 1 << i; // set the meta field - This calculation assumes children are stored contiguously. - unsigned char* const childMetaField = ((unsigned char*)&bvh8Compact[currentNodeAddr + 1]) + 8; - childMetaField[i] = (1 << 5) | (24 + (unsigned char)i); // I don't see how this accounts for empty children? + uint8_t* const childMetaField = ((uint8_t*)&bvh8Compact[currentNodeAddr + 1]) + 8; + childMetaField[i] = (1 << 5) | (24 + (uint8_t)i); // I don't see how this accounts for empty children? stackNodePtr[stackPtr] = child, stackNodeAddr[stackPtr++] = childNodeAddr; // counted in float4s internalChildCount++; continue; } // leaf node - const unsigned tcount = tinybvh_min( child->triCount, 3u ); // TODO: ensure that's the case; clamping for now. + const uint32_t tcount = tinybvh_min( child->triCount, 3u ); // TODO: ensure that's the case; clamping for now. if (leafChildTriCount == 0) triangleBaseIndex = triDataPtr; - int unaryEncodedTriCount = tcount == 1 ? 0b001 : tcount == 2 ? 0b011 : 0b111; + int32_t unaryEncodedTriCount = tcount == 1 ? 0b001 : tcount == 2 ? 0b011 : 0b111; // set the meta field - This calculation assumes children are stored contiguously. 
- unsigned char* const childMetaField = ((unsigned char*)&bvh8Compact[currentNodeAddr + 1]) + 8; - childMetaField[i] = (unsigned char)((unaryEncodedTriCount << 5) | leafChildTriCount); + uint8_t* const childMetaField = ((uint8_t*)&bvh8Compact[currentNodeAddr + 1]) + 8; + childMetaField[i] = (uint8_t)((unaryEncodedTriCount << 5) | leafChildTriCount); leafChildTriCount += tcount; - for (unsigned j = 0; j < tcount; j++) + for (uint32_t j = 0; j < tcount; j++) { - int primitiveIndex = triIdx[child->firstTri + j]; + int32_t primitiveIndex = triIdx[child->firstTri + j]; #ifdef CWBVH_COMPRESSED_TRIS - PrecomputeTriangle( verts + primitiveIndex * 3, (float*)&bvh8Tris[triDataPtr] ); + PrecomputeTriangle( verts, +primitiveIndex * 3, (float*)&bvh8Tris[triDataPtr] ); bvh8Tris[triDataPtr + 3] = bvhvec4( 0, 0, 0, *(float*)&primitiveIndex ); triDataPtr += 4; #else @@ -1924,7 +2053,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet #endif } } - unsigned char exyzAndimask[4] = { *(unsigned char*)&ex, *(unsigned char*)&ey, *(unsigned char*)&ez, imask }; + uint8_t exyzAndimask[4] = { *(uint8_t*)&ex, *(uint8_t*)&ey, *(uint8_t*)&ez, imask }; bvh8Compact[currentNodeAddr + 0] = bvhvec4( nodeLo, *(float*)&exyzAndimask ); bvh8Compact[currentNodeAddr + 1].x = *(float*)&childBaseIndex; bvh8Compact[currentNodeAddr + 1].y = *(float*)&triangleBaseIndex; @@ -1934,7 +2063,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet else if (from == VERBOSE && to == WALD_32BYTE) { // allocate space - const unsigned spaceNeeded = usedVerboseNodes; + const uint32_t spaceNeeded = usedVerboseNodes; if (allocatedBVHNodes < spaceNeeded) { FATAL_ERROR_IF( verbose == 0, "BVH::Convert( VERBOSE, WALD_32BYTE ), verbose == 0." 
); @@ -1944,8 +2073,8 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } memset( bvhNode, 0, sizeof( BVHNode ) * spaceNeeded ); // start conversion - unsigned srcNodeIdx = 0, dstNodeIdx = 0, newNodePtr = 2; - unsigned srcStack[64], dstStack[64], stackPtr = 0; + uint32_t srcNodeIdx = 0, dstNodeIdx = 0, newNodePtr = 2; + uint32_t srcStack[64], dstStack[64], stackPtr = 0; while (1) { const BVHNodeVerbose& srcNode = verbose[srcNodeIdx]; @@ -1962,7 +2091,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet else { bvhNode[dstNodeIdx].leftFirst = newNodePtr; - unsigned srcRightIdx = srcNode.right; + uint32_t srcRightIdx = srcNode.right; srcNodeIdx = srcNode.left, dstNodeIdx = newNodePtr++; srcStack[stackPtr] = srcRightIdx; dstStack[stackPtr++] = newNodePtr++; @@ -1977,12 +2106,12 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool /* delet } } -int BVH::NodeCount( const BVHLayout layout ) const +int32_t BVH::NodeCount( const BVHLayout layout ) const { // Determine the number of nodes in the tree. Typically the result should - // be usedBVHNodes - 1 (second node is always unused), but some builders may + // be usedBVHNodes - 1 (second node is always unused), but some builders may // have unused nodes besides node 1. TODO: Support more layouts. - unsigned retVal = 0, nodeIdx = 0, stack[64], stackPtr = 0; + uint32_t retVal = 0, nodeIdx = 0, stack[64], stackPtr = 0; if (layout == WALD_32BYTE) { FATAL_ERROR_IF( bvhNode == 0, "BVH::NodeCount( WALD_32BYTE ), bvhNode == 0." ); @@ -2012,22 +2141,22 @@ int BVH::NodeCount( const BVHLayout layout ) const // includes trees waving in the wind, or subsequent frames for skinned // animations. Repeated refitting tends to lead to deteriorated BVHs and // slower ray tracing. Rebuild when this happens. 
-void BVH::Refit( const BVHLayout layout, const unsigned nodeIdx ) +void BVH::Refit( const BVHLayout layout, const uint32_t nodeIdx ) { FATAL_ERROR_IF( !refittable, "BVH::Refit( .. ), refitting an SBVH." ); if (layout == WALD_32BYTE) { FATAL_ERROR_IF( bvhNode == 0, "BVH::Refit( WALD_32BYTE ), bvhNode == 0." ); FATAL_ERROR_IF( may_have_holes, "BVH::Refit( WALD_32BYTE ), bvh may have holes." ); - for (int i = usedBVHNodes - 1; i >= 0; i--) + for (int32_t i = usedBVHNodes - 1; i >= 0; i--) { BVHNode& node = bvhNode[i]; if (node.isLeaf()) // leaf: adjust to current triangle vertex positions { bvhvec4 aabbMin( BVH_FAR ), aabbMax( -BVH_FAR ); - for (unsigned first = node.leftFirst, j = 0; j < node.triCount; j++) + for (uint32_t first = node.leftFirst, j = 0; j < node.triCount; j++) { - const unsigned vertIdx = triIdx[first + j] * 3; + const uint32_t vertIdx = triIdx[first + j] * 3; aabbMin = tinybvh_min( aabbMin, verts[vertIdx] ), aabbMax = tinybvh_max( aabbMax, verts[vertIdx] ); aabbMin = tinybvh_min( aabbMin, verts[vertIdx + 1] ), aabbMax = tinybvh_max( aabbMax, verts[vertIdx + 1] ); aabbMin = tinybvh_min( aabbMin, verts[vertIdx + 2] ), aabbMax = tinybvh_max( aabbMax, verts[vertIdx + 2] ); @@ -2049,9 +2178,9 @@ void BVH::Refit( const BVHLayout layout, const unsigned nodeIdx ) if (node.isLeaf()) // leaf: adjust to current triangle vertex positions { bvhvec4 aabbMin( BVH_FAR ), aabbMax( -BVH_FAR ); - for (unsigned first = node.firstTri, j = 0; j < node.triCount; j++) + for (uint32_t first = node.firstTri, j = 0; j < node.triCount; j++) { - const unsigned vertIdx = triIdx[first + j] * 3; + const uint32_t vertIdx = triIdx[first + j] * 3; aabbMin = tinybvh_min( aabbMin, verts[vertIdx] ), aabbMax = tinybvh_max( aabbMax, verts[vertIdx] ); aabbMin = tinybvh_min( aabbMin, verts[vertIdx + 1] ), aabbMax = tinybvh_max( aabbMax, verts[vertIdx + 1] ); aabbMin = tinybvh_min( aabbMin, verts[vertIdx + 2] ), aabbMax = tinybvh_max( aabbMax, verts[vertIdx + 2] ); @@ -2084,14 +2213,14 @@ 
void BVH::Compact( const BVHLayout layout ) FATAL_ERROR_IF( bvhNode == 0, "BVH::Compact( WALD_32BYTE ), bvhNode == 0." ); BVHNode* tmp = (BVHNode*)AlignedAlloc( sizeof( BVHNode ) * usedBVHNodes ); memcpy( tmp, bvhNode, 2 * sizeof( BVHNode ) ); - unsigned newNodePtr = 2, nodeIdx = 0, stack[64], stackPtr = 0; + uint32_t newNodePtr = 2, nodeIdx = 0, stack[64], stackPtr = 0; while (1) { BVHNode& node = tmp[nodeIdx]; const BVHNode& left = bvhNode[node.leftFirst]; const BVHNode& right = bvhNode[node.leftFirst + 1]; tmp[newNodePtr] = left, tmp[newNodePtr + 1] = right; - const unsigned todo1 = newNodePtr, todo2 = newNodePtr + 1; + const uint32_t todo1 = newNodePtr, todo2 = newNodePtr + 1; node.leftFirst = newNodePtr, newNodePtr += 2; if (!left.isLeaf()) stack[stackPtr++] = todo1; if (!right.isLeaf()) stack[stackPtr++] = todo2; @@ -2110,14 +2239,14 @@ void BVH::Compact( const BVHLayout layout ) FATAL_ERROR_IF( verbose == 0, "BVH::Compact( VERBOSE ), verbose == 0." ); BVHNodeVerbose* tmp = (BVHNodeVerbose*)AlignedAlloc( sizeof( BVHNodeVerbose ) * usedVerboseNodes ); memcpy( tmp, verbose, 2 * sizeof( BVHNodeVerbose ) ); - unsigned newNodePtr = 2, nodeIdx = 0, stack[64], stackPtr = 0; + uint32_t newNodePtr = 2, nodeIdx = 0, stack[64], stackPtr = 0; while (1) { BVHNodeVerbose& node = tmp[nodeIdx]; const BVHNodeVerbose& left = verbose[node.left]; const BVHNodeVerbose& right = verbose[node.right]; tmp[newNodePtr] = left, tmp[newNodePtr + 1] = right; - const unsigned todo1 = newNodePtr, todo2 = newNodePtr + 1; + const uint32_t todo1 = newNodePtr, todo2 = newNodePtr + 1; node.left = newNodePtr++, node.right = newNodePtr++; if (!left.isLeaf()) stack[stackPtr++] = todo1; if (!right.isLeaf()) stack[stackPtr++] = todo2; @@ -2138,10 +2267,10 @@ void BVH::Compact( const BVHLayout layout ) // necessary to have a single primitive per leaf, it will yield a slightly better // optimized BVH. The leafs of the optimized BVH should be collapsed ('MergeLeafs') // to obtain the final tree. 
-void BVH::SplitLeafs( const unsigned maxPrims ) +void BVH::SplitLeafs( const uint32_t maxPrims ) { FATAL_ERROR_IF( verbose == 0, "BVH::SplitLeafs(), requires VERBOSE bvh." ); - unsigned nodeIdx = 0, stack[64], stackPtr = 0; + uint32_t nodeIdx = 0, stack[64], stackPtr = 0; float fragMinFix = frag_min_flipped ? -1.0f : 1.0f; while (1) { @@ -2151,8 +2280,8 @@ void BVH::SplitLeafs( const unsigned maxPrims ) // split this leaf if (node.triCount > maxPrims) { - const unsigned newIdx1 = usedVerboseNodes++; - const unsigned newIdx2 = usedVerboseNodes++; + const uint32_t newIdx1 = usedVerboseNodes++; + const uint32_t newIdx2 = usedVerboseNodes++; BVHNodeVerbose& new1 = verbose[newIdx1], & new2 = verbose[newIdx2]; new1.firstTri = node.firstTri; new1.triCount = node.triCount / 2; @@ -2163,11 +2292,11 @@ void BVH::SplitLeafs( const unsigned maxPrims ) new2.left = new2.right = 0; node.left = newIdx1, node.right = newIdx2, node.triCount = 0; new1.aabbMin = new2.aabbMin = BVH_FAR, new1.aabbMax = new2.aabbMax = -BVH_FAR; - for (unsigned fi, i = 0; i < new1.triCount; i++) + for (uint32_t fi, i = 0; i < new1.triCount; i++) fi = triIdx[new1.firstTri + i], new1.aabbMin = tinybvh_min( new1.aabbMin, fragment[fi].bmin * fragMinFix ), new1.aabbMax = tinybvh_max( new1.aabbMax, fragment[fi].bmax ); - for (unsigned fi, i = 0; i < new2.triCount; i++) + for (uint32_t fi, i = 0; i < new2.triCount; i++) fi = triIdx[new2.firstTri + i], new2.aabbMin = tinybvh_min( new2.aabbMin, fragment[fi].bmin * fragMinFix ), new2.aabbMax = tinybvh_max( new2.aabbMax, fragment[fi].bmax ); @@ -2183,7 +2312,7 @@ void BVH::SplitLeafs( const unsigned maxPrims ) // SplitBVH8Leaf: CWBVH requires that a leaf has no more than 3 primitives, // but regular BVH construction does not guarantee this. So, here we split // busy leafs recursively in multiple leaves, until the requirement is met. 
-void BVH::SplitBVH8Leaf( const unsigned nodeIdx, const unsigned maxPrims ) +void BVH::SplitBVH8Leaf( const uint32_t nodeIdx, const uint32_t maxPrims ) { float fragMinFix = frag_min_flipped ? -1.0f : 1.0f; BVHNode8& node = bvh8Node[nodeIdx]; @@ -2192,7 +2321,7 @@ void BVH::SplitBVH8Leaf( const unsigned nodeIdx, const unsigned maxPrims ) BVHNode8& firstChild = bvh8Node[node.child[0] = usedBVH8Nodes++]; firstChild.triCount = node.triCount; firstChild.firstTri = node.firstTri; - unsigned nextChild = 1; + uint32_t nextChild = 1; // share with new sibling nodes while (firstChild.triCount > maxPrims && nextChild < 8) { @@ -2201,14 +2330,14 @@ void BVH::SplitBVH8Leaf( const unsigned nodeIdx, const unsigned maxPrims ) child.firstTri = firstChild.firstTri + firstChild.triCount; nextChild++; } - for (unsigned i = 0; i < nextChild; i++) + for (uint32_t i = 0; i < nextChild; i++) { BVHNode8& child = bvh8Node[node.child[i]]; if (!refittable) child.aabbMin = node.aabbMin, child.aabbMax = node.aabbMax; else { // TODO: why is this producing wrong aabbs for SBVH? child.aabbMin = bvhvec3( BVH_FAR ), child.aabbMax = bvhvec3( -BVH_FAR ); - for (unsigned fi, j = 0; j < child.triCount; j++) + for (uint32_t fi, j = 0; j < child.triCount; j++) fi = triIdx[child.firstTri + j], child.aabbMin = tinybvh_min( child.aabbMin, fragment[fi].bmin * fragMinFix ), child.aabbMax = tinybvh_max( child.aabbMax, fragment[fi].bmax ); @@ -2225,17 +2354,17 @@ void BVH::MergeLeafs() { // allocate some working space FATAL_ERROR_IF( verbose == 0, "BVH::MergeLeafs(), requires VERBOSE bvh." 
); - unsigned* subtreeTriCount = (unsigned*)AlignedAlloc( usedVerboseNodes * 4 ); - unsigned* newIdx = (unsigned*)AlignedAlloc( idxCount * 4 ); + uint32_t* subtreeTriCount = (uint32_t*)AlignedAlloc( usedVerboseNodes * 4 ); + uint32_t* newIdx = (uint32_t*)AlignedAlloc( idxCount * 4 ); memset( subtreeTriCount, 0, usedVerboseNodes * 4 ); CountSubtreeTris( 0, subtreeTriCount ); - unsigned stack[64], stackPtr = 0, nodeIdx = 0, newIdxPtr = 0; + uint32_t stack[64], stackPtr = 0, nodeIdx = 0, newIdxPtr = 0; while (1) { BVHNodeVerbose& node = verbose[nodeIdx]; if (node.isLeaf()) { - unsigned start = newIdxPtr; + uint32_t start = newIdxPtr; MergeSubtree( nodeIdx, newIdx, newIdxPtr ); node.firstTri = start; // pop new task @@ -2244,9 +2373,9 @@ void BVH::MergeLeafs() } else { - const unsigned leftCount = subtreeTriCount[node.left]; - const unsigned rightCount = subtreeTriCount[node.right]; - const unsigned mergedCount = leftCount + rightCount; + const uint32_t leftCount = subtreeTriCount[node.left]; + const uint32_t rightCount = subtreeTriCount[node.right]; + const uint32_t mergedCount = leftCount + rightCount; // cost of unsplit float Cunsplit = SA( node.aabbMin, node.aabbMax ) * mergedCount * C_INT; // cost of leaving things as they are @@ -2259,7 +2388,7 @@ void BVH::MergeLeafs() if (Cunsplit <= Ckeepsplit) { // collapse the subtree - unsigned start = newIdxPtr; + uint32_t start = newIdxPtr; MergeSubtree( nodeIdx, newIdx, newIdxPtr ); node.firstTri = start; node.triCount = mergedCount; @@ -2285,18 +2414,18 @@ void BVH::MergeLeafs() // Optimizing a BVH: BVH must be in 'verbose' format. // Implements "Fast Insertion-Based Optimization of Bounding Volume Hierarchies", -void BVH::Optimize( const unsigned iterations ) +void BVH::Optimize( const uint32_t iterations ) { // Optimize by reinserting a random subtree. // Suggested iteration count: ~1M for best results. // TODO: Implement Section 3.4 of the paper to speed up the process. 
FATAL_ERROR_IF( verbose == 0, "BVH::Optimize(), requires VERBOSE bvh." ); - for (unsigned i = 0; i < iterations; i++) + for (uint32_t i = 0; i < iterations; i++) { - unsigned Nid, valid = 0; + uint32_t Nid, valid = 0; do { - static unsigned seed = 0x12345678; + static uint32_t seed = 0x12345678; seed ^= seed << 13, seed ^= seed >> 17, seed ^= seed << 5; // xor32 valid = 1, Nid = 2 + seed % (usedVerboseNodes - 2); if (verbose[Nid].parent == 0 || verbose[Nid].isLeaf()) valid = 0; @@ -2304,12 +2433,12 @@ void BVH::Optimize( const unsigned iterations ) } while (valid == 0); // snip it loose const BVHNodeVerbose& N = verbose[Nid], & P = verbose[N.parent]; - const unsigned Pid = N.parent, X1 = P.parent; - const unsigned X2 = P.left == Nid ? P.right : P.left; + const uint32_t Pid = N.parent, X1 = P.parent; + const uint32_t X2 = P.left == Nid ? P.right : P.left; if (verbose[X1].left == Pid) verbose[X1].left = X2; else /* verbose[X1].right == Pid */ verbose[X1].right = X2; verbose[X2].parent = X1; - unsigned L = N.left, R = N.right; + uint32_t L = N.left, R = N.right; // fix affected node bounds RefitUpVerbose( X1 ); ReinsertNodeVerbose( L, Pid, X1 ); @@ -2321,7 +2450,7 @@ void BVH::Optimize( const unsigned iterations ) // This function returns the intersection details in Ray::hit. Additionally, // the number of steps through the BVH is returned. Visualize this to get a // visual impression of the structure of the BVH. -int BVH::Intersect( Ray& ray, const BVHLayout layout ) const +int32_t BVH::Intersect( Ray& ray, const BVHLayout layout ) const { switch (layout) { @@ -2356,11 +2485,13 @@ int BVH::Intersect( Ray& ray, const BVHLayout layout ) const FATAL_ERROR_IF( bvh4Alt2 == 0, "BVH::Intersect( .. , BVH4_AFRA ), bvh not available." ); return Intersect_Afra( ray ); break; + #if defined BVH_USEAVX case CWBVH: FATAL_ERROR_IF( bvh8Compact == 0, "BVH::Intersect( .. , CWBVH ), bvh not available." 
); return Intersect_CWBVH( ray ); break; #endif + #endif default: FATAL_ERROR_IF( true, "BVH::Intersect( .. , ? ), unsupported bvh layout." ); break; @@ -2368,15 +2499,15 @@ int BVH::Intersect( Ray& ray, const BVHLayout layout ) const return 0; } -void BVH::BatchIntersect( Ray* rayBatch, const unsigned N, const BVHLayout layout, const TraceDevice /* device */ ) const +void BVH::BatchIntersect( Ray* rayBatch, const uint32_t N, const BVHLayout layout, const TraceDevice /* device */ ) const { - for (unsigned i = 0; i < N; i++) Intersect( rayBatch[i], layout ); + for (uint32_t i = 0; i < N; i++) Intersect( rayBatch[i], layout ); } // Detect if a ray is occluded / shadow ray query. -// Unlike Intersect, this function only returns a yes/no answer: Yes if any -// geometry blocks it (taking into account ray length); no if the ray can -// travel the specified distance without encountering anything. +// Unlike Intersect, this function only returns a yes/no answer: Yes if any +// geometry blocks it (taking into account ray length); no if the ray can +// travel the specified distance without encountering anything. bool BVH::IsOccluded( const Ray& ray, const BVHLayout layout ) const { switch (layout) @@ -2404,25 +2535,25 @@ bool BVH::IsOccluded( const Ray& ray, const BVHLayout layout ) const // For now this exists only to establish the interface. // A future implementation will exploit the batch to trace the rays faster. // BatchIsOccluded returns the hits as a bit array in result: -// Each unsigned integer in this array stores 32 hits. -void BVH::BatchIsOccluded( Ray* rayBatch, const unsigned N, unsigned* result, const BVHLayout layout, const TraceDevice /* device */ ) const +// Each uint32_t integer in this array stores 32 hits. 
+void BVH::BatchIsOccluded( Ray* rayBatch, const uint32_t N, uint32_t* result, const BVHLayout layout, const TraceDevice /* device */ ) const { - unsigned words = (N + 31 /* round up */) / 32; + uint32_t words = (N + 31 /* round up */) / 32; memset( result, 0, words * 4 ); - for (unsigned i = 0; i < N; i++) if (IsOccluded( rayBatch[i], layout )) result[i >> 5] |= 1 << (i & 31); + for (uint32_t i = 0; i < N; i++) if (IsOccluded( rayBatch[i], layout )) result[i >> 5] |= 1 << (i & 31); } // Traverse the default BVH layout (WALD_32BYTE). -int BVH::Intersect_Wald32Byte( Ray& ray ) const +int32_t BVH::Intersect_Wald32Byte( Ray& ray ) const { BVHNode* node = &bvhNode[0], * stack[64]; - unsigned stackPtr = 0, steps = 0; + uint32_t stackPtr = 0, steps = 0; while (1) { steps++; if (node->isLeaf()) { - for (unsigned i = 0; i < node->triCount; i++) IntersectTri( ray, triIdx[node->leftFirst + i] ); + for (uint32_t i = 0; i < node->triCount; i++) IntersectTri( ray, triIdx[node->leftFirst + i] ); if (stackPtr == 0) break; else node = stack[--stackPtr]; continue; } @@ -2446,15 +2577,15 @@ int BVH::Intersect_Wald32Byte( Ray& ray ) const bool BVH::IsOccluded_Wald32Byte( const Ray& ray ) const { BVHNode* node = &bvhNode[0], * stack[64]; - unsigned stackPtr = 0; + uint32_t stackPtr = 0; while (1) { if (node->isLeaf()) { - for (unsigned i = 0; i < node->triCount; i++) + for (uint32_t i = 0; i < node->triCount; i++) { // Moeller-Trumbore ray/triangle intersection algorithm - const unsigned vertIdx = triIdx[node->leftFirst + i] * 3; + const uint32_t vertIdx = triIdx[node->leftFirst + i] * 3; const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; const bvhvec3 h = cross( ray.D, edge2 ); @@ -2491,16 +2622,16 @@ bool BVH::IsOccluded_Wald32Byte( const Ray& ray ) const } // Traverse the alternative BVH layout (AILA_LAINE). 
-int BVH::Intersect_AilaLaine( Ray& ray ) const +int32_t BVH::Intersect_AilaLaine( Ray& ray ) const { BVHNodeAlt* node = &altNode[0], * stack[64]; - unsigned stackPtr = 0, steps = 0; + uint32_t stackPtr = 0, steps = 0; while (1) { steps++; if (node->isLeaf()) { - for (unsigned i = 0; i < node->triCount; i++) IntersectTri( ray, triIdx[node->firstTri + i] ); + for (uint32_t i = 0; i < node->triCount; i++) IntersectTri( ray, triIdx[node->firstTri + i] ); if (stackPtr == 0) break; else node = stack[--stackPtr]; continue; } @@ -2515,11 +2646,11 @@ int BVH::Intersect_AilaLaine( Ray& ray ) const const float tmaxb = tinybvh_min( tinybvh_min( tinybvh_max( t1b.x, t2b.x ), tinybvh_max( t1b.y, t2b.y ) ), tinybvh_max( t1b.z, t2b.z ) ); if (tmaxa >= tmina && tmina < ray.hit.t && tmaxa >= 0) dist1 = tmina; if (tmaxb >= tminb && tminb < ray.hit.t && tmaxb >= 0) dist2 = tminb; - unsigned lidx = node->left, ridx = node->right; + uint32_t lidx = node->left, ridx = node->right; if (dist1 > dist2) { float t = dist1; dist1 = dist2; dist2 = t; - unsigned i = lidx; lidx = ridx; ridx = i; + uint32_t i = lidx; lidx = ridx; ridx = i; } if (dist1 == BVH_FAR) { @@ -2538,14 +2669,14 @@ int BVH::Intersect_AilaLaine( Ray& ray ) const bool BVH::IsOccluded_AilaLaine( const Ray& ray ) const { BVHNodeAlt* node = &altNode[0], * stack[64]; - unsigned stackPtr = 0; + uint32_t stackPtr = 0; while (1) { if (node->isLeaf()) { - for (unsigned i = 0; i < node->triCount; i++) + for (uint32_t i = 0; i < node->triCount; i++) { - const unsigned vertIdx = triIdx[node->firstTri + i] * 3; + const uint32_t vertIdx = triIdx[node->firstTri + i] * 3; const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; const bvhvec3 h = cross( ray.D, edge2 ); @@ -2575,11 +2706,11 @@ bool BVH::IsOccluded_AilaLaine( const Ray& ray ) const const float tmaxb = tinybvh_min( tinybvh_min( tinybvh_max( t1b.x, t2b.x ), tinybvh_max( t1b.y, t2b.y ) ), tinybvh_max( t1b.z, t2b.z ) ); 
if (tmaxa >= tmina && tmina < ray.hit.t && tmaxa >= 0) dist1 = tmina; if (tmaxb >= tminb && tminb < ray.hit.t && tmaxb >= 0) dist2 = tminb; - unsigned lidx = node->left, ridx = node->right; + uint32_t lidx = node->left, ridx = node->right; if (dist1 > dist2) { float t = dist1; dist1 = dist2; dist2 = t; - unsigned i = lidx; lidx = ridx; ridx = i; + uint32_t i = lidx; lidx = ridx; ridx = i; } if (dist1 == BVH_FAR) { @@ -2595,16 +2726,16 @@ bool BVH::IsOccluded_AilaLaine( const Ray& ray ) const } // Intersect_BasicBVH4. For testing the converted data only; not efficient. -int BVH::Intersect_BasicBVH4( Ray& ray ) const +int32_t BVH::Intersect_BasicBVH4( Ray& ray ) const { BVHNode4* node = &bvh4Node[0], * stack[64]; - unsigned stackPtr = 0, steps = 0; + uint32_t stackPtr = 0, steps = 0; while (1) { steps++; - if (node->isLeaf()) for (unsigned i = 0; i < node->triCount; i++) + if (node->isLeaf()) for (uint32_t i = 0; i < node->triCount; i++) IntersectTri( ray, triIdx[node->firstTri + i] ); - else for (unsigned i = 0; i < node->childCount; i++) + else for (uint32_t i = 0; i < node->childCount; i++) { BVHNode4* child = bvh4Node + node->child[i]; float dist = IntersectAABB( ray, child->aabbMin, child->aabbMax ); @@ -2616,16 +2747,16 @@ int BVH::Intersect_BasicBVH4( Ray& ray ) const } // Intersect_BasicBVH8. For testing the converted data only; not efficient. 
-int BVH::Intersect_BasicBVH8( Ray& ray ) const +int32_t BVH::Intersect_BasicBVH8( Ray& ray ) const { BVHNode8* node = &bvh8Node[0], * stack[512]; - unsigned stackPtr = 0, steps = 0; + uint32_t stackPtr = 0, steps = 0; while (1) { steps++; - if (node->isLeaf()) for (unsigned i = 0; i < node->triCount; i++) + if (node->isLeaf()) for (uint32_t i = 0; i < node->triCount; i++) IntersectTri( ray, triIdx[node->firstTri + i] ); - else for (unsigned i = 0; i < 8; i++) if (node->child[i]) + else for (uint32_t i = 0; i < 8; i++) if (node->child[i]) { BVHNode8* child = bvh8Node + node->child[i]; float dist = IntersectAABB( ray, child->aabbMin, child->aabbMax ); @@ -2639,14 +2770,14 @@ int BVH::Intersect_BasicBVH8( Ray& ray ) const // IntersectAlt4Nodes. For testing the converted data only; not efficient. // This code replicates how traversal on GPU happens. #define SWAP(A,B,C,D) t=A,A=B,B=t,t2=C,C=D,D=t2; -struct uchar4 { unsigned char x, y, z, w; }; +struct uchar4 { uint8_t x, y, z, w; }; static uchar4 as_uchar4( const float v ) { union { float t; uchar4 t4; }; t = v; return t4; } -static unsigned as_uint( const float v ) { return *(unsigned*)&v; } -int BVH::Intersect_Alt4BVH( Ray& ray ) const +static uint32_t as_uint( const float v ) { return *(uint32_t*)&v; } +int32_t BVH::Intersect_Alt4BVH( Ray& ray ) const { // traverse a blas - unsigned offset = 0, stack[128], stackPtr = 0, t2 /* for SWAP macro */; - unsigned steps = 0; + uint32_t offset = 0, stack[128], stackPtr = 0, t2 /* for SWAP macro */; + uint32_t steps = 0; while (1) { steps++; @@ -2682,16 +2813,16 @@ int BVH::Intersect_Alt4BVH( Ray& ray ) const float dist0 = tmina > tmaxa ? BVH_FAR : tmina, dist1 = tminb > tmaxb ? BVH_FAR : tminb; float dist2 = tminc > tmaxc ? BVH_FAR : tminc, dist3 = tmind > tmaxd ? 
BVH_FAR : tmind, t; // get child node info fields - unsigned c0info = as_uint( data3.x ), c1info = as_uint( data3.y ); - unsigned c2info = as_uint( data3.z ), c3info = as_uint( data3.w ); + uint32_t c0info = as_uint( data3.x ), c1info = as_uint( data3.y ); + uint32_t c2info = as_uint( data3.z ), c3info = as_uint( data3.w ); if (dist0 < dist2) SWAP( dist0, dist2, c0info, c2info ); if (dist1 < dist3) SWAP( dist1, dist3, c1info, c3info ); if (dist0 < dist1) SWAP( dist0, dist1, c0info, c1info ); if (dist2 < dist3) SWAP( dist2, dist3, c2info, c3info ); if (dist1 < dist2) SWAP( dist1, dist2, c1info, c2info ); // process results, starting with farthest child, so nearest ends on top of stack - unsigned nextNode = 0; - unsigned leaf[4] = { 0, 0, 0, 0 }, leafs = 0; + uint32_t nextNode = 0; + uint32_t leaf[4] = { 0, 0, 0, 0 }, leafs = 0; if (dist0 < BVH_FAR) { if (c0info & 0x80000000) leaf[leafs++] = c0info; else if (c0info) stack[stackPtr++] = c0info; @@ -2709,11 +2840,11 @@ int BVH::Intersect_Alt4BVH( Ray& ray ) const if (c3info & 0x80000000) leaf[leafs++] = c3info; else if (c3info) stack[stackPtr++] = c3info; } // process encountered leafs, if any - for (unsigned i = 0; i < leafs; i++) + for (uint32_t i = 0; i < leafs; i++) { - const unsigned N = (leaf[i] >> 16) & 0x7fff; - unsigned triStart = offset + (leaf[i] & 0xffff); - for (unsigned j = 0; j < N; j++, triStart += 3) + const uint32_t N = (leaf[i] >> 16) & 0x7fff; + uint32_t triStart = offset + (leaf[i] & 0xffff); + for (uint32_t j = 0; j < N; j++, triStart += 3) { const bvhvec3 edge2 = bvhvec3( bvh4Alt[triStart + 2] ); const bvhvec3 edge1 = bvhvec3( bvh4Alt[triStart + 1] ); @@ -2745,7 +2876,7 @@ int BVH::Intersect_Alt4BVH( Ray& ray ) const } // Intersect a WALD_32BYTE BVH with a ray packet. -// The 256 rays travel together to better utilize the caches and to amortize the cost +// The 256 rays travel together to better utilize the caches and to amortize the cost // of memory transfers over the rays in the bundle. 
// Note that this basic implementation assumes a specific layout of the rays. Provided // as 'proof of concept', should not be used in production code. @@ -2768,27 +2899,27 @@ void BVH::Intersect256Rays( Ray* packet ) const const bvhvec3 plane1 = normalize( cross( p3 - O, p3 - p1 ) ); // right plane const bvhvec3 plane2 = normalize( cross( p1 - O, p1 - p0 ) ); // top plane const bvhvec3 plane3 = normalize( cross( p2 - O, p2 - p3 ) ); // bottom plane - const int sign0x = plane0.x < 0 ? 4 : 0, sign0y = plane0.y < 0 ? 5 : 1, sign0z = plane0.z < 0 ? 6 : 2; - const int sign1x = plane1.x < 0 ? 4 : 0, sign1y = plane1.y < 0 ? 5 : 1, sign1z = plane1.z < 0 ? 6 : 2; - const int sign2x = plane2.x < 0 ? 4 : 0, sign2y = plane2.y < 0 ? 5 : 1, sign2z = plane2.z < 0 ? 6 : 2; - const int sign3x = plane3.x < 0 ? 4 : 0, sign3y = plane3.y < 0 ? 5 : 1, sign3z = plane3.z < 0 ? 6 : 2; + const int32_t sign0x = plane0.x < 0 ? 4 : 0, sign0y = plane0.y < 0 ? 5 : 1, sign0z = plane0.z < 0 ? 6 : 2; + const int32_t sign1x = plane1.x < 0 ? 4 : 0, sign1y = plane1.y < 0 ? 5 : 1, sign1z = plane1.z < 0 ? 6 : 2; + const int32_t sign2x = plane2.x < 0 ? 4 : 0, sign2y = plane2.y < 0 ? 5 : 1, sign2z = plane2.z < 0 ? 6 : 2; + const int32_t sign3x = plane3.x < 0 ? 4 : 0, sign3y = plane3.y < 0 ? 5 : 1, sign3z = plane3.z < 0 ? 
6 : 2; const float d0 = dot( O, plane0 ), d1 = dot( O, plane1 ); const float d2 = dot( O, plane2 ), d3 = dot( O, plane3 ); // Traverse the tree with the packet - int first = 0, last = 255; // first and last active ray in the packet + int32_t first = 0, last = 255; // first and last active ray in the packet const BVHNode* node = &bvhNode[0]; - ALIGNED( 64 ) unsigned stack[64], stackPtr = 0; + ALIGNED( 64 ) uint32_t stack[64], stackPtr = 0; while (1) { if (node->isLeaf()) { // handle leaf node - for (unsigned j = 0; j < node->triCount; j++) + for (uint32_t j = 0; j < node->triCount; j++) { - const unsigned idx = triIdx[node->leftFirst + j], vid = idx * 3; + const uint32_t idx = triIdx[node->leftFirst + j], vid = idx * 3; const bvhvec3 edge1 = verts[vid + 1] - verts[vid], edge2 = verts[vid + 2] - verts[vid]; const bvhvec3 s = O - bvhvec3( verts[vid] ); - for (int i = first; i <= last; i++) + for (int32_t i = first; i <= last; i++) { Ray& ray = packet[i]; const bvhvec3 h = cross( ray.D, edge2 ); @@ -2814,7 +2945,7 @@ void BVH::Intersect256Rays( Ray* packet ) const const BVHNode* left = bvhNode + node->leftFirst; const BVHNode* right = bvhNode + node->leftFirst + 1; bool visitLeft = true, visitRight = true; - int leftFirst = first, leftLast = last, rightFirst = first, rightLast = last; + int32_t leftFirst = first, leftLast = last, rightFirst = first, rightLast = last; float distLeft, distRight; { // see if we want to intersect the left child @@ -2918,7 +3049,7 @@ void BVH::Intersect256Rays( Ray* packet ) const // ============================================================================ // // I M P L E M E N T A T I O N - A V X / S S E C O D E -// +// // ============================================================================ #ifdef BVH_USEAVX @@ -2966,52 +3097,54 @@ inline float halfArea( const __m256& a /* a contains aabb itself, with min.xyz n #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif -void BVH::BuildAVX( const 
bvhvec4* vertices, const unsigned primCount ) +void BVH::BuildAVX( const bvhvec4* vertices, const uint32_t primCount ) { - int test = BVHBINS; + FATAL_ERROR_IF( primCount == 0, "BVH::BuildAVX( .. ), primCount == 0." ); + int32_t test = BVHBINS; if (test != 8) assert( false ); // AVX builders require BVHBINS == 8. assert( ((long long)vertices & 63) == 0 ); // buffer must be cacheline-aligned // aligned data ALIGNED( 64 ) __m256 binbox[3 * BVHBINS]; // 768 bytes ALIGNED( 64 ) __m256 binboxOrig[3 * BVHBINS]; // 768 bytes - ALIGNED( 64 ) unsigned count[3][BVHBINS]{}; // 96 bytes + ALIGNED( 64 ) uint32_t count[3][BVHBINS]{}; // 96 bytes ALIGNED( 64 ) __m256 bestLBox, bestRBox; // 64 bytes // some constants static const __m128 max4 = _mm_set1_ps( -BVH_FAR ), half4 = _mm_set1_ps( 0.5f ); static const __m128 two4 = _mm_set1_ps( 2.0f ), min1 = _mm_set1_ps( -1 ); - static const __m256 max8 = _mm256_set1_ps( -BVH_FAR ); - static const __m256 signFlip8 = _mm256_setr_ps( -0.0f, -0.0f, -0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f ); + static const __m128i maxbin4 = _mm_set1_epi32( 7 ); static const __m128 signFlip4 = _mm_setr_ps( -0.0f, -0.0f, -0.0f, 0.0f ); static const __m128 mask3 = _mm_cmpeq_ps( _mm_setr_ps( 0, 0, 0, 1 ), _mm_setzero_ps() ); static const __m128 binmul3 = _mm_set1_ps( BVHBINS * 0.49999f ); - for (unsigned i = 0; i < 3 * BVHBINS; i++) binboxOrig[i] = max8; // binbox initialization template + static const __m256 max8 = _mm256_set1_ps( -BVH_FAR ); + static const __m256 signFlip8 = _mm256_setr_ps( -0.0f, -0.0f, -0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f ); + for (uint32_t i = 0; i < 3 * BVHBINS; i++) binboxOrig[i] = max8; // binbox initialization template // reset node pool - const unsigned spaceNeeded = primCount * 2; + const uint32_t spaceNeeded = primCount * 2; if (allocatedBVHNodes < spaceNeeded) { AlignedFree( bvhNode ); AlignedFree( triIdx ); AlignedFree( fragment ); - triIdx = (unsigned*)AlignedAlloc( primCount * sizeof( unsigned ) ); + triIdx = (uint32_t*)AlignedAlloc( 
primCount * sizeof( uint32_t ) ); bvhNode = (BVHNode*)AlignedAlloc( spaceNeeded * sizeof( BVHNode ) ); allocatedBVHNodes = spaceNeeded; memset( &bvhNode[1], 0, 32 ); // avoid crash in refit. fragment = (Fragment*)AlignedAlloc( primCount * sizeof( Fragment ) ); } else FATAL_ERROR_IF( !rebuildable, "BVH::BuildAVX( .. ), bvh not rebuildable." ); - verts = (bvhvec4*)vertices; // note: we're not copying this data; don't delete. + verts = bvhvec4slice{ vertices, primCount * 3 }; // note: we're not copying this data; don't delete. triCount = idxCount = primCount; - unsigned newNodePtr = 2; + uint32_t newNodePtr = 2; struct FragSSE { __m128 bmin4, bmax4; }; FragSSE* frag4 = (FragSSE*)fragment; __m256* frag8 = (__m256*)fragment; - const __m128* verts4 = (__m128*)verts; + const __m128* verts4 = (__m128*)vertices; // assign all triangles to the root node BVHNode& root = bvhNode[0]; root.leftFirst = 0, root.triCount = triCount; // initialize fragments and update root bounds __m128 rootMin = max4, rootMax = max4; - for (unsigned i = 0; i < triCount; i++) + for (uint32_t i = 0; i < triCount; i++) { const __m128 v1 = _mm_xor_ps( signFlip4, _mm_min_ps( _mm_min_ps( verts4[i * 3], verts4[i * 3 + 1] ), verts4[i * 3 + 2] ) ); const __m128 v2 = _mm_max_ps( _mm_max_ps( verts4[i * 3], verts4[i * 3 + 1] ), verts4[i * 3 + 2] ); @@ -3020,7 +3153,7 @@ void BVH::BuildAVX( const bvhvec4* vertices, const unsigned primCount ) rootMin = _mm_xor_ps( rootMin, signFlip4 ); root.aabbMin = *(bvhvec3*)&rootMin, root.aabbMax = *(bvhvec3*)&rootMax; // subdivide recursively - ALIGNED( 64 ) unsigned task[128], taskCount = 0, nodeIdx = 0; + ALIGNED( 64 ) uint32_t task[128], taskCount = 0, nodeIdx = 0; const bvhvec3 minDim = (root.aabbMax - root.aabbMin) * 1e-7f; while (1) { @@ -3034,26 +3167,28 @@ void BVH::BuildAVX( const bvhvec4* vertices, const unsigned primCount ) const __m128 rpd4 = _mm_and_ps( _mm_div_ps( binmul3, d4 ), _mm_cmpneq_ps( d4, _mm_setzero_ps() ) ); // implementation of Section 4.1 of 
"Parallel Spatial Splits in Bounding Volume Hierarchies": // main loop operates on two fragments to minimize dependencies and maximize ILP. - unsigned fi = triIdx[node.leftFirst]; + uint32_t fi = triIdx[node.leftFirst]; memset( count, 0, sizeof( count ) ); __m256 r0, r1, r2, f = frag8[fi]; - __m128i bi4 = _mm_cvtps_epi32( _mm_sub_ps( _mm_mul_ps( _mm_sub_ps( _mm_sub_ps( frag4[fi].bmax4, frag4[fi].bmin4 ), nmin4 ), rpd4 ), half4 ) ); + const __m128i bi4 = _mm_cvtps_epi32( _mm_sub_ps( _mm_mul_ps( _mm_sub_ps( _mm_sub_ps( frag4[fi].bmax4, frag4[fi].bmin4 ), nmin4 ), rpd4 ), half4 ) ); + const __m128i b4c = _mm_max_epi32( _mm_min_epi32( bi4, maxbin4 ), _mm_setzero_si128() ); // clamp needed after all memcpy( binbox, binboxOrig, sizeof( binbox ) ); - unsigned i0 = ILANE( bi4, 0 ), i1 = ILANE( bi4, 1 ), i2 = ILANE( bi4, 2 ), * ti = triIdx + node.leftFirst + 1; - for (unsigned i = 0; i < node.triCount - 1; i++) + uint32_t i0 = ILANE( b4c, 0 ), i1 = ILANE( b4c, 1 ), i2 = ILANE( b4c, 2 ), * ti = triIdx + node.leftFirst + 1; + for (uint32_t i = 0; i < node.triCount - 1; i++) { - unsigned fid = *ti++; + uint32_t fid = *ti++; #if defined __GNUC__ || _MSC_VER < 1920 if (fid > triCount) fid = triCount - 1; // never happens but g++ *and* vs2017 need this to not crash... 
#endif const __m256 b0 = binbox[i0], b1 = binbox[BVHBINS + i1], b2 = binbox[2 * BVHBINS + i2]; const __m128 fmin = frag4[fid].bmin4, fmax = frag4[fid].bmax4; r0 = _mm256_max_ps( b0, f ), r1 = _mm256_max_ps( b1, f ), r2 = _mm256_max_ps( b2, f ); - const __m128i b4 = _mm_cvtps_epi32( _mm_sub_ps( _mm_mul_ps( _mm_sub_ps( _mm_sub_ps( fmax, fmin ), nmin4 ), rpd4 ), half4 ) ); + const __m128i bi4 = _mm_cvtps_epi32( _mm_sub_ps( _mm_mul_ps( _mm_sub_ps( _mm_sub_ps( fmax, fmin ), nmin4 ), rpd4 ), half4 ) ); + const __m128i b4c = _mm_max_epi32( _mm_min_epi32( bi4, maxbin4 ), _mm_setzero_si128() ); // clamp needed after all f = frag8[fid], count[0][i0]++, count[1][i1]++, count[2][i2]++; - binbox[i0] = r0, i0 = ILANE( b4, 0 ); - binbox[BVHBINS + i1] = r1, i1 = ILANE( b4, 1 ); - binbox[2 * BVHBINS + i2] = r2, i2 = ILANE( b4, 2 ); + binbox[i0] = r0, i0 = ILANE( b4c, 0 ); + binbox[BVHBINS + i1] = r1, i1 = ILANE( b4c, 1 ); + binbox[2 * BVHBINS + i2] = r2, i2 = ILANE( b4c, 2 ); } // final business for final fragment const __m256 b0 = binbox[i0], b1 = binbox[BVHBINS + i1], b2 = binbox[2 * BVHBINS + i2]; @@ -3062,21 +3197,21 @@ void BVH::BuildAVX( const bvhvec4* vertices, const unsigned primCount ) binbox[i0] = r0, binbox[BVHBINS + i1] = r1, binbox[2 * BVHBINS + i2] = r2; // calculate per-split totals float splitCost = BVH_FAR, rSAV = 1.0f / node.SurfaceArea(); - unsigned bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst; + uint32_t bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst; const __m256* bb = binbox; - for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) + for (int32_t a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) { // hardcoded bin processing for BVHBINS == 8 assert( BVHBINS == 8 ); - const unsigned lN0 = count[a][0], rN0 = count[a][7]; + const uint32_t lN0 = count[a][0], rN0 = 
count[a][7]; const __m256 lb0 = bb[0], rb0 = bb[7]; - const unsigned lN1 = lN0 + count[a][1], rN1 = rN0 + count[a][6], lN2 = lN1 + count[a][2]; - const unsigned rN2 = rN1 + count[a][5], lN3 = lN2 + count[a][3], rN3 = rN2 + count[a][4]; + const uint32_t lN1 = lN0 + count[a][1], rN1 = rN0 + count[a][6], lN2 = lN1 + count[a][2]; + const uint32_t rN2 = rN1 + count[a][5], lN3 = lN2 + count[a][3], rN3 = rN2 + count[a][4]; const __m256 lb1 = _mm256_max_ps( lb0, bb[1] ), rb1 = _mm256_max_ps( rb0, bb[6] ); const __m256 lb2 = _mm256_max_ps( lb1, bb[2] ), rb2 = _mm256_max_ps( rb1, bb[5] ); const __m256 lb3 = _mm256_max_ps( lb2, bb[3] ), rb3 = _mm256_max_ps( rb2, bb[4] ); - const unsigned lN4 = lN3 + count[a][4], rN4 = rN3 + count[a][3], lN5 = lN4 + count[a][5]; - const unsigned rN5 = rN4 + count[a][2], lN6 = lN5 + count[a][6], rN6 = rN5 + count[a][1]; + const uint32_t lN4 = lN3 + count[a][4], rN4 = rN3 + count[a][3], lN5 = lN4 + count[a][5]; + const uint32_t rN5 = rN4 + count[a][2], lN6 = lN5 + count[a][6], rN6 = rN5 + count[a][1]; const __m256 lb4 = _mm256_max_ps( lb3, bb[4] ), rb4 = _mm256_max_ps( rb3, bb[3] ); const __m256 lb5 = _mm256_max_ps( lb4, bb[5] ), rb5 = _mm256_max_ps( rb4, bb[2] ); const __m256 lb6 = _mm256_max_ps( lb5, bb[6] ), rb6 = _mm256_max_ps( rb5, bb[1] ); @@ -3092,14 +3227,14 @@ void BVH::BuildAVX( const bvhvec4* vertices, const unsigned primCount ) if (splitCost >= noSplitCost) break; // not splitting is better. 
// in-place partition const float rpd = (*(bvhvec3*)&rpd4)[bestAxis], nmin = (*(bvhvec3*)&nmin4)[bestAxis]; - unsigned t, fr = triIdx[src]; - for (unsigned i = 0; i < node.triCount; i++) + uint32_t t, fr = triIdx[src]; + for (uint32_t i = 0; i < node.triCount; i++) { - const unsigned bi = (unsigned)((fragment[fr].bmax[bestAxis] - fragment[fr].bmin[bestAxis] - nmin) * rpd); + const uint32_t bi = (uint32_t)((fragment[fr].bmax[bestAxis] - fragment[fr].bmin[bestAxis] - nmin) * rpd); if (bi <= bestPos) fr = triIdx[++src]; else t = fr, fr = triIdx[src] = triIdx[--j], triIdx[j] = t; } // create child nodes and recurse - const unsigned leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; + const uint32_t leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; if (leftCount == 0 || rightCount == 0) break; // should not happen. *(__m256*)& bvhNode[n] = _mm256_xor_ps( bestLBox, signFlip8 ); bvhNode[n].leftFirst = node.leftFirst, bvhNode[n].triCount = leftCount; @@ -3142,27 +3277,27 @@ void BVH::Intersect256RaysSSE( Ray* packet ) const bvhvec3 plane1 = normalize( cross( p3 - O, p3 - p1 ) ); // right plane bvhvec3 plane2 = normalize( cross( p1 - O, p1 - p0 ) ); // top plane bvhvec3 plane3 = normalize( cross( p2 - O, p2 - p3 ) ); // bottom plane - int sign0x = plane0.x < 0 ? 4 : 0, sign0y = plane0.y < 0 ? 5 : 1, sign0z = plane0.z < 0 ? 6 : 2; - int sign1x = plane1.x < 0 ? 4 : 0, sign1y = plane1.y < 0 ? 5 : 1, sign1z = plane1.z < 0 ? 6 : 2; - int sign2x = plane2.x < 0 ? 4 : 0, sign2y = plane2.y < 0 ? 5 : 1, sign2z = plane2.z < 0 ? 6 : 2; - int sign3x = plane3.x < 0 ? 4 : 0, sign3y = plane3.y < 0 ? 5 : 1, sign3z = plane3.z < 0 ? 6 : 2; + int32_t sign0x = plane0.x < 0 ? 4 : 0, sign0y = plane0.y < 0 ? 5 : 1, sign0z = plane0.z < 0 ? 6 : 2; + int32_t sign1x = plane1.x < 0 ? 4 : 0, sign1y = plane1.y < 0 ? 5 : 1, sign1z = plane1.z < 0 ? 6 : 2; + int32_t sign2x = plane2.x < 0 ? 4 : 0, sign2y = plane2.y < 0 ? 5 : 1, sign2z = plane2.z < 0 ? 
6 : 2; + int32_t sign3x = plane3.x < 0 ? 4 : 0, sign3y = plane3.y < 0 ? 5 : 1, sign3z = plane3.z < 0 ? 6 : 2; float t0 = dot( O, plane0 ), t1 = dot( O, plane1 ); float t2 = dot( O, plane2 ), t3 = dot( O, plane3 ); // Traverse the tree with the packet - int first = 0, last = 255; // first and last active ray in the packet + int32_t first = 0, last = 255; // first and last active ray in the packet BVHNode* node = &bvhNode[0]; - ALIGNED( 64 ) unsigned stack[64], stackPtr = 0; + ALIGNED( 64 ) uint32_t stack[64], stackPtr = 0; while (1) { if (node->isLeaf()) { // handle leaf node - for (unsigned j = 0; j < node->triCount; j++) + for (uint32_t j = 0; j < node->triCount; j++) { - const unsigned idx = triIdx[node->leftFirst + j], vid = idx * 3; + const uint32_t idx = triIdx[node->leftFirst + j], vid = idx * 3; const bvhvec3 edge1 = verts[vid + 1] - verts[vid], edge2 = verts[vid + 2] - verts[vid]; const bvhvec3 s = O - bvhvec3( verts[vid] ); - for (int i = first; i <= last; i++) + for (int32_t i = first; i <= last; i++) { Ray& ray = packet[i]; const bvhvec3 h = cross( ray.D, edge2 ); @@ -3188,7 +3323,7 @@ void BVH::Intersect256RaysSSE( Ray* packet ) const BVHNode* left = bvhNode + node->leftFirst; BVHNode* right = bvhNode + node->leftFirst + 1; bool visitLeft = true, visitRight = true; - int leftFirst = first, leftLast = last, rightFirst = first, rightLast = last; + int32_t leftFirst = first, leftLast = last, rightFirst = first, rightLast = last; float distLeft, distRight; { // see if we want to intersect the left child @@ -3319,10 +3454,10 @@ void BVH::Intersect256RaysSSE( Ray* packet ) const } // Traverse the second alternative BVH layout (ALT_SOA). 
-int BVH::Intersect_AltSoA( Ray& ray ) const +int32_t BVH::Intersect_AltSoA( Ray& ray ) const { BVHNodeAlt2* node = &alt2Node[0], * stack[64]; - unsigned stackPtr = 0, steps = 0; + uint32_t stackPtr = 0, steps = 0; const __m128 Ox4 = _mm_set1_ps( ray.O.x ), rDx4 = _mm_set1_ps( ray.rD.x ); const __m128 Oy4 = _mm_set1_ps( ray.O.y ), rDy4 = _mm_set1_ps( ray.rD.y ); const __m128 Oz4 = _mm_set1_ps( ray.O.z ), rDz4 = _mm_set1_ps( ray.rD.z ); @@ -3331,9 +3466,9 @@ int BVH::Intersect_AltSoA( Ray& ray ) const steps++; if (node->isLeaf()) { - for (unsigned i = 0; i < node->triCount; i++) + for (uint32_t i = 0; i < node->triCount; i++) { - const unsigned tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3; + const uint32_t tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3; const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; const bvhvec3 h = cross( ray.D, edge2 ); @@ -3372,7 +3507,7 @@ int BVH::Intersect_AltSoA( Ray& ray ) const x4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); y4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); z4 = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); - unsigned lidx = node->left, ridx = node->right; + uint32_t lidx = node->left, ridx = node->right; const __m128 min4 = _mm_max_ps( _mm_max_ps( _mm_max_ps( x4, y4 ), z4 ), _mm_setzero_ps() ); const __m128 max4 = _mm_min_ps( _mm_min_ps( _mm_min_ps( x4, y4 ), z4 ), _mm_set1_ps( ray.hit.t ) ); #if 0 @@ -3392,7 +3527,7 @@ int BVH::Intersect_AltSoA( Ray& ray ) const if (dist1 > dist2) { float t = dist1; dist1 = dist2; dist2 = t; - unsigned i = lidx; lidx = ridx; ridx = i; + uint32_t i = lidx; lidx = ridx; ridx = i; } if (dist1 == BVH_FAR) { @@ -3411,7 +3546,7 @@ int BVH::Intersect_AltSoA( Ray& ray ) const bool BVH::IsOccluded_AltSoA( const Ray& ray ) const { BVHNodeAlt2* node = &alt2Node[0], * stack[64]; - unsigned stackPtr = 0; + uint32_t stackPtr = 0; const __m128 Ox4 = _mm_set1_ps( ray.O.x ), rDx4 = _mm_set1_ps( 
ray.rD.x ); const __m128 Oy4 = _mm_set1_ps( ray.O.y ), rDy4 = _mm_set1_ps( ray.rD.y ); const __m128 Oz4 = _mm_set1_ps( ray.O.z ), rDz4 = _mm_set1_ps( ray.rD.z ); @@ -3419,9 +3554,9 @@ bool BVH::IsOccluded_AltSoA( const Ray& ray ) const { if (node->isLeaf()) { - for (unsigned i = 0; i < node->triCount; i++) + for (uint32_t i = 0; i < node->triCount; i++) { - const unsigned tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3; + const uint32_t tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3; const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; const bvhvec3 h = cross( ray.D, edge2 ); @@ -3459,7 +3594,7 @@ bool BVH::IsOccluded_AltSoA( const Ray& ray ) const x4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); y4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); z4 = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); - unsigned lidx = node->left, ridx = node->right; + uint32_t lidx = node->left, ridx = node->right; const __m128 min4 = _mm_max_ps( _mm_max_ps( _mm_max_ps( x4, y4 ), z4 ), _mm_setzero_ps() ); const __m128 max4 = _mm_min_ps( _mm_min_ps( _mm_min_ps( x4, y4 ), z4 ), _mm_set1_ps( ray.hit.t ) ); #if 0 @@ -3479,7 +3614,7 @@ bool BVH::IsOccluded_AltSoA( const Ray& ray ) const if (dist1 > dist2) { float t = dist1; dist1 = dist2; dist2 = t; - unsigned i = lidx; lidx = ridx; ridx = i; + uint32_t i = lidx; lidx = ridx; ridx = i; } if (dist1 == BVH_FAR) { @@ -3499,19 +3634,7 @@ bool BVH::IsOccluded_AltSoA( const Ray& ray ) const // Not technically limited to BVH_USEAVX, but __lzcnt and __popcnt will require // exotic compiler flags (in combination with __builtin_ia32_lzcnt_u32), so... Since // this is just here to test data before it goes to the GPU: MSVC-only for now. 
-static unsigned __bfind( unsigned x ) // https://github.com/mackron/refcode/blob/master/lzcnt.c -{ -#if defined(_MSC_VER) && !defined(__clang__) - return 31 - __lzcnt( x ); -#elif defined(__EMSCRIPTEN__) - return 31 - __builtin_clz( x ); -#elif defined(__GNUC__) || defined(__clang__) - unsigned r; - __asm__ __volatile__( "lzcnt{l %1, %0| %0, %1}" : "=r"(r) : "r"(x) : "cc" ); - return 31 - r; -#endif -} -static unsigned __popc( unsigned x ) +static uint32_t __popc( uint32_t x ) { #if defined(_MSC_VER) && !defined(__clang__) return __popcnt( x ); @@ -3521,61 +3644,61 @@ static unsigned __popc( unsigned x ) } #define STACK_POP() { ngroup = traversalStack[--stackPtr]; } #define STACK_PUSH() { traversalStack[stackPtr++] = ngroup; } -static inline unsigned extract_byte( const unsigned i, const unsigned n ) { return (i >> (n * 8)) & 0xFF; } -static inline unsigned sign_extend_s8x4( const unsigned i ) +static inline uint32_t extract_byte( const uint32_t i, const uint32_t n ) { return (i >> (n * 8)) & 0xFF; } +static inline uint32_t sign_extend_s8x4( const uint32_t i ) { - // asm("prmt.b32 %0, %1, 0x0, 0x0000BA98;" : "=r"(v) : "r"(i)); // BA98: 1011`1010`1001`1000 - // with the given parameters, prmt will extend the sign to all bits in a byte. - unsigned b0 = (i & 0b10000000000000000000000000000000) ? 0xff000000 : 0; - unsigned b1 = (i & 0b00000000100000000000000000000000) ? 0x00ff0000 : 0; - unsigned b2 = (i & 0b00000000000000001000000000000000) ? 0x0000ff00 : 0; - unsigned b3 = (i & 0b00000000000000000000000010000000) ? 0x000000ff : 0; + // asm("prmt.b32 %0, %1, 0x0, 0x0000BA98;" : "=r"(v) : "r"(i)); // BA98: 1011`1010`1001`1000 + // with the given parameters, prmt will extend the sign to all bits in a byte. + uint32_t b0 = (i & 0b10000000000000000000000000000000) ? 0xff000000 : 0; + uint32_t b1 = (i & 0b00000000100000000000000000000000) ? 0x00ff0000 : 0; + uint32_t b2 = (i & 0b00000000000000001000000000000000) ? 
0x0000ff00 : 0; + uint32_t b3 = (i & 0b00000000000000000000000010000000) ? 0x000000ff : 0; return b0 + b1 + b2 + b3; // probably can do better than this. } -int BVH::Intersect_CWBVH( Ray& ray ) const +int32_t BVH::Intersect_CWBVH( Ray& ray ) const { bvhuint2 traversalStack[128]; - unsigned hitAddr = 0, stackPtr = 0; + uint32_t hitAddr = 0, stackPtr = 0; bvhvec2 triangleuv( 0, 0 ); const bvhvec4* blasNodes = bvh8Compact; const bvhvec4* blasTris = bvh8Tris; float tmin = 0, tmax = ray.hit.t; - const unsigned octinv = (7 - ((ray.D.x < 0 ? 4 : 0) | (ray.D.y < 0 ? 2 : 0) | (ray.D.z < 0 ? 1 : 0))) * 0x1010101; + const uint32_t octinv = (7 - ((ray.D.x < 0 ? 4 : 0) | (ray.D.y < 0 ? 2 : 0) | (ray.D.z < 0 ? 1 : 0))) * 0x1010101; bvhuint2 ngroup = bvhuint2( 0, 0b10000000000000000000000000000000 ), tgroup = bvhuint2( 0 ); do { if (ngroup.y > 0x00FFFFFF) { - const unsigned hits = ngroup.y, imask = ngroup.y; - const unsigned child_bit_index = __bfind( hits ); - const unsigned child_node_base_index = ngroup.x; + const uint32_t hits = ngroup.y, imask = ngroup.y; + const uint32_t child_bit_index = __bfind( hits ); + const uint32_t child_node_base_index = ngroup.x; ngroup.y &= ~(1 << child_bit_index); if (ngroup.y > 0x00FFFFFF) { STACK_PUSH( /* nodeGroup */ ); } { - const unsigned slot_index = (child_bit_index - 24) ^ (octinv & 255); - const unsigned relative_index = __popc( imask & ~(0xFFFFFFFF << slot_index) ); - const unsigned child_node_index = child_node_base_index + relative_index; + const uint32_t slot_index = (child_bit_index - 24) ^ (octinv & 255); + const uint32_t relative_index = __popc( imask & ~(0xFFFFFFFF << slot_index) ); + const uint32_t child_node_index = child_node_base_index + relative_index; const bvhvec4 n0 = blasNodes[child_node_index * 5 + 0], n1 = blasNodes[child_node_index * 5 + 1]; const bvhvec4 n2 = blasNodes[child_node_index * 5 + 2], n3 = blasNodes[child_node_index * 5 + 3]; const bvhvec4 n4 = blasNodes[child_node_index * 5 + 4], p = n0; bvhint3 e; - e.x 
= (int)*((char*)&n0.w + 0), e.y = (int)*((char*)&n0.w + 1), e.z = (int)*((char*)&n0.w + 2); + e.x = (int32_t)*((int8_t*)&n0.w + 0), e.y = (int32_t)*((int8_t*)&n0.w + 1), e.z = (int32_t)*((int8_t*)&n0.w + 2); ngroup.x = as_uint( n1.x ), tgroup.x = as_uint( n1.y ), tgroup.y = 0; - unsigned hitmask = 0; - const unsigned vx = (e.x + 127) << 23u; const float adjusted_idirx = *(float*)&vx * ray.rD.x; - const unsigned vy = (e.y + 127) << 23u; const float adjusted_idiry = *(float*)&vy * ray.rD.y; - const unsigned vz = (e.z + 127) << 23u; const float adjusted_idirz = *(float*)&vz * ray.rD.z; + uint32_t hitmask = 0; + const uint32_t vx = (e.x + 127) << 23u; const float adjusted_idirx = *(float*)&vx * ray.rD.x; + const uint32_t vy = (e.y + 127) << 23u; const float adjusted_idiry = *(float*)&vy * ray.rD.y; + const uint32_t vz = (e.z + 127) << 23u; const float adjusted_idirz = *(float*)&vz * ray.rD.z; const float origx = -(ray.O.x - p.x) * ray.rD.x; const float origy = -(ray.O.y - p.y) * ray.rD.y; const float origz = -(ray.O.z - p.z) * ray.rD.z; { // First 4 - const unsigned meta4 = *(unsigned*)&n1.z, is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010; - const unsigned inner_mask4 = sign_extend_s8x4( is_inner4 << 3 ); - const unsigned bit_index4 = (meta4 ^ (octinv & inner_mask4)) & 0x1F1F1F1F; - const unsigned child_bits4 = (meta4 >> 5) & 0x07070707; - unsigned swizzledLox = (ray.rD.x < 0) ? *(unsigned*)&n3.z : *(unsigned*)&n2.x, swizzledHix = (ray.rD.x < 0) ? *(unsigned*)&n2.x : *(unsigned*)&n3.z; - unsigned swizzledLoy = (ray.rD.y < 0) ? *(unsigned*)&n4.x : *(unsigned*)&n2.z, swizzledHiy = (ray.rD.y < 0) ? *(unsigned*)&n2.z : *(unsigned*)&n4.x; - unsigned swizzledLoz = (ray.rD.z < 0) ? *(unsigned*)&n4.z : *(unsigned*)&n3.x, swizzledHiz = (ray.rD.z < 0) ? 
*(unsigned*)&n3.x : *(unsigned*)&n4.z; + const uint32_t meta4 = *(uint32_t*)&n1.z, is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010; + const uint32_t inner_mask4 = sign_extend_s8x4( is_inner4 << 3 ); + const uint32_t bit_index4 = (meta4 ^ (octinv & inner_mask4)) & 0x1F1F1F1F; + const uint32_t child_bits4 = (meta4 >> 5) & 0x07070707; + uint32_t swizzledLox = (ray.rD.x < 0) ? *(uint32_t*)&n3.z : *(uint32_t*)&n2.x, swizzledHix = (ray.rD.x < 0) ? *(uint32_t*)&n2.x : *(uint32_t*)&n3.z; + uint32_t swizzledLoy = (ray.rD.y < 0) ? *(uint32_t*)&n4.x : *(uint32_t*)&n2.z, swizzledHiy = (ray.rD.y < 0) ? *(uint32_t*)&n2.z : *(uint32_t*)&n4.x; + uint32_t swizzledLoz = (ray.rD.z < 0) ? *(uint32_t*)&n4.z : *(uint32_t*)&n3.x, swizzledHiz = (ray.rD.z < 0) ? *(uint32_t*)&n3.x : *(uint32_t*)&n4.z; float tminx[4], tminy[4], tminz[4], tmaxx[4], tmaxy[4], tmaxz[4]; tminx[0] = ((swizzledLox >> 0) & 0xFF) * adjusted_idirx + origx, tminx[1] = ((swizzledLox >> 8) & 0xFF) * adjusted_idirx + origx, tminx[2] = ((swizzledLox >> 16) & 0xFF) * adjusted_idirx + origx; tminx[3] = ((swizzledLox >> 24) & 0xFF) * adjusted_idirx + origx, tminy[0] = ((swizzledLoy >> 0) & 0xFF) * adjusted_idiry + origy, tminy[1] = ((swizzledLoy >> 8) & 0xFF) * adjusted_idiry + origy; @@ -3585,7 +3708,7 @@ int BVH::Intersect_CWBVH( Ray& ray ) const tmaxx[3] = ((swizzledHix >> 24) & 0xFF) * adjusted_idirx + origx, tmaxy[0] = ((swizzledHiy >> 0) & 0xFF) * adjusted_idiry + origy, tmaxy[1] = ((swizzledHiy >> 8) & 0xFF) * adjusted_idiry + origy; tmaxy[2] = ((swizzledHiy >> 16) & 0xFF) * adjusted_idiry + origy, tmaxy[3] = ((swizzledHiy >> 24) & 0xFF) * adjusted_idiry + origy, tmaxz[0] = ((swizzledHiz >> 0) & 0xFF) * adjusted_idirz + origz; tmaxz[1] = ((swizzledHiz >> 8) & 0xFF) * adjusted_idirz + origz, tmaxz[2] = ((swizzledHiz >> 16) & 0xFF) * adjusted_idirz + origz, tmaxz[3] = ((swizzledHiz >> 24) & 0xFF) * adjusted_idirz + origz; - for (int i = 0; i < 4; i++) + for (int32_t i = 0; i < 4; i++) { // Use VMIN, VMAX to compute the 
slabs const float cmin = tinybvh_max( tinybvh_max( tinybvh_max( tminx[i], tminy[i] ), tminz[i] ), tmin ); @@ -3594,13 +3717,13 @@ int BVH::Intersect_CWBVH( Ray& ray ) const } } { // Second 4 - const unsigned meta4 = *(unsigned*)&n1.w, is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010; - const unsigned inner_mask4 = sign_extend_s8x4( is_inner4 << 3 ); - const unsigned bit_index4 = (meta4 ^ (octinv & inner_mask4)) & 0x1F1F1F1F; - const unsigned child_bits4 = (meta4 >> 5) & 0x07070707; - unsigned swizzledLox = (ray.rD.x < 0) ? *(unsigned*)&n3.w : *(unsigned*)&n2.y, swizzledHix = (ray.rD.x < 0) ? *(unsigned*)&n2.y : *(unsigned*)&n3.w; - unsigned swizzledLoy = (ray.rD.y < 0) ? *(unsigned*)&n4.y : *(unsigned*)&n2.w, swizzledHiy = (ray.rD.y < 0) ? *(unsigned*)&n2.w : *(unsigned*)&n4.y; - unsigned swizzledLoz = (ray.rD.z < 0) ? *(unsigned*)&n4.w : *(unsigned*)&n3.y, swizzledHiz = (ray.rD.z < 0) ? *(unsigned*)&n3.y : *(unsigned*)&n4.w; + const uint32_t meta4 = *(uint32_t*)&n1.w, is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010; + const uint32_t inner_mask4 = sign_extend_s8x4( is_inner4 << 3 ); + const uint32_t bit_index4 = (meta4 ^ (octinv & inner_mask4)) & 0x1F1F1F1F; + const uint32_t child_bits4 = (meta4 >> 5) & 0x07070707; + uint32_t swizzledLox = (ray.rD.x < 0) ? *(uint32_t*)&n3.w : *(uint32_t*)&n2.y, swizzledHix = (ray.rD.x < 0) ? *(uint32_t*)&n2.y : *(uint32_t*)&n3.w; + uint32_t swizzledLoy = (ray.rD.y < 0) ? *(uint32_t*)&n4.y : *(uint32_t*)&n2.w, swizzledHiy = (ray.rD.y < 0) ? *(uint32_t*)&n2.w : *(uint32_t*)&n4.y; + uint32_t swizzledLoz = (ray.rD.z < 0) ? *(uint32_t*)&n4.w : *(uint32_t*)&n3.y, swizzledHiz = (ray.rD.z < 0) ? 
*(uint32_t*)&n3.y : *(uint32_t*)&n4.w; float tminx[4], tminy[4], tminz[4], tmaxx[4], tmaxy[4], tmaxz[4]; tminx[0] = ((swizzledLox >> 0) & 0xFF) * adjusted_idirx + origx, tminx[1] = ((swizzledLox >> 8) & 0xFF) * adjusted_idirx + origx, tminx[2] = ((swizzledLox >> 16) & 0xFF) * adjusted_idirx + origx; tminx[3] = ((swizzledLox >> 24) & 0xFF) * adjusted_idirx + origx, tminy[0] = ((swizzledLoy >> 0) & 0xFF) * adjusted_idiry + origy, tminy[1] = ((swizzledLoy >> 8) & 0xFF) * adjusted_idiry + origy; @@ -3610,7 +3733,7 @@ int BVH::Intersect_CWBVH( Ray& ray ) const tmaxx[3] = ((swizzledHix >> 24) & 0xFF) * adjusted_idirx + origx, tmaxy[0] = ((swizzledHiy >> 0) & 0xFF) * adjusted_idiry + origy, tmaxy[1] = ((swizzledHiy >> 8) & 0xFF) * adjusted_idiry + origy; tmaxy[2] = ((swizzledHiy >> 16) & 0xFF) * adjusted_idiry + origy, tmaxy[3] = ((swizzledHiy >> 24) & 0xFF) * adjusted_idiry + origy, tmaxz[0] = ((swizzledHiz >> 0) & 0xFF) * adjusted_idirz + origz; tmaxz[1] = ((swizzledHiz >> 8) & 0xFF) * adjusted_idirz + origz, tmaxz[2] = ((swizzledHiz >> 16) & 0xFF) * adjusted_idirz + origz, tmaxz[3] = ((swizzledHiz >> 24) & 0xFF) * adjusted_idirz + origz; - for (int i = 0; i < 4; i++) + for (int32_t i = 0; i < 4; i++) { const float cmin = tinybvh_max( tinybvh_max( tinybvh_max( tminx[i], tminy[i] ), tminz[i] ), tmin ); const float cmax = tinybvh_min( tinybvh_min( tinybvh_min( tmaxx[i], tmaxy[i] ), tmaxz[i] ), tmax ); @@ -3623,7 +3746,7 @@ int BVH::Intersect_CWBVH( Ray& ray ) const else tgroup = ngroup, ngroup = bvhuint2( 0 ); while (tgroup.y != 0) { - unsigned triangleIndex = __bfind( tgroup.y ); + uint32_t triangleIndex = __bfind( tgroup.y ); #ifdef CWBVH_COMPRESSED_TRIS const float* T = (float*)&blasTris[tgroup.x + triangleIndex * 4]; const float transS = T[8] * ray.O.x + T[9] * ray.O.y + T[10] * ray.O.z + T[11]; @@ -3635,10 +3758,10 @@ int BVH::Intersect_CWBVH( Ray& ray ) const const float u = T[0] * wr.x + T[1] * wr.y + T[2] * wr.z + T[3]; const float v = T[4] * wr.x + T[5] * wr.y + 
T[6] * wr.z + T[7]; const bool hit = u >= 0 && v >= 0 && u + v < 1; - if (hit) triangleuv = bvhvec2( u, v ), tmax = ta, hitAddr = *(unsigned*)&T[15]; + if (hit) triangleuv = bvhvec2( u, v ), tmax = ta, hitAddr = *(uint32_t*)&T[15]; } #else - int triAddr = tgroup.x + triangleIndex * 3; + int32_t triAddr = tgroup.x + triangleIndex * 3; const bvhvec3 v0 = blasTris[triAddr]; const bvhvec3 edge1 = bvhvec3( blasTris[triAddr + 1] ) - v0; const bvhvec3 edge2 = bvhvec3( blasTris[triAddr + 2] ) - v0; @@ -3694,11 +3817,11 @@ inline void IntersectCompactTri( Ray& r, __m128& t4, const float* T ) const float u = T[0] * wr.x + T[1] * wr.y + T[2] * wr.z + T[3]; const float v = T[4] * wr.x + T[5] * wr.y + T[6] * wr.z + T[7]; const bool hit = u >= 0 && v >= 0 && u + v < 1; - if (hit) r.hit = { ta, u, v, *(unsigned*)&T[15] }, t4 = _mm_set1_ps( ta ); + if (hit) r.hit = { ta, u, v, *(uint32_t*)&T[15] }, t4 = _mm_set1_ps( ta ); } -int BVH::Intersect_Afra( Ray& ray ) const +int32_t BVH::Intersect_Afra( Ray& ray ) const { - unsigned nodeIdx = 0, stack[1024], stackPtr = 0, steps = 0; + uint32_t nodeIdx = 0, stack[1024], stackPtr = 0, steps = 0; const __m128 ox4 = _mm_set1_ps( ray.O.x ), rdx4 = _mm_set1_ps( ray.rD.x ); const __m128 oy4 = _mm_set1_ps( ray.O.y ), rdy4 = _mm_set1_ps( ray.rD.y ); const __m128 oz4 = _mm_set1_ps( ray.O.z ), rdz4 = _mm_set1_ps( ray.rD.z ); @@ -3725,15 +3848,15 @@ int BVH::Intersect_Afra( Ray& ray ) const const __m128 tmin = _mm_max_ps( _mm_max_ps( txmin, tymin ), tzmin ); const __m128 tmax = _mm_min_ps( _mm_min_ps( txmax, tymax ), tzmax ); const __m128 hit = _mm_and_ps( _mm_and_ps( _mm_cmpge_ps( tmax, tmin ), _mm_cmplt_ps( tmin, t4 ) ), _mm_cmpge_ps( tmax, zero4 ) ); - const int hitBits = _mm_movemask_ps( hit ), hits = __popc( hitBits ); + const int32_t hitBits = _mm_movemask_ps( hit ), hits = __popc( hitBits ); if (hits == 1 /* 43% */) { // just one node was hit - no sorting needed. 
- const unsigned lane = __bfind( hitBits ), count = node.triCount[lane]; + const uint32_t lane = __bfind( hitBits ), count = node.triCount[lane]; if (count == 0) nodeIdx = node.childFirst[lane]; else { - const unsigned first = node.childFirst[lane]; - for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); if (stackPtr == 0) break; nodeIdx = stack[--stackPtr]; @@ -3749,19 +3872,19 @@ int BVH::Intersect_Afra( Ray& ray ) const if (hits == 2 /* 16% */) { // two nodes hit - unsigned lane0 = __bfind( hitBits ), lane1 = __bfind( hitBits - (1 << lane0) ); + uint32_t lane0 = __bfind( hitBits ), lane1 = __bfind( hitBits - (1 << lane0) ); float dist0 = ((float*)&tmin)[lane0], dist1 = ((float*)&tmin)[lane1]; if (dist1 < dist0) { - unsigned t = lane0; lane0 = lane1; lane1 = t; + uint32_t t = lane0; lane0 = lane1; lane1 = t; float ft = dist0; dist0 = dist1; dist1 = ft; } - const unsigned triCount0 = node.triCount[lane0], triCount1 = node.triCount[lane1]; + const uint32_t triCount0 = node.triCount[lane0], triCount1 = node.triCount[lane1]; // process first lane if (triCount0 == 0) nodeIdx = node.childFirst[lane0]; else { - const unsigned first = node.childFirst[lane0]; - for (unsigned j = 0; j < triCount0; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane0]; + for (uint32_t j = 0; j < triCount0; j++) // TODO: aim for 4 prims per leaf IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); nodeIdx = 0; } @@ -3773,8 +3896,8 @@ int BVH::Intersect_Afra( Ray& ray ) const } else { - const unsigned first = node.childFirst[lane1]; - for (unsigned j = 0; j < triCount1; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane1]; + for (uint32_t j = 0; j < triCount1; j++) // TODO: aim for 4 prims per leaf 
IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); } } @@ -3792,18 +3915,18 @@ int BVH::Intersect_Afra( Ray& ray ) const // process hits float d[4] = { d0, d1, d2, d3 }; nodeIdx = 0; - for (int i = 1; i < 4; i++) + for (int32_t i = 1; i < 4; i++) { - unsigned lane = *(unsigned*)&d[i] & 3; + uint32_t lane = *(uint32_t*)&d[i] & 3; if (node.triCount[lane] == 0) { - const unsigned childIdx = node.childFirst[lane]; + const uint32_t childIdx = node.childFirst[lane]; if (nodeIdx) stack[stackPtr++] = nodeIdx; nodeIdx = childIdx; continue; } - const unsigned first = node.childFirst[lane], count = node.triCount[lane]; - for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); } } @@ -3821,19 +3944,19 @@ int BVH::Intersect_Afra( Ray& ray ) const // process hits float d[4] = { d0, d1, d2, d3 }; nodeIdx = 0; - for (int i = 0; i < 4; i++) + for (int32_t i = 0; i < 4; i++) { - unsigned lane = *(unsigned*)&d[i] & 3; + uint32_t lane = *(uint32_t*)&d[i] & 3; if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? 
if (node.triCount[lane] == 0) { - const unsigned childIdx = node.childFirst[lane]; + const uint32_t childIdx = node.childFirst[lane]; if (nodeIdx) stack[stackPtr++] = nodeIdx; nodeIdx = childIdx; continue; } - const unsigned first = node.childFirst[lane], count = node.triCount[lane]; - for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); } } @@ -3862,7 +3985,7 @@ inline bool OccludedCompactTri( const Ray& r, const float* T ) #endif bool BVH::IsOccluded_Afra( const Ray& ray ) const { - unsigned nodeIdx = 0, stack[1024], stackPtr = 0; + uint32_t nodeIdx = 0, stack[1024], stackPtr = 0; const __m128 ox4 = _mm_set1_ps( ray.O.x ), rdx4 = _mm_set1_ps( ray.rD.x ); const __m128 oy4 = _mm_set1_ps( ray.O.y ), rdy4 = _mm_set1_ps( ray.rD.y ); const __m128 oz4 = _mm_set1_ps( ray.O.z ), rdz4 = _mm_set1_ps( ray.rD.z ); @@ -3888,15 +4011,15 @@ bool BVH::IsOccluded_Afra( const Ray& ray ) const const __m128 tmin = _mm_max_ps( _mm_max_ps( txmin, tymin ), tzmin ); const __m128 tmax = _mm_min_ps( _mm_min_ps( txmax, tymax ), tzmax ); const __m128 hit = _mm_and_ps( _mm_and_ps( _mm_cmpge_ps( tmax, tmin ), _mm_cmplt_ps( tmin, t4 ) ), _mm_cmpge_ps( tmax, zero4 ) ); - const int hitBits = _mm_movemask_ps( hit ), hits = __popc( hitBits ); + const int32_t hitBits = _mm_movemask_ps( hit ), hits = __popc( hitBits ); if (hits == 1 /* 43% */) { // just one node was hit - no sorting needed. 
- const unsigned lane = __bfind( hitBits ), count = node.triCount[lane]; + const uint32_t lane = __bfind( hitBits ), count = node.triCount[lane]; if (count == 0) nodeIdx = node.childFirst[lane]; else { - const unsigned first = node.childFirst[lane]; - for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; if (stackPtr == 0) break; nodeIdx = stack[--stackPtr]; @@ -3912,19 +4035,19 @@ bool BVH::IsOccluded_Afra( const Ray& ray ) const if (hits == 2 /* 16% */) { // two nodes hit - unsigned lane0 = __bfind( hitBits ), lane1 = __bfind( hitBits - (1 << lane0) ); + uint32_t lane0 = __bfind( hitBits ), lane1 = __bfind( hitBits - (1 << lane0) ); float dist0 = ((float*)&tmin)[lane0], dist1 = ((float*)&tmin)[lane1]; if (dist1 < dist0) { - unsigned t = lane0; lane0 = lane1; lane1 = t; + uint32_t t = lane0; lane0 = lane1; lane1 = t; float ft = dist0; dist0 = dist1; dist1 = ft; } - const unsigned triCount0 = node.triCount[lane0], triCount1 = node.triCount[lane1]; + const uint32_t triCount0 = node.triCount[lane0], triCount1 = node.triCount[lane1]; // process first lane if (triCount0 == 0) nodeIdx = node.childFirst[lane0]; else { - const unsigned first = node.childFirst[lane0]; - for (unsigned j = 0; j < triCount0; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane0]; + for (uint32_t j = 0; j < triCount0; j++) // TODO: aim for 4 prims per leaf if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; nodeIdx = 0; } @@ -3936,8 +4059,8 @@ bool BVH::IsOccluded_Afra( const Ray& ray ) const } else { - const unsigned first = node.childFirst[lane1]; - for (unsigned j = 0; j < triCount1; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane1]; + for (uint32_t j = 0; j < triCount1; j++) // 
TODO: aim for 4 prims per leaf if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; } } @@ -3955,18 +4078,18 @@ bool BVH::IsOccluded_Afra( const Ray& ray ) const // process hits float d[4] = { d0, d1, d2, d3 }; nodeIdx = 0; - for (int i = 1; i < 4; i++) + for (int32_t i = 1; i < 4; i++) { - unsigned lane = *(unsigned*)&d[i] & 3; + uint32_t lane = *(uint32_t*)&d[i] & 3; if (node.triCount[lane] == 0) { - const unsigned childIdx = node.childFirst[lane]; + const uint32_t childIdx = node.childFirst[lane]; if (nodeIdx) stack[stackPtr++] = nodeIdx; nodeIdx = childIdx; continue; } - const unsigned first = node.childFirst[lane], count = node.triCount[lane]; - for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; } } @@ -3984,19 +4107,19 @@ bool BVH::IsOccluded_Afra( const Ray& ray ) const // process hits float d[4] = { d0, d1, d2, d3 }; nodeIdx = 0; - for (int i = 0; i < 4; i++) + for (int32_t i = 0; i < 4; i++) { - unsigned lane = *(unsigned*)&d[i] & 3; + uint32_t lane = *(uint32_t*)&d[i] & 3; if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? 
if (node.triCount[lane] == 0) { - const unsigned childIdx = node.childFirst[lane]; + const uint32_t childIdx = node.childFirst[lane]; if (nodeIdx) stack[stackPtr++] = nodeIdx; nodeIdx = childIdx; continue; } - const unsigned first = node.childFirst[lane], count = node.triCount[lane]; - for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; } } @@ -4015,7 +4138,7 @@ bool BVH::IsOccluded_Afra( const Ray& ray ) const // ============================================================================ // // I M P L E M E N T A T I O N - A R M / N E O N C O D E -// +// // ============================================================================ #ifdef BVH_USENEON @@ -4044,20 +4167,34 @@ inline float halfArea( const float32x4x2_t& a /* a contains aabb itself, with mi float ex = c[4] + c[0], ey = c[5] + c[1], ez = c[6] + c[2]; return ex * ey + ey * ez + ez * ex; } + +#if defined(__ARM_FEATURE_NEON) && defined(__ARM_NEON) && __ARM_ARCH >= 85 +// Use the native vrnd32xq_f32 if NEON 8.5 is available +#else +// Custom implementation of vrnd32xq_f32 +static inline int32x4_t vrnd32xq_f32( float32x4_t a ) { + const float32x4_t half = vdupq_n_f32( 0.5f ); + uint32x4_t isNegative = vcltq_f32( a, vdupq_n_f32( 0.0f ) ); // Mask for negative numbers + float32x4_t adjustment = vbslq_f32( isNegative, vnegq_f32( half ), half ); + return vcvtq_s32_f32( vaddq_f32( a, adjustment ) ); +} +#endif + #define PROCESS_PLANE( a, pos, ANLR, lN, rN, lb, rb ) if (lN * rN != 0) { \ ANLR = halfArea( lb ) * (float)lN + halfArea( rb ) * (float)rN; \ const float C = C_TRAV + C_INT * rSAV * ANLR; if (C < splitCost) \ splitCost = C, bestAxis = a, bestPos = pos, bestLBox = lb, bestRBox = rb; } -void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) 
+void BVH::BuildNEON( const bvhvec4* vertices, const uint32_t primCount ) { - int test = BVHBINS; + FATAL_ERROR_IF( primCount == 0, "BVH::BuildNEON( .. ), primCount == 0." ); + int32_t test = BVHBINS; if (test != 8) assert( false ); // AVX builders require BVHBINS == 8. assert( ((long long)vertices & 63) == 0 ); // buffer must be cacheline-aligned // aligned data ALIGNED( 64 ) float32x4x2_t binbox[3 * BVHBINS]; // 768 bytes ALIGNED( 64 ) float32x4x2_t binboxOrig[3 * BVHBINS]; // 768 bytes - ALIGNED( 64 ) unsigned count[3][BVHBINS]{}; // 96 bytes + ALIGNED( 64 ) uint32_t count[3][BVHBINS]{}; // 96 bytes ALIGNED( 64 ) float32x4x2_t bestLBox, bestRBox; // 64 bytes // some constants static const float32x4_t max4 = vdupq_n_f32( -BVH_FAR ), half4 = vdupq_n_f32( 0.5f ); @@ -4067,34 +4204,34 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) static const float32x4x2_t signFlip8 = { signFlip4, vdupq_n_f32( 0 ) }; // TODO: Check me static const float32x4_t mask3 = vceqq_f32( SIMD_SETRVEC( 0, 0, 0, 1 ), vdupq_n_f32( 0 ) ); static const float32x4_t binmul3 = vdupq_n_f32( BVHBINS * 0.49999f ); - for (unsigned i = 0; i < 3 * BVHBINS; i++) binboxOrig[i] = max8; // binbox initialization template + for (uint32_t i = 0; i < 3 * BVHBINS; i++) binboxOrig[i] = max8; // binbox initialization template // reset node pool - const unsigned spaceNeeded = primCount * 2; + const uint32_t spaceNeeded = primCount * 2; if (allocatedBVHNodes < spaceNeeded) { AlignedFree( bvhNode ); AlignedFree( triIdx ); AlignedFree( fragment ); - triIdx = (unsigned*)AlignedAlloc( primCount * sizeof( unsigned ) ); + triIdx = (uint32_t*)AlignedAlloc( primCount * sizeof( uint32_t ) ); bvhNode = (BVHNode*)AlignedAlloc( spaceNeeded * sizeof( BVHNode ) ); allocatedBVHNodes = spaceNeeded; memset( &bvhNode[1], 0, 32 ); // avoid crash in refit. fragment = (Fragment*)AlignedAlloc( primCount * sizeof( Fragment ) ); } else FATAL_ERROR_IF( !rebuildable, "BVH::BuildNEON( .. ), bvh not rebuildable." 
); - verts = (bvhvec4*)vertices; // note: we're not copying this data; don't delete. + verts = bvhvec4slice{ vertices, primCount * 3 }; // note: we're not copying this data; don't delete. triCount = idxCount = primCount; - unsigned newNodePtr = 2; + uint32_t newNodePtr = 2; struct FragSSE { float32x4_t bmin4, bmax4; }; FragSSE* frag4 = (FragSSE*)fragment; float32x4x2_t* frag8 = (float32x4x2_t*)fragment; - const float32x4_t* verts4 = (float32x4_t*)verts; + const float32x4_t* verts4 = (float32x4_t*)vertices; // assign all triangles to the root node BVHNode& root = bvhNode[0]; root.leftFirst = 0, root.triCount = triCount; // initialize fragments and update root bounds float32x4_t rootMin = max4, rootMax = max4; - for (unsigned i = 0; i < triCount; i++) + for (uint32_t i = 0; i < triCount; i++) { const float32x4_t v1 = veorq_s32( signFlip4, vminq_f32( vminq_f32( verts4[i * 3], verts4[i * 3 + 1] ), verts4[i * 3 + 2] ) ); const float32x4_t v2 = vmaxq_f32( vmaxq_f32( verts4[i * 3], verts4[i * 3 + 1] ), verts4[i * 3 + 2] ); @@ -4103,7 +4240,7 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) rootMin = veorq_s32( rootMin, signFlip4 ); root.aabbMin = *(bvhvec3*)&rootMin, root.aabbMax = *(bvhvec3*)&rootMax; // subdivide recursively - ALIGNED( 64 ) unsigned task[128], taskCount = 0, nodeIdx = 0; + ALIGNED( 64 ) uint32_t task[128], taskCount = 0, nodeIdx = 0; const bvhvec3 minDim = (root.aabbMax - root.aabbMin) * 1e-7f; while (1) { @@ -4117,15 +4254,15 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) const float32x4_t rpd4 = vandq_s32( vdivq_f32( binmul3, d4 ), vmvnq_u32( vceqq_f32( d4, vdupq_n_f32( 0 ) ) ) ); // implementation of Section 4.1 of "Parallel Spatial Splits in Bounding Volume Hierarchies": // main loop operates on two fragments to minimize dependencies and maximize ILP. 
- unsigned fi = triIdx[node.leftFirst]; + uint32_t fi = triIdx[node.leftFirst]; memset( count, 0, sizeof( count ) ); float32x4x2_t r0, r1, r2, f = frag8[fi]; int32x4_t bi4 = vcvtq_s32_f32( vrnd32xq_f32( vsubq_f32( vmulq_f32( vsubq_f32( vsubq_f32( frag4[fi].bmax4, frag4[fi].bmin4 ), nmin4 ), rpd4 ), half4 ) ) ); memcpy( binbox, binboxOrig, sizeof( binbox ) ); - unsigned i0 = ILANE( bi4, 0 ), i1 = ILANE( bi4, 1 ), i2 = ILANE( bi4, 2 ), * ti = triIdx + node.leftFirst + 1; - for (unsigned i = 0; i < node.triCount - 1; i++) + uint32_t i0 = ILANE( bi4, 0 ), i1 = ILANE( bi4, 1 ), i2 = ILANE( bi4, 2 ), * ti = triIdx + node.leftFirst + 1; + for (uint32_t i = 0; i < node.triCount - 1; i++) { - unsigned fid = *ti++; + uint32_t fid = *ti++; const float32x4x2_t b0 = binbox[i0]; const float32x4x2_t b1 = binbox[BVHBINS + i1]; const float32x4x2_t b2 = binbox[2 * BVHBINS + i2]; @@ -4149,21 +4286,21 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) binbox[i0] = r0, binbox[BVHBINS + i1] = r1, binbox[2 * BVHBINS + i2] = r2; // calculate per-split totals float splitCost = BVH_FAR, rSAV = 1.0f / node.SurfaceArea(); - unsigned bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst; + uint32_t bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst; const float32x4x2_t* bb = binbox; - for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) + for (int32_t a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) { // hardcoded bin processing for BVHBINS == 8 assert( BVHBINS == 8 ); - const unsigned lN0 = count[a][0], rN0 = count[a][7]; + const uint32_t lN0 = count[a][0], rN0 = count[a][7]; const float32x4x2_t lb0 = bb[0], rb0 = bb[7]; - const unsigned lN1 = lN0 + count[a][1], rN1 = rN0 + count[a][6], lN2 = lN1 + count[a][2]; - const unsigned rN2 = rN1 + count[a][5], lN3 = lN2 + count[a][3], rN3 = 
rN2 + count[a][4]; + const uint32_t lN1 = lN0 + count[a][1], rN1 = rN0 + count[a][6], lN2 = lN1 + count[a][2]; + const uint32_t rN2 = rN1 + count[a][5], lN3 = lN2 + count[a][3], rN3 = rN2 + count[a][4]; const float32x4x2_t lb1 = vmaxq_f32x2( lb0, bb[1] ), rb1 = vmaxq_f32x2( rb0, bb[6] ); const float32x4x2_t lb2 = vmaxq_f32x2( lb1, bb[2] ), rb2 = vmaxq_f32x2( rb1, bb[5] ); const float32x4x2_t lb3 = vmaxq_f32x2( lb2, bb[3] ), rb3 = vmaxq_f32x2( rb2, bb[4] ); - const unsigned lN4 = lN3 + count[a][4], rN4 = rN3 + count[a][3], lN5 = lN4 + count[a][5]; - const unsigned rN5 = rN4 + count[a][2], lN6 = lN5 + count[a][6], rN6 = rN5 + count[a][1]; + const uint32_t lN4 = lN3 + count[a][4], rN4 = rN3 + count[a][3], lN5 = lN4 + count[a][5]; + const uint32_t rN5 = rN4 + count[a][2], lN6 = lN5 + count[a][6], rN6 = rN5 + count[a][1]; const float32x4x2_t lb4 = vmaxq_f32x2( lb3, bb[4] ), rb4 = vmaxq_f32x2( rb3, bb[3] ); const float32x4x2_t lb5 = vmaxq_f32x2( lb4, bb[5] ), rb5 = vmaxq_f32x2( rb4, bb[2] ); const float32x4x2_t lb6 = vmaxq_f32x2( lb5, bb[6] ), rb6 = vmaxq_f32x2( rb5, bb[1] ); @@ -4179,14 +4316,14 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) if (splitCost >= noSplitCost) break; // not splitting is better. 
// in-place partition const float rpd = (*(bvhvec3*)&rpd4)[bestAxis], nmin = (*(bvhvec3*)&nmin4)[bestAxis]; - unsigned t, fr = triIdx[src]; - for (unsigned i = 0; i < node.triCount; i++) + uint32_t t, fr = triIdx[src]; + for (uint32_t i = 0; i < node.triCount; i++) { - const unsigned bi = (unsigned)((fragment[fr].bmax[bestAxis] - fragment[fr].bmin[bestAxis] - nmin) * rpd); + const uint32_t bi = (uint32_t)((fragment[fr].bmax[bestAxis] - fragment[fr].bmin[bestAxis] - nmin) * rpd); if (bi <= bestPos) fr = triIdx[++src]; else t = fr, fr = triIdx[src] = triIdx[--j], triIdx[j] = t; } // create child nodes and recurse - const unsigned leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; + const uint32_t leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; if (leftCount == 0 || rightCount == 0) break; // should not happen. (*(float32x4x2_t*)&bvhNode[n]).val[0] = veorq_s32( bestLBox.val[0], signFlip8.val[0] ); (*(float32x4x2_t*)&bvhNode[n]).val[1] = veorq_s32( bestLBox.val[1], signFlip8.val[1] ); @@ -4208,10 +4345,10 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) } // Traverse the second alternative BVH layout (ALT_SOA). 
-int BVH::Intersect_AltSoA( Ray& ray ) const +int32_t BVH::Intersect_AltSoA( Ray& ray ) const { BVHNodeAlt2* node = &alt2Node[0], * stack[64]; - unsigned stackPtr = 0, steps = 0; + uint32_t stackPtr = 0, steps = 0; const float32x4_t Ox4 = vdupq_n_f32( ray.O.x ), rDx4 = vdupq_n_f32( ray.rD.x ); const float32x4_t Oy4 = vdupq_n_f32( ray.O.y ), rDy4 = vdupq_n_f32( ray.rD.y ); const float32x4_t Oz4 = vdupq_n_f32( ray.O.z ), rDz4 = vdupq_n_f32( ray.rD.z ); @@ -4221,9 +4358,9 @@ int BVH::Intersect_AltSoA( Ray& ray ) const steps++; if (node->isLeaf()) { - for (unsigned i = 0; i < node->triCount; i++) + for (uint32_t i = 0; i < node->triCount; i++) { - const unsigned tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3; + const uint32_t tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3; const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; const bvhvec3 h = cross( ray.D, edge2 ); @@ -4262,7 +4399,7 @@ int BVH::Intersect_AltSoA( Ray& ray ) const x4 = vcombine_f32( vget_low_f32( t0 ), vget_low_f32( t2 ) ); y4 = vcombine_f32( vget_high_f32( t0 ), vget_high_f32( t2 ) ); z4 = vcombine_f32( vget_low_f32( t1 ), vget_low_f32( t3 ) ); - unsigned lidx = node->left, ridx = node->right; + uint32_t lidx = node->left, ridx = node->right; const float32x4_t min4 = vmaxq_f32( vmaxq_f32( vmaxq_f32( x4, y4 ), z4 ), vdupq_n_f32( 0 ) ); const float32x4_t max4 = vminq_f32( vminq_f32( vminq_f32( x4, y4 ), z4 ), vdupq_n_f32( ray.hit.t ) ); #if 0 @@ -4282,7 +4419,7 @@ int BVH::Intersect_AltSoA( Ray& ray ) const if (dist1 > dist2) { float t = dist1; dist1 = dist2; dist2 = t; - unsigned i = lidx; lidx = ridx; ridx = i; + uint32_t i = lidx; lidx = ridx; ridx = i; } if (dist1 == BVH_FAR) { @@ -4297,21 +4434,356 @@ int BVH::Intersect_AltSoA( Ray& ray ) const return steps; } +// Traverse a 4-way BVH stored in 'Atilla Áfra' layout. 
+inline void IntersectCompactTri( Ray& r, float32x4_t& t4, const float* T ) +{ + const float transS = T[8] * r.O.x + T[9] * r.O.y + T[10] * r.O.z + T[11]; + const float transD = T[8] * r.D.x + T[9] * r.D.y + T[10] * r.D.z; + const float ta = -transS / transD; + if (ta <= 0 || ta >= r.hit.t) return; + const bvhvec3 wr = r.O + ta * r.D; + const float u = T[0] * wr.x + T[1] * wr.y + T[2] * wr.z + T[3]; + const float v = T[4] * wr.x + T[5] * wr.y + T[6] * wr.z + T[7]; + const bool hit = u >= 0 && v >= 0 && u + v < 1; + if (hit) r.hit = { ta, u, v, *(uint32_t*)&T[15] }, t4 = vdupq_n_f32( ta ); +} + +inline int32_t ARMVecMovemask( uint32x4_t v ) { + const int32_t shiftArr[4] = { 0, 1, 2, 3 }; + int32x4_t shift = vld1q_s32( shiftArr ); + return vaddvq_u32( vshlq_u32( vshrq_n_u32( v, 31 ), shift ) ); +} + +int32_t BVH::Intersect_Afra( Ray& ray ) const +{ + uint32_t nodeIdx = 0, stack[1024], stackPtr = 0, steps = 0; + const float32x4_t ox4 = vdupq_n_f32( ray.O.x ), rdx4 = vdupq_n_f32( ray.rD.x ); + const float32x4_t oy4 = vdupq_n_f32( ray.O.y ), rdy4 = vdupq_n_f32( ray.rD.y ); + const float32x4_t oz4 = vdupq_n_f32( ray.O.z ), rdz4 = vdupq_n_f32( ray.rD.z ); + float32x4_t t4 = vdupq_n_f32( ray.hit.t ), zero4 = vdupq_n_f32( 0.0f ); + const uint32x4_t idx4 = SIMD_SETRVECU( 0, 1, 2, 3 ); + const uint32x4_t idxMask = vdupq_n_u32( 0xfffffffc ); + const float32x4_t inf4 = vdupq_n_f32( BVH_FAR ); + while (1) + { + steps++; + const BVHNode4Alt2& node = bvh4Alt2[nodeIdx]; + // intersect the ray with four AABBs + const float32x4_t xmin4 = node.xmin4, xmax4 = node.xmax4; + const float32x4_t ymin4 = node.ymin4, ymax4 = node.ymax4; + const float32x4_t zmin4 = node.zmin4, zmax4 = node.zmax4; + const float32x4_t x0 = vsubq_f32( xmin4, ox4 ), x1 = vsubq_f32( xmax4, ox4 ); + const float32x4_t y0 = vsubq_f32( ymin4, oy4 ), y1 = vsubq_f32( ymax4, oy4 ); + const float32x4_t z0 = vsubq_f32( zmin4, oz4 ), z1 = vsubq_f32( zmax4, oz4 ); + const float32x4_t tx1 = vmulq_f32( x0, rdx4 ), tx2 = 
vmulq_f32( x1, rdx4 ); + const float32x4_t ty1 = vmulq_f32( y0, rdy4 ), ty2 = vmulq_f32( y1, rdy4 ); + const float32x4_t tz1 = vmulq_f32( z0, rdz4 ), tz2 = vmulq_f32( z1, rdz4 ); + const float32x4_t txmin = vminq_f32( tx1, tx2 ), tymin = vminq_f32( ty1, ty2 ), tzmin = vminq_f32( tz1, tz2 ); + const float32x4_t txmax = vmaxq_f32( tx1, tx2 ), tymax = vmaxq_f32( ty1, ty2 ), tzmax = vmaxq_f32( tz1, tz2 ); + const float32x4_t tmin = vmaxq_f32( vmaxq_f32( txmin, tymin ), tzmin ); + const float32x4_t tmax = vminq_f32( vminq_f32( txmax, tymax ), tzmax ); + + uint32x4_t hit = vandq_u32( vandq_u32( vcgeq_f32( tmax, tmin ), vcltq_f32( tmin, t4 ) ), vcgeq_f32( tmax, zero4 ) ); + int32_t hitBits = ARMVecMovemask( hit ), hits = vcnt_s8( vreinterpret_s8_s32( vcreate_u32( hitBits ) ) )[0]; + + if (hits == 1 /* 43% */) + { + // just one node was hit - no sorting needed. + const uint32_t lane = __bfind( hitBits ), count = node.triCount[lane]; + if (count == 0) nodeIdx = node.childFirst[lane]; else + { + const uint32_t first = node.childFirst[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); + if (stackPtr == 0) break; + nodeIdx = stack[--stackPtr]; + } + continue; + } + if (hits == 0 /* 29% */) + { + if (stackPtr == 0) break; + nodeIdx = stack[--stackPtr]; + continue; + } + if (hits == 2 /* 16% */) + { + // two nodes hit + uint32_t lane0 = __bfind( hitBits ), lane1 = __bfind( hitBits - (1 << lane0) ); + float dist0 = ((float*)&tmin)[lane0], dist1 = ((float*)&tmin)[lane1]; + if (dist1 < dist0) + { + uint32_t t = lane0; lane0 = lane1; lane1 = t; + float ft = dist0; dist0 = dist1; dist1 = ft; + } + const uint32_t triCount0 = node.triCount[lane0], triCount1 = node.triCount[lane1]; + // process first lane + if (triCount0 == 0) nodeIdx = node.childFirst[lane0]; else + { + const uint32_t first = node.childFirst[lane0]; + for (uint32_t j = 0; j < triCount0; j++) // TODO: aim for 4 prims per 
leaf + IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); + nodeIdx = 0; + } + // process second lane + if (triCount1 == 0) + { + if (nodeIdx) stack[stackPtr++] = nodeIdx; + nodeIdx = node.childFirst[lane1]; + } + else + { + const uint32_t first = node.childFirst[lane1]; + for (uint32_t j = 0; j < triCount1; j++) // TODO: aim for 4 prims per leaf + IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); + } + } + else if (hits == 3 /* 8% */) + { + // blend in lane indices + float32x4_t tm = vreinterpretq_f32_u32( vorrq_u32( vandq_u32( vreinterpretq_u32_f32( vbslq_f32( hit, tmin, inf4 ) ), idxMask ), idx4 ) ); + + // sort + float tmp, d0 = tm[0], d1 = tm[1], d2 = tm[2], d3 = tm[3]; + if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp; + if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp; + if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp; + if (d2 < d3) tmp = d2, d2 = d3, d3 = tmp; + if (d1 < d2) tmp = d1, d1 = d2, d2 = tmp; + // process hits + float d[4] = { d0, d1, d2, d3 }; + nodeIdx = 0; + for (int32_t i = 1; i < 4; i++) + { + uint32_t lane = *(uint32_t*)&d[i] & 3; + if (node.triCount[lane] == 0) + { + const uint32_t childIdx = node.childFirst[lane]; + if (nodeIdx) stack[stackPtr++] = nodeIdx; + nodeIdx = childIdx; + continue; + } + const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); + } + } + else /* hits == 4, 2%: rare */ + { + // blend in lane indices + float32x4_t tm = vreinterpretq_f32_u32( vorrq_u32( vandq_u32( vreinterpretq_u32_f32( vbslq_f32( hit, tmin, inf4 ) ), idxMask ), idx4 ) ); + // sort + float tmp, d0 = tm[0], d1 = tm[1], d2 = tm[2], d3 = tm[3]; + if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp; + if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp; + if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp; + if (d2 < d3) tmp = d2, d2 = d3, d3 = tmp; + if (d1 < d2) tmp = d1, d1 = d2, d2 = tmp; + // process hits + 
float d[4] = { d0, d1, d2, d3 }; + nodeIdx = 0; + for (int32_t i = 0; i < 4; i++) + { + uint32_t lane = *(uint32_t*)&d[i] & 3; + if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? + if (node.triCount[lane] == 0) + { + const uint32_t childIdx = node.childFirst[lane]; + if (nodeIdx) stack[stackPtr++] = nodeIdx; + nodeIdx = childIdx; + continue; + } + const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); + } + } + // get next task + if (nodeIdx) continue; + if (stackPtr == 0) break; else nodeIdx = stack[--stackPtr]; + } + return steps; +} + +// Find occlusions in a 4-way BVH stored in 'Atilla Áfra' layout. +inline bool OccludedCompactTri( const Ray& r, const float* T ) +{ + const float transS = T[8] * r.O.x + T[9] * r.O.y + T[10] * r.O.z + T[11]; + const float transD = T[8] * r.D.x + T[9] * r.D.y + T[10] * r.D.z; + const float ta = -transS / transD; + if (ta <= 0 || ta >= r.hit.t) return false; + const bvhvec3 wr = r.O + ta * r.D; + const float u = T[0] * wr.x + T[1] * wr.y + T[2] * wr.z + T[3]; + const float v = T[4] * wr.x + T[5] * wr.y + T[6] * wr.z + T[7]; + return u >= 0 && v >= 0 && u + v < 1; +} + +bool BVH::IsOccluded_Afra( const Ray& ray ) const +{ + uint32_t nodeIdx = 0, stack[1024], stackPtr = 0; + const float32x4_t ox4 = vdupq_n_f32( ray.O.x ), rdx4 = vdupq_n_f32( ray.rD.x ); + const float32x4_t oy4 = vdupq_n_f32( ray.O.y ), rdy4 = vdupq_n_f32( ray.rD.y ); + const float32x4_t oz4 = vdupq_n_f32( ray.O.z ), rdz4 = vdupq_n_f32( ray.rD.z ); + float32x4_t t4 = vdupq_n_f32( ray.hit.t ), zero4 = vdupq_n_f32( 0.0f ); + const uint32x4_t idx4 = SIMD_SETRVECU( 0, 1, 2, 3 ); + const uint32x4_t idxMask = vdupq_n_u32( 0xfffffffc ); + const float32x4_t inf4 = vdupq_n_f32( BVH_FAR ); + + while (1) + { + const BVHNode4Alt2& node = bvh4Alt2[nodeIdx]; + // intersect the ray 
with four AABBs + const float32x4_t xmin4 = node.xmin4, xmax4 = node.xmax4; + const float32x4_t ymin4 = node.ymin4, ymax4 = node.ymax4; + const float32x4_t zmin4 = node.zmin4, zmax4 = node.zmax4; + const float32x4_t x0 = vsubq_f32( xmin4, ox4 ), x1 = vsubq_f32( xmax4, ox4 ); + const float32x4_t y0 = vsubq_f32( ymin4, oy4 ), y1 = vsubq_f32( ymax4, oy4 ); + const float32x4_t z0 = vsubq_f32( zmin4, oz4 ), z1 = vsubq_f32( zmax4, oz4 ); + const float32x4_t tx1 = vmulq_f32( x0, rdx4 ), tx2 = vmulq_f32( x1, rdx4 ); + const float32x4_t ty1 = vmulq_f32( y0, rdy4 ), ty2 = vmulq_f32( y1, rdy4 ); + const float32x4_t tz1 = vmulq_f32( z0, rdz4 ), tz2 = vmulq_f32( z1, rdz4 ); + const float32x4_t txmin = vminq_f32( tx1, tx2 ), tymin = vminq_f32( ty1, ty2 ), tzmin = vminq_f32( tz1, tz2 ); + const float32x4_t txmax = vmaxq_f32( tx1, tx2 ), tymax = vmaxq_f32( ty1, ty2 ), tzmax = vmaxq_f32( tz1, tz2 ); + const float32x4_t tmin = vmaxq_f32( vmaxq_f32( txmin, tymin ), tzmin ); + const float32x4_t tmax = vminq_f32( vminq_f32( txmax, tymax ), tzmax ); + + uint32x4_t hit = vandq_u32( vandq_u32( vcgeq_f32( tmax, tmin ), vcltq_f32( tmin, t4 ) ), vcgeq_f32( tmax, zero4 ) ); + int32_t hitBits = ARMVecMovemask( hit ), hits = vcnt_s8( vreinterpret_s8_s32( vcreate_u32( hitBits ) ) )[0]; + + if (hits == 1 /* 43% */) + { + // just one node was hit - no sorting needed. 
+ const uint32_t lane = __bfind( hitBits ), count = node.triCount[lane]; + if (count == 0) nodeIdx = node.childFirst[lane]; else + { + const uint32_t first = node.childFirst[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; + if (stackPtr == 0) break; + nodeIdx = stack[--stackPtr]; + } + continue; + } + if (hits == 0 /* 29% */) + { + if (stackPtr == 0) break; + nodeIdx = stack[--stackPtr]; + continue; + } + if (hits == 2 /* 16% */) + { + // two nodes hit + uint32_t lane0 = __bfind( hitBits ), lane1 = __bfind( hitBits - (1 << lane0) ); + float dist0 = ((float*)&tmin)[lane0], dist1 = ((float*)&tmin)[lane1]; + if (dist1 < dist0) + { + uint32_t t = lane0; lane0 = lane1; lane1 = t; + float ft = dist0; dist0 = dist1; dist1 = ft; + } + const uint32_t triCount0 = node.triCount[lane0], triCount1 = node.triCount[lane1]; + // process first lane + if (triCount0 == 0) nodeIdx = node.childFirst[lane0]; else + { + const uint32_t first = node.childFirst[lane0]; + for (uint32_t j = 0; j < triCount0; j++) // TODO: aim for 4 prims per leaf + if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; + nodeIdx = 0; + } + // process second lane + if (triCount1 == 0) + { + if (nodeIdx) stack[stackPtr++] = nodeIdx; + nodeIdx = node.childFirst[lane1]; + } + else + { + const uint32_t first = node.childFirst[lane1]; + for (uint32_t j = 0; j < triCount1; j++) // TODO: aim for 4 prims per leaf + if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; + } + } + else if (hits == 3 /* 8% */) + { + // blend in lane indices + float32x4_t tm = vreinterpretq_f32_u32( vorrq_u32( vandq_u32( vreinterpretq_u32_f32( vbslq_f32( hit, tmin, inf4 ) ), idxMask ), idx4 ) ); + // sort + float tmp, d0 = tm[0], d1 = tm[1], d2 = tm[2], d3 = tm[3]; + if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp; + if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp; + if (d0 < d1) tmp = d0, 
d0 = d1, d1 = tmp; + if (d2 < d3) tmp = d2, d2 = d3, d3 = tmp; + if (d1 < d2) tmp = d1, d1 = d2, d2 = tmp; + // process hits + float d[4] = { d0, d1, d2, d3 }; + nodeIdx = 0; + for (int32_t i = 1; i < 4; i++) + { + uint32_t lane = *(uint32_t*)&d[i] & 3; + if (node.triCount[lane] == 0) + { + const uint32_t childIdx = node.childFirst[lane]; + if (nodeIdx) stack[stackPtr++] = nodeIdx; + nodeIdx = childIdx; + continue; + } + const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; + } + } + else /* hits == 4, 2%: rare */ + { + // blend in lane indices + float32x4_t tm = vreinterpretq_f32_u32( vorrq_u32( vandq_u32( vreinterpretq_u32_f32( vbslq_f32( hit, tmin, inf4 ) ), idxMask ), idx4 ) ); + // sort + float tmp, d0 = tm[0], d1 = tm[1], d2 = tm[2], d3 = tm[3]; + if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp; + if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp; + if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp; + if (d2 < d3) tmp = d2, d2 = d3, d3 = tmp; + if (d1 < d2) tmp = d1, d1 = d2, d2 = tmp; + // process hits + float d[4] = { d0, d1, d2, d3 }; + nodeIdx = 0; + for (int32_t i = 0; i < 4; i++) + { + uint32_t lane = *(uint32_t*)&d[i] & 3; + if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? 
+ if (node.triCount[lane] == 0) + { + const uint32_t childIdx = node.childFirst[lane]; + if (nodeIdx) stack[stackPtr++] = nodeIdx; + nodeIdx = childIdx; + continue; + } + const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; + for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; + } + } + // get next task + if (nodeIdx) continue; + if (stackPtr == 0) break; else nodeIdx = stack[--stackPtr]; + } + return false; +} + #endif // BVH_USENEON // ============================================================================ // // D O U B L E P R E C I S I O N S U P P O R T -// +// // ============================================================================ #ifdef DOUBLE_PRECISION_SUPPORT -// Basic single-function binned-SAH-builder, double-precision version. -void BVH::BuildEx( const bvhdbl3* vertices, const unsigned primCount ) +// Basic single-function binned-SAH-builder, double-precision version. +void BVH::BuildEx( const bvhdbl3* vertices, const uint32_t primCount ) { // allocate on first build - const unsigned spaceNeeded = primCount * 2; // upper limit + FATAL_ERROR_IF( primCount == 0, "BVH::BuildEx( .. ), primCount == 0." ); + const uint32_t spaceNeeded = primCount * 2; // upper limit if (allocatedBVHExNodes < spaceNeeded) { AlignedFree( bvhNodeEx ); @@ -4319,14 +4791,14 @@ void BVH::BuildEx( const bvhdbl3* vertices, const unsigned primCount ) AlignedFree( fragEx ); bvhNodeEx = (BVHNodeEx*)AlignedAlloc( spaceNeeded * sizeof( BVHNodeEx ) ); allocatedBVHExNodes = spaceNeeded; - triIdxEx = (unsigned long long int*)AlignedAlloc( primCount * sizeof( unsigned long long int ) ); + triIdxEx = (uint64_t*)AlignedAlloc( primCount * sizeof( uint64_t ) ); fragEx = (FragmentEx*)AlignedAlloc( primCount * sizeof( FragmentEx ) ); } else FATAL_ERROR_IF( !rebuildable, "BVH::BuildEx( .. ), bvh not rebuildable." 
); vertsEx = (bvhdbl3*)vertices; // note: we're not copying this data; don't delete. idxCount = triCount = primCount; // reset node pool - unsigned newNodePtr = 2; + uint32_t newNodePtr = 2; // assign all triangles to the root node BVHNodeEx& root = bvhNodeEx[0]; root.leftFirst = 0, root.triCount = triCount, root.aabbMin = bvhdbl3( BVH_DBL_FAR ), root.aabbMax = bvhdbl3( -BVH_DBL_FAR ); @@ -4334,7 +4806,7 @@ void BVH::BuildEx( const bvhdbl3* vertices, const unsigned primCount ) if (vertsEx) { // building a BVH over triangles specified as three 16-byte vertices each. - for (unsigned i = 0; i < triCount; i++) + for (uint32_t i = 0; i < triCount; i++) { fragEx[i].bmin = tinybvh_min( tinybvh_min( vertsEx[i * 3], vertsEx[i * 3 + 1] ), vertsEx[i * 3 + 2] ); fragEx[i].bmax = tinybvh_max( tinybvh_max( vertsEx[i * 3], vertsEx[i * 3 + 1] ), vertsEx[i * 3 + 2] ); @@ -4345,15 +4817,16 @@ void BVH::BuildEx( const bvhdbl3* vertices, const unsigned primCount ) else { // we are building the BVH over aabbs we received from ::BuildEx( tinyaabb* ): vertices == 0. - for (unsigned i = 0; i < triCount; i++) + for (uint32_t i = 0; i < triCount; i++) { root.aabbMin = tinybvh_min( root.aabbMin, fragEx[i].bmin ); root.aabbMax = tinybvh_max( root.aabbMax, fragEx[i].bmax ), triIdxEx[i] = i; // here: aabb index. 
} } // subdivide recursively - unsigned task[256], taskCount = 0, nodeIdx = 0; - bvhdbl3 minDim = (root.aabbMax - root.aabbMin) * 1e-20, bestLMin = 0, bestLMax = 0, bestRMin = 0, bestRMax = 0; + uint32_t task[256], taskCount = 0, nodeIdx = 0; + bvhdbl3 minDim = (root.aabbMax - root.aabbMin) * 1e-20; + bvhdbl3 bestLMin = 0, bestLMax = 0, bestRMin = 0, bestRMax = 0; while (1) { while (1) @@ -4361,15 +4834,15 @@ void BVH::BuildEx( const bvhdbl3* vertices, const unsigned primCount ) BVHNodeEx& node = bvhNodeEx[nodeIdx]; // find optimal object split bvhdbl3 binMin[3][BVHBINS], binMax[3][BVHBINS]; - for (unsigned a = 0; a < 3; a++) for (unsigned i = 0; i < BVHBINS; i++) binMin[a][i] = BVH_DBL_FAR, binMax[a][i] = -BVH_DBL_FAR; - unsigned count[3][BVHBINS]; - memset( count, 0, BVHBINS * 3 * sizeof( unsigned ) ); + for (uint32_t a = 0; a < 3; a++) for (uint32_t i = 0; i < BVHBINS; i++) binMin[a][i] = BVH_DBL_FAR, binMax[a][i] = -BVH_DBL_FAR; + uint32_t count[3][BVHBINS]; + memset( count, 0, BVHBINS * 3 * sizeof( uint32_t ) ); const bvhdbl3 rpd3 = bvhdbl3( BVHBINS / (node.aabbMax - node.aabbMin) ), nmin3 = node.aabbMin; - for (unsigned i = 0; i < node.triCount; i++) // process all tris for x,y and z at once + for (uint32_t i = 0; i < node.triCount; i++) // process all tris for x,y and z at once { - const unsigned long long int fi = triIdxEx[node.leftFirst + i]; + const uint64_t fi = triIdxEx[node.leftFirst + i]; const bvhdbl3 fbi = ((fragEx[fi].bmin + fragEx[fi].bmax) * 0.5 - nmin3) * rpd3; - bvhint3 bi( (int)fbi.x, (int)fbi.y, (int)fbi.z ); + bvhint3 bi( (int32_t)fbi.x, (int32_t)fbi.y, (int32_t)fbi.z ); bi.x = tinybvh_clamp( bi.x, 0, BVHBINS - 1 ); bi.y = tinybvh_clamp( bi.y, 0, BVHBINS - 1 ); bi.z = tinybvh_clamp( bi.z, 0, BVHBINS - 1 ); @@ -4382,13 +4855,13 @@ void BVH::BuildEx( const bvhdbl3* vertices, const unsigned primCount ) } // calculate per-split totals double splitCost = BVH_DBL_FAR, rSAV = 1.0 / node.SurfaceArea(); - unsigned bestAxis = 0, bestPos = 0; - for 
(int a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim[a]) + uint32_t bestAxis = 0, bestPos = 0; + for (int32_t a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim[a]) { bvhdbl3 lBMin[BVHBINS - 1], rBMin[BVHBINS - 1], l1 = BVH_DBL_FAR, l2 = -BVH_DBL_FAR; bvhdbl3 lBMax[BVHBINS - 1], rBMax[BVHBINS - 1], r1 = BVH_DBL_FAR, r2 = -BVH_DBL_FAR; double ANL[BVHBINS - 1], ANR[BVHBINS - 1]; - for (unsigned lN = 0, rN = 0, i = 0; i < BVHBINS - 1; i++) + for (uint32_t lN = 0, rN = 0, i = 0; i < BVHBINS - 1; i++) { lBMin[i] = l1 = tinybvh_min( l1, binMin[a][i] ); rBMin[BVHBINS - 2 - i] = r1 = tinybvh_min( r1, binMin[a][BVHBINS - 1 - i] ); @@ -4399,7 +4872,7 @@ void BVH::BuildEx( const bvhdbl3* vertices, const unsigned primCount ) ANR[BVHBINS - 2 - i] = rN == 0 ? BVH_DBL_FAR : ((r2 - r1).halfArea() * (double)rN); } // evaluate bin totals to find best position for object split - for (unsigned i = 0; i < BVHBINS - 1; i++) + for (uint32_t i = 0; i < BVHBINS - 1; i++) { const double C = C_TRAV + rSAV * C_INT * (ANL[i] + ANR[i]); if (C < splitCost) @@ -4412,19 +4885,19 @@ void BVH::BuildEx( const bvhdbl3* vertices, const unsigned primCount ) double noSplitCost = (double)node.triCount * C_INT; if (splitCost >= noSplitCost) break; // not splitting is better. 
// in-place partition - unsigned long long int j = node.leftFirst + node.triCount, src = node.leftFirst; + uint64_t j = node.leftFirst + node.triCount, src = node.leftFirst; const double rpd = rpd3.cell[bestAxis], nmin = nmin3.cell[bestAxis]; - for (unsigned long long int i = 0; i < node.triCount; i++) + for (uint64_t i = 0; i < node.triCount; i++) { - const unsigned long long int fi = triIdxEx[src]; - int bi = (unsigned)(((fragEx[fi].bmin[bestAxis] + fragEx[fi].bmax[bestAxis]) * 0.5 - nmin) * rpd); + const uint64_t fi = triIdxEx[src]; + int32_t bi = (uint32_t)(((fragEx[fi].bmin[bestAxis] + fragEx[fi].bmax[bestAxis]) * 0.5 - nmin) * rpd); bi = tinybvh_clamp( bi, 0, BVHBINS - 1 ); - if ((unsigned)bi <= bestPos) src++; else tinybvh_swap( triIdxEx[src], triIdxEx[--j] ); + if ((uint32_t)bi <= bestPos) src++; else tinybvh_swap( triIdxEx[src], triIdxEx[--j] ); } // create child nodes - unsigned long long int leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; + uint64_t leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; if (leftCount == 0 || rightCount == 0) break; // should not happen. - const int lci = newNodePtr++, rci = newNodePtr++; + const int32_t lci = newNodePtr++, rci = newNodePtr++; bvhNodeEx[lci].aabbMin = bestLMin, bvhNodeEx[lci].aabbMax = bestLMax; bvhNodeEx[lci].leftFirst = node.leftFirst, bvhNodeEx[lci].triCount = leftCount; bvhNodeEx[rci].aabbMin = bestRMin, bvhNodeEx[rci].aabbMax = bestRMax; @@ -4450,7 +4923,7 @@ double BVH::BVHNodeEx::SurfaceArea() const return e.x * e.y + e.y * e.z + e.z * e.x; } -int BVH::IntersectEx( RayEx& ray, const BVHLayout layout ) const +int32_t BVH::IntersectEx( RayEx& ray, const BVHLayout layout ) const { switch (layout) { @@ -4461,23 +4934,23 @@ int BVH::IntersectEx( RayEx& ray, const BVHLayout layout ) const FATAL_ERROR_IF( true, "BVH::IntersectEx( .. , ? ), unsupported double-precision bvh layout." 
); break; } - return 0; + return 0; } // Traverse the default BVH layout, double-precision. -int BVH::IntersectEx_WaldDouble( RayEx& ray ) const +int32_t BVH::IntersectEx_WaldDouble( RayEx& ray ) const { BVHNodeEx* node = &bvhNodeEx[0], * stack[64]; - unsigned stackPtr = 0, steps = 0; + uint32_t stackPtr = 0, steps = 0; while (1) { steps++; if (node->isLeaf()) { - for (unsigned i = 0; i < node->triCount; i++) + for (uint32_t i = 0; i < node->triCount; i++) { - const unsigned long long int idx = triIdxEx[node->leftFirst + i]; - const unsigned long long int vertIdx = idx * 3; + const uint64_t idx = triIdxEx[node->leftFirst + i]; + const uint64_t vertIdx = idx * 3; const bvhdbl3 edge1 = vertsEx[vertIdx + 1] - vertsEx[vertIdx]; const bvhdbl3 edge2 = vertsEx[vertIdx + 2] - vertsEx[vertIdx]; const bvhdbl3 h = cross( ray.D, edge2 ); @@ -4537,7 +5010,7 @@ double BVH::BVHNodeEx::Intersect( const RayEx& ray ) const // ============================================================================ // // H E L P E R S -// +// // ============================================================================ // TransformPoint @@ -4560,10 +5033,10 @@ bvhvec3 BVH::BLASInstance::TransformVector( const bvhvec3& v ) const } // IntersectTri -void BVH::IntersectTri( Ray& ray, const unsigned idx ) const +void BVH::IntersectTri( Ray& ray, const uint32_t idx ) const { // Moeller-Trumbore ray/triangle intersection algorithm - const unsigned vertIdx = idx * 3; + const uint32_t vertIdx = idx * 3; const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; const bvhvec3 h = cross( ray.D, edge2 ); @@ -4600,10 +5073,10 @@ float BVH::IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& } // PrecomputeTriangle (helper), transforms a triangle to the format used in: -// Fast Ray-Triangle Intersections by Coordinate Transformation. Baldwin & Weber, 2016. 
-void BVH::PrecomputeTriangle( const bvhvec4* const vert, float* T ) +// Fast Ray-Triangle Intersections by Coordinate Transformation. Baldwin & Weber, 2016. +void BVH::PrecomputeTriangle( const bvhvec4slice& vert, uint32_t triIndex, float* T ) { - bvhvec3 v0 = vert[0], v1 = vert[1], v2 = vert[2]; + bvhvec3 v0 = vert[triIndex], v1 = vert[triIndex + 1], v2 = vert[triIndex + 2]; bvhvec3 e1 = v1 - v0, e2 = v2 - v0, N = cross( e1, e2 ); float x1, x2, n = dot( v0, N ), rN; if (fabs( N[0] ) > fabs( N[1] ) && fabs( N[0] ) > fabs( N[2] )) @@ -4639,16 +5112,16 @@ bool BVH::ClipFrag( const Fragment& orig, Fragment& newFrag, bvhvec3 bmin, bvhve bmax = tinybvh_min( bmax, orig.bmax ); const bvhvec3 extent = bmax - bmin; // Sutherland-Hodgeman against six bounding planes - unsigned Nin = 3, vidx = orig.primIdx * 3; + uint32_t Nin = 3, vidx = orig.primIdx * 3; bvhvec3 vin[10] = { verts[vidx], verts[vidx + 1], verts[vidx + 2] }, vout[10]; - for (unsigned a = 0; a < 3; a++) + for (uint32_t a = 0; a < 3; a++) { const float eps = minDim.cell[a]; if (extent.cell[a] > eps) { - unsigned Nout = 0; + uint32_t Nout = 0; const float l = bmin[a], r = bmax[a]; - for (unsigned v = 0; v < Nin; v++) + for (uint32_t v = 0; v < Nin; v++) { bvhvec3 v0 = vin[v], v1 = vin[(v + 1) % Nin]; const bool v0in = v0[a] >= l - eps, v1in = v1[a] >= l - eps; @@ -4660,7 +5133,7 @@ bool BVH::ClipFrag( const Fragment& orig, Fragment& newFrag, bvhvec3 bmin, bvhve if (v1in) vout[Nout++] = v1; } Nin = 0; - for (unsigned v = 0; v < Nout; v++) + for (uint32_t v = 0; v < Nout; v++) { bvhvec3 v0 = vout[v], v1 = vout[(v + 1) % Nout]; const bool v0in = v0[a] <= r + eps, v1in = v1[a] <= r + eps; @@ -4674,7 +5147,7 @@ bool BVH::ClipFrag( const Fragment& orig, Fragment& newFrag, bvhvec3 bmin, bvhve } } bvhvec3 mn( BVH_FAR ), mx( -BVH_FAR ); - for (unsigned i = 0; i < Nin; i++) mn = tinybvh_min( mn, vin[i] ), mx = tinybvh_max( mx, vin[i] ); + for (uint32_t i = 0; i < Nin; i++) mn = tinybvh_min( mn, vin[i] ), mx = tinybvh_max( 
mx, vin[i] ); newFrag.primIdx = orig.primIdx; newFrag.bmin = tinybvh_max( mn, bmin ), newFrag.bmax = tinybvh_min( mx, bmax ); newFrag.clipped = 1; @@ -4682,7 +5155,7 @@ bool BVH::ClipFrag( const Fragment& orig, Fragment& newFrag, bvhvec3 bmin, bvhve } // RefitUpVerbose: Update bounding boxes of ancestors of the specified node. -void BVH::RefitUpVerbose( unsigned nodeIdx ) +void BVH::RefitUpVerbose( uint32_t nodeIdx ) { while (nodeIdx != 0xffffffff) { @@ -4697,21 +5170,21 @@ void BVH::RefitUpVerbose( unsigned nodeIdx ) // FindBestNewPosition // Part of "Fast Insertion-Based Optimization of Bounding Volume Hierarchies" -unsigned BVH::FindBestNewPosition( const unsigned Lid ) +uint32_t BVH::FindBestNewPosition( const uint32_t Lid ) { const BVHNodeVerbose& L = verbose[Lid]; const float SA_L = SA( L.aabbMin, L.aabbMax ); // reinsert L into BVH - unsigned taskNode[512], tasks = 1, Xbest = 0; + uint32_t taskNode[512], tasks = 1, Xbest = 0; float taskCi[512], taskInvCi[512], Cbest = BVH_FAR, epsilon = 1e-10f; taskNode[0] = 0 /* root */, taskCi[0] = 0, taskInvCi[0] = 1 / epsilon; while (tasks > 0) { // 'pop' task with createst taskInvCi float maxInvCi = 0; - unsigned bestTask = 0; - for (unsigned j = 0; j < tasks; j++) if (taskInvCi[j] > maxInvCi) maxInvCi = taskInvCi[j], bestTask = j; - const unsigned Xid = taskNode[bestTask]; + uint32_t bestTask = 0; + for (uint32_t j = 0; j < tasks; j++) if (taskInvCi[j] > maxInvCi) maxInvCi = taskInvCi[j], bestTask = j; + const uint32_t Xid = taskNode[bestTask]; const float CiLX = taskCi[bestTask]; taskNode[bestTask] = taskNode[--tasks], taskCi[bestTask] = taskCi[tasks], taskInvCi[bestTask] = taskInvCi[tasks]; // execute task @@ -4732,11 +5205,11 @@ unsigned BVH::FindBestNewPosition( const unsigned Lid ) // ReinsertNodeVerbose // Part of "Fast Insertion-Based Optimization of Bounding Volume Hierarchies" -void BVH::ReinsertNodeVerbose( const unsigned Lid, const unsigned Nid, const unsigned origin ) +void BVH::ReinsertNodeVerbose( const 
uint32_t Lid, const uint32_t Nid, const uint32_t origin ) { - unsigned Xbest = FindBestNewPosition( Lid ); + uint32_t Xbest = FindBestNewPosition( Lid ); if (Xbest == 0 || verbose[Xbest].parent == 0) Xbest = origin; - const unsigned X1 = verbose[Xbest].parent; + const uint32_t X1 = verbose[Xbest].parent; BVHNodeVerbose& N = verbose[Nid]; N.left = Xbest, N.right = Lid; N.aabbMin = tinybvh_min( verbose[Xbest].aabbMin, verbose[Lid].aabbMin ); @@ -4749,10 +5222,10 @@ void BVH::ReinsertNodeVerbose( const unsigned Lid, const unsigned Nid, const uns // Determine for each node in the tree the number of primitives // stored in that subtree. Helper function for MergeLeafs. -unsigned BVH::CountSubtreeTris( const unsigned nodeIdx, unsigned* counters ) +uint32_t BVH::CountSubtreeTris( const uint32_t nodeIdx, uint32_t* counters ) { BVHNodeVerbose& node = verbose[nodeIdx]; - unsigned result = node.triCount; + uint32_t result = node.triCount; if (!result) result = CountSubtreeTris( node.left, counters ) + CountSubtreeTris( node.right, counters ); counters[nodeIdx] = result; @@ -4761,7 +5234,7 @@ unsigned BVH::CountSubtreeTris( const unsigned nodeIdx, unsigned* counters ) // Write the triangle indices stored in a subtree to a continuous // slice in the 'newIdx' array. Helper function for MergeLeafs. -void BVH::MergeSubtree( const unsigned nodeIdx, unsigned* newIdx, unsigned& newIdxPtr ) +void BVH::MergeSubtree( const uint32_t nodeIdx, uint32_t* newIdx, uint32_t& newIdxPtr ) { BVHNodeVerbose& node = verbose[nodeIdx]; if (node.isLeaf()) diff --git a/tiny_bvh_fenster.cpp b/tiny_bvh_fenster.cpp index 1823a21..8325009 100644 --- a/tiny_bvh_fenster.cpp +++ b/tiny_bvh_fenster.cpp @@ -3,8 +3,6 @@ #define SCRHEIGHT 600 #include "external/fenster.h" // https://github.com/zserge/fenster -// #define USE_EMBREE // enable to verify correct implementation, win64 only for now. -#define TEST_DOUBLE // enable to verify correct implementation of double-precision path. 
#define LOADSCENE #define TINYBVH_IMPLEMENTATION @@ -13,20 +11,10 @@ using namespace tinybvh; -#if defined(USE_EMBREE) -#include "embree4/rtcore.h" -static RTCScene embreeScene; -void embreeError( void* userPtr, enum RTCError error, const char* str ) -{ - printf( "error %d: %s\n", error, str ); -} -#else BVH bvh; -#endif #ifdef LOADSCENE bvhvec4* triangles = 0; -bvhdbl3* triEx = 0; const char scene[] = "cryteksponza.bin"; #else ALIGNED( 16 ) bvhvec4 triangles[259 /* level 3 */ * 6 * 2 * 49 * 3]{}; @@ -83,35 +71,6 @@ void Init() sphere_flake( 0, 0, 0, 1.5f ); #endif -#if defined USE_EMBREE - - RTCDevice embreeDevice = rtcNewDevice( NULL ); - rtcSetDeviceErrorFunction( embreeDevice, embreeError, NULL ); - embreeScene = rtcNewScene( embreeDevice ); - RTCGeometry embreeGeom = rtcNewGeometry( embreeDevice, RTC_GEOMETRY_TYPE_TRIANGLE ); - float* vertices = (float*)rtcSetNewGeometryBuffer( embreeGeom, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT3, 3 * sizeof( float ), verts ); - unsigned* indices = (unsigned*)rtcSetNewGeometryBuffer( embreeGeom, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT3, 3 * sizeof( unsigned ), verts / 3 ); - for (int i = 0; i < verts; i++) - { - vertices[i * 3 + 0] = triangles[i].x, vertices[i * 3 + 1] = triangles[i].y; - vertices[i * 3 + 2] = triangles[i].z, indices[i] = i; // Note: not using shared vertices. 
- } - rtcCommitGeometry( embreeGeom ); - rtcAttachGeometry( embreeScene, embreeGeom ); - rtcReleaseGeometry( embreeGeom ); - rtcCommitScene( embreeScene ); - -#elif defined TEST_DOUBLE - - triEx = (tinybvh::bvhdbl3*)malloc64( verts * sizeof( tinybvh::bvhdbl3 )); - for( int i = 0; i < verts; i++ ) - triEx[i].x = (double)triangles[i].x, - triEx[i].y = (double)triangles[i].y, - triEx[i].z = (double)triangles[i].z; - bvh.BuildEx( triEx, verts / 3 ); - -#else - // build a BVH over the scene #if defined(BVH_USEAVX) bvh.BuildAVX( triangles, verts / 3 ); @@ -119,8 +78,6 @@ void Init() bvh.BuildNEON( triangles, verts / 3 ); #else bvh.Build( triangles, verts / 3 ); -#endif - #endif // load camera position / direction from file @@ -132,15 +89,12 @@ void Init() t.close(); } - void UpdateCamera(float delta_time_s, fenster& f) { bvhvec3 right = normalize( cross( bvhvec3( 0, 1, 0 ), view ) ); bvhvec3 up = 0.8f * cross( view, right ); - int64_t new_fenster_time = fenster_time(); // get camera controls. 
- if (f.keys['A']) eye += right * -1.0f * delta_time_s * 10; if (f.keys['D']) eye += right * delta_time_s * 10; if (f.keys['W']) eye += view * delta_time_s * 10; @@ -179,33 +133,13 @@ void Tick(float delta_time_s, fenster & f, uint32_t* buf) } // trace primary rays -#if defined TEST_DOUBLE - for (int i = 0; i < N; i++) - { - RayEx r( rays[i].O, rays[i].D ); - depths[i] = bvh.IntersectEx( r ) & 127; - } -#elif !defined USE_EMBREE for (int i = 0; i < N; i++) depths[i] = bvh.Intersect( rays[i] ); -#else - struct RTCRayHit rayhit; - for (int i = 0; i < N; i++) - { - rayhit.ray.org_x = rays[i].O.x, rayhit.ray.org_y = rays[i].O.y, rayhit.ray.org_z = rays[i].O.z; - rayhit.ray.dir_x = rays[i].D.x, rayhit.ray.dir_y = rays[i].D.y, rayhit.ray.dir_z = rays[i].D.z; - rayhit.ray.tnear = 0, rayhit.ray.tfar = rays[i].hit.t, rayhit.ray.mask = -1, rayhit.ray.flags = 0; - rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID, rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID; - rtcIntersect1( embreeScene, &rayhit ); - rays[i].hit.u = rayhit.hit.u, rays[i].hit.u = rayhit.hit.v; - rays[i].hit.prim = rayhit.hit.primID, rays[i].hit.t = rayhit.ray.tfar; - } -#endif // visualize result const bvhvec3 L = normalize( bvhvec3( 1, 2, 3 ) ); for (int i = 0, ty = 0; ty < SCRHEIGHT / 4; ty++) for (int tx = 0; tx < SCRWIDTH / 4; tx++) { - for (int y = 0; y < 4; y++) for (int x = 0; x < 4; x++, i++) // if (rays[i].hit.t < 10000) + for (int y = 0; y < 4; y++) for (int x = 0; x < 4; x++, i++) if (rays[i].hit.t < 10000) { int pixel_x = tx * 4 + x, pixel_y = ty * 4 + y, primIdx = rays[i].hit.prim; bvhvec3 v0 = triangles[primIdx * 3 + 0]; @@ -213,9 +147,9 @@ void Tick(float delta_time_s, fenster & f, uint32_t* buf) bvhvec3 v2 = triangles[primIdx * 3 + 2]; bvhvec3 N = normalize( cross( v1 - v0, v2 - v0 ) ); int c = (int)(255.9f * fabs( dot( N, L ) )); - // buf[pixel_x + pixel_y * SCRWIDTH] = c + (c << 8) + (c << 16); + buf[pixel_x + pixel_y * SCRWIDTH] = c + (c << 8) + (c << 16); // buf[pixel_x + pixel_y * SCRWIDTH] = 
(primIdx * 0xdeece66d + 0xb) & 0xFFFFFF; // color is hashed primitive index - buf[pixel_x + pixel_y * SCRWIDTH] = depths[i] << 17; // render depth as red + // buf[pixel_x + pixel_y * SCRWIDTH] = depths[i] << 17; // render depth as red } } tinybvh::free64( rays ); diff --git a/tiny_bvh_pt.cpp b/tiny_bvh_pt.cpp new file mode 100644 index 0000000..a9095ca --- /dev/null +++ b/tiny_bvh_pt.cpp @@ -0,0 +1,199 @@ +#define FENSTER_APP_IMPLEMENTATION +#define SCRWIDTH 800 +#define SCRHEIGHT 600 +#define TILESIZE 8 +#include "external/fenster.h" // https://github.com/zserge/fenster + +#define TINYBVH_IMPLEMENTATION +#include "tiny_bvh.h" +#include + +using namespace tinybvh; + +// Application variables +static BVH bvh; +static bvhvec4* tris = 0; +static int triCount = 0, frameIdx = 0, spp = 0; +static bvhvec3 accumulator[SCRWIDTH * SCRHEIGHT]; +static BVH::BVHLayout layout = BVH::WALD_32BYTE; + +// Setup view pyramid for a pinhole camera: +// eye, p1 (top-left), p2 (top-right) and p3 (bottom-left) +static bvhvec3 eye( 0, 30, 0 ), p1, p2, p3; +static bvhvec3 view = normalize( bvhvec3( -1, 0, 0 ) ); + +// Xor32 RNG +static unsigned RandomUInt( unsigned& seed ) { seed ^= seed << 13, seed ^= seed >> 17, seed ^= seed << 5; return seed; } +static float RandomFloat( unsigned& seed ) { return RandomUInt( seed ) * 2.3283064365387e-10f; } + +// Ray tracing math +bvhvec3 DiffuseReflection( const bvhvec3 N, unsigned& seed ) +{ + bvhvec3 R; + do + { + R = bvhvec3( RandomFloat( seed ) * 2 - 1, RandomFloat( seed ) * 2 - 1, RandomFloat( seed ) * 2 - 1 ); + } while (dot( R, R ) > 1); + return normalize( dot( R, N ) < 0 ? 
R : -R ); +} +bvhvec3 CosWeightedDiffReflection( const bvhvec3 N, unsigned& seed ) +{ + bvhvec3 R = DiffuseReflection( N, seed ); + return normalize( N + R ); +} + +// Color conversion +bvhvec3 rgb32_to_vec3( const unsigned c ) +{ + return bvhvec3( (float)(c >> 16), (float)((c >> 8) & 255), (float)(c & 255) ) * (1 / 255.f); +} + +// Geometry access +bvhvec3 TriangleColor( const unsigned idx ) { return rgb32_to_vec3( *(unsigned*)&tris[idx * 3].w ); } +bvhvec3 TriangleNormal( const unsigned idx ) +{ + bvhvec3 a = tris[idx * 3], b = tris[idx * 3 + 1], c = tris[idx * 3 + 2]; + return normalize( cross( b - a, a - c ) ); +} + +// Scene management - Append a file, with optional position, scale and color override, tinyfied +void AddMesh( const char* file, float scale = 1, bvhvec3 pos = {}, int c = 0, int N = 0 ) +{ + std::fstream s{ file, s.binary | s.in }; + s.read( (char*)&N, 4 ); + bvhvec4* data = (bvhvec4*)malloc64( (N + triCount) * 48 ); + if (tris) memcpy( data, tris, triCount * 48 ), free64( tris ); + tris = data, s.read( (char*)tris + triCount * 48, N * 48 ), triCount += N; + for (int* b = (int*)tris + (triCount - N) * 12, i = 0; i < N * 3; i++) + *(bvhvec3*)b = *(bvhvec3*)b * scale + pos, b[3] = c ? 
c : b[3], b += 4; +} + +// Application init +void Init() +{ + // load raw vertex data + AddMesh( "./testdata/cryteksponza.bin", 1, bvhvec3( 0 ), 0xffffff ); + AddMesh( "./testdata/dragon.bin", 1.1f, bvhvec3( 29, 3.01f, 0 ), 0xffbb88 ); + AddMesh( "./testdata/lucy.bin", 1.1f, bvhvec3( -2, 4.1f, -3 ), 0xaaaaff ); + AddMesh( "./testdata/bunny.bin", 0.2f, bvhvec3( -7, 0.13f, 0 ), 0x333333 ); + AddMesh( "./testdata/legocar.bin", 0.3f, bvhvec3( -12, 0.8f, -5 ) ); + AddMesh( "./testdata/armadillo.bin", 0.3f, bvhvec3( 7, 1, 3 ), 0xff2020 ); + AddMesh( "./testdata/xyzrgb_dragon.bin", 0.5f, bvhvec3( -22, 0.95f, 0 ), 0xffffaa ); + AddMesh( "./testdata/suzanne.bin", 0.2f, bvhvec3( -18, 0.95f, -16 ), 0x90ff90 ); + AddMesh( "./testdata/head.bin", 0.5f, bvhvec3( 0, 3, 9 ) ); + // build bvh + bvh.BuildAVX( tris, triCount ); +#if defined BVH_USEAVX || defined BVH_USENEON + bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH4 ); + bvh.Convert( BVH::BASIC_BVH4, BVH::BVH4_AFRA ); + layout = BVH::BVH4_AFRA; +#endif + // load camera position / direction from file + std::fstream t = std::fstream{ "camera.bin", t.binary | t.in }; + if (!t.is_open()) return; + t.read( (char*)&eye, sizeof( eye ) ); + t.read( (char*)&view, sizeof( view ) ); + t.close(); +} + +// Keyboard handling +bool UpdateCamera( float delta_time_s, fenster& f ) +{ + bvhvec3 right = normalize( cross( bvhvec3( 0, 1, 0 ), view ) ), up = 0.8f * cross( view, right ); + // get camera controls. 
+ bool moved = false; + if (f.keys['A']) eye += right * -1.0f * delta_time_s * 10, moved = true; + if (f.keys['D']) eye += right * delta_time_s * 10, moved = true; + if (f.keys['W']) eye += view * delta_time_s * 10, moved = true; + if (f.keys['S']) eye += view * -1.0f * delta_time_s * 10, moved = true; + if (f.keys['R']) eye += up * delta_time_s * 20, moved = true; + if (f.keys['F']) eye += up * -1.0f * delta_time_s * 20, moved = true; + if (f.keys[20]) view = normalize( view + right * -1.0f * delta_time_s ), moved = true; + if (f.keys[19]) view = normalize( view + right * delta_time_s ), moved = true; + if (f.keys[17]) view = normalize( view + up * -1.0f * delta_time_s ), moved = true; + if (f.keys[18]) view = normalize( view + up * delta_time_s ), moved = true; + // recalculate right, up + right = normalize( cross( bvhvec3( 0, 1, 0 ), view ) ), up = 0.8f * cross( view, right ); + bvhvec3 C = eye + 1.2f * view; + p1 = C - right + up, p2 = C + right + up, p3 = C - right - up; + return moved; +} + +// Light transport calculation - Basic recursive Path Tracer with IS and Next Event Estimation +bvhvec3 Trace( Ray ray, unsigned& seed, unsigned depth = 0 ) +{ + // find primary intersection + bvh.Intersect( ray, layout ); + // shade + if (ray.hit.t == 1e30f) return bvhvec3( 0.6f, 0.7f, 1 ); // hit nothing + bvhvec3 I = ray.O + ray.hit.t * ray.D; + bvhvec3 N = TriangleNormal( ray.hit.prim ); + if (dot( N, ray.D ) > 0) N = -N; + bvhvec3 BRDF = TriangleColor( ray.hit.prim ) * (1.0f / 3.14159f); + bvhvec3 Lpos( RandomFloat( seed ) * 30 - 15, 40, RandomFloat( seed ) * 6 - 3 ); // virtual + float dist = length( Lpos - I ); + bvhvec3 L = (Lpos - I) * (1.0f / dist); // normalize + bvhvec3 direct = {}, indirect = {}; + float NdotL = dot( N, L ), NLdotL = fabs( dot( L, bvhvec3( 0, 1, 0 ) ) ); + if (NdotL > 0) + if (!bvh.IsOccluded( Ray( I + L * 0.001f, L, dist ), layout ) ) + direct = BRDF * NdotL * NLdotL * bvhvec3( 9, 9, 8 ) * 500 * (1.0f / (dist * dist)); + // random bounce + 
if (depth < 4) + { + bvhvec3 R = CosWeightedDiffReflection( N, seed ); + float pdf = 1.0f / dot( N, R ); + bvhvec3 irradiance = Trace( Ray( I + R * 0.001f, R ), seed, depth + 1 ); + indirect = BRDF * irradiance * (1.0f / pdf); + } + // finalize + return direct + indirect; +} + +// Application Tick +void Tick( float delta_time_s, fenster& f, uint32_t* buf ) +{ + // handle user input and update camera + if (UpdateCamera( delta_time_s, f ) || frameIdx++ == 0 ) + { + memset( accumulator, 0, SCRWIDTH * SCRHEIGHT * sizeof( bvhvec3 ) ); + spp = 1; + } + + // render tiles + const int xtiles = SCRWIDTH / TILESIZE, ytiles = SCRHEIGHT / TILESIZE; + const int tiles = xtiles * ytiles; + const float scale = 1.0f / spp++; +#pragma omp parallel for schedule(dynamic) + for (int tile = 0; tile < tiles; tile++) + { + const int tx = tile % xtiles, ty = tile / xtiles; + unsigned seed = (tile + 17) * 171717 + frameIdx * 1023; + for (int y = 0; y < TILESIZE; y++) for (int x = 0; x < TILESIZE; x++) + { + const int pixel_x = tx * TILESIZE + x, pixel_y = ty * TILESIZE + y; + const int pixelIdx = pixel_x + pixel_y * SCRWIDTH; + // setup primary ray + const float u = (float)pixel_x / SCRWIDTH, v = (float)pixel_y / SCRHEIGHT; + const bvhvec3 D = normalize( p1 + u * (p2 - p1) + v * (p3 - p1) - eye ); + // trace + accumulator[pixelIdx] += Trace( Ray( eye, D ), seed ); + const bvhvec3 E = accumulator[pixelIdx] * scale; + // visualize, with a poor man's gamma correct + const int r = (int)tinybvh_min( 255.0f, sqrtf( E.x ) * 255.0f ); + const int g = (int)tinybvh_min( 255.0f, sqrtf( E.y ) * 255.0f ); + const int b = (int)tinybvh_min( 255.0f, sqrtf( E.z ) * 255.0f ); + buf[pixelIdx] = b + (g << 8) + (r << 16); + } + } +} + +// Application Shutdown +void Shutdown() +{ + // save camera position / direction to file + std::fstream s = std::fstream{ "camera.bin", s.binary | s.out }; + s.write( (char*)&eye, sizeof( eye ) ); + s.write( (char*)&view, sizeof( view ) ); + s.close(); +} \ No newline at end of 
file diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp index b4ae620..1c6eea6 100644 --- a/tiny_bvh_speedtest.cpp +++ b/tiny_bvh_speedtest.cpp @@ -68,7 +68,7 @@ ALIGNED( 64 ) bvhvec4 triangles[259 /* level 3 */ * 6 * 2 * 49 * 3]{}; int verts = 0; BVH bvh; float traceTime, buildTime, * refDist = 0, * refDistFull = 0; -unsigned refOccluded = 0, *refOccl = 0; +unsigned refOccluded = 0, * refOccl = 0; #if defined EMBREE_BUILD || defined EMBREE_TRAVERSE #include "embree4/rtcore.h" @@ -183,7 +183,7 @@ float TestShadowRays( BVH::BVHLayout layout, Ray* batch, unsigned N, unsigned pa // kernels will lead to some diverging results. We check if no more than about // 1/1000 checks differ. Shadow rays also use an origin offset, based on scene // extend, to account for limited floating point accuracy. - if (abs( (int)occluded - (int)refOccluded) > 500) // allow some slack, we're using various tri intersectors + if (abs( (int)occluded - (int)refOccluded ) > 500) // allow some slack, we're using various tri intersectors { fprintf( stderr, "\nValidation for shadow rays failed (%i != %i).\n", (int)occluded, (int)refOccluded ); exit( 1 ); @@ -288,7 +288,7 @@ int main() // setup view pyramid for a pinhole camera: // eye, p1 (top-left), p2 (top-right) and p3 (bottom-left) #ifdef LOADSPONZA - bvhvec3 eye( 0, 30, 0 ), view = normalize( bvhvec3( -8, 2, -1.7f ) ); + bvhvec3 eye( -15.24f, 21.5f, 2.54f ), view = normalize( bvhvec3( 0.826f, -0.438f, -0.356f ) ); #else bvhvec3 eye( -3.5f, -1.5f, -6.5f ), view = normalize( bvhvec3( 3, 1.5f, 5 ) ); #endif @@ -316,7 +316,7 @@ int main() float v = (float)(pixel_y * 4 + (s >> 2)) / (SCRHEIGHT * 4); bvhvec3 P = p1 + u * (p2 - p1) + v * (p3 - p1); fullBatch[Nfull++] = Ray( eye, normalize( P - eye ) ); - if ((s & 7) == 0) + if ((s & 7) == 0) { smallBatch[Nsmall] = fullBatch[Nfull - 1]; #ifdef DOUBLE_PRECISION_SUPPORT @@ -369,7 +369,7 @@ int main() printf( "- 'double' builder: " ); t.reset(); tinybvh::bvhdbl3* triEx = 
(tinybvh::bvhdbl3*)tinybvh::malloc64( verts * sizeof( tinybvh::bvhdbl3 ) ); - for( int i = 0; i < verts; i++ ) + for (int i = 0; i < verts; i++) triEx[i].x = (double)triangles[i].x, triEx[i].y = (double)triangles[i].y, triEx[i].z = (double)triangles[i].z; @@ -452,13 +452,13 @@ int main() // estimate correct shadow ray epsilon based on scene extends tinybvh::bvhvec4 bmin( 1e30f ), bmax( -1e30f ); - for( int i = 0; i < verts; i++ ) + for (int i = 0; i < verts; i++) bmin = tinybvh::tinybvh_min( bmin, triangles[i] ), bmax = tinybvh::tinybvh_max( bmax, triangles[i] ); tinybvh::bvhvec3 e = bmax - bmin; float maxExtent = tinybvh::tinybvh_max( tinybvh::tinybvh_max( e.x, e.y ), e.z ); float shadowEpsilon = maxExtent * 5e-7f; - + // setup proper shadow ray batch traceTime = TestPrimaryRays( BVH::WALD_32BYTE, smallBatch, Nsmall, 1 ); // just to generate intersection points Ray* shadowBatch = (Ray*)tinybvh::malloc64( sizeof( Ray ) * Nsmall ); @@ -472,7 +472,7 @@ int main() } // get reference shadow ray query result refOccluded = 0, refOccl = new unsigned[Nsmall]; - for (int i = 0; i < Nsmall; i++) + for (int i = 0; i < Nsmall; i++) refOccluded += (refOccl[i] = bvh.IsOccluded( shadowBatch[i], BVH::WALD_32BYTE ) ? 
1 : 0); #ifdef TRAVERSE_2WAY_ST @@ -643,7 +643,7 @@ int main() // create OpenCL buffers for the BVH data calculated by tiny_bvh.h tinyocl::Buffer gpuNodes( bvh.usedAltNodes * sizeof( BVH::BVHNodeAlt ), bvh.altNode ); tinyocl::Buffer idxData( bvh.idxCount * sizeof( unsigned ), bvh.triIdx ); - tinyocl::Buffer triData( bvh.triCount * 3 * sizeof( tinybvh::bvhvec4 ), bvh.verts ); + tinyocl::Buffer triData( bvh.triCount * 3 * sizeof( tinybvh::bvhvec4 ), triangles ); // synchronize the host-side data to the gpu side gpuNodes.CopyToDevice(); idxData.CopyToDevice(); diff --git a/tiny_bvh_test.sln b/tiny_bvh_test.sln index e799e62..023e562 100644 --- a/tiny_bvh_test.sln +++ b/tiny_bvh_test.sln @@ -11,6 +11,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tiny_bvh_speedtest", "vcpro EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tiny_bvh_minimal", "vcproj\tiny_bvh_minimal.vcxproj", "{0B5C86B2-9438-49E3-BF1A-4E1593BB436D}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tiny_bvh_pt", "vcproj\tiny_bvh_pt.vcxproj", "{430FCECF-7685-40A7-A415-4A0DE487158F}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -51,6 +53,14 @@ Global {0B5C86B2-9438-49E3-BF1A-4E1593BB436D}.Release|x64.Build.0 = Release|x64 {0B5C86B2-9438-49E3-BF1A-4E1593BB436D}.Release|x86.ActiveCfg = Release|Win32 {0B5C86B2-9438-49E3-BF1A-4E1593BB436D}.Release|x86.Build.0 = Release|Win32 + {430FCECF-7685-40A7-A415-4A0DE487158F}.Debug|x64.ActiveCfg = Debug|x64 + {430FCECF-7685-40A7-A415-4A0DE487158F}.Debug|x64.Build.0 = Debug|x64 + {430FCECF-7685-40A7-A415-4A0DE487158F}.Debug|x86.ActiveCfg = Debug|Win32 + {430FCECF-7685-40A7-A415-4A0DE487158F}.Debug|x86.Build.0 = Debug|Win32 + {430FCECF-7685-40A7-A415-4A0DE487158F}.Release|x64.ActiveCfg = Release|x64 + {430FCECF-7685-40A7-A415-4A0DE487158F}.Release|x64.Build.0 = Release|x64 + {430FCECF-7685-40A7-A415-4A0DE487158F}.Release|x86.ActiveCfg = Release|Win32 + 
{430FCECF-7685-40A7-A415-4A0DE487158F}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/vcproj/tiny_bvh_fenster.vcxproj.user b/vcproj/tiny_bvh_fenster.vcxproj.user new file mode 100644 index 0000000..58dbbfa --- /dev/null +++ b/vcproj/tiny_bvh_fenster.vcxproj.user @@ -0,0 +1,11 @@ + + + + $(SolutionDir) + WindowsLocalDebugger + + + $(SolutionDir) + WindowsLocalDebugger + + \ No newline at end of file diff --git a/vcproj/tiny_bvh_minimal.vcxproj.user b/vcproj/tiny_bvh_minimal.vcxproj.user new file mode 100644 index 0000000..58dbbfa --- /dev/null +++ b/vcproj/tiny_bvh_minimal.vcxproj.user @@ -0,0 +1,11 @@ + + + + $(SolutionDir) + WindowsLocalDebugger + + + $(SolutionDir) + WindowsLocalDebugger + + \ No newline at end of file diff --git a/vcproj/tiny_bvh_pt.vcxproj b/vcproj/tiny_bvh_pt.vcxproj new file mode 100644 index 0000000..b668694 --- /dev/null +++ b/vcproj/tiny_bvh_pt.vcxproj @@ -0,0 +1,154 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 17.0 + Win32Proj + {430FCECF-7685-40A7-A415-4A0DE487158F} + tinybvhpt + 10.0 + + + + Application + true + v143 + Unicode + + + Application + false + v143 + true + Unicode + + + Application + true + v143 + NotSet + + + Application + false + v143 + true + NotSet + + + + + + + + + + + + + + + + + + + + + $(SolutionDir) + + + $(SolutionDir) + + + + Level3 + true + WIN32;_DEBUG;_WINDOWS;%(PreprocessorDefinitions) + true + + + Windows + true + + + + + Level3 + true + true + true + WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions) + true + + + Windows + true + true + true + + + + + Level3 + true + _DEBUG;_WINDOWS;%(PreprocessorDefinitions) + true + stdcpp20 + ..\external\embree\include + false + + + Windows + true + ../external/embree/lib + $(CoreLibraryDependencies);%(AdditionalDependencies);embree4.lib;tbb12.lib + + + + + Level3 + true + true + true + 
NDEBUG;_WINDOWS;%(PreprocessorDefinitions); _CRT_SECURE_NO_WARNINGS + true + stdcpp20 + ..\external\embree\include + true + + + Windows + true + true + true + ../external/embree/lib + $(CoreLibraryDependencies);%(AdditionalDependencies);embree4.lib;tbb12.lib + + + + + + \ No newline at end of file diff --git a/vcproj/tiny_bvh_pt.vcxproj.user b/vcproj/tiny_bvh_pt.vcxproj.user new file mode 100644 index 0000000..58dbbfa --- /dev/null +++ b/vcproj/tiny_bvh_pt.vcxproj.user @@ -0,0 +1,11 @@ + + + + $(SolutionDir) + WindowsLocalDebugger + + + $(SolutionDir) + WindowsLocalDebugger + + \ No newline at end of file diff --git a/vcproj/tiny_bvh_renderer.vcxproj.user b/vcproj/tiny_bvh_renderer.vcxproj.user new file mode 100644 index 0000000..58dbbfa --- /dev/null +++ b/vcproj/tiny_bvh_renderer.vcxproj.user @@ -0,0 +1,11 @@ + + + + $(SolutionDir) + WindowsLocalDebugger + + + $(SolutionDir) + WindowsLocalDebugger + + \ No newline at end of file diff --git a/vcproj/tiny_bvh_speedtest.vcxproj.user b/vcproj/tiny_bvh_speedtest.vcxproj.user new file mode 100644 index 0000000..58dbbfa --- /dev/null +++ b/vcproj/tiny_bvh_speedtest.vcxproj.user @@ -0,0 +1,11 @@ + + + + $(SolutionDir) + WindowsLocalDebugger + + + $(SolutionDir) + WindowsLocalDebugger + + \ No newline at end of file