Skip to content

Commit

Permalink
Functional basic gpu wavefront path tracer.
Browse files Browse the repository at this point in the history
  • Loading branch information
jbikker committed Dec 19, 2024
1 parent dd8dc99 commit 299bc27
Show file tree
Hide file tree
Showing 2 changed files with 131 additions and 31 deletions.
33 changes: 19 additions & 14 deletions tiny_bvh_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ using namespace tinybvh;
static BVH8_CWBVH bvh;
static bvhvec4* tris = 0;
static int triCount = 0, frameIdx = 0, spp = 0;
static Kernel* init, * clear, * generate, * extend, * shade, * traceShadows, * finalize;
static Buffer* pixels, * accumulator, * raysIn, * raysOut, * connections;
static Kernel* init, * clear, * generate, * extend, * shade;
static Kernel* updateCounters1, * updateCounters2, * traceShadows, * finalize;
static Buffer* pixels, * accumulator, * raysIn, * raysOut, * connections, * triData;

// View pyramid for a pinhole camera
struct RenderData
Expand Down Expand Up @@ -58,6 +59,8 @@ void Init()
generate = new Kernel( "wavefront.cl", "Generate" );
extend = new Kernel( "wavefront.cl", "Extend" );
shade = new Kernel( "wavefront.cl", "Shade" );
updateCounters1 = new Kernel( "wavefront.cl", "UpdateCounters1" );
updateCounters2 = new Kernel( "wavefront.cl", "UpdateCounters2" );
traceShadows = new Kernel( "wavefront.cl", "Connect" );
finalize = new Kernel( "wavefront.cl", "Finalize" );
// create OpenCL buffers
Expand All @@ -82,6 +85,8 @@ void Init()
connections = new Buffer( N * sizeof( bvhvec4 ) * 3 );
accumulator = new Buffer( N * sizeof( bvhvec4 ) );
pixels = new Buffer( N * sizeof( uint32_t ) );
triData = new Buffer( triCount * 3 * sizeof( bvhvec4 ), tris );
triData->CopyToDevice();
// load camera position / direction from file
std::fstream t = std::fstream{ "camera_gpu.bin", t.binary | t.in };
if (!t.is_open()) return;
Expand Down Expand Up @@ -123,22 +128,22 @@ void Tick( float delta_time_s, fenster& f, uint32_t* buf )
spp = 1;
}
// wavefront step 0: render on the GPU
init->SetArguments( N, rd.eye, rd.p0, rd.p1, rd.p2, 0, cwbvhNodes, cwbvhTris );
init->SetArguments( N, rd.eye, rd.p0, rd.p1, rd.p2, frameIdx, cwbvhNodes, cwbvhTris );
init->Run( 1 ); // init atomic counters, set buffer ptrs etc.
// wavefront step 1: generate primary rays
generate->SetArguments( raysOut );
generate->Run2D( oclint2( SCRWIDTH, SCRHEIGHT ) );
// wavefront step 2: extend paths
swap( raysOut, raysIn );
extend->SetArguments( raysIn );
extend->Run( 16384 /* todo: 64 * SM count */ );
// wavefront step 3: shade intersection results
shade->SetArguments( accumulator, raysIn, raysOut, connections );
shade->Run( 16384 /* todo: 64 * SM count */ );
// wavefront step 4: connect shadow rays
for (int i = 0; i < 4; i++)
{
swap( raysOut, raysIn );
extend->SetArguments( raysIn );
extend->Run( N /* todo: 64 * SM count */ );
updateCounters1->Run( 1 );
shade->SetArguments( accumulator, raysIn, raysOut, connections, triData );
shade->Run( N /* todo: 64 * SM count */ );
updateCounters2->Run( 1 );
}
traceShadows->SetArguments( accumulator, connections );
traceShadows->Run( 1024 );
// wavefront step 4: finalize to pixel buffer
finalize->SetArguments( accumulator, 1.0f / (float)spp++, pixels );
finalize->Run2D( oclint2( SCRWIDTH, SCRHEIGHT ) );
pixels->CopyFromDevice();
Expand All @@ -153,6 +158,6 @@ void Tick( float delta_time_s, fenster& f, uint32_t* buf )
void Shutdown()
{
// save camera position / direction to file
std::fstream s = std::fstream{ "camera.bin", s.binary | s.out };
std::fstream s = std::fstream{ "camera_gpu.bin", s.binary | s.out };
s.write( (char*)&rd, sizeof( rd ) );
}
129 changes: 112 additions & 17 deletions wavefront.cl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@

#include "traverse.cl"

#define PI 3.14159265358979323846264f
#define INVPI 0.31830988618379067153777f
#define INV2PI 0.15915494309189533576888f
#define TWOPI 6.28318530717958647692528f

// struct for rendering parameters - keep in sync with CPU side.
struct RenderData
{
Expand All @@ -18,11 +23,39 @@ struct RenderData
__global volatile int extendTasks, shadeTasks, connectTasks;
__global struct RenderData rd;


// Xor32 RNG
uint WangHash( uint s ) { s = (s ^ 61) ^ (s >> 16), s *= 9, s = s ^ (s >> 4), s *= 0x27d4eb2d; return s ^ (s >> 15); }
uint RandomUInt( uint* seed ) { *seed ^= *seed << 13, *seed ^= *seed >> 17, *seed ^= *seed << 5; return *seed; }
float RandomFloat( uint* seed ) { return RandomUInt( seed ) * 2.3283064365387e-10f; }

// DiffuseReflection: Uniform random bounce in the hemisphere
float3 DiffuseReflection( float3 N, uint* seed )
{
float3 R;
do
{
R = (float3)( RandomFloat( seed ) * 2 - 1, RandomFloat( seed ) * 2 - 1, RandomFloat( seed ) * 2 - 1 );
} while (dot( R, R ) > 1);
return normalize( dot( R, N ) > 0 ? R : -R );
}

// CosWeightedDiffReflection: Cosine-weighted random bounce in the hemisphere
float3 CosWeightedDiffReflection( const float3 N, uint* seed )
{
float3 R;
do
{
R = (float3)( RandomFloat( seed ) * 2 - 1, RandomFloat( seed ) * 2 - 1, RandomFloat( seed ) * 2 - 1 );
} while (dot( R, R ) > 1);
return normalize( N + normalize( R ) );
}

// PathState: path throughput, current extension ray, pixel index
struct PathState
{
float4 T; // xyz = rgb, postponed pdf in w
float4 O; // pixel index in O.w
float4 O; // pixel index and path depth in O.w
float4 D; // t in D.w
float4 hit;
};
Expand All @@ -39,7 +72,7 @@ struct Potential
void kernel SetRenderData(
int primaryRayCount,
float4 eye, float4 p0, float4 p1, float4 p2,
unsigned frameIdx,
uint frameIdx,
global float4* cwbvhNodes,
global float4* cwbvhTris
)
Expand Down Expand Up @@ -68,14 +101,15 @@ void kernel Clear( global float4* accumulator )
// primary ray generation
void kernel Generate( global struct PathState* raysOut )
{
const unsigned x = get_global_id( 0 ), y = get_global_id( 1 );
const unsigned id = x + y * get_global_size( 0 );
const uint x = get_global_id( 0 ), y = get_global_id( 1 );
const uint id = x + y * get_global_size( 0 );
const float u = (float)x / (float)get_global_size( 0 );
const float v = (float)y / (float)get_global_size( 1 );
const float4 P = rd.p0 + u * (rd.p1 - rd.p0) + v * (rd.p2 - rd.p0);
raysOut[id].T = (float4)( 1, 1, 1, 1 /* pdf */ );
raysOut[id].O = (float4)( rd.eye.xyz, as_float( id ) );
raysOut[id].O = (float4)( rd.eye.xyz, as_float( id << 4 /* low bits: depth */ ) );
raysOut[id].D = (float4)( normalize( P.xyz - rd.eye.xyz ), 1e30f );
raysOut[id].hit = (float4)( 1e30f, 0, 0, as_float( 0 ) );
}

// extend: trace the generated rays to find the nearest intersection point.
Expand All @@ -84,6 +118,7 @@ void kernel Extend( global struct PathState* raysIn )
while (1)
{
// obtain task
if (extendTasks < 1) break;
const int pathId = atomic_dec( &extendTasks ) - 1;
if (pathId < 0) break;
const float4 O4 = raysIn[pathId].O;
Expand All @@ -93,23 +128,83 @@ void kernel Extend( global struct PathState* raysIn )
}
}

// shade: process intersection results; this evaluates the BRDF and creates extension
// rays and shadow rays.
void kernel Shade( global float4* accumulator, global struct PathState* raysIn, global struct PathState* raysOut, global struct Potential* shadowOut )
// syncing counters: at this point, we need to reset the extendTasks counter.
void kernel UpdateCounters1()
{
if (get_global_id( 0 ) != 0) return;
extendTasks = 0;
}

// shade: process intersection results; this evaluates the BRDF and creates
// extension rays and shadow rays.
void kernel Shade(
global float4* accumulator,
global struct PathState* raysIn,
global struct PathState* raysOut,
global struct Potential* shadowOut,
global float4* verts
)
{
while (1)
{
// obntain task
// obtain task
if (shadeTasks < 1) break;
const int pathId = atomic_dec( &shadeTasks ) - 1;
if (pathId < 0) break;
float4 data1 = raysIn[pathId].O;
float4 data2 = raysIn[pathId].D;
float4 data3 = raysIn[pathId].hit;
uint pixelIdx = as_uint( data1.w );
accumulator[pixelIdx] += (float4)(data3.x * 0.01f );
// fetch path data
float4 data0 = raysIn[pathId].T; // xyz = rgb, postponed pdf in w
float4 data1 = raysIn[pathId].O; // pixel index in O.w
float4 data2 = raysIn[pathId].D; // t in D.w
float4 data3 = raysIn[pathId].hit; // dist, u, v, prim
// prepare for shading
uint depth = as_uint( data1.w ) & 15;
uint pixelIdx = as_uint( data1.w ) >> 4;
uint seed = WangHash( as_uint( data1.w ) + rd.frameIdx * 17117);
// end path on sky
if (data3.x == 1e30f)
{
float3 skyColor = (float3)( 0.7f, 0.7f, 1.2f );
accumulator[pixelIdx] += (float4)( data0.xyz * skyColor, 1 );
continue;
}
// fetch geometry at intersection point
uint vertIdx = as_uint( data3.w ) * 3;
float4 v0 = verts[vertIdx];
float3 vert0 = v0.xyz, vert1 = verts[vertIdx + 1].xyz, vert2 = verts[vertIdx + 2].xyz;
float3 N = normalize( cross( vert1 - vert0, vert2 - vert0 ) );
float3 D = data2.xyz;
if (dot( N, D ) > 0) N *= -1;
float3 T = data0.xyz;
float3 O = data1.xyz;
float t = data3.x;
float3 I = O + t * D;
// bounce
if (depth < 4)
{
uint newRayIdx = atomic_inc( &extendTasks );
float3 BRDF = (float3)(1) /* just white for now */ * INVPI;
#if 0
float3 R = DiffuseReflection( N, &seed );
float PDF = INV2PI;
#else
float3 R = CosWeightedDiffReflection( N, &seed );
float PDF = dot( N, R ) * INVPI;
#endif
T *= dot( N, R ) * BRDF * (1.0f / PDF);
raysOut[newRayIdx].T = (float4)( T, 1 );
raysOut[newRayIdx].O = (float4)( I + R * 0.001f, as_float( (pixelIdx << 4) + depth + 1 ) );
raysOut[newRayIdx].D = (float4)( R, 1e30f );
}
}
}

// syncing counters: we generated extensions; those will need shading too.
void kernel UpdateCounters2()
{
if (get_global_id( 0 ) != 0) return;
shadeTasks = extendTasks;
}

// connect: trace shadow rays and deposit their potential contribution to the pixels
// if not occluded.
void kernel Connect( global float4* accumulator, global struct Potential* shadowIn )
Expand All @@ -123,7 +218,7 @@ void kernel Connect( global float4* accumulator, global struct Potential* shadow
const float3 rD = (float3)( 1.0f / D4.x, 1.0f / D4.y, 1.0f / D4.z );
bool occluded = false; // isoccluded_cwbvh( rd.cwbvhNodes, rd.cwbvhTris, O4.xyz, D4.xyz, rD, D4.w );
if (occluded) return;
unsigned pixelIdx = as_uint( O4.w );
uint pixelIdx = as_uint( O4.w );
accumulator[pixelIdx] += T4;
}

Expand All @@ -133,8 +228,8 @@ void kernel Connect( global float4* accumulator, global struct Potential* shadow
// interop do write directly to a texture.
void kernel Finalize( global float4* accumulator, const float scale, global uint* pixels )
{
const unsigned x = get_global_id( 0 ), y = get_global_id( 1 );
const unsigned pixelIdx = x + y * get_global_size( 0 );
const uint x = get_global_id( 0 ), y = get_global_id( 1 );
const uint pixelIdx = x + y * get_global_size( 0 );
const float4 p = accumulator[pixelIdx] * scale;
const int r = (int)(255.0f * min( 1.0f, sqrt( p.x ) ));
const int g = (int)(255.0f * min( 1.0f, sqrt( p.y ) ));
Expand Down

0 comments on commit 299bc27

Please sign in to comment.