Skip to content

Commit

Permalink
Merge pull request #56 from aytekaman/drop-openmp
Browse files Browse the repository at this point in the history
Replace OpenMP with cpp11 threads
  • Loading branch information
jbikker authored Dec 13, 2024
2 parents b872ed1 + 73ded76 commit 3eafebb
Showing 1 changed file with 81 additions and 24 deletions.
105 changes: 81 additions & 24 deletions tiny_bvh_speedtest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,62 @@ void ValidateTraceResult( Ray* batch, float* ref, unsigned N, unsigned line )
}
}

// Multi-threading
#include <atomic>
#include <thread>
#include <vector>

// Number of worker threads launched for each multi-threaded traversal pass.
// std::thread::hardware_concurrency() is only a hint and is allowed to return 0
// when the value cannot be determined; fall back to a single worker in that
// case so the threaded passes never end up spawning zero threads.
static unsigned threadCount = std::thread::hardware_concurrency() ? std::thread::hardware_concurrency() : 1u;
// Shared counter from which the worker threads claim batch indices.
static std::atomic<int> batchIdx(0);

#if defined(TRAVERSE_2WAY_MT) || defined(ENABLE_OPENCL)

void IntersectBvhWorkerThread(int batchCount, Ray* fullBatch, int threadIdx)
{
int batch = threadIdx;
while (batch < batchCount)
{
const int batchStart = batch * 10000;
for (int i = 0; i < 10000; i++) bvh->Intersect(fullBatch[batchStart + i]);

batch = batchIdx++;
}
}

#endif

#ifdef TRAVERSE_2WAY_MT_PACKET

void IntersectBvh256WorkerThread(int batchCount, Ray* fullBatch, int threadIdx)
{
int batch = threadIdx;
while (batch < batchCount)
{
const int batchStart = batch * 30 * 256;
for (int i = 0; i < 30; i++) bvh->Intersect256Rays(fullBatch + batchStart + i * 256);

batch = batchIdx++;
}
}

#endif

#ifdef BVH_USEAVX

void IntersectBvh256SSEWorkerThread(int batchCount, Ray* fullBatch, int threadIdx)
{
int batch = threadIdx;
while (batch < batchCount)
{
const int batchStart = batch * 30 * 256;
for (int i = 0; i < 30; i++) bvh->Intersect256RaysSSE(fullBatch + batchStart + i * 256);

batch = batchIdx++;
}
}

#endif

int main()
{
int minor = TINY_BVH_VERSION_MINOR;
Expand Down Expand Up @@ -651,12 +707,13 @@ int main()

// calculate full res reference distances using threaded traversal on CPU.
const int batchCount = Nfull / 10000;
#pragma omp parallel for schedule(dynamic)
for (int batch = 0; batch < batchCount; batch++)
{
const int batchStart = batch * 10000;
for (int i = 0; i < 10000; i++) bvh->Intersect( fullBatch[batchStart + i] );
}

batchIdx = threadCount;
std::vector<std::thread> threads;
for (uint32_t i = 0; i < threadCount; i++)
threads.emplace_back(&IntersectBvhWorkerThread, batchCount, fullBatch, i);
for (auto& thread : threads) thread.join();

refDistFull = new float[Nfull];
for (int i = 0; i < Nfull; i++) refDistFull[i] = fullBatch[i].hit.t;

Expand Down Expand Up @@ -803,12 +860,12 @@ int main()
{
if (pass == 1) t.reset(); // first pass is cache warming
const int batchCount = Nfull / 10000;
#pragma omp parallel for schedule(dynamic)
for (int batch = 0; batch < batchCount; batch++)
{
const int batchStart = batch * 10000;
for (int i = 0; i < 10000; i++) bvh->Intersect( fullBatch[batchStart + i] );
}

batchIdx = threadCount;
std::vector<std::thread> threads;
for (uint32_t i = 0; i < threadCount; i++)
threads.emplace_back(&IntersectBvhWorkerThread, batchCount, fullBatch, i);
for (auto& thread : threads) thread.join();
}
traceTime = t.elapsed() / 3.0f;
printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s)\n", (float)Nfull * 1e-6f, traceTime * 1000, (float)Nfull / traceTime * 1e-6f );
Expand All @@ -823,12 +880,12 @@ int main()
{
if (pass == 1) t.reset(); // first pass is cache warming
const int batchCount = Nfull / (30 * 256); // batches of 30 packets of 256 rays
#pragma omp parallel for schedule(dynamic)
for (int batch = 0; batch < batchCount; batch++)
{
const int batchStart = batch * 30 * 256;
for (int i = 0; i < 30; i++) bvh->Intersect256Rays( fullBatch + batchStart + i * 256 );
}

batchIdx = threadCount;
std::vector<std::thread> threads;
for (uint32_t i = 0; i < threadCount; i++)
threads.emplace_back(&IntersectBvh256WorkerThread, batchCount, fullBatch, i);
for (auto& thread : threads) thread.join();
}
traceTime = t.elapsed() / 3.0f;
printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s)\n", (float)Nfull * 1e-6f, traceTime * 1000, (float)Nfull / traceTime * 1e-6f );
Expand All @@ -842,12 +899,12 @@ int main()
{
if (pass == 1) t.reset(); // first pass is cache warming
const int batchCount = Nfull / (30 * 256); // batches of 30 packets of 256 rays
#pragma omp parallel for schedule(dynamic)
for (int batch = 0; batch < batchCount; batch++)
{
const int batchStart = batch * 30 * 256;
for (int i = 0; i < 30; i++) bvh->Intersect256RaysSSE( fullBatch + batchStart + i * 256 );
}

batchIdx = threadCount;
std::vector<std::thread> threads;
for (uint32_t i = 0; i < threadCount; i++)
threads.emplace_back(&IntersectBvh256SSEWorkerThread, batchCount, fullBatch, i);
for (auto& thread : threads) thread.join();
}
traceTime = t.elapsed() / 3.0f;
printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s)\n", (float)Nfull * 1e-6f, traceTime * 1000, (float)Nfull / traceTime * 1e-6f );
Expand Down

0 comments on commit 3eafebb

Please sign in to comment.