PSO function optimization #135

NicerNewerCar · 2023-06-22T19:14:26Z

This issue will be used to document the optimization of the PSO function.

Information was obtained from Intel VTune Profiler after tracking 20 frames (10 mc3, 10 rad) of the WN00105 dataset.

Initial Results

As we can see, the worst offender during a run of the PSO function is the NCC function:

Autoscoper/libautoscoper/src/gpu/opencl/Ncc.cpp

Lines 186 to 226 in 62a7679

    
           // Host-side driver for the normalized cross-correlation of two device
           // buffers.
           //
           //   f, g  device buffers to correlate
           //   mask  device buffer whose sum is used as the divisor for both means
           //         (presumably a 0/1 pixel mask -- confirm against the caller)
           //   n     element count handed to every reduction and to the kernel
           //
           // Steps: reduce mask, f and g with ncc_sum() to obtain the two means,
           // launch "ncc_kernel" to fill d_nums / d_den1s / d_den2s, then reduce
           // those three buffers. Returns sum(d_nums) / sqrt(sum(d_den1s) *
           // sum(d_den2s)), or 1e5 when that denominator is below 1e-5.
           float ncc(Buffer* f, Buffer* g, Buffer* mask, unsigned n)
           {
             // Three reductions up front; each one is a separate ncc_sum() call.
             const float maskTotal = ncc_sum(mask, n);
             float meanF = ncc_sum(f, n) / maskTotal;
             float meanG = ncc_sum(g, n) / maskTotal;

           #if DEBUG
             std::cerr << "meanF: " << meanF << std::endl;
             std::cerr << "meanG: " << meanG << std::endl;
           #endif

             // Launch geometry for n elements; sizeMem is filled in by the helper
             // but is not used by this kernel.
             size_t numThreads;
             size_t numBlocks;
             size_t sizeMem;
             get_device_params(n, numThreads, numBlocks, sizeMem);

             Kernel* nccKernel = ncc_kernel_.compile(Ncc_cl, "ncc_kernel");
             nccKernel->block1d(numThreads);
             nccKernel->grid1d(numBlocks);

             // Argument order must match the kernel signature.
             nccKernel->addBufferArg(f);
             nccKernel->addArg(meanF);
             nccKernel->addBufferArg(g);
             nccKernel->addArg(meanG);
             nccKernel->addBufferArg(mask);
             nccKernel->addBufferArg(d_nums);
             nccKernel->addBufferArg(d_den1s);
             nccKernel->addBufferArg(d_den2s);
             nccKernel->addArg(n);

             nccKernel->launch();
             delete nccKernel;

             // Reduce the two denominator buffers, then combine them.
             const float den1Total = ncc_sum(d_den1s, n);
             const float den2Total = ncc_sum(d_den2s, n);
             float den = sqrt(den1Total * den2Total);

             // Written as !(den < 1e-5) rather than den >= 1e-5 so that a NaN
             // denominator still takes the division path.
             if (!(den < 1e-5)) {
               return ncc_sum(d_nums, n) / den;
             }
             return 1e5;
           }

We can also see that the majority of the run time of the NCC function comes from the NCC_SUM function (which is expected, since we make 6 calls to the sum function during a single run of the ncc function). The sum function is in turn dominated by the calls that launch the kernel and by the Buffer::write method. Within the kernel launch function, calls to the OpenCL API are made, with the calls to clEnqueueNDRangeKernel taking up ~5.196 seconds and the calls to clFinish taking up ~4.330 seconds. As for the buffer writes, the name is a little counter-intuitive, because we are actually reading from the buffer and writing to the variable we passed in; all of the runtime for Buffer::write is caused by the clEnqueueReadBuffer command.

Autoscoper/libautoscoper/src/gpu/opencl/Ncc.cpp

Lines 106 to 153 in 62a7679

    
           // Sums the first n floats of device buffer f and returns the total on
           // the host.
           //
           // Each pass launches "ncc_sum_kernel", which writes one partial sum per
           // block into d_sums. The next pass then reduces those numBlocks
           // partials, and so on until a single value is left. That value is read
           // back from the start of d_sums.
           //
           // NOTE(review): when n <= 1 the loop body never runs, so the value read
           // back is whatever d_sums already holds, not f[0] -- confirm callers
           // never pass n <= 1.
           static float ncc_sum(Buffer* f, unsigned n)
           {
             size_t numThreads, numBlocks, sizeMem;
             get_device_params(n, numThreads, numBlocks, sizeMem);

             Kernel* kernel = ncc_sum_kernel_.compile(NccSum_cl, "ncc_sum_kernel");

             while (n > 1)
             {
           #if DEBUG
               std::cerr << "ncc_sum[" << n << "] numThreads = " << numThreads << std::endl;
               std::cerr << "ncc_sum[" << n << "] numBlocks = " << numBlocks << std::endl;
               std::cerr << "ncc_sum[" << n << "] sizeMem = " << sizeMem << std::endl;
           #endif

               kernel->block2d(numThreads, 1);
               kernel->grid2d(1, numBlocks);

               kernel->addBufferArg(f);
               kernel->addBufferArg(d_sums);
               kernel->addLocalMem(sizeMem);
               kernel->addArg(n);

               kernel->launch();

           #if DEBUG
               // Dump this pass's per-block partial sums.
               float *tmp = new float[numBlocks];
               d_sums->write(tmp, numBlocks*sizeof(float));
               for (unsigned j=0; j<numBlocks; j++) {
                 std::cerr << " " << tmp[j];
               }
               std::cerr << std::endl;
               // Array form: tmp was allocated with new[]; plain delete here is
               // undefined behaviour.
               delete[] tmp;
           #endif

               // The partials become the input of the next pass.
               n = numBlocks;
               get_device_params(n, numThreads, numBlocks, sizeMem);
               f = d_sums;
               kernel->reset();
             }

             delete kernel;

             // Despite its name, Buffer::write copies from the device buffer into
             // the host pointer passed in.
             float h_sum = 0.0f;
             d_sums->write(&h_sum, sizeof(float));
             return h_sum;
           }

Moving forward

Buffer::Write

I would suggest removing the unnecessary call to Buffer::Write from within the ncc_sum driver:

Autoscoper/libautoscoper/src/gpu/opencl/Ncc.cpp

Lines 150 to 152 in 62a7679

    
           float h_sum; 
        
           d_sums->write(&h_sum, sizeof(float)); 
        
           return h_sum;

That way we can keep the sums on the GPU, since they are used by the regular NCC kernel. The calculation of meanG and meanF can then take place inside the NCC kernel itself:

Autoscoper/libautoscoper/src/gpu/opencl/Ncc.cpp

Lines 188 to 190 in 62a7679

    
           float nbPixel = ncc_sum(mask, n); 
        
           float meanF = ncc_sum(f, n) / nbPixel; 
        
           float meanG = ncc_sum(g, n) / nbPixel;

Autoscoper/libautoscoper/src/gpu/opencl/Ncc.cpp

Lines 205 to 213 in 62a7679

    
           kernel->addBufferArg(f); 
        
           kernel->addArg(meanF); 
        
           kernel->addBufferArg(g); 
        
           kernel->addArg(meanG); 
        
           kernel->addBufferArg(mask); 
        
           kernel->addBufferArg(d_nums); 
        
           kernel->addBufferArg(d_den1s); 
        
           kernel->addBufferArg(d_den2s); 
        
           kernel->addArg(n);

We will still need to read from the summation buffer once the execution of the regular ncc kernel is over, to return the expected float value. This will reduce our calls to Buffer::write during the ncc driver from 6 to 3. We could also investigate the viability of utilizing a kernel to calculate the den value, further reducing our calls from 3 to 2.

Autoscoper/libautoscoper/src/gpu/opencl/Ncc.cpp

Lines 219 to 225 in 62a7679

    
           float den = sqrt(ncc_sum(d_den1s,n)*ncc_sum(d_den2s,n)); 
        
           if (den < 1e-5) { 
        
             return 1e5; 
        
           } 
        
           return ncc_sum(d_nums,n)/den;

Kernel::Launch

I would suggest looking into improving the sum kernel somehow. ~~Currently, I am building OCLGrind to help aid in this investigation.~~ OCLGrind does not support OpenGL-OpenCL interop.

Autoscoper/libautoscoper/src/gpu/opencl/kernel/NccSum.cl

Lines 1 to 26 in 62a7679

    
           // Per-work-group sum reduction.
           //
           //   f       input values
           //   sums    output; the work-item with local id 0 stores its group's
           //           total at sums[get_global_id(1)]
           //   buffer  local scratch, one float per work-item along dimension 0
           //   n       number of valid elements in f; indices >= n contribute 0
           //
           // The halving loop only folds every element in when the local size
           // along dimension 0 is a power of two.
           //
           // NOTE(review): the linear index uses get_global_size(1) as the stride
           // for get_global_id(1). If the launch uses a single work-group along
           // dimension 0, the expected stride would be get_global_size(0) --
           // confirm against Kernel::launch before changing.
           __kernel
           void ncc_sum_kernel(
               __global const float* f,
               __global float* sums,
               __local float* buffer,
               unsigned n)
           {
             const unsigned lane = get_local_id(0);
             const unsigned idx = get_global_id(0) + get_global_id(1)*get_global_size(1);

             // Stage one element per work-item; pad out-of-range lanes with zero.
             float value = 0.0f;
             if (idx < n) {
               value = f[idx];
             }
             buffer[lane] = value;
             barrier(CLK_LOCAL_MEM_FENCE);

             // Fold the upper half onto the lower half until one value remains.
             // The barrier is outside the if so every work-item reaches it.
             unsigned stride = get_local_size(0)/2;
             while (stride > 0) {
               if (lane < stride) {
                 buffer[lane] += buffer[lane + stride];
               }
               barrier(CLK_LOCAL_MEM_FENCE);
               stride >>= 1;
             }

             if (lane == 0) {
               sums[get_global_id(1)] = buffer[0];
             }
           }
        
           // vim: ts=4 syntax=cpp noexpandtab

Outside of the PSO function

It would appear that everything outside of the PSO function runs in a negligible amount of time. NOTE: Any entry in the stack with the format func@{memory address} is a call to a function from an external library such as OpenCL, OpenGL, Microsoft Direct3D, Windows USER32, etc.

The text was updated successfully, but these errors were encountered:

amymmorton · 2023-10-23T16:43:41Z

This was really helpful to revisit today- Thanks @NicerNewerCar

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

PSO function optimization #135

PSO function optimization #135

NicerNewerCar commented Jun 22, 2023 •

edited

Loading

amymmorton commented Oct 23, 2023

PSO function optimization #135

PSO function optimization #135

Comments

NicerNewerCar commented Jun 22, 2023 • edited Loading

Initial Results

Moving forward

Buffer::Write

Kernel::Launch

Outside of the PSO function

amymmorton commented Oct 23, 2023

NicerNewerCar commented Jun 22, 2023 •

edited

Loading