Skip to content

Commit

Permalink
First working GPU NLPAR
Browse files Browse the repository at this point in the history
Signed-off by: David Rowenhorst <[email protected]>
  • Loading branch information
drowenhorst-nrl committed May 8, 2024
1 parent 2f615dd commit 5b249f6
Show file tree
Hide file tree
Showing 2 changed files with 445 additions and 15 deletions.
238 changes: 226 additions & 12 deletions pyebsdindex/opencl/clnlpar.cl
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,28 @@ float sum16(const float16 *v1){

}

void writeadd16(__global float *placetowrite, float16 *v1);
void writeadd16(__global float *placetowrite, float16 *v1){

placetowrite[0] += v1[0].s0;
placetowrite[1] += v1[0].s1;
placetowrite[2] += v1[0].s2;
placetowrite[3] += v1[0].s3;
placetowrite[4] += v1[0].s4;
placetowrite[5] += v1[0].s5;
placetowrite[6] += v1[0].s6;
placetowrite[7] += v1[0].s7;
placetowrite[8] += v1[0].s8;
placetowrite[9] += v1[0].s9;
placetowrite[10] += v1[0].sa;
placetowrite[11] += v1[0].sb;
placetowrite[12] += v1[0].sc;
placetowrite[13] += v1[0].sd;
placetowrite[14] += v1[0].se;
placetowrite[15] += v1[0].sf;

}

void print16(const float16 v1);
void print16(const float16 v1){

Expand All @@ -46,30 +68,26 @@ void print16(const float16 v1){
}


__kernel void nlloadpat8bit(const __global uchar *datain, __global float *dataout){
__kernel void nlloadpat8bit(__global uchar *datain, __global float *dataout){
const unsigned long int x = get_global_id(0);
uchar imVal;
float imValflt;
imVal = datain[x];
imValflt = convert_float(imVal);
uchar imVal = datain[x];
float imValflt = convert_float(imVal);
dataout[x] = imValflt;
}

__kernel void nlloadpat16bit(const __global ushort *datain, __global float *dataout){
__kernel void nlloadpat16bit(__global ushort *datain, __global float *dataout){
const unsigned long int x = get_global_id(0);
ushort imVal;
float imValflt;
imVal = datain[x];
imValflt = convert_float(imVal);
ushort imVal = datain[x];
float imValflt = convert_float(imVal);
dataout[x] = imValflt;
}

__kernel void nlloadpat32flt(const __global float *datain, __global float *dataout){
__kernel void nlloadpat32flt(__global float *datain, __global float *dataout){
const unsigned long int x = get_global_id(0);
dataout[x] = datain[x];
}

__kernel void calcsigma( const __global float *data, const __global float16 *mask,
__kernel void calcsigma( __global float *data, __global float16 *mask,
__global float *sig,
__global float *dg, __global float *ng,
const long nn, const long ndatchunk, const long npatpoint, const float maxlim){
Expand Down Expand Up @@ -179,3 +197,199 @@ __kernel void calcsigma( const __global float *data, const __global float16 *mas



__kernel void calcnlpar(
const __global float *data,
const __global float16 *mask,
const __global float *sigma,
const __global long *crlimits,
__global float *dataout,
const long sr,
const long ndatchunk,
const long npatpoint,
const float maxlim,
const float lam2,
const float dthresh){
//IDs of work-item represent x and y coordinates in image
//const long4 calclim = crlimits[0];
const long x = get_global_id(0)+crlimits[0];
const long y = get_global_id(1)+crlimits[1];
const long ncol = crlimits[2]; //get_global_size(0);
const long nrow = crlimits[3];//get_global_size(1);
const long indx_xy = x+y*ncol;
//printf("%d\n", indx_xy);
const long indx0 = npatpoint * indx_xy;
//printf("%d, %d, %d, %d\n", x,y,ncol, nrow);
long i, j, z;
long indx_j, indx_ij, count;

long nnn = (2*sr+1) * (2*sr+1);

float16 d1, d0;
float16 mask0, mask1;
float dd, nn, sigma_ij, norm;
float sigma0 = sigma[indx_xy];
sigma0 *= sigma0;
//float16* writeloc;

float d[512]; // taking a risk here that noone will want a SR > 10
float n[512];


for(j=0; j < nnn; ++j){
d[j] = 0.0;
n[j] = 1.0e-6;
}


for(z = 0; z<ndatchunk; ++z){
count = 0;


d0 = *(__global float16*) (data + indx0+16*z);

mask0 = mask[z];
mask0 = (mask0 > (float16) 1.e-3) ? (float16) 1.0 : (float16) 0.0;
mask0 = d0 < (float16) (maxlim) ? mask0 : (float16) (0.0);


for(j=y-sr; j<=y+sr; ++j){

indx_j = (j >= 0) ? (j): abs(j);
indx_j = (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1);
indx_j = ncol * indx_j;

for(i=x-sr; i<=x+sr; ++i){

indx_ij = (i >= 0) ? (i): abs(i);
indx_ij = (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1);

indx_ij = (indx_ij + indx_j);

indx_ij *= npatpoint;

mask1 = mask0;
d1 = *(__global float16*) (data + indx_ij+16*z);


mask1 = d1 < (float16) (maxlim) ? mask1 : (float16) (0.0);
//mask1 = select((float16) (1.0), (float16) (0.0), isgreater(mask0, (float16)(1e-6)) && isgreater(mask1,(float16)(1e-6)));
//printf("%*s\n", 'here');

d1 = (d0-d1);
d1 *= d1 ;
d1 *= mask1;

dd = sum16(&d1);
dd = (indx_ij == indx0) ? -1.0 : dd; // mark the center point

d[count] += dd;
n[count] += sum16(&mask1);
//d[count+nnn*indx_xy] += dd;
//n[count+nnn*indx_xy] += sum16(&mask1);

count += 1;
}
}

}
// calculate the weights
count = 0;
float sum = 0.0;
nn = 1.0;
for(j=y-sr; j<=y+sr; ++j){

indx_j = (j >= 0) ? (j): abs(j);
indx_j = (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1);
indx_j = ncol * indx_j;

for(i=x-sr; i<=x+sr; ++i){

dd = d[count];
if (dd > 1.e-3){
indx_ij = (i >= 0) ? (i): abs(i);
indx_ij = (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1);

indx_ij = (indx_ij + indx_j);

sigma_ij = sigma[indx_xy];
sigma_ij *= sigma_ij;
nn = n[count];
dd -= nn*(sigma_ij+sigma0);

norm = (sigma0 + sigma_ij)*sqrt(2.0*nn);
if (norm > 1.0e-8){
dd /= norm;
} else {
dd = 1e6*nn;
}

} else {
dd = 0.0;
nn = 1.0;
}

dd -= dthresh;
dd = dd >= 0.0 ? dd : 0.0;

dd = exp(-1.0*dd*lam2);
sum += dd;
d[count] = dd;

count += 1;

}
}

for (j =0;j<nnn;j++){d[j] *= 1.0/sum; }

// now one more time through the loops to average across the patterns with the weights
// and place in the output array
for(z = 0; z<ndatchunk; ++z){
count = 0;
d0 = *(__global float16*) (data + indx0+16*z);

mask0 = mask[z];
mask0 = (mask0 > (float16) -1.e-3) ? (float16) 1.0 : (float16) 0.0;

for(j=y-sr; j<=y+sr; ++j){

indx_j = (j >= 0) ? (j): abs(j);
indx_j = (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1);
indx_j = ncol * indx_j;

for(i=x-sr; i<=x+sr; ++i){

indx_ij = (i >= 0) ? (i): abs(i);
indx_ij = (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1);

indx_ij = (indx_ij + indx_j);

indx_ij *= npatpoint;


d1 = *(__global float16*) (data + indx_ij+16*z);


d1 *= (float16) d[count];
d1 *= mask0;

*(__global float16*) (dataout + indx0+16*z) += d1;

//writeadd16(&(dataout[indx0+16*z]), &d1 );

//d[count+nnn*indx_xy] += dd;
//n[count+nnn*indx_xy] += sum16(&mask1);

count += 1;
}
}
}



}





Loading

0 comments on commit 5b249f6

Please sign in to comment.