From 27f825dd50075abeaa4d64522b0f62397b0908a9 Mon Sep 17 00:00:00 2001 From: David Rowenhorst Date: Mon, 13 May 2024 08:05:53 -0400 Subject: [PATCH] Optimize lambda working for opencl. Signed-off by: David Rowenhorst --- pyebsdindex/opencl/clnlpar.cl | 40 +++++++++++++++------------------ pyebsdindex/opencl/nlpar_cl.py | 41 +++++++++------------------------- 2 files changed, 29 insertions(+), 52 deletions(-) diff --git a/pyebsdindex/opencl/clnlpar.cl b/pyebsdindex/opencl/clnlpar.cl index dbadeb9..9b34e8c 100644 --- a/pyebsdindex/opencl/clnlpar.cl +++ b/pyebsdindex/opencl/clnlpar.cl @@ -179,7 +179,7 @@ __kernel void normd( const __global float *sigma, const __global float *n, __global float *d, - const long sr){ + const long nn){ const long x = get_global_id(0); const long y = get_global_id(1); @@ -190,13 +190,13 @@ __kernel void normd( long i, j, q; long indx_j, indx_ij; - long nnn = (2*sr+1) * (2*sr+1); - + long nnn = (2*nn+1) * (2*nn+1); + //printf("%d\n", nnn) ; float sigma_xy = sigma[indx_xy]; sigma_xy *= sigma_xy; - //printf("%f", sigma_xy); - float sigma_ij, nn, dd; - + // printf("%f", sigma_xy); + float sigma_ij, n_ij, d_ij; + q = 0; for(j=y-nn; j<=y+nn; ++j){ indx_j = (j >= 0) ? (j): abs(j); @@ -212,23 +212,19 @@ __kernel void normd( sigma_ij *= sigma_ij; sigma_ij = sigma_ij + sigma_xy; - for(q=0;q 1.0e-3){ - dd -= nn*sigma_ij; - dd /= (sigma_ij * sqrt(2.0*nn)); - //printf("%f\n", dd) ; - d[q+nnn*indx_xy] = dd; - } - - } - + + d_ij = d[q+nnn*indx_xy]; + n_ij = n[q+nnn*indx_xy]; + if (n_ij > 1.0e-3){ + d_ij -= n_ij*sigma_ij; + d_ij *= 1.0/( sigma_ij * sqrt(2.0*n_ij) ); + + d[q+nnn*indx_xy] = d_ij; + } + q += 1.0; } - - } - - + } + //printf("%d, %d, %d\n",nn, q, nnn); } diff --git a/pyebsdindex/opencl/nlpar_cl.py b/pyebsdindex/opencl/nlpar_cl.py index 061fb0d..9c40672 100644 --- a/pyebsdindex/opencl/nlpar_cl.py +++ b/pyebsdindex/opencl/nlpar_cl.py @@ -27,38 +27,18 @@ def calcsigma_cpu(self,nn=1, saturation_protect=True,automask=True, **kwargs): saturation_protect=saturation_protect,automask=automask, **kwargs) def opt_lambda_cl(self, saturation_protect=True, automask=True, backsub=False, - target_weights=[0.5, 0.34, 0.25], dthresh=0.0, autoupdate=True): + target_weights=[0.5, 0.34, 0.25], dthresh=0.0, autoupdate=True, **kwargs): target_weights = np.asarray(target_weights) def loptfunc(lam, d2, tw, dthresh): - temp = (d2 > dthresh).choose(dthresh, d2) + temp = np.maximum(d2, dthresh)#(d2 > dthresh).choose(dthresh, d2) dw = np.exp(-(temp) / lam ** 2) w = np.sum(dw, axis=2) + 1e-12 metric = np.mean(np.abs(tw - 1.0 / w)) return metric - @numba.njit(fastmath=True, cache=True, parallel=True) - def d2normcl(d2, n2, sigmapad): - sftpat = np.array([[-1,-1], [-1, 0], [-1, 1], - [0, -1], [0, 0], [0, 1], - [1, -1], [1, 0], [1, 1]], dtype = np.int64) - #sftpat = sftpat.reshape(-1) - shp = d2.shape - s2 = sigmapad ** 2 - for j in numba.prange(shp[0]): - for i in range(shp[1]): - s_ij = s2[j+1, i+1] - for q in range(shp[2]): - if n2[j, i, q] > 0: - jj = np.int64(j+1+sftpat[q,0]) - ii = np.int64(i+1+sftpat[q, 1]) - s_q =s2[jj,ii ] - s2_12 = (s_ij + s_q) - d2[j, i, q] -= n2[j, i, q] * s2_12 - d2[j, i, q] /= s2_12 * np.sqrt(2.0 * n2[j, i, q]) - patternfile = self.getinfileobj() patternfile.read_header() nrows = np.uint64(self.nrows) # np.uint64(patternfile.nRows) @@ -71,7 +51,7 @@ def d2normcl(d2, n2, sigmapad): dthresh = np.float32(dthresh) lamopt_values = [] - sigma, d2, n2 = self.calcsigma_cl(nn=1, saturation_protect=saturation_protect, automask=automask, normalize_d=True) + sigma, d2, n2 = self.calcsigma_cl(nn=1, saturation_protect=saturation_protect, automask=automask, normalize_d=True, **kwargs) #sigmapad = np.pad(sigma, 1, mode='reflect') #d2normcl(d2, n2, sigmapad) @@ -225,12 +205,13 @@ def calcsigma_cl(self,nn=1,saturation_protect=True,automask=True, normalize_d=Fa dist_local, count_local, np.int64(nn), np.int64(npatsteps), np.int64(npat_point), np.float32(mxval) ) - cl.enqueue_barrier(queue) - prg.normd(queue, (np.uint32(ncolchunk), np.uint32(nrowchunk)), None, - sigmachunk_gpu, - count_local, dist_local, - np.int64(nn)) - queue.flush() + if normalize_d is True: + cl.enqueue_barrier(queue) + prg.normd(queue, (np.uint32(ncolchunk), np.uint32(nrowchunk)), None, + sigmachunk_gpu, + count_local, dist_local, + np.int64(nn)) + queue.finish() cl.enqueue_copy(queue, distchunk, dist_local, is_blocking=False) cl.enqueue_copy(queue, countchunk, count_local, is_blocking=False) @@ -432,7 +413,7 @@ def calcnlpar_cl(self,chunksize=0, searchradius=None, lam = None, dthresh = None calclim = np.array([cstartcalc, rstartcalc, ncolchunk, nrowchunk], dtype=np.int64) crlimits_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=calclim) - queue.flush() + cl.enqueue_barrier(queue) data_gpu.release() prg.calcnlpar(queue, (np.uint32(ncolcalc), np.uint32(nrowcalc)), None, #prg.calcnlpar(queue, (1, 1), None,