Skip to content

Commit

Permalink
Optimize lambda working for opencl.
Browse files Browse the repository at this point in the history
Signed-off by: David Rowenhorst <[email protected]>
  • Loading branch information
drowenhorst-nrl committed May 13, 2024
1 parent b117a9e commit 27f825d
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 52 deletions.
40 changes: 18 additions & 22 deletions pyebsdindex/opencl/clnlpar.cl
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ __kernel void normd(
const __global float *sigma,
const __global float *n,
__global float *d,
const long sr){
const long nn){

const long x = get_global_id(0);
const long y = get_global_id(1);
Expand All @@ -190,13 +190,13 @@ __kernel void normd(
long i, j, q;
long indx_j, indx_ij;

long nnn = (2*sr+1) * (2*sr+1);

long nnn = (2*nn+1) * (2*nn+1);
//printf("%d\n", nnn) ;
float sigma_xy = sigma[indx_xy];
sigma_xy *= sigma_xy;
//printf("%f", sigma_xy);
float sigma_ij, nn, dd;

// printf("%f", sigma_xy);
float sigma_ij, n_ij, d_ij;
q = 0;
for(j=y-nn; j<=y+nn; ++j){

indx_j = (j >= 0) ? (j): abs(j);
Expand All @@ -212,23 +212,19 @@ __kernel void normd(
sigma_ij *= sigma_ij;

sigma_ij = sigma_ij + sigma_xy;
for(q=0;q<nnn;q++){
dd = d[q+nnn*indx_xy];
nn = n[q+nnn*indx_xy];
if (nn > 1.0e-3){
dd -= nn*sigma_ij;
dd /= (sigma_ij * sqrt(2.0*nn));
//printf("%f\n", dd) ;
d[q+nnn*indx_xy] = dd;
}

}


d_ij = d[q+nnn*indx_xy];
n_ij = n[q+nnn*indx_xy];
if (n_ij > 1.0e-3){
d_ij -= n_ij*sigma_ij;
d_ij *= 1.0/( sigma_ij * sqrt(2.0*n_ij) );

d[q+nnn*indx_xy] = d_ij;
}
q += 1.0;
}

}


}
//printf("%d, %d, %d\n",nn, q, nnn);
}


Expand Down
41 changes: 11 additions & 30 deletions pyebsdindex/opencl/nlpar_cl.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,38 +27,18 @@ def calcsigma_cpu(self,nn=1, saturation_protect=True,automask=True, **kwargs):
saturation_protect=saturation_protect,automask=automask, **kwargs)

def opt_lambda_cl(self, saturation_protect=True, automask=True, backsub=False,
target_weights=[0.5, 0.34, 0.25], dthresh=0.0, autoupdate=True):
target_weights=[0.5, 0.34, 0.25], dthresh=0.0, autoupdate=True, **kwargs):

target_weights = np.asarray(target_weights)

def loptfunc(lam, d2, tw, dthresh):
temp = (d2 > dthresh).choose(dthresh, d2)
temp = np.maximum(d2, dthresh)#(d2 > dthresh).choose(dthresh, d2)
dw = np.exp(-(temp) / lam ** 2)
w = np.sum(dw, axis=2) + 1e-12

metric = np.mean(np.abs(tw - 1.0 / w))
return metric

@numba.njit(fastmath=True, cache=True, parallel=True)
def d2normcl(d2, n2, sigmapad):
sftpat = np.array([[-1,-1], [-1, 0], [-1, 1],
[0, -1], [0, 0], [0, 1],
[1, -1], [1, 0], [1, 1]], dtype = np.int64)
#sftpat = sftpat.reshape(-1)
shp = d2.shape
s2 = sigmapad ** 2
for j in numba.prange(shp[0]):
for i in range(shp[1]):
s_ij = s2[j+1, i+1]
for q in range(shp[2]):
if n2[j, i, q] > 0:
jj = np.int64(j+1+sftpat[q,0])
ii = np.int64(i+1+sftpat[q, 1])
s_q =s2[jj,ii ]
s2_12 = (s_ij + s_q)
d2[j, i, q] -= n2[j, i, q] * s2_12
d2[j, i, q] /= s2_12 * np.sqrt(2.0 * n2[j, i, q])

patternfile = self.getinfileobj()
patternfile.read_header()
nrows = np.uint64(self.nrows) # np.uint64(patternfile.nRows)
Expand All @@ -71,7 +51,7 @@ def d2normcl(d2, n2, sigmapad):
dthresh = np.float32(dthresh)
lamopt_values = []

sigma, d2, n2 = self.calcsigma_cl(nn=1, saturation_protect=saturation_protect, automask=automask, normalize_d=True)
sigma, d2, n2 = self.calcsigma_cl(nn=1, saturation_protect=saturation_protect, automask=automask, normalize_d=True, **kwargs)

#sigmapad = np.pad(sigma, 1, mode='reflect')
#d2normcl(d2, n2, sigmapad)
Expand Down Expand Up @@ -225,12 +205,13 @@ def calcsigma_cl(self,nn=1,saturation_protect=True,automask=True, normalize_d=Fa
dist_local, count_local,
np.int64(nn), np.int64(npatsteps), np.int64(npat_point),
np.float32(mxval) )
cl.enqueue_barrier(queue)
prg.normd(queue, (np.uint32(ncolchunk), np.uint32(nrowchunk)), None,
sigmachunk_gpu,
count_local, dist_local,
np.int64(nn))
queue.flush()
if normalize_d is True:
cl.enqueue_barrier(queue)
prg.normd(queue, (np.uint32(ncolchunk), np.uint32(nrowchunk)), None,
sigmachunk_gpu,
count_local, dist_local,
np.int64(nn))
queue.finish()

cl.enqueue_copy(queue, distchunk, dist_local, is_blocking=False)
cl.enqueue_copy(queue, countchunk, count_local, is_blocking=False)
Expand Down Expand Up @@ -432,7 +413,7 @@ def calcnlpar_cl(self,chunksize=0, searchradius=None, lam = None, dthresh = None

calclim = np.array([cstartcalc, rstartcalc, ncolchunk, nrowchunk], dtype=np.int64)
crlimits_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=calclim)
queue.flush()
cl.enqueue_barrier(queue)
data_gpu.release()
prg.calcnlpar(queue, (np.uint32(ncolcalc), np.uint32(nrowcalc)), None,
#prg.calcnlpar(queue, (1, 1), None,
Expand Down

0 comments on commit 27f825d

Please sign in to comment.