From 27f825dd50075abeaa4d64522b0f62397b0908a9 Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Mon, 13 May 2024 08:05:53 -0400
Subject: [PATCH] Optimize lambda working for opencl.

Signed-off by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/opencl/clnlpar.cl  | 40 +++++++++++++++------------------
 pyebsdindex/opencl/nlpar_cl.py | 41 +++++++++-------------------------
 2 files changed, 29 insertions(+), 52 deletions(-)

diff --git a/pyebsdindex/opencl/clnlpar.cl b/pyebsdindex/opencl/clnlpar.cl
index dbadeb9..9b34e8c 100644
--- a/pyebsdindex/opencl/clnlpar.cl
+++ b/pyebsdindex/opencl/clnlpar.cl
@@ -179,7 +179,7 @@ __kernel void normd(
     const __global float *sigma, 
     const __global float *n, 
      __global float *d,
-     const long sr){
+     const long nn){
 
     const long x = get_global_id(0);
     const long y = get_global_id(1);
@@ -190,13 +190,13 @@ __kernel void normd(
     long i, j, q;
     long indx_j, indx_ij; 
 
-    long nnn = (2*sr+1) * (2*sr+1);
-
+    long nnn = (2*nn+1) * (2*nn+1);
+    //printf("%d\n", nnn) ;
     float sigma_xy = sigma[indx_xy];  
     sigma_xy *= sigma_xy;
-    //printf("%f", sigma_xy);
-    float sigma_ij, nn, dd;  
-
+    // printf("%f", sigma_xy);
+    float sigma_ij, n_ij, d_ij;  
+    q = 0; 
     for(j=y-nn; j<=y+nn; ++j){
           
           indx_j =  (j >= 0) ? (j): abs(j);
@@ -212,23 +212,19 @@ __kernel void normd(
                sigma_ij *= sigma_ij;
               
                sigma_ij = sigma_ij + sigma_xy;
-               for(q=0;q<nnn;q++){
-                dd = d[q+nnn*indx_xy];
-                nn = n[q+nnn*indx_xy];    
-                if (nn > 1.0e-3){           
-                  dd -= nn*sigma_ij;
-                  dd /= (sigma_ij * sqrt(2.0*nn)); 
-                  //printf("%f\n", dd) ;
-                  d[q+nnn*indx_xy] = dd;    
-               }
-                    
-             }
-             
+               
+               d_ij = d[q+nnn*indx_xy];
+               n_ij = n[q+nnn*indx_xy];    
+               if (n_ij > 1.0e-3){           
+                  d_ij -= n_ij*sigma_ij;
+                  d_ij *= 1.0/( sigma_ij * sqrt(2.0*n_ij) ); 
+                  
+                  d[q+nnn*indx_xy] = d_ij;    
+                }
+               q += 1.0; 
            }
-
-     }      
-
-
+      } 
+      //printf("%d, %d, %d\n",nn, q, nnn);     
 }
 
 
diff --git a/pyebsdindex/opencl/nlpar_cl.py b/pyebsdindex/opencl/nlpar_cl.py
index 061fb0d..9c40672 100644
--- a/pyebsdindex/opencl/nlpar_cl.py
+++ b/pyebsdindex/opencl/nlpar_cl.py
@@ -27,38 +27,18 @@ def calcsigma_cpu(self,nn=1, saturation_protect=True,automask=True, **kwargs):
                             saturation_protect=saturation_protect,automask=automask, **kwargs)
 
   def opt_lambda_cl(self, saturation_protect=True, automask=True, backsub=False,
-                 target_weights=[0.5, 0.34, 0.25], dthresh=0.0, autoupdate=True):
+                 target_weights=[0.5, 0.34, 0.25], dthresh=0.0, autoupdate=True, **kwargs):
 
     target_weights = np.asarray(target_weights)
 
     def loptfunc(lam, d2, tw, dthresh):
-      temp = (d2 > dthresh).choose(dthresh, d2)
+      temp = np.maximum(d2, dthresh)#(d2 > dthresh).choose(dthresh, d2)
       dw = np.exp(-(temp) / lam ** 2)
       w = np.sum(dw, axis=2) + 1e-12
 
       metric = np.mean(np.abs(tw - 1.0 / w))
       return metric
 
-    @numba.njit(fastmath=True, cache=True, parallel=True)
-    def d2normcl(d2, n2, sigmapad):
-      sftpat = np.array([[-1,-1], [-1, 0], [-1, 1],
-                [0, -1], [0, 0], [0, 1],
-                [1, -1], [1, 0], [1, 1]], dtype = np.int64)
-      #sftpat = sftpat.reshape(-1)
-      shp = d2.shape
-      s2 = sigmapad ** 2
-      for j in numba.prange(shp[0]):
-        for i in range(shp[1]):
-          s_ij = s2[j+1, i+1]
-          for q in range(shp[2]):
-            if n2[j, i, q] > 0:
-              jj = np.int64(j+1+sftpat[q,0])
-              ii = np.int64(i+1+sftpat[q, 1])
-              s_q =s2[jj,ii ]
-              s2_12 = (s_ij + s_q)
-              d2[j, i, q] -= n2[j, i, q] * s2_12
-              d2[j, i, q] /= s2_12 * np.sqrt(2.0 * n2[j, i, q])
-
     patternfile = self.getinfileobj()
     patternfile.read_header()
     nrows = np.uint64(self.nrows)  # np.uint64(patternfile.nRows)
@@ -71,7 +51,7 @@ def d2normcl(d2, n2, sigmapad):
     dthresh = np.float32(dthresh)
     lamopt_values = []
 
-    sigma, d2, n2 = self.calcsigma_cl(nn=1, saturation_protect=saturation_protect, automask=automask, normalize_d=True)
+    sigma, d2, n2 = self.calcsigma_cl(nn=1, saturation_protect=saturation_protect, automask=automask, normalize_d=True, **kwargs)
 
     #sigmapad = np.pad(sigma, 1, mode='reflect')
     #d2normcl(d2, n2, sigmapad)
@@ -225,12 +205,13 @@ def calcsigma_cl(self,nn=1,saturation_protect=True,automask=True, normalize_d=Fa
                                dist_local, count_local,
                                np.int64(nn), np.int64(npatsteps), np.int64(npat_point),
                                np.float32(mxval) )
-        cl.enqueue_barrier(queue)
-        prg.normd(queue, (np.uint32(ncolchunk), np.uint32(nrowchunk)), None,
-                        sigmachunk_gpu,
-                        count_local, dist_local,
-                        np.int64(nn))
-        queue.flush()
+        if normalize_d is True:
+          cl.enqueue_barrier(queue)
+          prg.normd(queue, (np.uint32(ncolchunk), np.uint32(nrowchunk)), None,
+                          sigmachunk_gpu,
+                          count_local, dist_local,
+                          np.int64(nn))
+        queue.finish()
 
         cl.enqueue_copy(queue, distchunk, dist_local, is_blocking=False)
         cl.enqueue_copy(queue, countchunk, count_local,  is_blocking=False)
@@ -432,7 +413,7 @@ def calcnlpar_cl(self,chunksize=0, searchradius=None, lam = None, dthresh = None
 
         calclim = np.array([cstartcalc, rstartcalc, ncolchunk, nrowchunk], dtype=np.int64)
         crlimits_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=calclim)
-        queue.flush()
+        cl.enqueue_barrier(queue)
         data_gpu.release()
         prg.calcnlpar(queue, (np.uint32(ncolcalc), np.uint32(nrowcalc)), None,
         #prg.calcnlpar(queue, (1, 1), None,