diff --git a/pyebsdindex/opencl/clnlpar.cl b/pyebsdindex/opencl/clnlpar.cl
index 54a8844..14bd0d3 100644
--- a/pyebsdindex/opencl/clnlpar.cl
+++ b/pyebsdindex/opencl/clnlpar.cl
@@ -23,6 +23,28 @@ float sum16(const float16 *v1){
 
 }
 
+void writeadd16(__global float *placetowrite, float16 *v1);
+void writeadd16(__global float *placetowrite, float16 *v1){
+
+  placetowrite[0] += v1[0].s0;
+  placetowrite[1] += v1[0].s1;
+  placetowrite[2] += v1[0].s2;
+  placetowrite[3] += v1[0].s3;
+  placetowrite[4] += v1[0].s4;
+  placetowrite[5] += v1[0].s5;
+  placetowrite[6] += v1[0].s6;
+  placetowrite[7] += v1[0].s7;
+  placetowrite[8] += v1[0].s8;
+  placetowrite[9] += v1[0].s9;
+  placetowrite[10] += v1[0].sa;
+  placetowrite[11] += v1[0].sb;
+  placetowrite[12] += v1[0].sc;
+  placetowrite[13] += v1[0].sd;
+  placetowrite[14] += v1[0].se;
+  placetowrite[15] += v1[0].sf;
+
+}
+
 void print16(const float16 v1);
 void print16(const float16 v1){
 
@@ -46,30 +68,26 @@ void print16(const float16 v1){
 
 }
 
-__kernel void nlloadpat8bit(const __global uchar *datain, __global float *dataout){
+__kernel void nlloadpat8bit(__global uchar *datain, __global float *dataout){
   const unsigned long int x = get_global_id(0);
-  uchar imVal;
-  float imValflt;
-  imVal = datain[x];
-  imValflt = convert_float(imVal);
+  uchar imVal = datain[x];
+  float imValflt = convert_float(imVal);
   dataout[x] = imValflt;
 }
 
-__kernel void nlloadpat16bit(const __global ushort *datain, __global float *dataout){
+__kernel void nlloadpat16bit(__global ushort *datain, __global float *dataout){
   const unsigned long int x = get_global_id(0);
-  ushort imVal;
-  float imValflt;
-  imVal = datain[x];
-  imValflt = convert_float(imVal);
+  ushort imVal = datain[x];
+  float imValflt = convert_float(imVal);
   dataout[x] = imValflt;
 }
 
-__kernel void nlloadpat32flt(const __global float *datain, __global float *dataout){
+__kernel void nlloadpat32flt(__global float *datain, __global float *dataout){
   const unsigned long int x = get_global_id(0);
   dataout[x] = datain[x];
 }
 
-__kernel void calcsigma( const __global float *data, const __global float16 *mask,
+__kernel void calcsigma( __global float *data, __global float16 *mask,
                __global float *sig, __global float *dg, __global float *ng,
      const long nn, const long ndatchunk, const long npatpoint, const float maxlim){
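The hunk below adds the main calcnlpar kernel. For each map point it accumulates, over every neighbor in the (2*sr+1) x (2*sr+1) search window, the masked squared difference to the neighbor pattern, recenters that distance by the expected noise contribution n*(sigma0^2 + sigmaj^2), scales it by (sigma0^2 + sigmaj^2)*sqrt(2n), soft-thresholds it with dthresh, and exponentiates it with lam2 = 1/lambda^2. A rough NumPy restatement of the per-neighbor weight (a sketch for orientation only, not part of the patch; all names are illustrative):

    import numpy as np

    def nlpar_weight(p0, pj, sigma0, sigmaj, lam, dthresh, mask):
        n = mask.sum()                                  # pixels actually compared
        d2 = np.sum(((p0 - pj) * mask) ** 2)            # masked squared distance
        s2 = sigma0**2 + sigmaj**2                      # combined noise variance
        d2 = (d2 - n * s2) / (s2 * np.sqrt(2.0 * n))    # recenter and scale
        d2 = max(d2 - dthresh, 0.0)                     # soft threshold
        return np.exp(-d2 / lam**2)                     # unnormalized weight

The kernel then normalizes the weights to sum to one before applying them to the patterns.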
@@ -179,3 +197,199 @@ __kernel void calcsigma( const __global float *data, const __global float16 *mask,
 
 
 
+__kernel void calcnlpar(
+      const __global float *data,
+      const __global float16 *mask,
+      const __global float *sigma,
+      const __global long *crlimits,
+      __global float *dataout,
+      const long sr,
+      const long ndatchunk,
+      const long npatpoint,
+      const float maxlim,
+      const float lam2,
+      const float dthresh){
+  //IDs of work-item represent x and y coordinates in image
+  //const long4 calclim = crlimits[0];
+  const long x = get_global_id(0)+crlimits[0];
+  const long y = get_global_id(1)+crlimits[1];
+  const long ncol = crlimits[2]; //get_global_size(0);
+  const long nrow = crlimits[3]; //get_global_size(1);
+  const long indx_xy = x+y*ncol;
+  //printf("%d\n", indx_xy);
+  const long indx0 = npatpoint * indx_xy;
+  //printf("%d, %d, %d, %d\n", x,y,ncol, nrow);
+  long i, j, z;
+  long indx_j, indx_ij, count;
+
+  long nnn = (2*sr+1) * (2*sr+1);
+
+  float16 d1, d0;
+  float16 mask0, mask1;
+  float dd, nn, sigma_ij, norm;
+  float sigma0 = sigma[indx_xy];
+  sigma0 *= sigma0;
+  //float16* writeloc;
+
+  float d[512]; // taking a risk here that no one will want a SR > 10
+  float n[512];
+
+
+  for(j=0; j < nnn; ++j){
+    d[j] = 0.0;
+    n[j] = 1.0e-6;
+  }
+
+
+  for(z = 0; z<ndatchunk; ++z){
+    count = 0;
+
+    d0 = *(__global float16*) (data + indx0+16*z);
+
+    mask0 = (mask[z] > (float16) 1.e-3) ? (float16) 1.0 : (float16) 0.0;
+    mask0 = d0 < (float16) (maxlim) ? mask0 : (float16) (0.0);
+
+
+    for(j=y-sr; j<=y+sr; ++j){
+
+      indx_j = (j >= 0) ? (j): abs(j);
+      indx_j = (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1);
+      indx_j = ncol * indx_j;
+
+      for(i=x-sr; i<=x+sr; ++i){
+
+        indx_ij = (i >= 0) ? (i): abs(i);
+        indx_ij = (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1);
+
+        indx_ij = (indx_ij + indx_j);
+
+        indx_ij *= npatpoint;
+
+        mask1 = mask0;
+        d1 = *(__global float16*) (data + indx_ij+16*z);
+
+
+        mask1 = d1 < (float16) (maxlim) ? mask1 : (float16) (0.0);
+        //mask1 = select((float16) (1.0), (float16) (0.0), isgreater(mask0, (float16)(1e-6)) && isgreater(mask1,(float16)(1e-6)));
+        //printf("%*s\n", 'here');
+
+        d1 = (d0-d1);
+        d1 *= d1;
+        d1 *= mask1;
+
+        dd = sum16(&d1);
+        dd = (indx_ij == indx0) ? -1.0 : dd; // mark the center point
+
+        d[count] += dd;
+        n[count] += sum16(&mask1);
+        //d[count+nnn*indx_xy] += dd;
+        //n[count+nnn*indx_xy] += sum16(&mask1);
+
+        count += 1;
+      }
+    }
+
+  }
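+// At this point d[k] holds the accumulated, masked squared distance between the
+// center pattern and neighbor k, and n[k] counts the pixels compared; the center
+// point was flagged negative above, so its weight falls out below as exp(0) = 1.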
+// calculate the weights
+  count = 0;
+  float sum = 0.0;
+  nn = 1.0;
+  for(j=y-sr; j<=y+sr; ++j){
+
+    indx_j = (j >= 0) ? (j): abs(j);
+    indx_j = (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1);
+    indx_j = ncol * indx_j;
+
+    for(i=x-sr; i<=x+sr; ++i){
+
+      dd = d[count];
+      if (dd > 1.e-3){
+        indx_ij = (i >= 0) ? (i): abs(i);
+        indx_ij = (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1);
+
+        indx_ij = (indx_ij + indx_j);
+
+        sigma_ij = sigma[indx_ij]; // the neighbor's sigma; sigma[indx_xy] here would leave indx_ij unused
+        sigma_ij *= sigma_ij;
+        nn = n[count];
+        dd -= nn*(sigma_ij+sigma0);
+
+        norm = (sigma0 + sigma_ij)*sqrt(2.0*nn);
+        if (norm > 1.0e-8){
+          dd /= norm;
+        } else {
+          dd = 1e6*nn;
+        }
+
+      } else {
+        dd = 0.0;
+        nn = 1.0;
+      }
+
+      dd -= dthresh;
+      dd = dd >= 0.0 ? dd : 0.0;
+
+      dd = exp(-1.0*dd*lam2);
+      sum += dd;
+      d[count] = dd;
+
+      count += 1;
+
+    }
+  }
+
+  for (j=0; j<nnn; ++j){
+    d[j] /= sum;
+  }
+
+// now apply the weights to the data
+  for(z = 0; z<ndatchunk; ++z){
+    count = 0;
+
+    mask0 = (mask[z] > (float16) -1.e-3) ? (float16) 1.0 : (float16) 0.0;
+
+    for(j=y-sr; j<=y+sr; ++j){
+
+      indx_j = (j >= 0) ? (j): abs(j);
+      indx_j = (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1);
+      indx_j = ncol * indx_j;
+
+      for(i=x-sr; i<=x+sr; ++i){
+
+        indx_ij = (i >= 0) ? (i): abs(i);
+        indx_ij = (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1);
+
+        indx_ij = (indx_ij + indx_j);
+
+        indx_ij *= npatpoint;
+
+
+        d1 = *(__global float16*) (data + indx_ij+16*z);
+
+
+        d1 *= (float16) d[count];
+        d1 *= mask0;
+
+        *(__global float16*) (dataout + indx0+16*z) += d1;
+
+        //writeadd16(&(dataout[indx0+16*z]), &d1 );
+
+        //d[count+nnn*indx_xy] += dd;
+        //n[count+nnn*indx_xy] += sum16(&mask1);
+
+        count += 1;
+      }
+    }
+  }
+
+
+
+}
+
+
+
+
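Both the distance and weight loops above reflect out-of-range neighbor indices back into the scan rather than clamping them at the border. A minimal Python restatement of that mirroring (an illustrative helper, not part of the patch):

    def mirror_index(i, n):
        # reflect an out-of-range index into [0, n-1], as in the kernel:
        # indx = (i >= 0) ? i : abs(i); indx = (indx < n) ? indx : n - (indx - n + 1)
        i = abs(i)               # below zero: -1 -> 1, -2 -> 2
        if i >= n:
            i = n - (i - n + 1)  # past the end: n -> n-1, n+1 -> n-2
        return i

    # e.g. mirror_index(-2, 5) == 2; mirror_index(6, 5) == 3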
diff --git a/pyebsdindex/opencl/nlpar_cl.py b/pyebsdindex/opencl/nlpar_cl.py
index 0e847b9..78fb34b 100644
--- a/pyebsdindex/opencl/nlpar_cl.py
+++ b/pyebsdindex/opencl/nlpar_cl.py
@@ -14,10 +14,14 @@ def __init__( self, **kwargs):
     self.useCPU = False
 
   def calcsigma(self,nn=1, saturation_protect=True,automask=True, **kwargs):
-    return self.calcsigmacl(nn=nn,
+    return self.calcsigma_cl(nn=nn,
                             saturation_protect=saturation_protect,
                             automask=automask, **kwargs)[0]
-  def calcsigmacl(self,nn=1,saturation_protect=True,automask=True, gpuid=None, **kwargs):
+  def calcsigma_cpu(self,nn=1, saturation_protect=True,automask=True, **kwargs):
+    return nlpar.NLPAR.calcsigma(self, nn=nn,
+                                 saturation_protect=saturation_protect,
+                                 automask=automask, **kwargs)
+
+  def calcsigma_cl(self,nn=1,saturation_protect=True,automask=True, gpuid=None, **kwargs):
 
     if gpuid is None:
       clparams = openclparam.OpenClParam()
@@ -158,7 +162,219 @@ def calcsigmacl(self,nn=1,saturation_protect=True,automask=True, gpuid=None, **kwargs):
     datapad_gpu.release()
 
     return sigma, dist
 
-  def _calcchunks(self, patdim, ncol, nrow, target_bytes=4e9, col_overlap=0, row_overlap=0, col_offset=0, row_offset=0):
+  def calcnlpar_cl(self, chunksize=0, searchradius=None, lam=None, dthresh=None, saturation_protect=True, automask=True,
+                   filename=None, fileout=None, reset_sigma=True, backsub=False, rescale=False, gpuid=None, **kwargs):
+
+    if lam is not None:
+      self.lam = lam
+
+    if dthresh is not None:
+      self.dthresh = dthresh
+    if self.dthresh is None:
+      self.dthresh = 0.0
+
+    if searchradius is not None:
+      self.searchradius = searchradius
+
+    lam = np.float32(self.lam)
+    dthresh = np.float32(self.dthresh)
+    sr = np.int64(self.searchradius)
+
+    if filename is not None:
+      self.setfile(filepath=filename)
+
+    patternfile = self.getinfileobj()
+    self.setoutfile(patternfile, filepath=fileout)
+    patternfileout = self.getoutfileobj()
+
+    nrows = np.int64(self.nrows)  # np.uint64(patternfile.nRows)
+    ncols = np.int64(self.ncols)  # np.uint64(patternfile.nCols)
+
+    pwidth = np.uint64(patternfile.patternW)
+    pheight = np.uint64(patternfile.patternH)
+    npat_point = int(pwidth * pheight)
+
+    if reset_sigma:
+      self.sigma = None
+
+    if (self.sigma is not None) and (np.asarray(self.sigma).size == 1):
+      tmp = np.asarray(self.sigma)[0]
+      self.sigma = np.zeros((nrows, ncols), dtype=np.float32) + tmp
+      calcsigma = False
+
+    if self.sigma is not None:
+      shpsigma = np.asarray(self.sigma).shape
+      if (shpsigma[0] != nrows) or (shpsigma[1] != ncols):  # a mismatch in either dimension invalidates sigma
+        self.sigma = None
+
+    if self.sigma is None:
+      self.sigma = self.calcsigma_cl(nn=1, saturation_protect=saturation_protect, automask=automask, gpuid=gpuid)[0]
+
+    sigma = np.asarray(self.sigma).astype(np.float32)
+
+    if (automask is True) and (self.mask is None):
+      self.mask = (self.automask(pheight, pwidth))
+    if self.mask is None:
+      self.mask = np.ones((pheight, pwidth), dtype=np.uint8)
+
+    scalemethod = 'clip'
+    if rescale == True:
+      if np.issubdtype(patternfileout.filedatatype, np.integer):
+        mxval = np.iinfo(patternfileout.filedatatype).max
+        scalemethod = 'fullscale'
+      else:  # not int, so no rescale.
+        rescale = False
+
+    if gpuid is None:
+      clparams = openclparam.OpenClParam()
+      clparams.get_gpu()
+      target_mem = 0
+      gpuid = 0
+      count = 0
+
+      for gpu in clparams.gpu:
+        gmem = gpu.max_mem_alloc_size
+        if target_mem < gmem:
+          gpuid = count
+          target_mem = gmem
+        count += 1
+    else:
+      clparams = openclparam.OpenClParam()
+      clparams.get_gpu()
+      gpuid = min(len(clparams.gpu)-1, gpuid)
+
+    #print(gpuid)
+    clparams.get_context(gpu_id=gpuid, kfile='clnlpar.cl')
+    clparams.get_queue()
+    target_mem = clparams.queue.device.max_mem_alloc_size//4
+    ctx = clparams.ctx
+    prg = clparams.prg
+    queue = clparams.queue
+    mf = clparams.memflags
+    clvectlen = 16
+
+    chunks = self._calcchunks([pwidth, pheight], ncols, nrows, target_bytes=target_mem,
+                              col_overlap=sr, row_overlap=sr)
+
+    print(lam, sr, dthresh)
+
+    # precalculate some needed arrays for the GPU
+    mask = self.mask.astype(np.float32)
+
+    npad = clvectlen * int(np.ceil(mask.size/clvectlen))
+    maskpad = np.zeros(npad, np.float32) - 1  # negative numbers will indicate a clvector overflow.
+    maskpad[0:mask.size] = mask.reshape(-1).astype(np.float32)
+    mask_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=maskpad)
+
+    npatsteps = int(maskpad.size/clvectlen)  # how many clvector chunks to move through a pattern.
+
+    chunksize = (chunks[2][:, 1] - chunks[2][:, 0]).reshape(1, -1) * \
+                (chunks[3][:, 1] - chunks[3][:, 0]).reshape(-1, 1)
+
+    #return chunks, chunksize
+    mxchunk = int(chunksize.max())
+    npadmx = clvectlen * int(np.ceil(float(mxchunk)*npat_point/clvectlen))
+
+    datapad_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4))
+    datapadout_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4))
+
+    nnn = int((2 * sr + 1) ** 2)
+
+    for colchunk in range(chunks[0]):
+      cstart = chunks[2][colchunk, 0]
+      cend = chunks[2][colchunk, 1]
+      ncolchunk = cend - cstart
+
+      cstartcalc = sr if (colchunk > 0) else 0
+      cendcalc = ncolchunk-sr if (colchunk < (chunks[0]-1)) else ncolchunk
+      ncolcalc = np.int64(cendcalc - cstartcalc)
+
+      for rowchunk in range(chunks[1]):
+        rstart = chunks[3][rowchunk, 0]
+        rend = chunks[3][rowchunk, 1]
+        nrowchunk = rend - rstart
+
+        rstartcalc = sr if (rowchunk > 0) else 0
+        rendcalc = nrowchunk - sr if (rowchunk < (chunks[1] - 1)) else nrowchunk
+        nrowcalc = np.int64(rendcalc - rstartcalc)
+
+        data, xyloc = patternfile.read_data(patStartCount=[[cstart, rstart], [ncolchunk, nrowchunk]],
+                                            convertToFloat=False, returnArrayOnly=True)
+
+        mxval = data.max()
+        if saturation_protect == False:
+          mxval += 1.0
+        else:
+          mxval *= 0.9961
+
+        # pad the input buffer above maxlim and zero the output accumulator; the
+        # kernel sums into dataout, so stale values from the previous chunk must be cleared
+        filldatain = cl.enqueue_fill_buffer(queue, datapad_gpu, np.float32(mxval+10), 0, int(4*npadmx))
+        cl.enqueue_fill_buffer(queue, datapadout_gpu, np.float32(0.0), 0, int(4 * npadmx))
+
+        sigmachunk = np.ascontiguousarray(sigma[rstart:rend, cstart:cend].astype(np.float32))
+        sigmachunk_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=sigmachunk)
+        szdata = data.size
+        npad = clvectlen * int(np.ceil(szdata / clvectlen))
+
+        #datapad = np.zeros((npad), dtype=np.float32) + np.float32(mxval + 10)
+        #datapad[0:szdata] = data.reshape(-1)
+
+        data_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data)
+
+        if data.dtype.type is np.float32:
+          prg.nlloadpat32flt(queue, (np.uint64(data.size), 1), None, data_gpu, datapad_gpu, wait_for=[filldatain])
+        if data.dtype.type is np.ubyte:
+          prg.nlloadpat8bit(queue, (np.uint64(data.size), 1), None, data_gpu, datapad_gpu, wait_for=[filldatain])
+        if data.dtype.type is np.uint16:
+          prg.nlloadpat16bit(queue, (np.uint64(data.size), 1), None, data_gpu, datapad_gpu, wait_for=[filldatain])
+
+        calclim = np.array([cstartcalc, rstartcalc, ncolchunk, nrowchunk], dtype=np.int64)
+        crlimits_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=calclim)
+
+        prg.calcnlpar(queue, (np.uint32(ncolcalc), np.uint32(nrowcalc)), None,
+        #prg.calcnlpar(queue, (1, 1), None,
+                      datapad_gpu,
+                      mask_gpu,
+                      sigmachunk_gpu,
+                      crlimits_gpu,
+                      datapadout_gpu,
+                      np.int64(sr),
+                      np.int64(npatsteps),
+                      np.int64(npat_point),
+                      np.float32(mxval),
+                      np.float32(1.0/lam**2),
+                      np.float32(dthresh))
+
+        data = data.astype(np.float32)  # prepare to receive data back from GPU
+        data.reshape(-1)[:] = 0.0
+        queue.finish()
+        sigmachunk_gpu.release()
+        cl.enqueue_copy(queue, data, datapadout_gpu).wait()
+
+        if rescale == True:
+          for i in range(data.shape[0]):
+            temp = data[i, :, :]
+            temp -= temp.min()
+            temp *= np.float32(mxval) / temp.max()
+            data[i, :, :] = temp
+
+        patternfileout.write_data(newpatterns=data, patStartCount=[[cstart, rstart], [ncolchunk, nrowchunk]],
+                                  flt2int='clip', scalevalue=1.0)
+
+    return str(patternfileout.filepath)
+
+
+
+
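A hypothetical end-to-end use of the new GPU path follows. The class name and constructor are assumed from context (the methods above call into nlpar.NLPAR, so the class subclasses it); file names and parameter values are illustrative only:

    from pyebsdindex.opencl import nlpar_cl

    # NLPAR is an assumed class name; __init__(**kwargs) is shown above, but its
    # accepted keywords are not part of this patch.
    nlp = nlpar_cl.NLPAR(filename='scan1.up2')
    nlp.lam = 0.9
    nlp.searchradius = 3
    nlp.dthresh = 0.0
    outpath = nlp.calcnlpar_cl(fileout='scan1_nlpar.up2', gpuid=0)
    print(outpath)  # calcnlpar_cl returns str(patternfileout.filepath)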
+  def _calcchunks(self, patdim, ncol, nrow, target_bytes=2e9, col_overlap=0, row_overlap=0, col_offset=0, row_offset=0):
 
     col_overlap = min(col_overlap, ncol - 1)
     row_overlap = min(row_overlap, nrow - 1)
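With calcsigma_cpu and calcsigma_cl both exposed, the two sigma estimates can be cross-checked directly. A sketch, assuming nlp from the example above and that the CPU method returns the sigma map with the same shape as the OpenCL one:

    import numpy as np

    sig_cpu = np.asarray(nlp.calcsigma_cpu(nn=1))
    sig_cl = np.asarray(nlp.calcsigma_cl(nn=1)[0])
    print(np.abs(sig_cpu - sig_cl).max())  # expect agreement to roughly float32 precision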