diff --git a/pyebsdindex/opencl/clnlpar.cl b/pyebsdindex/opencl/clnlpar.cl
index 54a8844..14bd0d3 100644
--- a/pyebsdindex/opencl/clnlpar.cl
+++ b/pyebsdindex/opencl/clnlpar.cl
@@ -23,6 +23,28 @@ float sum16(const float16 *v1){
 
 }
 
+void writeadd16(__global float *placetowrite, float16 *v1);
+void writeadd16(__global float *placetowrite, float16 *v1){
+
+  placetowrite[0] += v1[0].s0;
+  placetowrite[1] += v1[0].s1;
+  placetowrite[2] += v1[0].s2;
+  placetowrite[3] += v1[0].s3;
+  placetowrite[4] += v1[0].s4;
+  placetowrite[5] += v1[0].s5;
+  placetowrite[6] += v1[0].s6;
+  placetowrite[7] += v1[0].s7;
+  placetowrite[8] += v1[0].s8;
+  placetowrite[9] += v1[0].s9;
+  placetowrite[10] += v1[0].sa;
+  placetowrite[11] += v1[0].sb;
+  placetowrite[12] += v1[0].sc;
+  placetowrite[13] += v1[0].sd;
+  placetowrite[14] += v1[0].se;
+  placetowrite[15] += v1[0].sf;
+
+}
+
 void print16(const float16 v1);
 void print16(const float16 v1){
 
@@ -46,30 +68,26 @@ void print16(const float16 v1){
 
 }
 
-__kernel void nlloadpat8bit(const __global uchar *datain, __global float *dataout){
+__kernel void nlloadpat8bit(__global uchar *datain, __global float *dataout){
   const unsigned long int x = get_global_id(0);
-  uchar imVal;
-  float imValflt;
-  imVal = datain[x];
-  imValflt = convert_float(imVal);
+  uchar imVal = datain[x];
+  float imValflt = convert_float(imVal);
   dataout[x] = imValflt;
 }
 
-__kernel void nlloadpat16bit(const __global ushort *datain, __global float *dataout){
+__kernel void nlloadpat16bit(__global ushort *datain, __global float *dataout){
   const unsigned long int x = get_global_id(0);
-  ushort imVal;
-  float imValflt;
-  imVal = datain[x];
-  imValflt = convert_float(imVal);
+  ushort imVal = datain[x];
+  float imValflt = convert_float(imVal);
   dataout[x] = imValflt;
 }
 
-__kernel void nlloadpat32flt(const __global float *datain, __global float *dataout){
+__kernel void nlloadpat32flt(__global float *datain, __global float *dataout){
   const unsigned long int x = get_global_id(0);
   dataout[x] = datain[x];
 }
 
-__kernel void calcsigma( const __global float *data, const __global float16 *mask,
+__kernel void calcsigma( __global float *data, __global float16 *mask,
                __global float *sig, __global float *dg, __global float *ng,
      const long nn, const long ndatchunk, const long npatpoint, const float maxlim){
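The hunk below adds the main calcnlpar kernel. For each map point it accumulates, over every neighbor in the (2*sr+1) x (2*sr+1) search window, the masked squared difference to the neighbor pattern, recenters that distance by the expected noise contribution n*(sigma0^2 + sigmaj^2), scales it by (sigma0^2 + sigmaj^2)*sqrt(2n), soft-thresholds it with dthresh, and exponentiates it with lam2 = 1/lambda^2. A rough NumPy restatement of the per-neighbor weight (a sketch for orientation only, not part of the patch; all names are illustrative):

    import numpy as np

    def nlpar_weight(p0, pj, sigma0, sigmaj, lam, dthresh, mask):
        n = mask.sum()                                  # pixels actually compared
        d2 = np.sum(((p0 - pj) * mask) ** 2)            # masked squared distance
        s2 = sigma0**2 + sigmaj**2                      # combined noise variance
        d2 = (d2 - n * s2) / (s2 * np.sqrt(2.0 * n))    # recenter and scale
        d2 = max(d2 - dthresh, 0.0)                     # soft threshold
        return np.exp(-d2 / lam**2)                     # unnormalized weight

The kernel then normalizes the weights to sum to one before applying them to the patterns.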
@@ -179,3 +197,199 @@ __kernel void calcsigma( const __global float *data, const __global float16 *mask,
 
 
 
+__kernel void calcnlpar(
+      const __global float *data,
+      const __global float16 *mask,
+      const __global float *sigma,
+      const __global long *crlimits,
+      __global float *dataout,
+      const long sr,
+      const long ndatchunk,
+      const long npatpoint,
+      const float maxlim,
+      const float lam2,
+      const float dthresh){
+  //IDs of work-item represent x and y coordinates in image
+  //const long4 calclim = crlimits[0];
+  const long x = get_global_id(0)+crlimits[0];
+  const long y = get_global_id(1)+crlimits[1];
+  const long ncol = crlimits[2]; //get_global_size(0);
+  const long nrow = crlimits[3]; //get_global_size(1);
+  const long indx_xy = x+y*ncol;
+  //printf("%d\n", indx_xy);
+  const long indx0 = npatpoint * indx_xy;
+  //printf("%d, %d, %d, %d\n", x,y,ncol, nrow);
+  long i, j, z;
+  long indx_j, indx_ij, count;
+
+  long nnn = (2*sr+1) * (2*sr+1);
+
+  float16 d1, d0;
+  float16 mask0, mask1;
+  float dd, nn, sigma_ij, norm;
+  float sigma0 = sigma[indx_xy];
+  sigma0 *= sigma0;
+  //float16* writeloc;
+
+  float d[512]; // taking a risk here that no one will want a SR > 10
+  float n[512];
+
+
+  for(j=0; j < nnn; ++j){
+    d[j] = 0.0;
+    n[j] = 1.0e-6;
+  }
+
+
+  for(z = 0; z<ndatchunk; ++z){
+    count = 0;
+
+    d0 = *(__global float16*) (data + indx0+16*z);
+
+    mask0 = (mask[z] > (float16) 1.e-3) ? (float16) 1.0 : (float16) 0.0;
+    mask0 = d0 < (float16) (maxlim) ? mask0 : (float16) (0.0);
+
+
+    for(j=y-sr; j<=y+sr; ++j){
+
+      indx_j = (j >= 0) ? (j): abs(j);
+      indx_j = (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1);
+      indx_j = ncol * indx_j;
+
+      for(i=x-sr; i<=x+sr; ++i){
+
+        indx_ij = (i >= 0) ? (i): abs(i);
+        indx_ij = (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1);
+
+        indx_ij = (indx_ij + indx_j);
+
+        indx_ij *= npatpoint;
+
+        mask1 = mask0;
+        d1 = *(__global float16*) (data + indx_ij+16*z);
+
+
+        mask1 = d1 < (float16) (maxlim) ? mask1 : (float16) (0.0);
+        //mask1 = select((float16) (1.0), (float16) (0.0), isgreater(mask0, (float16)(1e-6)) && isgreater(mask1,(float16)(1e-6)));
+        //printf("%*s\n", 'here');
+
+        d1 = (d0-d1);
+        d1 *= d1;
+        d1 *= mask1;
+
+        dd = sum16(&d1);
+        dd = (indx_ij == indx0) ? -1.0 : dd; // mark the center point
+
+        d[count] += dd;
+        n[count] += sum16(&mask1);
+        //d[count+nnn*indx_xy] += dd;
+        //n[count+nnn*indx_xy] += sum16(&mask1);
+
+        count += 1;
+      }
+    }
+
+  }
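+// At this point d[k] holds the accumulated, masked squared distance between the
+// center pattern and neighbor k, and n[k] counts the pixels compared; the center
+// point was flagged negative above, so its weight falls out below as exp(0) = 1.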
+// calculate the weights
+  count = 0;
+  float sum = 0.0;
+  nn = 1.0;
+  for(j=y-sr; j<=y+sr; ++j){
+
+    indx_j = (j >= 0) ? (j): abs(j);
+    indx_j = (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1);
+    indx_j = ncol * indx_j;
+
+    for(i=x-sr; i<=x+sr; ++i){
+
+      dd = d[count];
+      if (dd > 1.e-3){
+        indx_ij = (i >= 0) ? (i): abs(i);
+        indx_ij = (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1);
+
+        indx_ij = (indx_ij + indx_j);
+
+        sigma_ij = sigma[indx_ij]; // the neighbor's sigma; sigma[indx_xy] here would leave indx_ij unused
+        sigma_ij *= sigma_ij;
+        nn = n[count];
+        dd -= nn*(sigma_ij+sigma0);
+
+        norm = (sigma0 + sigma_ij)*sqrt(2.0*nn);
+        if (norm > 1.0e-8){
+          dd /= norm;
+        } else {
+          dd = 1e6*nn;
+        }
+
+      } else {
+        dd = 0.0;
+        nn = 1.0;
+      }
+
+      dd -= dthresh;
+      dd = dd >= 0.0 ? dd : 0.0;
+
+      dd = exp(-1.0*dd*lam2);
+      sum += dd;
+      d[count] = dd;
+
+      count += 1;
+
+    }
+  }
+
+  for (j=0; j<nnn; ++j){
+    d[j] /= sum;
+  }
+
+// now apply the weights to the data
+  for(z = 0; z<ndatchunk; ++z){
+    count = 0;
+
+    mask0 = (mask[z] > (float16) -1.e-3) ? (float16) 1.0 : (float16) 0.0;
+
+    for(j=y-sr; j<=y+sr; ++j){
+
+      indx_j = (j >= 0) ? (j): abs(j);
+      indx_j = (indx_j < nrow) ? (indx_j): nrow - (indx_j -nrow +1);
+      indx_j = ncol * indx_j;
+
+      for(i=x-sr; i<=x+sr; ++i){
+
+        indx_ij = (i >= 0) ? (i): abs(i);
+        indx_ij = (indx_ij < ncol) ? (indx_ij): ncol - (indx_ij -ncol +1);
+
+        indx_ij = (indx_ij + indx_j);
+
+        indx_ij *= npatpoint;
+
+
+        d1 = *(__global float16*) (data + indx_ij+16*z);
+
+
+        d1 *= (float16) d[count];
+        d1 *= mask0;
+
+        *(__global float16*) (dataout + indx0+16*z) += d1;
+
+        //writeadd16(&(dataout[indx0+16*z]), &d1 );
+
+        //d[count+nnn*indx_xy] += dd;
+        //n[count+nnn*indx_xy] += sum16(&mask1);
+
+        count += 1;
+      }
+    }
+  }
+
+
+
+}
+
+
+
+
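Both the distance and weight loops above reflect out-of-range neighbor indices back into the scan rather than clamping them at the border. A minimal Python restatement of that mirroring (an illustrative helper, not part of the patch):

    def mirror_index(i, n):
        # reflect an out-of-range index into [0, n-1], as in the kernel:
        # indx = (i >= 0) ? i : abs(i); indx = (indx < n) ? indx : n - (indx - n + 1)
        i = abs(i)               # below zero: -1 -> 1, -2 -> 2
        if i >= n:
            i = n - (i - n + 1)  # past the end: n -> n-1, n+1 -> n-2
        return i

    # e.g. mirror_index(-2, 5) == 2; mirror_index(6, 5) == 3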
diff --git a/pyebsdindex/opencl/nlpar_cl.py b/pyebsdindex/opencl/nlpar_cl.py
index 0e847b9..78fb34b 100644
--- a/pyebsdindex/opencl/nlpar_cl.py
+++ b/pyebsdindex/opencl/nlpar_cl.py
@@ -14,10 +14,14 @@ def __init__( self, **kwargs):
     self.useCPU = False
 
   def calcsigma(self,nn=1, saturation_protect=True,automask=True, **kwargs):
-    return self.calcsigmacl(nn=nn,
+    return self.calcsigma_cl(nn=nn,
                             saturation_protect=saturation_protect,
                             automask=automask, **kwargs)[0]
-  def calcsigmacl(self,nn=1,saturation_protect=True,automask=True, gpuid=None, **kwargs):
+  def calcsigma_cpu(self,nn=1, saturation_protect=True,automask=True, **kwargs):
+    return nlpar.NLPAR.calcsigma(self, nn=nn,
+                                 saturation_protect=saturation_protect,
+                                 automask=automask, **kwargs)
+
+  def calcsigma_cl(self,nn=1,saturation_protect=True,automask=True, gpuid=None, **kwargs):
 
     if gpuid is None:
       clparams = openclparam.OpenClParam()
@@ -158,7 +162,219 @@ def calcsigmacl(self,nn=1,saturation_protect=True,automask=True, gpuid=None, **kwargs):
     datapad_gpu.release()
 
     return sigma, dist
 
-  def _calcchunks(self, patdim, ncol, nrow, target_bytes=4e9, col_overlap=0, row_overlap=0, col_offset=0, row_offset=0):
+  def calcnlpar_cl(self, chunksize=0, searchradius=None, lam=None, dthresh=None, saturation_protect=True, automask=True,
+                   filename=None, fileout=None, reset_sigma=True, backsub=False, rescale=False, gpuid=None, **kwargs):
+
+    if lam is not None:
+      self.lam = lam
+
+    if dthresh is not None:
+      self.dthresh = dthresh
+    if self.dthresh is None:
+      self.dthresh = 0.0
+
+    if searchradius is not None:
+      self.searchradius = searchradius
+
+    lam = np.float32(self.lam)
+    dthresh = np.float32(self.dthresh)
+    sr = np.int64(self.searchradius)
+
+    if filename is not None:
+      self.setfile(filepath=filename)
+
+    patternfile = self.getinfileobj()
+    self.setoutfile(patternfile, filepath=fileout)
+    patternfileout = self.getoutfileobj()
+
+    nrows = np.int64(self.nrows)  # np.uint64(patternfile.nRows)
+    ncols = np.int64(self.ncols)  # np.uint64(patternfile.nCols)
+
+    pwidth = np.uint64(patternfile.patternW)
+    pheight = np.uint64(patternfile.patternH)
+    npat_point = int(pwidth * pheight)
+
+    if reset_sigma:
+      self.sigma = None
+
+    if (self.sigma is not None) and (np.asarray(self.sigma).size == 1):
+      tmp = np.asarray(self.sigma)[0]
+      self.sigma = np.zeros((nrows, ncols), dtype=np.float32) + tmp
+      calcsigma = False
+
+    if self.sigma is not None:
+      shpsigma = np.asarray(self.sigma).shape
+      if (shpsigma[0] != nrows) or (shpsigma[1] != ncols):  # a mismatch in either dimension invalidates sigma
+        self.sigma = None
+
+    if self.sigma is None:
+      self.sigma = self.calcsigma_cl(nn=1, saturation_protect=saturation_protect, automask=automask, gpuid=gpuid)[0]
+
+    sigma = np.asarray(self.sigma).astype(np.float32)
+
+    if (automask is True) and (self.mask is None):
+      self.mask = (self.automask(pheight, pwidth))
+    if self.mask is None:
+      self.mask = np.ones((pheight, pwidth), dtype=np.uint8)
+
+    scalemethod = 'clip'
+    if rescale == True:
+      if np.issubdtype(patternfileout.filedatatype, np.integer):
+        mxval = np.iinfo(patternfileout.filedatatype).max
+        scalemethod = 'fullscale'
+      else:  # not int, so no rescale.
+        rescale = False
+
+    if gpuid is None:
+      clparams = openclparam.OpenClParam()
+      clparams.get_gpu()
+      target_mem = 0
+      gpuid = 0
+      count = 0
+
+      for gpu in clparams.gpu:
+        gmem = gpu.max_mem_alloc_size
+        if target_mem < gmem:
+          gpuid = count
+          target_mem = gmem
+        count += 1
+    else:
+      clparams = openclparam.OpenClParam()
+      clparams.get_gpu()
+      gpuid = min(len(clparams.gpu)-1, gpuid)
+
+    #print(gpuid)
+    clparams.get_context(gpu_id=gpuid, kfile='clnlpar.cl')
+    clparams.get_queue()
+    target_mem = clparams.queue.device.max_mem_alloc_size//4
+    ctx = clparams.ctx
+    prg = clparams.prg
+    queue = clparams.queue
+    mf = clparams.memflags
+    clvectlen = 16
+
+    chunks = self._calcchunks([pwidth, pheight], ncols, nrows, target_bytes=target_mem,
+                              col_overlap=sr, row_overlap=sr)
+
+    print(lam, sr, dthresh)
+
+    # precalculate some needed arrays for the GPU
+    mask = self.mask.astype(np.float32)
+
+    npad = clvectlen * int(np.ceil(mask.size/clvectlen))
+    maskpad = np.zeros(npad, np.float32) - 1  # negative numbers will indicate a clvector overflow.
+    maskpad[0:mask.size] = mask.reshape(-1).astype(np.float32)
+    mask_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=maskpad)
+
+    npatsteps = int(maskpad.size/clvectlen)  # how many clvector chunks to move through a pattern.
+
+    chunksize = (chunks[2][:, 1] - chunks[2][:, 0]).reshape(1, -1) * \
+                (chunks[3][:, 1] - chunks[3][:, 0]).reshape(-1, 1)
+
+    #return chunks, chunksize
+    mxchunk = int(chunksize.max())
+    npadmx = clvectlen * int(np.ceil(float(mxchunk)*npat_point/clvectlen))
+
+    datapad_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4))
+    datapadout_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4))
+
+    nnn = int((2 * sr + 1) ** 2)
+
+    for colchunk in range(chunks[0]):
+      cstart = chunks[2][colchunk, 0]
+      cend = chunks[2][colchunk, 1]
+      ncolchunk = cend - cstart
+
+      cstartcalc = sr if (colchunk > 0) else 0
+      cendcalc = ncolchunk-sr if (colchunk < (chunks[0]-1)) else ncolchunk
+      ncolcalc = np.int64(cendcalc - cstartcalc)
+
+      for rowchunk in range(chunks[1]):
+        rstart = chunks[3][rowchunk, 0]
+        rend = chunks[3][rowchunk, 1]
+        nrowchunk = rend - rstart
+
+        rstartcalc = sr if (rowchunk > 0) else 0
+        rendcalc = nrowchunk - sr if (rowchunk < (chunks[1] - 1)) else nrowchunk
+        nrowcalc = np.int64(rendcalc - rstartcalc)
+
+        data, xyloc = patternfile.read_data(patStartCount=[[cstart, rstart], [ncolchunk, nrowchunk]],
+                                            convertToFloat=False, returnArrayOnly=True)
+
+        mxval = data.max()
+        if saturation_protect == False:
+          mxval += 1.0
+        else:
+          mxval *= 0.9961
+
+        # pad the input buffer above maxlim and zero the output accumulator; the
+        # kernel sums into dataout, so stale values from the previous chunk must be cleared
+        filldatain = cl.enqueue_fill_buffer(queue, datapad_gpu, np.float32(mxval+10), 0, int(4*npadmx))
+        cl.enqueue_fill_buffer(queue, datapadout_gpu, np.float32(0.0), 0, int(4 * npadmx))
+
+        sigmachunk = np.ascontiguousarray(sigma[rstart:rend, cstart:cend].astype(np.float32))
+        sigmachunk_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=sigmachunk)
+        szdata = data.size
+        npad = clvectlen * int(np.ceil(szdata / clvectlen))
+
+        #datapad = np.zeros((npad), dtype=np.float32) + np.float32(mxval + 10)
+        #datapad[0:szdata] = data.reshape(-1)
+
+        data_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data)
+
+        if data.dtype.type is np.float32:
+          prg.nlloadpat32flt(queue, (np.uint64(data.size), 1), None, data_gpu, datapad_gpu, wait_for=[filldatain])
+        if data.dtype.type is np.ubyte:
+          prg.nlloadpat8bit(queue, (np.uint64(data.size), 1), None, data_gpu, datapad_gpu, wait_for=[filldatain])
+        if data.dtype.type is np.uint16:
+          prg.nlloadpat16bit(queue, (np.uint64(data.size), 1), None, data_gpu, datapad_gpu, wait_for=[filldatain])
+
+        calclim = np.array([cstartcalc, rstartcalc, ncolchunk, nrowchunk], dtype=np.int64)
+        crlimits_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=calclim)
+
+        prg.calcnlpar(queue, (np.uint32(ncolcalc), np.uint32(nrowcalc)), None,
+        #prg.calcnlpar(queue, (1, 1), None,
+                      datapad_gpu,
+                      mask_gpu,
+                      sigmachunk_gpu,
+                      crlimits_gpu,
+                      datapadout_gpu,
+                      np.int64(sr),
+                      np.int64(npatsteps),
+                      np.int64(npat_point),
+                      np.float32(mxval),
+                      np.float32(1.0/lam**2),
+                      np.float32(dthresh))
+
+        data = data.astype(np.float32)  # prepare to receive data back from GPU
+        data.reshape(-1)[:] = 0.0
+        queue.finish()
+        sigmachunk_gpu.release()
+        cl.enqueue_copy(queue, data, datapadout_gpu).wait()
+
+        if rescale == True:
+          for i in range(data.shape[0]):
+            temp = data[i, :, :]
+            temp -= temp.min()
+            temp *= np.float32(mxval) / temp.max()
+            data[i, :, :] = temp
+
+        patternfileout.write_data(newpatterns=data, patStartCount=[[cstart, rstart], [ncolchunk, nrowchunk]],
+                                  flt2int='clip', scalevalue=1.0)
+
+    return str(patternfileout.filepath)
+
+
+
+
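A hypothetical end-to-end use of the new GPU path follows. The class name and constructor are assumed from context (the methods above call into nlpar.NLPAR, so the class subclasses it); file names and parameter values are illustrative only:

    from pyebsdindex.opencl import nlpar_cl

    # NLPAR is an assumed class name; __init__(**kwargs) is shown above, but its
    # accepted keywords are not part of this patch.
    nlp = nlpar_cl.NLPAR(filename='scan1.up2')
    nlp.lam = 0.9
    nlp.searchradius = 3
    nlp.dthresh = 0.0
    outpath = nlp.calcnlpar_cl(fileout='scan1_nlpar.up2', gpuid=0)
    print(outpath)  # calcnlpar_cl returns str(patternfileout.filepath)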
+  def _calcchunks(self, patdim, ncol, nrow, target_bytes=2e9, col_overlap=0, row_overlap=0, col_offset=0, row_offset=0):
 
     col_overlap = min(col_overlap, ncol - 1)
     row_overlap = min(row_overlap, nrow - 1)
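With calcsigma_cpu and calcsigma_cl both exposed, the two sigma estimates can be cross-checked directly. A sketch, assuming nlp from the example above and that the CPU method returns the sigma map with the same shape as the OpenCL one:

    import numpy as np

    sig_cpu = np.asarray(nlp.calcsigma_cpu(nn=1))
    sig_cl = np.asarray(nlp.calcsigma_cl(nn=1)[0])
    print(np.abs(sig_cpu - sig_cl).max())  # expect agreement to roughly float32 precision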