From 59a6fbde1afbccefebeb91376ddb6f037a4bf5be Mon Sep 17 00:00:00 2001 From: Kevin Stratford Date: Fri, 27 Dec 2024 19:23:16 +0000 Subject: [PATCH] Updated LE to allow GPU --- src/lb_data.c | 53 ++-- src/ludwig.c | 2 +- src/model_le.c | 686 ++++++++++++++++++++++++++++++------------------- src/model_le.h | 4 +- 4 files changed, 457 insertions(+), 288 deletions(-) diff --git a/src/lb_data.c b/src/lb_data.c index f22b3e4da..b5cfa23ff 100644 --- a/src/lb_data.c +++ b/src/lb_data.c @@ -200,19 +200,20 @@ int lb_data_create(pe_t * pe, cs_t * cs, const lb_data_options_t * options, if (obj->model.cv[p][X] == +1) nprop += 1; } - int ncrossdist = ndist*nprop*nlocal[Y]*nlocal[Z]; - int ndata = 2*nplane*ncrossdist; /* 2 sides for each plane */ - obj->sbuff = (double *) malloc(ndata*sizeof(double)); - obj->rbuff = (double *) malloc(ndata*sizeof(double)); + /* Lees Edwards buffer for crossing distributions */ + int nxdist = ndist*nprop*(nlocal[Y] + 1)*nlocal[Z]; + int nxbuff = 2*nplane*nxdist; /* 2 sides for each plane */ + obj->sbuff = (double *) malloc(nxbuff*sizeof(double)); + obj->rbuff = (double *) malloc(nxbuff*sizeof(double)); tdpGetDeviceCount(&ndevice); if (ndevice > 0) { double * tmp = NULL; - tdpAssert( tdpMalloc((void **) &tmp, ndata*sizeof(double)) ); + tdpAssert( tdpMalloc((void **) &tmp, nxbuff*sizeof(double)) ); tdpAssert( tdpMemcpy(&obj->target->sbuff, &tmp, sizeof(double *), tdpMemcpyHostToDevice) ); - tdpAssert( tdpMalloc((void **) &tmp, ndata*sizeof(double)) ); + tdpAssert( tdpMalloc((void **) &tmp, nxbuff*sizeof(double)) ); tdpAssert( tdpMemcpy(&obj->target->rbuff, &tmp, sizeof(double *), tdpMemcpyHostToDevice) ); } @@ -251,6 +252,13 @@ __host__ int lb_free(lb_t * lb) { tdpAssert( tdpMemcpy(&tmp, &lb->target->fprime, sizeof(double *), tdpMemcpyDeviceToHost) ); tdpAssert( tdpFree(tmp) ); + tdpAssert( tdpMemcpy(&tmp, &lb->target->sbuff, sizeof(double *), + tdpMemcpyDeviceToHost) ); + tdpAssert( tdpFree(tmp) ); + tdpAssert( tdpMemcpy(&tmp, &lb->target->rbuff, sizeof(double *), + tdpMemcpyDeviceToHost) ); + tdpAssert( tdpFree(tmp) ); + tdpAssert( tdpFree(lb->target) ); } @@ -261,7 +269,6 @@ __host__ int lb_free(lb_t * lb) { free(lb->f); free(lb->fprime); - /* FIXME: device buffer to be removed */ free(lb->rbuff); free(lb->sbuff); @@ -291,6 +298,8 @@ int lb_data_initialise_device_model(lb_t * lb) { if (ndevice > 0) { int nvel = lb->model.nvel; + lb_model_t * hm = &lb->model; + lb_model_t * dm = &lb->target->model; int8_t (*d_cv)[3] = {0}; double * d_wv = NULL; double * d_na = NULL; @@ -299,26 +308,26 @@ int lb_data_initialise_device_model(lb_t * lb) { tdpAssert( tdpMalloc((void **) &d_wv, nvel*sizeof(double)) ); tdpAssert( tdpMalloc((void **) &d_na, nvel*sizeof(double)) ); - tdpAssert( tdpMemcpy(d_cv, &(lb->model.cv), nvel*sizeof(int8_t[3]), + tdpAssert( tdpMemcpy(d_cv, hm->cv, nvel*sizeof(int8_t[3]), tdpMemcpyHostToDevice) ); - tdpAssert( tdpMemcpy(d_wv, &(lb->model.wv), nvel*sizeof(double), + tdpAssert( tdpMemcpy(d_wv, hm->wv, nvel*sizeof(double), tdpMemcpyHostToDevice) ); - tdpAssert( tdpMemcpy(d_na, &(lb->model.na), nvel*sizeof(double), + tdpAssert( tdpMemcpy(d_na, hm->na, nvel*sizeof(double), tdpMemcpyHostToDevice) ); - tdpAssert( tdpMemcpy(&(lb->target->model.cv), &d_cv, sizeof(int8_t(*)[3]), + tdpAssert( tdpMemcpy(&(dm->cv), &d_cv, sizeof(int8_t(*)[3]), tdpMemcpyHostToDevice) ); - tdpAssert( tdpMemcpy(&(lb->target->model.wv), &d_wv, sizeof(double *), + tdpAssert( tdpMemcpy(&(dm->wv), &d_wv, sizeof(double *), tdpMemcpyHostToDevice) ); - tdpAssert( tdpMemcpy(&(lb->target->model.na), &d_na, sizeof(double *), + tdpAssert( tdpMemcpy(&(dm->na), &d_na, sizeof(double *), tdpMemcpyHostToDevice) ); - tdpAssert( tdpMemcpy(&(lb->target->model.ndim), &(lb->model.ndim), - sizeof(int8_t), tdpMemcpyHostToDevice) ); - tdpAssert( tdpMemcpy(&(lb->target->model.nvel), &(lb->model.nvel), - sizeof(int8_t), tdpMemcpyHostToDevice) ); - tdpAssert( tdpMemcpy(&(lb->target->model.cs2), &(lb->model.cs2), - sizeof(double), tdpMemcpyHostToDevice) ); + tdpAssert( tdpMemcpy(&(dm->ndim), &(hm->ndim), sizeof(int8_t), + tdpMemcpyHostToDevice) ); + tdpAssert( tdpMemcpy(&(dm->nvel), &(hm->nvel), sizeof(int8_t), + tdpMemcpyHostToDevice) ); + tdpAssert( tdpMemcpy(&(dm->cs2), &(hm->cs2), sizeof(double), + tdpMemcpyHostToDevice) ); /* We do not copy the eigenvectors currently. */ } @@ -351,9 +360,9 @@ int lb_data_free_device_model(lb_t * lb) { tdpAssert( tdpMemcpy(&d_na, &lb->target->model.na, sizeof(double *), tdpMemcpyDeviceToHost) ); - tdpAssert( tdpFree(&d_cv) ); - tdpAssert( tdpFree(&d_wv) ); - tdpAssert( tdpFree(&d_na) ); + tdpAssert( tdpFree(d_cv) ); + tdpAssert( tdpFree(d_wv) ); + tdpAssert( tdpFree(d_na) ); } return 0; diff --git a/src/ludwig.c b/src/ludwig.c index cff116770..2e2b15617 100644 --- a/src/ludwig.c +++ b/src/ludwig.c @@ -806,7 +806,7 @@ void ludwig_run(const char * inputfile) { /* Boundary conditions */ if (ludwig->le) { - lb_le_apply_boundary_conditions(ludwig->lb, ludwig->le); + lb_data_apply_le_boundary_conditions(ludwig->lb, ludwig->le); } TIMER_start(TIMER_HALO_LATTICE); diff --git a/src/model_le.c b/src/model_le.c index 2b4fe62c1..f943b59e4 100644 --- a/src/model_le.c +++ b/src/model_le.c @@ -36,6 +36,17 @@ static int le_reproject(lb_t * lb, lees_edw_t * le); static int le_displace_and_interpolate(lb_t * lb, lees_edw_t * le); static int le_displace_and_interpolate_parallel(lb_t * lb, lees_edw_t * le); +#ifdef HAVE_OPENMPI_ +/* This provides MPIX_CUDA_AWARE_SUPPORT .. */ +#include "mpi-ext.h" +#endif + +#if defined (MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT +static const int have_gpu_aware_mpi_ = 1; +#else +static const int have_gpu_aware_mpi_ = 0; +#endif + /***************************************************************************** * * lb_le_apply_boundary_conditions @@ -55,7 +66,7 @@ static int le_displace_and_interpolate_parallel(lb_t * lb, lees_edw_t * le); * *****************************************************************************/ -__host__ int lb_le_apply_boundary_conditions_old(lb_t * lb, lees_edw_t * le) { +__host__ int lb_le_apply_boundary_conditions(lb_t * lb, lees_edw_t * le) { int mpi_cartsz[3]; @@ -715,243 +726,219 @@ int lb_le_init_shear_profile(lb_t * lb, lees_edw_t * le) { return 0; } - - - - -/* FIXME ADDITIONS */ +/***************************************************************************** + * + * The preceding functions are scheduled for removal. + * The following are operational... + * + *****************************************************************************/ /* Kernel helper structure intended to be passed by value to kernel */ typedef struct lek_s { int nlocal[3]; /* 12 */ int nplane; /* 16 */ - int nprop; /* 20 */ - int ndist; /* 24 */ - int nxdist; /* 28 */ + int ndist; /* 20 */ + int nxdist; /* 24 */ + int nxbuff; /* 28 */ + int nprop; /* 1 <= nprop <= 9 maximum */ + int8_t prop[2][9]; /* prop[0] is side 0 (cx +ve); */ + /* prop[1] is side 1 (cx -ve); */ + /* p values of cross-plane propagating distributions */ } le_kernel_helper_t; +static le_kernel_helper_t le_kernel_helper(lb_t * lb, lees_edw_t * le); + + +__global__ void lb_data_reproject_kernel(kernel_3d_t k3d, + le_kernel_helper_t lekh, lb_t * lb, + lees_edw_t * le, double t); +__global__ void lb_data_displace_kernel(kernel_3d_t k3d, + le_kernel_helper_t lekh, + lb_t * lb, + lees_edw_t * le, double t); +__global__ void lb_data_interpolate_kernel(kernel_3d_t k3d, + le_kernel_helper_t lekh, + lb_t * lb, + lees_edw_t * le, double t); + +static int lb_data_displace_communicate(le_kernel_helper_t lekh, + lb_t * lb, + lees_edw_t * le, + double t); + /***************************************************************************** * - * my_ibuf(iside, nlocal[Y], nlocal[Z], nplane, ndist, nprop) + * lb_data_apply_le_boundary_conditions + * + * Driver for the parallel update. * *****************************************************************************/ -__host__ __device__ static int my_ibuf(const le_kernel_helper_t * s, - int jc, int kc, int iplane, int iside, - int n, int p) { - int ib = 0; +int lb_data_apply_le_boundary_conditions(lb_t * lb, lees_edw_t * le) { - assert(s); - assert(1 <= jc && jc <= s->nlocal[Y]); /* FIXME recv has one more? */ - assert(1 <= kc && kc <= s->nlocal[Z]); - assert(0 <= iplane && iplane < s->nplane); - assert(0 <= iside && iside <= 1); - assert(0 <= n && n < s->ndist); - assert(0 <= p && p < s->nprop); - - ib = p + s->nprop*(n + s->ndist*(iplane + s->nplane* - (kc - 1 + s->nlocal[Z]*(jc - 1)))); + assert(lb); + assert(le); - //printf("p iplane, ib nxdisp %d %d (%d %d) %d %d\n", p, iplane, jc, kc, ib, s->nxdist); - assert(0 <= ib && ib < s->nxdist); /* Number of crossing distributions */ + int mpi_cartsz[3] = {0}; + le_kernel_helper_t lekh = le_kernel_helper(lb, le); - /* All same sides are together */ - ib = iside*s->nxdist + ib; + lees_edw_cartsz(le, mpi_cartsz); - return ib; -} + if (lekh.nplane == 0) { + /* No planes, no action. */ + } + else { + int ndevice = 0; + lees_edw_t * le_target; + double t = -1.0; + TIMER_start(TIMER_LE); -__global__ void lb_data_reproject_kernel(kernel_3d_t k3d, - le_kernel_helper_t lekh, lb_t * lb, - lees_edw_t * le, double t); -__global__ void lb_data_displace_and_interpolate(kernel_3d_t k3d, - le_kernel_helper_t lekh, - lb_t * lb, - lees_edw_t * le, double t); -__global__ void copy_back(kernel_3d_t k3d, lb_t * lb, lees_edw_t * le); + tdpAssert( tdpGetDeviceCount(&ndevice) ); + lees_edw_target(le, &le_target); -static le_kernel_helper_t le_kernel_helper(lb_t * lb, lees_edw_t * le); -static int le_displace_and_interpolate_parallel(lb_t *lb, lees_edw_t *le); + /* Require the time t = time step */ + { + physics_t * phys = NULL; + physics_ref(&phys); + t = 1.0*physics_control_timestep(phys); + } + /* First, the reprojected distributions are computed and stored to + * the "send" buffer for each side/plane. */ -/***************************************************************************** - * - * lb_data_displace_and_interpolate - * - * Parallel kernel for interpolation stage. - * - *****************************************************************************/ + { + int nx = 2*lekh.nplane; /* Two x positions (sides) for each plane */ + dim3 nblk = {0}; + dim3 ntpb = {0}; + cs_limits_t lim = {0, nx - 1, 1, lekh.nlocal[Y], 1, lekh.nlocal[Z]}; + kernel_3d_t k3d = kernel_3d(lb->cs, lim); -__global__ void lb_data_displace_and_interpolate(kernel_3d_t k3d, - le_kernel_helper_t lekh, - lb_t * lb, - lees_edw_t * le, double t) { - int kindex = 0; - int ndist = 0; - int nlocal[3] = {0}; - int nhalo = 0; - double ltot[3] = {0}; + kernel_3d_launch_param(k3d.kiterations, &nblk, &ntpb); - lb_ndist(lb, &ndist); - lees_edw_nlocal(le, nlocal); - lees_edw_nhalo(le, &nhalo); - lees_edw_ltot(le, ltot); + tdpLaunchKernel(lb_data_reproject_kernel, nblk, ntpb, 0, 0, + k3d, lekh, lb->target, le_target, t); + tdpAssert( tdpPeekAtLastError() ); + tdpAssert( tdpStreamSynchronize(0) ); + } - for_simt_parallel(kindex, k3d.kiterations, 1) { - int iplane = kernel_3d_ic(&k3d, kindex); /* encodes plane */ - int jc = kernel_3d_jc(&k3d, kindex); - int kc = kernel_3d_kc(&k3d, kindex); + /* Second, displacement. */ + if (have_gpu_aware_mpi_ || mpi_cartsz[Y] > 1) { + lb_data_displace_communicate(lekh, lb, le, t); + } + else { + /* The kernel form is only really required for the stub MPI case + * in serial; it could be removed if point-to-point messages were + * handled by the stub version. */ - if (jc <= nlocal[Y] && kc <= nlocal[Z] && iplane < lekh.nplane) { + int nx = 2*lekh.nplane; + dim3 nblk = {0}; + dim3 ntpb = {0}; + cs_limits_t lim = {0, nx - 1, 1, lekh.nlocal[Y] + 1, 1, lekh.nlocal[Z]}; + kernel_3d_t k3d = kernel_3d(lb->cs, lim); - int ic = lees_edw_plane_location(le, iplane); - double dy = 0.0; - int jdy = 0; - double fr = 0.0; - int iside = 0; + kernel_3d_launch_param(k3d.kiterations, &nblk, &ntpb); - lees_edw_buffer_displacement(le, nhalo, t, &dy); - dy = fmod(dy, ltot[Y]); - jdy = floor(dy); - fr = dy - jdy; + tdpLaunchKernel(lb_data_displace_kernel, nblk, ntpb, 0, 0, + k3d, lekh, lb->target, le_target, t); - int j1 = 1 + (jc + jdy - 1 + 2*nlocal[Y]) % nlocal[Y]; - int j2 = 1 + (j1 % nlocal[Y]); + tdpAssert( tdpPeekAtLastError() ); + tdpAssert( tdpStreamSynchronize(0) ); + } - int index0 = lees_edw_index(le, ic, jc, kc); - //int index1 = lees_edw_index(le, ic, j2, kc); - - for (int n = 0; n < ndist; n++) { - int ip = 0; /* FIXME avoid if by lookup table [iside][9] */ - for (int p = 1; p < lb->model.nvel; p++) { - if (lb->model.cv[p][X] == +1) { - //int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p); - //int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p); - /* FIXME .... */ - //int ibuf = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + ip + 2 * plane *nxdist; - //lb->sbuff[ibuf] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1]; - int ibuf0 = my_ibuf(&lekh, j1, kc, iplane, iside, n, ip); - int ibuf1 = my_ibuf(&lekh, j2, kc, iplane, iside, n, ip); - double f = (1.0 - fr)*lb->rbuff[ibuf0] + fr*lb->rbuff[ibuf1]; - lb_f_set(lb, index0, p, n, f); - ip++; - } - } - } + /* Lastly, the recv buffer is interpolated to reset the plane-crossing + * distributions */ + { + int nx = 2*lekh.nplane; + dim3 nblk = {0}; + dim3 ntpb = {0}; + cs_limits_t lim = {0, nx - 1, 1, lekh.nlocal[Y], 1, lekh.nlocal[Z]}; + kernel_3d_t k3d = kernel_3d(lb->cs, lim); - /* OTHER DIRECTION */ - iside = 1; - ic = lees_edw_plane_location(le, iplane) + 1; - lees_edw_buffer_displacement(le, nhalo, t, &dy); - dy = fmod(-dy, ltot[Y]); - jdy = floor(dy); - fr = dy - jdy; + kernel_3d_launch_param(k3d.kiterations, &nblk, &ntpb); - j1 = 1 + (jc + jdy - 1 + 2*nlocal[Y]) % nlocal[Y]; - j2 = 1 + (j1 % nlocal[Y]); + tdpLaunchKernel(lb_data_interpolate_kernel, nblk, ntpb, 0, 0, + k3d, lekh, lb->target, le_target, t); - index0 = lees_edw_index(le, ic, jc, kc); - //index1 = lees_edw_index(le, ic, j2, kc); - - for (int n = 0; n < ndist; n++) { - int ip = 0; assert(ndist == 1); /* FIXME */ - for (int p = 1; p < lb->model.nvel; p++) { - if (lb->model.cv[p][X] == -1) { - //int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p); - //int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p); - //int ibuf = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + ip + (2 * plane + 1) *nxdist; - //lb->sbuff[ibuf] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1]; - int ibuf0 = my_ibuf(&lekh, j1, kc, iplane, iside, n, ip); - int ibuf1 = my_ibuf(&lekh, j2, kc, iplane, iside, n, ip); - double f = (1.0 - fr)*lb->rbuff[ibuf0] + fr*lb->rbuff[ibuf1]; - lb_f_set(lb, index0, p, n, f); - ip++; - } - } - } - /* next plane */ + tdpAssert( tdpPeekAtLastError() ); + tdpAssert( tdpStreamSynchronize(0) ); } + + TIMER_stop(TIMER_LE); } - return; + return 0; } /***************************************************************************** * - * We must not overwrite any of the existing distributions until - * all the interpolations have been computed. + * le_kernel_helper * *****************************************************************************/ -__global__ void copy_back(kernel_3d_t k3d, lb_t * lb, lees_edw_t * le) { +static le_kernel_helper_t le_kernel_helper(lb_t * lb, lees_edw_t * le) { - int kindex = 0; + le_kernel_helper_t lekh = {0}; - int ndist = 0; - int nlocal[3] = {0}; - int nhalo = 0; - int nplane = 0; - int nprop = 0; - int nxdist = 0; + assert(le); - lb_ndist(lb, &ndist); - lees_edw_nlocal(le, nlocal); - lees_edw_nhalo(le, &nhalo); - nplane = lees_edw_nplane_local(le); + lees_edw_nlocal(le, lekh.nlocal); + lekh.nplane = lees_edw_nplane_local(le); - nprop = 0; - for (int p = 1; p < lb->model.nvel; p++) { - if (lb->model.cv[p][X] == +1) nprop += 1; + { + int ip = 0; + for (int p = 1; p < lb->model.nvel; p++) { + if (lb->model.cv[p][X] == +1) lekh.prop[0][ip++] = p; /* +ve cx */ + } + ip = 0; + for (int p = 1; p < lb->model.nvel; p++) { + if (lb->model.cv[p][X] == -1) lekh.prop[1][ip++] = p; /* -ve cx */ + } + lekh.nprop = ip; + assert(lekh.nprop <= 9); } - nxdist = ndist*nprop*nlocal[Y]*nlocal[Z]; + lekh.ndist = lb->ndist; + lekh.nxdist = lekh.ndist*lekh.nprop*(lekh.nlocal[Y] + 1)*lekh.nlocal[Z]; + lekh.nxbuff = 2*lekh.nplane*lekh.nxdist; - for_simt_parallel(kindex, k3d.kiterations, 1) { + return lekh; +} - int plane = kernel_3d_ic(&k3d, kindex); /* Encode plane */ - int jc = kernel_3d_jc(&k3d, kindex); - int kc = kernel_3d_kc(&k3d, kindex); +/***************************************************************************** + * + * le_ibuf + * + * Defines the storage order for quantites in the cross-plane buffers. + * Note that all values on the same side (iside = 0 or iside = 1) are + * stored contiguously for the purposes of communication. + * + *****************************************************************************/ - if (jc <= nlocal[Y] && kc <= nlocal[Z] && plane < nplane) { - int ic = lees_edw_plane_location(le, plane); - int index0 = lees_edw_index(le, ic, jc, kc); +__host__ __device__ static inline int le_ibuf(const le_kernel_helper_t * s, + int jc, int kc, int iplane, + int iside, int n, int p) { + int ib = 0; - for (int n = 0; n < ndist; n++) { - int ip = 0; - for (int p = 1; p < lb->model.nvel; p++) { - if (lb->model.cv[p][X] == +1) { - /* ibuf = ibuf(jc, kc, n, p, ...) */ - int ibuf = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + ip + 2 * plane * nxdist; - int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p); - lb->f[la] = lb->sbuff[ibuf]; - ip++; - } - } - } + assert(s); + assert(1 <= jc && jc <= (s->nlocal[Y] + 1)); + assert(1 <= kc && kc <= s->nlocal[Z]); + assert(0 <= iplane && iplane < s->nplane); + assert(0 <= iside && iside <= 1); + assert(0 <= n && n < s->ndist); + assert(0 <= p && p < s->nprop); - /* Other direction */ + ib = p + s->nprop*(n + s->ndist*(iplane + s->nplane* + (kc - 1 + s->nlocal[Z]*(jc - 1)))); - ic = lees_edw_plane_location(le, plane) + 1; - index0 = lees_edw_index(le, ic, jc, kc); + /* All same sides are together for all planes */ + ib = iside*s->nxdist*s->nplane + ib; - for (int n = 0; n < ndist; n++) { - int ip = 0; - for (int p = 1; p < lb->model.nvel; p++) { - if (lb->model.cv[p][X] == -1) { - int ibuf = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + ip + (2*plane + 1) * nxdist; - int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p); - lb->f[la] = lb->sbuff[ibuf]; - ip++; - } - } - } - } - } + assert(0 <= ib && ib < s->nxbuff); - return; + return ib; } /***************************************************************************** @@ -1004,14 +991,13 @@ __global__ void lb_data_reproject_kernel(kernel_3d_t k3d, for (int n = 0; n < lekh.ndist; n++) { - int ip = 0; double rho = 0.0; double g[3] = {0}; double ds[3][3] = {0}; /* Compute 0th and 1st moments */ lb_dist_enum_t ndn = (lb_dist_enum_t) n; - /* EXPAND ? */ + /* Could expand these ... */ lb_0th_moment(lb, index, ndn, &rho); lb_1st_moment(lb, index, ndn, g); @@ -1021,8 +1007,10 @@ __global__ void lb_data_reproject_kernel(kernel_3d_t k3d, } } - /* Now update the distribution */ - for (int p = 1; p < lb->model.nvel; p++) { + /* Now for relevant distributions ... */ + for (int ip = 0; ip < lekh.nprop; ip++) { + + int p = lekh.prop[iside][ip]; double cs2 = lb->model.cs2; double rcs2 = 1.0/cs2; @@ -1030,25 +1018,23 @@ __global__ void lb_data_reproject_kernel(kernel_3d_t k3d, double udotc = du[Y]*lb->model.cv[p][Y]; double sdotq = 0.0; - if (lb->model.cv[p][X] != cx) continue; + assert(lb->model.cv[p][X] == cx); for (int ia = 0; ia < 3; ia++) { for (int ib = 0; ib < 3; ib++) { - double dab = cs2 * (ia == ib); - double q = (lb->model.cv[p][ia] * lb->model.cv[p][ib] - dab); - sdotq += ds[ia][ib] * q; + double dab = cs2*(ia == ib); + double q = (lb->model.cv[p][ia]*lb->model.cv[p][ib] - dab); + sdotq += ds[ia][ib]*q; } } /* Project all this back to the distribution. */ { - int ibuf = my_ibuf(&lekh, jc, kc, iplane, iside, n, ip); + int ibuf = le_ibuf(&lekh, jc, kc, iplane, iside, n, ip); double f = 0.0; lb_f(lb, index, p, n, &f); - f += lb->model.wv[p] * (rho*udotc*rcs2 + 0.5*sdotq*rcs2*rcs2); - /* REPLACE lb_f_set(lb, index, p, n, f); BY */ + f += lb->model.wv[p]*(rho*udotc*rcs2 + 0.5*sdotq*rcs2*rcs2); lb->sbuff[ibuf] = f; - ++ip; } } } @@ -1060,88 +1046,223 @@ __global__ void lb_data_reproject_kernel(kernel_3d_t k3d, /***************************************************************************** * - * lb_le_apply_boundary_conditions + * lb_data_displace_kernel * - * Driver for the parallel update. + * Displace/copy send buffer (1 <= jc <= nlocal[Y]) to the recv buffer + * (1 <= jc <= nlocal[Y] + 1). + * + * Version where there is no communication via MPI (i.e., serial). * *****************************************************************************/ -__host__ int lb_le_apply_boundary_conditions(lb_t * lb, lees_edw_t * le) { +__global__ void lb_data_displace_kernel(kernel_3d_t k3d, + le_kernel_helper_t lekh, + lb_t * lb, + lees_edw_t * le, double t) { + int kindex = 0; + int nhalo = 0; + double ltot[3] = {0}; + + lees_edw_nhalo(le, &nhalo); + lees_edw_ltot(le, ltot); + + for_simt_parallel(kindex, k3d.kiterations, 1) { + + int ix = kernel_3d_ic(&k3d, kindex); /* encodes plane, side */ + int jc = kernel_3d_jc(&k3d, kindex); + int kc = kernel_3d_kc(&k3d, kindex); + + if (jc <= lekh.nlocal[Y]+1 && kc <= lekh.nlocal[Z] && ix < 2*lekh.nplane) { + + int iplane = ix / 2; assert(0 <= iplane && iplane < lekh.nplane); + int iside = ix % 2; assert(iside == 0 || iside == 1); + int cx = 1 - 2*iside; /* below going up, or above going down */ + + /* buffer location; js is the displaced position needed ... */ + int ic = iside + lees_edw_plane_location(le, iplane); + int js = -1; + int dj = -1; + double dy = 0.0; + + lees_edw_buffer_displacement(le, nhalo, t, &dy); + dj = floor(fmod(dy*cx, ltot[Y])); + + js = 1 + (jc + dj - 1 + 2*lekh.nlocal[Y]) % lekh.nlocal[Y]; + assert(1 <= js && js <= lekh.nlocal[Y]); + + for (int n = 0; n < lekh.ndist; n++) { + for (int ip = 0; ip < lekh.nprop; ip++) { + int isend = le_ibuf(&lekh, js, kc, iplane, iside, n, ip); + int irecv = le_ibuf(&lekh, jc, kc, iplane, iside, n, ip); + lb->rbuff[irecv] = lb->sbuff[isend]; + } + } + } + } + + return; +} + +/***************************************************************************** + * + * lb_data_displace_communicate + * + * General displacement procedure when MPI is required. The displacement + * is always an integer number of latttice sites in the y-direction. + * + *****************************************************************************/ + +static int lb_data_displace_communicate(le_kernel_helper_t lekh, + lb_t * lb, + lees_edw_t * le, + double t) { + const int tag1 = 3102; + const int tag2 = 3103; + const int tag3 = 3104; + const int tag4 = 3105; + + double * sbuff = NULL; /* Send buffer (previously reprojected values) */ + double * rbuff = NULL; /* Recv buffer (to be interpolated) */ + + int nhalo = 0; + int ntotal[3] = {0}; + int offset[3] = {0}; + int nrank_s[2] = {0}; + int nrank_r[2] = {0}; + MPI_Comm comm = MPI_COMM_NULL; + MPI_Request req[8] = {0}; assert(lb); assert(le); - int mpi_cartsz[3] = {0}; - le_kernel_helper_t lekh = le_kernel_helper(lb, le); + int nrowdata = lekh.nlocal[Z]*lekh.nplane*lekh.ndist*lekh.nprop; - lees_edw_cartsz(le, mpi_cartsz); + /* If there is GPU-aware MPI just communicate the GPU buffers; if + * not, copy in relevant direction at the start and finish */ - if (lekh.nplane > 0) { - int ndevice = 0; - lees_edw_t * le_target; - double t = -1.0; + if (have_gpu_aware_mpi_) { + tdpAssert( tdpMemcpy(&sbuff, &lb->target->sbuff, sizeof(double *), + tdpMemcpyDeviceToHost) ); + tdpAssert( tdpMemcpy(&rbuff, &lb->target->rbuff, sizeof(double *), + tdpMemcpyDeviceToHost) ); + } + else { + int nbuffsz = lekh.nxbuff*sizeof(double); + double * target = NULL; + sbuff = lb->sbuff; + rbuff = lb->rbuff; + tdpAssert( tdpMemcpy(&target, &lb->target->sbuff, sizeof(double *), + tdpMemcpyDeviceToHost) ); + tdpAssert( tdpMemcpy(sbuff, target, nbuffsz, tdpMemcpyDeviceToHost) ); + } - TIMER_start(TIMER_LE); + lees_edw_comm(le, &comm); + lees_edw_ntotal(le, ntotal); + lees_edw_nlocal_offset(le, offset); + lees_edw_nhalo(le, &nhalo); - tdpAssert( tdpGetDeviceCount(&ndevice) ); - lees_edw_target(le, &le_target); + /* For each plane, there are 4 sends and 4 recvs; two each for each + * side of the plane. The construction of the buffer is such that + * we can treat all the planes for a given side in one go. */ - /* Require the time t = time step */ - { - physics_t * phys = NULL; - physics_ref(&phys); - t = 1.0*physics_control_timestep(phys); - } + /* A total of 8 requests; 2 sends and 2 recvs for side = 0, and + * 2 sends and 2 revcs for side = 1, which communicate in different + * directions... */ - /* First, set up the interpolation buffer with the relevant elements of - * the distribution, i.e., side below the nplane propagating up, and - * side above plane propagating down. ("Up" and "down" in x-direction) */ + /* Each process sends a total of (nlocal[Y] + 1) y values all taken + * from 1 <= jc <= nlocal[Y]; the values at j = j0 = jrow2 are sent to + * both destinations. */ - { - int nx = 2*lekh.nplane; /* Two x positions (sides) for each plane */ - dim3 nblk = {0}; - dim3 ntpb = {0}; - cs_limits_t lim = {0, nx - 1, 1, lekh.nlocal[Y], 1, lekh.nlocal[Z]}; - kernel_3d_t k3d = kernel_3d(lb->cs, lim); + { + int iside = 0; + int jdy = -1; + double dy = 0.0; + lees_edw_buffer_displacement(le, nhalo, t, &dy); + dy = fmod(dy, 1.0*ntotal[Y]); + jdy = floor(dy); - kernel_3d_launch_param(k3d.kiterations, &nblk, &ntpb); + /* Starting y coordinate is j0: 1 <= j0 <= ntotal[y] */ - tdpLaunchKernel(lb_data_reproject_kernel, nblk, ntpb, 0, 0, - k3d, lekh, lb->target, le_target, t); - tdpAssert( tdpPeekAtLastError() ); - tdpAssert( tdpStreamSynchronize(0) ); - } + int j0 = 1 + (offset[Y] + jdy + 2*ntotal[Y]) % ntotal[Y]; + lees_edw_jstart_to_mpi_ranks(le, j0, nrank_s, nrank_r); - /* Swap the send and recv buffer */ + j0 = 1 + (j0 - 1) % lekh.nlocal[Y]; /* 1 <= j0 <= nlocal[Y] */ + int jrow1 = lekh.nlocal[Y] + 1 - j0; + int jrow2 = j0; - tdpAssert( tdpMemcpy(lb->target->rbuff, lb->target->sbuff, - 2*lekh.nxdist*sizeof(double), - tdpMemcpyDeviceToDevice) ); + int n1 = jrow1*nrowdata; + int n2 = jrow2*nrowdata; - /* Displacement and interpolation to replace distributions */ - if (mpi_cartsz[Y] > 1) { - assert(0); /* PENDING comms for send to recv buffer */ - } - else { - int nx = lekh.nplane; - dim3 nblk = {0}; - dim3 ntpb = {0}; - cs_limits_t lim = {0, nx - 1, 1, lekh.nlocal[Y], 1, lekh.nlocal[Z]}; - kernel_3d_t k3d = kernel_3d(lb->cs, lim); + /* Post the receives (are disjoint) */ + /* 1 -> jrow1 incl., and 1 + jrow1 -> nlocal[Y] + 1 incl. */ - kernel_3d_launch_param(k3d.kiterations, &nblk, &ntpb); + int ibuf1 = le_ibuf(&lekh, 1, 1, 0, iside, 0, 0); + int ibuf2 = le_ibuf(&lekh, 1 + jrow1, 1, 0, iside, 0, 0); - /* Two kernels to provide synchronisation: interpolate to temporary - * and then can copy back values to the distribution itself. */ + MPI_Irecv(rbuff + ibuf1, n1, MPI_DOUBLE, nrank_r[0], tag1, comm, req + 0); + MPI_Irecv(rbuff + ibuf2, n2, MPI_DOUBLE, nrank_r[1], tag2, comm, req + 1); - tdpLaunchKernel(lb_data_displace_and_interpolate, nblk, ntpb, 0, 0, - k3d, lekh, lb->target, le_target, t); + /* Post sends (overlap at jrow2) */ + /* jrow2 -> nlocal[Y] incl., and 1 -> jrow2 incl. */ - tdpAssert( tdpPeekAtLastError() ); - tdpAssert( tdpStreamSynchronize(0) ); - } + ibuf1 = le_ibuf(&lekh, jrow2, 1, 0, iside, 0, 0); + ibuf2 = le_ibuf(&lekh, 1, 1, 0, iside, 0, 0); - TIMER_stop(TIMER_LE); + MPI_Isend(sbuff + ibuf1, n1, MPI_DOUBLE, nrank_s[0], tag1, comm, req + 2); + MPI_Isend(sbuff + ibuf2, n2, MPI_DOUBLE, nrank_s[1], tag2, comm, req + 3); + } + + /* Other side */ + { + int iside = 1; + int jdy = -1; + double dy = 0.0; + + lees_edw_buffer_displacement(le, nhalo, t, &dy); + dy = fmod(-dy, 1.0*ntotal[Y]); /* note sign */ + jdy = floor(dy); + + /* Starting y coordinate (global address): range 1 <= j1 <= ntotal[Y] */ + + int j0 = 1 + (offset[Y] + jdy + 2*ntotal[Y]) % ntotal[Y]; + lees_edw_jstart_to_mpi_ranks(le, j0, nrank_s, nrank_r); + + j0 = 1 + (j0 - 1) % lekh.nlocal[Y]; + int jrow1 = lekh.nlocal[Y] + 1 - j0; + int jrow2 = j0; + + int n1 = jrow1*nrowdata; + int n2 = jrow2*nrowdata; + + /* Post the receives */ + + int ibuf1 = le_ibuf(&lekh, 1, 1, 0, iside, 0, 0); + int ibuf2 = le_ibuf(&lekh, 1 + jrow1, 1, 0, iside, 0, 0); + + MPI_Irecv(rbuff + ibuf1, n1, MPI_DOUBLE, nrank_r[0], tag3, comm, req + 4); + MPI_Irecv(rbuff + ibuf2, n2, MPI_DOUBLE, nrank_r[1], tag4, comm, req + 5); + + /* Post sends */ + + ibuf1 = le_ibuf(&lekh, jrow2, 1, 0, iside, 0, 0); + ibuf2 = le_ibuf(&lekh, 1, 1, 0, iside, 0, 0); + + MPI_Isend(sbuff + ibuf1, n1, MPI_DOUBLE, nrank_s[0], tag3, comm, req + 6); + MPI_Isend(sbuff + ibuf2, n2, MPI_DOUBLE, nrank_s[1], tag4, comm, req + 7); + } + + /* Complete */ + MPI_Waitall(8, req, MPI_STATUSES_IGNORE); + + if (have_gpu_aware_mpi_) { + /* No further action */ + } + else { + int nbuffsz = lekh.nxbuff*sizeof(double); + double * target = NULL; + tdpAssert( tdpMemcpy(&target, &lb->target->rbuff, sizeof(double *), + tdpMemcpyDeviceToHost) ); + tdpAssert( tdpMemcpy(target, rbuff, nbuffsz, tdpMemcpyHostToDevice) ); } return 0; @@ -1149,27 +1270,66 @@ __host__ int lb_le_apply_boundary_conditions(lb_t * lb, lees_edw_t * le) { /***************************************************************************** * - * le_kernel_helper + * lb_data_interpolate_kernel + * + * Interpolate the final recv buffer to reset plane-crossing distribiutions. + * The linear interpolation is always between jc and jc+1 for + * 1 <= jc <= nlocal[Y], ie., an appropriate displacement + * of the send buffer should have occurred. A consistent + * fractional part of a lattice spacing is used. + * + * Always linear interpoaltion to conserve mass. * *****************************************************************************/ -static le_kernel_helper_t le_kernel_helper(lb_t * lb, lees_edw_t * le) { +__global__ void lb_data_interpolate_kernel(kernel_3d_t k3d, + le_kernel_helper_t lekh, + lb_t * lb, + lees_edw_t * le, double t) { + int kindex = 0; + int nhalo = 0; + double ltot[3] = {0}; - le_kernel_helper_t lekh = {0}; + lees_edw_nhalo(le, &nhalo); + lees_edw_ltot(le, ltot); - assert(le); + for_simt_parallel(kindex, k3d.kiterations, 1) { - lees_edw_nlocal(le, lekh.nlocal); - lekh.nplane = lees_edw_nplane_local(le); + int ix = kernel_3d_ic(&k3d, kindex); /* encodes plane, side */ + int jc = kernel_3d_jc(&k3d, kindex); + int kc = kernel_3d_kc(&k3d, kindex); - for (int p = 1; p < lb->model.nvel; p++) { - if (lb->model.cv[p][X] == +1) lekh.nprop += 1; - } + if (jc <= lekh.nlocal[Y] && kc <= lekh.nlocal[Z] && ix < 2*lekh.nplane) { - /* FIXME: nxdist to include the number of planes here but ncrossdist - * does not include the number of planes */ - lekh.ndist = lb->ndist; - lekh.nxdist = lekh.ndist*lekh.nprop*lekh.nplane*lekh.nlocal[Y]*lekh.nlocal[Z]; + int iplane = ix / 2; assert(0 <= iplane && iplane < lekh.nplane); + int iside = ix % 2; assert(iside == 0 || iside == 1); + int cx = 1 - 2*iside; /* below going up, or above going down */ - return lekh; + /* buffer location, fractional part of the displacement, ... */ + int ic = iside + lees_edw_plane_location(le, iplane); + int jdy = 0; + + double dy = 0.0; + double fr = 0.0; + + lees_edw_buffer_displacement(le, nhalo, t, &dy); + dy = fmod(dy*cx, ltot[Y]); + jdy = floor(dy); + fr = dy - jdy; + + int index0 = lees_edw_index(le, ic, jc, kc); + + for (int n = 0; n < lekh.ndist; n++) { + for (int ip = 0; ip < lekh.nprop; ip++) { + int p = lekh.prop[iside][ip]; + int ibuf0 = le_ibuf(&lekh, jc, kc, iplane, iside, n, ip); + int ibuf1 = le_ibuf(&lekh, jc + 1, kc, iplane, iside, n, ip); + double f = (1.0 - fr)*lb->rbuff[ibuf0] + fr*lb->rbuff[ibuf1]; + lb_f_set(lb, index0, p, n, f); + } + } + } + } + + return; } diff --git a/src/model_le.h b/src/model_le.h index 546013975..01f285ad4 100644 --- a/src/model_le.h +++ b/src/model_le.h @@ -5,7 +5,7 @@ * Edinburgh Soft Matter and Statistical Physics Group and * Edinburgh Parallel Computing Centre * - * (c) 2009-2022 The University of Edinburgh + * (c) 2009-2024 The University of Edinburgh * Kevin Stratford (kevin@epcc.ed.ac.uk) * *****************************************************************************/ @@ -16,7 +16,7 @@ #include "lb_data.h" #include "leesedwards.h" -int lb_le_apply_boundary_conditions(lb_t * lb, lees_edw_t * le); +int lb_data_apply_le_boundary_conditions(lb_t * lb, lees_edw_t * le); int lb_le_init_shear_profile(lb_t * lb, lees_edw_t * le); #endif