From 59a6fbde1afbccefebeb91376ddb6f037a4bf5be Mon Sep 17 00:00:00 2001
From: Kevin Stratford <kevin@cirrus.epcc.ed.ac.uk>
Date: Fri, 27 Dec 2024 19:23:16 +0000
Subject: [PATCH] Updated LE to allow GPU

---
 src/lb_data.c  |  53 ++--
 src/ludwig.c   |   2 +-
 src/model_le.c | 686 ++++++++++++++++++++++++++++++-------------------
 src/model_le.h |   4 +-
 4 files changed, 457 insertions(+), 288 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index f22b3e4da..b5cfa23ff 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -200,19 +200,20 @@ int lb_data_create(pe_t * pe, cs_t * cs, const lb_data_options_t * options,
 	if (obj->model.cv[p][X] == +1) nprop += 1;
       }
 
-      int ncrossdist = ndist*nprop*nlocal[Y]*nlocal[Z];
-      int ndata      = 2*nplane*ncrossdist; /* 2 sides for each plane */
-      obj->sbuff = (double *) malloc(ndata*sizeof(double));
-      obj->rbuff = (double *) malloc(ndata*sizeof(double));
+      /* Lees Edwards buffer for crossing distributions */
+      int nxdist = ndist*nprop*(nlocal[Y] + 1)*nlocal[Z];
+      int nxbuff = 2*nplane*nxdist; /* 2 sides for each plane */
+      obj->sbuff = (double *) malloc(nxbuff*sizeof(double));
+      obj->rbuff = (double *) malloc(nxbuff*sizeof(double));
 
       tdpGetDeviceCount(&ndevice);
 
       if (ndevice > 0) {
 	double * tmp = NULL;
-	tdpAssert( tdpMalloc((void **) &tmp, ndata*sizeof(double)) );
+	tdpAssert( tdpMalloc((void **) &tmp, nxbuff*sizeof(double)) );
 	tdpAssert( tdpMemcpy(&obj->target->sbuff, &tmp, sizeof(double *),
 			     tdpMemcpyHostToDevice) );
-	tdpAssert( tdpMalloc((void **) &tmp, ndata*sizeof(double)) );
+	tdpAssert( tdpMalloc((void **) &tmp, nxbuff*sizeof(double)) );
 	tdpAssert( tdpMemcpy(&obj->target->rbuff, &tmp, sizeof(double *),
 			     tdpMemcpyHostToDevice) );
       }
@@ -251,6 +252,13 @@ __host__ int lb_free(lb_t * lb) {
     tdpAssert( tdpMemcpy(&tmp, &lb->target->fprime, sizeof(double *),
 			 tdpMemcpyDeviceToHost) );
     tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &lb->target->sbuff, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &lb->target->rbuff, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+
     tdpAssert( tdpFree(lb->target) );
   }
 
@@ -261,7 +269,6 @@ __host__ int lb_free(lb_t * lb) {
   free(lb->f);
   free(lb->fprime);
 
-  /* FIXME: device buffer to be removed */
   free(lb->rbuff);
   free(lb->sbuff);
 
@@ -291,6 +298,8 @@ int lb_data_initialise_device_model(lb_t * lb) {
   if (ndevice > 0) {
 
     int nvel = lb->model.nvel;
+    lb_model_t * hm = &lb->model;
+    lb_model_t * dm = &lb->target->model;
     int8_t (*d_cv)[3] = {0};
     double * d_wv = NULL;
     double * d_na = NULL;
@@ -299,26 +308,26 @@ int lb_data_initialise_device_model(lb_t * lb) {
     tdpAssert( tdpMalloc((void **) &d_wv, nvel*sizeof(double)) );
     tdpAssert( tdpMalloc((void **) &d_na, nvel*sizeof(double)) );
 
-    tdpAssert( tdpMemcpy(d_cv, &(lb->model.cv), nvel*sizeof(int8_t[3]),
+    tdpAssert( tdpMemcpy(d_cv, hm->cv, nvel*sizeof(int8_t[3]),
 			 tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMemcpy(d_wv, &(lb->model.wv), nvel*sizeof(double),
+    tdpAssert( tdpMemcpy(d_wv, hm->wv, nvel*sizeof(double),
 			 tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMemcpy(d_na, &(lb->model.na), nvel*sizeof(double),
+    tdpAssert( tdpMemcpy(d_na, hm->na, nvel*sizeof(double),
 			 tdpMemcpyHostToDevice) );
 
-    tdpAssert( tdpMemcpy(&(lb->target->model.cv), &d_cv, sizeof(int8_t(*)[3]),
+    tdpAssert( tdpMemcpy(&(dm->cv), &d_cv, sizeof(int8_t(*)[3]),
 			 tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMemcpy(&(lb->target->model.wv), &d_wv, sizeof(double *),
+    tdpAssert( tdpMemcpy(&(dm->wv), &d_wv, sizeof(double *),
 			 tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMemcpy(&(lb->target->model.na), &d_na, sizeof(double *),
+    tdpAssert( tdpMemcpy(&(dm->na), &d_na, sizeof(double *),
 			 tdpMemcpyHostToDevice) );
 
-    tdpAssert( tdpMemcpy(&(lb->target->model.ndim), &(lb->model.ndim),
-			 sizeof(int8_t), tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMemcpy(&(lb->target->model.nvel), &(lb->model.nvel),
-			 sizeof(int8_t), tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMemcpy(&(lb->target->model.cs2), &(lb->model.cs2),
-			 sizeof(double), tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(&(dm->ndim), &(hm->ndim), sizeof(int8_t),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(&(dm->nvel), &(hm->nvel), sizeof(int8_t),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(&(dm->cs2), &(hm->cs2), sizeof(double),
+			 tdpMemcpyHostToDevice) );
 
     /* We do not copy the eigenvectors currently. */
   }
@@ -351,9 +360,9 @@ int lb_data_free_device_model(lb_t * lb) {
     tdpAssert( tdpMemcpy(&d_na, &lb->target->model.na, sizeof(double *),
 			 tdpMemcpyDeviceToHost) );
 
-    tdpAssert( tdpFree(&d_cv) );
-    tdpAssert( tdpFree(&d_wv) );
-    tdpAssert( tdpFree(&d_na) );
+    tdpAssert( tdpFree(d_cv) );
+    tdpAssert( tdpFree(d_wv) );
+    tdpAssert( tdpFree(d_na) );
   }
 
   return 0;
diff --git a/src/ludwig.c b/src/ludwig.c
index cff116770..2e2b15617 100644
--- a/src/ludwig.c
+++ b/src/ludwig.c
@@ -806,7 +806,7 @@ void ludwig_run(const char * inputfile) {
       /* Boundary conditions */
 
       if (ludwig->le) {
-	lb_le_apply_boundary_conditions(ludwig->lb, ludwig->le);
+	lb_data_apply_le_boundary_conditions(ludwig->lb, ludwig->le);
       }
 
       TIMER_start(TIMER_HALO_LATTICE);
diff --git a/src/model_le.c b/src/model_le.c
index 2b4fe62c1..f943b59e4 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -36,6 +36,17 @@ static int le_reproject(lb_t * lb, lees_edw_t * le);
 static int le_displace_and_interpolate(lb_t * lb, lees_edw_t * le);
 static int le_displace_and_interpolate_parallel(lb_t * lb, lees_edw_t * le);
 
+#ifdef HAVE_OPENMPI_
+/* This provides MPIX_CUDA_AWARE_SUPPORT .. */
+#include "mpi-ext.h"
+#endif
+
+#if defined (MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
+static const int have_gpu_aware_mpi_ = 1;
+#else
+static const int have_gpu_aware_mpi_ = 0;
+#endif
+
 /*****************************************************************************
  *
  *  lb_le_apply_boundary_conditions
@@ -55,7 +66,7 @@ static int le_displace_and_interpolate_parallel(lb_t * lb, lees_edw_t * le);
  *
  *****************************************************************************/
 
-__host__ int lb_le_apply_boundary_conditions_old(lb_t * lb, lees_edw_t * le) {
+__host__ int lb_le_apply_boundary_conditions(lb_t * lb, lees_edw_t * le) {
 
   int mpi_cartsz[3];
 
@@ -715,243 +726,219 @@ int lb_le_init_shear_profile(lb_t * lb, lees_edw_t * le) {
   return 0;
 }
 
-
-
-
-
-/* FIXME ADDITIONS */
+/*****************************************************************************
+ *
+ *  The preceding functions are scheduled for removal.
+ *  The following are operational...
+ *
+ *****************************************************************************/
 
 /* Kernel helper structure intended to be passed by value to kernel */
 
 typedef struct lek_s {
   int nlocal[3];        /* 12 */
   int nplane;           /* 16 */
-  int nprop;            /* 20 */
-  int ndist;            /* 24 */
-  int nxdist;           /* 28 */
+  int ndist;            /* 20 */
+  int nxdist;           /* 24 */
+  int nxbuff;           /* 28 */
+  int nprop;            /* 1 <= nprop <= 9  maximum */
+  int8_t prop[2][9];    /* prop[0] is side 0 (cx +ve); */
+                        /* prop[1] is side 1 (cx -ve); */
+                        /* p values of cross-plane propagating distributions */
 } le_kernel_helper_t;
 
+static le_kernel_helper_t le_kernel_helper(lb_t * lb, lees_edw_t * le);
+
+
+__global__ void lb_data_reproject_kernel(kernel_3d_t k3d,
+					 le_kernel_helper_t lekh, lb_t * lb,
+					 lees_edw_t * le, double t);
+__global__ void lb_data_displace_kernel(kernel_3d_t k3d,
+					le_kernel_helper_t lekh,
+					lb_t * lb,
+					lees_edw_t * le, double t);
+__global__ void lb_data_interpolate_kernel(kernel_3d_t k3d,
+					   le_kernel_helper_t lekh,
+					   lb_t * lb,
+					   lees_edw_t * le, double t);
+
+static int lb_data_displace_communicate(le_kernel_helper_t lekh,
+					lb_t * lb,
+					lees_edw_t * le,
+					double t);
+
 /*****************************************************************************
  *
- *  my_ibuf(iside, nlocal[Y], nlocal[Z], nplane, ndist, nprop)
+ *  lb_data_apply_le_boundary_conditions
+ *
+ *  Driver for the parallel update.
  *
  *****************************************************************************/
 
-__host__ __device__ static int my_ibuf(const le_kernel_helper_t * s,
-				       int jc,  int kc, int iplane, int iside,
-				       int n, int p) {
-  int ib = 0;
+int lb_data_apply_le_boundary_conditions(lb_t * lb, lees_edw_t * le) {
 
-  assert(s);
-  assert(1 <= jc && jc <= s->nlocal[Y]); /* FIXME recv has one more? */
-  assert(1 <= kc && kc <= s->nlocal[Z]);
-  assert(0 <= iplane && iplane < s->nplane);
-  assert(0 <= iside  && iside <= 1);
-  assert(0 <= n && n < s->ndist);
-  assert(0 <= p && p < s->nprop);
-
-  ib = p + s->nprop*(n + s->ndist*(iplane + s->nplane*
-				   (kc - 1 + s->nlocal[Z]*(jc - 1))));
+  assert(lb);
+  assert(le);
 
-  //printf("p iplane, ib nxdisp %d %d (%d %d) %d %d\n", p, iplane, jc, kc, ib, s->nxdist);
-  assert(0 <= ib && ib < s->nxdist); /* Number of crossing distributions */
+  int mpi_cartsz[3] = {0};
+  le_kernel_helper_t lekh = le_kernel_helper(lb, le);
 
-  /* All same sides are together */
-  ib = iside*s->nxdist + ib;
+  lees_edw_cartsz(le, mpi_cartsz);
 
-  return ib;
-}
+  if (lekh.nplane == 0) {
+    /* No planes, no action. */
+  }
+  else {
+    int ndevice = 0;
+    lees_edw_t * le_target;
+    double t = -1.0;
 
+    TIMER_start(TIMER_LE);
 
-__global__ void lb_data_reproject_kernel(kernel_3d_t k3d,
-					 le_kernel_helper_t lekh, lb_t * lb,
-					 lees_edw_t * le, double t);
-__global__ void lb_data_displace_and_interpolate(kernel_3d_t k3d,
-						 le_kernel_helper_t lekh,
-						 lb_t * lb,
-						 lees_edw_t * le, double t);
-__global__ void copy_back(kernel_3d_t k3d, lb_t * lb, lees_edw_t * le);
+    tdpAssert( tdpGetDeviceCount(&ndevice) );
+    lees_edw_target(le, &le_target);
 
-static le_kernel_helper_t le_kernel_helper(lb_t * lb, lees_edw_t * le);
-static int le_displace_and_interpolate_parallel(lb_t *lb, lees_edw_t *le);
+    /* Require the time t = time step */
+    {
+      physics_t * phys = NULL;
+      physics_ref(&phys);
+      t = 1.0*physics_control_timestep(phys);
+    }
 
+    /* First, the reprojected distributions are computed and stored to
+     * the "send" buffer for each side/plane. */
 
-/*****************************************************************************
- *
- *  lb_data_displace_and_interpolate
- *
- *  Parallel kernel for interpolation stage.
- *
- *****************************************************************************/
+    {
+      int  nx   = 2*lekh.nplane; /* Two x positions (sides) for each plane */
+      dim3 nblk = {0};
+      dim3 ntpb = {0};
+      cs_limits_t lim = {0, nx - 1, 1, lekh.nlocal[Y], 1, lekh.nlocal[Z]};
+      kernel_3d_t k3d = kernel_3d(lb->cs, lim);
 
-__global__ void lb_data_displace_and_interpolate(kernel_3d_t k3d,
-						 le_kernel_helper_t lekh,
-						 lb_t * lb,
-						 lees_edw_t * le, double t) {
-  int kindex = 0;
-  int ndist = 0;
-  int nlocal[3] = {0};
-  int nhalo = 0;
-  double ltot[3] = {0};
+      kernel_3d_launch_param(k3d.kiterations, &nblk, &ntpb);
 
-  lb_ndist(lb, &ndist);
-  lees_edw_nlocal(le, nlocal);
-  lees_edw_nhalo(le, &nhalo);
-  lees_edw_ltot(le, ltot);
+      tdpLaunchKernel(lb_data_reproject_kernel, nblk, ntpb, 0, 0,
+		      k3d, lekh, lb->target, le_target, t);
+      tdpAssert( tdpPeekAtLastError() );
+      tdpAssert( tdpStreamSynchronize(0) );
+    }
 
-  for_simt_parallel(kindex, k3d.kiterations, 1) {
 
-    int iplane = kernel_3d_ic(&k3d, kindex); /* encodes plane */
-    int jc     = kernel_3d_jc(&k3d, kindex);
-    int kc     = kernel_3d_kc(&k3d, kindex);
+    /* Second, displacement. */
+    if (have_gpu_aware_mpi_ || mpi_cartsz[Y] > 1) {
+      lb_data_displace_communicate(lekh, lb, le, t);
+    }
+    else {
+      /* The kernel form is only really required for the stub MPI case
+       * in serial; it could be removed if point-to-point messages were
+       * handled by the stub version. */
 
-    if (jc <= nlocal[Y] && kc <= nlocal[Z] && iplane < lekh.nplane) {
+      int  nx   = 2*lekh.nplane;
+      dim3 nblk = {0};
+      dim3 ntpb = {0};
+      cs_limits_t lim = {0, nx - 1, 1, lekh.nlocal[Y] + 1, 1, lekh.nlocal[Z]};
+      kernel_3d_t k3d = kernel_3d(lb->cs, lim);
 
-      int ic = lees_edw_plane_location(le, iplane);
-      double dy = 0.0;
-      int jdy = 0;
-      double fr = 0.0;
-      int iside = 0;
+      kernel_3d_launch_param(k3d.kiterations, &nblk, &ntpb);
 
-      lees_edw_buffer_displacement(le, nhalo, t, &dy);
-      dy = fmod(dy, ltot[Y]);
-      jdy = floor(dy);
-      fr = dy - jdy;
+      tdpLaunchKernel(lb_data_displace_kernel, nblk, ntpb, 0, 0,
+		      k3d, lekh, lb->target, le_target, t);
 
-      int j1 = 1 + (jc + jdy - 1 + 2*nlocal[Y]) % nlocal[Y];
-      int j2 = 1 + (j1 % nlocal[Y]);
+      tdpAssert( tdpPeekAtLastError() );
+      tdpAssert( tdpStreamSynchronize(0) );
+    }
 
-      int index0 = lees_edw_index(le, ic, jc, kc);
-      //int index1 = lees_edw_index(le, ic, j2, kc);
-
-      for (int n = 0; n < ndist; n++) {
-	int ip = 0; /* FIXME  avoid if by lookup table [iside][9] */
-	for (int p = 1; p < lb->model.nvel; p++) {
-	  if (lb->model.cv[p][X] == +1) {
-	    //int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-	    //int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p);
-	    /* FIXME .... */
-	    //int ibuf = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + ip + 2 * plane *nxdist;
-	    //lb->sbuff[ibuf] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
-	    int ibuf0 = my_ibuf(&lekh, j1, kc, iplane, iside, n, ip);
-	    int ibuf1 = my_ibuf(&lekh, j2, kc, iplane, iside, n, ip);
-	    double f = (1.0 - fr)*lb->rbuff[ibuf0] + fr*lb->rbuff[ibuf1];
-	    lb_f_set(lb, index0, p, n, f);
-	    ip++;
-	  }
-	}
-      }
+    /* Lastly, the recv buffer is interpolated to reset the plane-crossing
+     * distributions */
+    {
+      int  nx   = 2*lekh.nplane;
+      dim3 nblk = {0};
+      dim3 ntpb = {0};
+      cs_limits_t lim = {0, nx - 1, 1, lekh.nlocal[Y], 1, lekh.nlocal[Z]};
+      kernel_3d_t k3d = kernel_3d(lb->cs, lim);
 
-      /* OTHER DIRECTION */
-      iside = 1;
-      ic = lees_edw_plane_location(le, iplane) + 1;
-      lees_edw_buffer_displacement(le, nhalo, t, &dy);
-      dy = fmod(-dy, ltot[Y]);
-      jdy = floor(dy);
-      fr = dy - jdy;
+      kernel_3d_launch_param(k3d.kiterations, &nblk, &ntpb);
 
-      j1 = 1 + (jc + jdy - 1 + 2*nlocal[Y]) % nlocal[Y];
-      j2 = 1 + (j1 % nlocal[Y]);
+      tdpLaunchKernel(lb_data_interpolate_kernel, nblk, ntpb, 0, 0,
+		      k3d, lekh, lb->target, le_target, t);
 
-      index0 = lees_edw_index(le, ic, jc, kc);
-      //index1 = lees_edw_index(le, ic, j2, kc);
-
-      for (int n = 0; n < ndist; n++) {
-	int ip = 0; assert(ndist == 1); /* FIXME */
-	for (int p = 1; p < lb->model.nvel; p++) {
-	  if (lb->model.cv[p][X] == -1) {
-	    //int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-	    //int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p);
-	    //int ibuf = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + ip + (2 * plane + 1) *nxdist;
-	    //lb->sbuff[ibuf] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
-	    int ibuf0 = my_ibuf(&lekh, j1, kc, iplane, iside, n, ip);
-	    int ibuf1 = my_ibuf(&lekh, j2, kc, iplane, iside, n, ip);
-	    double f = (1.0 - fr)*lb->rbuff[ibuf0] + fr*lb->rbuff[ibuf1];
-	    lb_f_set(lb, index0, p, n, f);
-	    ip++;
-	  }
-	}
-      }
-      /* next plane */
+      tdpAssert( tdpPeekAtLastError() );
+      tdpAssert( tdpStreamSynchronize(0) );
     }
+
+    TIMER_stop(TIMER_LE);
   }
 
-  return;
+  return 0;
 }
 
 /*****************************************************************************
  *
- *  We must not overwrite any of the existing distributions until
- *  all the interpolations have been computed.
+ *  le_kernel_helper
  *
  *****************************************************************************/
 
-__global__ void copy_back(kernel_3d_t k3d, lb_t * lb, lees_edw_t * le) {
+static le_kernel_helper_t le_kernel_helper(lb_t * lb, lees_edw_t * le) {
 
-  int kindex = 0;
+  le_kernel_helper_t lekh = {0};
 
-  int ndist = 0;
-  int nlocal[3] = {0};
-  int nhalo = 0;
-  int nplane = 0;
-  int nprop = 0;
-  int nxdist = 0;
+  assert(le);
 
-  lb_ndist(lb, &ndist);
-  lees_edw_nlocal(le, nlocal);
-  lees_edw_nhalo(le, &nhalo);
-  nplane = lees_edw_nplane_local(le);
+  lees_edw_nlocal(le, lekh.nlocal);
+  lekh.nplane = lees_edw_nplane_local(le);
 
-  nprop = 0;
-  for (int p = 1; p < lb->model.nvel; p++) {
-    if (lb->model.cv[p][X] == +1) nprop += 1;
+  {
+    int ip = 0;
+    for (int p = 1; p < lb->model.nvel; p++) {
+      if (lb->model.cv[p][X] == +1) lekh.prop[0][ip++] = p;  /* +ve cx */
+    }
+    ip = 0;
+    for (int p = 1; p < lb->model.nvel; p++) {
+      if (lb->model.cv[p][X] == -1) lekh.prop[1][ip++] = p;  /* -ve cx */
+    }
+    lekh.nprop = ip;
+    assert(lekh.nprop <= 9);
   }
-  nxdist = ndist*nprop*nlocal[Y]*nlocal[Z];
 
+  lekh.ndist  = lb->ndist;
+  lekh.nxdist = lekh.ndist*lekh.nprop*(lekh.nlocal[Y] + 1)*lekh.nlocal[Z];
+  lekh.nxbuff = 2*lekh.nplane*lekh.nxdist;
 
-  for_simt_parallel(kindex, k3d.kiterations, 1) {
+  return lekh;
+}
 
-    int plane = kernel_3d_ic(&k3d, kindex); /* Encode plane */
-    int jc    = kernel_3d_jc(&k3d, kindex);
-    int kc    = kernel_3d_kc(&k3d, kindex);
+/*****************************************************************************
+ *
+ *  le_ibuf
+ *
+ *  Defines the storage order for quantites in the cross-plane buffers.
+ *  Note that all values on the same side (iside = 0 or iside = 1) are
+ *  stored contiguously for the purposes of communication.
+ *
+ *****************************************************************************/
 
-    if (jc <= nlocal[Y] && kc <= nlocal[Z] && plane < nplane) {
-      int ic = lees_edw_plane_location(le, plane);
-      int index0 = lees_edw_index(le, ic, jc, kc);
+__host__ __device__ static inline int le_ibuf(const le_kernel_helper_t * s,
+					      int jc,  int kc, int iplane,
+					      int iside, int n, int p) {
+  int ib = 0;
 
-      for (int n = 0; n < ndist; n++) {
-	int ip = 0;
-	for (int p = 1; p < lb->model.nvel; p++) {
-	  if (lb->model.cv[p][X] == +1) {
-	    /* ibuf = ibuf(jc, kc, n, p, ...) */
-	    int ibuf = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + ip + 2 * plane * nxdist;
-	    int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-	    lb->f[la] = lb->sbuff[ibuf];
-	    ip++;
-	  }
-	}
-      }
+  assert(s);
+  assert(1 <= jc && jc <= (s->nlocal[Y] + 1));
+  assert(1 <= kc && kc <= s->nlocal[Z]);
+  assert(0 <= iplane && iplane < s->nplane);
+  assert(0 <= iside  && iside <= 1);
+  assert(0 <= n && n < s->ndist);
+  assert(0 <= p && p < s->nprop);
 
-      /* Other direction */
+  ib = p + s->nprop*(n + s->ndist*(iplane + s->nplane*
+				   (kc - 1 + s->nlocal[Z]*(jc - 1))));
 
-      ic = lees_edw_plane_location(le, plane) + 1;
-      index0 = lees_edw_index(le, ic, jc, kc);
+  /* All same sides are together for all planes */
+  ib = iside*s->nxdist*s->nplane + ib;
 
-      for (int n = 0; n < ndist; n++) {
-	int ip = 0;
-	for (int p = 1; p < lb->model.nvel; p++) {
-	  if (lb->model.cv[p][X] == -1) {
-	    int ibuf = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + ip + (2*plane + 1) * nxdist;
-	    int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-	    lb->f[la] = lb->sbuff[ibuf];
-	    ip++;
-	  }
-	}
-      }
-    }
-  }
+  assert(0 <= ib && ib < s->nxbuff);
 
-  return;
+  return ib;
 }
 
 /*****************************************************************************
@@ -1004,14 +991,13 @@ __global__ void lb_data_reproject_kernel(kernel_3d_t k3d,
 
       for (int n = 0; n < lekh.ndist; n++) {
 
-	int ip = 0;
 	double rho = 0.0;
 	double g[3] = {0};
 	double ds[3][3] = {0};
 
 	/* Compute 0th and 1st moments */
 	lb_dist_enum_t ndn = (lb_dist_enum_t) n;
-	/* EXPAND ? */
+	/* Could expand these ... */
 	lb_0th_moment(lb, index, ndn, &rho);
 	lb_1st_moment(lb, index, ndn, g);
 
@@ -1021,8 +1007,10 @@ __global__ void lb_data_reproject_kernel(kernel_3d_t k3d,
 	  }
 	}
 
-	/* Now update the distribution */
-	for (int p = 1; p < lb->model.nvel; p++) {
+	/* Now for relevant distributions ... */
+	for (int ip = 0; ip < lekh.nprop; ip++) {
+
+	  int p = lekh.prop[iside][ip];
 
 	  double cs2 = lb->model.cs2;
 	  double rcs2 = 1.0/cs2;
@@ -1030,25 +1018,23 @@ __global__ void lb_data_reproject_kernel(kernel_3d_t k3d,
 	  double udotc = du[Y]*lb->model.cv[p][Y];
 	  double sdotq = 0.0;
 
-	  if (lb->model.cv[p][X] != cx) continue;
+	  assert(lb->model.cv[p][X] == cx);
 
 	  for (int ia = 0; ia < 3; ia++) {
 	    for (int ib = 0; ib < 3; ib++) {
-	      double dab = cs2 * (ia == ib);
-	      double q = (lb->model.cv[p][ia] * lb->model.cv[p][ib] - dab);
-	      sdotq += ds[ia][ib] * q;
+	      double dab = cs2*(ia == ib);
+	      double q = (lb->model.cv[p][ia]*lb->model.cv[p][ib] - dab);
+	      sdotq += ds[ia][ib]*q;
 	    }
 	  }
 
 	  /* Project all this back to the distribution. */
 	  {
-	    int ibuf = my_ibuf(&lekh, jc, kc, iplane, iside, n, ip);
+	    int ibuf = le_ibuf(&lekh, jc, kc, iplane, iside, n, ip);
 	    double f = 0.0;
 	    lb_f(lb, index, p, n, &f);
-	    f += lb->model.wv[p] * (rho*udotc*rcs2 + 0.5*sdotq*rcs2*rcs2);
-	    /* REPLACE lb_f_set(lb, index, p, n, f); BY */
+	    f += lb->model.wv[p]*(rho*udotc*rcs2 + 0.5*sdotq*rcs2*rcs2);
 	    lb->sbuff[ibuf] = f;
-	    ++ip;
 	  }
 	}
       }
@@ -1060,88 +1046,223 @@ __global__ void lb_data_reproject_kernel(kernel_3d_t k3d,
 
 /*****************************************************************************
  *
- *  lb_le_apply_boundary_conditions
+ *  lb_data_displace_kernel
  *
- *  Driver for the parallel update.
+ *  Displace/copy send buffer (1 <= jc <= nlocal[Y]) to the recv buffer
+ *  (1 <= jc <= nlocal[Y] + 1).
+ *
+ *  Version where there is no communication via MPI (i.e., serial).
  *
  *****************************************************************************/
 
-__host__ int lb_le_apply_boundary_conditions(lb_t * lb, lees_edw_t * le) {
+__global__ void lb_data_displace_kernel(kernel_3d_t k3d,
+					le_kernel_helper_t lekh,
+					lb_t * lb,
+					lees_edw_t * le, double t) {
+  int kindex = 0;
+  int nhalo = 0;
+  double ltot[3] = {0};
+
+  lees_edw_nhalo(le, &nhalo);
+  lees_edw_ltot(le, ltot);
+
+  for_simt_parallel(kindex, k3d.kiterations, 1) {
+
+    int ix = kernel_3d_ic(&k3d, kindex); /* encodes plane, side */
+    int jc = kernel_3d_jc(&k3d, kindex);
+    int kc = kernel_3d_kc(&k3d, kindex);
+
+    if (jc <= lekh.nlocal[Y]+1 && kc <= lekh.nlocal[Z] && ix < 2*lekh.nplane) {
+
+      int iplane = ix / 2; assert(0 <= iplane && iplane < lekh.nplane);
+      int iside  = ix % 2; assert(iside == 0 || iside == 1);
+      int cx     = 1 - 2*iside; /* below going up, or above going down */
+
+      /* buffer location; js is the displaced position needed ... */
+      int ic = iside + lees_edw_plane_location(le, iplane);
+      int js = -1;
+      int dj = -1;
+      double dy = 0.0;
+
+      lees_edw_buffer_displacement(le, nhalo, t, &dy);
+      dj = floor(fmod(dy*cx, ltot[Y]));
+
+      js = 1 + (jc + dj - 1 + 2*lekh.nlocal[Y]) % lekh.nlocal[Y];
+      assert(1 <= js && js <= lekh.nlocal[Y]);
+
+      for (int n = 0; n < lekh.ndist; n++) {
+	for (int ip = 0; ip < lekh.nprop; ip++) {
+	  int isend = le_ibuf(&lekh, js, kc, iplane, iside, n, ip);
+	  int irecv = le_ibuf(&lekh, jc, kc, iplane, iside, n, ip);
+	  lb->rbuff[irecv] = lb->sbuff[isend];
+	}
+      }
+    }
+  }
+
+  return;
+}
+
+/*****************************************************************************
+ *
+ *  lb_data_displace_communicate
+ *
+ *  General displacement procedure when MPI is required. The displacement
+ *  is always an integer number of latttice sites in the y-direction.
+ *
+ *****************************************************************************/
+
+static int lb_data_displace_communicate(le_kernel_helper_t lekh,
+					lb_t * lb,
+					lees_edw_t * le,
+					double t) {
+  const int tag1 = 3102;
+  const int tag2 = 3103;
+  const int tag3 = 3104;
+  const int tag4 = 3105;
+
+  double * sbuff = NULL;  /* Send buffer (previously reprojected values) */
+  double * rbuff = NULL;  /* Recv buffer (to be interpolated) */
+
+  int nhalo = 0;
+  int ntotal[3]  = {0};
+  int offset[3]  = {0};
+  int nrank_s[2] = {0};
+  int nrank_r[2] = {0};
+  MPI_Comm comm = MPI_COMM_NULL;
+  MPI_Request req[8] = {0};
 
   assert(lb);
   assert(le);
 
-  int mpi_cartsz[3] = {0};
-  le_kernel_helper_t lekh = le_kernel_helper(lb, le);
+  int nrowdata = lekh.nlocal[Z]*lekh.nplane*lekh.ndist*lekh.nprop;
 
-  lees_edw_cartsz(le, mpi_cartsz);
+  /* If there is GPU-aware MPI just communicate the GPU buffers; if
+   * not, copy in relevant direction at the start and finish */
 
-  if (lekh.nplane > 0) {
-    int ndevice = 0;
-    lees_edw_t * le_target;
-    double t = -1.0;
+  if (have_gpu_aware_mpi_) {
+    tdpAssert( tdpMemcpy(&sbuff, &lb->target->sbuff, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpy(&rbuff, &lb->target->rbuff, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+  }
+  else {
+    int nbuffsz = lekh.nxbuff*sizeof(double);
+    double * target = NULL;
+    sbuff = lb->sbuff;
+    rbuff = lb->rbuff;
+    tdpAssert( tdpMemcpy(&target, &lb->target->sbuff, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpy(sbuff, target, nbuffsz, tdpMemcpyDeviceToHost) );
+  }
 
-    TIMER_start(TIMER_LE);
+  lees_edw_comm(le, &comm);
+  lees_edw_ntotal(le, ntotal);
+  lees_edw_nlocal_offset(le, offset);
+  lees_edw_nhalo(le, &nhalo);
 
-    tdpAssert( tdpGetDeviceCount(&ndevice) );
-    lees_edw_target(le, &le_target);
+  /* For each plane, there are 4 sends and 4 recvs; two each for each
+   * side of the plane. The construction  of the buffer is such that
+   * we can treat all the planes for a given side in one go. */
 
-    /* Require the time t = time step */
-    {
-      physics_t * phys = NULL;
-      physics_ref(&phys);
-      t = 1.0*physics_control_timestep(phys);
-    }
+  /* A total of 8 requests; 2 sends and 2 recvs for side = 0, and
+   * 2 sends and 2 revcs for side = 1, which communicate in different
+   * directions... */
 
-    /* First, set up the interpolation buffer with the relevant elements of
-     * the distribution, i.e., side below the nplane propagating up, and
-     * side above plane propagating down. ("Up" and "down" in x-direction) */
+  /* Each process sends a total of (nlocal[Y] + 1) y values all taken
+   * from 1 <= jc <= nlocal[Y]; the values at j = j0 = jrow2 are sent to
+   * both destinations. */
 
-    {
-      int  nx   = 2*lekh.nplane; /* Two x positions (sides) for each plane */
-      dim3 nblk = {0};
-      dim3 ntpb = {0};
-      cs_limits_t lim = {0, nx - 1, 1, lekh.nlocal[Y], 1, lekh.nlocal[Z]};
-      kernel_3d_t k3d = kernel_3d(lb->cs, lim);
+  {
+    int iside = 0;
+    int jdy = -1;
+    double dy = 0.0;
+    lees_edw_buffer_displacement(le, nhalo, t, &dy);
+    dy  = fmod(dy, 1.0*ntotal[Y]);
+    jdy = floor(dy);
 
-      kernel_3d_launch_param(k3d.kiterations, &nblk, &ntpb);
+    /* Starting y coordinate is j0: 1 <= j0 <= ntotal[y] */
 
-      tdpLaunchKernel(lb_data_reproject_kernel, nblk, ntpb, 0, 0,
-		      k3d, lekh, lb->target, le_target, t);
-      tdpAssert( tdpPeekAtLastError() );
-      tdpAssert( tdpStreamSynchronize(0) );
-    }
+    int j0 = 1 + (offset[Y] + jdy + 2*ntotal[Y]) % ntotal[Y];
+    lees_edw_jstart_to_mpi_ranks(le, j0, nrank_s, nrank_r);
 
-    /* Swap the send and recv buffer */
+    j0        = 1 + (j0 - 1) % lekh.nlocal[Y]; /* 1 <= j0 <= nlocal[Y] */
+    int jrow1 = lekh.nlocal[Y] + 1 - j0;
+    int jrow2 = j0;
 
-    tdpAssert( tdpMemcpy(lb->target->rbuff, lb->target->sbuff,
-			 2*lekh.nxdist*sizeof(double),
-			 tdpMemcpyDeviceToDevice) );
+    int n1 = jrow1*nrowdata;
+    int n2 = jrow2*nrowdata;
 
-    /* Displacement and interpolation to replace distributions */
-    if (mpi_cartsz[Y] > 1) {
-      assert(0); /* PENDING comms for send to recv buffer */
-    }
-    else {
-      int  nx   = lekh.nplane;
-      dim3 nblk = {0};
-      dim3 ntpb = {0};
-      cs_limits_t lim = {0, nx - 1, 1, lekh.nlocal[Y], 1, lekh.nlocal[Z]};
-      kernel_3d_t k3d = kernel_3d(lb->cs, lim);
+    /* Post the receives (are disjoint) */
+    /* 1 -> jrow1 incl., and 1 + jrow1 -> nlocal[Y] + 1 incl. */
 
-      kernel_3d_launch_param(k3d.kiterations, &nblk, &ntpb);
+    int ibuf1 = le_ibuf(&lekh, 1,         1, 0, iside, 0, 0);
+    int ibuf2 = le_ibuf(&lekh, 1 + jrow1, 1, 0, iside, 0, 0);
 
-      /* Two kernels to provide synchronisation: interpolate to temporary
-       * and then can copy back values to the distribution itself. */
+    MPI_Irecv(rbuff + ibuf1, n1, MPI_DOUBLE, nrank_r[0], tag1, comm, req + 0);
+    MPI_Irecv(rbuff + ibuf2, n2, MPI_DOUBLE, nrank_r[1], tag2, comm, req + 1);
 
-      tdpLaunchKernel(lb_data_displace_and_interpolate, nblk, ntpb, 0, 0,
-		      k3d, lekh, lb->target, le_target, t);
+    /* Post sends (overlap at jrow2) */
+    /* jrow2 -> nlocal[Y] incl., and 1 -> jrow2 incl. */
 
-      tdpAssert( tdpPeekAtLastError() );
-      tdpAssert( tdpStreamSynchronize(0) );
-    }
+    ibuf1 = le_ibuf(&lekh, jrow2, 1, 0, iside, 0, 0);
+    ibuf2 = le_ibuf(&lekh,     1, 1, 0, iside, 0, 0);
 
-    TIMER_stop(TIMER_LE);
+    MPI_Isend(sbuff + ibuf1, n1, MPI_DOUBLE, nrank_s[0], tag1, comm, req + 2);
+    MPI_Isend(sbuff + ibuf2, n2, MPI_DOUBLE, nrank_s[1], tag2, comm, req + 3);
+  }
+
+  /* Other side */
+  {
+    int iside = 1;
+    int jdy = -1;
+    double dy = 0.0;
+
+    lees_edw_buffer_displacement(le, nhalo, t, &dy);
+    dy  = fmod(-dy, 1.0*ntotal[Y]); /* note sign */
+    jdy = floor(dy);
+
+    /* Starting y coordinate (global address): range 1 <= j1 <= ntotal[Y] */
+
+    int j0 = 1 + (offset[Y] + jdy + 2*ntotal[Y]) % ntotal[Y];
+    lees_edw_jstart_to_mpi_ranks(le, j0, nrank_s, nrank_r);
+
+    j0        = 1 + (j0 - 1) % lekh.nlocal[Y];
+    int jrow1 = lekh.nlocal[Y] + 1 - j0;
+    int jrow2 = j0;
+
+    int n1 = jrow1*nrowdata;
+    int n2 = jrow2*nrowdata;
+
+    /* Post the receives */
+
+    int ibuf1 = le_ibuf(&lekh, 1,         1, 0, iside, 0, 0);
+    int ibuf2 = le_ibuf(&lekh, 1 + jrow1, 1, 0, iside, 0, 0);
+
+    MPI_Irecv(rbuff + ibuf1, n1, MPI_DOUBLE, nrank_r[0], tag3, comm, req + 4);
+    MPI_Irecv(rbuff + ibuf2, n2, MPI_DOUBLE, nrank_r[1], tag4, comm, req + 5);
+
+    /* Post sends */
+
+    ibuf1 = le_ibuf(&lekh, jrow2, 1, 0, iside, 0, 0);
+    ibuf2 = le_ibuf(&lekh,     1, 1, 0, iside, 0, 0);
+
+    MPI_Isend(sbuff + ibuf1, n1, MPI_DOUBLE, nrank_s[0], tag3, comm, req + 6);
+    MPI_Isend(sbuff + ibuf2, n2, MPI_DOUBLE, nrank_s[1], tag4, comm, req + 7);
+  }
+
+  /* Complete */
+  MPI_Waitall(8, req, MPI_STATUSES_IGNORE);
+
+  if (have_gpu_aware_mpi_) {
+    /* No further action */
+  }
+  else {
+    int nbuffsz = lekh.nxbuff*sizeof(double);
+    double * target = NULL;
+    tdpAssert( tdpMemcpy(&target, &lb->target->rbuff, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpy(target, rbuff, nbuffsz, tdpMemcpyHostToDevice) );
   }
 
   return 0;
@@ -1149,27 +1270,66 @@ __host__ int lb_le_apply_boundary_conditions(lb_t * lb, lees_edw_t * le) {
 
 /*****************************************************************************
  *
- *  le_kernel_helper
+ *  lb_data_interpolate_kernel
+ *
+ *  Interpolate the final recv buffer to reset plane-crossing distribiutions.
+ *  The linear interpolation is always between jc and jc+1 for
+ *  1 <= jc <= nlocal[Y], ie., an appropriate displacement
+ *  of the send buffer should have occurred. A consistent
+ *  fractional part of a lattice spacing is used.
+ *
+ *  Always linear interpoaltion to conserve mass.
  *
  *****************************************************************************/
 
-static le_kernel_helper_t le_kernel_helper(lb_t * lb, lees_edw_t * le) {
+__global__ void lb_data_interpolate_kernel(kernel_3d_t k3d,
+					   le_kernel_helper_t lekh,
+					   lb_t * lb,
+					   lees_edw_t * le, double t) {
+  int kindex = 0;
+  int nhalo = 0;
+  double ltot[3] = {0};
 
-  le_kernel_helper_t lekh = {0};
+  lees_edw_nhalo(le, &nhalo);
+  lees_edw_ltot(le, ltot);
 
-  assert(le);
+  for_simt_parallel(kindex, k3d.kiterations, 1) {
 
-  lees_edw_nlocal(le, lekh.nlocal);
-  lekh.nplane = lees_edw_nplane_local(le);
+    int ix = kernel_3d_ic(&k3d, kindex); /* encodes plane, side */
+    int jc = kernel_3d_jc(&k3d, kindex);
+    int kc = kernel_3d_kc(&k3d, kindex);
 
-  for (int p = 1; p < lb->model.nvel; p++) {
-    if (lb->model.cv[p][X] == +1) lekh.nprop += 1;
-  }
+    if (jc <= lekh.nlocal[Y] && kc <= lekh.nlocal[Z] && ix < 2*lekh.nplane) {
 
-  /* FIXME: nxdist to include the number of planes here but ncrossdist
-   * does not include the number of planes */
-  lekh.ndist  = lb->ndist;
-  lekh.nxdist = lekh.ndist*lekh.nprop*lekh.nplane*lekh.nlocal[Y]*lekh.nlocal[Z];
+      int iplane = ix / 2; assert(0 <= iplane && iplane < lekh.nplane);
+      int iside  = ix % 2; assert(iside == 0 || iside == 1);
+      int cx     = 1 - 2*iside; /* below going up, or above going down */
 
-  return lekh;
+      /* buffer location, fractional part of the displacement, ... */
+      int ic = iside + lees_edw_plane_location(le, iplane);
+      int jdy = 0;
+
+      double dy = 0.0;
+      double fr = 0.0;
+
+      lees_edw_buffer_displacement(le, nhalo, t, &dy);
+      dy = fmod(dy*cx, ltot[Y]);
+      jdy = floor(dy);
+      fr = dy - jdy;
+
+      int index0 = lees_edw_index(le, ic, jc, kc);
+
+      for (int n = 0; n < lekh.ndist; n++) {
+	for (int ip = 0; ip < lekh.nprop; ip++) {
+	  int p = lekh.prop[iside][ip];
+	  int ibuf0 = le_ibuf(&lekh, jc,     kc, iplane, iside, n, ip);
+	  int ibuf1 = le_ibuf(&lekh, jc + 1, kc, iplane, iside, n, ip);
+	  double f = (1.0 - fr)*lb->rbuff[ibuf0] + fr*lb->rbuff[ibuf1];
+	  lb_f_set(lb, index0, p, n, f);
+	}
+      }
+    }
+  }
+
+  return;
 }
diff --git a/src/model_le.h b/src/model_le.h
index 546013975..01f285ad4 100644
--- a/src/model_le.h
+++ b/src/model_le.h
@@ -5,7 +5,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2009-2022 The University of Edinburgh
+ *  (c) 2009-2024 The University of Edinburgh
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
  *****************************************************************************/
@@ -16,7 +16,7 @@
 #include "lb_data.h"
 #include "leesedwards.h"
 
-int lb_le_apply_boundary_conditions(lb_t * lb, lees_edw_t * le);
+int lb_data_apply_le_boundary_conditions(lb_t * lb, lees_edw_t * le);
 int lb_le_init_shear_profile(lb_t * lb, lees_edw_t * le);
 
 #endif