diff --git a/potentials/ilp/ilp_tmd_sw_ILP_MoS2_oywg_2018.txt b/potentials/ilp/ilp_tmd_sw_ILP_MoS2_oywg_2018.txt
new file mode 100644
index 000000000..bd5064b8e
--- /dev/null
+++ b/potentials/ilp/ilp_tmd_sw_ILP_MoS2_oywg_2018.txt
@@ -0,0 +1,5 @@
+ilp_tmd_sw 2 Mo S
+5.57945037 9.37766246 2.02722246 144.15177543 97.97857017 89.43759676 2.05903055 5.12205490 491850.31619468 1.0 4.0 16.0
+3.62715212 19.97137510 7.58503123 76.10193100 3.31749559 45.72032846 0.94747037 4.41042503 150597.85771629 1.0 4.0 16.0
+3.62715212 19.97137510 7.58503123 76.10193100 3.31749559 45.72032846 0.94747037 4.41042503 150597.85771629 1.0 4.0 16.0
+3.16140219 8.09326275 1.95314020 4.58676418 118.06546646 58.80941584 0.21536658 4.29960013 148811.24340892 1.0 4.0 16.0
diff --git a/potentials/ilp/ilp_tmd_sw_SW_MoS2_jjw_2018.txt b/potentials/ilp/ilp_tmd_sw_SW_MoS2_jjw_2018.txt
new file mode 100644
index 000000000..023e715cf
--- /dev/null
+++ b/potentials/ilp/ilp_tmd_sw_SW_MoS2_jjw_2018.txt
@@ -0,0 +1,11 @@
+0.000 1.000 1.000 1.000 1.000
+6.918 7.223 2.523 1.252 1.000
+0.000 1.000 1.000 1.000 1.000
+0.000 0.143
+0.000 0.143
+0.000 0.143
+67.883 0.143
+62.449 0.143
+0.000 0.143
+0.000 0.143
+0.000 0.143
\ No newline at end of file
diff --git a/src/force/force.cu b/src/force/force.cu
index 8901fa81c..3313e9257 100644
--- a/src/force/force.cu
+++ b/src/force/force.cu
@@ -25,6 +25,7 @@ The driver class calculating force and related quantities.
#include "tersoff1988.cuh"
#include "tersoff1989.cuh"
#include "tersoff_mini.cuh"
+#include "ilp_tmd_sw.cuh"
#include "utilities/common.cuh"
#include "utilities/error.cuh"
#include "utilities/read_file.cuh"
@@ -131,6 +132,13 @@ void Force::parse_potential(
check_types(param[1]);
} else if (strcmp(potential_name, "lj") == 0) {
potential.reset(new LJ(fid_potential, num_types, number_of_atoms));
+ } else if (strcmp(potential_name, "ilp_tmd_sw") == 0) {
+ if (num_param != 3) {
+ PRINT_INPUT_ERROR("potential should contain ILP potential file and SW potential file.\n");
+ }
+ FILE* fid_sw = my_fopen(param[2], "r");
+ potential.reset(new ILP_TMD_SW(fid_potential, fid_sw, num_types, number_of_atoms));
+ fclose(fid_sw);
} else {
PRINT_INPUT_ERROR("illegal potential model.\n");
}
@@ -477,6 +485,10 @@ void Force::compute(
potential_per_atom,
force_per_atom,
virial_per_atom);
+ } else if (1 == potentials[0]->ilp_flag) {
+ // compute the potential with ILP
+ potentials[0]->compute_ilp(
+ box, type, position_per_atom, potential_per_atom, force_per_atom, virial_per_atom, group);
} else {
potentials[0]->compute(
box, type, position_per_atom, potential_per_atom, force_per_atom, virial_per_atom);
@@ -494,6 +506,10 @@ void Force::compute(
potential_per_atom,
force_per_atom,
virial_per_atom);
+ } else if (1 == potentials[i]->ilp_flag) {
+ // compute the potential with ILP
+ potentials[i]->compute_ilp(
+ box, type, position_per_atom, potential_per_atom, force_per_atom, virial_per_atom, group);
} else {
potentials[i]->compute(
box, type, position_per_atom, potential_per_atom, force_per_atom, virial_per_atom);
@@ -764,6 +780,10 @@ void Force::compute(
potential_per_atom,
force_per_atom,
virial_per_atom);
+ } else if (1 == potentials[0]->ilp_flag) {
+ // compute the potential with ILP
+ potentials[0]->compute_ilp(
+ box, type, position_per_atom, potential_per_atom, force_per_atom, virial_per_atom, group);
} else {
potentials[0]->compute(
box, type, position_per_atom, potential_per_atom, force_per_atom, virial_per_atom);
@@ -781,6 +801,10 @@ void Force::compute(
potential_per_atom,
force_per_atom,
virial_per_atom);
+ } else if (1 == potentials[i]->ilp_flag) {
+ // compute the potential with ILP
+ potentials[i]->compute_ilp(
+ box, type, position_per_atom, potential_per_atom, force_per_atom, virial_per_atom, group);
} else {
potentials[i]->compute(
box, type, position_per_atom, potential_per_atom, force_per_atom, virial_per_atom);
diff --git a/src/force/ilp_tmd_sw.cu b/src/force/ilp_tmd_sw.cu
new file mode 100644
index 000000000..931f47348
--- /dev/null
+++ b/src/force/ilp_tmd_sw.cu
@@ -0,0 +1,1527 @@
+/*
+ Copyright 2017 Zheyong Fan and GPUMD development team
+ This file is part of GPUMD.
+ GPUMD is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ GPUMD is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+ You should have received a copy of the GNU General Public License
+ along with GPUMD. If not, see .
+*/
+
+/*----------------------------------------------------------------------------80
+The class dealing with the interlayer potential(ILP) and SW.
+TODO:
+------------------------------------------------------------------------------*/
+
+#include "ilp_tmd_sw.cuh"
+#include "neighbor.cuh"
+#include "utilities/error.cuh"
+#include "utilities/common.cuh"
+
+#define BLOCK_SIZE_FORCE 128
+
+// there are most 6 intra-layer neighbors for TMD
+#define NNEI 6
+
+
+ILP_TMD_SW::ILP_TMD_SW(FILE* fid_ilp, FILE* fid_sw, int num_types, int num_atoms)
+{
+ // read ILP TMD potential parameter
+ printf("Use %d-element ILP potential with elements:\n", num_types);
+ if (!(num_types >= 1 && num_types <= MAX_TYPE_ILP_TMD_SW)) {
+ PRINT_INPUT_ERROR("Incorrect type number of ILP_TMD_SW parameters.\n");
+ }
+ for (int n = 0; n < num_types; ++n) {
+ char atom_symbol[10];
+ int count = fscanf(fid_ilp, "%s", atom_symbol);
+ PRINT_SCANF_ERROR(count, 1, "Reading error for ILP_TMD_SW potential.");
+ printf(" %s", atom_symbol);
+ }
+ printf("\n");
+
+ // read parameters
+ float beta, alpha, delta, epsilon, C, d, sR;
+ float reff, C6, S, rcut_ilp, rcut_global;
+ rc = 0.0;
+ for (int n = 0; n < num_types; ++n) {
+ for (int m = 0; m < num_types; ++m) {
+ int count = fscanf(fid_ilp, "%f%f%f%f%f%f%f%f%f%f%f%f", \
+ &beta, &alpha, &delta, &epsilon, &C, &d, &sR, &reff, &C6, &S, \
+ &rcut_ilp, &rcut_global);
+ PRINT_SCANF_ERROR(count, 12, "Reading error for ILP_TMD_SW potential.");
+
+ ilp_para.C[n][m] = C;
+ ilp_para.C_6[n][m] = C6;
+ ilp_para.d[n][m] = d;
+ ilp_para.d_Seff[n][m] = d / sR / reff;
+ ilp_para.epsilon[n][m] = epsilon;
+ ilp_para.z0[n][m] = beta;
+ ilp_para.lambda[n][m] = alpha / beta;
+ ilp_para.delta2inv[n][m] = 1.0 / (delta * delta);
+ ilp_para.S[n][m] = S;
+ ilp_para.rcutsq_ilp[n][m] = rcut_ilp * rcut_ilp;
+ ilp_para.rcut_global[n][m] = rcut_global;
+ float meV = 1e-3 * S;
+ ilp_para.C[n][m] *= meV;
+ ilp_para.C_6[n][m] *= meV;
+ ilp_para.epsilon[n][m] *= meV;
+
+ if (rc < rcut_global)
+ rc = rcut_global;
+ }
+ }
+
+ // read SW potential parameter
+ if (num_types == 1) {
+ initialize_sw_1985_1(fid_sw);
+ }
+ if (num_types == 2) {
+ initialize_sw_1985_2(fid_sw);
+ }
+ if (num_types == 3) {
+ initialize_sw_1985_3(fid_sw);
+ }
+
+ // initialize neighbor lists and some temp vectors
+ int max_neighbor_number = min(num_atoms, CUDA_MAX_NL_TMD);
+ ilp_data.NN.resize(num_atoms);
+ ilp_data.NL.resize(num_atoms * max_neighbor_number);
+ ilp_data.cell_count.resize(num_atoms);
+ ilp_data.cell_count_sum.resize(num_atoms);
+ ilp_data.cell_contents.resize(num_atoms);
+
+ // init ilp neighbor list
+ ilp_data.ilp_NN.resize(num_atoms);
+ ilp_data.ilp_NL.resize(num_atoms * MAX_ILP_NEIGHBOR_TMD);
+ ilp_data.reduce_NL.resize(num_atoms * max_neighbor_number);
+ ilp_data.big_ilp_NN.resize(num_atoms);
+ ilp_data.big_ilp_NL.resize(num_atoms * MAX_BIG_ILP_NEIGHBOR_TMD);
+
+ ilp_data.f12x.resize(num_atoms * max_neighbor_number);
+ ilp_data.f12y.resize(num_atoms * max_neighbor_number);
+ ilp_data.f12z.resize(num_atoms * max_neighbor_number);
+
+ ilp_data.f12x_ilp_neigh.resize(num_atoms * MAX_ILP_NEIGHBOR_TMD);
+ ilp_data.f12y_ilp_neigh.resize(num_atoms * MAX_ILP_NEIGHBOR_TMD);
+ ilp_data.f12z_ilp_neigh.resize(num_atoms * MAX_ILP_NEIGHBOR_TMD);
+
+ // intialize sw neighbor list
+ sw2_data.NN.resize(num_atoms);
+ sw2_data.NL.resize(num_atoms * 1024); // the largest supported by CUDA
+ sw2_data.cell_count.resize(num_atoms);
+ sw2_data.cell_count_sum.resize(num_atoms);
+ sw2_data.cell_contents.resize(num_atoms);
+
+ // memory for the partial forces dU_i/dr_ij
+ const int num_of_neighbors = MAX_SW_NEIGHBOR_NUM * num_atoms;
+ sw2_data.f12x.resize(num_of_neighbors);
+ sw2_data.f12y.resize(num_of_neighbors);
+ sw2_data.f12z.resize(num_of_neighbors);
+
+ // init constant cutoff coeff
+ float h_tap_coeff[8] = \
+ {1.0f, 0.0f, 0.0f, 0.0f, -35.0f, 84.0f, -70.0f, 20.0f};
+ cudaMemcpyToSymbol(Tap_coeff_tmd, h_tap_coeff, 8 * sizeof(float));
+ CUDA_CHECK_KERNEL
+
+ // set ilp_flag to 1
+ ilp_flag = 1;
+}
+
+ILP_TMD_SW::~ILP_TMD_SW(void)
+{
+ // nothing
+}
+
+void ILP_TMD_SW::initialize_sw_1985_1(FILE* fid)
+{
+ printf("Use single-element Stillinger-Weber potential.\n");
+ int count;
+ double epsilon, lambda, A, B, a, gamma, sigma, cos0;
+ count =
+ fscanf(fid, "%lf%lf%lf%lf%lf%lf%lf%lf", &epsilon, &lambda, &A, &B, &a, &gamma, &sigma, &cos0);
+ PRINT_SCANF_ERROR(count, 8, "Reading error for SW potential.");
+
+ sw2_para.A[0][0] = epsilon * A;
+ sw2_para.B[0][0] = B;
+ sw2_para.a[0][0] = a;
+ sw2_para.sigma[0][0] = sigma;
+ sw2_para.gamma[0][0] = gamma;
+ sw2_para.rc[0][0] = sigma * a;
+ rc_sw = sw2_para.rc[0][0];
+ sw2_para.lambda[0][0][0] = epsilon * lambda;
+ sw2_para.cos0[0][0][0] = cos0;
+}
+
+void ILP_TMD_SW::initialize_sw_1985_2(FILE* fid)
+{
+ printf("Use two-element Stillinger-Weber potential.\n");
+ int count;
+
+ // 2-body parameters and the force cutoff
+ double A[3], B[3], a[3], sigma[3], gamma[3];
+ rc_sw = 0.0;
+ for (int n = 0; n < 3; n++) {
+ count = fscanf(fid, "%lf%lf%lf%lf%lf", &A[n], &B[n], &a[n], &sigma[n], &gamma[n]);
+ PRINT_SCANF_ERROR(count, 5, "Reading error for SW potential.");
+ }
+ for (int n1 = 0; n1 < 2; n1++)
+ for (int n2 = 0; n2 < 2; n2++) {
+ sw2_para.A[n1][n2] = A[n1 + n2];
+ sw2_para.B[n1][n2] = B[n1 + n2];
+ sw2_para.a[n1][n2] = a[n1 + n2];
+ sw2_para.sigma[n1][n2] = sigma[n1 + n2];
+ sw2_para.gamma[n1][n2] = gamma[n1 + n2];
+ sw2_para.rc[n1][n2] = sigma[n1 + n2] * a[n1 + n2];
+ if (rc_sw < sw2_para.rc[n1][n2])
+ rc_sw = sw2_para.rc[n1][n2];
+ }
+
+ // 3-body parameters
+ double lambda, cos0;
+ for (int n1 = 0; n1 < 2; n1++)
+ for (int n2 = 0; n2 < 2; n2++)
+ for (int n3 = 0; n3 < 2; n3++) {
+ count = fscanf(fid, "%lf%lf", &lambda, &cos0);
+ PRINT_SCANF_ERROR(count, 2, "Reading error for SW potential.");
+ sw2_para.lambda[n1][n2][n3] = lambda;
+ sw2_para.cos0[n1][n2][n3] = cos0;
+ }
+}
+
+void ILP_TMD_SW::initialize_sw_1985_3(FILE* fid)
+{
+ printf("Use three-element Stillinger-Weber potential.\n");
+ int count;
+
+ // 2-body parameters and the force cutoff
+ double A, B, a, sigma, gamma;
+ rc_sw = 0.0;
+ for (int n1 = 0; n1 < 3; n1++)
+ for (int n2 = 0; n2 < 3; n2++) {
+ count = fscanf(fid, "%lf%lf%lf%lf%lf", &A, &B, &a, &sigma, &gamma);
+ PRINT_SCANF_ERROR(count, 5, "Reading error for SW potential.");
+ sw2_para.A[n1][n2] = A;
+ sw2_para.B[n1][n2] = B;
+ sw2_para.a[n1][n2] = a;
+ sw2_para.sigma[n1][n2] = sigma;
+ sw2_para.gamma[n1][n2] = gamma;
+ sw2_para.rc[n1][n2] = sigma * a;
+ if (rc_sw < sw2_para.rc[n1][n2])
+ rc_sw = sw2_para.rc[n1][n2];
+ }
+
+ // 3-body parameters
+ double lambda, cos0;
+ for (int n1 = 0; n1 < 3; n1++) {
+ for (int n2 = 0; n2 < 3; n2++) {
+ for (int n3 = 0; n3 < 3; n3++) {
+ count = fscanf(fid, "%lf%lf", &lambda, &cos0);
+ PRINT_SCANF_ERROR(count, 2, "Reading error for SW potential.");
+ sw2_para.lambda[n1][n2][n3] = lambda;
+ sw2_para.cos0[n1][n2][n3] = cos0;
+ }
+ }
+ }
+}
+
+static __device__ __forceinline__ float calc_Tap(const float r_ij, const float Rcutinv)
+{
+ float Tap, r;
+
+ r = r_ij * Rcutinv;
+ if (r >= 1.0f) {
+ Tap = 0.0f;
+ } else {
+ Tap = Tap_coeff_tmd[7];
+ for (int i = 6; i >= 0; --i) {
+ Tap = Tap * r + Tap_coeff_tmd[i];
+ }
+ }
+
+ return Tap;
+}
+
+// calculate the derivatives of long-range cutoff term
+static __device__ __forceinline__ float calc_dTap(const float r_ij, const float Rcut, const float Rcutinv)
+{
+ float dTap, r;
+
+ r = r_ij * Rcutinv;
+ if (r >= Rcut) {
+ dTap = 0.0f;
+ } else {
+ dTap = 7.0f * Tap_coeff_tmd[7];
+ for (int i = 6; i > 0; --i) {
+ dTap = dTap * r + i * Tap_coeff_tmd[i];
+ }
+ dTap *= Rcutinv;
+ }
+
+ return dTap;
+}
+
+// create ILP neighbor list from main neighbor list to calculate normals
+static __global__ void ILP_neighbor(
+ const int number_of_particles,
+ const int N1,
+ const int N2,
+ const Box box,
+ const int *g_neighbor_number,
+ const int *g_neighbor_list,
+ const int *g_type,
+ ILP_TMD_Para ilp_para,
+ const double* __restrict__ g_x,
+ const double* __restrict__ g_y,
+ const double* __restrict__ g_z,
+ int *ilp_neighbor_number,
+ int *ilp_neighbor_list,
+ const int *group_label)
+{
+ int n1 = blockIdx.x * blockDim.x + threadIdx.x + N1; // particle index
+
+ if (n1 < N2) {
+ // TMD
+ int neighptr[10], check[10], neighsort[10];
+ for (int ll = 0; ll < 10; ++ll) {
+ neighptr[ll] = -1;
+ neighsort[ll] = -1;
+ check[ll] = -1;
+ }
+
+ int count = 0;
+ int neighbor_number = g_neighbor_number[n1];
+ int type1 = g_type[n1];
+ double x1 = g_x[n1];
+ double y1 = g_y[n1];
+ double z1 = g_z[n1];
+
+ for (int i1 = 0; i1 < neighbor_number; ++i1) {
+ int n2 = g_neighbor_list[n1 + number_of_particles * i1];
+ int type2 = g_type[n2];
+
+ double x12 = g_x[n2] - x1;
+ double y12 = g_y[n2] - y1;
+ double z12 = g_z[n2] - z1;
+ apply_mic(box, x12, y12, z12);
+ double d12sq = x12 * x12 + y12 * y12 + z12 * z12;
+ double rcutsq = ilp_para.rcutsq_ilp[type1][type2];
+
+
+ if (group_label[n1] == group_label[n2] && d12sq < rcutsq && type1 == type2 && d12sq != 0) {
+ // ilp_neighbor_list[count++ * number_of_particles + n1] = n2;
+ neighptr[count++] = n2;
+ }
+ }
+
+ // TMD
+ for (int ll = 0; ll < count; ++ll) {
+ neighsort[ll] = neighptr[ll];
+ check[ll] = neighptr[ll];
+ }
+
+ // TMD
+ if (count == NNEI) {
+ neighsort[0] = neighptr[0];
+ check[0] = -1;
+ } else if (count < NNEI && count > 0) {
+ for (int jj = 0; jj < count; ++jj) {
+ int j = neighptr[jj];
+ int jtype = g_type[j];
+ int count_temp = 0;
+ for (int ll = 0; ll < count; ++ll) {
+ int l = neighptr[ll];
+ int ltype = g_type[l];
+ if (l == j) continue;
+ double deljx = g_x[l] - g_x[j];
+ double deljy = g_y[l] - g_y[j];
+ double deljz = g_z[l] - g_z[j];
+ apply_mic(box, deljx, deljy, deljz);
+ double rsqlj = deljx * deljx + deljy * deljy + deljz * deljz;
+ if (rsqlj != 0 && rsqlj < ilp_para.rcutsq_ilp[ltype][jtype]) {
+ ++count_temp;
+ }
+
+ }
+ if (count_temp == 1) {
+ neighsort[0] = neighptr[jj];
+ check[jj] = -1;
+ break;
+ }
+ }
+ } else if (count > NNEI) {
+ printf("ERROR in ILP NEIGHBOR LIST\n");
+ printf("\n===== ILP neighbor number[%d] is greater than 6 =====\n", count);
+ return;
+ }
+
+ // TMD
+ // sort the order of neighbors of atom n1
+ for (int jj = 0; jj < count; ++jj) {
+ int j = neighsort[jj];
+ int jtype = g_type[j];
+ int ll = 0;
+ while (ll < count) {
+ int l = neighptr[ll];
+ if (check[ll] == -1) {
+ ++ll;
+ continue;
+ }
+ int ltype = g_type[l];
+ double deljx = g_x[l] - g_x[j];
+ double deljy = g_y[l] - g_y[j];
+ double deljz = g_z[l] - g_z[j];
+ apply_mic(box, deljx, deljy, deljz);
+ double rsqlj = deljx * deljx + deljy * deljy + deljz * deljz;
+
+ if (abs(rsqlj) >= 1e-6 && rsqlj < ilp_para.rcutsq_ilp[ltype][jtype]) {
+ neighsort[jj + 1] = l;
+ check[ll] = -1;
+ break;
+ }
+ ++ll;
+ }
+ }
+ ilp_neighbor_number[n1] = count;
+ for (int jj = 0; jj < count; ++jj) {
+ ilp_neighbor_list[jj * number_of_particles + n1] = neighsort[jj];
+ }
+ }
+}
+
+// modulo func to change atom index
+static __device__ __forceinline__ int modulo(int k, int range)
+{
+ return (k + range) % range;
+}
+
+// calculate the normals and its derivatives
+static __device__ void calc_normal(
+ float (&vect)[NNEI][3],
+ int cont,
+ float (&normal)[3],
+ float (&dnormdri)[3][3],
+ float (&dnormal)[3][NNEI][3])
+{
+ int id, ip, m;
+ float dni[3];
+ float dnn[3][3], dpvdri[3][3];
+ float Nave[3], pvet[NNEI][3], dpvet1[NNEI][3][3], dpvet2[NNEI][3][3], dNave[3][NNEI][3];
+
+ float nninv;
+
+ // initialize the arrays
+ for (id = 0; id < 3; id++) {
+ dni[id] = 0.0f;
+
+ Nave[id] = 0.0f;
+ for (ip = 0; ip < 3; ip++) {
+ dpvdri[ip][id] = 0.0f;
+ for (m = 0; m < NNEI; m++) {
+ dnn[m][id] = 0.0f;
+ pvet[m][id] = 0.0f;
+ dpvet1[m][ip][id] = 0.0f;
+ dpvet2[m][ip][id] = 0.0f;
+ dNave[id][m][ip] = 0.0f;
+ }
+ }
+ }
+
+ if (cont <= 1) {
+ normal[0] = 0.0f;
+ normal[1] = 0.0f;
+ normal[2] = 1.0f;
+ for (id = 0; id < 3; ++id) {
+ for (ip = 0; ip < 3; ++ip) {
+ dnormdri[id][ip] = 0.0f;
+ for (m = 0; m < NNEI; ++m) {
+ dnormal[id][m][ip] = 0.0f;
+ }
+ }
+ }
+ } else if (cont > 1 && cont < NNEI) {
+ for (int k = 0; k < cont - 1; ++k) {
+ for (ip = 0; ip < 3; ++ip) {
+ pvet[k][ip] = vect[k][modulo(ip + 1, 3)] * vect[k + 1][modulo(ip + 2, 3)] -
+ vect[k][modulo(ip + 2, 3)] * vect[k + 1][modulo(ip + 1, 3)];
+ }
+ // dpvet1[k][l][ip]: the derivatve of the k (=0,...cont-1)th Nik respect to the ip component of atom l
+ // derivatives respect to atom l
+ // dNik,x/drl
+ dpvet1[k][0][0] = 0.0f;
+ dpvet1[k][0][1] = vect[modulo(k + 1, NNEI)][2];
+ dpvet1[k][0][2] = -vect[modulo(k + 1, NNEI)][1];
+ // dNik,y/drl
+ dpvet1[k][1][0] = -vect[modulo(k + 1, NNEI)][2];
+ dpvet1[k][1][1] = 0.0f;
+ dpvet1[k][1][2] = vect[modulo(k + 1, NNEI)][0];
+ // dNik,z/drl
+ dpvet1[k][2][0] = vect[modulo(k + 1, NNEI)][1];
+ dpvet1[k][2][1] = -vect[modulo(k + 1, NNEI)][0];
+ dpvet1[k][2][2] = 0.0f;
+
+ // dpvet2[k][l][ip]: the derivatve of the k (=0,...cont-1)th Nik respect to the ip component of atom l+1
+ // derivatives respect to atom l+1
+ // dNik,x/drl+1
+ dpvet2[k][0][0] = 0.0f;
+ dpvet2[k][0][1] = -vect[modulo(k, NNEI)][2];
+ dpvet2[k][0][2] = vect[modulo(k, NNEI)][1];
+ // dNik,y/drl+1
+ dpvet2[k][1][0] = vect[modulo(k, NNEI)][2];
+ dpvet2[k][1][1] = 0.0f;
+ dpvet2[k][1][2] = -vect[modulo(k, NNEI)][0];
+ // dNik,z/drl+1
+ dpvet2[k][2][0] = -vect[modulo(k, NNEI)][1];
+ dpvet2[k][2][1] = vect[modulo(k, NNEI)][0];
+ dpvet2[k][2][2] = 0.0f;
+ }
+
+ // average the normal vectors by using the NNEI neighboring planes
+ for (ip = 0; ip < 3; ip++) {
+ Nave[ip] = 0.0f;
+ for (int k = 0; k < cont - 1; k++) {
+ Nave[ip] += pvet[k][ip];
+ }
+ Nave[ip] /= (cont - 1);
+ }
+ nninv = rnorm3df(Nave[0], Nave[1], Nave[2]);
+
+ // the unit normal vector
+ normal[0] = Nave[0] * nninv;
+ normal[1] = Nave[1] * nninv;
+ normal[2] = Nave[2] * nninv;
+
+ // derivatives of non-normalized normal vector, dNave:3xcontx3 array
+ // dNave[id][m][ip]: the derivatve of the id component of Nave respect to the ip component of atom m
+ for (id = 0; id < 3; id++) {
+ for (ip = 0; ip < 3; ip++) {
+ for (m = 0; m < cont; m++) {
+ if (m == 0) {
+ dNave[id][m][ip] = dpvet1[m][id][ip] / (cont - 1);
+ } else if (m == cont - 1) {
+ dNave[id][m][ip] = dpvet2[m - 1][id][ip] / (cont - 1);
+ } else { // sum of the derivatives of the mth and (m-1)th normal vector respect to the atom m
+ dNave[id][m][ip] = (dpvet1[m][id][ip] + dpvet2[m - 1][id][ip]) / (cont - 1);
+ }
+ }
+ }
+ }
+ // derivatives of nn, dnn:contx3 vector
+ // dnn[m][id]: the derivative of nn respect to r[m][id], m=0,...NNEI-1; id=0,1,2
+ // r[m][id]: the id's component of atom m
+ for (m = 0; m < cont; m++) {
+ for (id = 0; id < 3; id++) {
+ dnn[m][id] = (Nave[0] * dNave[0][m][id] + Nave[1] * dNave[1][m][id] +
+ Nave[2] * dNave[2][m][id]) * nninv;
+ }
+ }
+ // dnormal[i][id][m][ip]: the derivative of normal[i][id] respect to r[m][ip], id,ip=0,1,2.
+ // for atom m, which is a neighbor atom of atom i, m = 0,...,NNEI-1
+ for (m = 0; m < cont; m++) {
+ for (id = 0; id < 3; id++) {
+ for (ip = 0; ip < 3; ip++) {
+ dnormal[id][m][ip] = dNave[id][m][ip] * nninv - Nave[id] * dnn[m][ip] * nninv * nninv;
+ }
+ }
+ }
+ // Calculte dNave/dri, defined as dpvdri
+ for (id = 0; id < 3; id++) {
+ for (ip = 0; ip < 3; ip++) {
+ dpvdri[id][ip] = 0.0;
+ for (int k = 0; k < cont; k++) {
+ dpvdri[id][ip] -= dNave[id][k][ip];
+ }
+ }
+ }
+
+ // derivatives of nn, dnn:3x1 vector
+ dni[0] = (Nave[0] * dpvdri[0][0] + Nave[1] * dpvdri[1][0] + Nave[2] * dpvdri[2][0]) * nninv;
+ dni[1] = (Nave[0] * dpvdri[0][1] + Nave[1] * dpvdri[1][1] + Nave[2] * dpvdri[2][1]) * nninv;
+ dni[2] = (Nave[0] * dpvdri[0][2] + Nave[1] * dpvdri[1][2] + Nave[2] * dpvdri[2][2]) * nninv;
+ // derivatives of unit vector ni respect to ri, the result is 3x3 matrix
+ for (id = 0; id < 3; id++) {
+ for (ip = 0; ip < 3; ip++) {
+ dnormdri[id][ip] = dpvdri[id][ip] * nninv - Nave[id] * dni[ip] * nninv * nninv;
+ }
+ }
+ } else if (cont == NNEI) {
+ // derivatives of Ni[l] respect to the NNEI neighbors
+ for (int k = 0; k < NNEI; ++k) {
+ for (ip = 0; ip < 3; ++ip) {
+ pvet[k][ip] = vect[modulo(k, NNEI)][modulo(ip + 1, 3)] *
+ vect[modulo(k + 1, NNEI)][modulo(ip + 2, 3)] -
+ vect[modulo(k, NNEI)][modulo(ip + 2, 3)] *
+ vect[modulo(k + 1, NNEI)][modulo(ip + 1, 3)];
+ }
+ // dpvet1[k][l][ip]: the derivatve of the k (=0,...cont-1)th Nik respect to the ip component of atom l
+ // derivatives respect to atom l
+ // dNik,x/drl
+ dpvet1[k][0][0] = 0.0f;
+ dpvet1[k][0][1] = vect[modulo(k + 1, NNEI)][2];
+ dpvet1[k][0][2] = -vect[modulo(k + 1, NNEI)][1];
+ // dNik,y/drl
+ dpvet1[k][1][0] = -vect[modulo(k + 1, NNEI)][2];
+ dpvet1[k][1][1] = 0.0f;
+ dpvet1[k][1][2] = vect[modulo(k + 1, NNEI)][0];
+ // dNik,z/drl
+ dpvet1[k][2][0] = vect[modulo(k + 1, NNEI)][1];
+ dpvet1[k][2][1] = -vect[modulo(k + 1, NNEI)][0];
+ dpvet1[k][2][2] = 0.0f;
+
+ // dpvet2[k][l][ip]: the derivatve of the k (=0,...cont-1)th Nik respect to the ip component of atom l+1
+ // derivatives respect to atom l+1
+ // dNik,x/drl+1
+ dpvet2[k][0][0] = 0.0f;
+ dpvet2[k][0][1] = -vect[modulo(k, NNEI)][2];
+ dpvet2[k][0][2] = vect[modulo(k, NNEI)][1];
+ // dNik,y/drl+1
+ dpvet2[k][1][0] = vect[modulo(k, NNEI)][2];
+ dpvet2[k][1][1] = 0.0f;
+ dpvet2[k][1][2] = -vect[modulo(k, NNEI)][0];
+ // dNik,z/drl+1
+ dpvet2[k][2][0] = -vect[modulo(k, NNEI)][1];
+ dpvet2[k][2][1] = vect[modulo(k, NNEI)][0];
+ dpvet2[k][2][2] = 0.0f;
+ }
+
+ // average the normal vectors by using the NNEI neighboring planes
+ for (ip = 0; ip < 3; ++ip) {
+ Nave[ip] = 0.0f;
+ for (int k = 0; k < NNEI; ++k) {
+ Nave[ip] += pvet[k][ip];
+ }
+ Nave[ip] /= NNEI;
+ }
+ // the magnitude of the normal vector
+ // nn2 = Nave[0] * Nave[0] + Nave[1] * Nave[1] + Nave[2] * Nave[2];
+ nninv = rnorm3df(Nave[0], Nave[1], Nave[2]);
+ // the unit normal vector
+ normal[0] = Nave[0] * nninv;
+ normal[1] = Nave[1] * nninv;
+ normal[2] = Nave[2] * nninv;
+
+ // for the central atoms, dnormdri is always zero
+ for (id = 0; id < 3; ++id) {
+ for (ip = 0; ip < 3; ++ip) {
+ dnormdri[id][ip] = 0.0f;
+ }
+ }
+
+ // derivatives of non-normalized normal vector, dNave:3xNNEIx3 array
+ // dNave[id][m][ip]: the derivatve of the id component of Nave respect to the ip component of atom m
+ for (id = 0; id < 3; ++id) {
+ for (ip = 0; ip < 3; ++ip) {
+ for (
+ m = 0; m < NNEI;
+ ++m) { // sum of the derivatives of the mth and (m-1)th normal vector respect to the atom m
+ dNave[id][m][ip] =
+ (dpvet1[modulo(m, NNEI)][id][ip] + dpvet2[modulo(m - 1, NNEI)][id][ip]) / NNEI;
+ }
+ }
+ }
+ // derivatives of nn, dnn:NNEIx3 vector
+ // dnn[m][id]: the derivative of nn respect to r[m][id], m=0,...NNEI-1; id=0,1,2
+ // r[m][id]: the id's component of atom m
+ for (m = 0; m < NNEI; ++m) {
+ for (id = 0; id < 3; ++id) {
+ dnn[m][id] =
+ (Nave[0] * dNave[0][m][id] + Nave[1] * dNave[1][m][id] + Nave[2] * dNave[2][m][id]) *
+ nninv;
+ }
+ }
+ // dnormal[i][id][m][ip]: the derivative of normal[i][id] respect to r[m][ip], id,ip=0,1,2.
+ // for atom m, which is a neighbor atom of atom i, m = 0,...,NNEI-1
+ for (m = 0; m < NNEI; ++m) {
+ for (id = 0; id < 3; ++id) {
+ for (ip = 0; ip < 3; ++ip) {
+ dnormal[id][m][ip] = dNave[id][m][ip] * nninv - Nave[id] * dnn[m][ip] * nninv * nninv;
+ }
+ }
+ }
+ } else {
+ printf("\n===== ILP neighbor number[%d] is greater than 6 =====\n", cont);
+ return;
+ }
+}
+
+// calculate the van der Waals force and energy
+static __device__ void calc_vdW(
+ float r,
+ float rinv,
+ float rsq,
+ float d,
+ float d_Seff,
+ float C_6,
+ float Tap,
+ float dTap,
+ float &p2_vdW,
+ float &f2_vdW)
+{
+ float r2inv, r6inv, r8inv;
+ float TSvdw, TSvdwinv, Vilp;
+ float fpair, fsum;
+
+ r2inv = 1.0f / rsq;
+ r6inv = r2inv * r2inv * r2inv;
+ r8inv = r2inv * r6inv;
+
+ // TSvdw = 1.0 + exp(-d_Seff * r + d);
+ TSvdw = 1.0f + expf(-d_Seff * r + d);
+ TSvdwinv = 1.0f / TSvdw;
+ Vilp = -C_6 * r6inv * TSvdwinv;
+
+ // derivatives
+ // fpair = -6.0 * C_6 * r8inv * TSvdwinv + \
+ // C_6 * d_Seff * (TSvdw - 1.0) * TSvdwinv * TSvdwinv * r8inv * r;
+ fpair = (-6.0f + d_Seff * (TSvdw - 1.0f) * TSvdwinv * r ) * C_6 * TSvdwinv * r8inv;
+ fsum = fpair * Tap - Vilp * dTap * rinv;
+
+ p2_vdW = Tap * Vilp;
+ f2_vdW = fsum;
+
+}
+
+// force evaluation kernel
+static __global__ void gpu_find_force(
+ ILP_TMD_Para ilp_para,
+ const int number_of_particles,
+ const int N1,
+ const int N2,
+ const Box box,
+ const int *g_neighbor_number,
+ const int *g_neighbor_list,
+ int *g_ilp_neighbor_number,
+ int *g_ilp_neighbor_list,
+ const int *group_label,
+ const int *g_type,
+ const double *__restrict__ g_x,
+ const double *__restrict__ g_y,
+ const double *__restrict__ g_z,
+ double *g_fx,
+ double *g_fy,
+ double *g_fz,
+ double *g_virial,
+ double *g_potential,
+ float *g_f12x,
+ float *g_f12y,
+ float *g_f12z,
+ float *g_f12x_ilp_neigh,
+ float *g_f12y_ilp_neigh,
+ float *g_f12z_ilp_neigh)
+{
+ int n1 = blockIdx.x * blockDim.x + threadIdx.x + N1; // particle index
+ float s_fx = 0.0f; // force_x
+ float s_fy = 0.0f; // force_y
+ float s_fz = 0.0f; // force_z
+ float s_pe = 0.0f; // potential energy
+ float s_sxx = 0.0f; // virial_stress_xx
+ float s_sxy = 0.0f; // virial_stress_xy
+ float s_sxz = 0.0f; // virial_stress_xz
+ float s_syx = 0.0f; // virial_stress_yx
+ float s_syy = 0.0f; // virial_stress_yy
+ float s_syz = 0.0f; // virial_stress_yz
+ float s_szx = 0.0f; // virial_stress_zx
+ float s_szy = 0.0f; // virial_stress_zy
+ float s_szz = 0.0f; // virial_stress_zz
+
+ float r = 0.0f;
+ float rsq = 0.0f;
+ float Rcut = 0.0f;
+
+ if (n1 < N2) {
+ double x12d, y12d, z12d;
+ float x12f, y12f, z12f;
+ int neighor_number = g_neighbor_number[n1];
+ int type1 = g_type[n1];
+ double x1 = g_x[n1];
+ double y1 = g_y[n1];
+ double z1 = g_z[n1];
+
+ float delkix_half[NNEI] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
+ float delkiy_half[NNEI] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
+ float delkiz_half[NNEI] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
+
+ // calculate the normal
+ int cont = 0;
+ float normal[3];
+ float dnormdri[3][3];
+ float dnormal[3][NNEI][3];
+
+ float vet[NNEI][3];
+ int id, ip, m;
+ for (id = 0; id < 3; ++id) {
+ normal[id] = 0.0f;
+ for (ip = 0; ip < 3; ++ip) {
+ dnormdri[ip][id] = 0.0f;
+ for (m = 0; m < NNEI; ++m) {
+ dnormal[id][m][ip] = 0.0f;
+ vet[m][id] = 0.0f;
+ }
+ }
+ }
+
+ int ilp_neighbor_number = g_ilp_neighbor_number[n1];
+ for (int i1 = 0; i1 < ilp_neighbor_number; ++i1) {
+ int n2_ilp = g_ilp_neighbor_list[n1 + number_of_particles * i1];
+ x12d = g_x[n2_ilp] - x1;
+ y12d = g_y[n2_ilp] - y1;
+ z12d = g_z[n2_ilp] - z1;
+ apply_mic(box, x12d, y12d, z12d);
+ vet[cont][0] = float(x12d);
+ vet[cont][1] = float(y12d);
+ vet[cont][2] = float(z12d);
+ ++cont;
+
+ delkix_half[i1] = float(x12d) * 0.5f;
+ delkiy_half[i1] = float(y12d) * 0.5f;
+ delkiz_half[i1] = float(z12d) * 0.5f;
+ }
+
+ calc_normal(vet, cont, normal, dnormdri, dnormal);
+
+ // calculate energy and force
+ double tt1,tt2,tt3;
+ for (int i1 = 0; i1 < neighor_number; ++i1) {
+ int index = n1 + number_of_particles * i1;
+ int n2 = g_neighbor_list[index];
+ int type2 = g_type[n2];
+
+ tt1 = g_x[n2];
+ tt2 = g_y[n2];
+ tt3 = g_z[n2];
+ x12d = tt1 - x1;
+ y12d = tt2 - y1;
+ z12d = tt3 - z1;
+ apply_mic(box, x12d, y12d, z12d);
+
+ // save x12, y12, z12 in float
+ x12f = float(x12d);
+ y12f = float(y12d);
+ z12f = float(z12d);
+
+ // calculate distance between atoms
+ rsq = x12f * x12f + y12f * y12f + z12f * z12f;
+ r = sqrtf(rsq);
+ Rcut = ilp_para.rcut_global[type1][type2];
+
+ if (r >= Rcut) {
+ continue;
+ }
+
+ // calc att
+ float Tap, dTap, rinv;
+ float Rcutinv = 1.0f / Rcut;
+ rinv = 1.0f / r;
+ Tap = calc_Tap(r, Rcutinv);
+ dTap = calc_dTap(r, Rcut, Rcutinv);
+
+ float p2_vdW, f2_vdW;
+ calc_vdW(
+ r,
+ rinv,
+ rsq,
+ ilp_para.d[type1][type2],
+ ilp_para.d_Seff[type1][type2],
+ ilp_para.C_6[type1][type2],
+ Tap,
+ dTap,
+ p2_vdW,
+ f2_vdW);
+
+ float f12x = -f2_vdW * x12f * 0.5f;
+ float f12y = -f2_vdW * y12f * 0.5f;
+ float f12z = -f2_vdW * z12f * 0.5f;
+ float f21x = -f12x;
+ float f21y = -f12y;
+ float f21z = -f12z;
+
+ s_fx += f12x - f21x;
+ s_fy += f12y - f21y;
+ s_fz += f12z - f21z;
+
+ s_pe += p2_vdW * 0.5f;
+ s_sxx += x12f * f21x;
+ s_sxy += x12f * f21y;
+ s_sxz += x12f * f21z;
+ s_syx += y12f * f21x;
+ s_syy += y12f * f21y;
+ s_syz += y12f * f21z;
+ s_szx += z12f * f21x;
+ s_szy += z12f * f21y;
+ s_szz += z12f * f21z;
+
+
+ // calc rep
+ float C = ilp_para.C[type1][type2];
+ float lambda_ = ilp_para.lambda[type1][type2];
+ float delta2inv = ilp_para.delta2inv[type1][type2];
+ float epsilon = ilp_para.epsilon[type1][type2];
+ float z0 = ilp_para.z0[type1][type2];
+ // calc_rep
+ float prodnorm1, rhosq1, rdsq1, exp0, exp1, frho1, Erep, Vilp;
+ float fpair, fpair1, fsum, delx, dely, delz, fkcx, fkcy, fkcz;
+ float dprodnorm1[3] = {0.0f, 0.0f, 0.0f};
+ float fp1[3] = {0.0f, 0.0f, 0.0f};
+ float fprod1[3] = {0.0f, 0.0f, 0.0f};
+ float fk[3] = {0.0f, 0.0f, 0.0f};
+
+ delx = -x12f;
+ dely = -y12f;
+ delz = -z12f;
+
+ float delx_half = delx * 0.5f;
+ float dely_half = dely * 0.5f;
+ float delz_half = delz * 0.5f;
+
+ // calculate the transverse distance
+ prodnorm1 = normal[0] * delx + normal[1] * dely + normal[2] * delz;
+ rhosq1 = rsq - prodnorm1 * prodnorm1;
+ rdsq1 = rhosq1 * delta2inv;
+
+ // store exponents
+ // exp0 = exp(-lambda_ * (r - z0));
+ // exp1 = exp(-rdsq1);
+ exp0 = expf(-lambda_ * (r - z0));
+ exp1 = expf(-rdsq1);
+
+ frho1 = exp1 * C;
+ Erep = 0.5f * epsilon + frho1;
+ Vilp = exp0 * Erep;
+
+ // derivatives
+ fpair = lambda_ * exp0 * rinv * Erep;
+ fpair1 = 2.0f * exp0 * frho1 * delta2inv;
+ fsum = fpair + fpair1;
+
+ float prodnorm1_m_fpair1 = prodnorm1 * fpair1;
+ float Vilp_m_dTap_m_rinv = Vilp * dTap * rinv;
+
+ // derivatives of the product of rij and ni, the resutl is a vector
+ dprodnorm1[0] =
+ dnormdri[0][0] * delx + dnormdri[1][0] * dely + dnormdri[2][0] * delz;
+ dprodnorm1[1] =
+ dnormdri[0][1] * delx + dnormdri[1][1] * dely + dnormdri[2][1] * delz;
+ dprodnorm1[2] =
+ dnormdri[0][2] * delx + dnormdri[1][2] * dely + dnormdri[2][2] * delz;
+ // fp1[0] = prodnorm1 * normal[0] * fpair1;
+ // fp1[1] = prodnorm1 * normal[1] * fpair1;
+ // fp1[2] = prodnorm1 * normal[2] * fpair1;
+ // fprod1[0] = prodnorm1 * dprodnorm1[0] * fpair1;
+ // fprod1[1] = prodnorm1 * dprodnorm1[1] * fpair1;
+ // fprod1[2] = prodnorm1 * dprodnorm1[2] * fpair1;
+ fp1[0] = prodnorm1_m_fpair1 * normal[0];
+ fp1[1] = prodnorm1_m_fpair1 * normal[1];
+ fp1[2] = prodnorm1_m_fpair1 * normal[2];
+ fprod1[0] = prodnorm1_m_fpair1 * dprodnorm1[0];
+ fprod1[1] = prodnorm1_m_fpair1 * dprodnorm1[1];
+ fprod1[2] = prodnorm1_m_fpair1 * dprodnorm1[2];
+
+ // fkcx = (delx * fsum - fp1[0]) * Tap - Vilp * dTap * delx * rinv;
+ // fkcy = (dely * fsum - fp1[1]) * Tap - Vilp * dTap * dely * rinv;
+ // fkcz = (delz * fsum - fp1[2]) * Tap - Vilp * dTap * delz * rinv;
+ fkcx = (delx * fsum - fp1[0]) * Tap - Vilp_m_dTap_m_rinv * delx;
+ fkcy = (dely * fsum - fp1[1]) * Tap - Vilp_m_dTap_m_rinv * dely;
+ fkcz = (delz * fsum - fp1[2]) * Tap - Vilp_m_dTap_m_rinv * delz;
+
+ s_fx += fkcx - fprod1[0] * Tap;
+ s_fy += fkcy - fprod1[1] * Tap;
+ s_fz += fkcz - fprod1[2] * Tap;
+
+ g_f12x[index] = fkcx;
+ g_f12y[index] = fkcy;
+ g_f12z[index] = fkcz;
+
+ float minus_prodnorm1_m_fpair1_m_Tap = -prodnorm1 * fpair1 * Tap;
+ for (int kk = 0; kk < ilp_neighbor_number; ++kk) {
+ // for (int kk = 0; kk < 0; ++kk) {
+ // int index_ilp = n1 + number_of_particles * kk;
+ // int n2_ilp = g_ilp_neighbor_list[index_ilp];
+ // derivatives of the product of rij and ni respect to rk, k=0,1,2, where atom k is the neighbors of atom i
+ dprodnorm1[0] = dnormal[0][kk][0] * delx + dnormal[1][kk][0] * dely +
+ dnormal[2][kk][0] * delz;
+ dprodnorm1[1] = dnormal[0][kk][1] * delx + dnormal[1][kk][1] * dely +
+ dnormal[2][kk][1] * delz;
+ dprodnorm1[2] = dnormal[0][kk][2] * delx + dnormal[1][kk][2] * dely +
+ dnormal[2][kk][2] * delz;
+ // fk[0] = (-prodnorm1 * dprodnorm1[0] * fpair1) * Tap;
+ // fk[1] = (-prodnorm1 * dprodnorm1[1] * fpair1) * Tap;
+ // fk[2] = (-prodnorm1 * dprodnorm1[2] * fpair1) * Tap;
+ fk[0] = minus_prodnorm1_m_fpair1_m_Tap * dprodnorm1[0];
+ fk[1] = minus_prodnorm1_m_fpair1_m_Tap * dprodnorm1[1];
+ fk[2] = minus_prodnorm1_m_fpair1_m_Tap * dprodnorm1[2];
+
+ g_f12x_ilp_neigh[n1 + number_of_particles * kk] += fk[0];
+ g_f12y_ilp_neigh[n1 + number_of_particles * kk] += fk[1];
+ g_f12z_ilp_neigh[n1 + number_of_particles * kk] += fk[2];
+
+ // delki[0] = g_x[n2_ilp] - x1;
+ // delki[1] = g_y[n2_ilp] - y1;
+ // delki[2] = g_z[n2_ilp] - z1;
+ // apply_mic(box, delki[0], delki[1], delki[2]);
+
+ // s_sxx += delki[0] * fk[0] * 0.5;
+ // s_sxy += delki[0] * fk[1] * 0.5;
+ // s_sxz += delki[0] * fk[2] * 0.5;
+ // s_syx += delki[1] * fk[0] * 0.5;
+ // s_syy += delki[1] * fk[1] * 0.5;
+ // s_syz += delki[1] * fk[2] * 0.5;
+ // s_szx += delki[2] * fk[0] * 0.5;
+ // s_szy += delki[2] * fk[1] * 0.5;
+ // s_szz += delki[2] * fk[2] * 0.5;
+ s_sxx += delkix_half[kk] * fk[0];
+ s_sxy += delkix_half[kk] * fk[1];
+ s_sxz += delkix_half[kk] * fk[2];
+ s_syx += delkiy_half[kk] * fk[0];
+ s_syy += delkiy_half[kk] * fk[1];
+ s_syz += delkiy_half[kk] * fk[2];
+ s_szx += delkiz_half[kk] * fk[0];
+ s_szy += delkiz_half[kk] * fk[1];
+ s_szz += delkiz_half[kk] * fk[2];
+ }
+ s_pe += Tap * Vilp;
+ s_sxx += delx_half * fkcx;
+ s_sxy += delx_half * fkcy;
+ s_sxz += delx_half * fkcz;
+ s_syx += dely_half * fkcx;
+ s_syy += dely_half * fkcy;
+ s_syz += dely_half * fkcz;
+ s_szx += delz_half * fkcx;
+ s_szy += delz_half * fkcy;
+ s_szz += delz_half * fkcz;
+ }
+
+ // save force
+ g_fx[n1] += s_fx;
+ g_fy[n1] += s_fy;
+ g_fz[n1] += s_fz;
+
+ // save virial
+ // xx xy xz 0 3 4
+ // yx yy yz 6 1 5
+ // zx zy zz 7 8 2
+ g_virial[n1 + 0 * number_of_particles] += s_sxx;
+ g_virial[n1 + 1 * number_of_particles] += s_syy;
+ g_virial[n1 + 2 * number_of_particles] += s_szz;
+ g_virial[n1 + 3 * number_of_particles] += s_sxy;
+ g_virial[n1 + 4 * number_of_particles] += s_sxz;
+ g_virial[n1 + 5 * number_of_particles] += s_syz;
+ g_virial[n1 + 6 * number_of_particles] += s_syx;
+ g_virial[n1 + 7 * number_of_particles] += s_szx;
+ g_virial[n1 + 8 * number_of_particles] += s_szy;
+
+ // save potential
+ g_potential[n1] += s_pe;
+
+ }
+}
+
+// build a neighbor list for reducing force
+static __global__ void build_reduce_neighbor_list(
+ const int number_of_particles,
+ const int N1,
+ const int N2,
+ const int *g_neighbor_number,
+ const int *g_neighbor_list,
+ int *g_reduce_neighbor_list)
+{
+ int n1 = blockIdx.x * blockDim.x + threadIdx.x + N1;
+ if (N1 < N2) {
+ int neighbor_number = g_neighbor_number[n1];
+ int l, r, m, tmp_value;
+ for (int i1 = 0; i1 < neighbor_number; ++i1) {
+ int index = n1 + i1 * number_of_particles;
+ int n2 = g_neighbor_list[index];
+
+ l = 0;
+ r = g_neighbor_number[n2];
+ while (l < r) {
+ m = (l + r) >> 1;
+ tmp_value = g_neighbor_list[n2 + number_of_particles * m];
+ if (tmp_value < n1) {
+ l = m + 1;
+ } else if (tmp_value > n1) {
+ r = m - 1;
+ } else {
+ break;
+ }
+ }
+ g_reduce_neighbor_list[index] = (l + r) >> 1;
+ }
+ }
+}
+
+// reduce the rep force
+static __global__ void reduce_force_many_body(
+ const int number_of_particles,
+ const int N1,
+ const int N2,
+ const Box box,
+ const int *g_neighbor_number,
+ const int *g_neighbor_list,
+ int *g_reduce_neighbor_list,
+ int *g_ilp_neighbor_number,
+ int *g_ilp_neighbor_list,
+ const double *__restrict__ g_x,
+ const double *__restrict__ g_y,
+ const double *__restrict__ g_z,
+ double *g_fx,
+ double *g_fy,
+ double *g_fz,
+ double *g_virial,
+ float *g_f12x,
+ float *g_f12y,
+ float *g_f12z,
+ float *g_f12x_ilp_neigh,
+ float *g_f12y_ilp_neigh,
+ float *g_f12z_ilp_neigh)
+{
+ int n1 = blockIdx.x * blockDim.x + threadIdx.x + N1; // particle index
+ float s_fx = 0.0f; // force_x
+ float s_fy = 0.0f; // force_y
+ float s_fz = 0.0f; // force_z
+ float s_sxx = 0.0f; // virial_stress_xx
+ float s_sxy = 0.0f; // virial_stress_xy
+ float s_sxz = 0.0f; // virial_stress_xz
+ float s_syx = 0.0f; // virial_stress_yx
+ float s_syy = 0.0f; // virial_stress_yy
+ float s_syz = 0.0f; // virial_stress_yz
+ float s_szx = 0.0f; // virial_stress_zx
+ float s_szy = 0.0f; // virial_stress_zy
+ float s_szz = 0.0f; // virial_stress_zz
+
+
+ if (n1 < N2) {
+ double x12d, y12d, z12d;
+ float x12f, y12f, z12f;
+ int neighbor_number_1 = g_neighbor_number[n1];
+ double x1 = g_x[n1];
+ double y1 = g_y[n1];
+ double z1 = g_z[n1];
+
+ // calculate energy and force
+ for (int i1 = 0; i1 < neighbor_number_1; ++i1) {
+ int index = n1 + number_of_particles * i1;
+ int n2 = g_neighbor_list[index];
+
+ x12d = g_x[n2] - x1;
+ y12d = g_y[n2] - y1;
+ z12d = g_z[n2] - z1;
+ apply_mic(box, x12d, y12d, z12d);
+ x12f = float(x12d);
+ y12f = float(y12d);
+ z12f = float(z12d);
+
+ index = n2 + number_of_particles * g_reduce_neighbor_list[index];
+ float f21x = g_f12x[index];
+ float f21y = g_f12y[index];
+ float f21z = g_f12z[index];
+
+ s_fx -= f21x;
+ s_fy -= f21y;
+ s_fz -= f21z;
+
+ // per-atom virial
+ s_sxx += x12f * f21x * 0.5f;
+ s_sxy += x12f * f21y * 0.5f;
+ s_sxz += x12f * f21z * 0.5f;
+ s_syx += y12f * f21x * 0.5f;
+ s_syy += y12f * f21y * 0.5f;
+ s_syz += y12f * f21z * 0.5f;
+ s_szx += z12f * f21x * 0.5f;
+ s_szy += z12f * f21y * 0.5f;
+ s_szz += z12f * f21z * 0.5f;
+ }
+
+ int ilp_neighbor_number_1 = g_ilp_neighbor_number[n1];
+
+ for (int i1 = 0; i1 < ilp_neighbor_number_1; ++i1) {
+ int index = n1 + number_of_particles * i1;
+ int n2 = g_ilp_neighbor_list[index];
+ int ilp_neighor_number_2 = g_ilp_neighbor_number[n2];
+
+ x12d = g_x[n2] - x1;
+ y12d = g_y[n2] - y1;
+ z12d = g_z[n2] - z1;
+ apply_mic(box, x12d, y12d, z12d);
+ x12f = float(x12d);
+ y12f = float(y12d);
+ z12f = float(z12d);
+
+ int offset = 0;
+ for (int k = 0; k < ilp_neighor_number_2; ++k) {
+ if (n1 == g_ilp_neighbor_list[n2 + number_of_particles * k]) {
+ offset = k;
+ break;
+ }
+ }
+ index = n2 + number_of_particles * offset;
+ float f21x = g_f12x_ilp_neigh[index];
+ float f21y = g_f12y_ilp_neigh[index];
+ float f21z = g_f12z_ilp_neigh[index];
+
+ s_fx += f21x;
+ s_fy += f21y;
+ s_fz += f21z;
+
+ // per-atom virial
+ s_sxx += -x12f * f21x * 0.5f;
+ s_sxy += -x12f * f21y * 0.5f;
+ s_sxz += -x12f * f21z * 0.5f;
+ s_syx += -y12f * f21x * 0.5f;
+ s_syy += -y12f * f21y * 0.5f;
+ s_syz += -y12f * f21z * 0.5f;
+ s_szx += -z12f * f21x * 0.5f;
+ s_szy += -z12f * f21y * 0.5f;
+ s_szz += -z12f * f21z * 0.5f;
+ }
+
+ // save force
+ g_fx[n1] += s_fx;
+ g_fy[n1] += s_fy;
+ g_fz[n1] += s_fz;
+
+ // save virial
+ // xx xy xz 0 3 4
+ // yx yy yz 6 1 5
+ // zx zy zz 7 8 2
+ g_virial[n1 + 0 * number_of_particles] += s_sxx;
+ g_virial[n1 + 1 * number_of_particles] += s_syy;
+ g_virial[n1 + 2 * number_of_particles] += s_szz;
+ g_virial[n1 + 3 * number_of_particles] += s_sxy;
+ g_virial[n1 + 4 * number_of_particles] += s_sxz;
+ g_virial[n1 + 5 * number_of_particles] += s_syz;
+ g_virial[n1 + 6 * number_of_particles] += s_syx;
+ g_virial[n1 + 7 * number_of_particles] += s_szx;
+ g_virial[n1 + 8 * number_of_particles] += s_szy;
+ }
+}
+
+// SW term
+// two-body part of the SW potential
+static __device__ void find_p2_and_f2(
+ double sigma, double a, double B, double epsilon_times_A, double d12, double& p2, double& f2)
+{
+ double r12 = d12 / sigma;
+ double B_over_r12power4 = B / (r12 * r12 * r12 * r12);
+ double exp_factor = epsilon_times_A * exp(1.0 / (r12 - a));
+ p2 = exp_factor * (B_over_r12power4 - 1.0);
+ f2 = -p2 / ((r12 - a) * (r12 - a)) - exp_factor * 4.0 * B_over_r12power4 / r12;
+ f2 /= (sigma * d12);
+}
+
+// find the partial forces dU_i/dr_ij of SW potential
+static __global__ void gpu_find_force_sw3_partial(
+ const int number_of_particles,
+ const int N1,
+ const int N2,
+ const Box box,
+ const SW2_Para sw3,
+ const int* g_neighbor_number,
+ const int* g_neighbor_list,
+ const int* g_type,
+ const double* __restrict__ g_x,
+ const double* __restrict__ g_y,
+ const double* __restrict__ g_z,
+ double* g_potential,
+ double* g_f12x,
+ double* g_f12y,
+ double* g_f12z)
+{
+ int n1 = blockIdx.x * blockDim.x + threadIdx.x + N1; // particle index
+ if (n1 >= N1 && n1 < N2) {
+ int neighbor_number = g_neighbor_number[n1];
+ int type1 = g_type[n1];
+ double x1 = g_x[n1];
+ double y1 = g_y[n1];
+ double z1 = g_z[n1];
+ double potential_energy = 0.0;
+
+ for (int i1 = 0; i1 < neighbor_number; ++i1) {
+ int index = i1 * number_of_particles + n1;
+ int n2 = g_neighbor_list[index];
+ int type2 = g_type[n2];
+ double x12 = g_x[n2] - x1;
+ double y12 = g_y[n2] - y1;
+ double z12 = g_z[n2] - z1;
+ apply_mic(box, x12, y12, z12);
+ double d12 = sqrt(x12 * x12 + y12 * y12 + z12 * z12);
+ double d12inv = 1.0 / d12;
+ if (d12 >= sw3.rc[type1][type2]) {
+ continue;
+ }
+
+ double gamma12 = sw3.gamma[type1][type2];
+ double sigma12 = sw3.sigma[type1][type2];
+ double a12 = sw3.a[type1][type2];
+ double tmp = gamma12 / (sigma12 * (d12 / sigma12 - a12) * (d12 / sigma12 - a12));
+ double p2, f2;
+ find_p2_and_f2(sigma12, a12, sw3.B[type1][type2], sw3.A[type1][type2], d12, p2, f2);
+
+ // treat the two-body part in the same way as the many-body part
+ double f12x = f2 * x12 * 0.5;
+ double f12y = f2 * y12 * 0.5;
+ double f12z = f2 * z12 * 0.5;
+ // accumulate potential energy
+ potential_energy += p2 * 0.5;
+
+ // three-body part
+ for (int i2 = 0; i2 < neighbor_number; ++i2) {
+ int n3 = g_neighbor_list[n1 + number_of_particles * i2];
+ if (n3 == n2) {
+ continue;
+ }
+ int type3 = g_type[n3];
+ double x13 = g_x[n3] - x1;
+ double y13 = g_y[n3] - y1;
+ double z13 = g_z[n3] - z1;
+ apply_mic(box, x13, y13, z13);
+ double d13 = sqrt(x13 * x13 + y13 * y13 + z13 * z13);
+ if (d13 >= sw3.rc[type1][type3]) {
+ continue;
+ }
+
+ double cos0 = sw3.cos0[type1][type2][type3];
+ double lambda = sw3.lambda[type1][type2][type3];
+ double exp123 = d13 / sw3.sigma[type1][type3] - sw3.a[type1][type3];
+ exp123 = sw3.gamma[type1][type3] / exp123;
+ exp123 = exp(gamma12 / (d12 / sigma12 - a12) + exp123);
+ double one_over_d12d13 = 1.0 / (d12 * d13);
+ double cos123 = (x12 * x13 + y12 * y13 + z12 * z13) * one_over_d12d13;
+ double cos123_over_d12d12 = cos123 * d12inv * d12inv;
+ // cos123 - cos0
+ double delta_cos = cos123 - cos0;
+
+ // modification to (cos123 - cos0)
+ double abs_delta_cos = fabs(delta_cos);
+ if (abs_delta_cos >= DELTA2) {
+ delta_cos = 0.0;
+ } else if (abs_delta_cos < DELTA2 && abs_delta_cos > DELTA1) {
+ double factor = 0.5 + 0.5 * cos(PI * (abs_delta_cos - DELTA1) / (DELTA2 - DELTA1));
+ delta_cos *= factor;
+ }
+
+ // double tmp1 = exp123 * (cos123 - cos0) * lambda;
+ // double tmp2 = tmp * (cos123 - cos0) * d12inv;
+ double tmp1 = exp123 * delta_cos * lambda;
+ double tmp2 = tmp * delta_cos * d12inv;
+
+ // accumulate potential energy
+ // potential_energy += (cos123 - cos0) * tmp1 * 0.5;
+ // double tmp_e = (cos123 - cos0) * tmp1 * 0.5;
+ double tmp_e = delta_cos * tmp1 * 0.5;
+ potential_energy += tmp_e;
+
+ double cos_d = x13 * one_over_d12d13 - x12 * cos123_over_d12d12;
+ f12x += tmp1 * (2.0 * cos_d - tmp2 * x12);
+
+ cos_d = y13 * one_over_d12d13 - y12 * cos123_over_d12d12;
+ f12y += tmp1 * (2.0 * cos_d - tmp2 * y12);
+
+ cos_d = z13 * one_over_d12d13 - z12 * cos123_over_d12d12;
+ f12z += tmp1 * (2.0 * cos_d - tmp2 * z12);
+ }
+ g_f12x[index] = f12x;
+ g_f12y[index] = f12y;
+ g_f12z[index] = f12z;
+ }
+ // save potential
+ g_potential[n1] += potential_energy;
+ }
+}
+
+// define the pure virtual func
+void ILP_TMD_SW::compute(
+ Box &box,
+ const GPU_Vector &type,
+ const GPU_Vector &position_per_atom,
+ GPU_Vector &potential_per_atom,
+ GPU_Vector &force_per_atom,
+ GPU_Vector &virial_per_atom)
+{
+ // nothing
+}
+
+#define USE_FIXED_NEIGHBOR 1
+#define UPDATE_TEMP 10
+#define BIG_ILP_CUTOFF_SQUARE 50.0
+// find force and related quantities
+void ILP_TMD_SW::compute_ilp(
+ Box &box,
+ const GPU_Vector &type,
+ const GPU_Vector &position_per_atom,
+ GPU_Vector &potential_per_atom,
+ GPU_Vector &force_per_atom,
+ GPU_Vector &virial_per_atom,
+ std::vector &group)
+{
+ const int number_of_atoms = type.size();
+ int grid_size = (N2 - N1 - 1) / BLOCK_SIZE_FORCE + 1;
+
+ // assume the first group column is for ILP
+ const int *group_label = group[0].label.data();
+
+#ifdef USE_FIXED_NEIGHBOR
+ static int num_calls = 0;
+#endif
+#ifdef USE_FIXED_NEIGHBOR
+ if (num_calls++ == 0) {
+#endif
+ find_neighbor_ilp(
+ N1,
+ N2,
+ rc,
+ BIG_ILP_CUTOFF_SQUARE,
+ box,
+ group_label,
+ type,
+ position_per_atom,
+ ilp_data.cell_count,
+ ilp_data.cell_count_sum,
+ ilp_data.cell_contents,
+ ilp_data.NN,
+ ilp_data.NL,
+ ilp_data.big_ilp_NN,
+ ilp_data.big_ilp_NL);
+
+ find_neighbor_SW(
+ N1,
+ N2,
+ rc_sw,
+ box,
+ group_label,
+ type,
+ position_per_atom,
+ sw2_data.cell_count,
+ sw2_data.cell_count_sum,
+ sw2_data.cell_contents,
+ sw2_data.NN,
+ sw2_data.NL
+ );
+
+ build_reduce_neighbor_list<<>>(
+ number_of_atoms,
+ N1,
+ N2,
+ ilp_data.NN.data(),
+ ilp_data.NL.data(),
+ ilp_data.reduce_NL.data());
+#ifdef USE_FIXED_NEIGHBOR
+ }
+ num_calls %= UPDATE_TEMP;
+#endif
+
+ const double* x = position_per_atom.data();
+ const double* y = position_per_atom.data() + number_of_atoms;
+ const double* z = position_per_atom.data() + number_of_atoms * 2;
+ const int *NN = ilp_data.NN.data();
+ const int *NL = ilp_data.NL.data();
+ const int* big_ilp_NN = ilp_data.big_ilp_NN.data();
+ const int* big_ilp_NL = ilp_data.big_ilp_NL.data();
+ int *reduce_NL = ilp_data.reduce_NL.data();
+ int *ilp_NL = ilp_data.ilp_NL.data();
+ int *ilp_NN = ilp_data.ilp_NN.data();
+
+ const int* NN_sw = sw2_data.NN.data();
+ const int* NL_sw = sw2_data.NL.data();
+
+ ilp_data.ilp_NL.fill(0);
+ ilp_data.ilp_NN.fill(0);
+
+ // find ILP neighbor list
+ ILP_neighbor<<>>(
+ number_of_atoms, N1, N2, box, big_ilp_NN, big_ilp_NL, \
+ type.data(), ilp_para, x, y, z, ilp_NN, \
+ ilp_NL, group[1].label.data());
+ CUDA_CHECK_KERNEL
+
+ // initialize force of ilp neighbor temporary vector
+ ilp_data.f12x_ilp_neigh.fill(0);
+ ilp_data.f12y_ilp_neigh.fill(0);
+ ilp_data.f12z_ilp_neigh.fill(0);
+ ilp_data.f12x.fill(0);
+ ilp_data.f12y.fill(0);
+ ilp_data.f12z.fill(0);
+
+ double *g_fx = force_per_atom.data();
+ double *g_fy = force_per_atom.data() + number_of_atoms;
+ double *g_fz = force_per_atom.data() + number_of_atoms * 2;
+ double *g_virial = virial_per_atom.data();
+ double *g_potential = potential_per_atom.data();
+ float *g_f12x = ilp_data.f12x.data();
+ float *g_f12y = ilp_data.f12y.data();
+ float *g_f12z = ilp_data.f12z.data();
+ float *g_f12x_ilp_neigh = ilp_data.f12x_ilp_neigh.data();
+ float *g_f12y_ilp_neigh = ilp_data.f12y_ilp_neigh.data();
+ float *g_f12z_ilp_neigh = ilp_data.f12z_ilp_neigh.data();
+
+ gpu_find_force<<>>(
+ ilp_para,
+ number_of_atoms,
+ N1,
+ N2,
+ box,
+ NN,
+ NL,
+ ilp_NN,
+ ilp_NL,
+ group_label,
+ type.data(),
+ x,
+ y,
+ z,
+ g_fx,
+ g_fy,
+ g_fz,
+ g_virial,
+ g_potential,
+ g_f12x,
+ g_f12y,
+ g_f12z,
+ g_f12x_ilp_neigh,
+ g_f12y_ilp_neigh,
+ g_f12z_ilp_neigh);
+ CUDA_CHECK_KERNEL
+
+ reduce_force_many_body<<>>(
+ number_of_atoms,
+ N1,
+ N2,
+ box,
+ NN,
+ NL,
+ reduce_NL,
+ ilp_NN,
+ ilp_NL,
+ x,
+ y,
+ z,
+ g_fx,
+ g_fy,
+ g_fz,
+ g_virial,
+ g_f12x,
+ g_f12y,
+ g_f12z,
+ g_f12x_ilp_neigh,
+ g_f12y_ilp_neigh,
+ g_f12z_ilp_neigh);
+ CUDA_CHECK_KERNEL
+
+ // step 1: calculate the partial forces
+ gpu_find_force_sw3_partial<<>>(
+ number_of_atoms, N1, N2, box, sw2_para, sw2_data.NN.data(), sw2_data.NL.data(),
+ type.data(), position_per_atom.data(), position_per_atom.data() + number_of_atoms,
+ position_per_atom.data() + number_of_atoms * 2, potential_per_atom.data(), sw2_data.f12x.data(),
+ sw2_data.f12y.data(), sw2_data.f12z.data());
+ CUDA_CHECK_KERNEL
+
+ // step 2: calculate force and related quantities
+ find_properties_many_body(
+ box, sw2_data.NN.data(), sw2_data.NL.data(), sw2_data.f12x.data(),
+ sw2_data.f12y.data(), sw2_data.f12z.data(), position_per_atom, force_per_atom, virial_per_atom);
+}
\ No newline at end of file
diff --git a/src/force/ilp_tmd_sw.cuh b/src/force/ilp_tmd_sw.cuh
new file mode 100644
index 000000000..7e43a611b
--- /dev/null
+++ b/src/force/ilp_tmd_sw.cuh
@@ -0,0 +1,116 @@
+/*
+ Copyright 2017 Zheyong Fan and GPUMD development team
+ This file is part of GPUMD.
+ GPUMD is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ GPUMD is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+ You should have received a copy of the GNU General Public License
+ along with GPUMD. If not, see .
+*/
+
+#pragma once
+#include "potential.cuh"
+#include
+#include
+
+// MoS2 MoSe2 WSe2 MoSe2
+#define MAX_TYPE_ILP_TMD_SW 4
+#define CUDA_MAX_NL_TMD 2048
+#define MAX_ILP_NEIGHBOR_TMD 6
+#define MAX_BIG_ILP_NEIGHBOR_TMD 128
+#define MAX_SW_NEIGHBOR_NUM 50
+
+// modification for TMD
+#define DELTA1 0.25
+#define DELTA2 0.35
+
+struct ILP_TMD_Para {
+ float rcutsq_ilp[MAX_TYPE_ILP_TMD_SW][MAX_TYPE_ILP_TMD_SW];
+ float d[MAX_TYPE_ILP_TMD_SW][MAX_TYPE_ILP_TMD_SW];
+ float d_Seff[MAX_TYPE_ILP_TMD_SW][MAX_TYPE_ILP_TMD_SW]; // d / S_R / r_eff
+ float C_6[MAX_TYPE_ILP_TMD_SW][MAX_TYPE_ILP_TMD_SW];
+ float z0[MAX_TYPE_ILP_TMD_SW][MAX_TYPE_ILP_TMD_SW]; // beta
+ float lambda[MAX_TYPE_ILP_TMD_SW][MAX_TYPE_ILP_TMD_SW]; // alpha / beta
+ float epsilon[MAX_TYPE_ILP_TMD_SW][MAX_TYPE_ILP_TMD_SW];
+ float C[MAX_TYPE_ILP_TMD_SW][MAX_TYPE_ILP_TMD_SW];
+ float delta2inv[MAX_TYPE_ILP_TMD_SW][MAX_TYPE_ILP_TMD_SW]; // 1 / delta ^ 2
+ float S[MAX_TYPE_ILP_TMD_SW][MAX_TYPE_ILP_TMD_SW]; // scale
+ float rcut_global[MAX_TYPE_ILP_TMD_SW][MAX_TYPE_ILP_TMD_SW]; // scale
+};
+
+struct ILP_TMD_Data {
+ GPU_Vector NN, NL;
+ GPU_Vector reduce_NL;
+ GPU_Vector big_ilp_NN, big_ilp_NL;
+ GPU_Vector ilp_NN, ilp_NL;
+ GPU_Vector cell_count;
+ GPU_Vector cell_count_sum;
+ GPU_Vector cell_contents;
+ GPU_Vector f12x;
+ GPU_Vector f12y;
+ GPU_Vector f12z;
+ GPU_Vector f12x_ilp_neigh;
+ GPU_Vector f12y_ilp_neigh;
+ GPU_Vector f12z_ilp_neigh;
+};
+
+struct SW2_Para {
+ // 2-body part
+ double A[3][3], B[3][3], a[3][3], sigma[3][3], gamma[3][3], rc[3][3];
+ // 3-body part
+ double lambda[3][3][3], cos0[3][3][3];
+};
+
+struct SW2_Data {
+ GPU_Vector NN, NL;
+ GPU_Vector cell_count;
+ GPU_Vector cell_count_sum;
+ GPU_Vector cell_contents;
+ GPU_Vector f12x;
+ GPU_Vector f12y;
+ GPU_Vector f12z;
+};
+
+class ILP_TMD_SW : public Potential
+{
+public:
+ using Potential::compute;
+ ILP_TMD_SW(FILE*, FILE*, int, int);
+ virtual ~ILP_TMD_SW(void);
+ virtual void compute(
+ Box& box,
+ const GPU_Vector& type,
+ const GPU_Vector& position,
+ GPU_Vector& potential,
+ GPU_Vector& force,
+ GPU_Vector& virial);
+
+ virtual void compute_ilp(
+ Box& box,
+ const GPU_Vector& type,
+ const GPU_Vector& position,
+ GPU_Vector& potential,
+ GPU_Vector& force,
+ GPU_Vector& virial,
+ std::vector &group);
+
+ // sw term
+ void initialize_sw_1985_1(FILE*); // called by the constructor
+ void initialize_sw_1985_2(FILE*); // called by the constructor
+ void initialize_sw_1985_3(FILE*); // called by the constructor
+
+protected:
+ ILP_TMD_Para ilp_para;
+ ILP_TMD_Data ilp_data;
+ SW2_Para sw2_para;
+ SW2_Data sw2_data;
+ // rcut for SW
+ double rc_sw;
+};
+
+static __constant__ float Tap_coeff_tmd[8];
\ No newline at end of file
diff --git a/src/force/neighbor.cu b/src/force/neighbor.cu
index bed99d912..8b234415e 100644
--- a/src/force/neighbor.cu
+++ b/src/force/neighbor.cu
@@ -341,3 +341,292 @@ void find_neighbor(
gpu_sort_neighbor_list<<>>(N, NN.data(), NL.data());
CUDA_CHECK_KERNEL
}
+
+// For ILP, the neighbor could not contain atoms in the same layer
+static __global__ void gpu_find_neighbor_ON1_ilp(
+ const Box box,
+ const int N,
+ const int N1,
+ const int N2,
+ const int* __restrict__ type,
+ const int* __restrict__ cell_counts,
+ const int* __restrict__ cell_count_sum,
+ const int* __restrict__ cell_contents,
+ int* NN,
+ int* NL,
+ int* big_ilp_NN,
+ int* big_ilp_NL,
+ const int* group_label,
+ const double* __restrict__ x,
+ const double* __restrict__ y,
+ const double* __restrict__ z,
+ const int nx,
+ const int ny,
+ const int nz,
+ const double rc_inv,
+ const double cutoff_square,
+ const double big_ilp_cutoff_square)
+{
+ const int n1 = blockIdx.x * blockDim.x + threadIdx.x + N1;
+ int count = 0;
+ int ilp_count = 0;
+ if (n1 < N2) {
+ const double x1 = x[n1];
+ const double y1 = y[n1];
+ const double z1 = z[n1];
+ int cell_id;
+ int cell_id_x;
+ int cell_id_y;
+ int cell_id_z;
+ find_cell_id(box, x1, y1, z1, rc_inv, nx, ny, nz, cell_id_x, cell_id_y, cell_id_z, cell_id);
+
+ const int z_lim = box.pbc_z ? 2 : 0;
+ const int y_lim = box.pbc_y ? 2 : 0;
+ const int x_lim = box.pbc_x ? 2 : 0;
+
+ // get radial descriptors
+ for (int k = -z_lim; k <= z_lim; ++k) {
+ for (int j = -y_lim; j <= y_lim; ++j) {
+ for (int i = -x_lim; i <= x_lim; ++i) {
+ int neighbor_cell = cell_id + k * nx * ny + j * nx + i;
+ if (cell_id_x + i < 0)
+ neighbor_cell += nx;
+ if (cell_id_x + i >= nx)
+ neighbor_cell -= nx;
+ if (cell_id_y + j < 0)
+ neighbor_cell += ny * nx;
+ if (cell_id_y + j >= ny)
+ neighbor_cell -= ny * nx;
+ if (cell_id_z + k < 0)
+ neighbor_cell += nz * ny * nx;
+ if (cell_id_z + k >= nz)
+ neighbor_cell -= nz * ny * nx;
+
+ const int num_atoms_neighbor_cell = cell_counts[neighbor_cell];
+ const int num_atoms_previous_cells = cell_count_sum[neighbor_cell];
+
+ for (int m = 0; m < num_atoms_neighbor_cell; ++m) {
+ const int n2 = cell_contents[num_atoms_previous_cells + m];
+ // neighbors in different layers
+ if (n2 >= N1 && n2 < N2 && n1 != n2) {
+
+ double x12 = x[n2] - x1;
+ double y12 = y[n2] - y1;
+ double z12 = z[n2] - z1;
+ apply_mic(box, x12, y12, z12);
+ const double d2 = x12 * x12 + y12 * y12 + z12 * z12;
+
+ bool different_layer = group_label[n1] != group_label[n2];
+ if (different_layer && d2 < cutoff_square) {
+ NL[count++ * N + n1] = n2;
+ } else if (!different_layer && d2 < big_ilp_cutoff_square) {
+ big_ilp_NL[ilp_count++ * N + n1] = n2;
+ }
+
+ }
+ }
+ }
+ }
+ }
+ NN[n1] = count;
+ big_ilp_NN[n1] = ilp_count;
+ }
+}
+
+void find_neighbor_ilp(
+ const int N1,
+ const int N2,
+ double rc,
+ double big_ilp_cutoff_square,
+ Box& box,
+ const int* group_label,
+ const GPU_Vector& type,
+ const GPU_Vector& position_per_atom,
+ GPU_Vector& cell_count,
+ GPU_Vector& cell_count_sum,
+ GPU_Vector& cell_contents,
+ GPU_Vector& NN,
+ GPU_Vector& NL,
+ GPU_Vector& big_ilp_NN,
+ GPU_Vector& big_ilp_NL)
+{
+ const int N = NN.size();
+ const int block_size = 256;
+ const int grid_size = (N2 - N1 - 1) / block_size + 1;
+ const double* x = position_per_atom.data();
+ const double* y = position_per_atom.data() + N;
+ const double* z = position_per_atom.data() + N * 2;
+ const double rc_cell_list = 0.5 * rc;
+ const double rc_inv_cell_list = 2.0 / rc;
+
+ int num_bins[3];
+ box.get_num_bins(rc_cell_list, num_bins);
+
+ find_cell_list(
+ rc_cell_list, num_bins, box, position_per_atom, cell_count, cell_count_sum, cell_contents);
+
+ gpu_find_neighbor_ON1_ilp<<>>(
+ box,
+ N,
+ N1,
+ N2,
+ type.data(),
+ cell_count.data(),
+ cell_count_sum.data(),
+ cell_contents.data(),
+ NN.data(),
+ NL.data(),
+ big_ilp_NN.data(),
+ big_ilp_NL.data(),
+ group_label,
+ x,
+ y,
+ z,
+ num_bins[0],
+ num_bins[1],
+ num_bins[2],
+ rc_inv_cell_list,
+ rc * rc,
+ big_ilp_cutoff_square);
+ CUDA_CHECK_KERNEL
+
+ const int MN = NL.size() / NN.size();
+ gpu_sort_neighbor_list_ilp<<>>(N, NN.data(), NL.data());
+ CUDA_CHECK_KERNEL
+}
+
+static __global__ void gpu_find_neighbor_ON1_SW(
+ const Box box,
+ const int N,
+ const int N1,
+ const int N2,
+ const int* __restrict__ type,
+ const int* __restrict__ cell_counts,
+ const int* __restrict__ cell_count_sum,
+ const int* __restrict__ cell_contents,
+ int* NN,
+ int* NL,
+ const int* group_label,
+ const double* __restrict__ x,
+ const double* __restrict__ y,
+ const double* __restrict__ z,
+ const int nx,
+ const int ny,
+ const int nz,
+ const double rc_inv,
+ const double cutoff_square)
+{
+ const int n1 = blockIdx.x * blockDim.x + threadIdx.x + N1;
+ int count = 0;
+ if (n1 < N2) {
+ const double x1 = x[n1];
+ const double y1 = y[n1];
+ const double z1 = z[n1];
+ int cell_id;
+ int cell_id_x;
+ int cell_id_y;
+ int cell_id_z;
+ find_cell_id(box, x1, y1, z1, rc_inv, nx, ny, nz, cell_id_x, cell_id_y, cell_id_z, cell_id);
+
+ const int z_lim = box.pbc_z ? 2 : 0;
+ const int y_lim = box.pbc_y ? 2 : 0;
+ const int x_lim = box.pbc_x ? 2 : 0;
+
+ // get radial descriptors
+ for (int k = -z_lim; k <= z_lim; ++k) {
+ for (int j = -y_lim; j <= y_lim; ++j) {
+ for (int i = -x_lim; i <= x_lim; ++i) {
+ int neighbor_cell = cell_id + k * nx * ny + j * nx + i;
+ if (cell_id_x + i < 0)
+ neighbor_cell += nx;
+ if (cell_id_x + i >= nx)
+ neighbor_cell -= nx;
+ if (cell_id_y + j < 0)
+ neighbor_cell += ny * nx;
+ if (cell_id_y + j >= ny)
+ neighbor_cell -= ny * nx;
+ if (cell_id_z + k < 0)
+ neighbor_cell += nz * ny * nx;
+ if (cell_id_z + k >= nz)
+ neighbor_cell -= nz * ny * nx;
+
+ const int num_atoms_neighbor_cell = cell_counts[neighbor_cell];
+ const int num_atoms_previous_cells = cell_count_sum[neighbor_cell];
+
+ for (int m = 0; m < num_atoms_neighbor_cell; ++m) {
+ const int n2 = cell_contents[num_atoms_previous_cells + m];
+ if (n2 >= N1 && n2 < N2 && n1 != n2) {
+
+ double x12 = x[n2] - x1;
+ double y12 = y[n2] - y1;
+ double z12 = z[n2] - z1;
+ apply_mic(box, x12, y12, z12);
+ const double d2 = x12 * x12 + y12 * y12 + z12 * z12;
+
+ if (d2 < cutoff_square && group_label[n1] == group_label[n2]) {
+ NL[count++ * N + n1] = n2;
+ }
+ }
+ }
+ }
+ }
+ }
+ NN[n1] = count;
+ }
+}
+
+void find_neighbor_SW(
+ const int N1,
+ const int N2,
+ double rc,
+ Box& box,
+ const int* group_label,
+ const GPU_Vector& type,
+ const GPU_Vector& position_per_atom,
+ GPU_Vector& cell_count,
+ GPU_Vector& cell_count_sum,
+ GPU_Vector& cell_contents,
+ GPU_Vector& NN,
+ GPU_Vector& NL)
+{
+ const int N = NN.size();
+ const int block_size = 256;
+ const int grid_size = (N2 - N1 - 1) / block_size + 1;
+ const double* x = position_per_atom.data();
+ const double* y = position_per_atom.data() + N;
+ const double* z = position_per_atom.data() + N * 2;
+ const double rc_cell_list = 0.5 * rc;
+ const double rc_inv_cell_list = 2.0 / rc;
+
+ int num_bins[3];
+ box.get_num_bins(rc_cell_list, num_bins);
+
+ find_cell_list(
+ rc_cell_list, num_bins, box, position_per_atom, cell_count, cell_count_sum, cell_contents);
+
+ gpu_find_neighbor_ON1_SW<<>>(
+ box,
+ N,
+ N1,
+ N2,
+ type.data(),
+ cell_count.data(),
+ cell_count_sum.data(),
+ cell_contents.data(),
+ NN.data(),
+ NL.data(),
+ group_label,
+ x,
+ y,
+ z,
+ num_bins[0],
+ num_bins[1],
+ num_bins[2],
+ rc_inv_cell_list,
+ rc * rc);
+ CUDA_CHECK_KERNEL
+
+ const int MN = NL.size() / NN.size();
+ gpu_sort_neighbor_list<<>>(N, NN.data(), NL.data());
+ CUDA_CHECK_KERNEL
+}
\ No newline at end of file
diff --git a/src/force/neighbor.cuh b/src/force/neighbor.cuh
index 068eb5e22..3e7cd1681 100644
--- a/src/force/neighbor.cuh
+++ b/src/force/neighbor.cuh
@@ -40,6 +40,39 @@ void find_neighbor(
GPU_Vector& NN,
GPU_Vector& NL);
+// For ILP
+void find_neighbor_ilp(
+ const int N1,
+ const int N2,
+ double rc,
+ double big_ilp_cutoff_square,
+ Box& box,
+ const int* group_label,
+ const GPU_Vector& type,
+ const GPU_Vector& position_per_atom,
+ GPU_Vector& cell_count,
+ GPU_Vector& cell_count_sum,
+ GPU_Vector& cell_contents,
+ GPU_Vector& NN,
+ GPU_Vector& NL,
+ GPU_Vector& big_ilp_NN,
+ GPU_Vector& big_ilp_NL);
+
+// for SW
+void find_neighbor_SW(
+ const int N1,
+ const int N2,
+ double rc,
+ Box& box,
+ const int* group_label,
+ const GPU_Vector& type,
+ const GPU_Vector& position_per_atom,
+ GPU_Vector& cell_count,
+ GPU_Vector& cell_count_sum,
+ GPU_Vector& cell_contents,
+ GPU_Vector& NN,
+ GPU_Vector& NL);
+
static __device__ void find_cell_id(
const Box& box,
const double x,
@@ -106,3 +139,59 @@ static __global__ void gpu_sort_neighbor_list(const int N, const int* NN, int* N
NL[bid + count * N] = atom_index;
}
}
+
+static __global__ void gpu_sort_neighbor_list_ilp(const int N, const int* NN, int* NL)
+{
+ int bid = blockIdx.x;
+ int tid = threadIdx.x;
+ int neighbor_number = NN[bid];
+ int atom_index;
+ int atom_index_hold[10] = {0};
+ extern __shared__ int atom_index_copy[];
+
+ if (neighbor_number <= 1024) {
+ if (tid < neighbor_number) {
+ atom_index = NL[bid + tid * N];
+ atom_index_copy[tid] = atom_index;
+ }
+ } else {
+ int tid_plus = tid;
+ for (int i = 0; tid_plus < neighbor_number; ++i) {
+ atom_index = NL[bid + tid_plus * N];
+ atom_index_copy[tid_plus] = atom_index;
+ atom_index_hold[i] = atom_index;
+ tid_plus += 1024;
+ }
+ }
+ int count = 0;
+ __syncthreads();
+
+ if (neighbor_number <= 1024) {
+ for (int j = 0; j < neighbor_number; ++j) {
+ if (atom_index > atom_index_copy[j]) {
+ count++;
+ }
+ }
+
+ if (tid < neighbor_number) {
+ NL[bid + count * N] = atom_index;
+ }
+ } else {
+ int tid_plus = tid;
+ for (int i = 0; tid_plus < neighbor_number; ++i)
+ {
+ count = 0;
+ atom_index = atom_index_hold[i];
+ for (int j = 0; j < neighbor_number; ++j) {
+ if (atom_index > atom_index_copy[j]) {
+ ++count;
+ }
+ }
+
+ if (tid_plus < neighbor_number) {
+ NL[bid + count * N] = atom_index;
+ }
+ tid_plus += 1024;
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/force/potential.cuh b/src/force/potential.cuh
index 2d3e81efc..30012865f 100644
--- a/src/force/potential.cuh
+++ b/src/force/potential.cuh
@@ -26,6 +26,7 @@ public:
double rc; // maximum cutoff distance
int nep_model_type =
-1; // -1 for non_nep, 0 for potential, 1 for dipole, 2 for polarizability, 3 for temperature
+ int ilp_flag = 0; // 0 for non_ilp, 1 for ilp
Potential(void);
virtual ~Potential(void);
@@ -46,6 +47,16 @@ public:
GPU_Vector& force,
GPU_Vector& virial){};
+ // add group message for ILP TMD SW
+ virtual void compute_ilp(
+ Box& box,
+ const GPU_Vector& type,
+ const GPU_Vector& position,
+ GPU_Vector& potential,
+ GPU_Vector& force,
+ GPU_Vector& virial,
+ std::vector& group){};
+
protected:
void find_properties_many_body(
Box& box,