From 107fd7aa8a063d52fd13d99525e4f2a3bc997f01 Mon Sep 17 00:00:00 2001 From: atillack Date: Tue, 24 Aug 2021 15:17:13 -0400 Subject: [PATCH 1/5] Cuda optimizations --- cuda/auxiliary_genetic.cu | 79 --------------------------------------- cuda/calcMergeEneGra.cu | 11 +----- cuda/calcenergy.cu | 4 +- cuda/kernel3.cu | 19 +++------- cuda/kernel_ad.cu | 16 ++------ cuda/kernel_adam.cu | 16 ++------ 6 files changed, 15 insertions(+), 130 deletions(-) diff --git a/cuda/auxiliary_genetic.cu b/cuda/auxiliary_genetic.cu index a4138331..be1341b2 100644 --- a/cuda/auxiliary_genetic.cu +++ b/cuda/auxiliary_genetic.cu @@ -77,82 +77,3 @@ inline __device__ void map_angle(float& angle) } } -#if 0 -// ------------------------------------------------------- -// -// ------------------------------------------------------- -void gpu_perform_elitist_selection( - int dockpars_pop_size, - __global float* restrict dockpars_energies_current, - __global float* restrict dockpars_energies_next, - __global int* restrict dockpars_evals_of_new_entities, - int dockpars_num_of_genes, - __global float* restrict dockpars_conformations_next, - __global const float* restrict dockpars_conformations_current, - __local float* best_energies, - __local int* best_IDs, - __local int* best_ID - ) -// The GPU device function performs elitist selection, -// that is, it looks for the best entity in conformations_current and -// energies_current of the run that corresponds to the block ID, -// and copies it to the place of the first entity in -// conformations_next and energies_next. -{ - int entity_counter; - int gene_counter; - float best_energy; - - if (get_local_id(0) < dockpars_pop_size) { - best_energies[get_local_id(0)] = dockpars_energies_current[get_group_id(0)+get_local_id(0)]; - best_IDs[get_local_id(0)] = get_local_id(0); - } - - for (entity_counter = NUM_OF_THREADS_PER_BLOCK+get_local_id(0); - entity_counter < dockpars_pop_size; - entity_counter+= NUM_OF_THREADS_PER_BLOCK) - { - if (dockpars_energies_current[get_group_id(0)+entity_counter] < best_energies[get_local_id(0)]) { - best_energies[get_local_id(0)] = dockpars_energies_current[get_group_id(0)+entity_counter]; - best_IDs[get_local_id(0)] = entity_counter; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // This could be implemented with a tree-like structure - // which may be slightly faster - if (get_local_id(0) == 0) - { - best_energy = best_energies[0]; - best_ID[0] = best_IDs[0]; - - for (entity_counter = 1; - entity_counter < NUM_OF_THREADS_PER_BLOCK; - entity_counter++) - { - if ((best_energies[entity_counter] < best_energy) && (entity_counter < dockpars_pop_size)) { - best_energy = best_energies[entity_counter]; - best_ID[0] = best_IDs[entity_counter]; - } - } - - // Setting energy value of new entity - dockpars_energies_next[get_group_id(0)] = best_energy; - - // Zero (0) evals were performed for entity selected with elitism (since it was copied only) - dockpars_evals_of_new_entities[get_group_id(0)] = 0; - } - - // "best_id" stores the id of the best entity in the population, - // Copying genotype and energy value to the first entity of new population - barrier(CLK_LOCAL_MEM_FENCE); - - for (gene_counter = get_local_id(0); - gene_counter < dockpars_num_of_genes; - gene_counter+= NUM_OF_THREADS_PER_BLOCK) - { - dockpars_conformations_next[GENOTYPE_LENGTH_IN_GLOBMEM*get_group_id(0)+gene_counter] = dockpars_conformations_current[GENOTYPE_LENGTH_IN_GLOBMEM*get_group_id(0) + GENOTYPE_LENGTH_IN_GLOBMEM*best_ID[0]+gene_counter]; - } -} -#endif diff --git a/cuda/calcMergeEneGra.cu b/cuda/calcMergeEneGra.cu index 52fa41f0..99bbc076 100644 --- a/cuda/calcMergeEneGra.cu +++ b/cuda/calcMergeEneGra.cu @@ -119,7 +119,6 @@ __device__ void gpu_calc_energrad( uint32_t g2 = cData.dockpars.gridsize_x_times_y; uint32_t g3 = cData.dockpars.gridsize_x_times_y_times_z; - __threadfence(); __syncthreads(); // ================================================ @@ -183,8 +182,7 @@ __device__ void gpu_calc_energrad( calc_coords[atom_id].z = qt.z + rotation_movingvec.z; } // End if-statement not dummy rotation - __threadfence(); - __syncthreads(); + __syncthreads(); } // End rotation_counter for-loop // ================================================ @@ -409,7 +407,6 @@ __device__ void gpu_calc_energrad( gradient[atom_id].z += lrintf(fminf(MAXTERM, fmaxf(-MAXTERM, TERMSCALE * gz))); #endif } // End atom_id for-loop (INTERMOLECULAR ENERGY) - __threadfence(); __syncthreads(); // Inter- and intra-molecular energy calculation @@ -595,7 +592,6 @@ __device__ void gpu_calc_energrad( ATOMICADDI32(&gradient[atom2_id].z, priv_intra_gradient_z); #endif } // End contributor_counter for-loop (INTRAMOLECULAR ENERGY) - __threadfence(); __syncthreads(); // Transform gradients_inter_{x|y|z} @@ -681,7 +677,6 @@ __device__ void gpu_calc_energrad( printf("gradient_z:%f\n", gradient_genotype [2]); #endif } - __threadfence(); __syncthreads(); // ------------------------------------------ @@ -876,7 +871,6 @@ __device__ void gpu_calc_energrad( printf("%-13.6f %-13.6f %-13.6f\n", gradient_genotype[3], gradient_genotype[4], gradient_genotype[5]); #endif } - __threadfence(); __syncthreads(); // ------------------------------------------ @@ -938,7 +932,6 @@ __device__ void gpu_calc_energrad( // - this works because a * (a_1 + a_2 + ... + a_n) = a*a_1 + a*a_2 + ... + a*a_n ATOMICADDI32(&gradient_genotype[rotbond_id+6], lrintf(fminf(MAXTERM, fmaxf(-MAXTERM, TERMSCALE * torque_on_axis * DEG_TO_RAD)))); /*(M_PI / 180.0f)*/; } - __threadfence(); __syncthreads(); for (uint32_t gene_cnt = threadIdx.x; @@ -946,7 +939,6 @@ __device__ void gpu_calc_energrad( gene_cnt+= blockDim.x) { fgradient_genotype[gene_cnt] = ONEOVERTERMSCALE * (float)gradient_genotype[gene_cnt]; } - __threadfence(); __syncthreads(); #if defined (CONVERT_INTO_ANGSTROM_RADIAN) @@ -956,7 +948,6 @@ __device__ void gpu_calc_energrad( { fgradient_genotype[gene_cnt] *= cData.dockpars.grid_spacing * cData.dockpars.grid_spacing * SCFACTOR_ANGSTROM_RADIAN; } - __threadfence(); __syncthreads(); #endif } diff --git a/cuda/calcenergy.cu b/cuda/calcenergy.cu index 4e18ae11..9a431f68 100644 --- a/cuda/calcenergy.cu +++ b/cuda/calcenergy.cu @@ -163,7 +163,6 @@ __device__ void gpu_calc_energy( uint g2 = cData.dockpars.gridsize_x_times_y; uint g3 = cData.dockpars.gridsize_x_times_y_times_z; - __threadfence(); __syncthreads(); // ================================================ @@ -226,8 +225,7 @@ __device__ void gpu_calc_energy( calc_coords[atom_id].z = qt.z + rotation_movingvec.z; } // End if-statement not dummy rotation - __threadfence(); - __syncthreads(); + __syncthreads(); } // End rotation_counter for-loop diff --git a/cuda/kernel3.cu b/cuda/kernel3.cu index 65f2975e..e450edb6 100644 --- a/cuda/kernel3.cu +++ b/cuda/kernel3.cu @@ -51,11 +51,12 @@ gpu_perform_LS_kernel( __shared__ int iteration_cnt; __shared__ int evaluation_cnt; - __shared__ float offspring_energy; __shared__ float sFloatAccumulator; __shared__ int entity_id; - extern __shared__ float sFloatBuff[]; + + volatile __shared__ float sFloatBuff[3 * MAX_NUM_OF_ATOMS + 4 * ACTUAL_GENOTYPE_LENGTH]; + float candidate_energy; int run_id; // Ligand-atom position and partial energies @@ -90,7 +91,6 @@ gpu_perform_LS_kernel( iteration_cnt = 0; evaluation_cnt = 0; } - __threadfence(); __syncthreads(); size_t offset = (run_id * cData.dockpars.pop_size + entity_id) * GENOTYPE_LENGTH_IN_GLOBMEM; @@ -101,7 +101,6 @@ gpu_perform_LS_kernel( offspring_genotype[gene_counter] = pMem_conformations_next[offset + gene_counter]; genotype_bias[gene_counter] = 0.0f; } - __threadfence(); __syncthreads(); @@ -155,7 +154,6 @@ gpu_perform_LS_kernel( genotype_bias[gene_counter]; } // Evaluating candidate - __threadfence(); __syncthreads(); // ================================================================= @@ -170,7 +168,6 @@ gpu_perform_LS_kernel( if (threadIdx.x == 0) { evaluation_cnt++; } - __threadfence(); __syncthreads(); if (candidate_energy < offspring_energy) // If candidate is better, success @@ -187,7 +184,6 @@ gpu_perform_LS_kernel( // Work-item 0 will overwrite the shared variables // used in the previous if condition - __threadfence(); __syncthreads(); if (threadIdx.x == 0) @@ -210,7 +206,6 @@ gpu_perform_LS_kernel( } // Evaluating candidate - __threadfence(); __syncthreads(); // ================================================================= @@ -230,7 +225,6 @@ gpu_perform_LS_kernel( printf("%-18s [%-5s]---{%-5s} [%-10.8f]---{%-10.8f}\n", "-ENERGY-KERNEL3-", "GRIDS", "INTRA", partial_interE[0], partial_intraE[0]); #endif } - __threadfence(); __syncthreads(); if (candidate_energy < offspring_energy) // If candidate is better, success @@ -247,8 +241,7 @@ gpu_perform_LS_kernel( // Work-item 0 will overwrite the shared variables // used in the previous if condition - __threadfence(); - __syncthreads(); + __syncthreads(); if (threadIdx.x == 0) { @@ -290,7 +283,6 @@ gpu_perform_LS_kernel( cons_fail = 0; } } - __threadfence(); __syncthreads(); } @@ -321,8 +313,7 @@ void gpu_perform_LS( float* pMem_energies_next ) { - size_t sz_shared = (3 * MAX_NUM_OF_ATOMS + 4 * ACTUAL_GENOTYPE_LENGTH) * sizeof(float); - gpu_perform_LS_kernel<<>>(pMem_conformations_next, pMem_energies_next); + gpu_perform_LS_kernel<<>>(pMem_conformations_next, pMem_energies_next); LAUNCHERROR("gpu_perform_LS_kernel"); #if 0 cudaError_t status; diff --git a/cuda/kernel_ad.cu b/cuda/kernel_ad.cu index 631ae566..a963992f 100644 --- a/cuda/kernel_ad.cu +++ b/cuda/kernel_ad.cu @@ -84,7 +84,8 @@ gpu_gradient_minAD_kernel( __shared__ int entity_id; __shared__ float best_energy; __shared__ float sFloatAccumulator; - extern __shared__ float sFloatBuff[]; + + volatile __shared__ float sFloatBuff[6 * MAX_NUM_OF_ATOMS + 5 * ACTUAL_GENOTYPE_LENGTH]; // Ligand-atom position and partial energies float3* calc_coords = (float3*)sFloatBuff; @@ -135,7 +136,6 @@ gpu_gradient_minAD_kernel( printf("%20s %.6f\n", "initial energy: ", energy); #endif } - __threadfence(); __syncthreads(); energy = pMem_energies_next[run_id * cData.dockpars.pop_size + entity_id]; @@ -160,7 +160,6 @@ gpu_gradient_minAD_kernel( // E.g. in steepest descent "delta" is -1.0 * stepsize * gradient // Asynchronous copy should be finished by here - __threadfence(); __syncthreads(); // Initializing vectors @@ -214,7 +213,6 @@ gpu_gradient_minAD_kernel( // ============================================================= // ============================================================= // Calculating energy & gradient - __threadfence(); __syncthreads(); gpu_calc_energrad( @@ -272,7 +270,6 @@ gpu_gradient_minAD_kernel( printf("\n"); #endif } - __threadfence(); __syncthreads(); #endif // DEBUG_ENERGY_ADADELTA @@ -297,7 +294,6 @@ gpu_gradient_minAD_kernel( // Applying update genotype[i] += delta; } - __threadfence(); __syncthreads(); #if defined (DEBUG_SQDELTA_ADADELTA) @@ -310,7 +306,6 @@ gpu_gradient_minAD_kernel( printf("%13u %20.6f %15.6f %15.6f %15.6f\n", i, square_gradient[i], delta[i], square_delta[i], genotype[i]); } } - __threadfence(); __syncthreads(); #endif @@ -356,8 +351,7 @@ gpu_gradient_minAD_kernel( } #endif } - __threadfence(); - __syncthreads(); // making sure that iteration_cnt is up-to-date + __syncthreads(); // making sure that iteration_cnt is up-to-date #ifdef ADADELTA_AUTOSTOP } while ((iteration_cnt < cData.dockpars.max_num_of_iters) && (rho > 0.01f)); #else @@ -375,7 +369,6 @@ gpu_gradient_minAD_kernel( } // Updating old offspring in population - __threadfence(); __syncthreads(); offset = (run_id * cData.dockpars.pop_size + entity_id) * GENOTYPE_LENGTH_IN_GLOBMEM; @@ -407,8 +400,7 @@ void gpu_gradient_minAD( float* pMem_energies_next ) { - size_t sz_shared = (6 * MAX_NUM_OF_ATOMS + 5 * ACTUAL_GENOTYPE_LENGTH) * sizeof(float); - gpu_gradient_minAD_kernel<<>>(pMem_conformations_next, pMem_energies_next); + gpu_gradient_minAD_kernel<<>>(pMem_conformations_next, pMem_energies_next); LAUNCHERROR("gpu_gradient_minAD_kernel"); #if 0 cudaError_t status; diff --git a/cuda/kernel_adam.cu b/cuda/kernel_adam.cu index 11f37b3f..ea6717bc 100644 --- a/cuda/kernel_adam.cu +++ b/cuda/kernel_adam.cu @@ -77,7 +77,8 @@ gpu_gradient_minAdam_kernel( __shared__ int entity_id; __shared__ float best_energy; __shared__ float sFloatAccumulator; - extern __shared__ float sFloatBuff[]; + + volatile __shared__ float sFloatBuff[6 * MAX_NUM_OF_ATOMS + 5 * ACTUAL_GENOTYPE_LENGTH]; // Ligand-atom position and partial energies float3* calc_coords = (float3*)sFloatBuff; @@ -130,7 +131,6 @@ gpu_gradient_minAdam_kernel( #endif } __syncthreads(); - __threadfence(); energy = pMem_energies_next[run_id * cData.dockpars.pop_size + entity_id]; int offset = (run_id * cData.dockpars.pop_size + entity_id) * GENOTYPE_LENGTH_IN_GLOBMEM; @@ -158,7 +158,6 @@ gpu_gradient_minAdam_kernel( // E.g. in steepest descent "delta" is -1.0 * stepsize * gradient // Asynchronous copy should be finished by here - __threadfence(); __syncthreads(); // Enable this for debugging ADADELTA from a defined initial genotype @@ -210,7 +209,6 @@ gpu_gradient_minAdam_kernel( // ============================================================= // ============================================================= // Calculating energy & gradient - __threadfence(); __syncthreads(); gpu_calc_energrad( @@ -262,7 +260,6 @@ gpu_gradient_minAdam_kernel( printf("\n"); #endif } - __threadfence(); __syncthreads(); #endif // DEBUG_ENERGY_ADADELTA @@ -299,7 +296,6 @@ gpu_gradient_minAdam_kernel( // Applying update genotype[i] -= mp / (sqrt(vp) + cData.dockpars.adam_epsilon); } - __threadfence(); __syncthreads(); #if defined (DEBUG_SQDELTA_ADADELTA) @@ -312,7 +308,6 @@ gpu_gradient_minAdam_kernel( printf("%13u %20.6f %15.6f %15.6f %15.6f\n", i, square_gradient[i], delta[i], square_delta[i], genotype[i]); } } - __threadfence(); __syncthreads(); #endif @@ -358,8 +353,7 @@ gpu_gradient_minAdam_kernel( } #endif } - __threadfence(); - __syncthreads(); // making sure that iteration_cnt is up-to-date + __syncthreads(); // making sure that iteration_cnt is up-to-date #ifdef AD_RHO_CRITERION } while ((iteration_cnt < cData.dockpars.max_num_of_iters) && (rho > 0.01f)); #else @@ -378,7 +372,6 @@ gpu_gradient_minAdam_kernel( } // Updating old offspring in population - __threadfence(); __syncthreads(); offset = (run_id * cData.dockpars.pop_size + entity_id) * GENOTYPE_LENGTH_IN_GLOBMEM; @@ -410,8 +403,7 @@ void gpu_gradient_minAdam( float* pMem_energies_next ) { - size_t sz_shared = (6 * MAX_NUM_OF_ATOMS + 5 * ACTUAL_GENOTYPE_LENGTH) * sizeof(float); - gpu_gradient_minAdam_kernel<<>>(pMem_conformations_next, pMem_energies_next); + gpu_gradient_minAdam_kernel<<>>(pMem_conformations_next, pMem_energies_next); LAUNCHERROR("gpu_gradient_minAdam_kernel"); #if 0 cudaError_t status; From 9e52fbfdc45ec44858b0453ef25a3d5fd531c803 Mon Sep 17 00:00:00 2001 From: atillack Date: Tue, 31 Aug 2021 10:39:51 -0400 Subject: [PATCH 2/5] Fixed bug reading in atom names (used in contact analysis). --- host/src/processligand.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/host/src/processligand.cpp b/host/src/processligand.cpp index 9abf5667..c021a6a1 100644 --- a/host/src/processligand.cpp +++ b/host/src/processligand.cpp @@ -96,8 +96,8 @@ int init_liganddata( new_type = 1; // supposing this will be a new atom type sscanf(&line.c_str()[77], "%3s", tempstr); // reading atom type tempstr[3] = '\0'; //just to be sure strcpy wont fail even if something is wrong with position - line[17]='\0'; - sscanf(&line.c_str()[13], "%4s", myligand->atom_names[atom_cnt]); + line[16]='\0'; + sscanf(&line.c_str()[12], "%4s", myligand->atom_names[atom_cnt]); atom_cnt++; // checking if this atom has been already found From 6efdb21480135c3fe23c8eb1f8e0485b899896da Mon Sep 17 00:00:00 2001 From: atillack Date: Tue, 31 Aug 2021 11:16:43 -0400 Subject: [PATCH 3/5] Remove unused ligand struct member. --- host/inc/processligand.h | 1 - 1 file changed, 1 deletion(-) diff --git a/host/inc/processligand.h b/host/inc/processligand.h index c5ebb705..f6b5607c 100644 --- a/host/inc/processligand.h +++ b/host/inc/processligand.h @@ -64,7 +64,6 @@ typedef struct _Liganddata // base_atom_types: Each row (first index) contain an atom base type (for derived types it'll be different from atom_types), // the row index is equal to the atom type code. char base_atom_types [MAX_NUM_OF_ATOMS][4]; - char base_atom_names [MAX_NUM_OF_ATOMS][4]; // atom_map_to_fgrids: Maps each moving atom to a (pre-loaded) map id int atom_map_to_fgrids [MAX_NUM_OF_ATOMS]; // ligand grid types From 2ed8634d212440e288387fa14244c91662083cb2 Mon Sep 17 00:00:00 2001 From: atillack Date: Tue, 31 Aug 2021 14:35:59 -0400 Subject: [PATCH 4/5] Small fix to treat relative paths inside fld file correctly. --- host/src/miscellaneous.cpp | 1 + host/src/processgrid.cpp | 7 +++---- host/src/processresult.cpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/host/src/miscellaneous.cpp b/host/src/miscellaneous.cpp index 7d5acee3..59082636 100644 --- a/host/src/miscellaneous.cpp +++ b/host/src/miscellaneous.cpp @@ -263,6 +263,7 @@ std::string get_filepath(const char* filename) char* ts1 = strdup(filename); std::string result = dirname(ts1); free(ts1); + if(result==".") result=""; return result; #else char drive_tmp[_MAX_DRIVE]; diff --git a/host/src/processgrid.cpp b/host/src/processgrid.cpp index e29fb712..c8d23a3f 100644 --- a/host/src/processgrid.cpp +++ b/host/src/processgrid.cpp @@ -40,7 +40,6 @@ int get_gridinfo( std::string line; char tempstr [256]; int gpoints_even[3]; - int recnamelen; double center[3]; int grid_types=0; @@ -112,9 +111,9 @@ int get_gridinfo( if (strcmp(tempstr, "#MACROMOLECULE") == 0) { sscanf(&line.c_str()[14], "%255s", tempstr); - recnamelen = strcspn(tempstr,"."); - tempstr[recnamelen] = '\0'; - int len = strlen(tempstr)+1; + if(strrchr(tempstr,'.')!=NULL){ + tempstr[strrchr(tempstr,'.')-tempstr] = '\0'; + } mygrid->receptor_name = tempstr; } diff --git a/host/src/processresult.cpp b/host/src/processresult.cpp index 8f5e0385..497e8223 100644 --- a/host/src/processresult.cpp +++ b/host/src/processresult.cpp @@ -319,7 +319,7 @@ void write_basic_info_dlg( fprintf(fp, " ________________________\n\n\n"); fprintf(fp, "DPF> outlev 1\n"); fprintf(fp, "DPF> ga_run %lu\n", mypars->num_of_runs); - fprintf(fp, "DPF> fld %s.maps.fld\n", mygrid->receptor_name.c_str()); + fprintf(fp, "DPF> fld %s\n", mygrid->fld_name.c_str()); fprintf(fp, "DPF> move %s\n", mypars->ligandfile); if(flexres) fprintf(fp, "DPF> flexres %s\n", mypars->flexresfile); fprintf(fp, "\n\n"); From f8fd68fc8c5927b062eeb65c18f7d476bd30a483 Mon Sep 17 00:00:00 2001 From: atillack Date: Tue, 31 Aug 2021 14:48:21 -0400 Subject: [PATCH 5/5] More small fixes for relative path issues. --- host/src/miscellaneous.cpp | 1 - host/src/processgrid.cpp | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/host/src/miscellaneous.cpp b/host/src/miscellaneous.cpp index 59082636..7d5acee3 100644 --- a/host/src/miscellaneous.cpp +++ b/host/src/miscellaneous.cpp @@ -263,7 +263,6 @@ std::string get_filepath(const char* filename) char* ts1 = strdup(filename); std::string result = dirname(ts1); free(ts1); - if(result==".") result=""; return result; #else char drive_tmp[_MAX_DRIVE]; diff --git a/host/src/processgrid.cpp b/host/src/processgrid.cpp index c8d23a3f..9caa7cd1 100644 --- a/host/src/processgrid.cpp +++ b/host/src/processgrid.cpp @@ -47,6 +47,7 @@ int get_gridinfo( // Getting full path fo the grid file // Getting father directory name mygrid->grid_file_path = get_filepath(fldfilename); + if(mygrid->grid_file_path==".") mygrid->grid_file_path=""; // ---------------------------------------------------- // Processing fld file @@ -207,7 +208,9 @@ int get_gridvalues(Gridinfo* mygrid) { ti = t + mygrid->grid_mapping.size()/2; if(mygrid->fld_relative){ // this is always true (unless changed) - fn=mygrid->grid_file_path+"/"+mygrid->grid_mapping[ti]; + fn=mygrid->grid_file_path; + if(mygrid->grid_file_path.size()>0) fn+="/"; + fn+=mygrid->grid_mapping[ti]; // printf("Atom type %d (%s) uses map: %s\n",t,mygrid->grid_mapping[t].c_str(),fn.c_str()); fp.open(fn); }