From 8a752240795c473d345c3c64283b8bedff501e30 Mon Sep 17 00:00:00 2001 From: Chris Walker Date: Tue, 4 Jan 2022 17:34:14 -0500 Subject: [PATCH 1/5] partial native contacts by transition, generalize homopolymer symmetry check --- cg_openmm/parameters/secondary_structure.py | 846 ++++++++++++++++++-- cg_openmm/tests/test_native_contacts.py | 302 +++++++ 2 files changed, 1091 insertions(+), 57 deletions(-) diff --git a/cg_openmm/parameters/secondary_structure.py b/cg_openmm/parameters/secondary_structure.py index 7d78b9e1..2e7c6c87 100644 --- a/cg_openmm/parameters/secondary_structure.py +++ b/cg_openmm/parameters/secondary_structure.py @@ -364,6 +364,207 @@ def expectations_fraction_contacts(fraction_native_contacts, frame_begin=0, samp return_results["dQ"] = dQ_expect return return_results + + +def expectations_partial_contact_fractions(array_folded_states, fraction_native_contacts, + frame_begin=0, sample_spacing=1, output_data="output/output.nc", num_intermediate_states=0, + bootstrap_energies=None, transition_list=None): + """ + For systems with more than 2 conformational states, compute the expectation fraction of contacts + vs temperature for each individual transition by summing the appropriate mbar weights. + + :param array_folded_states: a precomputed array classifying the different conformational states + :type array_folded_states: 2d numpy array (int) + + :param fraction_native_contacts: The fraction of native contacts for all selected frames in the trajectories. + :type fraction_native_contacts: numpy array (float * nframes x nreplicas) + + :param frame_begin: index of first frame defining the range of samples to use as a production period (default=0) + :type frame_begin: int + + :param sample_spacing: spacing of uncorrelated data points, for example determined from pymbar timeseries subsampleCorrelatedData (default=1) + :type sample_spacing: int + + :param output_data: Path to the output data for a NetCDF-formatted file containing replica exchange simulation data (default="output/output.nc") + :type output_data: str + + :param num_intermediate_states: The number of states to insert between existing simulated temperature states (default=0) + :type num_intermediate_states: int + + :param bootstrap_energies: a custom replica_energies array to be used for bootstrapping calculations. Used instead of the energies in the .nc file. (default=None) + :type bootstrap_energies: 2d numpy array (float) + + :param transition_list: a list containing the allowed transitions to a folded state, such as '0_1' or '1_2'. 
+ :type transition_list: list ( str ) + + :returns: + - Q_expect_confs ( dict ) - dictionary of the form {'m_n': 1D numpy array} containing partial contact fractions for each transition between states m and n + - T_list_out ( np.array ( float * unit.simtk.temperature ) ) Temperature list corresponding to the partial contact fraction values + """ + + if bootstrap_energies is not None: + # Use a subsampled replica_energy matrix instead of reading from file + replica_energies = bootstrap_energies + # Still need to get the thermodynamic states + reporter = MultiStateReporter(output_data, open_mode="r") + else: + # extract reduced energies and the state indices from the .nc + reporter = MultiStateReporter(output_data, open_mode="r") + analyzer = ReplicaExchangeAnalyzer(reporter) + ( + replica_energies_all, + unsampled_state_energies, + neighborhoods, + replica_state_indices, + ) = analyzer.read_energies() + + # Select production frames to analyze + replica_energies = replica_energies_all[:,:,frame_begin::sample_spacing] + + # Also apply frame selection to array_folded_states (starts at frame_begin): + array_folded_states = array_folded_states[::sample_spacing,:] + + # Check the size of the fraction_native_contacts array: + if np.shape(replica_energies)[2] != np.shape(fraction_native_contacts)[0]: + # Mismatch in number of frames. + if np.shape(replica_energies_all[:,:,frame_begin::sample_spacing])[2] == np.shape(fraction_native_contacts[::sample_spacing,:])[0]: + # Correct starting frame, need to apply sampling stride: + fraction_native_contacts = fraction_native_contacts[::sample_spacing,:] + elif np.shape(replica_energies_all)[2] == np.shape(fraction_native_contacts)[0]: + # This is the full fraction_native_contacts, slice production frames: + fraction_native_contacts = fraction_native_contacts[production_start::sample_spacing,:] + + # Get the temperature list from .nc file: + states = reporter.read_thermodynamic_states()[0] + + temperature_list = [] + for s in states: + temperature_list.append(s.temperature) + + # Close the data file: + reporter.close() + + # determine the numerical values of beta at each state in units consistent with the temperature + Tunit = temperature_list[0].unit + temps = np.array([temp.value_in_unit(Tunit) for temp in temperature_list]) # should this just be array to begin with + beta_k = 1 / (kB.value_in_unit(unit.kilojoule_per_mole/Tunit) * temps) + + # convert the energies from replica/evaluated state/sample form to evaluated state/sample form + replica_energies = pymbar.utils.kln_to_kn(replica_energies) + n_samples = len(replica_energies[0,:]) + # n_samples in [nreplica x nsamples_per_replica] + + # calculate the number of states we need expectations at. We want it at all of the original + # temperatures, each intermediate temperature, and then temperatures +/- from the original + # to take finite derivatives. + + # create an array for the temperature and energy for each state, including the + # finite different state. + n_sampled_T = len(temps) + n_unsampled_states = (n_sampled_T + (n_sampled_T-1)*num_intermediate_states) + unsampled_state_energies = np.zeros([n_unsampled_states,n_samples]) + full_T_list = np.zeros(n_unsampled_states) + + # delta is the spacing between temperatures. + delta = np.zeros(n_sampled_T-1) + + # fill in a list of temperatures at all original temperatures and all intermediate states. 
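[Editorial note] As a standalone sketch of the temperature interpolation described in the comment above (the helper name and signature are illustrative only, not part of cg_openmm): num_intermediate_states evenly spaced points are inserted between each pair of sampled temperatures.

    import numpy as np

    def build_full_temperature_list(temps, num_intermediate_states):
        # temps: 1D array of sampled temperatures (consistent units);
        # insert num_intermediate_states evenly spaced points between each neighboring pair
        full_T_list = []
        for i in range(len(temps) - 1):
            delta = (temps[i + 1] - temps[i]) / (num_intermediate_states + 1)
            for j in range(num_intermediate_states + 1):
                full_T_list.append(temps[i] + delta * j)
        full_T_list.append(temps[-1])
        return np.array(full_T_list)

    # Example: build_full_temperature_list(np.array([200.0, 300.0, 400.0]), 1)
    #          -> array([200., 250., 300., 350., 400.])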
+ full_T_list[0] = temps[0] + t = 0 + for i in range(n_sampled_T-1): + delta[i] = (temps[i+1] - temps[i])/(num_intermediate_states+1) + for j in range(num_intermediate_states+1): + full_T_list[t] = temps[i] + delta[i]*j + t += 1 + full_T_list[t] = temps[-1] + n_T_vals = t+1 + + # calculate betas of all of these temperatures + beta_full_k = 1 / (kB.value_in_unit(unit.kilojoule_per_mole/Tunit) * full_T_list) + + ti = 0 + N_k = np.zeros(n_unsampled_states) + for k in range(n_unsampled_states): + # Calculate the reduced energies at all temperatures, sampled and unsample. + unsampled_state_energies[k, :] = replica_energies[0,:]*(beta_full_k[k]/beta_k[0]) + if ti < len(temps): + # store in N_k which states do and don't have samples. + if full_T_list[k] == temps[ti]: + ti += 1 + N_k[k] = n_samples//len(temps) # these are the states that have samples + + # call MBAR to find weights at all states, sampled and unsampled + mbarT = pymbar.MBAR(unsampled_state_energies,N_k,verbose=False, relative_tolerance=1e-12); + + # Get the weights from the mbar object: + # These weights include all of the intermediate temperatures + w_nk = mbarT.W_nk + + # Now we have the weights at all temperatures, so we can + # calculate the expectations. + + # Reshape fraction native contacts [nframes x nreplicas] column by column for pymbar + Q = np.reshape(fraction_native_contacts,np.size(fraction_native_contacts), order='F') + + # Repeat for corresponding array_folded_state + array_folded_states = np.reshape(array_folded_states,(np.size(array_folded_states)),order='F') + + # Here we need to classify each of the sample mbar weights + num_confs = {} # w_i*U_i, for each state. + den_confs = {} # w_i + Q_expect_confs = {} # Contact fraction expectations + + n_conf_states = len(np.unique(array_folded_states)) + + if n_conf_states == 1: + print(f'Only 1 configurational state found. 
Exiting...') + exit() + + print(f'n_conf_states: {n_conf_states}') + + if transition_list is None: + # Compute all transitions: + transition_list = [] + for m in range(n_conf_states): + for n in range(m+1,n_conf_states): + transition_list.append(f'{m}_{n}') + + elif type(transition_list) == str: + # Convert to list if a single string: + if type(transition_list) == str: + transition_list = transition_list.split() + + print(f'transitions: {transition_list}') + + for m in range(n_conf_states): + for n in range(m+1,n_conf_states): + if f'{m}_{n}' in transition_list: + num_confs[f'{m}_{n}'] = 0 + den_confs[f'{m}_{n}'] = 0 + + # Loop over the conformational transitions: + for frame in range(len(array_folded_states)): + # Sum the weighted energies for partial contact fraction eval: + # These expectations are for Q in the subset of states (m,n) + for m in range(n_conf_states): + for n in range(m+1,n_conf_states): + if f'{m}_{n}' in transition_list: + if array_folded_states[frame] == m or array_folded_states[frame] == n: + num_confs[f'{m}_{n}'] += w_nk[frame,:]*Q[frame] + den_confs[f'{m}_{n}'] += w_nk[frame,:] + break + + # Now compute the final expectation values: + for m in range(n_conf_states): + for n in range(m+1,n_conf_states): + if f'{m}_{n}' in transition_list: + Q_expect_confs[f'{m}_{n}'] = num_confs[f'{m}_{n}']/den_confs[f'{m}_{n}'] + + # ***TODO: implement uncertainty directly from the mbar algorithm + + T_list_out = full_T_list*unit.kelvin + + return Q_expect_confs, T_list_out def fraction_native_contacts( @@ -375,7 +576,7 @@ def fraction_native_contacts( native_contact_tol=1.3, subsample=True, homopolymer_sym=False, -): + ): """ Given a cgmodel, mdtraj trajectory object, and positions for the native structure, this function calculates the fraction of native contacts for the model. @@ -428,13 +629,6 @@ def fraction_native_contacts( Q_avg = np.zeros((n_replicas)) Q_stderr = np.zeros((n_replicas)) - # For homopolymer symmetry, check that the chains are linear, - # which is the only option implemented for symmetry checks so far. - if homopolymer_sym: - if len(cgmodel.particle_type_list) > 1: - print(f'Error: For homopolymer symmetry checks, only linear chains with one bead type are supported') - exit() - for rep in range(n_replicas): # This should work for pdb or dcd # However for dcd we need to insert a topology, and convert it from openmm->mdtraj topology @@ -454,16 +648,57 @@ def fraction_native_contacts( # This produces a [nframe x len(native_contacts)] array if homopolymer_sym: - # We can simply reverse the indices of the native contact list: - # ***TODO: allow the reversed indices to be specified explicitly. - # (such as if there are sidechains, or the order in the pdb file is different) - + + # ***Note: this assumes that the particles are indexed by monomer, and in the same + # order for each monomer. + n_particles = rep_traj.n_atoms - reverse_contact_list = [] - for pair in native_contact_list: - reverse_par0 = n_particles-pair[0]-1 - reverse_par1 = n_particles-pair[1]-1 - reverse_contact_list.append([reverse_par0, reverse_par1]) + reverse_contact_list = [] + + if len(cgmodel.particle_type_list) == 1: + # We can simply reverse the indices of the native contact list: + + for pair in native_contact_list: + reverse_par0 = n_particles-pair[0]-1 + reverse_par1 = n_particles-pair[1]-1 + reverse_contact_list.append([reverse_par0, reverse_par1]) + + else: + # We need to use the particle type list of the original model, + # and reconstruct monomer by monomer from the opposite end. 
+ + # This should also work for 1 particle per monomer. + + particle_type_list = [] + mono_indices_list = [] + particle_indices_forward = [] + particle_indices_reverse = [] + + for p in range(n_particles): + particle_indices_forward.append(p) + particle_type_list.append(cgmodel.get_particle_type_name(p)) + mono_indices_list.append(cgmodel.get_particle_monomer(p)) + + n_particles_per_mono = 0 + + for p in range(n_particles): + if cgmodel.get_particle_monomer(p) == 0: + n_particles_per_mono += 1 + else: + break + + n_mono = int(n_particles/n_particles_per_mono) + + for m in range(n_mono): + for p in range(n_particles_per_mono): + particle_indices_reverse = n_particles - n_particles_per_mono*(m+1) - p + # For example, if forward indices [0,1] is [bb,sc] with 84 total particles, then [82,83] is the reverse + + # Now select the contact pairs: + for pair in native_contact_list: + reverse_par0 = particle_indices_reverse[pair[0]] + reverse_par1 = particle_indices_reverse[pair[1]] + reverse_contact_list.append([reverse_par0, reverse_par1]) reverse_distances = md.compute_distances( rep_traj,reverse_contact_list,periodic=False,opt=True) @@ -588,16 +823,57 @@ def fraction_native_contacts_preloaded( # This produces a [nframe x len(native_contacts)] array if homopolymer_sym: - # We can simply reverse the indices of the native contact list: - # ***TODO: allow the reversed indices to be specified explicitly. - # (such as if there are sidechains, or the order in the pdb file is different) - - n_particles = traj_dict[rep].n_atoms - reverse_contact_list = [] - for pair in native_contact_list: - reverse_par0 = n_particles-pair[0]-1 - reverse_par1 = n_particles-pair[1]-1 - reverse_contact_list.append([reverse_par0, reverse_par1]) + + # ***Note: this assumes that the particles are indexed by monomer, and in the same + # order for each monomer. + + n_particles = rep_traj.n_atoms + reverse_contact_list = [] + + if len(cgmodel.particle_type_list) == 1: + # We can simply reverse the indices of the native contact list: + + for pair in native_contact_list: + reverse_par0 = n_particles-pair[0]-1 + reverse_par1 = n_particles-pair[1]-1 + reverse_contact_list.append([reverse_par0, reverse_par1]) + + else: + # We need to use the particle type list of the original model, + # and reconstruct monomer by monomer from the opposite end. + + # This should also work for 1 particle per monomer. 
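[Editorial note] A minimal standalone sketch of the monomer-by-monomer reversal described in the comments above (the helper name is illustrative; it mirrors the index arithmetic used in this patch): monomer m maps to monomer n_mono-1-m while the intra-monomer particle order is preserved, so forward index m*n_particles_per_mono + p maps to n_particles - n_particles_per_mono*(m+1) + p.

    def reversed_particle_indices(n_particles, n_particles_per_mono):
        # Reverse a linear chain monomer by monomer, keeping the particle order within each monomer
        reverse = []
        n_mono = n_particles // n_particles_per_mono
        for m in range(n_mono):
            for p in range(n_particles_per_mono):
                reverse.append(n_particles - n_particles_per_mono * (m + 1) + p)
        return reverse

    # Example: 84 particles with 2 per monomer ([bb, sc]); the forward pair [0, 1] maps to [82, 83]:
    # reversed_particle_indices(84, 2)[0:2]  ->  [82, 83]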
+ + particle_type_list = [] + mono_indices_list = [] + particle_indices_forward = [] + particle_indices_reverse = [] + + for p in range(n_particles): + particle_indices_forward.append(p) + particle_type_list.append(cgmodel.get_particle_type_name(p)) + mono_indices_list.append(cgmodel.get_particle_monomer(p)) + + n_particles_per_mono = 0 + + for p in range(n_particles): + if cgmodel.get_particle_monomer(p) == 0: + n_particles_per_mono += 1 + else: + break + + n_mono = int(n_particles/n_particles_per_mono) + + for m in range(n_mono): + for p in range(n_particles_per_mono): + particle_indices_reverse = n_particles - n_particles_per_mono*(m+1) - p + # For example, if forward indices [0,1] is [bb,sc] with 84 total particles, then [82,83] is the reverse + + # Now select the contact pairs: + for pair in native_contact_list: + reverse_par0 = particle_indices_reverse[pair[0]] + reverse_par1 = particle_indices_reverse[pair[1]] + reverse_contact_list.append([reverse_par0, reverse_par1]) reverse_distances = md.compute_distances( traj_dict[rep][frame_begin:],reverse_contact_list,periodic=False,opt=True) @@ -723,13 +999,6 @@ def optimize_Q_cut( # Convert to a 1 element list if not one traj_file_list = traj_file_list.split() n_replicas = 1 - - # For homopolymer symmetry, check that the chains are linear, - # which is the only option implemented for symmetry checks so far. - if homopolymer_sym: - if len(cgmodel.particle_type_list) > 1: - print(f'Error: For homopolymer symmetry checks, only linear chains with one bead type are supported') - exit() for rep in range(n_replicas): if traj_file_list[rep][-3:] == 'dcd': @@ -966,14 +1235,7 @@ def optimize_Q_cut_1d( # Convert to a 1 element list if not one traj_file_list = traj_file_list.split() n_replicas = 1 - - # For homopolymer symmetry, check that the chains are linear, - # which is the only option implemented for symmetry checks so far. - if homopolymer_sym: - if len(cgmodel.particle_type_list) > 1: - print(f'Error: For homopolymer symmetry checks, only linear chains with one bead type are supported') - exit() - + for rep in range(n_replicas): if traj_file_list[rep][-3:] == 'dcd': traj_dict[rep] = md.load(traj_file_list[rep],top=md.Topology.from_openmm(cgmodel.topology)) @@ -1194,13 +1456,6 @@ def bootstrap_native_contacts_expectation( # Convert to a 1 element list if not one traj_file_list = traj_file_list.split() n_replicas = 1 - - # For homopolymer symmetry, check that the chains are linear, - # which is the only option implemented for symmetry checks so far. 
- if homopolymer_sym: - if len(cgmodel.particle_type_list) > 1: - print(f'Error: For homopolymer symmetry checks, only linear chains with one bead type are supported') - exit() for rep in range(n_replicas): if traj_file_list[rep][-3:] == 'dcd': @@ -1419,6 +1674,366 @@ def bootstrap_native_contacts_expectation( return temp_list, Q_values, Q_uncertainty, sigmoid_results_boot +def bootstrap_partial_contacts_expectation( + cgmodel, + traj_file_list, + native_contact_list, + native_contact_distances, + array_folded_states, + output_data='output/output.nc', + frame_begin=0, + sample_spacing=1, + native_contact_tol=1.3, + num_intermediate_states=0, + n_trial_boot=200, + conf_percent='sigma', + plotfile='Q_vs_T_bootstrap.pdf', + homopolymer_sym=False, + transition_list=None, + ): + """ + Given a cgmodel, native contact definitions, and trajectory file list, this function calculates the + fraction of native contacts for all specified frames, and uses a bootstrapping scheme to compute + the uncertainties in the Q vs T folding curve. Intended to be used after the native contact tolerance + has been optimized (either the helical or generalized versions). + + Here the partial contact fractions are computed, using only the structures classified within a pair of conformations. + + :param cgmodel: CGModel() class object + :type cgmodel: class + + :param traj_file_list: A list of replica PDB or DCD trajectory files corresponding to the energies in the .nc file, or a single file name + :type traj_file_list: List( str ) or str + + :param native_contact_list: A list of the nonbonded interactions whose inter-particle distances are less than the 'native_contact_distance_cutoff'. + :type native_contact_list: List + + :param native_contact_distances: A numpy array of the native pairwise distances corresponding to native_contact_list + :type native_contact_distances: Quantity + + :param array_folded_states: a precomputed array classifying the different conformational states + :type array_folded_states: 2d numpy array (int) + + :param frame_begin: Frame at which to start native contacts analysis (default=0) + :type frame_begin: int + + :param sample_spacing: spacing of uncorrelated data points, for example determined from pymbar timeseries subsampleCorrelatedData (default=1) + :type sample_spacing: int + + :param native_contact_tol: Tolerance factor beyond the native distance for determining whether a pair of particles is 'native' (in multiples of native distance) (default=1.3) + :type native_contact_tol: float + + :param num_intermediate_states: The number of states to insert between existing simulated temperature states (default=0) + :type num_intermediate_states: int + + :param n_trial_boot: number of trials to run for generating bootstrapping uncertainties (default=200) + :type n_trial_boot: int + + :param conf_percent: Confidence level in percent for outputting uncertainties (default='sigma'=68.27) + :type conf_percent: float + + :param plotfile: Path to output file for plotting results (default='Q_vs_T_bootstrap.pdf') + :type plotfile: str + + :param homopolymer_sym: if there is end-to-end symmetry, scan forwards and backwards sequences for highest Q (default=False) + :type homopolymer_sym: Boolean + + :returns: + - temp_list ( List( float * unit.simtk.temperature ) ) - The temperature list corresponding to the native contact fraction values + - Q_values ( List( float ) ) - The native contact fraction values for all (including inserted intermediates) states + - Q_uncertainty ( Tuple ( np.array(float) ) - 
confidence interval for all Q_values computed from bootstrapping + - sigmoid_results_boot ( dict ) - dictionary containing the 4 sigmoid parameters and Q_folded (and their confidence interval tuples) + """ + + # Pre-load the replica trajectories into MDTraj objects, to avoid having to load them + # at each iteration (very costly for pdb in particular) + + traj_dict = {} + + if type(traj_file_list) == list: + n_replicas = len(traj_file_list) + elif type(traj_file_list) == str: + # Convert to a 1 element list if not one + traj_file_list = traj_file_list.split() + n_replicas = 1 + + for rep in range(n_replicas): + if traj_file_list[rep][-3:] == 'dcd': + traj_dict[rep] = md.load(traj_file_list[rep],top=md.Topology.from_openmm(cgmodel.topology)) + else: + traj_dict[rep] = md.load(traj_file_list[rep]) + + # Extract reduced energies and the state indices from the .nc + reporter = MultiStateReporter(output_data, open_mode="r") + analyzer = ReplicaExchangeAnalyzer(reporter) + ( + replica_energies_all, + unsampled_state_energies, + neighborhoods, + replica_state_indices, + ) = analyzer.read_energies() + + reporter.close() + + # Get native contact fraction of all frames (bootstrapping draws uncorrelated samples from this full dataset) + # To avoid loading in files each iteration, use alternate version of fraction_native_contacts code + Q_all, Q_avg, Q_stderr, decorrelation_time = fraction_native_contacts_preloaded( + cgmodel, + traj_dict, + native_contact_list, + native_contact_distances, + frame_begin=frame_begin, + native_contact_tol=native_contact_tol, + subsample=False, + homopolymer_sym=homopolymer_sym, + ) + + # Check the size of array_folded_states: + if np.shape(replica_energies_all[:,:,frame_begin::])[2] != np.shape(array_folded_states)[0]: + print(f'Error: mismatch in number of samples in array_folded_states and specified frames of replica energies') + exit() + + # Get the number of conformational state classifications: + n_conf_states = len(np.unique(array_folded_states)) + + if transition_list is None: + # Compute all transitions: + transition_list = [] + for m in range(n_conf_states): + for n in range(m+1,n_conf_states): + transition_list.append(f'{m}_{n}') + + elif type(transition_list) == str: + # Convert to list if a single string: + if type(transition_list) == str: + transition_list = transition_list.split() + + # For each bootstrap trial, compute the expectation of native contacts and fit to sigmoid. + Q_expect_boot = {} + + sigmoid_Q_max = {} + sigmoid_Q_min = {} + sigmoid_d = {} + sigmoid_Tm = {} + Q_folded = {} + + for trans in transition_list: + sigmoid_Q_max[trans] = np.zeros(n_trial_boot) + sigmoid_Q_min[trans] = np.zeros(n_trial_boot) + sigmoid_d[trans] = np.zeros(n_trial_boot) + sigmoid_Tm[trans] = np.zeros(n_trial_boot) + Q_folded[trans] = np.zeros(n_trial_boot) + + for i_boot in range(n_trial_boot): + # Select production frames to analyze + # Here we can potentially change the reference frame for each bootstrap trial. 
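[Editorial note] A minimal sketch, assuming a hypothetical standalone helper, of the per-trial frame selection performed below: a random reference shift picks one of the sample_spacing possible decorrelated subsamples of the production frames, and indices are then drawn from that subsample with replacement; the same resampled indices are applied to the replica energies, Q, and array_folded_states arrays.

    import numpy as np
    from sklearn.utils import resample

    def bootstrap_trial_indices(n_production_frames, sample_spacing):
        # Random reference shift selects one of the sample_spacing possible decorrelated subsamples
        ref_shift = np.random.randint(sample_spacing)
        decorrelated = np.arange(ref_shift, n_production_frames, sample_spacing)
        # Draw the same number of frames, with replacement, from the decorrelated subsample
        return resample(decorrelated, replace=True, n_samples=len(decorrelated))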
+ ref_shift = np.random.randint(sample_spacing) + # ***We should check if these energies arrays will be the same size for + # different reference frames + replica_energies = replica_energies_all[:,:,(frame_begin+ref_shift)::sample_spacing] + array_folded_states_boot = array_folded_states[ref_shift::sample_spacing,:] + # ***Unlike replica energies, Q does not include the equilibration frames + Q = Q_all[ref_shift::sample_spacing,:] + + # Get all possible sample indices + sample_indices_all = np.arange(0,len(replica_energies[0,0,:])) + # n_samples should match the size of the sliced replica energy dataset + sample_indices = resample(sample_indices_all, replace=True, n_samples=len(sample_indices_all)) + + n_state = replica_energies.shape[0] + + replica_energies_resample = np.zeros_like(replica_energies) + # replica_energies is [n_states x n_states x n_frame] + # Q and array_folded_states are [nframes x n_replicas] + Q_resample = np.zeros((len(sample_indices),n_replicas)) + array_folded_states_resample = np.zeros_like(array_folded_states_boot) + + # Select the sampled frames from array_folded_states and replica_energies: + j = 0 + for i in sample_indices: + replica_energies_resample[:,:,j] = replica_energies[:,:,i] + array_folded_states_resample[j,:] = array_folded_states_boot[i,:] + Q_resample[j,:] = Q[i,:] + j += 1 + + # Run the native contacts expectation calculation: + # Q_expect_boot is nested dict with keys [i_boot]['m_n'] + Q_expect_boot[i_boot], T_list_out = expectations_partial_contact_fractions( + array_folded_states_resample, + Q_resample, + frame_begin=frame_begin, + num_intermediate_states=num_intermediate_states, + bootstrap_energies=replica_energies_resample, + output_data=output_data, + transition_list=transition_list, + ) + + # Fit contact fraction vs T for each transition to sigmoid: + for trans in transition_list: + param_opt, param_cov = fit_sigmoid( + T_list_out, + Q_expect_boot[i_boot][trans], + plotfile=None + ) + + # Save the individual parameters: + # These have the reversed order of keys: + if param_opt[1] >= param_opt[2]: + sigmoid_Q_max[trans][i_boot] = param_opt[1] + sigmoid_Q_min[trans][i_boot] = param_opt[2] + else: + # This shouldn't occur unless d is negative + print(f'Error with sigmoid fitting') + sigmoid_Q_max[trans][i_boot] = param_opt[2] + sigmoid_Q_min[trans][i_boot] = param_opt[1] + + sigmoid_d[trans][i_boot] = param_opt[3] + sigmoid_Tm[trans][i_boot] = param_opt[0] + Q_folded[trans][i_boot] = (param_opt[1]+param_opt[2])/2 + + # Compute uncertainty at all temps in Q_expect_boot over the n_trial_boot trials performed: + + # Total number of temps including intermediate states: + n_temps = len(T_list_out) + + # Convert bootstrap trial dicts to array + arr_Q_values_boot = {} + + for trans in transition_list: + arr_Q_values_boot[trans] = np.zeros((n_trial_boot, n_temps)) + + for i_boot in range(n_trial_boot): + arr_Q_values_boot[trans][i_boot,:] = Q_expect_boot[i_boot][trans] + + # Compute mean values: + + Q_values = {} + sigmoid_Q_max_value = {} + sigmoid_Q_min_value = {} + sigmoid_d_value = {} + sigmoid_Tm_value = {} + Q_folded_value = {} + + for trans in transition_list: + Q_values[trans] = np.mean(arr_Q_values_boot[trans],axis=0) + sigmoid_Q_max_value[trans] = np.mean(sigmoid_Q_max[trans]) + sigmoid_Q_min_value[trans] = np.mean(sigmoid_Q_min[trans]) + sigmoid_d_value[trans] = np.mean(sigmoid_d[trans])*unit.kelvin + sigmoid_Tm_value[trans] = np.mean(sigmoid_Tm[trans])*unit.kelvin + Q_folded_value[trans] = np.mean(Q_folded[trans]) + + # Compute confidence 
intervals: + Q_uncertainty = {} + sigmoid_Q_max_uncertainty = {} + sigmoid_Q_min_uncertainty = {} + sigmoid_d_uncertainty = {} + sigmoid_Tm_uncertainty = {} + Q_folded_uncertainty = {} + + if conf_percent == 'sigma': + # Use analytical standard deviation instead of percentile method: + + for trans in transition_list: + # Q values: + Q_std = np.std(arr_Q_values_boot[trans],axis=0) + Q_uncertainty[trans] = (-Q_std, Q_std) + + # Sigmoid Q_max: + sigmoid_Q_max_std = np.std(sigmoid_Q_max[trans]) + sigmoid_Q_max_uncertainty[trans] = (-sigmoid_Q_max_std, sigmoid_Q_max_std) + + # Sigmoid Q_min: + sigmoid_Q_min_std = np.std(sigmoid_Q_min[trans]) + sigmoid_Q_min_uncertainty[trans] = (-sigmoid_Q_min_std, sigmoid_Q_min_std) + + # Sigmoid d: + sigmoid_d_std = np.std(sigmoid_d[trans]) + sigmoid_d_uncertainty[trans] = (-sigmoid_d_std*unit.kelvin, sigmoid_d_std*unit.kelvin) + + # Sigmoid Tm: + sigmoid_Tm_std = np.std(sigmoid_Tm[trans]) + sigmoid_Tm_uncertainty[trans] = (-sigmoid_Tm_std*unit.kelvin, sigmoid_Tm_std*unit.kelvin) + + # Q_folded: + Q_folded_std = np.std(Q_folded[trans]) + Q_folded_uncertainty[trans] = (-Q_folded_std, Q_folded_std) + + else: + # Compute specified confidence interval: + p_lo = (100-conf_percent)/2 + p_hi = 100-p_lo + + for trans in transition_list: + # Q values: + Q_diff = arr_Q_values_boot[trans]-np.mean(arr_Q_values_boot[trans],axis=0) + Q_conf_lo = np.percentile(Q_diff,p_lo,axis=0,interpolation='linear') + Q_conf_hi = np.percentile(Q_diff,p_hi,axis=0,interpolation='linear') + + Q_uncertainty[trans] = (Q_conf_lo, Q_conf_hi) + + # Sigmoid Q_max: + sigmoid_Q_max_diff = sigmoid_Q_max[trans]-np.mean(sigmoid_Q_max[trans]) + sigmoid_Q_max_conf_lo = np.percentile(sigmoid_Q_max_diff,p_lo,interpolation='linear') + sigmoid_Q_max_conf_hi = np.percentile(sigmoid_Q_max_diff,p_hi,interpolation='linear') + + sigmoid_Q_max_uncertainty[trans] = (sigmoid_Q_max_conf_lo, sigmoid_Q_max_conf_hi) + + # Sigmoid Q_min: + sigmoid_Q_min_diff = sigmoid_Q_min[trans]-np.mean(sigmoid_Q_min[trans]) + sigmoid_Q_min_conf_lo = np.percentile(sigmoid_Q_min_diff,p_lo,interpolation='linear') + sigmoid_Q_min_conf_hi = np.percentile(sigmoid_Q_min_diff,p_hi,interpolation='linear') + + sigmoid_Q_min_uncertainty[trans] = (sigmoid_Q_min_conf_lo, sigmoid_Q_min_conf_hi) + + # Sigmoid d: + sigmoid_d_diff = sigmoid_d[trans]-np.mean(sigmoid_d[trans]) + sigmoid_d_conf_lo = np.percentile(sigmoid_d_diff,p_lo,interpolation='linear') + sigmoid_d_conf_hi = np.percentile(sigmoid_d_diff,p_hi,interpolation='linear') + + sigmoid_d_uncertainty[trans] = (sigmoid_d_conf_lo*unit.kelvin, sigmoid_d_conf_hi*unit.kelvin) + + # Sigmoid Tm: + sigmoid_Tm_diff = sigmoid_Tm[trans]-np.mean(sigmoid_Tm[trans]) + sigmoid_Tm_conf_lo = np.percentile(sigmoid_Tm_diff,p_lo,interpolation='linear') + sigmoid_Tm_conf_hi = np.percentile(sigmoid_Tm_diff,p_hi,interpolation='linear') + + sigmoid_Tm_uncertainty[trans] = (sigmoid_Tm_conf_lo*unit.kelvin, sigmoid_Tm_conf_hi*unit.kelvin) + + # Q_folded: + Q_folded_diff = Q_folded[trans]-np.mean(Q_folded[trans]) + Q_folded_conf_lo = np.percentile(Q_folded_diff,p_lo,interpolation='linear') + Q_folded_conf_hi = np.percentile(Q_folded_diff,p_hi,interpolation='linear') + + Q_folded_uncertainty[trans] = (Q_folded_conf_lo, Q_folded_conf_hi*unit.kelvin) + + # Compile sigmoid results into dict of dicts: + sigmoid_results_boot = {} + + sigmoid_results_boot['sigmoid_Q_max_value'] = sigmoid_Q_max_value + sigmoid_results_boot['sigmoid_Q_max_uncertainty'] = sigmoid_Q_max_uncertainty + + sigmoid_results_boot['sigmoid_Q_min_value'] = 
sigmoid_Q_min_value + sigmoid_results_boot['sigmoid_Q_min_uncertainty'] = sigmoid_Q_min_uncertainty + + sigmoid_results_boot['sigmoid_d_value'] = sigmoid_d_value + sigmoid_results_boot['sigmoid_d_uncertainty'] = sigmoid_d_uncertainty + + sigmoid_results_boot['sigmoid_Tm_value'] = sigmoid_Tm_value + sigmoid_results_boot['sigmoid_Tm_uncertainty'] = sigmoid_Tm_uncertainty + + sigmoid_results_boot['Q_folded_value'] = Q_folded_value + sigmoid_results_boot['Q_folded_uncertainty'] = Q_folded_uncertainty + + # Plot Q vs T results with uncertainty and mean sigmoid parameters + if conf_percent=='sigma': + plot_partial_contact_fractions( + T_list_out, Q_values, Q_std, plotfile=plotfile, sigmoid_dict=sigmoid_results_boot + ) + # TODO: implement unequal upper and lower error plotting + + return T_list_out, Q_values, Q_uncertainty, sigmoid_results_boot + + def optimize_Q_tol_helix( cgmodel, native_structure_file, traj_file_list, output_data="output/output.nc", num_intermediate_states=0, frame_begin=0, frame_stride=1, backbone_type_name='bb', @@ -1485,14 +2100,7 @@ def optimize_Q_tol_helix( # Convert to a 1 element list if not one traj_file_list = traj_file_list.split() n_replicas = 1 - - # For homopolymer symmetry, check that the chains are linear, - # which is the only option implemented for symmetry checks so far. - if homopolymer_sym: - if len(cgmodel.particle_type_list) > 1: - print(f'Error: For homopolymer symmetry checks, only linear chains with one bead type are supported') - exit() - + for rep in range(n_replicas): if traj_file_list[rep][-3:] == 'dcd': traj_dict[rep] = md.load(traj_file_list[rep],top=md.Topology.from_openmm(cgmodel.topology)) @@ -1723,6 +2331,130 @@ def tanh_switch(x,x0,y0,y1,d): plt.close() +def plot_partial_contact_fractions(temperature_list, Q_values, Q_uncertainty, plotfile="partial_Q_vs_T.pdf", sigmoid_dict=None): + """ + Given a list of temperatures and corresponding partial contact fractions, plot Q vs T for each transition. + If a sigmoid dict from bootstrapping is given, also plot the sigmoid curve. + Note that this sigmoid curve is generated by using the mean values of the 4 hyperbolic fitting parameters + taken over all bootstrap trials, not a direct fit to the Q vs T data. 
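[Editorial note] As an aside on where the 4 hyperbolic fitting parameters come from, a generic illustration (an assumption, not necessarily how cg_openmm's fit_sigmoid is implemented) of fitting this switching form with scipy:

    import numpy as np
    from scipy.optimize import curve_fit

    def tanh_switch(x, x0, y0, y1, d):
        # 4-parameter hyperbolic switching form (same form plotted from the mean parameters below)
        return (y0 + y1) / 2 - ((y0 - y1) / 2) * np.tanh(np.radians(x - x0) / d)

    # Illustrative fit of a Q(T) curve (T_data in kelvin, Q_data dimensionless):
    # param_opt, param_cov = curve_fit(tanh_switch, T_data, Q_data, p0=[350.0, 1.0, 0.0, 1.0])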
+ + :param temperature_list: List of temperatures that will be used to define different replicas (thermodynamics states) + :type temperature_list: List( `SIMTK `_ `Unit() `_ * number_replicas ) + + :param Q: native contact fraction array for all temperatures in temperature_list + :type Q: np.array(float * len(temperature_list)) + + :param Q_uncertainty: uncertainty associated with Q + :type Q_uncertainty: np.array(float * len(temperature_list)) + + :param plotfile: Path to output file for plotting results (default='Q_vs_T.pdf') + :type plotfile: str + + :param sigmoid_dict: dictionary containing sigmoid parameter mean values and uncertainties (default=None) + :type sigmoid_dict: dict + """ + + temperature_array = np.zeros((len(temperature_list))) + for i in range(len(temperature_list)): + temperature_array[i] = temperature_list[i].value_in_unit(unit.kelvin) + + if sigmoid_dict is not None: + # Also plot sigmoid curve + def tanh_switch(x,x0,y0,y1,d): + return (y0+y1)/2-((y0-y1)/2)*np.tanh(np.radians(x-x0)/d) + + for trans, value in Q_values.items(): + xsig = np.linspace(temperature_array[0],temperature_array[-1],1000) + ysig = tanh_switch( + xsig, + sigmoid_dict['sigmoid_Tm_value'][trans].value_in_unit(unit.kelvin), + sigmoid_dict['sigmoid_Q_max_value'][trans], + sigmoid_dict['sigmoid_Q_min_value'][trans], + sigmoid_dict['sigmoid_d_value'][trans].value_in_unit(unit.kelvin), + ) + + + errorbar1 = plt.errorbar( + temperature_array, + Q_values[trans], + yerr=Q_uncertainty, + linewidth=0.5, + markersize=4, + fmt='o', + label=f'trans {trans} mean', + fillstyle='none', + capsize=4, + ) + + # Get the color of the errorbar: + lines_tuple = errorbar1.lines + # This contains (data_line(Line2D), caplines(Line2D), barlinecols(LineCollection)) + color1 = lines_tuple[0].get_color() + + line2 = plt.plot( + xsig, ysig,'-', + label=f'trans {trans} fit', + color=color1, # keep the same color + ) + + line3 = plt.errorbar( + sigmoid_dict['sigmoid_Tm_value'][trans].value_in_unit(unit.kelvin), + sigmoid_dict['Q_folded_value'][trans], + xerr=sigmoid_dict['sigmoid_Tm_uncertainty'][trans][1].value_in_unit(unit.kelvin), + yerr=sigmoid_dict['Q_folded_uncertainty'][trans][1], + linewidth=0.5, + markersize=4, + fmt='D-r', + fillstyle='none', + capsize=4, + ) + + xlim = plt.xlim() + ylim = plt.ylim() + + # TODO: update to use the asymmetric uncertainties here for confidence intervals + # We can add the hyperbolic fits with parameters for the upper and lower confidence bounds + # plt.text( + # (xlim[0]+0.90*(xlim[1]-xlim[0])), + # (ylim[0]+0.50*(ylim[1]-ylim[0])), + # f"T_m = {sigmoid_dict['sigmoid_Tm_value'].value_in_unit(unit.kelvin):.2f} \u00B1 {sigmoid_dict['sigmoid_Tm_uncertainty'][1].value_in_unit(unit.kelvin):.2f} \n"\ + # f"Q_m = {sigmoid_dict['Q_folded_value']:.4f} \u00B1 {sigmoid_dict['Q_folded_uncertainty'][1]:.4f}\n"\ + # f"d = {sigmoid_dict['sigmoid_d_value'].value_in_unit(unit.kelvin):.4f} \u00B1 {sigmoid_dict['sigmoid_d_uncertainty'][1].value_in_unit(unit.kelvin):.4f}\n"\ + # f"Qmax = {sigmoid_dict['sigmoid_Q_max_value']:.4f} \u00B1 {sigmoid_dict['sigmoid_Q_max_uncertainty'][1]:.4f}\n"\ + # f"Qmin = {sigmoid_dict['sigmoid_Q_min_value']:.4f} \u00B1 {sigmoid_dict['sigmoid_Q_min_uncertainty'][1]:.4f}", + # {'fontsize': 10}, + # horizontalalignment='right', + # ) + + else: + for trans, value in Q_values.items(): + plt.errorbar( + temperature_array, + Q_values[trans], + Q_uncertainty[trans], + linewidth=0.5, + markersize=4, + label=f'trans {trans}', + fmt='o-', + fillstyle='none', + capsize=4, + ) + + 
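[Editorial note] For the TODO earlier in this function about unequal upper and lower error bars, one possible approach (a hedged sketch, not part of this patch) is to pass the asymmetric confidence bounds to matplotlib as a (2, N) yerr array of positive lower/upper offsets.

    import numpy as np
    import matplotlib.pyplot as plt

    def errorbar_asymmetric(T, Q_mean, Q_conf_lo, Q_conf_hi, label):
        # Q_conf_lo / Q_conf_hi are the lower/upper offsets stored in Q_uncertainty[trans];
        # matplotlib expects both rows of yerr to be positive magnitudes
        yerr = np.vstack((np.abs(np.asarray(Q_conf_lo)), np.asarray(Q_conf_hi)))
        plt.errorbar(T, Q_mean, yerr=yerr, fmt='o', markersize=4,
                     linewidth=0.5, capsize=4, fillstyle='none', label=label)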
plt.legend( + loc='upper right', + fontsize=6, + ) + + # Fix y limits: + plt.xlim((temperature_array[0],temperature_array[-1])) + plt.ylim((0,1)) + + plt.xlabel("T (K)") + plt.ylabel("Contact fraction") + plt.savefig(plotfile) + plt.close() + + def plot_native_contact_timeseries( Q, time_interval=1.0*unit.picosecond, diff --git a/cg_openmm/tests/test_native_contacts.py b/cg_openmm/tests/test_native_contacts.py index f7d9edf2..6d55c7c1 100644 --- a/cg_openmm/tests/test_native_contacts.py +++ b/cg_openmm/tests/test_native_contacts.py @@ -472,6 +472,163 @@ def test_expectations_fraction_contacts_dcd(tmpdir): assert os.path.isfile(f"{output_directory}/Q_vs_T_fit.pdf") +def test_expectations_partial_contact_fractions_2state(tmpdir): + """ + Test the partial contact fraction expectation code for the trivial case + of a 2 state system (should match the original expectation native contact fraction result) + """ + + output_directory = tmpdir.mkdir("output") + + # Replica exchange settings + number_replicas = 12 + min_temp = 200.0 * unit.kelvin + max_temp = 600.0 * unit.kelvin + temperature_list = get_temperature_list(min_temp, max_temp, number_replicas) + + # Load in cgmodel + cgmodel = pickle.load(open(f"{data_path}/stored_cgmodel.pkl", "rb" )) + + # Create list of pdb trajectories to analyze + # For expectation fraction native contacts, we use replica trajectories: + dcd_file_list = [] + for i in range(len(temperature_list)): + dcd_file_list.append(f"{data_path}/replica_{i+1}.dcd") + + # Load in native structure file: + native_structure_file=f"{structures_path}/medoid_0.dcd" + + # Set cutoff parameters: + # Cutoff for native structure pairwise distances: + native_contact_cutoff = 4.0* unit.angstrom + + # Tolerance for current trajectory distances: + native_contact_tol = 1.5 + + # Set production frames: + frame_begin = 100 + + # Get native contacts: + native_contact_list, native_contact_distances, contact_type_dict = get_native_contacts( + cgmodel, + native_structure_file, + native_contact_cutoff, + ) + + Q, Q_avg, Q_stderr, decorrelation_spacing = fraction_native_contacts( + cgmodel, + dcd_file_list, + native_contact_list, + native_contact_distances, + frame_begin=frame_begin, + native_contact_tol=native_contact_tol, + ) + + # Set 1 as folded, 0 as unfolded: + Q_folded = 0.3 + array_folded_states = np.multiply((Q>=Q_folded),1) + + output_data = os.path.join(data_path, "output.nc") + num_intermediate_states=1 + + Q_values_partial, T_list_out = expectations_partial_contact_fractions( + array_folded_states, + Q, + frame_begin=100, + output_data=output_data, + num_intermediate_states=num_intermediate_states, + transition_list='0_1', + ) + + Q_results = expectations_fraction_contacts( + Q, + frame_begin=100, + output_data=output_data, + num_intermediate_states=num_intermediate_states, + ) + + assert Q_results['Q'].all() == Q_values_partial['0_1'].all() + + +def test_expectations_partial_contact_fractions_2state_no_list(tmpdir): + """ + Test the partial contact fraction expectation code for the trivial case + of a 2 state system (should match the original expectation native contact fraction result) + (no transition list explicitly specified) + """ + + output_directory = tmpdir.mkdir("output") + + # Replica exchange settings + number_replicas = 12 + min_temp = 200.0 * unit.kelvin + max_temp = 600.0 * unit.kelvin + temperature_list = get_temperature_list(min_temp, max_temp, number_replicas) + + # Load in cgmodel + cgmodel = pickle.load(open(f"{data_path}/stored_cgmodel.pkl", "rb" )) + + # Create list of 
pdb trajectories to analyze + # For expectation fraction native contacts, we use replica trajectories: + dcd_file_list = [] + for i in range(len(temperature_list)): + dcd_file_list.append(f"{data_path}/replica_{i+1}.dcd") + + # Load in native structure file: + native_structure_file=f"{structures_path}/medoid_0.dcd" + + # Set cutoff parameters: + # Cutoff for native structure pairwise distances: + native_contact_cutoff = 4.0* unit.angstrom + + # Tolerance for current trajectory distances: + native_contact_tol = 1.5 + + # Set production frames: + frame_begin = 100 + + # Get native contacts: + native_contact_list, native_contact_distances, contact_type_dict = get_native_contacts( + cgmodel, + native_structure_file, + native_contact_cutoff, + ) + + Q, Q_avg, Q_stderr, decorrelation_spacing = fraction_native_contacts( + cgmodel, + dcd_file_list, + native_contact_list, + native_contact_distances, + frame_begin=frame_begin, + native_contact_tol=native_contact_tol, + ) + + # Set 1 as folded, 0 as unfolded: + Q_folded = 0.3 + array_folded_states = np.multiply((Q>=Q_folded),1) + + output_data = os.path.join(data_path, "output.nc") + num_intermediate_states=1 + + Q_values_partial, T_list_out = expectations_partial_contact_fractions( + array_folded_states, + Q, + frame_begin=100, + output_data=output_data, + num_intermediate_states=num_intermediate_states, + transition_list=None, + ) + + Q_results = expectations_fraction_contacts( + Q, + frame_begin=100, + output_data=output_data, + num_intermediate_states=num_intermediate_states, + ) + + assert Q_results['Q'].all() == Q_values_partial['0_1'].all() + + def test_expectations_fraction_contacts_dcd_homopolymer_sym(tmpdir): """ See if we can determine native contacts expectations as a function of T, @@ -578,6 +735,151 @@ def test_bootstrap_native_contacts_expectation_dcd(tmpdir): assert os.path.isfile(f'{output_directory}/native_contacts_boot.pdf') +def test_bootstrap_partial_contacts_expectation_dcd_2state(tmpdir): + """Test bootstrapping of partial native contacts expectation, based on helix contacts""" + + output_directory = tmpdir.mkdir("output") + output_data = os.path.join(data_path, "output.nc") + + # Replica exchange settings + number_replicas = 12 + + # Load in cgmodel + cgmodel = pickle.load(open(f"{data_path}/stored_cgmodel.pkl", "rb" )) + + # Create list of pdb trajectories to analyze + # For fraction_native_contacts vs. T, we use state trajectories. 
+ # However, we can test with the replica pdbs: + dcd_file_list = [] + for i in range(number_replicas): + dcd_file_list.append(f"{data_path}/replica_{i+1}.dcd") + + # Load in native structure file: + native_structure_file=f"{structures_path}/medoid_0.dcd" + + # Set cutoff parameters: + # Cutoff for native structure pairwise distances: + native_contact_cutoff = 4.0* unit.angstrom + + # Tolerance for current trajectory distances: + native_contact_tol = 1.5 + + # Determine native contacts: + native_contact_list, native_contact_distances, contact_type_dict = get_native_contacts( + cgmodel, + native_structure_file, + native_contact_cutoff, + ) + + frame_begin = 100 + + Q, Q_avg, Q_stderr, decorrelation_spacing = fraction_native_contacts( + cgmodel, + dcd_file_list, + native_contact_list, + native_contact_distances, + frame_begin=frame_begin, + native_contact_tol=native_contact_tol, + ) + + # Set 1 as folded, 0 as unfolded: + Q_folded = 0.3 + array_folded_states = np.multiply((Q>=Q_folded),1) + + full_T_list, Q_values, Q_uncertainty, sigmoid_results_boot = bootstrap_partial_contacts_expectation( + cgmodel, + dcd_file_list, + native_contact_list, + native_contact_distances, + array_folded_states, + output_data=output_data, + frame_begin=frame_begin, + sample_spacing=20, + native_contact_tol=native_contact_tol, + num_intermediate_states=1, + n_trial_boot=10, + conf_percent='sigma', + plotfile=f'{output_directory}/partial_contacts_boot.pdf', + transition_list='0_1', + ) + + assert os.path.isfile(f'{output_directory}/partial_contacts_boot.pdf') + + +def test_bootstrap_partial_contacts_expectation_dcd_2state_no_list(tmpdir): + """ + Test bootstrapping of partial native contacts expectation, based on helix contacts + (no transition list explicitly specified) + """ + + output_directory = tmpdir.mkdir("output") + output_data = os.path.join(data_path, "output.nc") + + # Replica exchange settings + number_replicas = 12 + + # Load in cgmodel + cgmodel = pickle.load(open(f"{data_path}/stored_cgmodel.pkl", "rb" )) + + # Create list of pdb trajectories to analyze + # For fraction_native_contacts vs. T, we use state trajectories. 
+ # However, we can test with the replica pdbs: + dcd_file_list = [] + for i in range(number_replicas): + dcd_file_list.append(f"{data_path}/replica_{i+1}.dcd") + + # Load in native structure file: + native_structure_file=f"{structures_path}/medoid_0.dcd" + + # Set cutoff parameters: + # Cutoff for native structure pairwise distances: + native_contact_cutoff = 4.0* unit.angstrom + + # Tolerance for current trajectory distances: + native_contact_tol = 1.5 + + # Determine native contacts: + native_contact_list, native_contact_distances, contact_type_dict = get_native_contacts( + cgmodel, + native_structure_file, + native_contact_cutoff, + ) + + frame_begin = 100 + + Q, Q_avg, Q_stderr, decorrelation_spacing = fraction_native_contacts( + cgmodel, + dcd_file_list, + native_contact_list, + native_contact_distances, + frame_begin=frame_begin, + native_contact_tol=native_contact_tol, + ) + + # Set 1 as folded, 0 as unfolded: + Q_folded = 0.3 + array_folded_states = np.multiply((Q>=Q_folded),1) + + full_T_list, Q_values, Q_uncertainty, sigmoid_results_boot = bootstrap_partial_contacts_expectation( + cgmodel, + dcd_file_list, + native_contact_list, + native_contact_distances, + array_folded_states, + output_data=output_data, + frame_begin=frame_begin, + sample_spacing=20, + native_contact_tol=native_contact_tol, + num_intermediate_states=1, + n_trial_boot=10, + conf_percent='sigma', + plotfile=f'{output_directory}/partial_contacts_boot.pdf', + transition_list=None, + ) + + assert os.path.isfile(f'{output_directory}/partial_contacts_boot.pdf') + + def test_bootstrap_native_contacts_expectation_dcd_homopolymer_sym(tmpdir): """ Test bootstrapping of native contacts expectation, based on helix contacts, From 37547d25c1a09c726127baa7495865abe856a8aa Mon Sep 17 00:00:00 2001 From: Chris Walker Date: Wed, 12 Jan 2022 11:00:50 -0500 Subject: [PATCH 2/5] return N_eff_samples for partial Cv's --- cg_openmm/tests/test_thermo.py | 14 ++++++++------ cg_openmm/thermo/calc.py | 27 ++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/cg_openmm/tests/test_thermo.py b/cg_openmm/tests/test_thermo.py index a489dd23..06c58746 100644 --- a/cg_openmm/tests/test_thermo.py +++ b/cg_openmm/tests/test_thermo.py @@ -171,7 +171,7 @@ def test_partial_heat_capacity_2state(tmpdir): array_folded_states = np.multiply((Q>=Q_folded),1) (Cv_partial, dCv_partial, temperature_list, \ - FWHM_partial, Tm_partial, Cv_height_partial, U_expect_confs) = get_partial_heat_capacities( + FWHM_partial, Tm_partial, Cv_height_partial, U_expect_confs, N_eff_partial) = get_partial_heat_capacities( array_folded_states=array_folded_states, frame_begin=frame_begin, sample_spacing=1, @@ -186,7 +186,7 @@ def test_partial_heat_capacity_2state(tmpdir): # Check data types of all output: for varname in [ - 'Cv_partial','FWHM_partial','Tm_partial','Cv_height_partial','U_expect_confs' + 'Cv_partial','FWHM_partial','Tm_partial','Cv_height_partial','U_expect_confs','N_eff_partial', ]: assert eval(f'type({varname})') == dict @@ -255,7 +255,7 @@ def test_bootstrap_partial_heat_capacity_2state_sigma(tmpdir): (T_list, Cv_values, Cv_uncertainty, Tm_value, Tm_uncertainty, \ Cv_height_value, Cv_height_uncertainty, FWHM_value, FWHM_uncertainty, \ - U_expect_values, U_expect_uncertainty) = bootstrap_partial_heat_capacities( + U_expect_values, U_expect_uncertainty, N_eff_values) = bootstrap_partial_heat_capacities( array_folded_states=array_folded_states, frame_begin=frame_begin, sample_spacing=1, @@ -273,7 +273,8 @@ def 
test_bootstrap_partial_heat_capacity_2state_sigma(tmpdir): # Check data types of all output: for varname in ['Cv_values','Cv_uncertainty', 'Tm_value','Tm_uncertainty','Cv_height_value','Cv_height_uncertainty', - 'FWHM_value','FWHM_uncertainty','U_expect_values','U_expect_uncertainty' + 'FWHM_value','FWHM_uncertainty','U_expect_values','U_expect_uncertainty', + 'N_eff_values', ]: assert eval(f'type({varname})') == dict @@ -342,7 +343,7 @@ def test_bootstrap_partial_heat_capacity_2state_conf(tmpdir): (T_list, Cv_values, Cv_uncertainty, Tm_value, Tm_uncertainty, \ Cv_height_value, Cv_height_uncertainty, FWHM_value, FWHM_uncertainty, \ - U_expect_values, U_expect_uncertainty) = bootstrap_partial_heat_capacities( + U_expect_values, U_expect_uncertainty, N_eff_values) = bootstrap_partial_heat_capacities( array_folded_states=array_folded_states, frame_begin=frame_begin, sample_spacing=2, @@ -360,7 +361,8 @@ def test_bootstrap_partial_heat_capacity_2state_conf(tmpdir): # Check data types of all output: for varname in ['Cv_values','Cv_uncertainty', 'Tm_value','Tm_uncertainty','Cv_height_value','Cv_height_uncertainty', - 'FWHM_value','FWHM_uncertainty','U_expect_values','U_expect_uncertainty' + 'FWHM_value','FWHM_uncertainty','U_expect_values','U_expect_uncertainty', + 'N_eff_values', ]: assert eval(f'type({varname})') == dict diff --git a/cg_openmm/thermo/calc.py b/cg_openmm/thermo/calc.py index 8106b8d8..6e697636 100644 --- a/cg_openmm/thermo/calc.py +++ b/cg_openmm/thermo/calc.py @@ -457,6 +457,7 @@ def get_partial_heat_capacities(array_folded_states, - Tm_partial ( dict ( float * unit.simtk.temperature ) ) - For each conformational class, melting point from heat capacity vs T - Cv_height_partial ( dict ( float * kJ/mol/K ) ) - For each conformational class, relative height of heat capacity peak - U_expect_confs ( dict ( np.array( float * kJ/mol ) ) - For each conformational class, the energy expectations (in kJ/mol) at each T, including intermediate states + - N_eff_partial ( dict ( np.array( float ) ) ) - For each conformational class, the number of effective samples contributing to that state's heat capacity expectation """ if bootstrap_energies is not None: @@ -582,6 +583,8 @@ def get_partial_heat_capacities(array_folded_states, # Here we need to classify each of the sample mbar weights num_confs = {} # w_i*U_i, for each state. den_confs = {} # w_i + N_eff_partial = {} # Number of effective samples for each conformational state and temperature + w_nk_sq_sum = {} U_expect_confs = {} # Energy expectations n_conf_states = len(np.unique(array_folded_states)) @@ -589,22 +592,30 @@ def get_partial_heat_capacities(array_folded_states, for c in range(n_conf_states): num_confs[c] = 0 den_confs[c] = 0 + w_nk_sq_sum[c] = 0 for frame in range(len(array_folded_states)): # Sum the weighted energies for partial heat capacity eval: # These energies should be the same unsampled_state_energies used in the computeExpectations calculation + + # Also compute the effective numbers of sample contributing to each partial Cv: + # N_eff,k = (sum(w_nk))^2/(sum(w_nk^2)) + for c in range(n_conf_states): if array_folded_states[frame] == c: num_confs[c] += w_nk[frame,:]*unsampled_state_energies[:,frame] den_confs[c] += w_nk[frame,:] + w_nk_sq_sum[c] += w_nk[frame,:]**2 break # Compute the expectations for each conformational class: # These are also needed in the heat capacity decomposition, # for the term involving the square of the energy expectation # differences. 
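[Editorial note] The effective-sample-size estimate introduced above, N_eff,k = (sum_n w_nk)^2 / sum_n w_nk^2, accumulated over only the frames classified in a given conformational state, can be written compactly; a standalone sketch (helper name is illustrative):

    import numpy as np

    def effective_samples_per_state(w_nk, folded_states_flat):
        # w_nk: (n_frames, n_states) MBAR weights; folded_states_flat: (n_frames,) integer labels
        N_eff = {}
        for c in np.unique(folded_states_flat):
            w_c = w_nk[folded_states_flat == c, :]
            N_eff[c] = w_c.sum(axis=0) ** 2 / (w_c ** 2).sum(axis=0)
        return N_eff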
+ for c in range(n_conf_states): U_expect_confs[c] = num_confs[c]/den_confs[c] + N_eff_partial[c] = (den_confs[c])**2/w_nk_sq_sum[c] # Compute the array of differences to use for the heat capacity calculation: U_expect_confs_diff = {} @@ -650,10 +661,12 @@ def get_partial_heat_capacities(array_folded_states, dCv_partial = None # Slice the U_expect_confs arrays to correspond to the output temperature list, and add back units: + # Also slice the N_eff_partial array: for c in range(n_conf_states): U_expect_confs[c] = U_expect_confs[c][0:n_T_vals]*unit.kilojoule_per_mole + N_eff_partial[c] = N_eff_partial[c][0:n_T_vals] - return (Cv_partial, dCv_partial, full_T_list[0:n_T_vals], FWHM_partial, Tm_partial, Cv_height_partial, U_expect_confs) + return (Cv_partial, dCv_partial, full_T_list[0:n_T_vals], FWHM_partial, Tm_partial, Cv_height_partial, U_expect_confs, N_eff_partial) def get_heat_capacity_reeval( @@ -997,7 +1010,7 @@ def bootstrap_partial_heat_capacities(array_folded_states, U_expect_confs_boot = {} - N_eff_boot = {} + N_eff_partial_boot = {} Tm_boot = {} Cv_height_boot = {} @@ -1036,7 +1049,7 @@ def bootstrap_partial_heat_capacities(array_folded_states, # Run partial heat capacity expectation calculation: (Cv_partial_values_boot[i_boot], Cv_partial_uncertainty_boot_out, T_list, - FWHM_curr, Tm_curr, Cv_height_curr, U_expect_confs_boot[i_boot]) = get_partial_heat_capacities( + FWHM_curr, Tm_curr, Cv_height_curr, U_expect_confs_boot[i_boot], N_eff_partial_boot[i_boot]) = get_partial_heat_capacities( array_folded_states_resample, output_data=output_data, num_intermediate_states=num_intermediate_states, @@ -1074,14 +1087,17 @@ def bootstrap_partial_heat_capacities(array_folded_states, for c in range(n_conf_states): arr_Cv_values_boot[c] = np.zeros((n_trial_boot, len(T_list))) arr_U_expect_confs_boot[c] = np.zeros((n_trial_boot, len(T_list))) + arr_N_eff_boot[c] = np.zeros((n_trial_boot, len(T_list))) for i_boot in range(n_trial_boot): arr_Cv_values_boot[c][i_boot,:] = Cv_partial_values_boot[i_boot][c].value_in_unit(Cv_unit) arr_U_expect_confs_boot[c][i_boot,:] = U_expect_confs_boot[i_boot][c].value_in_unit(U_unit) + arr_N_eff_boot[c][i_boot,:] = N_eff_partial_boot[i_boot][c] # Compute mean values: Cv_values = {} U_expect_values = {} + N_eff_values = {} Cv_height_value = {} Tm_value = {} FWHM_value = {} @@ -1089,6 +1105,7 @@ def bootstrap_partial_heat_capacities(array_folded_states, for c in range(n_conf_states): Cv_values[c] = np.mean(arr_Cv_values_boot[c],axis=0)*Cv_unit U_expect_values[c] = np.mean(arr_U_expect_confs_boot[c],axis=0)*U_unit + N_eff_values[c] = np.mean(arr_N_eff_boot[c],axis=0) Cv_height_value[c] = np.mean(Cv_height_boot[c])*Cv_unit Tm_value[c] = np.mean(Tm_boot[c])*T_unit FWHM_value[c] = np.mean(FWHM_boot[c])*T_unit @@ -1096,7 +1113,7 @@ def bootstrap_partial_heat_capacities(array_folded_states, # Compute confidence intervals: Cv_uncertainty = {} U_expect_uncertainty = {} - Cv_height_uncertainty = {} + Cv_height_uncertainty = {} Tm_uncertainty = {} FWHM_uncertainty = {} @@ -1170,7 +1187,7 @@ def bootstrap_partial_heat_capacities(array_folded_states, plot_partial_heat_capacities(Cv_values, Cv_uncertainty, T_list, file_name=plot_file) return (T_list, Cv_values, Cv_uncertainty, Tm_value, Tm_uncertainty, Cv_height_value, Cv_height_uncertainty, - FWHM_value, FWHM_uncertainty, U_expect_values, U_expect_uncertainty) + FWHM_value, FWHM_uncertainty, U_expect_values, U_expect_uncertainty, N_eff_values) def bootstrap_heat_capacity(frame_begin=0, sample_spacing=1, frame_end=-1, 
plot_file='heat_capacity_boot.pdf', From c8d06935530b9b9c279bc280403663d6a3a0ded6 Mon Sep 17 00:00:00 2001 From: Chris Walker Date: Wed, 12 Jan 2022 12:38:48 -0500 Subject: [PATCH 3/5] compute partial contacts by state, not transition --- cg_openmm/parameters/secondary_structure.py | 219 ++++++++------------ cg_openmm/tests/test_native_contacts.py | 166 ++++++++++++--- 2 files changed, 223 insertions(+), 162 deletions(-) diff --git a/cg_openmm/parameters/secondary_structure.py b/cg_openmm/parameters/secondary_structure.py index 2e7c6c87..92da17f8 100644 --- a/cg_openmm/parameters/secondary_structure.py +++ b/cg_openmm/parameters/secondary_structure.py @@ -368,10 +368,10 @@ def expectations_fraction_contacts(fraction_native_contacts, frame_begin=0, samp def expectations_partial_contact_fractions(array_folded_states, fraction_native_contacts, frame_begin=0, sample_spacing=1, output_data="output/output.nc", num_intermediate_states=0, - bootstrap_energies=None, transition_list=None): + bootstrap_energies=None): """ For systems with more than 2 conformational states, compute the expectation fraction of contacts - vs temperature for each individual transition by summing the appropriate mbar weights. + vs temperature for each individual configurational state by summing the appropriate mbar weights. :param array_folded_states: a precomputed array classifying the different conformational states :type array_folded_states: 2d numpy array (int) @@ -394,11 +394,8 @@ def expectations_partial_contact_fractions(array_folded_states, fraction_native_ :param bootstrap_energies: a custom replica_energies array to be used for bootstrapping calculations. Used instead of the energies in the .nc file. (default=None) :type bootstrap_energies: 2d numpy array (float) - :param transition_list: a list containing the allowed transitions to a folded state, such as '0_1' or '1_2'. - :type transition_list: list ( str ) - :returns: - - Q_expect_confs ( dict ) - dictionary of the form {'m_n': 1D numpy array} containing partial contact fractions for each transition between states m and n + - Q_expect_confs ( dict ) - dictionary of the form {'m_n': 1D numpy array} containing partial contact fractions for each conformational state - T_list_out ( np.array ( float * unit.simtk.temperature ) ) Temperature list corresponding to the partial contact fraction values """ @@ -515,50 +512,23 @@ def expectations_partial_contact_fractions(array_folded_states, fraction_native_ Q_expect_confs = {} # Contact fraction expectations n_conf_states = len(np.unique(array_folded_states)) - - if n_conf_states == 1: - print(f'Only 1 configurational state found. 
Exiting...') - exit() - - print(f'n_conf_states: {n_conf_states}') - - if transition_list is None: - # Compute all transitions: - transition_list = [] - for m in range(n_conf_states): - for n in range(m+1,n_conf_states): - transition_list.append(f'{m}_{n}') - - elif type(transition_list) == str: - # Convert to list if a single string: - if type(transition_list) == str: - transition_list = transition_list.split() - - print(f'transitions: {transition_list}') - + for m in range(n_conf_states): - for n in range(m+1,n_conf_states): - if f'{m}_{n}' in transition_list: - num_confs[f'{m}_{n}'] = 0 - den_confs[f'{m}_{n}'] = 0 + num_confs[f'{m}'] = 0 + den_confs[f'{m}'] = 0 - # Loop over the conformational transitions: + # Loop over the conformational states: for frame in range(len(array_folded_states)): # Sum the weighted energies for partial contact fraction eval: - # These expectations are for Q in the subset of states (m,n) for m in range(n_conf_states): - for n in range(m+1,n_conf_states): - if f'{m}_{n}' in transition_list: - if array_folded_states[frame] == m or array_folded_states[frame] == n: - num_confs[f'{m}_{n}'] += w_nk[frame,:]*Q[frame] - den_confs[f'{m}_{n}'] += w_nk[frame,:] - break + if array_folded_states[frame] == m: + num_confs[f'{m}'] += w_nk[frame,:]*Q[frame] + den_confs[f'{m}'] += w_nk[frame,:] + break # Now compute the final expectation values: for m in range(n_conf_states): - for n in range(m+1,n_conf_states): - if f'{m}_{n}' in transition_list: - Q_expect_confs[f'{m}_{n}'] = num_confs[f'{m}_{n}']/den_confs[f'{m}_{n}'] + Q_expect_confs[f'{m}'] = num_confs[f'{m}']/den_confs[f'{m}'] # ***TODO: implement uncertainty directly from the mbar algorithm @@ -691,7 +661,7 @@ def fraction_native_contacts( for m in range(n_mono): for p in range(n_particles_per_mono): - particle_indices_reverse = n_particles - n_particles_per_mono*(m+1) - p + particle_indices_reverse.append(n_particles - n_particles_per_mono*(m+1) + p) # For example, if forward indices [0,1] is [bb,sc] with 84 total particles, then [82,83] is the reverse # Now select the contact pairs: @@ -815,6 +785,7 @@ def fraction_native_contacts_preloaded( if rep == 0: nframes = traj_dict[rep][frame_begin:].n_frames + n_particles = traj_dict[rep][frame_begin:].n_atoms Q = np.zeros((nframes,n_replicas)) traj_distances = md.compute_distances( @@ -827,7 +798,6 @@ def fraction_native_contacts_preloaded( # ***Note: this assumes that the particles are indexed by monomer, and in the same # order for each monomer. - n_particles = rep_traj.n_atoms reverse_contact_list = [] if len(cgmodel.particle_type_list) == 1: @@ -866,7 +836,7 @@ def fraction_native_contacts_preloaded( for m in range(n_mono): for p in range(n_particles_per_mono): - particle_indices_reverse = n_particles - n_particles_per_mono*(m+1) - p + particle_indices_reverse.append(n_particles - n_particles_per_mono*(m+1) + p) # For example, if forward indices [0,1] is [bb,sc] with 84 total particles, then [82,83] is the reverse # Now select the contact pairs: @@ -1689,7 +1659,6 @@ def bootstrap_partial_contacts_expectation( conf_percent='sigma', plotfile='Q_vs_T_bootstrap.pdf', homopolymer_sym=False, - transition_list=None, ): """ Given a cgmodel, native contact definitions, and trajectory file list, this function calculates the @@ -1697,7 +1666,7 @@ def bootstrap_partial_contacts_expectation( the uncertainties in the Q vs T folding curve. Intended to be used after the native contact tolerance has been optimized (either the helical or generalized versions). 
- Here the partial contact fractions are computed, using only the structures classified within a pair of conformations. + Here the partial contact fractions are computed for each conformational state in array_folded_states :param cgmodel: CGModel() class object :type cgmodel: class @@ -1795,19 +1764,7 @@ def bootstrap_partial_contacts_expectation( # Get the number of conformational state classifications: n_conf_states = len(np.unique(array_folded_states)) - - if transition_list is None: - # Compute all transitions: - transition_list = [] - for m in range(n_conf_states): - for n in range(m+1,n_conf_states): - transition_list.append(f'{m}_{n}') - - elif type(transition_list) == str: - # Convert to list if a single string: - if type(transition_list) == str: - transition_list = transition_list.split() - + # For each bootstrap trial, compute the expectation of native contacts and fit to sigmoid. Q_expect_boot = {} @@ -1817,12 +1774,12 @@ def bootstrap_partial_contacts_expectation( sigmoid_Tm = {} Q_folded = {} - for trans in transition_list: - sigmoid_Q_max[trans] = np.zeros(n_trial_boot) - sigmoid_Q_min[trans] = np.zeros(n_trial_boot) - sigmoid_d[trans] = np.zeros(n_trial_boot) - sigmoid_Tm[trans] = np.zeros(n_trial_boot) - Q_folded[trans] = np.zeros(n_trial_boot) + for m in range(n_conf_states): + sigmoid_Q_max[m] = np.zeros(n_trial_boot) + sigmoid_Q_min[m] = np.zeros(n_trial_boot) + sigmoid_d[m] = np.zeros(n_trial_boot) + sigmoid_Tm[m] = np.zeros(n_trial_boot) + Q_folded[m] = np.zeros(n_trial_boot) for i_boot in range(n_trial_boot): # Select production frames to analyze @@ -1857,7 +1814,7 @@ def bootstrap_partial_contacts_expectation( j += 1 # Run the native contacts expectation calculation: - # Q_expect_boot is nested dict with keys [i_boot]['m_n'] + # Q_expect_boot is nested dict with keys [i_boot]['m'] Q_expect_boot[i_boot], T_list_out = expectations_partial_contact_fractions( array_folded_states_resample, Q_resample, @@ -1865,31 +1822,29 @@ def bootstrap_partial_contacts_expectation( num_intermediate_states=num_intermediate_states, bootstrap_energies=replica_energies_resample, output_data=output_data, - transition_list=transition_list, ) - # Fit contact fraction vs T for each transition to sigmoid: - for trans in transition_list: + # Fit contact fraction vs T for each state to sigmoid: + for m in range(n_conf_states): param_opt, param_cov = fit_sigmoid( T_list_out, - Q_expect_boot[i_boot][trans], + Q_expect_boot[i_boot][f'{m}'], plotfile=None ) # Save the individual parameters: # These have the reversed order of keys: if param_opt[1] >= param_opt[2]: - sigmoid_Q_max[trans][i_boot] = param_opt[1] - sigmoid_Q_min[trans][i_boot] = param_opt[2] + sigmoid_Q_max[m][i_boot] = param_opt[1] + sigmoid_Q_min[m][i_boot] = param_opt[2] else: # This shouldn't occur unless d is negative - print(f'Error with sigmoid fitting') - sigmoid_Q_max[trans][i_boot] = param_opt[2] - sigmoid_Q_min[trans][i_boot] = param_opt[1] + sigmoid_Q_max[m][i_boot] = param_opt[2] + sigmoid_Q_min[m][i_boot] = param_opt[1] - sigmoid_d[trans][i_boot] = param_opt[3] - sigmoid_Tm[trans][i_boot] = param_opt[0] - Q_folded[trans][i_boot] = (param_opt[1]+param_opt[2])/2 + sigmoid_d[m][i_boot] = param_opt[3] + sigmoid_Tm[m][i_boot] = param_opt[0] + Q_folded[m][i_boot] = (param_opt[1]+param_opt[2])/2 # Compute uncertainty at all temps in Q_expect_boot over the n_trial_boot trials performed: @@ -1899,11 +1854,11 @@ def bootstrap_partial_contacts_expectation( # Convert bootstrap trial dicts to array arr_Q_values_boot = {} - for trans in 
transition_list: - arr_Q_values_boot[trans] = np.zeros((n_trial_boot, n_temps)) + for m in range(n_conf_states): + arr_Q_values_boot[m] = np.zeros((n_trial_boot, n_temps)) for i_boot in range(n_trial_boot): - arr_Q_values_boot[trans][i_boot,:] = Q_expect_boot[i_boot][trans] + arr_Q_values_boot[m][i_boot,:] = Q_expect_boot[i_boot][f'{m}'] # Compute mean values: @@ -1914,13 +1869,13 @@ def bootstrap_partial_contacts_expectation( sigmoid_Tm_value = {} Q_folded_value = {} - for trans in transition_list: - Q_values[trans] = np.mean(arr_Q_values_boot[trans],axis=0) - sigmoid_Q_max_value[trans] = np.mean(sigmoid_Q_max[trans]) - sigmoid_Q_min_value[trans] = np.mean(sigmoid_Q_min[trans]) - sigmoid_d_value[trans] = np.mean(sigmoid_d[trans])*unit.kelvin - sigmoid_Tm_value[trans] = np.mean(sigmoid_Tm[trans])*unit.kelvin - Q_folded_value[trans] = np.mean(Q_folded[trans]) + for m in range(n_conf_states): + Q_values[m] = np.mean(arr_Q_values_boot[m],axis=0) + sigmoid_Q_max_value[m] = np.mean(sigmoid_Q_max[m]) + sigmoid_Q_min_value[m] = np.mean(sigmoid_Q_min[m]) + sigmoid_d_value[m] = np.mean(sigmoid_d[m])*unit.kelvin + sigmoid_Tm_value[m] = np.mean(sigmoid_Tm[m])*unit.kelvin + Q_folded_value[m] = np.mean(Q_folded[m]) # Compute confidence intervals: Q_uncertainty = {} @@ -1933,78 +1888,78 @@ def bootstrap_partial_contacts_expectation( if conf_percent == 'sigma': # Use analytical standard deviation instead of percentile method: - for trans in transition_list: + for m in range(n_conf_states): # Q values: - Q_std = np.std(arr_Q_values_boot[trans],axis=0) - Q_uncertainty[trans] = (-Q_std, Q_std) + Q_std = np.std(arr_Q_values_boot[m],axis=0) + Q_uncertainty[m] = (-Q_std, Q_std) # Sigmoid Q_max: - sigmoid_Q_max_std = np.std(sigmoid_Q_max[trans]) - sigmoid_Q_max_uncertainty[trans] = (-sigmoid_Q_max_std, sigmoid_Q_max_std) + sigmoid_Q_max_std = np.std(sigmoid_Q_max[m]) + sigmoid_Q_max_uncertainty[m] = (-sigmoid_Q_max_std, sigmoid_Q_max_std) # Sigmoid Q_min: - sigmoid_Q_min_std = np.std(sigmoid_Q_min[trans]) - sigmoid_Q_min_uncertainty[trans] = (-sigmoid_Q_min_std, sigmoid_Q_min_std) + sigmoid_Q_min_std = np.std(sigmoid_Q_min[m]) + sigmoid_Q_min_uncertainty[m] = (-sigmoid_Q_min_std, sigmoid_Q_min_std) # Sigmoid d: - sigmoid_d_std = np.std(sigmoid_d[trans]) - sigmoid_d_uncertainty[trans] = (-sigmoid_d_std*unit.kelvin, sigmoid_d_std*unit.kelvin) + sigmoid_d_std = np.std(sigmoid_d[m]) + sigmoid_d_uncertainty[m] = (-sigmoid_d_std*unit.kelvin, sigmoid_d_std*unit.kelvin) # Sigmoid Tm: - sigmoid_Tm_std = np.std(sigmoid_Tm[trans]) - sigmoid_Tm_uncertainty[trans] = (-sigmoid_Tm_std*unit.kelvin, sigmoid_Tm_std*unit.kelvin) + sigmoid_Tm_std = np.std(sigmoid_Tm[m]) + sigmoid_Tm_uncertainty[m] = (-sigmoid_Tm_std*unit.kelvin, sigmoid_Tm_std*unit.kelvin) # Q_folded: - Q_folded_std = np.std(Q_folded[trans]) - Q_folded_uncertainty[trans] = (-Q_folded_std, Q_folded_std) + Q_folded_std = np.std(Q_folded[m]) + Q_folded_uncertainty[m] = (-Q_folded_std, Q_folded_std) else: # Compute specified confidence interval: p_lo = (100-conf_percent)/2 p_hi = 100-p_lo - for trans in transition_list: + for m in range(n_conf_states): # Q values: - Q_diff = arr_Q_values_boot[trans]-np.mean(arr_Q_values_boot[trans],axis=0) + Q_diff = arr_Q_values_boot[m]-np.mean(arr_Q_values_boot[m],axis=0) Q_conf_lo = np.percentile(Q_diff,p_lo,axis=0,interpolation='linear') Q_conf_hi = np.percentile(Q_diff,p_hi,axis=0,interpolation='linear') - Q_uncertainty[trans] = (Q_conf_lo, Q_conf_hi) + Q_uncertainty[m] = (Q_conf_lo, Q_conf_hi) # Sigmoid Q_max: - 
sigmoid_Q_max_diff = sigmoid_Q_max[trans]-np.mean(sigmoid_Q_max[trans])
+            sigmoid_Q_max_diff = sigmoid_Q_max[m]-np.mean(sigmoid_Q_max[m])
             sigmoid_Q_max_conf_lo = np.percentile(sigmoid_Q_max_diff,p_lo,interpolation='linear')
             sigmoid_Q_max_conf_hi = np.percentile(sigmoid_Q_max_diff,p_hi,interpolation='linear')
-            sigmoid_Q_max_uncertainty[trans] = (sigmoid_Q_max_conf_lo, sigmoid_Q_max_conf_hi)
+            sigmoid_Q_max_uncertainty[m] = (sigmoid_Q_max_conf_lo, sigmoid_Q_max_conf_hi)
             
             # Sigmoid Q_min:
-            sigmoid_Q_min_diff = sigmoid_Q_min[trans]-np.mean(sigmoid_Q_min[trans])
+            sigmoid_Q_min_diff = sigmoid_Q_min[m]-np.mean(sigmoid_Q_min[m])
             sigmoid_Q_min_conf_lo = np.percentile(sigmoid_Q_min_diff,p_lo,interpolation='linear')
             sigmoid_Q_min_conf_hi = np.percentile(sigmoid_Q_min_diff,p_hi,interpolation='linear')
-            sigmoid_Q_min_uncertainty[trans] = (sigmoid_Q_min_conf_lo, sigmoid_Q_min_conf_hi)
+            sigmoid_Q_min_uncertainty[m] = (sigmoid_Q_min_conf_lo, sigmoid_Q_min_conf_hi)
             
             # Sigmoid d:
-            sigmoid_d_diff = sigmoid_d[trans]-np.mean(sigmoid_d[trans])
+            sigmoid_d_diff = sigmoid_d[m]-np.mean(sigmoid_d[m])
             sigmoid_d_conf_lo = np.percentile(sigmoid_d_diff,p_lo,interpolation='linear')
             sigmoid_d_conf_hi = np.percentile(sigmoid_d_diff,p_hi,interpolation='linear')
-            sigmoid_d_uncertainty[trans] = (sigmoid_d_conf_lo*unit.kelvin, sigmoid_d_conf_hi*unit.kelvin)
+            sigmoid_d_uncertainty[m] = (sigmoid_d_conf_lo*unit.kelvin, sigmoid_d_conf_hi*unit.kelvin)
             
             # Sigmoid Tm:
-            sigmoid_Tm_diff = sigmoid_Tm[trans]-np.mean(sigmoid_Tm[trans])
+            sigmoid_Tm_diff = sigmoid_Tm[m]-np.mean(sigmoid_Tm[m])
             sigmoid_Tm_conf_lo = np.percentile(sigmoid_Tm_diff,p_lo,interpolation='linear')
             sigmoid_Tm_conf_hi = np.percentile(sigmoid_Tm_diff,p_hi,interpolation='linear')
-            sigmoid_Tm_uncertainty[trans] = (sigmoid_Tm_conf_lo*unit.kelvin, sigmoid_Tm_conf_hi*unit.kelvin)
+            sigmoid_Tm_uncertainty[m] = (sigmoid_Tm_conf_lo*unit.kelvin, sigmoid_Tm_conf_hi*unit.kelvin)
             
             # Q_folded:
-            Q_folded_diff = Q_folded[trans]-np.mean(Q_folded[trans])
+            Q_folded_diff = Q_folded[m]-np.mean(Q_folded[m])
             Q_folded_conf_lo = np.percentile(Q_folded_diff,p_lo,interpolation='linear')
             Q_folded_conf_hi = np.percentile(Q_folded_diff,p_hi,interpolation='linear')
-            Q_folded_uncertainty[trans] = (Q_folded_conf_lo, Q_folded_conf_hi*unit.kelvin)
+            Q_folded_uncertainty[m] = (Q_folded_conf_lo, Q_folded_conf_hi)
     
     # Compile sigmoid results into dict of dicts:
     sigmoid_results_boot = {}
@@ -2333,7 +2288,7 @@ def tanh_switch(x,x0,y0,y1,d):
 
 def plot_partial_contact_fractions(temperature_list, Q_values, Q_uncertainty, plotfile="partial_Q_vs_T.pdf",
     sigmoid_dict=None):
     """
-    Given a list of temperatures and corresponding partial contact fractions, plot Q vs T for each transition.
+    Given a list of temperatures and corresponding partial contact fractions, plot Q vs T for each conformational state.
     If a sigmoid dict from bootstrapping is given, also plot the sigmoid curve. Note that this sigmoid curve is
     generated by using the mean values of the 4 hyperbolic fitting parameters taken over all bootstrap trials,
     not a direct fit to the Q vs T data.
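For reference, a minimal usage sketch of the per-state calculation this patch introduces. Here Q and array_folded_states are assumed to be the contact-fraction array and the state classification produced earlier in an analysis script, and the frame settings and output.nc path are example values only:

    from cg_openmm.parameters.secondary_structure import expectations_partial_contact_fractions

    Q_partial, T_list = expectations_partial_contact_fractions(
        array_folded_states,              # frame-by-frame conformational state classification (assumed precomputed)
        Q,                                # contact fractions from fraction_native_contacts() (assumed precomputed)
        frame_begin=100,
        sample_spacing=20,
        output_data="output/output.nc",
        num_intermediate_states=1,
    )

    # The returned dict is now keyed by conformational state ('0', '1', ...), not by transition:
    for state, Q_curve in Q_partial.items():
        print(f"state {state}: Q ranges from {Q_curve.min():.3f} to {Q_curve.max():.3f}")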
@@ -2363,25 +2318,25 @@ def plot_partial_contact_fractions(temperature_list, Q_values, Q_uncertainty, pl def tanh_switch(x,x0,y0,y1,d): return (y0+y1)/2-((y0-y1)/2)*np.tanh(np.radians(x-x0)/d) - for trans, value in Q_values.items(): + for key, value in Q_values.items(): xsig = np.linspace(temperature_array[0],temperature_array[-1],1000) ysig = tanh_switch( xsig, - sigmoid_dict['sigmoid_Tm_value'][trans].value_in_unit(unit.kelvin), - sigmoid_dict['sigmoid_Q_max_value'][trans], - sigmoid_dict['sigmoid_Q_min_value'][trans], - sigmoid_dict['sigmoid_d_value'][trans].value_in_unit(unit.kelvin), + sigmoid_dict['sigmoid_Tm_value'][key].value_in_unit(unit.kelvin), + sigmoid_dict['sigmoid_Q_max_value'][key], + sigmoid_dict['sigmoid_Q_min_value'][key], + sigmoid_dict['sigmoid_d_value'][key].value_in_unit(unit.kelvin), ) errorbar1 = plt.errorbar( temperature_array, - Q_values[trans], + Q_values[key], yerr=Q_uncertainty, linewidth=0.5, markersize=4, fmt='o', - label=f'trans {trans} mean', + label=f'state {key}', fillstyle='none', capsize=4, ) @@ -2393,15 +2348,15 @@ def tanh_switch(x,x0,y0,y1,d): line2 = plt.plot( xsig, ysig,'-', - label=f'trans {trans} fit', + label=f'state {key} fit', color=color1, # keep the same color ) line3 = plt.errorbar( - sigmoid_dict['sigmoid_Tm_value'][trans].value_in_unit(unit.kelvin), - sigmoid_dict['Q_folded_value'][trans], - xerr=sigmoid_dict['sigmoid_Tm_uncertainty'][trans][1].value_in_unit(unit.kelvin), - yerr=sigmoid_dict['Q_folded_uncertainty'][trans][1], + sigmoid_dict['sigmoid_Tm_value'][key].value_in_unit(unit.kelvin), + sigmoid_dict['Q_folded_value'][key], + xerr=sigmoid_dict['sigmoid_Tm_uncertainty'][key][1].value_in_unit(unit.kelvin), + yerr=sigmoid_dict['Q_folded_uncertainty'][key][1], linewidth=0.5, markersize=4, fmt='D-r', @@ -2427,14 +2382,14 @@ def tanh_switch(x,x0,y0,y1,d): # ) else: - for trans, value in Q_values.items(): + for key, value in Q_values.items(): plt.errorbar( temperature_array, - Q_values[trans], - Q_uncertainty[trans], + Q_values[key], + Q_uncertainty[key], linewidth=0.5, markersize=4, - label=f'trans {trans}', + label=f'state {key}', fmt='o-', fillstyle='none', capsize=4, diff --git a/cg_openmm/tests/test_native_contacts.py b/cg_openmm/tests/test_native_contacts.py index 6d55c7c1..40b7522b 100644 --- a/cg_openmm/tests/test_native_contacts.py +++ b/cg_openmm/tests/test_native_contacts.py @@ -133,10 +133,11 @@ def test_native_contacts_dcd(tmpdir): assert os.path.isfile(f"{output_directory}/Q_vs_T.pdf") -def test_native_contacts_dcd_homopolymer_sym(tmpdir): +def test_native_contacts_dcd_homopolymer_sym_linear(tmpdir): """ Test native contact fraction calculation with homopolymer end-to-end symmetry checks. + (linear homopolymer with no sidechains) """ output_directory = tmpdir.mkdir("output") @@ -196,6 +197,68 @@ def test_native_contacts_dcd_homopolymer_sym(tmpdir): assert Q_sym.all() >= Q.all() +def test_native_contacts_dcd_homopolymer_sym_sidechain(tmpdir): + """ + Test native contact fraction calculation with homopolymer + end-to-end symmetry checks. 
+ (1-1 homopolymer model with sidechains) + """ + + output_directory = tmpdir.mkdir("output") + + # Replica exchange settings + number_replicas = 12 + + # Load in cgmodel + cgmodel = pickle.load(open(f"{data_path}/stored_cgmodel.pkl", "rb" )) + + # Create list of dcd trajectories to analyze + dcd_file_list = [] + for i in range(number_replicas): + dcd_file_list.append(f"{data_path}/replica_{i+1}.dcd") + + # Set path to native structure file: + native_structure_file=f"{structures_path}/medoid_0.dcd" + + # Set cutoff parameters: + # Cutoff for native structure pairwise distances: + native_contact_cutoff = 4.0* unit.angstrom + + # Tolerance for current trajectory distances: + native_contact_tol = 1.5 + + # Determine native contacts: + native_contact_list, native_contact_distances, contact_type_dict = get_native_contacts( + cgmodel, + native_structure_file, + native_contact_cutoff, + ) + + # Determine native contact fraction of current trajectories: + # With end-to-end symmetry check: + Q_sym, Q_avg_sym, Q_stderr_sym, decorrelation_spacing_sym = fraction_native_contacts( + cgmodel, + dcd_file_list, + native_contact_list, + native_contact_distances, + native_contact_tol=native_contact_tol, + homopolymer_sym=True, + ) + + # Without end-to-end symmetry check: + Q, Q_avg, Q_stderr, decorrelation_spacing = fraction_native_contacts( + cgmodel, + dcd_file_list, + native_contact_list, + native_contact_distances, + native_contact_tol=native_contact_tol, + homopolymer_sym=False, + ) + + # End-to-end symmetry check should increase the contact fraction in all cases: + assert Q_sym.all() >= Q.all() + + def test_helix_contacts_dcd(tmpdir): """See if we can determine native contacts for helix backbone sequences""" @@ -472,10 +535,10 @@ def test_expectations_fraction_contacts_dcd(tmpdir): assert os.path.isfile(f"{output_directory}/Q_vs_T_fit.pdf") -def test_expectations_partial_contact_fractions_2state(tmpdir): +def test_expectations_partial_contact_fractions_1state(tmpdir): """ Test the partial contact fraction expectation code for the trivial case - of a 2 state system (should match the original expectation native contact fraction result) + of a 1 state system (should match the original expectation native contact fraction result) """ output_directory = tmpdir.mkdir("output") @@ -524,9 +587,8 @@ def test_expectations_partial_contact_fractions_2state(tmpdir): native_contact_tol=native_contact_tol, ) - # Set 1 as folded, 0 as unfolded: - Q_folded = 0.3 - array_folded_states = np.multiply((Q>=Q_folded),1) + # Set all states to be the same + array_folded_states = np.zeros_like(Q) output_data = os.path.join(data_path, "output.nc") num_intermediate_states=1 @@ -537,7 +599,6 @@ def test_expectations_partial_contact_fractions_2state(tmpdir): frame_begin=100, output_data=output_data, num_intermediate_states=num_intermediate_states, - transition_list='0_1', ) Q_results = expectations_fraction_contacts( @@ -547,14 +608,12 @@ def test_expectations_partial_contact_fractions_2state(tmpdir): num_intermediate_states=num_intermediate_states, ) - assert Q_results['Q'].all() == Q_values_partial['0_1'].all() + assert Q_results['Q'].all() == Q_values_partial['0'].all() -def test_expectations_partial_contact_fractions_2state_no_list(tmpdir): +def test_expectations_partial_contact_fractions_2state(tmpdir): """ - Test the partial contact fraction expectation code for the trivial case - of a 2 state system (should match the original expectation native contact fraction result) - (no transition list explicitly specified) + Test the 
partial contact fraction expectation code for a 2 state system. """ output_directory = tmpdir.mkdir("output") @@ -605,7 +664,7 @@ def test_expectations_partial_contact_fractions_2state_no_list(tmpdir): # Set 1 as folded, 0 as unfolded: Q_folded = 0.3 - array_folded_states = np.multiply((Q>=Q_folded),1) + array_folded_states = np.multiply((Q>=Q_folded),1) output_data = os.path.join(data_path, "output.nc") num_intermediate_states=1 @@ -616,23 +675,16 @@ def test_expectations_partial_contact_fractions_2state_no_list(tmpdir): frame_begin=100, output_data=output_data, num_intermediate_states=num_intermediate_states, - transition_list=None, - ) - - Q_results = expectations_fraction_contacts( - Q, - frame_begin=100, - output_data=output_data, - num_intermediate_states=num_intermediate_states, ) - assert Q_results['Q'].all() == Q_values_partial['0_1'].all() + assert len(Q_values_partial) == 2 and type(Q_values_partial) == dict -def test_expectations_fraction_contacts_dcd_homopolymer_sym(tmpdir): +def test_expectations_fraction_contacts_dcd_homopolymer_sym_linear(tmpdir): """ See if we can determine native contacts expectations as a function of T, with homopolymer end-to-end symmetry checks. + (linear homopolymer with no sidechains) """ output_directory = tmpdir.mkdir("output") @@ -800,16 +852,16 @@ def test_bootstrap_partial_contacts_expectation_dcd_2state(tmpdir): n_trial_boot=10, conf_percent='sigma', plotfile=f'{output_directory}/partial_contacts_boot.pdf', - transition_list='0_1', + homopolymer_sym=False, ) - assert os.path.isfile(f'{output_directory}/partial_contacts_boot.pdf') + assert os.path.isfile(f'{output_directory}/partial_contacts_boot.pdf') - -def test_bootstrap_partial_contacts_expectation_dcd_2state_no_list(tmpdir): + +def test_bootstrap_partial_contacts_expectation_dcd_2state_homopolymer_sym(tmpdir): """ Test bootstrapping of partial native contacts expectation, based on helix contacts - (no transition list explicitly specified) + (with homopolymer symmetry check) """ output_directory = tmpdir.mkdir("output") @@ -874,16 +926,17 @@ def test_bootstrap_partial_contacts_expectation_dcd_2state_no_list(tmpdir): n_trial_boot=10, conf_percent='sigma', plotfile=f'{output_directory}/partial_contacts_boot.pdf', - transition_list=None, + homopolymer_sym=True, ) assert os.path.isfile(f'{output_directory}/partial_contacts_boot.pdf') -def test_bootstrap_native_contacts_expectation_dcd_homopolymer_sym(tmpdir): +def test_bootstrap_native_contacts_expectation_dcd_homopolymer_sym_linear(tmpdir): """ Test bootstrapping of native contacts expectation, based on helix contacts, with homopolymer end-to-end symmetry checks. + (linear homopolymer with no sidechains) """ output_directory = tmpdir.mkdir("output") @@ -934,6 +987,59 @@ def test_bootstrap_native_contacts_expectation_dcd_homopolymer_sym(tmpdir): assert os.path.isfile(f'{output_directory}/native_contacts_boot_sym.pdf') +def test_bootstrap_native_contacts_expectation_dcd_homopolymer_sym_sidechain(tmpdir): + """ + Test bootstrapping of native contacts expectation, based on helix contacts, + with homopolymer end-to-end symmetry checks. 
+ (1-1 homopolymer model with sidechains) + """ + + output_directory = tmpdir.mkdir("output") + output_data = os.path.join(data_path, "output.nc") + + # Replica exchange settings + number_replicas = 12 + + # Load in cgmodel + cgmodel = pickle.load(open(f"{data_path}/stored_cgmodel.pkl", "rb" )) + + # Create list of dcd trajectories to analyze + dcd_file_list = [] + for i in range(number_replicas): + dcd_file_list.append(f"{data_path}/replica_{i+1}.dcd") + + # Set path to native structure file: + native_structure_file=f"{structures_path}/medoid_0.dcd" + + # Tolerance for current trajectory distances: + native_contact_tol = 1.5 + + # Determine native contacts: + native_contact_list, native_contact_distances, opt_seq_spacing = get_helix_contacts( + cgmodel, + native_structure_file, + backbone_type_name='bb', + ) + + full_T_list, Q_values, Q_uncertainty, sigmoid_results_boot = bootstrap_native_contacts_expectation( + cgmodel, + dcd_file_list, + native_contact_list, + native_contact_distances, + output_data=output_data, + frame_begin=100, + sample_spacing=20, + native_contact_tol=1.2, + num_intermediate_states=1, + n_trial_boot=10, + conf_percent='sigma', + plotfile=f'{output_directory}/native_contacts_boot_sym.pdf', + homopolymer_sym=True, + ) + + assert os.path.isfile(f'{output_directory}/native_contacts_boot_sym.pdf') + + def test_optimize_Q_helix_tol_dcd(tmpdir): """Test the helix native contact tolerance optimization workflow""" From 837a2cb0e5c19c9cf9f2c79da2211c724ff671ad Mon Sep 17 00:00:00 2001 From: Chris Walker Date: Tue, 8 Feb 2022 13:13:35 -0500 Subject: [PATCH 4/5] fix bug with native contact symmetry check --- cg_openmm/parameters/secondary_structure.py | 42 ++++++++++++--------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/cg_openmm/parameters/secondary_structure.py b/cg_openmm/parameters/secondary_structure.py index 92da17f8..9a7082d0 100644 --- a/cg_openmm/parameters/secondary_structure.py +++ b/cg_openmm/parameters/secondary_structure.py @@ -621,6 +621,8 @@ def fraction_native_contacts( # ***Note: this assumes that the particles are indexed by monomer, and in the same # order for each monomer. + # We check if the model is a linear homopolymer, since there may be multiple backbone + # beads per monomer. n_particles = rep_traj.n_atoms reverse_contact_list = [] @@ -637,32 +639,38 @@ def fraction_native_contacts( # We need to use the particle type list of the original model, # and reconstruct monomer by monomer from the opposite end. - # This should also work for 1 particle per monomer. - particle_type_list = [] - mono_indices_list = [] particle_indices_forward = [] particle_indices_reverse = [] - for p in range(n_particles): - particle_indices_forward.append(p) - particle_type_list.append(cgmodel.get_particle_type_name(p)) - mono_indices_list.append(cgmodel.get_particle_monomer(p)) + # Check if linear chain with multiple beads per monomer: + mono = cgmodel.monomer_types + if len(mono) > 1: + print(f'Error: cannot apply end-to-end symmetry with multiple monomer types') + exit() - n_particles_per_mono = 0 + mono = mono[0] + mono_bond_start = mono['start'] + mono_bond_end = mono['end'] + mono_bond_list = mono['bond_list'] + mono_particle_sequence = mono['particle_sequence'] + + #***TODO: add rigorous check for linear topology here. 
- for p in range(n_particles): - if cgmodel.get_particle_monomer(p) == 0: - n_particles_per_mono += 1 - else: - break + n_particle_unique = len(set(particle_type_list)) + n_particles_per_mono = len(mono_particle_sequence) + n_mono = int(n_particles/n_particles_per_mono) - for m in range(n_mono): - for p in range(n_particles_per_mono): - particle_indices_reverse.append(n_particles - n_particles_per_mono*(m+1) + p) - # For example, if forward indices [0,1] is [bb,sc] with 84 total particles, then [82,83] is the reverse + if n_particle_unique == 1: + particle_indices_reverse = particle_indices_forward[::-1] + + else: + for m in range(n_mono): + for p in range(n_particles_per_mono): + particle_indices_reverse.append(n_particles - n_particles_per_mono*(m+1) + p) + # For example, if forward indices [0,1] is [bb,sc] with 84 total particles, then [82,83] is the reverse # Now select the contact pairs: for pair in native_contact_list: From 411a9d97bd0230b1dc6a72f0dcac74aae696cf96 Mon Sep 17 00:00:00 2001 From: Chris Walker Date: Mon, 14 Feb 2022 18:19:57 -0500 Subject: [PATCH 5/5] add energy decomposition function --- cg_openmm/parameters/evaluate_energy.py | 63 ++++++++++++++++++++++++- cg_openmm/tests/test_energy_eval.py | 35 +++++++++++++- 2 files changed, 96 insertions(+), 2 deletions(-) diff --git a/cg_openmm/parameters/evaluate_energy.py b/cg_openmm/parameters/evaluate_energy.py index 87e8451b..2ed0f289 100644 --- a/cg_openmm/parameters/evaluate_energy.py +++ b/cg_openmm/parameters/evaluate_energy.py @@ -1155,4 +1155,65 @@ def get_replica_reeval_energies(replica, temperature_list, file_list, topology, # This reducing step gets rid of the simtk units U_eval_rep[j,k] = (potential_energy*beta_all[j]) - return U_eval_rep, replica \ No newline at end of file + return U_eval_rep, replica + + +def energy_decomposition(cgmodel, positions_file): + """ + Given a cgmodel with a topology and system and coordinates in positions_file, + perform an OpenMM energy decomposition. + + :param cgmodel: CGModel() class object to evaluate energy with + :type cgmodel: class + + :param positions_file: Path to pdb or dcd file containing molecular coordinates + :type positions_file: str + + :returns: + - U_decomposition - dict mapping force names to contribution to total energy. 
+ """ + + topology = cgmodel.topology + system = cgmodel.system + + # Load in the coordinates as mdtraj object: + if positions_file[-3:] == 'dcd': + traj = md.load(positions_file,top=md.Topology.from_openmm(topology)) + else: + traj = md.load(positions_file) + + positions = traj[0].xyz[0]*unit.nanometer + + # Set up simulation object: + simulation_time_step = 5.0 * unit.femtosecond + friction = 0.0 / unit.picosecond + integrator = LangevinIntegrator( + 0.0 * unit.kelvin, friction, simulation_time_step.in_units_of(unit.picosecond) + ) + simulation = Simulation(topology, cgmodel.system, integrator) + simulation.context.setPositions(positions) + + # Set force groups: + force_names = {} + U_decomposition = {} + + for force_index, force in enumerate(simulation.system.getForces()): + # These are the overall classes of forces, not the particle-specific forces + force_names[force_index] = force.__class__.__name__ + force.setForceGroup(force_index) + + # Need to create a new simulation object with the updated force groups: + integrator_new = LangevinIntegrator( + 0.0 * unit.kelvin, friction, simulation_time_step.in_units_of(unit.picosecond) + ) + simulation_new = Simulation(topology, simulation.system, integrator_new) + simulation_new.context.setPositions(positions) + + total_energy = simulation_new.context.getState(getEnergy=True).getPotentialEnergy() + U_decomposition['total'] = total_energy + + for force_index, force in enumerate(simulation_new.system.getForces()): + force_names[force_index] = force.__class__.__name__ + U_decomposition[force_names[force_index]] = simulation_new.context.getState(getEnergy=True,groups={force_index}).getPotentialEnergy() + + return U_decomposition \ No newline at end of file diff --git a/cg_openmm/tests/test_energy_eval.py b/cg_openmm/tests/test_energy_eval.py index 9f546889..54ae06a0 100644 --- a/cg_openmm/tests/test_energy_eval.py +++ b/cg_openmm/tests/test_energy_eval.py @@ -18,7 +18,40 @@ current_path = os.path.dirname(os.path.abspath(__file__)) data_path = os.path.join(current_path, 'test_data') - +structures_path = os.path.join(current_path, 'test_structures') + +def test_energy_decomposition_dcd(tmpdir): + """ + Test the energy decomposition on a cgmodel and medoid, and check that the individual components + sum to the total potential energy. + """ + + output_directory = tmpdir.mkdir("output") + + # Load in cgmodel + cgmodel = pickle.load(open(f"{data_path}/stored_cgmodel.pkl", "rb" )) + + # Set path to medoid structure file: + medoid_file = f"{structures_path}/medoid_min.dcd" + + # Run the energy decomposition: + U_decomposition = energy_decomposition( + cgmodel, + medoid_file, + ) + + # Check the sum of energy components: + sum_expected = U_decomposition['total'] + sum_actual = 0 * unit.kilojoule_per_mole + + print(U_decomposition) + + for key,value in U_decomposition.items(): + if key != 'total': + sum_actual += value + + assert sum_expected == sum_actual + def test_eval_energy_no_change(tmpdir): """