Skip to content

Commit

Permalink
Use model instead of file name for ca-ca distances
Browse files Browse the repository at this point in the history
  • Loading branch information
terwill committed Sep 7, 2021
1 parent fdbe73d commit f476ac7
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 34 deletions.
62 changes: 35 additions & 27 deletions mmtbx/domains_from_pae.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def parse_pae_file(pae_json_file):

def domains_from_pae_matrix_networkx(pae_matrix, pae_power=1,
pae_cutoff=5, graph_resolution=1, weight_by_ca_ca_distance=False,
distance_power=1, distance_model_file=None):
distance_power=1, distance_model=None):
'''
Takes a predicted aligned error (PAE) matrix representing the predicted error in distances between each
pair of residues in a model, and uses a graph-based community clustering algorithm to partition the model
Expand All @@ -70,11 +70,11 @@ def domains_from_pae_matrix_networkx(pae_matrix, pae_power=1,
* pae_cutoff (optional, default=5): graph edges will only be created for residue pairs with pae<pae_cutoff
* graph_resolution (optional, default=1): regulates how aggressively the clustering algorithm is. Smaller values
lead to larger clusters. Value should be larger than zero, and values larger than 5 are unlikely to be useful.
* weight_by_ca_ca_distance (optional, default=False): adjust the edge weighting for each residue pair according
to the distance between CA residues. If this is True, then `distance_model_file` must be provided.
* weight_by_ca_ca_distance (optional, default=False): adjust the edge weighting for each residue pair according
to the distance between CA residues. If this is True, then `distance_model` must be provided.
* distance_power (optional, default=1): If `weight_by_ca_ca_distance` is True, then edge weights will be multiplied
by 1/distance**distance_power.
* distance_model_file (optional, default=None): A PDB or mmCIF file containing the model corresponding to the PAE
* distance_model (optional, default=None): Model corresponding to the PAE
matrix. Only needed if `weight_by_ca_ca_distances is True.
Returns: a series of lists, where each list contains the indices of residues belonging to one cluster.
'''
Expand All @@ -87,9 +87,9 @@ def domains_from_pae_matrix_networkx(pae_matrix, pae_power=1,
weights = 1/pae_matrix**pae_power

if weight_by_ca_ca_distance:
if distance_model_file is None:
raise Sorry('If weight_by_ca_ca_distance is True, distance_model_file must be provided!')
weights *= weights_from_distance_matrix(distance_model_file, distance_power)
if distance_model is None:
raise Sorry('If weight_by_ca_ca_distance is True, distance_model must be provided!')
weights *= weights_from_distance_matrix(distance_model, distance_power)

g = nx.Graph()
size = weights.shape[0]
Expand All @@ -110,7 +110,7 @@ def domains_from_pae_matrix_networkx(pae_matrix, pae_power=1,

def domains_from_pae_matrix_igraph(pae_matrix, pae_power=1, pae_cutoff=5,
graph_resolution=1, weight_by_ca_ca_distance=False,
distance_power=1, distance_model_file=None):
distance_power=1, distance_model=None):
'''
Takes a predicted aligned error (PAE) matrix representing the predicted error in distances between each
pair of residues in a model, and uses a graph-based community clustering algorithm to partition the model
Expand All @@ -124,11 +124,11 @@ def domains_from_pae_matrix_igraph(pae_matrix, pae_power=1, pae_cutoff=5,
* pae_cutoff (optional, default=5): graph edges will only be created for residue pairs with pae<pae_cutoff
* graph_resolution (optional, default=1): regulates how aggressively the clustering algorithm is. Smaller values
lead to larger clusters. Value should be larger than zero, and values larger than 5 are unlikely to be useful.
* weight_by_ca_ca_distance (optional, default=False): adjust the edge weighting for each residue pair according
to the distance between CA residues. If this is True, then `distance_model_file` must be provided.
* weight_by_ca_ca_distance (optional, default=False): adjust the edge weighting for each residue pair according
to the distance between CA residues. If this is True, then `distance_model` must be provided.
* distance_power (optional, default=1): If `weight_by_ca_ca_distance` is True, then edge weights will be multiplied
by 1/distance**distance_power.
* distance_model_file (optional, default=None): A PDB or mmCIF file containing the model corresponding to the PAE
* distance_model (optional, default=None): Model corresponding to the PAE
matrix. Only needed if `weight_by_ca_ca_distances is True.
Returns: a series of lists, where each list contains the indices of residues belonging to one cluster.
Expand All @@ -142,9 +142,9 @@ def domains_from_pae_matrix_igraph(pae_matrix, pae_power=1, pae_cutoff=5,
weights = 1/pae_matrix**pae_power

if weight_by_ca_ca_distance:
if distance_model_file is None:
raise Sorry('If weight_by_ca_ca_distance is True, distance_model_file must be provided!')
weights *= weights_from_distance_matrix(distance_model_file, distance_power)
if distance_model is None:
raise Sorry('If weight_by_ca_ca_distance is True, distance_model must be provided!')
weights *= weights_from_distance_matrix(distance_model, distance_power)

g = igraph.Graph()
size = weights.shape[0]
Expand Down Expand Up @@ -188,16 +188,9 @@ def flex_matrix_as_numpy_matrix(distances):
matrix.ravel()[:] = list(distances.as_1d())
return matrix

def weights_from_distance_matrix(filename, distance_power):
from iotbx.data_manager import DataManager
dm = DataManager()
dm.set_overwrite(True)
dm.process_model_file(filename)
model = dm.get_model(filename)
model.add_crystal_symmetry_if_necessary()

def weights_from_distance_matrix(distance_model , distance_power):
flex_matrix = distance_matrix(
model.apply_selection_string("name CA").get_sites_cart()
distance_model.apply_selection_string("name CA").get_sites_cart()
)
numpy_matrix = flex_matrix_as_numpy_matrix(flex_matrix)
return 1/numpy_matrix**distance_power
Expand All @@ -224,10 +217,25 @@ def cluster_as_selection(c, first_resno = None):
selection_string,r_start, r_end)
return selection_string

def read_model(filename):
from iotbx.data_manager import DataManager
dm = DataManager()
dm.set_overwrite(True)
dm.process_model_file(filename)
model = dm.get_model(filename)
model.add_crystal_symmetry_if_necessary()
return model


def get_domain_selections_from_pae_matrix(pae_matrix = None,
pae_file = None, library = None,
pae_power = None, pae_cutoff = None, graph_resolution = None,
weight_by_ca_ca_distance = None, distance_power = None, distance_model_file = None,
pae_file = None,
library = None,
pae_power = None,
pae_cutoff = None,
graph_resolution = None,
weight_by_ca_ca_distance = None,
distance_power = None,
distance_model = None,
first_resno = None):

if library is None:
Expand All @@ -249,7 +257,7 @@ def get_domain_selections_from_pae_matrix(pae_matrix = None,
"graph_resolution": graph_resolution,
"weight_by_ca_ca_distance": weight_by_ca_ca_distance,
"distance_power": distance_power,
"distance_model_file": distance_model_file
"distance_model": distance_model
}
# Use default values when argument is None
kwargs = {key: value for key,value in kwargs.items() if value is not None}
Expand Down
36 changes: 32 additions & 4 deletions mmtbx/process_predicted_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
.help = Minimum length of a domain to keep (reject at end if smaller).
.short_caption = Minimum domain length (residues)
minimum_sequential_residues = 5
minimum_sequential_residues = 5
.type = int
.help = Minimum length of a short segment to keep (reject at end ).
.short_caption = Minimum sequential_residues
Expand Down Expand Up @@ -137,6 +137,20 @@
should be larger than zero, and values larger than 5 are \
unlikely to be useful
.short_caption = PAE graph resolution (if PAE matrix supplied)
weight_by_ca_ca_distance = False
.type = bool
.help = Adjust the edge weighting for each residue pair according \
to the distance between CA residues. If this is True, \
then distance_model_file must be provided. See also distance_power
.short_caption = Weight by CA-CA distance (if distance_model supplied)
distance_power = 1
.type = float
.help = If weight_by_ca_ca_distance is True, then edge weights will \
be multiplied by 1/distance**distance_power.
.short_caption = Distance power (for weighting by CA-CA distance)
}
"""
Expand Down Expand Up @@ -395,9 +409,9 @@ def get_selection_for_short_segments(ph, minimum_sequential_residues):
chain_id, r.start, r.end))
selection_string = " or ".join(selections)
return selection_string






def split_model_by_chainid(m, chainid_list):
"""
Expand Down Expand Up @@ -596,6 +610,9 @@ def split_model_with_pae(
pae_cutoff = 5.,
pae_graph_resolution = 1.,
minimum_domain_length = 10,
weight_by_ca_ca_distance = False,
distance_power = 1,
distance_model = None,
log = sys.stdout):

"""
Expand All @@ -620,6 +637,14 @@ def split_model_with_pae(
the clustering algorithm is. Smaller values lead to larger clusters.
Value should be larger than zero, and values larger than 5 are
unlikely to be useful
weight_by_ca_ca_distance: (optional, default=False): adjust the edge
weighting for each residue pair according to the distance between
CA residues. If this is True, then distance_model_file must be provided.
distance_power (optional, default=1): If weight_by_ca_ca_distance` is True,
then edge weights will be multiplied by 1/distance**distance_power.
distance_model ((optional, default=None): A PDB or mmCIF file containing
the model corresponding to the PAE matrix. Only needed if
weight_by_ca_ca_distances is True.
minimum_domain_length: if a region is smaller than this, skip completely
Output:
Expand Down Expand Up @@ -655,6 +680,9 @@ def split_model_with_pae(
pae_cutoff = pae_cutoff,
graph_resolution = pae_graph_resolution,
first_resno = first_resno,
weight_by_ca_ca_distance = weight_by_ca_ca_distance,
distance_power = distance_power,
distance_model = distance_model,
)

# And apply to full model
Expand All @@ -672,7 +700,7 @@ def split_model_with_pae(
good_selections.append(selection_string)
else:
keep_list.append(False)
print("Skipping region '%s' with size of only %s residues" %(
print("Skipping region with selection '%s' that contains %s residues" %(
selection_string,sel.count(True)),
file = log)

Expand Down
11 changes: 9 additions & 2 deletions mmtbx/regression/tst_domains_from_pae.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@
pae_file=os.path.join(data_dir,'pae.json')
model_file=os.path.join(data_dir, 'pdbs','pae_model.pdb')

from iotbx.data_manager import DataManager
dm = DataManager()
distance_model = dm.get_model(model_file)
distance_model.add_crystal_symmetry_if_necessary()

def tst_01(log = sys.stdout):

args = group_args(
Expand Down Expand Up @@ -50,16 +55,18 @@ def tst_02(log = sys.stdout):
resolution = 1.0,
weight_by_ca_ca_distance = 1.0,
distance_power = 1.0,
distance_model_file = model_file,
distance_model = distance_model,
select_range = False)



selections = get_domain_selections_from_pae_matrix(pae_file = args.pae_file,
library=args.library,
pae_power = args.pae_power, pae_cutoff = args.pae_cutoff,
graph_resolution = args.resolution,
weight_by_ca_ca_distance = args.weight_by_ca_ca_distance,
distance_power = args.distance_power,
distance_model_file = args.distance_model_file)
distance_model = args.distance_model)
print("Selections:")
for s in selections:
print(s)
Expand Down
2 changes: 1 addition & 1 deletion mmtbx/regression/tst_process_predicted_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@


model_file=os.path.join(data_dir,'fibronectin_af_ca_1358_1537.pdb')
pae_model_file=os.path.join(data_dir,'pae.pdb')
pae_model_file=os.path.join(data_dir,'pae_model.pdb')
pae_file=os.path.join(pae_data_dir,'pae.json')

def tst_01(log = sys.stdout):
Expand Down

0 comments on commit f476ac7

Please sign in to comment.