-
Notifications
You must be signed in to change notification settings - Fork 1
/
clustering_dih_v7.py
105 lines (58 loc) · 2.45 KB
/
clustering_dih_v7.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 29 16:07:14 2022
@author: chingchinglam
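Link the pre-clustering processing scripts (dihedral isolation, calculation,
filtering and gap correction) and run agglomerative clustering on the
resulting dihedral descriptors.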
"""
import pandas as pd
import traceback
from sklearn.cluster import AgglomerativeClustering
from collections import defaultdict
########### linking the pre-clustering process scripts ###########
from isolate_key_dihedral_v5 import isolate_dihedral
from cal_dihedral_v2 import dihedral_df
from dihedral_parameter_v2 import remove_fixed_dihedrals, remove_fixed_bond_df
from correcting_dihedral_v1 import correction_by_gap
##########
def list_duplicates(seq):
    """Group the positions of ``seq`` by the value stored at each position.

    Despite the name, values that occur only once are kept as well: every
    distinct value of ``seq`` yields exactly one entry in the result.

    Parameters
    ----------
    seq : iterable of hashable
        Items to group (e.g. a sequence of cluster labels).

    Returns
    -------
    list
        One ``[value, positions]`` pair per distinct value, ordered by the
        first appearance of the value; ``positions`` lists, in ascending
        order, the indices at which the value occurs.
    """
    tally = defaultdict(list)
    for index, item in enumerate(seq):
        tally[item].append(index)
    ## a key only exists once an index has been appended to it, so no entry
    ## can be empty and no filtering on the list length is needed
    return [[key, locs] for key, locs in tally.items()]
def get_cluster_df(path):
    """Cluster the dihedral descriptors of one file for every cluster count.

    The file at ``path`` is passed through the pre-clustering pipeline
    (isolate_dihedral -> dihedral_df -> remove_fixed_dihedrals ->
    remove_fixed_bond_df -> correction_by_gap) and the resulting table is
    clustered with AgglomerativeClustering once for each n_clusters from 1
    up to the number of descriptor rows.

    Parameters
    ----------
    path : str
        Location of the input file; the text after the last '/' is used as
        the value of the 'name' column.
        NOTE(review): presumably an SDF file of conformers - confirm
        against isolate_dihedral().

    Returns
    -------
    pandas.DataFrame
        One row per cluster count with the columns
        'name'     - last '/'-separated component of ``path``,
        'n'        - number of clusters requested,
        'nor_n'    - 'n' divided by the largest 'n',
        'clusters' - list_duplicates() of the fitted labels, i.e.
                     [label, row indices] pairs.
    """
    ## pipeline to extract the descriptors
    selection = isolate_dihedral(path)
    unfiltered_df = dihedral_df(path, selection[0], selection[1])
    fixed_dihedrals = remove_fixed_dihedrals(
        unfiltered_df, selection[0], selection[1]
    )[2]
    filtered_df = remove_fixed_bond_df(unfiltered_df, fixed_dihedrals)[0]
    corrected_df = correction_by_gap(filtered_df)[0]
    features = corrected_df.to_numpy()

    ## every cluster count from 1 up to the number of descriptor rows
    n_values = list(range(1, len(features) + 1))
    largest_n = n_values[-1]
    normalised_n = [n / largest_n for n in n_values]

    ## one clustering per requested cluster count
    partitions = [
        list_duplicates(
            AgglomerativeClustering(n_clusters=n).fit(features).labels_
        )
        for n in n_values
    ]

    table = pd.DataFrame(
        {'n': n_values, 'nor_n': normalised_n, 'clusters': partitions}
    )
    ## file name goes in front of the other columns
    table.insert(0, 'name', path.split('/')[-1])
    return table
def get_cluster_df_multi(mol_path_ls):
    """Run get_cluster_df() on several files and stack the results.

    Processing is best-effort: a file for which get_cluster_df() raises is
    reported on stdout as 'Error: <path>', its traceback is printed, and
    the remaining files are still processed.

    Parameters
    ----------
    mol_path_ls : iterable
        Paths handed one by one to get_cluster_df().

    Returns
    -------
    pandas.DataFrame
        The per-file tables concatenated row-wise in input order (each
        table keeps its own index).  When no file could be processed, an
        empty frame with the columns 'name', 'n', 'nor_n' and 'clusters'
        is returned.
    """
    result_df_ls = []
    for mol_path in mol_path_ls:
        try:
            result_df_ls.append(get_cluster_df(mol_path))
        except Exception:
            ## f-string instead of '+': a non-str entry (e.g. pathlib.Path)
            ## must not raise TypeError inside the handler and abort the batch
            print(f'Error: {mol_path}')
            traceback.print_exc()

    ## pd.concat() raises ValueError on an empty list; return an empty
    ## table with the expected columns instead
    if not result_df_ls:
        return pd.DataFrame(columns=['name', 'n', 'nor_n', 'clusters'])
    return pd.concat(result_df_ls)