-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
90 lines (75 loc) · 3.14 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import json
import numpy as np
import pandas as pd
from typing import Union, Tuple, Dict
from pathlib import Path
from scipy.io.arff import loadarff
from sklearn import model_selection
from collections import defaultdict
def read_arff(fullpath: Union[str, Path]) -> Tuple[np.ndarray, np.ndarray]:
    """Load an ARFF file and return features X and integer labels y.

    The class column is located among the casings 'class'/'CLASS'/'Class'
    and label-encoded to 0..n-1 integer codes; every remaining column is
    treated as a feature.

    Args:
        fullpath: path (or file-like object accepted by scipy's loadarff).

    Returns:
        Tuple (X, y) of numpy arrays: X has one row per instance, y holds
        the factorized class codes.

    Raises:
        ValueError: if no class column is present.
    """
    data, meta = loadarff(fullpath)
    df = pd.DataFrame(data)
    # ARFF files in the wild disagree on the label column's casing.
    klass = next((c for c in ('class', 'CLASS', 'Class') if c in df.columns), None)
    if klass is None:
        # Raise instead of assert: asserts are stripped under `python -O`.
        raise ValueError(f'no class column found in {fullpath}')
    # factorize encodes the (possibly byte-string) labels as integer codes.
    df[klass] = pd.factorize(df[klass])[0]
    feature_cols = [c for c in df.columns if c != klass]
    X = df[feature_cols].values
    y = df[klass].values
    return X, y
def read_dataset(fullpath: Path) -> Tuple[pd.DataFrame, np.ndarray]:
    """Load a dataset from a .csv, .arff or .txt file.

    Args:
        fullpath: path to the dataset; a str is converted to Path.

    Returns:
        Tuple (X, y). NOTE: X is a DataFrame for .csv/.txt inputs but an
        ndarray for .arff (read_arff returns numpy arrays).

    Raises:
        ValueError: for any other file extension.
    """
    if isinstance(fullpath, str):
        fullpath = Path(fullpath)
    if fullpath.suffix == '.csv':
        df = pd.read_csv(fullpath, sep=',')
        # CSV datasets are expected to have '0'/'1' feature columns and a 'label' column.
        X, y = df[['0', '1']], df['label'].to_numpy()
    elif fullpath.suffix == '.arff':
        X, y = read_arff(fullpath)
    elif fullpath.suffix == '.txt':
        # sep=r'\s+' replaces delim_whitespace=True, which is deprecated
        # since pandas 2.2 and removed in pandas 3.0.
        df = pd.read_csv(fullpath, sep=r'\s+', header=None)
        X, y = df[[0, 1]], df[2].to_numpy()
    else:
        raise ValueError(f'invalid path {fullpath}')
    return X, y
def generate_dbscan_config_tree(filepath: Union[str, Path]) -> Dict:
    """Build a nested {dataset: {k: eps}} lookup from a JSON config file.

    The file is expected to hold a list of entries, each with a 'dataset'
    name and a 'k_values' list of {'k': ..., 'eps': ...} records.
    """
    with open(filepath, 'r') as fp:
        entries = json.load(fp)
    tree = {}
    for entry in entries:
        for cfg in entry['k_values']:
            # setdefault lazily creates the per-dataset mapping, so datasets
            # with an empty 'k_values' list never appear in the result.
            tree.setdefault(entry['dataset'], {})[cfg['k']] = cfg['eps']
    return tree
def __compute_over_axis(cm: np.ndarray, axis: int, zero_div: Union[None, float, int]=None) -> Tuple[np.ndarray, np.ndarray]:
    """Divide the confusion-matrix diagonal by its per-axis sums.

    axis=1 yields per-class recall, axis=0 per-class precision.

    Args:
        cm: square confusion matrix.
        axis: axis to sum over for the denominator.
        zero_div: value assigned to classes whose denominator is zero;
            when None, those entries keep diag / 1e-4 from the clipped sum.

    Returns:
        Tuple (values, mask) where mask flags classes with a zero denominator.
        (Original annotation claimed a bare ndarray; a tuple is returned.)
    """
    diag = np.diag(cm)
    sums = cm.sum(axis=axis)  # hoisted: was computed twice
    zero_div_mask = sums == 0
    # Clip the denominator to dodge a divide-by-zero warning; masked entries
    # are overwritten below when zero_div is provided.
    res = diag / sums.clip(min=1e-4)
    if zero_div is not None:
        res[zero_div_mask] = zero_div
    return res, zero_div_mask
def __compute_recall(cm: np.ndarray, zero_div: Union[None, float, int]=None) -> Tuple[np.ndarray, np.ndarray]:
    """Per-class recall plus the zero-denominator mask (see __compute_over_axis).

    Return annotation corrected: a (values, mask) tuple is returned, not a
    bare ndarray.
    """
    return __compute_over_axis(cm, axis=1, zero_div=zero_div)
def compute_recall(cm: np.ndarray, zero_div: Union[None, float, int]=None) -> np.ndarray:
    """Per-class recall of confusion matrix `cm`.

    Classes whose row sums to zero receive `zero_div` when it is given.
    """
    values, _ = __compute_recall(cm, zero_div)
    return values
def __compute_precision(cm: np.ndarray, zero_div: Union[None, float, int]=None) -> Tuple[np.ndarray, np.ndarray]:
    """Per-class precision plus the zero-denominator mask (see __compute_over_axis).

    Return annotation corrected: a (values, mask) tuple is returned, not a
    bare ndarray.
    """
    return __compute_over_axis(cm, axis=0, zero_div=zero_div)
def compute_precision(cm: np.ndarray, zero_div: Union[None, float, int]=None) -> np.ndarray:
    """Per-class precision of confusion matrix `cm`.

    Classes whose column sums to zero receive `zero_div` when it is given.
    """
    values, _ = __compute_precision(cm, zero_div)
    return values
def compute_f1(cm: np.ndarray, zero_div: Union[None, float, int]=None) -> np.ndarray:
    """Per-class F1 scores from a confusion matrix.

    F1 = 2*P*R / (P+R), with the denominator clipped to 1e-4 to avoid
    division by zero.

    Args:
        cm: square confusion matrix.
        zero_div: when given, classes whose precision OR recall had a zero
            denominator are assigned this value instead.

    Returns:
        ndarray of per-class F1 values.
    """
    precision, prec_mask = __compute_precision(cm, zero_div=zero_div)
    recall, rec_mask = __compute_recall(cm, zero_div=zero_div)
    res = 2 * precision * recall / (precision + recall).clip(min=1e-4)
    if zero_div is not None:
        # A class is degenerate if either component score was undefined.
        res[prec_mask | rec_mask] = zero_div
    return res
def parameter_generator(params):
    """Yield every parameter combination described by a grid specification.

    Args:
        params: dict (or list of dicts) mapping parameter names to lists of
            values, as accepted by sklearn.model_selection.ParameterGrid.

    Yields:
        dict: one concrete parameter assignment per grid point.
    """
    # `yield from` replaces the manual for/yield loop; the stale
    # commented-out debug print was removed.
    yield from model_selection.ParameterGrid(params)