-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsubsetCounting.py
177 lines (145 loc) · 6.36 KB
/
subsetCounting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import itertools
import json
import math
from operator import itemgetter
import numpy as np
def getMinPairing(cur_subset: list, obs_counts: np.ndarray) -> int:
"""A way to quantify how amicable a subset is for removal."""
subset_pairs = itertools.product(cur_subset, repeat=2)
vals = [obs_counts[p] for p in subset_pairs]
return min(vals)
def isPairingInSubset(pairing, subset):
return np.intersect1d(pairing, subset).size == 2
class SubsetSampling():
def __init__(self, pool_size, subset_size):
self.poolSize = pool_size
self.subsetSize = subset_size
self.subsetMethod = None
self.subsets = []
self.observationCounts = self.newObservationMatrix()
self.totPoolSubsets = math.comb(pool_size, subset_size)
# Keep track of parameters when searching for an optimal set
self.params = {
"poolSize": pool_size,
"subsetSize": subset_size,
"method": "",
"params": {},
"nSubsets": 0,
"min": np.Inf,
"max": -np.inf
}
def newObservationMatrix(self):
return np.zeros((self.poolSize, self.poolSize), dtype=int)
def info(self):
print(json.dumps(self.params, indent=4))
def show(self):
for r in self.observationCounts:
print(','.join([f"{val:2d}" for val in r]))
def observedPairingsMin(self):
# use np.tril to ignore main diagonal?
return int(self.observationCounts.min())
def observedPairingsMax(self):
# need int for nice info() json formatting
return int(np.tril(self.observationCounts, k=-1).max())
def updateObservationCounts(self, cur_subset, action="Add"):
"""Add or Remove a subset """
if action == "Add":
if len(cur_subset) == self.subsetSize:
self.subsets.append(cur_subset)
for p in itertools.product(cur_subset, repeat=2):
self.observationCounts[p] = self.observationCounts[p] + 1
else:
raise NameError('Incorrect subset size')
elif action == "Remove":
if cur_subset in self.subsets:
self.subsets.pop(self.subsets.index(cur_subset))
for p in itertools.product(cur_subset, repeat=2):
self.observationCounts[p] = self.observationCounts[p] - 1
else:
print(f"{cur_subset} not in self.subsets")
else:
print("Specify Add or Remove")
self.params["nSubsets"] = len(self.subsets)
self.params["min"] = self.observedPairingsMin()
self.params["max"] = self.observedPairingsMax()
def generateSteppedSubsets(self, offset=None, amount=None):
"""Create an iterator to step through all possible subset
combinations at reqular intervals. This works quite well
to evenly cover the total sample space."""
self.params["method"] = "generateSteppedSubsets"
self.params["params"] = {
"offset": offset,
"amount": amount
}
combs = itertools.combinations(range(self.poolSize), self.subsetSize)
if amount:
slice_step = self.totPoolSubsets/amount
#print(f"tot {self.totPoolSubsets}, step {slice_step}")
ids = [int(int(i % slice_step) == 0) for i in range(self.totPoolSubsets)]
subsets = list(itertools.compress(combs, ids))
else:
subsets = list(combs)
for s in subsets:
self.updateObservationCounts(s, "Add")
self.params["nSubsets"] = len(self.subsets)
def getRandomSample(self, probs=None):
# need to remove elements that are already saturated before sampling
samp = np.random.choice(
self.poolSize,
self.subsetSize,
replace=False,
p=probs)
return list(samp)
def calculateWeights(self, method="unif"):
""""""
if method == "unif":
probs = [1/self.poolSize for _ in range(self.poolSize)]
else:
colSum = list(np.sum(self.observationCounts, axis=0))
# add 1 to ensure there are not too many 0s and thus enough
# elements to sample from.
weights = max(colSum) - colSum + 0.1
probs = [i/sum(weights) for i in weights]
return probs
def generateRandomSubsets(self, min_obs, n_iters=1):
""" Finds the best list of subsets from N attempts."""
self.params["method"] = "generateRandomSubsets"
self.params["params"] = {
"min_obs": min_obs
}
cur_best = np.inf
for _ in range(n_iters):
self.subsets = []
self.observationCounts = self.newObservationMatrix()
# Keep adding subsets untill a minimum count is achieved across
# all pairwise observations
while np.sum(self.observationCounts < min_obs) > 0:
new_subset = self.getRandomSample(self.calculateWeights())
self.updateObservationCounts(new_subset, "Add")
self.trim(min_obs)
# keep the best list of subsets
if len(self.subsets) < cur_best:
cur_best = len(self.subsets)
best_subset_list = self.subsets
self.subsets = []
self.observationCounts = self.newObservationMatrix()
for s in best_subset_list:
self.updateObservationCounts(s, "Add")
self.params["nSubsets"] = len(self.subsets)
def trim(self, minVal: int) -> None:
"""Find pairings in the observation matrix that have the highest
counts and likely belong to subsets that can be removed without
harming the minimun count specified in [minVal]"""
while self.observationCounts.max() > minVal:
# Go through all subsets. Probably not most efficient...
# Another idea is to find the max value in the observationCounts
# matrix and try to trim those first.
minimum_pairing_value = [
(si, getMinPairing(s, self.observationCounts))
for si, s in enumerate(self.subsets)]
minimum_pairing_value.sort(key=itemgetter(1), reverse=True)
best_cand = minimum_pairing_value[0]
if best_cand[1] > minVal:
self.updateObservationCounts(self.subsets[best_cand[0]], "Remove")
else:
break