Skip to content

Commit

Permalink
rework of entropy, minmax, percentile estimator
Browse files Browse the repository at this point in the history
  • Loading branch information
mmaelicke committed Feb 1, 2018
1 parent 8fa89ea commit 316cd71
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 24 deletions.
50 changes: 40 additions & 10 deletions skgstat/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,11 @@ def genton(X):

def minmax(X):
"""
Returns the MinMax Semivariance of sample X.
Returns the MinMax Semivariance of sample X pairwise differences.
X has to be an even-length array of point pairs like: x1, x1+h, x2, x2+h, ..., xn, xn+h.
CAUTION: this is actually an changed behaviour to scikit-gstat<=0.1.5
:param X:
:return:
"""
Expand All @@ -146,12 +148,20 @@ def minmax(X):
if len(_X) % 2 > 0:
raise ValueError('The sample does not have an even length: {}'.format(_X))

return (np.nanmax(_X) - np.nanmin(_X)) / np.nanmean(_X)
# calculate the point pair values
# helper function
ppairs = lambda x: [np.abs(x[i] - x[i+1]) for i in np.arange(0, len(x), 2)]
p = ppairs(_X)

return (np.nanmax(p) - np.nanmin(p)) / np.nanmean(p)


def percentile(X, p=50):
"""
Returns the wanted percentile of sample X.
Returns the wanted percentile of sample X pairwise differences.
X has to be an even-length array of point pairs like: x1, x1+h, x2, x2+h, ..., xn, xn+h.
CAUTION: this is actually an changed behaviour to scikit-gstat<=0.1.5
:param X: np.ndarray with the given sample to calculate the Semivariance from
:param p: float with the percentile of sample X
Expand All @@ -166,7 +176,12 @@ def percentile(X, p=50):
if len(_X) % 2 > 0:
raise ValueError('The sample does not have an even length: {}'.format(_X))

return np.percentile(_X, q=p)
# calculate the point pair values
# helper function
ppairs = lambda x: [np.abs(x[i] - x[i+1]) for i in np.arange(0, len(x), 2)]
pairs = ppairs(_X)

return np.percentile(pairs, q=p)


def entropy(X, bins=None):
Expand All @@ -177,33 +192,48 @@ def entropy(X, bins=None):
This uses Freedman Diacons Estimator and is fairly resilient to outliers.
If the input data X is 2D (Entropy for more than one bin needed), it will derive the histogram once and
use the same edges in all bins.
CAUTION: this is actually an changed behaviour to scikit-gstat<=0.1.4
CAUTION: this is actually an changed behaviour to scikit-gstat<=0.1.5
# TODO: handle the 0s in output of X
:param X: np.ndarray with the given sample to calculate the Shannon entropy from
:param bins: The bin edges for entropy calculation, or an amount of even spaced bins
:return:
"""
_X = np.array(X)

# helper function
ppairs = lambda x: [np.abs(x[i] - x[i+1]) for i in np.arange(0, len(x), 2)]

if any([isinstance(_, (list, np.ndarray)) for _ in _X]):
# if bins is not set, use the histogram over the full value range
if bins is None:
# could not fiugre out a better way here. I need the values before calculating the entropy
# in order to use the full value range in all bins
vals = [[np.abs(_[i] - _[i + 1]) for i in np.arange(0, len(_), 2)] for _ in _X]
bins = np.histogram(vals, bins=15)[1][1:]
allp = [ppairs(_) for _ in _X]
minv = np.min(list(map(np.min, allp)))
maxv = np.max(list(map(np.max, allp)))
bins = np.linspace(minv, maxv, 50).tolist() + [maxv] # have no better idea to include the end edge as well
return np.array([entropy(_, bins=bins) for _ in _X])

# check even
if len(_X) % 2 > 0:
raise ValueError('The sample does not have an even length: {}'.format(_X))

# calculate the values
vals = [np.abs(_X[i] - _X[i + 1]) for i in np.arange(0, len(_X), 2)]
vals = ppairs(_X)

# claculate the bins
if bins is None:
bins = 15
pk = np.histogram(vals, bins)[0]

return scipy_entropy(pk=pk)
# get the amounts
amt = np.histogram(vals, bins=bins)[0]

# add a very small value to the p, otherwise the log2 will be -inf.
p = (amt / np.sum(amt)) + 1e-5
info = lambda p: -np.log2(p)

# map info to p and return the inner product
return np.fromiter(map(info, p), dtype=np.float).dot(p)

71 changes: 57 additions & 14 deletions skgstat/tests/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,22 @@
result_dowd = np.array([1.09900000e+00, 0.00000000e+00, 1.09900000e+02,
1.09900000e+04])

result_minmax = [2.0, 0.0, 2.0, 2.0]
result_minmax = np.array([1.9927489681047741, 1.8267020635804991, 1.5392934574169328, 2.0360906123190832])

result_percentile = [4.5, 5.0, 45.0, 450.0]
result_percentile_r = np.array([6.3486663708926443, 6.5981915461999279, 4.7812030455283168, 7.2582516292816663])
result_percentile = np.array([1.0, 0.0, 10.0, 100.0])

result_entropy = np.array([0.69314718, 0.63651417, 0.63651417, 1.60943791])
result_entropy_fd = np.array([0.67301167, 0.67301167, 0.67301167, 0.95027054])
result_entropy_5b = np.array([1.05492017, 1.05492017, 1.05492017, 0.95027054])
result_entropy_ar = np.array([1.05492017, 0.67301167, 1.05492017, 1.05492017])
# entropies
result_uni_entropy = np.array([0.0081243, 0.0081243, 0.0081243, 0.0081243])
result_uni_entropy_fd = np.array([-1.44270225e-05, -1.44270225e-05, -1.44270225e-05,
-1.44270225e-05])
result_uni_entropy_5b = np.array([0.00064996, 0.00064996, 0.00064996, 0.00064996])
result_uni_entropy_ar = np.array([0.00064996, 0.00064996, 0.00064996, 0.00064996])

result_entropy = np.array([1.9295937, 2.32944639, 2.32944639, 2.32944639])
result_entropy_fd = np.array([1.92211936, 1.37096112, 0.97094233, 1.37096112])
result_entropy_5b = np.array([1.92211936, 1.92211936, 1.92211936, 1.92211936])
result_entropy_ar = np.array([1.92211936, 1.52226666, 0.97144062, 1.52226666])

class TestEstimator(unittest.TestCase):
def setUp(self):
Expand All @@ -39,7 +46,7 @@ def setUp(self):
self.grouped = [list(np.arange(10)), [5] * 10, list(np.arange(0, 100, 10)),
list(np.arange(0, 1000, 100))]
np.random.seed(42)
self.entropy_grouped = [list(np.random.gamma(10,2, 10)), list(np.random.gamma(4,4, 10)),
self.random_grouped = [list(np.random.gamma(10,2, 10)), list(np.random.gamma(4,4, 10)),
list(np.random.gamma(4, 2, 10)), list(np.random.gamma(10,5, 10))]

def test_matheron(self):
Expand Down Expand Up @@ -73,36 +80,72 @@ def test_minmax(self):
"""
Testing minmax estimator
"""
assert_array_almost_equal(minmax(self.grouped), result_minmax)
assert_array_almost_equal(minmax(self.random_grouped), result_minmax)

def test_percentile(self):
def test_percentile_random(self):
"""
Testing percentile estimator
Testing percentile estimator on randomized data
"""
assert_array_almost_equal(percentile(self.random_grouped), result_percentile_r)

def test_percentile_grouped(self):
"""
Testing percentile estimator on grouped data
"""
assert_array_almost_equal(percentile(self.grouped), result_percentile)

def test_entropy_uniform_default(self):
"""
Testing the entropy estimator on uniform distributions, with and without gaps
"""
assert_array_almost_equal(entropy(self.grouped), result_uni_entropy)

def test_entropy_uniform_string(self):
"""
Testing entropy estimator with string as bin on uniform distributions
"""
assert_array_almost_equal(np.asarray(entropy(self.grouped, bins='fd')), result_uni_entropy_fd)

def test_entropy_uniform_integer(self):
"""
Testing entropy estimator with integer as bin on uniform distributions
"""
assert_array_almost_equal(np.asarray(entropy(self.grouped, bins=5)), result_uni_entropy_5b)

def test_entropy_uniform_list(self):
"""
Testing entropy estimator with list as bin on uniform distributions
"""
assert_array_almost_equal(
np.asarray(entropy(self.grouped, bins=[0, 0.1, 5, 10, 20, 100])),
result_uni_entropy_ar)

def test_entropy_default(self):
"""
Testing entropy estimator with default settings
"""
assert_array_almost_equal(np.asarray(entropy(self.entropy_grouped)), result_entropy)
assert_array_almost_equal(np.asarray(entropy(self.random_grouped)), result_entropy)

def test_entropy_string(self):
"""
Testing entropy estimator with string as bin
"""
assert_array_almost_equal(np.asarray(entropy(self.entropy_grouped, bins='fd')), result_entropy_fd)
assert_array_almost_equal(np.asarray(entropy(self.random_grouped, bins='fd')), result_entropy_fd)

def test_entropy_integer(self):
"""
Testing entropy estimator with integer as bin
"""
assert_array_almost_equal(np.asarray(entropy(self.entropy_grouped, bins=5)), result_entropy_5b)
assert_array_almost_equal(np.asarray(entropy(self.random_grouped, bins=5)), result_entropy_5b)

def test_entropy_list(self):
"""
Testing entropy estimator with list as bin
"""
assert_array_almost_equal(
np.asarray(entropy(self.entropy_grouped, bins=[0.1, 5, 10, 20, 100])),
np.asarray(entropy(self.random_grouped, bins=[0, 0.1, 5, 10, 20, 100])),
result_entropy_ar)


if __name__ == '__main__':
unittest.main()

0 comments on commit 316cd71

Please sign in to comment.