From 316cd71809b5be93ef9036534d90b69ecf3190aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mirko=20M=C3=A4licke?=
Date: Thu, 1 Feb 2018 11:10:40 +0100
Subject: [PATCH] rework of entropy, minmax, percentile estimator

---
 skgstat/estimator.py       | 50 +++++++++++++++++++++------
 skgstat/tests/estimator.py | 71 ++++++++++++++++++++++++++++++--------
 2 files changed, 97 insertions(+), 24 deletions(-)

diff --git a/skgstat/estimator.py b/skgstat/estimator.py
index 531745c..bdc5a15 100644
--- a/skgstat/estimator.py
+++ b/skgstat/estimator.py
@@ -131,9 +131,11 @@ def genton(X):
 
 def minmax(X):
     """
-    Returns the MinMax Semivariance of sample X.
+    Returns the MinMax Semivariance of the pairwise differences of sample X.
     X has to be an even-length array of point pairs like: x1, x1+h, x2, x2+h, ..., xn, xn+h.
+    CAUTION: this is a changed behaviour compared to scikit-gstat<=0.1.5
+
 
     :param X:
     :return:
     """
@@ -146,12 +148,20 @@
     if len(_X) % 2 > 0:
         raise ValueError('The sample does not have an even length: {}'.format(_X))
 
-    return (np.nanmax(_X) - np.nanmin(_X)) / np.nanmean(_X)
+    # calculate the point pair values
+    # helper function
+    ppairs = lambda x: [np.abs(x[i] - x[i+1]) for i in np.arange(0, len(x), 2)]
+    p = ppairs(_X)
+
+    return (np.nanmax(p) - np.nanmin(p)) / np.nanmean(p)
 
 
 def percentile(X, p=50):
     """
-    Returns the wanted percentile of sample X.
+    Returns the wanted percentile of the pairwise differences of sample X.
+    X has to be an even-length array of point pairs like: x1, x1+h, x2, x2+h, ..., xn, xn+h.
+
+    CAUTION: this is a changed behaviour compared to scikit-gstat<=0.1.5
 
     :param X: np.ndarray with the given sample to calculate the Semivariance from
     :param p: float with the percentile of sample X
@@ -166,7 +176,12 @@
     if len(_X) % 2 > 0:
         raise ValueError('The sample does not have an even length: {}'.format(_X))
 
-    return np.percentile(_X, q=p)
+    # calculate the point pair values
+    # helper function
+    ppairs = lambda x: [np.abs(x[i] - x[i+1]) for i in np.arange(0, len(x), 2)]
+    pairs = ppairs(_X)
+
+    return np.percentile(pairs, q=p)
 
 def entropy(X, bins=None):
     """
@@ -177,7 +192,9 @@
     This uses Freedman Diacons Estimator and is fairly resilient to outliers.
     If the input data X is 2D (Entropy for more than one bin needed), it will
     derive the histogram once and use the same edges in all bins.
-    CAUTION: this is actually an changed behaviour to scikit-gstat<=0.1.4
+    CAUTION: this is a changed behaviour compared to scikit-gstat<=0.1.5
+
+    # TODO: handle the 0s in output of X
 
     :param X: np.ndarray with the given sample to calculate the Shannon entropy from
     :param bins: The bin edges for entropy calculation, or an amount of even spaced bins
     :return:
     """
     _X = np.array(X)
+    # helper function
+    ppairs = lambda x: [np.abs(x[i] - x[i+1]) for i in np.arange(0, len(x), 2)]
+
     if any([isinstance(_, (list, np.ndarray)) for _ in _X]):
         # if bins is not set, use the histogram over the full value range
         if bins is None:
             # could not fiugre out a better way here. I need the values before calculating the entropy
             # in order to use the full value range in all bins
-            vals = [[np.abs(_[i] - _[i + 1]) for i in np.arange(0, len(_), 2)] for _ in _X]
-            bins = np.histogram(vals, bins=15)[1][1:]
+            allp = [ppairs(_) for _ in _X]
+            minv = np.min(list(map(np.min, allp)))
+            maxv = np.max(list(map(np.max, allp)))
+            bins = np.linspace(minv, maxv, 50).tolist() + [maxv]  # no better idea how to include the end edge as well
         return np.array([entropy(_, bins=bins) for _ in _X])
 
     # check even
@@ -199,11 +221,19 @@
         raise ValueError('The sample does not have an even length: {}'.format(_X))
 
     # calculate the values
-    vals = [np.abs(_X[i] - _X[i + 1]) for i in np.arange(0, len(_X), 2)]
+    vals = ppairs(_X)
 
     # claculate the bins
     if bins is None:
         bins = 15
 
-    pk = np.histogram(vals, bins)[0]
-    return scipy_entropy(pk=pk)
+    # get the bin counts
+    amt = np.histogram(vals, bins=bins)[0]
+
+    # add a very small value to p, otherwise the log2 will be -inf for empty bins
+    p = (amt / np.sum(amt)) + 1e-5
+    info = lambda p: -np.log2(p)
+
+    # map info to p and return the inner product
+    return np.fromiter(map(info, p), dtype=np.float).dot(p)
+
diff --git a/skgstat/tests/estimator.py b/skgstat/tests/estimator.py
index bd20863..464cdc3 100644
--- a/skgstat/tests/estimator.py
+++ b/skgstat/tests/estimator.py
@@ -20,15 +20,22 @@
 result_dowd = np.array([1.09900000e+00, 0.00000000e+00, 1.09900000e+02,
                         1.09900000e+04])
 
-result_minmax = [2.0, 0.0, 2.0, 2.0]
+result_minmax = np.array([1.9927489681047741, 1.8267020635804991, 1.5392934574169328, 2.0360906123190832])
 
-result_percentile = [4.5, 5.0, 45.0, 450.0]
+result_percentile_r = np.array([6.3486663708926443, 6.5981915461999279, 4.7812030455283168, 7.2582516292816663])
+result_percentile = np.array([1.0, 0.0, 10.0, 100.0])
 
-result_entropy = np.array([0.69314718, 0.63651417, 0.63651417, 1.60943791])
-result_entropy_fd = np.array([0.67301167, 0.67301167, 0.67301167, 0.95027054])
-result_entropy_5b = np.array([1.05492017, 1.05492017, 1.05492017, 0.95027054])
-result_entropy_ar = np.array([1.05492017, 0.67301167, 1.05492017, 1.05492017])
+# entropies
+result_uni_entropy = np.array([0.0081243, 0.0081243, 0.0081243, 0.0081243])
+result_uni_entropy_fd = np.array([-1.44270225e-05, -1.44270225e-05, -1.44270225e-05,
+                                  -1.44270225e-05])
+result_uni_entropy_5b = np.array([0.00064996, 0.00064996, 0.00064996, 0.00064996])
+result_uni_entropy_ar = np.array([0.00064996, 0.00064996, 0.00064996, 0.00064996])
+result_entropy = np.array([1.9295937, 2.32944639, 2.32944639, 2.32944639])
+result_entropy_fd = np.array([1.92211936, 1.37096112, 0.97094233, 1.37096112])
+result_entropy_5b = np.array([1.92211936, 1.92211936, 1.92211936, 1.92211936])
+result_entropy_ar = np.array([1.92211936, 1.52226666, 0.97144062, 1.52226666])
 
 
 class TestEstimator(unittest.TestCase):
     def setUp(self):
@@ -39,7 +46,7 @@ def setUp(self):
         self.grouped = [list(np.arange(10)), [5] * 10, list(np.arange(0, 100, 10)),
                         list(np.arange(0, 1000, 100))]
         np.random.seed(42)
-        self.entropy_grouped = [list(np.random.gamma(10,2, 10)), list(np.random.gamma(4,4, 10)),
+        self.random_grouped = [list(np.random.gamma(10,2, 10)), list(np.random.gamma(4,4, 10)),
                                 list(np.random.gamma(4, 2, 10)), list(np.random.gamma(10,5, 10))]
 
     def test_matheron(self):
@@ -73,36 +80,72 @@ def test_minmax(self):
         """
         Testing minmax estimator
         """
-        assert_array_almost_equal(minmax(self.grouped), result_minmax)
+        assert_array_almost_equal(minmax(self.random_grouped), result_minmax)
 
-    def test_percentile(self):
+    def test_percentile_random(self):
         """
-        Testing percentile estimator
+        Testing percentile estimator on randomized data
+        """
+        assert_array_almost_equal(percentile(self.random_grouped), result_percentile_r)
+
+    def test_percentile_grouped(self):
+        """
+        Testing percentile estimator on grouped data
         """
         assert_array_almost_equal(percentile(self.grouped), result_percentile)
 
+    def test_entropy_uniform_default(self):
+        """
+        Testing the entropy estimator on uniform distributions, with and without gaps
+        """
+        assert_array_almost_equal(entropy(self.grouped), result_uni_entropy)
+
+    def test_entropy_uniform_string(self):
+        """
+        Testing entropy estimator with string as bin on uniform distributions
+        """
+        assert_array_almost_equal(np.asarray(entropy(self.grouped, bins='fd')), result_uni_entropy_fd)
+
+    def test_entropy_uniform_integer(self):
+        """
+        Testing entropy estimator with integer as bin on uniform distributions
+        """
+        assert_array_almost_equal(np.asarray(entropy(self.grouped, bins=5)), result_uni_entropy_5b)
+
+    def test_entropy_uniform_list(self):
+        """
+        Testing entropy estimator with list as bin on uniform distributions
+        """
+        assert_array_almost_equal(
+            np.asarray(entropy(self.grouped, bins=[0, 0.1, 5, 10, 20, 100])),
+            result_uni_entropy_ar)
+
     def test_entropy_default(self):
         """
         Testing entropy estimator with default settings
         """
-        assert_array_almost_equal(np.asarray(entropy(self.entropy_grouped)), result_entropy)
+        assert_array_almost_equal(np.asarray(entropy(self.random_grouped)), result_entropy)
 
     def test_entropy_string(self):
         """
         Testing entropy estimator with string as bin
         """
-        assert_array_almost_equal(np.asarray(entropy(self.entropy_grouped, bins='fd')), result_entropy_fd)
+        assert_array_almost_equal(np.asarray(entropy(self.random_grouped, bins='fd')), result_entropy_fd)
 
     def test_entropy_integer(self):
         """
         Testing entropy estimator with integer as bin
         """
-        assert_array_almost_equal(np.asarray(entropy(self.entropy_grouped, bins=5)), result_entropy_5b)
+        assert_array_almost_equal(np.asarray(entropy(self.random_grouped, bins=5)), result_entropy_5b)
 
     def test_entropy_list(self):
         """
         Testing entropy estimator with list as bin
         """
         assert_array_almost_equal(
-            np.asarray(entropy(self.entropy_grouped, bins=[0.1, 5, 10, 20, 100])),
+            np.asarray(entropy(self.random_grouped, bins=[0, 0.1, 5, 10, 20, 100])),
             result_entropy_ar)
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
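
For reference, the core computation behind the reworked estimators can be restated as a short standalone NumPy sketch. The helper names point_pair_diffs, minmax_semivariance and shannon_entropy below are illustrative only and are not part of the scikit-gstat API; the logic mirrors the ppairs lambda and the histogram-based entropy introduced in this patch.

    import numpy as np

    def point_pair_diffs(x):
        # x is an even-length sequence of point pairs: x1, x1+h, x2, x2+h, ...
        # take the absolute difference of every pair, as the ppairs lambda does
        x = np.asarray(x, dtype=float)
        if len(x) % 2 > 0:
            raise ValueError('The sample does not have an even length: {}'.format(x))
        return np.abs(x[0::2] - x[1::2])

    def minmax_semivariance(x):
        # (max - min) / mean of the pairwise differences, as in the reworked minmax()
        d = point_pair_diffs(x)
        return (np.nanmax(d) - np.nanmin(d)) / np.nanmean(d)

    def shannon_entropy(x, bins=15):
        # histogram the pairwise differences, turn the counts into probabilities,
        # add a small offset so log2 stays finite for empty bins, and return
        # -sum(p * log2(p)), the inner product computed by the reworked entropy()
        d = point_pair_diffs(x)
        counts = np.histogram(d, bins=bins)[0]
        p = counts / counts.sum() + 1e-5
        return float(np.dot(p, -np.log2(p)))

A percentile variant follows the same pattern: np.percentile(point_pair_diffs(x), q=50).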
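
To exercise the reworked estimators on data shaped like the new test fixtures, something along these lines should work (the import path follows the patched file skgstat/estimator.py; the seed and gamma parameters are copied from TestEstimator.setUp):

    import numpy as np
    from skgstat.estimator import entropy, minmax, percentile

    # same randomized groups as in the test setUp
    np.random.seed(42)
    random_grouped = [list(np.random.gamma(10, 2, 10)), list(np.random.gamma(4, 4, 10)),
                      list(np.random.gamma(4, 2, 10)), list(np.random.gamma(10, 5, 10))]

    # a 2D input yields one value per group, as the updated tests assert
    print(minmax(random_grouped))
    print(percentile(random_grouped, p=50))
    print(entropy(random_grouped, bins=15))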