From 316cd71809b5be93ef9036534d90b69ecf3190aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mirko=20M=C3=A4licke?=
Date: Thu, 1 Feb 2018 11:10:40 +0100
Subject: [PATCH] rework of entropy, minmax, percentile estimator

---
 skgstat/estimator.py       | 50 +++++++++++++++++++++------
 skgstat/tests/estimator.py | 71 ++++++++++++++++++++++++++++++--------
 2 files changed, 97 insertions(+), 24 deletions(-)

diff --git a/skgstat/estimator.py b/skgstat/estimator.py
index 531745c..bdc5a15 100644
--- a/skgstat/estimator.py
+++ b/skgstat/estimator.py
@@ -131,9 +131,11 @@ def genton(X):
 
 def minmax(X):
     """
-    Returns the MinMax Semivariance of sample X.
+    Returns the MinMax Semivariance of the pairwise differences of sample X.
     X has to be an even-length array of point pairs like: x1, x1+h, x2, x2+h, ..., xn, xn+h.
+    CAUTION: this is a changed behaviour compared to scikit-gstat<=0.1.5
+
 
     :param X:
     :return:
     """
@@ -146,12 +148,20 @@
     if len(_X) % 2 > 0:
         raise ValueError('The sample does not have an even length: {}'.format(_X))
 
-    return (np.nanmax(_X) - np.nanmin(_X)) / np.nanmean(_X)
+    # calculate the point pair values
+    # helper function
+    ppairs = lambda x: [np.abs(x[i] - x[i+1]) for i in np.arange(0, len(x), 2)]
+    p = ppairs(_X)
+
+    return (np.nanmax(p) - np.nanmin(p)) / np.nanmean(p)
 
 
 def percentile(X, p=50):
     """
-    Returns the wanted percentile of sample X.
+    Returns the wanted percentile of the pairwise differences of sample X.
+    X has to be an even-length array of point pairs like: x1, x1+h, x2, x2+h, ..., xn, xn+h.
+
+    CAUTION: this is a changed behaviour compared to scikit-gstat<=0.1.5
 
     :param X: np.ndarray with the given sample to calculate the Semivariance from
     :param p: float with the percentile of sample X
@@ -166,7 +176,12 @@
     if len(_X) % 2 > 0:
         raise ValueError('The sample does not have an even length: {}'.format(_X))
 
-    return np.percentile(_X, q=p)
+    # calculate the point pair values
+    # helper function
+    ppairs = lambda x: [np.abs(x[i] - x[i+1]) for i in np.arange(0, len(x), 2)]
+    pairs = ppairs(_X)
+
+    return np.percentile(pairs, q=p)
 
 def entropy(X, bins=None):
     """
@@ -177,7 +192,9 @@
     This uses Freedman Diacons Estimator and is fairly resilient to outliers.
     If the input data X is 2D (Entropy for more than one bin needed), it will
     derive the histogram once and use the same edges in all bins.
-    CAUTION: this is actually an changed behaviour to scikit-gstat<=0.1.4
+    CAUTION: this is a changed behaviour compared to scikit-gstat<=0.1.5
+
+    # TODO: handle the 0s in output of X
 
     :param X: np.ndarray with the given sample to calculate the Shannon entropy from
     :param bins: The bin edges for entropy calculation, or an amount of even spaced bins
     :return:
     """
     _X = np.array(X)
+    # helper function
+    ppairs = lambda x: [np.abs(x[i] - x[i+1]) for i in np.arange(0, len(x), 2)]
+
     if any([isinstance(_, (list, np.ndarray)) for _ in _X]):
         # if bins is not set, use the histogram over the full value range
         if bins is None:
             # could not fiugre out a better way here. I need the values before calculating the entropy
             # in order to use the full value range in all bins
-            vals = [[np.abs(_[i] - _[i + 1]) for i in np.arange(0, len(_), 2)] for _ in _X]
-            bins = np.histogram(vals, bins=15)[1][1:]
+            allp = [ppairs(_) for _ in _X]
+            minv = np.min(list(map(np.min, allp)))
+            maxv = np.max(list(map(np.max, allp)))
+            bins = np.linspace(minv, maxv, 50).tolist() + [maxv]  # no better idea how to include the end edge as well
         return np.array([entropy(_, bins=bins) for _ in _X])
 
     # check even
@@ -199,11 +221,19 @@
         raise ValueError('The sample does not have an even length: {}'.format(_X))
 
     # calculate the values
-    vals = [np.abs(_X[i] - _X[i + 1]) for i in np.arange(0, len(_X), 2)]
+    vals = ppairs(_X)
 
     # claculate the bins
     if bins is None:
         bins = 15
 
-    pk = np.histogram(vals, bins)[0]
-    return scipy_entropy(pk=pk)
+    # get the bin counts
+    amt = np.histogram(vals, bins=bins)[0]
+
+    # add a very small value to p, otherwise the log2 will be -inf for empty bins
+    p = (amt / np.sum(amt)) + 1e-5
+    info = lambda p: -np.log2(p)
+
+    # map info to p and return the inner product
+    return np.fromiter(map(info, p), dtype=np.float).dot(p)
+
diff --git a/skgstat/tests/estimator.py b/skgstat/tests/estimator.py
index bd20863..464cdc3 100644
--- a/skgstat/tests/estimator.py
+++ b/skgstat/tests/estimator.py
@@ -20,15 +20,22 @@
 result_dowd = np.array([1.09900000e+00, 0.00000000e+00, 1.09900000e+02,
                         1.09900000e+04])
 
-result_minmax = [2.0, 0.0, 2.0, 2.0]
+result_minmax = np.array([1.9927489681047741, 1.8267020635804991, 1.5392934574169328, 2.0360906123190832])
 
-result_percentile = [4.5, 5.0, 45.0, 450.0]
+result_percentile_r = np.array([6.3486663708926443, 6.5981915461999279, 4.7812030455283168, 7.2582516292816663])
+result_percentile = np.array([1.0, 0.0, 10.0, 100.0])
 
-result_entropy = np.array([0.69314718, 0.63651417, 0.63651417, 1.60943791])
-result_entropy_fd = np.array([0.67301167, 0.67301167, 0.67301167, 0.95027054])
-result_entropy_5b = np.array([1.05492017, 1.05492017, 1.05492017, 0.95027054])
-result_entropy_ar = np.array([1.05492017, 0.67301167, 1.05492017, 1.05492017])
+# entropies
+result_uni_entropy = np.array([0.0081243, 0.0081243, 0.0081243, 0.0081243])
+result_uni_entropy_fd = np.array([-1.44270225e-05, -1.44270225e-05, -1.44270225e-05,
+                                  -1.44270225e-05])
+result_uni_entropy_5b = np.array([0.00064996, 0.00064996, 0.00064996, 0.00064996])
+result_uni_entropy_ar = np.array([0.00064996, 0.00064996, 0.00064996, 0.00064996])
+result_entropy = np.array([1.9295937, 2.32944639, 2.32944639, 2.32944639])
+result_entropy_fd = np.array([1.92211936, 1.37096112, 0.97094233, 1.37096112])
+result_entropy_5b = np.array([1.92211936, 1.92211936, 1.92211936, 1.92211936])
+result_entropy_ar = np.array([1.92211936, 1.52226666, 0.97144062, 1.52226666])
 
 
 class TestEstimator(unittest.TestCase):
     def setUp(self):
@@ -39,7 +46,7 @@ def setUp(self):
         self.grouped = [list(np.arange(10)), [5] * 10, list(np.arange(0, 100, 10)),
                         list(np.arange(0, 1000, 100))]
         np.random.seed(42)
-        self.entropy_grouped = [list(np.random.gamma(10,2, 10)), list(np.random.gamma(4,4, 10)),
+        self.random_grouped = [list(np.random.gamma(10,2, 10)), list(np.random.gamma(4,4, 10)),
                                 list(np.random.gamma(4, 2, 10)), list(np.random.gamma(10,5, 10))]
 
     def test_matheron(self):
@@ -73,36 +80,72 @@ def test_minmax(self):
         """
         Testing minmax estimator
         """
-        assert_array_almost_equal(minmax(self.grouped), result_minmax)
+        assert_array_almost_equal(minmax(self.random_grouped), result_minmax)
 
-    def test_percentile(self):
+    def test_percentile_random(self):
         """
-        Testing percentile estimator
+        Testing percentile estimator on randomized data
+        """
+        assert_array_almost_equal(percentile(self.random_grouped), result_percentile_r)
+
+    def test_percentile_grouped(self):
+        """
+        Testing percentile estimator on grouped data
         """
         assert_array_almost_equal(percentile(self.grouped), result_percentile)
 
+    def test_entropy_uniform_default(self):
+        """
+        Testing the entropy estimator on uniform distributions, with and without gaps
+        """
+        assert_array_almost_equal(entropy(self.grouped), result_uni_entropy)
+
+    def test_entropy_uniform_string(self):
+        """
+        Testing entropy estimator with string as bin on uniform distributions
+        """
+        assert_array_almost_equal(np.asarray(entropy(self.grouped, bins='fd')), result_uni_entropy_fd)
+
+    def test_entropy_uniform_integer(self):
+        """
+        Testing entropy estimator with integer as bin on uniform distributions
+        """
+        assert_array_almost_equal(np.asarray(entropy(self.grouped, bins=5)), result_uni_entropy_5b)
+
+    def test_entropy_uniform_list(self):
+        """
+        Testing entropy estimator with list as bin on uniform distributions
+        """
+        assert_array_almost_equal(
+            np.asarray(entropy(self.grouped, bins=[0, 0.1, 5, 10, 20, 100])),
+            result_uni_entropy_ar)
+
     def test_entropy_default(self):
         """
         Testing entropy estimator with default settings
         """
-        assert_array_almost_equal(np.asarray(entropy(self.entropy_grouped)), result_entropy)
+        assert_array_almost_equal(np.asarray(entropy(self.random_grouped)), result_entropy)
 
     def test_entropy_string(self):
         """
         Testing entropy estimator with string as bin
         """
-        assert_array_almost_equal(np.asarray(entropy(self.entropy_grouped, bins='fd')), result_entropy_fd)
+        assert_array_almost_equal(np.asarray(entropy(self.random_grouped, bins='fd')), result_entropy_fd)
 
     def test_entropy_integer(self):
         """
         Testing entropy estimator with integer as bin
         """
-        assert_array_almost_equal(np.asarray(entropy(self.entropy_grouped, bins=5)), result_entropy_5b)
+        assert_array_almost_equal(np.asarray(entropy(self.random_grouped, bins=5)), result_entropy_5b)
 
     def test_entropy_list(self):
         """
         Testing entropy estimator with list as bin
         """
         assert_array_almost_equal(
-            np.asarray(entropy(self.entropy_grouped, bins=[0.1, 5, 10, 20, 100])),
+            np.asarray(entropy(self.random_grouped, bins=[0, 0.1, 5, 10, 20, 100])),
             result_entropy_ar)
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
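
For reference, the core computation behind the reworked estimators can be restated as a short standalone NumPy sketch. The helper names point_pair_diffs, minmax_semivariance and shannon_entropy below are illustrative only and are not part of the scikit-gstat API; the logic mirrors the ppairs lambda and the histogram-based entropy introduced in this patch.

    import numpy as np

    def point_pair_diffs(x):
        # x is an even-length sequence of point pairs: x1, x1+h, x2, x2+h, ...
        # take the absolute difference of every pair, as the ppairs lambda does
        x = np.asarray(x, dtype=float)
        if len(x) % 2 > 0:
            raise ValueError('The sample does not have an even length: {}'.format(x))
        return np.abs(x[0::2] - x[1::2])

    def minmax_semivariance(x):
        # (max - min) / mean of the pairwise differences, as in the reworked minmax()
        d = point_pair_diffs(x)
        return (np.nanmax(d) - np.nanmin(d)) / np.nanmean(d)

    def shannon_entropy(x, bins=15):
        # histogram the pairwise differences, turn the counts into probabilities,
        # add a small offset so log2 stays finite for empty bins, and return
        # -sum(p * log2(p)), the inner product computed by the reworked entropy()
        d = point_pair_diffs(x)
        counts = np.histogram(d, bins=bins)[0]
        p = counts / counts.sum() + 1e-5
        return float(np.dot(p, -np.log2(p)))

A percentile variant follows the same pattern: np.percentile(point_pair_diffs(x), q=50).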
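
To exercise the reworked estimators on data shaped like the new test fixtures, something along these lines should work (the import path follows the patched file skgstat/estimator.py; the seed and gamma parameters are copied from TestEstimator.setUp):

    import numpy as np
    from skgstat.estimator import entropy, minmax, percentile

    # same randomized groups as in the test setUp
    np.random.seed(42)
    random_grouped = [list(np.random.gamma(10, 2, 10)), list(np.random.gamma(4, 4, 10)),
                      list(np.random.gamma(4, 2, 10)), list(np.random.gamma(10, 5, 10))]

    # a 2D input yields one value per group, as the updated tests assert
    print(minmax(random_grouped))
    print(percentile(random_grouped, p=50))
    print(entropy(random_grouped, bins=15))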