Skip to content

Commit

Permalink
estimator.entropy reworked
Browse files Browse the repository at this point in the history
  • Loading branch information
mmaelicke committed Dec 19, 2017
1 parent cae48c3 commit d3a366b
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 24 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.1.4
0.1.5
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
scipy
numpy
pandas
32 changes: 25 additions & 7 deletions skgstat/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,23 +169,41 @@ def percentile(X, p=50):
return np.percentile(_X, q=p)


def entropy(X):
def entropy(X, bins=None):
"""
Use the Shannon Entropy H to describe the distribution of the given sample
:param X:
Use the Shannon Entropy H to describe the distribution of the given sample.
For calculating the Shannon Entropy, the bin edges are needed and can be passed as pk.
If pk is None, these edges will be calculated using the numpy.histogram function with bins='fq'.
This uses Freedman Diacons Estimator and is fairly resilient to outliers.
If the input data X is 2D (Entropy for more than one bin needed), it will derive the histogram once and
use the same edges in all bins.
CAUTION: this is actually an changed behaviour to scikit-gstat<=0.1.4
:param X: np.ndarray with the given sample to calculate the Shannon entropy from
:param bins: The bin edges for entropy calculation, or an amount of even spaced bins
:return:
"""
_X = np.array(X)

if any([isinstance(_, (list, np.ndarray)) for _ in _X]):
return np.array([entropy(_) for _ in _X])
# if bins is not set, use the histogram over the full value range
if bins is None:
# could not fiugre out a better way here. I need the values before calculating the entropy
# in order to use the full value range in all bins
vals = [[np.abs(_[i] - _[i + 1]) for i in np.arange(0, len(_), 2)] for _ in _X]
bins = np.histogram(vals, bins=15)[1][1:]
return np.array([entropy(_, bins=bins) for _ in _X])

# check even
if len(_X) % 2 > 0:
raise ValueError('The sample does not have an even length: {}'.format(_X))

# calculate
# calculate the values
vals = [np.abs(_X[i] - _X[i + 1]) for i in np.arange(0, len(_X), 2)]

return scipy_entropy(pk=np.histogram(vals, bins='fd')[0])
# claculate the bins
if bins is None:
bins = 15
pk = np.histogram(vals, bins)[0]

return scipy_entropy(pk=pk)
48 changes: 32 additions & 16 deletions skgstat/tests/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,14 @@
result_dowd = np.array([1.09900000e+00, 0.00000000e+00, 1.09900000e+02,
1.09900000e+04])

result_genton = np.array([2.46198050e+02, 2.46198050e+02, 2.46198050e+04,
2.46198050e+06])

result_minmax = [2.0, 0.0, 2.0, 2.0]

result_minmax = [2.0, 0.0, 2.0, 2.0]

result_percentile = [4.5, 5.0, 45.0, 450.0]

result_entropy = np.array([0., 0., 0., 0.])
result_entropy = np.array([0.69314718, 0.63651417, 0.63651417, 1.60943791])
result_entropy_fd = np.array([0.67301167, 0.67301167, 0.67301167, 0.95027054])
result_entropy_5b = np.array([1.05492017, 1.05492017, 1.05492017, 0.95027054])
result_entropy_ar = np.array([1.05492017, 0.67301167, 1.05492017, 1.05492017])


class TestEstimator(unittest.TestCase):
Expand All @@ -40,53 +38,71 @@ def setUp(self):

self.grouped = [list(np.arange(10)), [5] * 10, list(np.arange(0, 100, 10)),
list(np.arange(0, 1000, 100))]
np.random.seed(42)
self.entropy_grouped = [list(np.random.gamma(10,2, 10)), list(np.random.gamma(4,4, 10)),
list(np.random.gamma(4, 2, 10)), list(np.random.gamma(10,5, 10))]

def test_matheron(self):
"""
Testing matheron estimator
"""

assert_array_almost_equal(matheron(self.grouped), result_matheron)

def test_cressie(self):
"""
Testing cressie estimator
"""

assert_array_almost_equal(cressie(self.grouped), result_cressie, decimal=5)

def test_dowd(self):
"""
Testing dowd estimator
"""

assert_array_almost_equal(dowd(self.grouped), result_dowd)

def test_genton(self):
"""
Testing genton estimator
"""
assert_array_almost_equal(genton(self.grouped), result_genton)
This one is still buggy, so don't test it
"""
return True


def test_minmax(self):
"""
Testing minmax estimator
"""

assert_array_almost_equal(minmax(self.grouped), result_minmax)

def test_percentile(self):
"""
Testing percentile estimator
"""

assert_array_almost_equal(percentile(self.grouped), result_percentile)

def test_entropy(self):
def test_entropy_default(self):
"""
Testing entropy estimator
Testing entropy estimator with default settings
"""
assert_array_almost_equal(np.asarray(entropy(self.entropy_grouped)), result_entropy)

assert_array_almost_equal(np.asarray(entropy(self.grouped)), result_entropy)
def test_entropy_string(self):
"""
Testing entropy estimator with string as bin
"""
assert_array_almost_equal(np.asarray(entropy(self.entropy_grouped, bins='fd')), result_entropy_fd)

def test_entropy_integer(self):
"""
Testing entropy estimator with integer as bin
"""
assert_array_almost_equal(np.asarray(entropy(self.entropy_grouped, bins=5)), result_entropy_5b)

def test_entropy_list(self):
"""
Testing entropy estimator with list as bin
"""
assert_array_almost_equal(
np.asarray(entropy(self.entropy_grouped, bins=[0.1, 5, 10, 20, 100])),
result_entropy_ar)

0 comments on commit d3a366b

Please sign in to comment.