msmbuilder · evanfeinberg · Sep 3, 2016 · Sep 3, 2016 · Sep 3, 2016 · Sep 3, 2016
diff --git a/examples/transfer_entropy_analysis.py b/examples/transfer_entropy_analysis.py
@@ -0,0 +1,123 @@
+import mdentropy
+from mdentropy.core.information import ncmutinf
+import numpy as np
+import multiprocessing as mp
+from functools import partial
+
+"""
+In this example, we demonstrate an application of the MDEntropy API to
+calculate the transfer entropy between every possible pair of features
+between two lists of timeseries, each with the same numbers of frames
+but possibly different numbers of features. 
+"""
+
+
+"""
+Called by compute_tentropy_between. Computes transfer entropy between 
+all pairs of features between timeseries_i[t_id] and timeseries_j[t_id] at 
+lag_time.
+"""
+def compute_tentropy_between_singletraj(timeseries_tuple, lag_time):
+  t_i, t_j = timeseries_tuple
+  tentropy_pairs_single = []
+
+  t_i = t_i[::lag_time]
+  t_j = t_j[::lag_time]
+
+  k = 0
+
+  for i in range(0, t_i.shape[1]):
+    print("Examining, within timeseries, variable %d out of %d" %(i, t_i.shape[1]))
+    for j in range(0, t_j.shape[1]):
+      x = t_i[1:,i].reshape((-1,1))
+      y = t_j[:-1,j].reshape((-1,1))
+      z = t_i[:-1,i].reshape((-1,1))
+      n_frames = x.shape[0]
+      n_bins = None 
+      tent = np.nan_to_num(ncmutinf(n_bins, x, y, z, method='grassberger'))
+
+      tentropy_pairs_single.append(tent * n_frames)
+
+      k += 1  
+
+  return np.array(tentropy_pairs_single)
+
+"""
+Compute Transfer Entropy between all pairs of features
+between two separate lists of featurized timeseries data.
+----------
+Parameters:
+
+timeseries_tuples: list of tuples of two numpy arrays.
+  Each tuple contains two timeseries (each of type np.array), 
+  each with the same number of rows/frames but possibly different numbers of 
+  columns/features. 
+lag_time: int
+  Lag time to be used in computation of Transfer Entropy
+titles_i: list of str
+  If your features have names (e.g., "Arg325" or "Temp. in Kansas"),
+  you can optionally include them here for timeseries_i list.
+titles_j: list of str
+  If your features have names (e.g., "Arg325" or "Temp. in Kansas"),
+  you can optionally include them here for timeseries_j list.
+worker_pool: object from  iPyParallel module
+parallel: bool. If True, will use multiprocessing module 
+  to map over all trajectories in parallel.
+
+Returns:
+tentropy_array: Numpy array of shape (len(timeseries_i) * len(timeseries_j)), containing
+  the transfer entropy between each possible pair of features.
+tentropy_pairs_id_tuples: List of tuples of ints. 
+ In same order as entries of tentropy_array.
+ Each tuple contains two ints describing the ids of the features for which
+ tentropy was calculated. 
+tentropy_pairs_names: List of tuples of strings.
+
+Caveats:
+Implemented weighted mean is not optimal and may not be numerically stable.
+This is an area for improvement.
+"""
+
+def compute_tentropy_between(timeseries_tuples, lag_time,
+                             titles_i=None, titles_j=None, worker_pool=None,
+                             parallel=False):
+  total_frames = 0.
+  n_tent_pairs = timeseries_tuples[0][0].shape[1] * timeseries_tuples[0][1].shape[1]
+
+  tentropy_array = np.zeros(n_tent_pairs)
+  tentropy_pairs_names = []
+  tentropy_pairs_id_tuples = []
+
+  compute_tentropy_between_singletraj_partial = partial(compute_tentropy_between_singletraj,
+                                                        lag_time=lag_time)
+
+  if worker_pool is not None:
+    tentropy_pairs = worker_pool.map_sync(compute_tentropy_between_singletraj_partial, timeseries_tuples)
+  elif parallel:
+    pool = mp.Pool(mp.cpu_count())
+    tentropy_pairs = pool.map(compute_tentropy_between_singletraj_partial, timeseries_tuples)
+    pool.terminate() 
+  else:
+    tentropy_pairs = []
+    for timeseries_tuple in timeseries_tuples:
+      tentropy_pairs.append(compute_tentropy_between_singletraj_partial(timeseries_tuple))
+
+  for arr in tentropy_pairs:
+    tentropy_array += arr
+
+
+  for t_id, timeseries_tuple in enumerate(timeseries_tuples):
+    t_i, t_j = timeseries_tuple
+    t_i = t_i[::lag_time]
+    t_j = t_j[::lag_time]
+    total_frames += (t_i.shape[0] - 1)
+    if t_id == 0:
+      for i in range(0, t_i.shape[1]):
+        for j in range(0, t_j.shape[1]):
+          if titles_i is not None:
+            tentropy_pairs_names.append((titles_i[i], titles_j[j]))
+          tentropy_pairs_id_tuples.append((i, j))
+
+  tentropy_array /= float(total_frames)
+
+  return tentropy_array, tentropy_pairs_id_tuples, tentropy_pairs_names
diff --git a/mdentropy/core/entropy.py b/mdentropy/core/entropy.py
@@ -4,9 +4,9 @@
 
 from numpy import ndarray
 from numpy import sum as npsum
-from numpy import (atleast_2d, arange, bincount, diff, finfo, float32,
+from numpy import (atleast_2d, arange, array, bincount, diff, finfo, float32,
                    hsplit, linspace, log, log2, meshgrid, nan_to_num, nansum,
-                   product, random, ravel, vstack, exp)
+                   product, random, ravel, vstack, exp, bincount, issubdtype, integer)
 
 from scipy.spatial import cKDTree
 from scipy.stats import entropy as naive
@@ -57,7 +57,11 @@ def entropy(n_bins, rng, method, *args):
     if method == 'kde':
         return kde_entropy(rng, *args, grid_size=n_bins or 20)
 
-    counts = symbolic(n_bins, rng, *args)
+    if n_bins is not None or issubdtype(args[0].dtype, float):
+        counts = symbolic(n_bins, rng, *args)
+    else:
+        minlength = max([len(bincount(arg)) for arg in args])
+        counts = array([bincount(arg, minlength=minlength) for arg in args])
 
     if method == 'chaowangjost':
         return chaowangjost(counts)
@@ -67,7 +71,7 @@ def entropy(n_bins, rng, method, *args):
     return naive(counts)
 
 
-def centropy(n_bins, x, y, rng=None, method='grassberger'):
+def centropy(n_bins, x, y, rng=None, method='knn'):
     """Condtional entropy calculation
 
     Parameters

diff --git a/mdentropy/core/information.py b/mdentropy/core/information.py
@@ -1,7 +1,8 @@
 from .entropy import entropy, centropy
 from ..utils import avgdigamma
 
-from numpy import (atleast_2d, diff, finfo, float32, log, nan_to_num, random,
+from numpy import (atleast_2d, diff, finfo, float32, integer, issubdtype,
+                 log, nan_to_num, random,
                    sqrt, hstack)
 
 from scipy.spatial import cKDTree
@@ -11,7 +12,7 @@
 EPS = finfo(float32).eps
 
 
-def mutinf(n_bins, x, y, rng=None, method='grassberger'):
+def mutinf(n_bins, x, y, rng=None, method='knn'):
     """Mutual information calculation
 
     Parameters
@@ -73,7 +74,7 @@ def knn_mutinf(x, y, k=None, boxsize=None):
     return (-a - b + c + d) / log(2)
 
 
-def nmutinf(n_bins, x, y, rng=None, method='grassberger'):
+def nmutinf(n_bins, x, y, rng=None, method='knn'):
     """Normalized mutual information calculation
 
     Parameters
@@ -97,7 +98,7 @@ def nmutinf(n_bins, x, y, rng=None, method='grassberger'):
                       entropy(n_bins, [rng], method, y)))
 
 
-def cmutinf(n_bins, x, y, z, rng=None, method='grassberger'):
+def cmutinf(n_bins, x, y, z, rng=None, method='knn'):
     """Conditional mutual information calculation
 
     Parameters
@@ -164,13 +165,18 @@ def knn_cmutinf(x, y, z, k=None, boxsize=None):
     return (-a - b + c + d) / log(2)
 
 
-def ncmutinf(n_bins, x, y, z, rng=None, method='grassberger'):
+def ncmutinf(n_bins, x, y, z, rng=None, method='knn'):
     """Normalized conditional mutual information calculation
 
     Parameters
     ----------
     n_bins : int
         Number of bins.
+
+        If None, assumes data is pre-binned or a timeseries
+        of discrete variables. In this case, x, y, and z
+        must all be of some integer type: "int", "uint8", etc.
+
     x : array_like, shape = (n_samples, n_dim)
         Conditioned variable
     y : array_like, shape = (n_samples, n_dim)
@@ -186,5 +192,10 @@ def ncmutinf(n_bins, x, y, z, rng=None, method='grassberger'):
     ncmi : float
     """
 
+    if n_bins is None:
+        assert(issubdtype(x.dtype, integer))
+        method = 'grassberger'
+        rng = None
+
     return (cmutinf(n_bins, x, y, z, rng=rng, method=method) /
             centropy(n_bins, x, z, rng=rng, method=method))