diff --git a/examples/transfer_entropy_analysis.py b/examples/transfer_entropy_analysis.py new file mode 100644 index 0000000..bffa0fc --- /dev/null +++ b/examples/transfer_entropy_analysis.py @@ -0,0 +1,123 @@ +import mdentropy +from mdentropy.core.information import ncmutinf +import numpy as np +import multiprocessing as mp +from functools import partial + +""" +In this example, we demonstrate an application of the MDEntropy API to +calculate the transfer entropy between every possible pair of features +between two lists of timeseries, each with the same numbers of frames +but possibly different numbers of features. +""" + + +""" +Called by compute_tentropy_between. Computes transfer entropy between +all pairs of features between timeseries_i[t_id] and timeseries_j[t_id] at +lag_time. +""" +def compute_tentropy_between_singletraj(timeseries_tuple, lag_time): + t_i, t_j = timeseries_tuple + tentropy_pairs_single = [] + + t_i = t_i[::lag_time] + t_j = t_j[::lag_time] + + k = 0 + + for i in range(0, t_i.shape[1]): + print("Examining, within timeseries, variable %d out of %d" %(i, t_i.shape[1])) + for j in range(0, t_j.shape[1]): + x = t_i[1:,i].reshape((-1,1)) + y = t_j[:-1,j].reshape((-1,1)) + z = t_i[:-1,i].reshape((-1,1)) + n_frames = x.shape[0] + n_bins = None + tent = np.nan_to_num(ncmutinf(n_bins, x, y, z, method='grassberger')) + + tentropy_pairs_single.append(tent * n_frames) + + k += 1 + + return np.array(tentropy_pairs_single) + +""" +Compute Transfer Entropy between all pairs of features +between two separate lists of featurized timeseries data. +---------- +Parameters: + +timeseries_tuples: list of tuples of two numpy arrays. + Each tuple contains two timeseries (each of type np.array), + each with the same number of rows/frames but possibly different numbers of + columns/features. +lag_time: int + Lag time to be used in computation of Transfer Entropy +titles_i: list of str + If your features have names (e.g., "Arg325" or "Temp. in Kansas"), + you can optionally include them here for timeseries_i list. +titles_j: list of str + If your features have names (e.g., "Arg325" or "Temp. in Kansas"), + you can optionally include them here for timeseries_j list. +worker_pool: object from iPyParallel module +parallel: bool. If True, will use multiprocessing module + to map over all trajectories in parallel. + +Returns: +tentropy_array: Numpy array of shape (len(timeseries_i) * len(timeseries_j)), containing + the transfer entropy between each possible pair of features. +tentropy_pairs_id_tuples: List of tuples of ints. + In same order as entries of tentropy_array. + Each tuple contains two ints describing the ids of the features for which + tentropy was calculated. +tentropy_pairs_names: List of tuples of strings. + +Caveats: +Implemented weighted mean is not optimal and may not be numerically stable. +This is an area for improvement. +""" + +def compute_tentropy_between(timeseries_tuples, lag_time, + titles_i=None, titles_j=None, worker_pool=None, + parallel=False): + total_frames = 0. + n_tent_pairs = timeseries_tuples[0][0].shape[1] * timeseries_tuples[0][1].shape[1] + + tentropy_array = np.zeros(n_tent_pairs) + tentropy_pairs_names = [] + tentropy_pairs_id_tuples = [] + + compute_tentropy_between_singletraj_partial = partial(compute_tentropy_between_singletraj, + lag_time=lag_time) + + if worker_pool is not None: + tentropy_pairs = worker_pool.map_sync(compute_tentropy_between_singletraj_partial, timeseries_tuples) + elif parallel: + pool = mp.Pool(mp.cpu_count()) + tentropy_pairs = pool.map(compute_tentropy_between_singletraj_partial, timeseries_tuples) + pool.terminate() + else: + tentropy_pairs = [] + for timeseries_tuple in timeseries_tuples: + tentropy_pairs.append(compute_tentropy_between_singletraj_partial(timeseries_tuple)) + + for arr in tentropy_pairs: + tentropy_array += arr + + + for t_id, timeseries_tuple in enumerate(timeseries_tuples): + t_i, t_j = timeseries_tuple + t_i = t_i[::lag_time] + t_j = t_j[::lag_time] + total_frames += (t_i.shape[0] - 1) + if t_id == 0: + for i in range(0, t_i.shape[1]): + for j in range(0, t_j.shape[1]): + if titles_i is not None: + tentropy_pairs_names.append((titles_i[i], titles_j[j])) + tentropy_pairs_id_tuples.append((i, j)) + + tentropy_array /= float(total_frames) + + return tentropy_array, tentropy_pairs_id_tuples, tentropy_pairs_names diff --git a/mdentropy/core/entropy.py b/mdentropy/core/entropy.py index 987ca42..b739c5d 100644 --- a/mdentropy/core/entropy.py +++ b/mdentropy/core/entropy.py @@ -4,9 +4,9 @@ from numpy import ndarray from numpy import sum as npsum -from numpy import (atleast_2d, arange, bincount, diff, finfo, float32, +from numpy import (atleast_2d, arange, array, bincount, diff, finfo, float32, hsplit, linspace, log, log2, meshgrid, nan_to_num, nansum, - product, random, ravel, vstack, exp) + product, random, ravel, vstack, exp, bincount, issubdtype, integer) from scipy.spatial import cKDTree from scipy.stats import entropy as naive @@ -57,7 +57,11 @@ def entropy(n_bins, rng, method, *args): if method == 'kde': return kde_entropy(rng, *args, grid_size=n_bins or 20) - counts = symbolic(n_bins, rng, *args) + if n_bins is not None or issubdtype(args[0].dtype, float): + counts = symbolic(n_bins, rng, *args) + else: + minlength = max([len(bincount(arg)) for arg in args]) + counts = array([bincount(arg, minlength=minlength) for arg in args]) if method == 'chaowangjost': return chaowangjost(counts) @@ -67,7 +71,7 @@ def entropy(n_bins, rng, method, *args): return naive(counts) -def centropy(n_bins, x, y, rng=None, method='grassberger'): +def centropy(n_bins, x, y, rng=None, method='knn'): """Condtional entropy calculation Parameters diff --git a/mdentropy/core/information.py b/mdentropy/core/information.py index cc1e2df..3f49bed 100644 --- a/mdentropy/core/information.py +++ b/mdentropy/core/information.py @@ -1,7 +1,8 @@ from .entropy import entropy, centropy from ..utils import avgdigamma -from numpy import (atleast_2d, diff, finfo, float32, log, nan_to_num, random, +from numpy import (atleast_2d, diff, finfo, float32, integer, issubdtype, + log, nan_to_num, random, sqrt, hstack) from scipy.spatial import cKDTree @@ -11,7 +12,7 @@ EPS = finfo(float32).eps -def mutinf(n_bins, x, y, rng=None, method='grassberger'): +def mutinf(n_bins, x, y, rng=None, method='knn'): """Mutual information calculation Parameters @@ -73,7 +74,7 @@ def knn_mutinf(x, y, k=None, boxsize=None): return (-a - b + c + d) / log(2) -def nmutinf(n_bins, x, y, rng=None, method='grassberger'): +def nmutinf(n_bins, x, y, rng=None, method='knn'): """Normalized mutual information calculation Parameters @@ -97,7 +98,7 @@ def nmutinf(n_bins, x, y, rng=None, method='grassberger'): entropy(n_bins, [rng], method, y))) -def cmutinf(n_bins, x, y, z, rng=None, method='grassberger'): +def cmutinf(n_bins, x, y, z, rng=None, method='knn'): """Conditional mutual information calculation Parameters @@ -164,13 +165,18 @@ def knn_cmutinf(x, y, z, k=None, boxsize=None): return (-a - b + c + d) / log(2) -def ncmutinf(n_bins, x, y, z, rng=None, method='grassberger'): +def ncmutinf(n_bins, x, y, z, rng=None, method='knn'): """Normalized conditional mutual information calculation Parameters ---------- n_bins : int Number of bins. + + If None, assumes data is pre-binned or a timeseries + of discrete variables. In this case, x, y, and z + must all be of some integer type: "int", "uint8", etc. + x : array_like, shape = (n_samples, n_dim) Conditioned variable y : array_like, shape = (n_samples, n_dim) @@ -186,5 +192,10 @@ def ncmutinf(n_bins, x, y, z, rng=None, method='grassberger'): ncmi : float """ + if n_bins is None: + assert(issubdtype(x.dtype, integer)) + method = 'grassberger' + rng = None + return (cmutinf(n_bins, x, y, z, rng=rng, method=method) / centropy(n_bins, x, z, rng=rng, method=method))