Commit

add benchmark harness
theAeon committed Feb 14, 2023
1 parent 1b8e71e commit f6aeafc
Showing 4 changed files with 253 additions and 30 deletions.
69 changes: 69 additions & 0 deletions contrib/bench_python.py
@@ -0,0 +1,69 @@
import itertools
import pathlib
from os import PathLike
from typing import TypeGuard, Union
import numpy as np
from scipy import io, sparse, optimize as opt
from pyliger.factorization._utilities import nla, nnlsm_blockpivot
from nnlsm_activeset import nnlsm_activeset
import sys

def parse_glob(p: pathlib.Path) -> list[pathlib.Path]:
    if p.is_dir():
        return list(p.glob('*.mtx'))
    else:
        return [p]


def safe_mmread(mt: PathLike) -> Union[sparse.csr_array, None]:
    try:
        return sparse.csr_array(io.mmread(mt).T)
    except ValueError:
        # Files that cannot be parsed as a sparse matrix are skipped;
        # the None entries are filtered out below.
        return None


def is_csr_array(k: sparse.csr_array | None) -> TypeGuard[sparse.csr_array]:
    return isinstance(k, sparse.csr_array)


def _test_nnlsm(A_in, B_in):
    ### 1. Initialization (W, V_i, H_i)
    A = A_in.toarray()
    B = B_in.toarray()
    X_org = A.T.dot(B)
    # B = np.random.rand(m,k)
    # A = np.random.rand(m,n/2)
    # A = np.concatenate((A,A),axis=1)
    # A = A + np.random.rand(m,n)*0.01
    # B = np.random.rand(m,k)
    import time
    start = time.time()
    C1, info = nnlsm_blockpivot(A, B)
    elapsed2 = time.time() - start
    rel_norm2 = nla.norm(C1 - X_org) / nla.norm(X_org)
    print('nnlsm_blockpivot: ', 'OK ' if info[0] else 'Fail',
          'elapsed:{0:.4f} error:{1:.4e}'.format(elapsed2, rel_norm2))
    start = time.time()
    C2, info = nnlsm_activeset(A, B)
    elapsed1 = time.time() - start
    rel_norm1 = nla.norm(C2 - X_org) / nla.norm(X_org)
    print('nnlsm_activeset: ', 'OK ' if info[0] else 'Fail',
          'elapsed:{0:.4f} error:{1:.4e}'.format(elapsed1, rel_norm1))
    start = time.time()
    C3 = np.zeros(X_org.shape)
    for i in range(X_org.shape[1]):
        res = opt.nnls(A, B[:, i])
        C3[:, i] = res[0]
    elapsed3 = time.time() - start
    rel_norm3 = nla.norm(C3 - X_org) / nla.norm(X_org)
    print('scipy.optimize.nnls: ', 'OK ',
          'elapsed:{0:.4f} error:{1:.4e}'.format(elapsed3, rel_norm3))


benchlist = []
assert len(sys.argv) == 3
for i in sys.argv[1:]:
    benchlist.append([path.resolve() for path in parse_glob(pathlib.Path(i))])
benchlist = [safe_mmread(m) for m in itertools.chain.from_iterable(benchlist)]
benchlist = list(filter(is_csr_array, benchlist))
_test_nnlsm(benchlist[0], benchlist[1].T)
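A possible invocation of the harness (illustrative only, not part of the commit; the paths are placeholders): each of the two arguments may be a single Matrix Market file or a directory of *.mtx files, and the first two matrices read are handed to _test_nnlsm, e.g.

python contrib/bench_python.py scaled_sample_A.mtx scaled_sample_B.mtx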
26 changes: 26 additions & 0 deletions contrib/initialize_mtx.py
@@ -0,0 +1,26 @@
from pyliger import create_liger, read_10X_h5, normalize, scale_not_center, select_genes
from pyliger.factorization._online_iNMF import _get_scale_data
from scipy import io, sparse
import numpy as np
from os import listdir

titlelist = []
h5list = []
for file in listdir('../../repos/theAeon/pyliger/h5'):
    titlelist.append(file)
    h5list.append(read_10X_h5(sample_dir='../../repos/theAeon/pyliger/h5',
                              sample_name=file.split(".")[0], file_name=file, backed=True))
for i in range(0, len(h5list)//2):
    j = i + len(h5list)//2
    titlepair = (titlelist[i], titlelist[j])
    mouse_cortex = create_liger([h5list[i], h5list[j]])
    normalize(mouse_cortex)
    select_genes(mouse_cortex, var_thresh=0.2, do_plot=False)
    scale_not_center(mouse_cortex)
    outdata = [_get_scale_data(adata)['scale_data'] for adata in mouse_cortex.adata_list for _ in range(2)]
    outdata = [mat.value.toarray() for mat in outdata]
    outdata = [sparse.coo_array(np.hstack((mat, np.zeros(mat.shape))).T) for mat in outdata]
    for idx in range(len(titlepair)):
        io.mmwrite("%s.mtx" % titlepair[idx], outdata[idx])
    densedata = np.random.uniform(0, 2, (10, 2 * len(mouse_cortex.var_genes)))
    io.mmwrite("%s.dense.mtx" % titlepair[0], densedata)
133 changes: 133 additions & 0 deletions contrib/nnlsm_activeset.py
@@ -0,0 +1,133 @@
import numpy as np
from pyliger.factorization._utilities import normal_eq_comb
import scipy.sparse as sps


def nnlsm_activeset(A, B, overwrite=False, is_input_prod=False, init=None):
""" Nonnegativity-constrained least squares with active-set method and column grouping
Solves min ||AX-B||_2^2 s.t. X >= 0 element-wise.
Algorithm of this routine is close to the one presented in the following paper but
is different in organising inner- and outer-loops:
M. H. Van Benthem and M. R. Keenan, J. Chemometrics 2004; 18: 441-450
Parameters
----------
A : numpy.array, shape (m,n)
B : numpy.array or scipy.sparse matrix, shape (m,k)
Optional Parameters
-------------------
is_input_prod : True/False. - If True, the A and B arguments are interpreted as
AtA and AtB, respectively. Default is False.
init: numpy.array, shape (n,k). - If provided, init is used as an initial value for the algorithm.
Default is None.
Returns
-------
X, (success, Y, num_cholesky, num_eq, num_backup)
X : numpy.array, shape (n,k) - solution
success : True/False - True if the solution is found. False if the algorithm did not terminate
due to numerical errors.
Y : numpy.array, shape (n,k) - Y = A.T * A * X - A.T * B
num_cholesky : int - the number of Cholesky factorizations needed
num_eq : int - the number of linear systems of equations needed to be solved
"""
    if is_input_prod:
        AtA = A
        AtB = B
    else:
        AtA = A.T.dot(A)
        if sps.issparse(B):
            AtB = B.T.dot(A)
            AtB = AtB.T
        else:
            AtB = A.T.dot(B)

    (n, k) = AtB.shape
    MAX_ITER = n * 5
    num_cholesky = 0
    num_eq = 0
    not_opt_set = np.ones([k], dtype=bool)

    if overwrite:
        X, num_cholesky, num_eq = normal_eq_comb(AtA, AtB)
        PassSet = X > 0
        not_opt_set = np.any(X < 0, axis=0)
    elif init is not None:
        X = init
        X[X < 0] = 0
        PassSet = X > 0
    else:
        X = np.zeros([n, k])
        PassSet = np.zeros([n, k], dtype=bool)

    Y = np.zeros([n, k])
    opt_cols = (~not_opt_set).nonzero()[0]
    not_opt_cols = not_opt_set.nonzero()[0]

    Y[:, opt_cols] = AtA.dot(X[:, opt_cols]) - AtB[:, opt_cols]

    big_iter = 0
    success = True
    while not_opt_cols.size > 0:
        big_iter += 1
        if MAX_ITER > 0 and big_iter > MAX_ITER:
            success = False
            break

        (Z, temp_cholesky, temp_eq) = normal_eq_comb(
            AtA, AtB[:, not_opt_cols], PassSet[:, not_opt_cols])
        num_cholesky += temp_cholesky
        num_eq += temp_eq

        Z[abs(Z) < 1e-12] = 0

        infea_subset = Z < 0
        temp = np.any(infea_subset, axis=0)
        infea_subcols = temp.nonzero()[0]
        fea_subcols = (~temp).nonzero()[0]

        if infea_subcols.size > 0:
            infea_cols = not_opt_cols[infea_subcols]

            (ix0, ix1_subsub) = infea_subset[:, infea_subcols].nonzero()
            ix1_sub = infea_subcols[ix1_subsub]
            ix1 = not_opt_cols[ix1_sub]

            X_infea = X[(ix0, ix1)]

            alpha = np.zeros([n, len(infea_subcols)])
            alpha[:] = np.inf
            alpha[(ix0, ix1_subsub)] = X_infea / (X_infea - Z[(ix0, ix1_sub)])
            min_ix = np.argmin(alpha, axis=0)
            min_vals = alpha[(min_ix, range(0, alpha.shape[1]))]

            X[:, infea_cols] = X[:, infea_cols] + \
                (Z[:, infea_subcols] - X[:, infea_cols]) * min_vals
            X[(min_ix, infea_cols)] = 0
            PassSet[(min_ix, infea_cols)] = False

        elif fea_subcols.size > 0:
            fea_cols = not_opt_cols[fea_subcols]

            X[:, fea_cols] = Z[:, fea_subcols]
            Y[:, fea_cols] = AtA.dot(X[:, fea_cols]) - AtB[:, fea_cols]

            Y[abs(Y) < 1e-12] = 0

            not_opt_subset = np.logical_and(
                Y[:, fea_cols] < 0, ~PassSet[:, fea_cols])
            new_opt_cols = fea_cols[np.all(~not_opt_subset, axis=0)]
            update_cols = fea_cols[np.any(not_opt_subset, axis=0)]

            if update_cols.size > 0:
                val = Y[:, update_cols] * ~PassSet[:, update_cols]
                min_ix = np.argmin(val, axis=0)
                PassSet[(min_ix, update_cols)] = True

            not_opt_set[new_opt_cols] = False
            not_opt_cols = not_opt_set.nonzero()[0]

    return X, (success, Y, num_cholesky, num_eq)
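A minimal usage sketch of the routine above (illustrative only, not part of the commit): build a small problem with a known nonnegative solution and recover it.

import numpy as np
from nnlsm_activeset import nnlsm_activeset

rng = np.random.default_rng(0)
A = rng.random((100, 10))        # design matrix, shape (m, n)
X_true = rng.random((10, 5))     # nonnegative ground truth, shape (n, k)
B = A @ X_true                   # observations, shape (m, k)

X, (success, Y, num_cholesky, num_eq) = nnlsm_activeset(A, B)
rel_err = np.linalg.norm(X - X_true) / np.linalg.norm(X_true)
print(success, num_cholesky, num_eq, rel_err)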
55 changes: 25 additions & 30 deletions pyproject.toml
@@ -1,29 +1,28 @@
[build-system]
- build-backend = "pdm.pep517.api"
- requires = [
- "pdm-pep517>=1.0.0",
- ]
+ build-backend = "pdm.pep517.api"
+ requires = ["pdm-pep517>=1.0.0"]

[tool.black]
- exclude = "/(\n \\.eggs\n | \\.git\n | \\.hg\n | \\.mypy_cache\n | \\.nox\n | \\.tox\n | \\.venv\n | _build\n | buck-out\n | build\n | dist\n)/\n"
- include = "\\.pyi?$"
+ exclude = "/(\n \\.eggs\n | \\.git\n | \\.hg\n | \\.mypy_cache\n | \\.nox\n | \\.tox\n | \\.venv\n | _build\n | buck-out\n | build\n | dist\n)/\n"
+ include = "\\.pyi?$"

[tool.pdm]
- package-dir = "src"
-
+ package-dir = "src"
+ [tool.pdm.build]
+ excludes = ["contrib"]
[project]
- authors = [
+ authors = [
{ name = "Joshua Welch", email = "[email protected]" },
{ name = "Lu Lu", email = "[email protected]" },
- ]
- classifiers = [
+ ]
+ classifiers = [
"Development Status :: 4 - Beta",
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3.8",
- ]
- dependencies = [
+ ]
+ dependencies = [
"adjustText>=0.7.3",
"anndata>=0.8.0",
"annoy",
@@ -44,22 +43,18 @@ dependencies = [
"scikit-learn",
"seaborn>=0.12.2",
"umap-learn",
- ]
- description = "The Python version of LIGER package."
- keywords = [
- "LIGER",
- ]
- maintainers = [
- { name = "Andrew Robbins", email = "[email protected]" },
- ]
- name = "pyliger"
- readme = "README.md"
- requires-python = "<3.11, >=3.8"
- version = "0.1.1"
+ ]
+ description = "The Python version of LIGER package."
+ keywords = ["LIGER"]
+ maintainers = [{ name = "Andrew Robbins", email = "[email protected]" }]
+ name = "pyliger"
+ readme = "README.md"
+ requires-python = "<3.11, >=3.8"
+ version = "0.1.1"

- [project.license]
- text = "MIT"
+ [project.license]
+ text = "MIT"

- [project.urls]
- homepage = "https://welch-lab.github.io"
- repository = "https://github.com/welch-lab/pyliger"
+ [project.urls]
+ homepage = "https://welch-lab.github.io"
+ repository = "https://github.com/welch-lab/pyliger"
