From 951cb854389ee2960e806a10752ec8ead4d13e15 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Tue, 11 Dec 2018 16:38:31 -0500 Subject: [PATCH 01/32] [ENH] Adds layout for PLS examples Including functions for loading example datasets and structure for adding new examples --- pyls/examples/__init__.py | 3 + pyls/examples/datasets.json | 10 +++ pyls/examples/datasets.py | 126 ++++++++++++++++++++++++++++++++++++ pyls/info.py | 3 +- 4 files changed, 141 insertions(+), 1 deletion(-) create mode 100644 pyls/examples/__init__.py create mode 100644 pyls/examples/datasets.json create mode 100644 pyls/examples/datasets.py diff --git a/pyls/examples/__init__.py b/pyls/examples/__init__.py new file mode 100644 index 0000000..e40fc85 --- /dev/null +++ b/pyls/examples/__init__.py @@ -0,0 +1,3 @@ +__all__ = ['available_datasets', 'load_dataset'] + +from .datasets import available_datasets, load_dataset diff --git a/pyls/examples/datasets.json b/pyls/examples/datasets.json new file mode 100644 index 0000000..ecbe555 --- /dev/null +++ b/pyls/examples/datasets.json @@ -0,0 +1,10 @@ +{ + "whitaker_vertes_2016": { + "urls": [ + "https://raw.githubusercontent.com/KirstieJane/NSPN_WhitakerVertes_PNAS2016/master/DATA/PLS_gene_predictor_vars.csv", + "https://raw.githubusercontent.com/KirstieJane/NSPN_WhitakerVertes_PNAS2016/master/CT_MT_ANALYSES/COMPLETE/PLS/COVARS_none/PLS_MRI_response_vars.csv" + ], + "X": "PLS_gene_predictor_vars.csv", + "Y": "PLS_MRI_response_vars.csv" + } +} diff --git a/pyls/examples/datasets.py b/pyls/examples/datasets.py new file mode 100644 index 0000000..e11849d --- /dev/null +++ b/pyls/examples/datasets.py @@ -0,0 +1,126 @@ +# -*- coding: utf-8 -*- +""" +Functions and utilities for getting datasets for PLS examples +""" + +import json +import os +from pkg_resources import resource_filename +import requests +import urllib + +import pandas as pd + +from ..structures import PLSInputs + +with open(resource_filename('pyls', 'examples/datasets.json'), 'r') as src: 
+ _DATASETS = json.load(src) + + +def available_datasets(): + """ + Lists available datasets to download + + Returns + ------- + datasets : list + List of available datasets + """ + + return list(_DATASETS.keys()) + + +def _get_data_dir(data_dir=None): + """ + Gets path to pyls data directory + + Parameters + ---------- + data_dir : str, optional + Path to use as data directory. If not specified, will check for + environmental variable 'PYLS_DATA'; if that is not set, will use + `~/pyls-data` instead. Default: None + + Returns + ------- + data_dir : str + Path to use as data directory + """ + + if data_dir is None: + data_dir = os.environ.get('PYLS_DATA', os.path.join('~', 'pyls-data')) + data_dir = os.path.expanduser(data_dir) + if not os.path.exists(data_dir): + os.makedirs(data_dir) + + return data_dir + + +def load_dataset(name, data_dir=None, verbose=1): + """ + Loads dataset provided by `name` into a :obj:`PLSInputs` object + + Parameters + ---------- + name : str + Name of dataset. Must be one of the datasets listed in + :func:`pyls.examples.available_datasets()` + data_dir : str, optional + Path to use as data directory to store dataset. If not specified, will + check for environmental variable 'PYLS_DATA'; if that is not set, will + use `~/pyls-data` instead. Default: None + verbose : int, optional + Level of verbosity for status messages about fetching/loading dataset. + Set to 0 for no updates. Default: 1 + + Returns + ------- + dataset : :obj:`~.structures.PLSInputs` + PLSInputs object containing pre-loaded data ready to run PLS analysis. + Rerun the analysis by calling :func:`pyls.behavioral_pls(**dataset)` or + :func:`pyls.meancentered_pls(**dataset)`, as appropriate + """ + + if name not in available_datasets(): + raise ValueError('Provided dataset {} not available. 
Must be one of {}' + .format(name, available_datasets())) + + data_path = os.path.join(_get_data_dir(data_dir), name) + _get_dataset(name, data_path, verbose=verbose) + + dataset = PLSInputs() + for key, value in _DATASETS.get(name, {}).items(): + if isinstance(value, str): + value = pd.read_csv(os.path.join(data_path, value), index_col=0) + dataset[key] = value + + # make some dataset-specific corrections + if name == 'whitaker_vertes_2016': + dataset.X = dataset.X.T + + return dataset + + +def _get_dataset(name, data_dir, verbose=1): + """ + Downloads dataset defined by `name` + + Parameters + ---------- + name : str + Name of dataset. Must be one of the datasets listed in + :func:`pyls.examples.available_datasets()` + data_dir : str + Path to use as data directory to store dataset + """ + + os.makedirs(data_dir, exist_ok=True) + + for url in _DATASETS.get(name, {}).get('urls', []): + parse = urllib.parse.urlparse(url) + fname = os.path.join(data_dir, os.path.basename(parse.path)) + + if not os.path.exists(fname): + out = requests.get(url) + with open(fname, 'w') as dest: + dest.write(out.text) diff --git a/pyls/info.py b/pyls/info.py index c9b8c32..0f2b4da 100644 --- a/pyls/info.py +++ b/pyls/info.py @@ -46,7 +46,8 @@ PACKAGE_DATA = { 'pyls': [ - 'tests/data/*' + 'tests/data/*mat', + 'examples/datasets.json' ] } From dce7c2237cb27576d859289db79c1802bac88b32 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Mon, 17 Dec 2018 15:45:36 -0500 Subject: [PATCH 02/32] [ENH] Start of Mirchi et al., 2018 replication Found the URLs required! Will need to rethink how I want to do this since the FC data are all separate files and the behavioral data requires some wrangling... 
--- pyls/examples/datasets.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyls/examples/datasets.json b/pyls/examples/datasets.json index ecbe555..5e2416c 100644 --- a/pyls/examples/datasets.json +++ b/pyls/examples/datasets.json @@ -1,4 +1,10 @@ { + "mirchi_2018": { + "urls": [ + "https://s3.amazonaws.com/openneuro/ds000031/ds000031_R1.0.4/uncompressed/sub-01/sub-01_sessions.tsv", + "https://s3.amazonaws.com/openneuro/ds000031/ds000031_R1.0.2/uncompressed/derivatives/sub-01/ses-{0:03d}/sub-01_ses-{0:03d}_task-rest_run-001_parcel-timeseries.txt" + ] + }, "whitaker_vertes_2016": { "urls": [ "https://raw.githubusercontent.com/KirstieJane/NSPN_WhitakerVertes_PNAS2016/master/DATA/PLS_gene_predictor_vars.csv", From 20d6ecc3becc4d3f8caf92756915962a7ec9ca42 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Tue, 18 Dec 2018 13:12:54 -0500 Subject: [PATCH 03/32] [ENH] Mirchi et al., 2018 added as full example Now fully loadable (files uploaded to Dropbox -- will have to host scripts for creation elsewhere). 
--- pyls/examples/datasets.json | 15 +++++++++++---- pyls/examples/datasets.py | 17 ++++++++++++++--- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/pyls/examples/datasets.json b/pyls/examples/datasets.json index 5e2416c..9592837 100644 --- a/pyls/examples/datasets.json +++ b/pyls/examples/datasets.json @@ -1,9 +1,15 @@ { "mirchi_2018": { "urls": [ - "https://s3.amazonaws.com/openneuro/ds000031/ds000031_R1.0.4/uncompressed/sub-01/sub-01_sessions.tsv", - "https://s3.amazonaws.com/openneuro/ds000031/ds000031_R1.0.2/uncompressed/derivatives/sub-01/ses-{0:03d}/sub-01_ses-{0:03d}_task-rest_run-001_parcel-timeseries.txt" - ] + "https://www.dropbox.com/s/29pmo4uf19go442/myconnectome_fc.npy?dl=1", + "https://www.dropbox.com/s/w7px20kxwvqx1d1/myconnectome_panas.csv?dl=1" + ], + "X": "myconnectome_fc.npy", + "Y": "myconnectome_panas.csv", + "n_perm": 10000, + "n_boot": 10000, + "test_size": 0.25, + "test_split": 100 }, "whitaker_vertes_2016": { "urls": [ @@ -11,6 +17,7 @@ "https://raw.githubusercontent.com/KirstieJane/NSPN_WhitakerVertes_PNAS2016/master/CT_MT_ANALYSES/COMPLETE/PLS/COVARS_none/PLS_MRI_response_vars.csv" ], "X": "PLS_gene_predictor_vars.csv", - "Y": "PLS_MRI_response_vars.csv" + "Y": "PLS_MRI_response_vars.csv", + "n_perm": 1000 } } diff --git a/pyls/examples/datasets.py b/pyls/examples/datasets.py index e11849d..aaf0efa 100644 --- a/pyls/examples/datasets.py +++ b/pyls/examples/datasets.py @@ -9,6 +9,7 @@ import requests import urllib +import numpy as np import pandas as pd from ..structures import PLSInputs @@ -91,7 +92,17 @@ def load_dataset(name, data_dir=None, verbose=1): dataset = PLSInputs() for key, value in _DATASETS.get(name, {}).items(): if isinstance(value, str): - value = pd.read_csv(os.path.join(data_path, value), index_col=0) + fname = os.path.join(data_path, value) + if fname.endswith('.csv'): + value = pd.read_csv(fname, index_col=0) + elif fname.endswith('.txt'): + value = np.loadtxt(fname) + elif fname.endswith('.npy'): + 
value = np.load(fname) + else: + raise ValueError('Cannot recognize datatype of {}. Please ' + 'create an issue on GitHub with dataset you ' + 'are trying to load ({})'.format(fname, name)) dataset[key] = value # make some dataset-specific corrections @@ -122,5 +133,5 @@ def _get_dataset(name, data_dir, verbose=1): if not os.path.exists(fname): out = requests.get(url) - with open(fname, 'w') as dest: - dest.write(out.text) + with open(fname, 'wb') as dest: + dest.write(out.content) From a9ec5c222b165d6d9005460fd7e4692b750f6e2e Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Tue, 18 Dec 2018 13:32:59 -0500 Subject: [PATCH 04/32] [ENH] Adds reference and description for examples As well as ability to query that information with `query_dataset()` --- pyls/examples/__init__.py | 4 +-- pyls/examples/datasets.json | 4 +++ pyls/examples/datasets.py | 49 ++++++++++++++++++++++++++++++++----- 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/pyls/examples/__init__.py b/pyls/examples/__init__.py index e40fc85..a544bc0 100644 --- a/pyls/examples/__init__.py +++ b/pyls/examples/__init__.py @@ -1,3 +1,3 @@ -__all__ = ['available_datasets', 'load_dataset'] +__all__ = ['available_datasets', 'load_dataset', 'query_dataset'] -from .datasets import available_datasets, load_dataset +from .datasets import available_datasets, load_dataset, query_dataset diff --git a/pyls/examples/datasets.json b/pyls/examples/datasets.json index 9592837..c01f003 100644 --- a/pyls/examples/datasets.json +++ b/pyls/examples/datasets.json @@ -1,5 +1,7 @@ { "mirchi_2018": { + "description": "Study examining the relationship between changes in functional brain connectivity derived from resting-state functional magnetic resonance imaging (rsfMRI) and behavioral mood scores using the MyConnectome database.", + "reference": "Mirchi, N., Betzel, R. F., Bernhardt, B. C., Dagher, A., & Mišić, B. (2018). Tracking mood fluctuations with functional network patterns. 
Social Cognitive and Affective Neuroscience.", "urls": [ "https://www.dropbox.com/s/29pmo4uf19go442/myconnectome_fc.npy?dl=1", "https://www.dropbox.com/s/w7px20kxwvqx1d1/myconnectome_panas.csv?dl=1" @@ -12,6 +14,8 @@ "test_split": 100 }, "whitaker_vertes_2016": { + "description": "Study examining the relationship between developmental brain changes derived from structural magnetic resonance imaging (sMRI) and genetic expression in the brain using the NeuroScience in Psychiatry Network (NSPN) dataset", + "reference": "Whitaker, K. J., Vértes, P. E., Romero-Garcia, R., Váša, F., Moutoussis, M., Prabhu, G., Weiskopf, N., Callaghan, M. F., Wagstyl, K., Rittman, T., Tait, R., Ooi, C., Suckling, J., Inkster, B., Fonagy, P., Dolan, R. J., Jones, P. B., Goodyer, I. M., Bullmore, E. T. (2016). Adolescence is associated with genomically patterned consolidation of the hubs of the human brain connectome. Proceedings of the National Academy of Sciences, 113(32), 9105-9110.", "urls": [ "https://raw.githubusercontent.com/KirstieJane/NSPN_WhitakerVertes_PNAS2016/master/DATA/PLS_gene_predictor_vars.csv", "https://raw.githubusercontent.com/KirstieJane/NSPN_WhitakerVertes_PNAS2016/master/CT_MT_ANALYSES/COMPLETE/PLS/COVARS_none/PLS_MRI_response_vars.csv" diff --git a/pyls/examples/datasets.py b/pyls/examples/datasets.py index aaf0efa..dbf28a4 100644 --- a/pyls/examples/datasets.py +++ b/pyls/examples/datasets.py @@ -31,6 +31,39 @@ def available_datasets(): return list(_DATASETS.keys()) +def query_dataset(name, key='description'): + """ + Queries dataset `name` for information specified by `key` + + Parameters + ---------- + name : str + Name of dataset. Must be in :func:`pyls.examples.available_datasets()` + key : str, optional + Key to query from `name`. If not specified will return a list of + available keys. 
Default: 'description' + + Returns + ------- + value + Value specified by `key` for dataset `name` + """ + + if name not in available_datasets(): + raise ValueError('Provided dataset {} not available. Must be one of {}' + .format(name, available_datasets())) + if key is None: + return list(_DATASETS.get(name).keys()) + + value = _DATASETS.get(name).get(key, None) + if value is None: + raise KeyError('Provided key {} not specified for dataset {}. ' + 'Available keys are {}' + .format(name, key, list(_DATASETS.get(name).keys()))) + + return value + + def _get_data_dir(data_dir=None): """ Gets path to pyls data directory @@ -57,15 +90,14 @@ def _get_data_dir(data_dir=None): return data_dir -def load_dataset(name, data_dir=None, verbose=1): +def load_dataset(name, data_dir=None, verbose=1, return_reference=False): """ Loads dataset provided by `name` into a :obj:`PLSInputs` object Parameters ---------- name : str - Name of dataset. Must be one of the datasets listed in - :func:`pyls.examples.available_datasets()` + Name of dataset. Must be in :func:`pyls.examples.available_datasets()` data_dir : str, optional Path to use as data directory to store dataset. If not specified, will check for environmental variable 'PYLS_DATA'; if that is not set, will @@ -73,6 +105,9 @@ def load_dataset(name, data_dir=None, verbose=1): verbose : int, optional Level of verbosity for status messages about fetching/loading dataset. Set to 0 for no updates. Default: 1 + return_reference : bool, optional + Whether to return APA-style reference for dataset specified by `name`. 
+ Default: False Returns ------- @@ -91,7 +126,7 @@ def load_dataset(name, data_dir=None, verbose=1): dataset = PLSInputs() for key, value in _DATASETS.get(name, {}).items(): - if isinstance(value, str): + if isinstance(value, str) and value in PLSInputs.allowed: fname = os.path.join(data_path, value) if fname.endswith('.csv'): value = pd.read_csv(fname, index_col=0) @@ -109,6 +144,9 @@ def load_dataset(name, data_dir=None, verbose=1): if name == 'whitaker_vertes_2016': dataset.X = dataset.X.T + if return_reference: + return dataset, query_dataset(name, 'reference') + return dataset @@ -119,8 +157,7 @@ def _get_dataset(name, data_dir, verbose=1): Parameters ---------- name : str - Name of dataset. Must be one of the datasets listed in - :func:`pyls.examples.available_datasets()` + Name of dataset. Must be in :func:`pyls.examples.available_datasets()` data_dir : str Path to use as data directory to store dataset """ From aa0dab28146f4c01c267a3046248ea4ea09df13a Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Tue, 18 Dec 2018 16:07:15 -0500 Subject: [PATCH 05/32] [ENH] Updates examples.available_datasets() --- pyls/__init__.py | 3 ++- pyls/examples/datasets.json | 6 ++++-- pyls/examples/datasets.py | 30 +++++++++++++++++------------- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/pyls/__init__.py b/pyls/__init__.py index 4a63c08..8eeea3d 100644 --- a/pyls/__init__.py +++ b/pyls/__init__.py @@ -4,7 +4,7 @@ '__author__', '__description__', '__email__', '__license__', '__maintainer__', '__packagename__', '__url__', '__version__', 'behavioral_pls', 'meancentered_pls', 'import_matlab_result', - 'PLSInputs', 'PLSResults', 'save_results', 'load_results' + 'PLSInputs', 'PLSResults', 'save_results', 'load_results', 'examples' ] from ._version import get_versions @@ -21,6 +21,7 @@ __url__, ) +from . 
import examples from .io import load_results, save_results from .matlab import import_matlab_result from .structures import PLSInputs, PLSResults diff --git a/pyls/examples/datasets.json b/pyls/examples/datasets.json index c01f003..1484229 100644 --- a/pyls/examples/datasets.json +++ b/pyls/examples/datasets.json @@ -4,14 +4,16 @@ "reference": "Mirchi, N., Betzel, R. F., Bernhardt, B. C., Dagher, A., & Mišić, B. (2018). Tracking mood fluctuations with functional network patterns. Social Cognitive and Affective Neuroscience.", "urls": [ "https://www.dropbox.com/s/29pmo4uf19go442/myconnectome_fc.npy?dl=1", - "https://www.dropbox.com/s/w7px20kxwvqx1d1/myconnectome_panas.csv?dl=1" + "https://www.dropbox.com/s/w7px20kxwvqx1d1/myconnectome_panas.csv?dl=1", + "http://web.stanford.edu/group/poldracklab/myconnectome-data/base/parcellation/parcel_data.txt" ], "X": "myconnectome_fc.npy", "Y": "myconnectome_panas.csv", "n_perm": 10000, "n_boot": 10000, "test_size": 0.25, - "test_split": 100 + "test_split": 100, + "parcellation": "parcel_data.txt" }, "whitaker_vertes_2016": { "description": "Study examining the relationship between developmental brain changes derived from structural magnetic resonance imaging (sMRI) and genetic expression in the brain using the NeuroScience in Psychiatry Network (NSPN) dataset", diff --git a/pyls/examples/datasets.py b/pyls/examples/datasets.py index dbf28a4..7764dc8 100644 --- a/pyls/examples/datasets.py +++ b/pyls/examples/datasets.py @@ -18,7 +18,7 @@ _DATASETS = json.load(src) -def available_datasets(): +def available_datasets(name=None): """ Lists available datasets to download @@ -28,6 +28,14 @@ def available_datasets(): List of available datasets """ + if name is not None: + if name not in _DATASETS.keys(): + raise ValueError('Provided dataset {} is not available. Dataset ' + 'must be one of: {}.' 
+ .format(name, available_datasets())) + else: + return name + return list(_DATASETS.keys()) @@ -49,9 +57,7 @@ def query_dataset(name, key='description'): Value specified by `key` for dataset `name` """ - if name not in available_datasets(): - raise ValueError('Provided dataset {} not available. Must be one of {}' - .format(name, available_datasets())) + name = available_datasets(name) if key is None: return list(_DATASETS.get(name).keys()) @@ -117,17 +123,14 @@ def load_dataset(name, data_dir=None, verbose=1, return_reference=False): :func:`pyls.meancentered_pls(**dataset)`, as appropriate """ - if name not in available_datasets(): - raise ValueError('Provided dataset {} not available. Must be one of {}' - .format(name, available_datasets())) - - data_path = os.path.join(_get_data_dir(data_dir), name) - _get_dataset(name, data_path, verbose=verbose) + name = available_datasets(name) + data_dir = _get_data_dir(data_dir) + _get_dataset(name, data_dir, verbose=verbose) dataset = PLSInputs() for key, value in _DATASETS.get(name, {}).items(): - if isinstance(value, str) and value in PLSInputs.allowed: - fname = os.path.join(data_path, value) + if isinstance(value, str) and key in PLSInputs.allowed: + fname = os.path.join(data_dir, name, value) if fname.endswith('.csv'): value = pd.read_csv(fname, index_col=0) elif fname.endswith('.txt'): @@ -150,7 +153,7 @@ def load_dataset(name, data_dir=None, verbose=1, return_reference=False): return dataset -def _get_dataset(name, data_dir, verbose=1): +def _get_dataset(name, data_dir=None, verbose=1): """ Downloads dataset defined by `name` @@ -162,6 +165,7 @@ def _get_dataset(name, data_dir, verbose=1): Path to use as data directory to store dataset """ + data_dir = os.path.join(_get_data_dir(data_dir), name) os.makedirs(data_dir, exist_ok=True) for url in _DATASETS.get(name, {}).get('urls', []): From 928b790b18b88ae7b2f43db895c4f8504f2e3dd8 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Tue, 18 Dec 2018 16:07:32 -0500 
Subject: [PATCH 06/32] [TEST] Adds tests for pyls.examples --- pyls/tests/test_examples.py | 78 +++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 pyls/tests/test_examples.py diff --git a/pyls/tests/test_examples.py b/pyls/tests/test_examples.py new file mode 100644 index 0000000..870a373 --- /dev/null +++ b/pyls/tests/test_examples.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- + +import os +import pytest +import pyls + +DATASETS = [ + 'mirchi_2018', 'whitaker_vertes_2016' +] + + +def test_available_datasets(): + # make sure we get a list of strings when called with no arguments + avail = pyls.examples.available_datasets() + assert isinstance(avail, list) + assert all([isinstance(f, str) for f in avail]) + + # check that we get all expected datasets back + assert len(set(DATASETS) - set(avail)) == 0 + + # check that we can supply dataset names to function to confirm validity + for f in DATASETS: + assert f == pyls.examples.available_datasets(f) + + # check that providing non-valid dataset name errors + with pytest.raises(ValueError): + pyls.examples.available_datasets('thisisnotadataset') + pyls.exampleipyts.available_datasets(10) + + +@pytest.mark.parametrize(('dataset', 'keys'), [ + ('mirchi_2018', [ + 'description', 'reference', 'urls', 'X', 'Y', + 'n_perm', 'n_boot', 'test_size', 'test_split', 'parcellation' + ]), + ('whitaker_vertes_2016', [ + 'description', 'reference', 'urls', 'X', 'Y', 'n_perm' + ]) +]) +def test_query_dataset(dataset, keys): + # check that default return string (description) + assert isinstance(pyls.examples.query_dataset(dataset), str) + # check that supplying None returns all available keys + assert pyls.examples.query_dataset(dataset, None) == keys + # check that all valid keys return something + for k in keys: + assert pyls.examples.query_dataset(dataset, k) is not None + # check nonsense keys + for k in ['notakey', 10, 20.5132]: + with pytest.raises(KeyError): + 
pyls.examples.query_dataset(dataset, k) + + +def test_get_data_dir(tmpdir): + # check that default (no arguments) returns valid default directory + data_dir = pyls.examples.datasets._get_data_dir() + assert isinstance(data_dir, str) + assert os.path.exists(data_dir) + assert os.path.basename(data_dir) == 'pyls-data' + + # check supplying directory returns same directory + assert pyls.examples.datasets._get_data_dir(tmpdir) == str(tmpdir) + assert os.path.exists(str(tmpdir)) + + # check that _get_data_dir() pulls from environmental variable correctly + os.environ['PYLS_DATA'] = str(tmpdir) + assert pyls.examples.datasets._get_data_dir() == str(tmpdir) + + +@pytest.mark.parametrize(('dataset', 'keys'), [ + ('mirchi_2018', ['X', 'Y', 'n_perm', 'n_boot', 'test_size', 'test_split']), + ('whitaker_vertes_2016', ['X', 'Y', 'n_perm']) +]) +def test_load_dataset(tmpdir, dataset, keys): + ds = pyls.examples.load_dataset(dataset, tmpdir) + assert isinstance(ds, pyls.structures.PLSInputs) + for k in keys: + assert hasattr(ds, k) and getattr(ds, k) is not None From 748b769a7f9441da282db92449ea9207f8618b61 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Tue, 18 Dec 2018 16:20:08 -0500 Subject: [PATCH 07/32] [TEST] Pandas optional --- .travis.yml | 8 +++++--- pyls/examples/datasets.py | 12 ++++++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index b7158ce..3b6001b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,14 +15,13 @@ env: matrix: - LINTING=1 - DOCTEST=1 - - JOBLIB=1 + - JOBLIB=1 PANDAS=1 matrix: include: - python: 3.6 env: - - COVERAGE=1 - - JOBLIB=1 + - COVERAGE=1 JOBLIB=1 PANDAS=1 - python: 3.7 dist: xenial sudo: required @@ -43,6 +42,9 @@ before_install: - if [ "${JOBLIB}" == "1" ]; then pip install joblib; fi + - if [ "${PANDAS}" == "1" ]; then + pip install pandas; + fi install: - if [ "${DOCTEST}" == "1" ]; then diff --git a/pyls/examples/datasets.py b/pyls/examples/datasets.py index 7764dc8..ee7fb7d 100644 --- 
a/pyls/examples/datasets.py +++ b/pyls/examples/datasets.py @@ -10,10 +10,15 @@ import urllib import numpy as np -import pandas as pd from ..structures import PLSInputs +try: + import pandas as pd + pandas_avail = True +except ImportError: + pandas_avail = False + with open(resource_filename('pyls', 'examples/datasets.json'), 'r') as src: _DATASETS = json.load(src) @@ -132,7 +137,10 @@ def load_dataset(name, data_dir=None, verbose=1, return_reference=False): if isinstance(value, str) and key in PLSInputs.allowed: fname = os.path.join(data_dir, name, value) if fname.endswith('.csv'): - value = pd.read_csv(fname, index_col=0) + if pandas_avail: + value = pd.read_csv(fname, index_col=0) + else: + value = np.loadtxt(fname, skiprows=1, delimiter=',')[:, 1:] elif fname.endswith('.txt'): value = np.loadtxt(fname) elif fname.endswith('.npy'): From 70ce8540a1547d4b2d90162f40cc8739623d6f49 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Tue, 18 Dec 2018 16:43:42 -0500 Subject: [PATCH 08/32] [FIX] Removes requests module from pyls.examples Can just use urllib for the little that we need --- pyls/examples/datasets.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyls/examples/datasets.py b/pyls/examples/datasets.py index ee7fb7d..01a11e7 100644 --- a/pyls/examples/datasets.py +++ b/pyls/examples/datasets.py @@ -6,7 +6,6 @@ import json import os from pkg_resources import resource_filename -import requests import urllib import numpy as np @@ -181,6 +180,6 @@ def _get_dataset(name, data_dir=None, verbose=1): fname = os.path.join(data_dir, os.path.basename(parse.path)) if not os.path.exists(fname): - out = requests.get(url) + out = urllib.request.urlopen(url) with open(fname, 'wb') as dest: dest.write(out.content) From 4792d9f36fd457f74a8a57ed2fd397a1cdcfcb4c Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Tue, 18 Dec 2018 20:42:18 -0500 Subject: [PATCH 09/32] [FIX] Urllib errors and py3.5 dicts --- pyls/examples/datasets.py | 2 +- 
pyls/tests/test_examples.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyls/examples/datasets.py b/pyls/examples/datasets.py index 01a11e7..ea3c56a 100644 --- a/pyls/examples/datasets.py +++ b/pyls/examples/datasets.py @@ -182,4 +182,4 @@ def _get_dataset(name, data_dir=None, verbose=1): if not os.path.exists(fname): out = urllib.request.urlopen(url) with open(fname, 'wb') as dest: - dest.write(out.content) + dest.write(out.read()) diff --git a/pyls/tests/test_examples.py b/pyls/tests/test_examples.py index 870a373..c6c89ef 100644 --- a/pyls/tests/test_examples.py +++ b/pyls/tests/test_examples.py @@ -41,7 +41,7 @@ def test_query_dataset(dataset, keys): # check that default return string (description) assert isinstance(pyls.examples.query_dataset(dataset), str) # check that supplying None returns all available keys - assert pyls.examples.query_dataset(dataset, None) == keys + assert set(pyls.examples.query_dataset(dataset, None)) == set(keys) # check that all valid keys return something for k in keys: assert pyls.examples.query_dataset(dataset, k) is not None @@ -59,7 +59,7 @@ def test_get_data_dir(tmpdir): assert os.path.basename(data_dir) == 'pyls-data' # check supplying directory returns same directory - assert pyls.examples.datasets._get_data_dir(tmpdir) == str(tmpdir) + assert pyls.examples.datasets._get_data_dir(str(tmpdir)) == str(tmpdir) assert os.path.exists(str(tmpdir)) # check that _get_data_dir() pulls from environmental variable correctly @@ -72,7 +72,7 @@ def test_get_data_dir(tmpdir): ('whitaker_vertes_2016', ['X', 'Y', 'n_perm']) ]) def test_load_dataset(tmpdir, dataset, keys): - ds = pyls.examples.load_dataset(dataset, tmpdir) + ds = pyls.examples.load_dataset(dataset, str(tmpdir)) assert isinstance(ds, pyls.structures.PLSInputs) for k in keys: assert hasattr(ds, k) and getattr(ds, k) is not None From 301c338eba1cf6bbe36df4735dd7a1893b574dae Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Tue, 18 Dec 2018 21:37:20 
-0500 Subject: [PATCH 10/32] [FIX] Fixes issues in np.loadtxt with exampls Have to use np.genfromtxt with CSV files that would otherwise be loaded with pandas (if pandas isn't available). --- pyls/examples/datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyls/examples/datasets.py b/pyls/examples/datasets.py index ea3c56a..148e272 100644 --- a/pyls/examples/datasets.py +++ b/pyls/examples/datasets.py @@ -139,7 +139,8 @@ def load_dataset(name, data_dir=None, verbose=1, return_reference=False): if pandas_avail: value = pd.read_csv(fname, index_col=0) else: - value = np.loadtxt(fname, skiprows=1, delimiter=',')[:, 1:] + value = np.genfromtxt(fname, skip_header=True, + delimiter=',')[:, 1:] elif fname.endswith('.txt'): value = np.loadtxt(fname) elif fname.endswith('.npy'): From a718da6e61892a6823b23bb0a738ae20d5d9302a Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Tue, 18 Dec 2018 22:38:01 -0500 Subject: [PATCH 11/32] [TEST] Minor examples test updates Wasn't testing a few lines of code that really should have been tested... 
--- pyls/examples/datasets.py | 4 +--- pyls/tests/test_examples.py | 9 ++++++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pyls/examples/datasets.py b/pyls/examples/datasets.py index 148e272..2b80127 100644 --- a/pyls/examples/datasets.py +++ b/pyls/examples/datasets.py @@ -135,14 +135,12 @@ def load_dataset(name, data_dir=None, verbose=1, return_reference=False): for key, value in _DATASETS.get(name, {}).items(): if isinstance(value, str) and key in PLSInputs.allowed: fname = os.path.join(data_dir, name, value) - if fname.endswith('.csv'): + if fname.endswith('.csv') or fname.endswith('.txt'): if pandas_avail: value = pd.read_csv(fname, index_col=0) else: value = np.genfromtxt(fname, skip_header=True, delimiter=',')[:, 1:] - elif fname.endswith('.txt'): - value = np.loadtxt(fname) elif fname.endswith('.npy'): value = np.load(fname) else: diff --git a/pyls/tests/test_examples.py b/pyls/tests/test_examples.py index c6c89ef..41ad6cd 100644 --- a/pyls/tests/test_examples.py +++ b/pyls/tests/test_examples.py @@ -23,9 +23,9 @@ def test_available_datasets(): assert f == pyls.examples.available_datasets(f) # check that providing non-valid dataset name errors - with pytest.raises(ValueError): - pyls.examples.available_datasets('thisisnotadataset') - pyls.exampleipyts.available_datasets(10) + for f in ['thisisnotadataset', 10]: + with pytest.raises(ValueError): + pyls.examples.available_datasets(f) @pytest.mark.parametrize(('dataset', 'keys'), [ @@ -76,3 +76,6 @@ def test_load_dataset(tmpdir, dataset, keys): assert isinstance(ds, pyls.structures.PLSInputs) for k in keys: assert hasattr(ds, k) and getattr(ds, k) is not None + ds, ref = pyls.examples.load_dataset(dataset, str(tmpdir), + return_reference=True) + assert isinstance(ref, str) From 2f607cd28f031a7316a6f21eb4366f0cb2705a76 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Tue, 18 Dec 2018 23:05:46 -0500 Subject: [PATCH 12/32] [DOC] Outline for user documentation --- docs/api.rst | 17 
+++++++++++++---- docs/conf.py | 8 ++------ docs/index.rst | 26 +++----------------------- docs/installation.rst | 19 +++++++++++++++++++ docs/usage.rst | 16 ++++++++++++---- docs/user_guide/behavioral.rst | 4 ++++ docs/user_guide/matlab.rst | 4 ++++ docs/user_guide/meancentered.rst | 4 ++++ docs/user_guide/overview.rst | 4 ++++ docs/user_guide/results.rst | 4 ++++ 10 files changed, 69 insertions(+), 37 deletions(-) create mode 100644 docs/installation.rst create mode 100644 docs/user_guide/behavioral.rst create mode 100644 docs/user_guide/matlab.rst create mode 100644 docs/user_guide/meancentered.rst create mode 100644 docs/user_guide/overview.rst create mode 100644 docs/user_guide/results.rst diff --git a/docs/api.rst b/docs/api.rst index b5b2399..3705b3c 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,20 +1,29 @@ -.. _api_ref: +.. _api: .. currentmodule:: pyls API === -PLS Decompositions +PLS decompositions ------------------ .. autofunction:: pyls.behavioral_pls .. autofunction:: pyls.meancentered_pls -PLS Results ------------ +PLS results objects +------------------- .. autoclass:: pyls.structures.PLSResults .. autoclass:: pyls.structures.PLSPermResults .. autoclass:: pyls.structures.PLSBootResults .. autoclass:: pyls.structures.PLSSplitHalfResults .. autoclass:: pyls.structures.PLSCrossValidationResults .. autoclass:: pyls.structures.PLSInputs + +Results I/O +----------- +.. autofunction:: pyls.save_results +.. autofunction:: pyls.load_results + +Matlab compatibility +-------------------- +.. 
autofunction:: pyls.import_matlab_result diff --git a/docs/conf.py b/docs/conf.py index 0ed48b9..a347e12 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,8 +15,8 @@ # -- Project information ----------------------------------------------------- project = 'pyls' -copyright = '2018, Ross Markello' -author = 'Ross Markello' +copyright = '2018, pyls developers' +author = 'pyls developers' # Import project to get version info sys.path.insert(0, os.path.abspath(os.path.pardir)) @@ -26,7 +26,6 @@ # The full version, including alpha/beta/rc tags release = pyls.__version__ - # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be @@ -71,7 +70,6 @@ # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' - # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for @@ -91,13 +89,11 @@ # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] - # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. htmlhelp_basename = 'pylsdoc' - # -- Extension configuration ------------------------------------------------- intersphinx_mapping = { 'numpy': ('https://docs.scipy.org/doc/numpy', None), diff --git a/docs/index.rst b/docs/index.rst index 56667d7..af0ee12 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,6 +3,8 @@ pyls: Partial Least Squares in Python ===================================== .. image:: https://travis-ci.org/rmarkello/pyls.svg?branch=master :target: https://travis-ci.org/rmarkello/pyls +.. image:: https://circleci.com/gh/rmarkello/pyls.svg?style=shield + :target: https://circleci.com/gh/rmarkello/pyls .. image:: https://codecov.io/gh/rmarkello/pyls/branch/master/graph/badge.svg :target: https://codecov.io/gh/rmarkello/pyls .. 
image:: https://readthedocs.org/projects/pyls/badge/?version=latest @@ -10,31 +12,9 @@ pyls: Partial Least Squares in Python .. image:: http://img.shields.io/badge/License-GPL%202.0-blue.svg :target: https://opensource.org/licenses/GPL-2.0 -About ------ -``pyls`` is a Python implementation of Partial Least Squares correlation as -introduced by `McIntosh et al., (1996)`_ in their `MATLAB toolbox`_. - -Usage ------ -See the :ref:`Usage ` or the :ref:`API reference ` for -information on how to get started with this package. - - -License Information -------------------- -This codebase is licensed under version 2 of the GNU General Public License. -The full license can be found in the `LICENSE`_ file in the ``pyls`` -distribution. - -All trademarks referenced herein are property of their respective holders. - -.. _McIntosh et al., (1996): https://www.ncbi.nlm.nih.gov/pubmed/9345485 -.. _MATLAB toolbox: https://www.rotman-baycrest.on.ca/index.php?section=84 -.. _LICENSE: https://github.com/rmarkello/pyls/blob/master/LICENSE - .. toctree:: :maxdepth: 1 + installation usage api diff --git a/docs/installation.rst b/docs/installation.rst new file mode 100644 index 0000000..d7c18a4 --- /dev/null +++ b/docs/installation.rst @@ -0,0 +1,19 @@ +.. _installation_setup: + +Installation and setup +====================== + +.. _basic_installation: + +Basic installation +-------------------- + +This package requires Python >= 3.5. Assuming you have the correct version of +Python installed, you can install ``pyls`` by opening a terminal and running +the following: + +.. code-block:: bash + + git clone https://github.com/rmarkello/pyls.git + cd pyls + python setup.py install diff --git a/docs/usage.rst b/docs/usage.rst index f2166ca..0194d74 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -1,5 +1,13 @@ -.. _usage_ref: +.. _usage: -Usage -===== -Work in progress. Check back soon! +User guide +========== + +.. 
toctree:: + :numbered: + + user_guide/overview.rst + user_guide/meancentered.rst + user_guide/behavioral.rst + user_guide/results.rst + user_guide/matlab.rst diff --git a/docs/user_guide/behavioral.rst b/docs/user_guide/behavioral.rst new file mode 100644 index 0000000..faf2a27 --- /dev/null +++ b/docs/user_guide/behavioral.rst @@ -0,0 +1,4 @@ +.. _usage_behavioral: + +Behavioral PLS +-------------- diff --git a/docs/user_guide/matlab.rst b/docs/user_guide/matlab.rst new file mode 100644 index 0000000..f4ca450 --- /dev/null +++ b/docs/user_guide/matlab.rst @@ -0,0 +1,4 @@ +.. _usage_matlab: + +Matlab compatibility +-------------------- diff --git a/docs/user_guide/meancentered.rst b/docs/user_guide/meancentered.rst new file mode 100644 index 0000000..3ba61c3 --- /dev/null +++ b/docs/user_guide/meancentered.rst @@ -0,0 +1,4 @@ +.. _usage_meancentered: + +Mean-centered PLS +----------------- diff --git a/docs/user_guide/overview.rst b/docs/user_guide/overview.rst new file mode 100644 index 0000000..ce9e30b --- /dev/null +++ b/docs/user_guide/overview.rst @@ -0,0 +1,4 @@ +.. _usage_overview: + +Partial Least Squares (PLS) decompositions +------------------------------------------ diff --git a/docs/user_guide/results.rst b/docs/user_guide/results.rst new file mode 100644 index 0000000..cee6a27 --- /dev/null +++ b/docs/user_guide/results.rst @@ -0,0 +1,4 @@ +.. 
_usage_results: + +The ``PLSResults`` data object +------------------------------ From 1f53c27b2dabedae1dc06ca04fb0fea4a5156c74 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Tue, 18 Dec 2018 23:05:58 -0500 Subject: [PATCH 13/32] [DOC] Adds license badge to README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 872f941..eb50012 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ This package provides a Python interface for partial least squares (PLS) analysi [![CircleCI](https://circleci.com/gh/rmarkello/pyls.svg?style=shield)](https://circleci.com/gh/rmarkello/pyls) [![Codecov](https://codecov.io/gh/rmarkello/pyls/branch/master/graph/badge.svg)](https://codecov.io/gh/rmarkello/pyls) [![Documentation Status](https://readthedocs.org/projects/pyls/badge/?version=latest)](http://pyls.readthedocs.io/en/latest/?badge=latest) +[![License](https://img.shields.io/badge/License-GPL%202.0-blue.svg)](https://opensource.org/licenses/GPL-2.0) ## Table of Contents From bf1d5c39ee5df63ac119e0e7b7052a543f9451b2 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Fri, 21 Dec 2018 16:24:19 -0500 Subject: [PATCH 14/32] [DOC] No svgs for latex in index.rst --- docs/index.rst | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index af0ee12..0e3f4ed 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,16 +1,20 @@ ===================================== pyls: Partial Least Squares in Python ===================================== -.. image:: https://travis-ci.org/rmarkello/pyls.svg?branch=master - :target: https://travis-ci.org/rmarkello/pyls -.. image:: https://circleci.com/gh/rmarkello/pyls.svg?style=shield - :target: https://circleci.com/gh/rmarkello/pyls -.. image:: https://codecov.io/gh/rmarkello/pyls/branch/master/graph/badge.svg - :target: https://codecov.io/gh/rmarkello/pyls -.. 
image:: https://readthedocs.org/projects/pyls/badge/?version=latest - :target: http://pyls.readthedocs.io/en/latest -.. image:: http://img.shields.io/badge/License-GPL%202.0-blue.svg - :target: https://opensource.org/licenses/GPL-2.0 + +.. only:: html and epub + + .. image:: https://travis-ci.org/rmarkello/pyls.svg?branch=master + :target: https://travis-ci.org/rmarkello/pyls + .. image:: https://circleci.com/gh/rmarkello/pyls.svg?style=shield + :target: https://circleci.com/gh/rmarkello/pyls + .. image:: https://codecov.io/gh/rmarkello/pyls/branch/master/graph/badge.svg + :target: https://codecov.io/gh/rmarkello/pyls + .. image:: https://readthedocs.org/projects/pyls/badge/?version=latest + :target: http://pyls.readthedocs.io/en/latest + .. image:: http://img.shields.io/badge/License-GPL%202.0-blue.svg + :target: https://opensource.org/licenses/GPL-2.0 + .. toctree:: :maxdepth: 1 From 3f5e79227d2f9f80887e80bea107a9c7e6b0e0c2 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Fri, 18 Jan 2019 16:05:27 -0500 Subject: [PATCH 15/32] [DATA] New toy data examples --- data/linnerud_exercise.csv | 21 +++++ data/linnerud_physio.csv | 21 +++++ data/wine.csv | 179 +++++++++++++++++++++++++++++++++++++ 3 files changed, 221 insertions(+) create mode 100644 data/linnerud_exercise.csv create mode 100644 data/linnerud_physio.csv create mode 100644 data/wine.csv diff --git a/data/linnerud_exercise.csv b/data/linnerud_exercise.csv new file mode 100644 index 0000000..de917ad --- /dev/null +++ b/data/linnerud_exercise.csv @@ -0,0 +1,21 @@ +,Chins,Situps,Jumps +0,5.0,162.0,60.0 +1,2.0,110.0,60.0 +2,12.0,101.0,101.0 +3,12.0,105.0,37.0 +4,13.0,155.0,58.0 +5,4.0,101.0,42.0 +6,8.0,101.0,38.0 +7,6.0,125.0,40.0 +8,15.0,200.0,40.0 +9,17.0,251.0,250.0 +10,17.0,120.0,38.0 +11,13.0,210.0,115.0 +12,14.0,215.0,105.0 +13,1.0,50.0,50.0 +14,6.0,70.0,31.0 +15,12.0,210.0,120.0 +16,4.0,60.0,25.0 +17,11.0,230.0,80.0 +18,15.0,225.0,73.0 +19,2.0,110.0,43.0 diff --git a/data/linnerud_physio.csv 
b/data/linnerud_physio.csv new file mode 100644 index 0000000..d124ad7 --- /dev/null +++ b/data/linnerud_physio.csv @@ -0,0 +1,21 @@ +,Weight,Waist,Pulse +0,191.0,36.0,50.0 +1,189.0,37.0,52.0 +2,193.0,38.0,58.0 +3,162.0,35.0,62.0 +4,189.0,35.0,46.0 +5,182.0,36.0,56.0 +6,211.0,38.0,56.0 +7,167.0,34.0,60.0 +8,176.0,31.0,74.0 +9,154.0,33.0,56.0 +10,169.0,34.0,50.0 +11,166.0,33.0,52.0 +12,154.0,34.0,64.0 +13,247.0,46.0,50.0 +14,193.0,36.0,46.0 +15,202.0,37.0,62.0 +16,176.0,37.0,54.0 +17,157.0,32.0,52.0 +18,156.0,33.0,54.0 +19,138.0,33.0,68.0 diff --git a/data/wine.csv b/data/wine.csv new file mode 100644 index 0000000..dcd82c3 --- /dev/null +++ b/data/wine.csv @@ -0,0 +1,179 @@ +,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline +0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0 +1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0 +2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0 +3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0 +4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0 +5,14.2,1.76,2.45,15.2,112.0,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450.0 +6,14.39,1.87,2.45,14.6,96.0,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290.0 +7,14.06,2.15,2.61,17.6,121.0,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295.0 +8,14.83,1.64,2.17,14.0,97.0,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045.0 +9,13.86,1.35,2.27,16.0,98.0,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045.0 +10,14.1,2.16,2.3,18.0,105.0,2.95,3.32,0.22,2.38,5.75,1.25,3.17,1510.0 +11,14.12,1.48,2.32,16.8,95.0,2.2,2.43,0.26,1.57,5.0,1.17,2.82,1280.0 +12,13.75,1.73,2.41,16.0,89.0,2.6,2.76,0.29,1.81,5.6,1.15,2.9,1320.0 +13,14.75,1.73,2.39,11.4,91.0,3.1,3.69,0.43,2.81,5.4,1.25,2.73,1150.0 +14,14.38,1.87,2.38,12.0,102.0,3.3,3.64,0.29,2.96,7.5,1.2,3.0,1547.0 +15,13.63,1.81,2.7,17.2,112.0,2.85,2.91,0.3,1.46,7.3,1.28,2.88,1310.0 
+16,14.3,1.92,2.72,20.0,120.0,2.8,3.14,0.33,1.97,6.2,1.07,2.65,1280.0 +17,13.83,1.57,2.62,20.0,115.0,2.95,3.4,0.4,1.72,6.6,1.13,2.57,1130.0 +18,14.19,1.59,2.48,16.5,108.0,3.3,3.93,0.32,1.86,8.7,1.23,2.82,1680.0 +19,13.64,3.1,2.56,15.2,116.0,2.7,3.03,0.17,1.66,5.1,0.96,3.36,845.0 +20,14.06,1.63,2.28,16.0,126.0,3.0,3.17,0.24,2.1,5.65,1.09,3.71,780.0 +21,12.93,3.8,2.65,18.6,102.0,2.41,2.41,0.25,1.98,4.5,1.03,3.52,770.0 +22,13.71,1.86,2.36,16.6,101.0,2.61,2.88,0.27,1.69,3.8,1.11,4.0,1035.0 +23,12.85,1.6,2.52,17.8,95.0,2.48,2.37,0.26,1.46,3.93,1.09,3.63,1015.0 +24,13.5,1.81,2.61,20.0,96.0,2.53,2.61,0.28,1.66,3.52,1.12,3.82,845.0 +25,13.05,2.05,3.22,25.0,124.0,2.63,2.68,0.47,1.92,3.58,1.13,3.2,830.0 +26,13.39,1.77,2.62,16.1,93.0,2.85,2.94,0.34,1.45,4.8,0.92,3.22,1195.0 +27,13.3,1.72,2.14,17.0,94.0,2.4,2.19,0.27,1.35,3.95,1.02,2.77,1285.0 +28,13.87,1.9,2.8,19.4,107.0,2.95,2.97,0.37,1.76,4.5,1.25,3.4,915.0 +29,14.02,1.68,2.21,16.0,96.0,2.65,2.33,0.26,1.98,4.7,1.04,3.59,1035.0 +30,13.73,1.5,2.7,22.5,101.0,3.0,3.25,0.29,2.38,5.7,1.19,2.71,1285.0 +31,13.58,1.66,2.36,19.1,106.0,2.86,3.19,0.22,1.95,6.9,1.09,2.88,1515.0 +32,13.68,1.83,2.36,17.2,104.0,2.42,2.69,0.42,1.97,3.84,1.23,2.87,990.0 +33,13.76,1.53,2.7,19.5,132.0,2.95,2.74,0.5,1.35,5.4,1.25,3.0,1235.0 +34,13.51,1.8,2.65,19.0,110.0,2.35,2.53,0.29,1.54,4.2,1.1,2.87,1095.0 +35,13.48,1.81,2.41,20.5,100.0,2.7,2.98,0.26,1.86,5.1,1.04,3.47,920.0 +36,13.28,1.64,2.84,15.5,110.0,2.6,2.68,0.34,1.36,4.6,1.09,2.78,880.0 +37,13.05,1.65,2.55,18.0,98.0,2.45,2.43,0.29,1.44,4.25,1.12,2.51,1105.0 +38,13.07,1.5,2.1,15.5,98.0,2.4,2.64,0.28,1.37,3.7,1.18,2.69,1020.0 +39,14.22,3.99,2.51,13.2,128.0,3.0,3.04,0.2,2.08,5.1,0.89,3.53,760.0 +40,13.56,1.71,2.31,16.2,117.0,3.15,3.29,0.34,2.34,6.13,0.95,3.38,795.0 +41,13.41,3.84,2.12,18.8,90.0,2.45,2.68,0.27,1.48,4.28,0.91,3.0,1035.0 +42,13.88,1.89,2.59,15.0,101.0,3.25,3.56,0.17,1.7,5.43,0.88,3.56,1095.0 +43,13.24,3.98,2.29,17.5,103.0,2.64,2.63,0.32,1.66,4.36,0.82,3.0,680.0 
+44,13.05,1.77,2.1,17.0,107.0,3.0,3.0,0.28,2.03,5.04,0.88,3.35,885.0 +45,14.21,4.04,2.44,18.9,111.0,2.85,2.65,0.3,1.25,5.24,0.87,3.33,1080.0 +46,14.38,3.59,2.28,16.0,102.0,3.25,3.17,0.27,2.19,4.9,1.04,3.44,1065.0 +47,13.9,1.68,2.12,16.0,101.0,3.1,3.39,0.21,2.14,6.1,0.91,3.33,985.0 +48,14.1,2.02,2.4,18.8,103.0,2.75,2.92,0.32,2.38,6.2,1.07,2.75,1060.0 +49,13.94,1.73,2.27,17.4,108.0,2.88,3.54,0.32,2.08,8.9,1.12,3.1,1260.0 +50,13.05,1.73,2.04,12.4,92.0,2.72,3.27,0.17,2.91,7.2,1.12,2.91,1150.0 +51,13.83,1.65,2.6,17.2,94.0,2.45,2.99,0.22,2.29,5.6,1.24,3.37,1265.0 +52,13.82,1.75,2.42,14.0,111.0,3.88,3.74,0.32,1.87,7.05,1.01,3.26,1190.0 +53,13.77,1.9,2.68,17.1,115.0,3.0,2.79,0.39,1.68,6.3,1.13,2.93,1375.0 +54,13.74,1.67,2.25,16.4,118.0,2.6,2.9,0.21,1.62,5.85,0.92,3.2,1060.0 +55,13.56,1.73,2.46,20.5,116.0,2.96,2.78,0.2,2.45,6.25,0.98,3.03,1120.0 +56,14.22,1.7,2.3,16.3,118.0,3.2,3.0,0.26,2.03,6.38,0.94,3.31,970.0 +57,13.29,1.97,2.68,16.8,102.0,3.0,3.23,0.31,1.66,6.0,1.07,2.84,1270.0 +58,13.72,1.43,2.5,16.7,108.0,3.4,3.67,0.19,2.04,6.8,0.89,2.87,1285.0 +59,12.37,0.94,1.36,10.6,88.0,1.98,0.57,0.28,0.42,1.95,1.05,1.82,520.0 +60,12.33,1.1,2.28,16.0,101.0,2.05,1.09,0.63,0.41,3.27,1.25,1.67,680.0 +61,12.64,1.36,2.02,16.8,100.0,2.02,1.41,0.53,0.62,5.75,0.98,1.59,450.0 +62,13.67,1.25,1.92,18.0,94.0,2.1,1.79,0.32,0.73,3.8,1.23,2.46,630.0 +63,12.37,1.13,2.16,19.0,87.0,3.5,3.1,0.19,1.87,4.45,1.22,2.87,420.0 +64,12.17,1.45,2.53,19.0,104.0,1.89,1.75,0.45,1.03,2.95,1.45,2.23,355.0 +65,12.37,1.21,2.56,18.1,98.0,2.42,2.65,0.37,2.08,4.6,1.19,2.3,678.0 +66,13.11,1.01,1.7,15.0,78.0,2.98,3.18,0.26,2.28,5.3,1.12,3.18,502.0 +67,12.37,1.17,1.92,19.6,78.0,2.11,2.0,0.27,1.04,4.68,1.12,3.48,510.0 +68,13.34,0.94,2.36,17.0,110.0,2.53,1.3,0.55,0.42,3.17,1.02,1.93,750.0 +69,12.21,1.19,1.75,16.8,151.0,1.85,1.28,0.14,2.5,2.85,1.28,3.07,718.0 +70,12.29,1.61,2.21,20.4,103.0,1.1,1.02,0.37,1.46,3.05,0.906,1.82,870.0 +71,13.86,1.51,2.67,25.0,86.0,2.95,2.86,0.21,1.87,3.38,1.36,3.16,410.0 
+72,13.49,1.66,2.24,24.0,87.0,1.88,1.84,0.27,1.03,3.74,0.98,2.78,472.0 +73,12.99,1.67,2.6,30.0,139.0,3.3,2.89,0.21,1.96,3.35,1.31,3.5,985.0 +74,11.96,1.09,2.3,21.0,101.0,3.38,2.14,0.13,1.65,3.21,0.99,3.13,886.0 +75,11.66,1.88,1.92,16.0,97.0,1.61,1.57,0.34,1.15,3.8,1.23,2.14,428.0 +76,13.03,0.9,1.71,16.0,86.0,1.95,2.03,0.24,1.46,4.6,1.19,2.48,392.0 +77,11.84,2.89,2.23,18.0,112.0,1.72,1.32,0.43,0.95,2.65,0.96,2.52,500.0 +78,12.33,0.99,1.95,14.8,136.0,1.9,1.85,0.35,2.76,3.4,1.06,2.31,750.0 +79,12.7,3.87,2.4,23.0,101.0,2.83,2.55,0.43,1.95,2.57,1.19,3.13,463.0 +80,12.0,0.92,2.0,19.0,86.0,2.42,2.26,0.3,1.43,2.5,1.38,3.12,278.0 +81,12.72,1.81,2.2,18.8,86.0,2.2,2.53,0.26,1.77,3.9,1.16,3.14,714.0 +82,12.08,1.13,2.51,24.0,78.0,2.0,1.58,0.4,1.4,2.2,1.31,2.72,630.0 +83,13.05,3.86,2.32,22.5,85.0,1.65,1.59,0.61,1.62,4.8,0.84,2.01,515.0 +84,11.84,0.89,2.58,18.0,94.0,2.2,2.21,0.22,2.35,3.05,0.79,3.08,520.0 +85,12.67,0.98,2.24,18.0,99.0,2.2,1.94,0.3,1.46,2.62,1.23,3.16,450.0 +86,12.16,1.61,2.31,22.8,90.0,1.78,1.69,0.43,1.56,2.45,1.33,2.26,495.0 +87,11.65,1.67,2.62,26.0,88.0,1.92,1.61,0.4,1.34,2.6,1.36,3.21,562.0 +88,11.64,2.06,2.46,21.6,84.0,1.95,1.69,0.48,1.35,2.8,1.0,2.75,680.0 +89,12.08,1.33,2.3,23.6,70.0,2.2,1.59,0.42,1.38,1.74,1.07,3.21,625.0 +90,12.08,1.83,2.32,18.5,81.0,1.6,1.5,0.52,1.64,2.4,1.08,2.27,480.0 +91,12.0,1.51,2.42,22.0,86.0,1.45,1.25,0.5,1.63,3.6,1.05,2.65,450.0 +92,12.69,1.53,2.26,20.7,80.0,1.38,1.46,0.58,1.62,3.05,0.96,2.06,495.0 +93,12.29,2.83,2.22,18.0,88.0,2.45,2.25,0.25,1.99,2.15,1.15,3.3,290.0 +94,11.62,1.99,2.28,18.0,98.0,3.02,2.26,0.17,1.35,3.25,1.16,2.96,345.0 +95,12.47,1.52,2.2,19.0,162.0,2.5,2.27,0.32,3.28,2.6,1.16,2.63,937.0 +96,11.81,2.12,2.74,21.5,134.0,1.6,0.99,0.14,1.56,2.5,0.95,2.26,625.0 +97,12.29,1.41,1.98,16.0,85.0,2.55,2.5,0.29,1.77,2.9,1.23,2.74,428.0 +98,12.37,1.07,2.1,18.5,88.0,3.52,3.75,0.24,1.95,4.5,1.04,2.77,660.0 +99,12.29,3.17,2.21,18.0,88.0,2.85,2.99,0.45,2.81,2.3,1.42,2.83,406.0 
+100,12.08,2.08,1.7,17.5,97.0,2.23,2.17,0.26,1.4,3.3,1.27,2.96,710.0 +101,12.6,1.34,1.9,18.5,88.0,1.45,1.36,0.29,1.35,2.45,1.04,2.77,562.0 +102,12.34,2.45,2.46,21.0,98.0,2.56,2.11,0.34,1.31,2.8,0.8,3.38,438.0 +103,11.82,1.72,1.88,19.5,86.0,2.5,1.64,0.37,1.42,2.06,0.94,2.44,415.0 +104,12.51,1.73,1.98,20.5,85.0,2.2,1.92,0.32,1.48,2.94,1.04,3.57,672.0 +105,12.42,2.55,2.27,22.0,90.0,1.68,1.84,0.66,1.42,2.7,0.86,3.3,315.0 +106,12.25,1.73,2.12,19.0,80.0,1.65,2.03,0.37,1.63,3.4,1.0,3.17,510.0 +107,12.72,1.75,2.28,22.5,84.0,1.38,1.76,0.48,1.63,3.3,0.88,2.42,488.0 +108,12.22,1.29,1.94,19.0,92.0,2.36,2.04,0.39,2.08,2.7,0.86,3.02,312.0 +109,11.61,1.35,2.7,20.0,94.0,2.74,2.92,0.29,2.49,2.65,0.96,3.26,680.0 +110,11.46,3.74,1.82,19.5,107.0,3.18,2.58,0.24,3.58,2.9,0.75,2.81,562.0 +111,12.52,2.43,2.17,21.0,88.0,2.55,2.27,0.26,1.22,2.0,0.9,2.78,325.0 +112,11.76,2.68,2.92,20.0,103.0,1.75,2.03,0.6,1.05,3.8,1.23,2.5,607.0 +113,11.41,0.74,2.5,21.0,88.0,2.48,2.01,0.42,1.44,3.08,1.1,2.31,434.0 +114,12.08,1.39,2.5,22.5,84.0,2.56,2.29,0.43,1.04,2.9,0.93,3.19,385.0 +115,11.03,1.51,2.2,21.5,85.0,2.46,2.17,0.52,2.01,1.9,1.71,2.87,407.0 +116,11.82,1.47,1.99,20.8,86.0,1.98,1.6,0.3,1.53,1.95,0.95,3.33,495.0 +117,12.42,1.61,2.19,22.5,108.0,2.0,2.09,0.34,1.61,2.06,1.06,2.96,345.0 +118,12.77,3.43,1.98,16.0,80.0,1.63,1.25,0.43,0.83,3.4,0.7,2.12,372.0 +119,12.0,3.43,2.0,19.0,87.0,2.0,1.64,0.37,1.87,1.28,0.93,3.05,564.0 +120,11.45,2.4,2.42,20.0,96.0,2.9,2.79,0.32,1.83,3.25,0.8,3.39,625.0 +121,11.56,2.05,3.23,28.5,119.0,3.18,5.08,0.47,1.87,6.0,0.93,3.69,465.0 +122,12.42,4.43,2.73,26.5,102.0,2.2,2.13,0.43,1.71,2.08,0.92,3.12,365.0 +123,13.05,5.8,2.13,21.5,86.0,2.62,2.65,0.3,2.01,2.6,0.73,3.1,380.0 +124,11.87,4.31,2.39,21.0,82.0,2.86,3.03,0.21,2.91,2.8,0.75,3.64,380.0 +125,12.07,2.16,2.17,21.0,85.0,2.6,2.65,0.37,1.35,2.76,0.86,3.28,378.0 +126,12.43,1.53,2.29,21.5,86.0,2.74,3.15,0.39,1.77,3.94,0.69,2.84,352.0 +127,11.79,2.13,2.78,28.5,92.0,2.13,2.24,0.58,1.76,3.0,0.97,2.44,466.0 
+128,12.37,1.63,2.3,24.5,88.0,2.22,2.45,0.4,1.9,2.12,0.89,2.78,342.0 +129,12.04,4.3,2.38,22.0,80.0,2.1,1.75,0.42,1.35,2.6,0.79,2.57,580.0 +130,12.86,1.35,2.32,18.0,122.0,1.51,1.25,0.21,0.94,4.1,0.76,1.29,630.0 +131,12.88,2.99,2.4,20.0,104.0,1.3,1.22,0.24,0.83,5.4,0.74,1.42,530.0 +132,12.81,2.31,2.4,24.0,98.0,1.15,1.09,0.27,0.83,5.7,0.66,1.36,560.0 +133,12.7,3.55,2.36,21.5,106.0,1.7,1.2,0.17,0.84,5.0,0.78,1.29,600.0 +134,12.51,1.24,2.25,17.5,85.0,2.0,0.58,0.6,1.25,5.45,0.75,1.51,650.0 +135,12.6,2.46,2.2,18.5,94.0,1.62,0.66,0.63,0.94,7.1,0.73,1.58,695.0 +136,12.25,4.72,2.54,21.0,89.0,1.38,0.47,0.53,0.8,3.85,0.75,1.27,720.0 +137,12.53,5.51,2.64,25.0,96.0,1.79,0.6,0.63,1.1,5.0,0.82,1.69,515.0 +138,13.49,3.59,2.19,19.5,88.0,1.62,0.48,0.58,0.88,5.7,0.81,1.82,580.0 +139,12.84,2.96,2.61,24.0,101.0,2.32,0.6,0.53,0.81,4.92,0.89,2.15,590.0 +140,12.93,2.81,2.7,21.0,96.0,1.54,0.5,0.53,0.75,4.6,0.77,2.31,600.0 +141,13.36,2.56,2.35,20.0,89.0,1.4,0.5,0.37,0.64,5.6,0.7,2.47,780.0 +142,13.52,3.17,2.72,23.5,97.0,1.55,0.52,0.5,0.55,4.35,0.89,2.06,520.0 +143,13.62,4.95,2.35,20.0,92.0,2.0,0.8,0.47,1.02,4.4,0.91,2.05,550.0 +144,12.25,3.88,2.2,18.5,112.0,1.38,0.78,0.29,1.14,8.21,0.65,2.0,855.0 +145,13.16,3.57,2.15,21.0,102.0,1.5,0.55,0.43,1.3,4.0,0.6,1.68,830.0 +146,13.88,5.04,2.23,20.0,80.0,0.98,0.34,0.4,0.68,4.9,0.58,1.33,415.0 +147,12.87,4.61,2.48,21.5,86.0,1.7,0.65,0.47,0.86,7.65,0.54,1.86,625.0 +148,13.32,3.24,2.38,21.5,92.0,1.93,0.76,0.45,1.25,8.42,0.55,1.62,650.0 +149,13.08,3.9,2.36,21.5,113.0,1.41,1.39,0.34,1.14,9.4,0.57,1.33,550.0 +150,13.5,3.12,2.62,24.0,123.0,1.4,1.57,0.22,1.25,8.6,0.59,1.3,500.0 +151,12.79,2.67,2.48,22.0,112.0,1.48,1.36,0.24,1.26,10.8,0.48,1.47,480.0 +152,13.11,1.9,2.75,25.5,116.0,2.2,1.28,0.26,1.56,7.1,0.61,1.33,425.0 +153,13.23,3.3,2.28,18.5,98.0,1.8,0.83,0.61,1.87,10.52,0.56,1.51,675.0 +154,12.58,1.29,2.1,20.0,103.0,1.48,0.58,0.53,1.4,7.6,0.58,1.55,640.0 +155,13.17,5.19,2.32,22.0,93.0,1.74,0.63,0.61,1.55,7.9,0.6,1.48,725.0 
+156,13.84,4.12,2.38,19.5,89.0,1.8,0.83,0.48,1.56,9.01,0.57,1.64,480.0 +157,12.45,3.03,2.64,27.0,97.0,1.9,0.58,0.63,1.14,7.5,0.67,1.73,880.0 +158,14.34,1.68,2.7,25.0,98.0,2.8,1.31,0.53,2.7,13.0,0.57,1.96,660.0 +159,13.48,1.67,2.64,22.5,89.0,2.6,1.1,0.52,2.29,11.75,0.57,1.78,620.0 +160,12.36,3.83,2.38,21.0,88.0,2.3,0.92,0.5,1.04,7.65,0.56,1.58,520.0 +161,13.69,3.26,2.54,20.0,107.0,1.83,0.56,0.5,0.8,5.88,0.96,1.82,680.0 +162,12.85,3.27,2.58,22.0,106.0,1.65,0.6,0.6,0.96,5.58,0.87,2.11,570.0 +163,12.96,3.45,2.35,18.5,106.0,1.39,0.7,0.4,0.94,5.28,0.68,1.75,675.0 +164,13.78,2.76,2.3,22.0,90.0,1.35,0.68,0.41,1.03,9.58,0.7,1.68,615.0 +165,13.73,4.36,2.26,22.5,88.0,1.28,0.47,0.52,1.15,6.62,0.78,1.75,520.0 +166,13.45,3.7,2.6,23.0,111.0,1.7,0.92,0.43,1.46,10.68,0.85,1.56,695.0 +167,12.82,3.37,2.3,19.5,88.0,1.48,0.66,0.4,0.97,10.26,0.72,1.75,685.0 +168,13.58,2.58,2.69,24.5,105.0,1.55,0.84,0.39,1.54,8.66,0.74,1.8,750.0 +169,13.4,4.6,2.86,25.0,112.0,1.98,0.96,0.27,1.11,8.5,0.67,1.92,630.0 +170,12.2,3.03,2.32,19.0,96.0,1.25,0.49,0.4,0.73,5.5,0.66,1.83,510.0 +171,12.77,2.39,2.28,19.5,86.0,1.39,0.51,0.48,0.64,9.899999,0.57,1.63,470.0 +172,14.16,2.51,2.48,20.0,91.0,1.68,0.7,0.44,1.24,9.7,0.62,1.71,660.0 +173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.7,0.64,1.74,740.0 +174,13.4,3.91,2.48,23.0,102.0,1.8,0.75,0.43,1.41,7.3,0.7,1.56,750.0 +175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.2,0.59,1.56,835.0 +176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.3,0.6,1.62,840.0 +177,14.13,4.1,2.74,24.5,96.0,2.05,0.76,0.56,1.35,9.2,0.61,1.6,560.0 From 8e8958690021f9669e4bea3f560a8c7702d2c337 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Fri, 18 Jan 2019 16:12:22 -0500 Subject: [PATCH 16/32] [DATA] Adds new examples to datasets.json for loading --- pyls/examples/datasets.json | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pyls/examples/datasets.json b/pyls/examples/datasets.json index 1484229..f3fa747 100644 --- a/pyls/examples/datasets.json 
+++ b/pyls/examples/datasets.json @@ -1,4 +1,16 @@ { + "linnerud": { + "description": "These data come from a toy example demonstrating the relationship between exercise ability and physiological fitness.", + "reference": "Tenenhaus, M. (1998). La regression PLS: theorie et pratique. Editions Technip.", + "urls": [ + "https://raw.githubusercontent.com/rmarkello/pyls/3f5e79227d2f9f80887e80bea107a9c7e6b0e0c2/data/linnerud_exercise.csv", + "https://raw.githubusercontent.com/rmarkello/pyls/3f5e79227d2f9f80887e80bea107a9c7e6b0e0c2/data/linnerud_physio.csv" + ], + "X": "linnerud_exercise.csv", + "Y": "linnerud_physio.csv", + "n_perm": 1000, + "n_boot": 1000 + }, "mirchi_2018": { "description": "Study examining the relationship between changes in functional brain connectivity derived from resting-state functional magnetic resonance imaging (rsfMRI) and behavioral mood scores using the MyConnectome database.", "reference": "Mirchi, N., Betzel, R. F., Bernhardt, B. C., Dagher, A., & Mišić, B. (2018). Tracking mood fluctuations with functional network patterns. Social Cognitive and Affective Neuroscience.", @@ -15,6 +27,21 @@ "test_split": 100, "parcellation": "parcel_data.txt" }, + "wine": { + "description": "These data are the results of a chemical analysis of wines grown in the same region in Italy but derived from three different cultivars. The analysis determined the quantities of 13 constituents found in each of the three types of wines.", + "reference": "Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. 
Irvine, CA: University of California, School of Information and Computer Science.", + "urls": [ + "https://raw.githubusercontent.com/rmarkello/pyls/3f5e79227d2f9f80887e80bea107a9c7e6b0e0c2/data/wine.csv" + ], + "X": "wine.csv", + "n_perm": 1000, + "n_boot": 1000, + "groups": [ + 59, + 71, + 48 + ] + }, "whitaker_vertes_2016": { "description": "Study examining the relationship between developmental brain changes derived from structural magnetic resonance imaging (sMRI) and genetic expression in the brain using the NeuroScience in Psychiatry Network (NSPN) dataset", "reference": "Whitaker, K. J., Vértes, P. E., Romero-Garcia, R., Váša, F., Moutoussis, M., Prabhu, G., Weiskopf, N., Callaghan, M. F., Wagstyl, K., Rittman, T., Tait, R., Ooi, C., Suckling, J., Inkster, B., Fonagy, P., Dolan, R. J., Jones, P. B., Goodyer, I. M., Bullmore, E. T. (2016). Adolescence is associated with genomically patterned consolidation of the hubs of the human brain connectome. Proceedings of the National Academy of Sciences, 113(32), 9105-9110.", From dfa24e15a58d3346ecbdccd1b60043c307359213 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Mon, 21 Jan 2019 16:23:25 -0500 Subject: [PATCH 17/32] [TEST] Fix tests for newest examples --- pyls/__init__.py | 3 +-- pyls/info.py | 2 +- pyls/tests/test_examples.py | 12 ++++++++++-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pyls/__init__.py b/pyls/__init__.py index 8eeea3d..4a63c08 100644 --- a/pyls/__init__.py +++ b/pyls/__init__.py @@ -4,7 +4,7 @@ '__author__', '__description__', '__email__', '__license__', '__maintainer__', '__packagename__', '__url__', '__version__', 'behavioral_pls', 'meancentered_pls', 'import_matlab_result', - 'PLSInputs', 'PLSResults', 'save_results', 'load_results', 'examples' + 'PLSInputs', 'PLSResults', 'save_results', 'load_results' ] from ._version import get_versions @@ -21,7 +21,6 @@ __url__, ) -from . 
import examples from .io import load_results, save_results from .matlab import import_matlab_result from .structures import PLSInputs, PLSResults diff --git a/pyls/info.py b/pyls/info.py index 0f2b4da..3ee139a 100644 --- a/pyls/info.py +++ b/pyls/info.py @@ -24,7 +24,7 @@ ] TESTS_REQUIRE = [ - 'pytest', + 'pytest>=3.6', 'pytest-cov' ] diff --git a/pyls/tests/test_examples.py b/pyls/tests/test_examples.py index 41ad6cd..ac84819 100644 --- a/pyls/tests/test_examples.py +++ b/pyls/tests/test_examples.py @@ -2,10 +2,10 @@ import os import pytest -import pyls +import pyls.examples DATASETS = [ - 'mirchi_2018', 'whitaker_vertes_2016' + 'mirchi_2018', 'whitaker_vertes_2016', 'wine', 'linnerud' ] @@ -29,10 +29,16 @@ def test_available_datasets(): @pytest.mark.parametrize(('dataset', 'keys'), [ + ('linnerud', [ + 'description', 'reference', 'urls', 'X', 'Y', 'n_perm', 'n_boot' + ]), ('mirchi_2018', [ 'description', 'reference', 'urls', 'X', 'Y', 'n_perm', 'n_boot', 'test_size', 'test_split', 'parcellation' ]), + ('wine', [ + 'description', 'reference', 'urls', 'X', 'n_perm', 'n_boot', 'groups' + ]), ('whitaker_vertes_2016', [ 'description', 'reference', 'urls', 'X', 'Y', 'n_perm' ]) ]) @@ -68,7 +74,9 @@ def test_get_data_dir(tmpdir): @pytest.mark.parametrize(('dataset', 'keys'), [ + ('linnerud', ['X', 'Y', 'n_perm', 'n_boot']), ('mirchi_2018', ['X', 'Y', 'n_perm', 'n_boot', 'test_size', 'test_split']), + ('wine', ['X', 'groups', 'n_perm', 'n_boot']), ('whitaker_vertes_2016', ['X', 'Y', 'n_perm']) ]) def test_load_dataset(tmpdir, dataset, keys): From d8c09c1b78e3bca85c927a5d7f76492356e8a3e3 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Mon, 21 Jan 2019 16:24:32 -0500 Subject: [PATCH 18/32] [DOC] Better API and updates to User Guide --- .gitignore | 1 + docs/_templates/class.rst | 12 ++++ docs/_templates/function.rst | 10 +++ docs/api.rst | 104 +++++++++++++++++++++++-------- docs/conf.py | 7 +++ docs/usage.rst | 21 ++++++- docs/user_guide/behavioral.rst | 100 
++++++++++++++++++++++++++++- docs/user_guide/matlab.rst | 6 +- docs/user_guide/meancentered.rst | 11 +++- docs/user_guide/overview.rst | 4 -- docs/user_guide/results.rst | 2 +- 11 files changed, 243 insertions(+), 35 deletions(-) create mode 100644 docs/_templates/class.rst create mode 100644 docs/_templates/function.rst delete mode 100644 docs/user_guide/overview.rst diff --git a/.gitignore b/.gitignore index 364b859..9068f25 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ plsc/ .vscode/ +docs/_generated/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/docs/_templates/class.rst b/docs/_templates/class.rst new file mode 100644 index 0000000..b57f31e --- /dev/null +++ b/docs/_templates/class.rst @@ -0,0 +1,12 @@ +{{ fullname }} +{{ underline }} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + :no-members: + :no-inherited-members: + +.. raw:: html + +
diff --git a/docs/_templates/function.rst b/docs/_templates/function.rst new file mode 100644 index 0000000..20a4211 --- /dev/null +++ b/docs/_templates/function.rst @@ -0,0 +1,10 @@ +{{ fullname }} +{{ underline }} + +.. currentmodule:: {{ module }} + +.. autofunction:: {{ objname }} + +.. raw:: html + +
diff --git a/docs/api.rst b/docs/api.rst index 3705b3c..82c2acf 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,29 +1,83 @@ .. _api: +Reference API +============= + +This is the primary reference of ``pyls``. Please refer to the :ref:`user guide +` for more information on how to best implement these functions in your +own workflows. + +.. _decomp_ref: + +:mod:`pyls` - PLS Decompositions +-------------------------------------- + +.. automodule:: pyls.types + :no-members: + :no-inherited-members: + .. currentmodule:: pyls -API -=== - -PLS decompositions ------------------- -.. autofunction:: pyls.behavioral_pls -.. autofunction:: pyls.meancentered_pls - -PLS results objects -------------------- -.. autoclass:: pyls.structures.PLSResults -.. autoclass:: pyls.structures.PLSPermResults -.. autoclass:: pyls.structures.PLSBootResults -.. autoclass:: pyls.structures.PLSSplitHalfResults -.. autoclass:: pyls.structures.PLSCrossValidationResults -.. autoclass:: pyls.structures.PLSInputs - -Results I/O ------------ -.. autofunction:: pyls.save_results -.. autofunction:: pyls.load_results - -Matlab compatibility --------------------- -.. autofunction:: pyls.import_matlab_result +.. autosummary:: + :template: function.rst + :toctree: _generated/ + + behavioral_pls + meancentered_pls + +.. _results_ref: + +:mod:`pyls.structures` - PLS Results Objects +-------------------------------------------- + +.. automodule:: pyls.structures + :no-members: + :no-inherited-members: + +.. currentmodule:: pyls.structures + +.. autosummary:: + :template: class.rst + :toctree: _generated/ + + PLSResults + PLSPermResults + PLSBootResults + PLSSplitHalfResults + PLSCrossValidationResults + PLSInputs + +.. _io_ref: + +:mod:`pyls.io` - Data I/O +------------------------- + +.. automodule:: pyls.io + :no-members: + :no-inherited-members: + +.. currentmodule:: pyls.io + +.. autosummary:: + :template: function.rst + :toctree: _generated/ + + save_results + load_results + +.. 
_matlab_ref: + +:mod:`pyls.matlab` - Matlab Compatibility +----------------------------------------- + +.. automodule:: pyls.matlab + :no-members: + :no-inherited-members: + +.. currentmodule:: pyls.matlab + +.. autosummary:: + :template: function.rst + :toctree: _generated/ + + import_matlab_result diff --git a/docs/conf.py b/docs/conf.py index a347e12..898b8c8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -43,6 +43,7 @@ # Generate the API documentation when building autosummary_generate = True +autodoc_default_flags = ['members', 'inherited-members'] numpydoc_show_class_members = False autoclass_content = "class" @@ -84,6 +85,12 @@ # # html_theme_options = {} +html_context = { + 'css_files': [ + '_static/theme_overrides.css' + ] +} + # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". diff --git a/docs/usage.rst b/docs/usage.rst index 0194d74..5259d1a 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -3,11 +3,28 @@ User guide ========== +Partial least squares (PLS) is a multivariate statistical technique that aims +to find shared information between two sets of variables. If you're unfamiliar +with PLS and are interested in a thorough (albeit quite technical) treatment, +`Abdi et al., 2013 `_ is a good +resource. + +This user guide will go through the basic statistical concepts of and detail +the two types of PLS implemented in the current package +(:ref:`usage_behavioral` and :ref:`usage_meancentered`), demonstrate how to +interpret and use the results of a PLS analysis (:ref:`usage_results`), and +provide some information on the compatibility of this toolbox with the +`Matlab PLS toolbox `_ +(:ref:`usage_matlab`). + .. 
toctree:: + :caption: Table of Contents :numbered: - user_guide/overview.rst - user_guide/meancentered.rst user_guide/behavioral.rst + user_guide/meancentered.rst user_guide/results.rst user_guide/matlab.rst + +Note that you can always refer to the :ref:`api` if you have questions about a +specific function! diff --git a/docs/user_guide/behavioral.rst b/docs/user_guide/behavioral.rst index faf2a27..4d45571 100644 --- a/docs/user_guide/behavioral.rst +++ b/docs/user_guide/behavioral.rst @@ -1,4 +1,102 @@ .. _usage_behavioral: Behavioral PLS --------------- +============== + +What we call behavioral PLS in the ``pyls`` package is actually the more +traditional form of PLS, which attempts to find shared information between two +sets of variables. However, as with all things, there are a number of +ever-so-slightly different kinds of behavioral PLS that exist in the wild, so +to be thorough we're going to briefly explain the exact flavor implemented. + +:py:func:`pyls.behavioral_pls` employs a symmetrical, singular value +decomposition (SVD) based form of PLS. It is sometimes referred to as +PLS-correlation or PLS-SVD. Notably, it is **not** the same as PLS regression. +That is, we are not assessing *dependent* relationships between sets of data, +but rather how the two sets generally covary. + +To understand this a bit more we can walk through a quick example. + +An exercise in calisthenics +--------------------------- + +Let's assume we have two matrices :math:`X` and :math:`Y`. For the sake of +working with something concrete we're going to use one of our example +datasets [1]_: + +.. doctest:: + + >>> import pyls.examples + >>> data = pyls.examples.load_dataset('linnerud') + >>> data + PLSInputs(X, Y, n_perm, n_boot) + +.. note:: + + This is the same dataset as :py:func:`sklearn.datasets.load_linnerud`; the + formatting has been lightly modified to better suit our purposes. + +Looking at our matrices, we see: + +.. 
doctest:: + + >>> data.X.shape + (20, 3) + >>> data.X.head() + Chins Situps Jumps + 0 5.0 162.0 60.0 + 1 2.0 110.0 60.0 + 2 12.0 101.0 101.0 + 3 12.0 105.0 37.0 + 4 13.0 155.0 58.0 + +The rows of our :math:`X` matrix here represent subjects, and the columns +indicate different types of exercises these subjects were able to perform. + +.. doctest:: + + >>> data.Y.shape + (20, 3) + >>> data.Y.head() + Weight Waist Pulse + 0 191.0 36.0 50.0 + 1 189.0 37.0 52.0 + 2 193.0 38.0 58.0 + 3 162.0 35.0 62.0 + 4 189.0 35.0 46.0 + +The rows of our :math:`Y` matrix *also* represent subjects (critically, the +same subjects as in :math:`X`), and the columns indicate physiological +measurements taken for each subject. We can use behavioral PLS to establish +whether a relationship exists between the measured exercise and physiological +variables. + +The cross-covariance matrix +--------------------------- + +Behavioral PLS works by decomposing the cross-covariance matrix, :math:`R`, +generated from the input matrices, where :math:`R = Y^{T} \times X`. The +results of PLS are a bit easier to interpret when :math:`R` is the +cross-correlation matrix instead of the cross-covariance matrix, which means +that we should z-score each feature in :math:`X` and :math:`Y` before +multiplying them; this is done automatically by :py:func:`pyls.behavioral_pls` +(but can be turned off by passing the ``covariance=True`` parameter). + +In our example, :math:`R` ends up being a 3 x 3 matrix. Note that we pass +``norm=False`` to the cross-correlation function + +.. doctest:: + + >>> from pyls.compute import xcorr + >>> R = xcorr(data.X, data.Y, norm=False) + >>> R + array([[-0.38969365, -0.49308365, -0.22629556], + [-0.55223213, -0.64559803, -0.19149937], + [ 0.15064802, 0.22503808, 0.03493306]]) + +Examining the first row, we can see that -0.3897 represents the correlation +between ``Chins`` and ``Weight`` across all the subjects, -0.4931 the +correlation between ``Situps`` and ``Weight``, and so on. 
+ +.. [1] Tenenhaus, M. (1998). La régression PLS: théorie et pratique. Editions + technip. diff --git a/docs/user_guide/matlab.rst b/docs/user_guide/matlab.rst index f4ca450..052c7a3 100644 --- a/docs/user_guide/matlab.rst +++ b/docs/user_guide/matlab.rst @@ -1,4 +1,8 @@ .. _usage_matlab: Matlab compatibility --------------------- +==================== + +``pyls`` supports loading PLS results that were generated by the `Matlab PLS +toolbox `_. + diff --git a/docs/user_guide/meancentered.rst b/docs/user_guide/meancentered.rst index 3ba61c3..3f1810f 100644 --- a/docs/user_guide/meancentered.rst +++ b/docs/user_guide/meancentered.rst @@ -1,4 +1,13 @@ .. _usage_meancentered: Mean-centered PLS ------------------ +================= + +In contrast to behavioral PLS, mean-centered_PLS doesn't aim to find +relationships between two sets of variables. Instead, it tries to find +relationships between *groupings* in a single set of variables. Indeed, you can +think of it almost like a multivariate t-test or ANOVA (depending on how many +groups you have!). + +An oenological example +---------------------- diff --git a/docs/user_guide/overview.rst b/docs/user_guide/overview.rst deleted file mode 100644 index ce9e30b..0000000 --- a/docs/user_guide/overview.rst +++ /dev/null @@ -1,4 +0,0 @@ -.. _usage_overview: - -Partial Least Squares (PLS) decompositions ------------------------------------------- diff --git a/docs/user_guide/results.rst b/docs/user_guide/results.rst index cee6a27..0673d8c 100644 --- a/docs/user_guide/results.rst +++ b/docs/user_guide/results.rst @@ -1,4 +1,4 @@ .. 
_usage_results: The ``PLSResults`` data object ------------------------------- +============================== From 2f98ece37b2a0fefdf5552af0d4604b11d1672e3 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Mon, 21 Jan 2019 16:25:01 -0500 Subject: [PATCH 19/32] [DOC] Word wrapping in documentation tables --- docs/_static/theme_overrides.css | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 docs/_static/theme_overrides.css diff --git a/docs/_static/theme_overrides.css b/docs/_static/theme_overrides.css new file mode 100644 index 0000000..63ee6cc --- /dev/null +++ b/docs/_static/theme_overrides.css @@ -0,0 +1,13 @@ +/* override table width restrictions */ +@media screen and (min-width: 767px) { + + .wy-table-responsive table td { + /* !important prevents the common CSS stylesheets from overriding + this as on RTD they are loaded after this stylesheet */ + white-space: normal !important; + } + + .wy-table-responsive { + overflow: visible !important; + } +} From 03425066cfe0337e312ba08d131c52a39e73f50a Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Mon, 21 Jan 2019 16:25:27 -0500 Subject: [PATCH 20/32] [DOC] Module and function doc-string improvements --- pyls/base.py | 5 +++-- pyls/compute.py | 9 ++------- pyls/io.py | 8 +++----- pyls/matlab/__init__.py | 6 +++++- pyls/structures.py | 5 ++++- pyls/tests/matlab.py | 4 ++-- pyls/types/__init__.py | 3 +++ pyls/types/meancentered.py | 2 +- 8 files changed, 23 insertions(+), 19 deletions(-) diff --git a/pyls/base.py b/pyls/base.py index 0ca4086..bd89310 100644 --- a/pyls/base.py +++ b/pyls/base.py @@ -243,7 +243,8 @@ class BasePLS(): {groups} {conditions} **kwargs : optional - Additional key-value pairs; see :obj:`pyls.PLSInputs` for more info + Additional key-value pairs; see :obj:`pyls.structures.PLSInputs` for + more info References ---------- @@ -346,7 +347,7 @@ def run_pls(self, X, Y): Returns ------- - results : :obj:`pyls.PLSResults` + results : :obj:`pyls.structures.PLSResults` Results of 
PLS (not including PLS type-specific outputs) """ diff --git a/pyls/compute.py b/pyls/compute.py index 11a9eef..0a4ebe8 100644 --- a/pyls/compute.py +++ b/pyls/compute.py @@ -2,7 +2,7 @@ import numpy as np from sklearn.utils.extmath import randomized_svd -from sklearn.utils.validation import check_array +from sklearn.utils.validation import check_array, check_X_y from pyls import utils @@ -29,12 +29,7 @@ def xcorr(X, Y, norm=True, covariance=False): Cross-covariance of `X` and `Y` """ - if X.ndim != Y.ndim: - raise ValueError('Number of dims of `X` and `Y` must match.') - if X.ndim != 2: - raise ValueError('`X` and `Y` must each have 2 dims.') - if len(X) != len(Y): - raise ValueError('The first dim of `X` and `Y` must match.') + X, Y = check_X_y(X, Y, multi_output=True) if not covariance: Xn, Yn = zscore(X), zscore(Y) diff --git a/pyls/io.py b/pyls/io.py index 249d856..1ade014 100644 --- a/pyls/io.py +++ b/pyls/io.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- """ -Functions for saving and loading PLS results objects - -Most functions adapted from https://codereview.stackexchange.com/a/121308 +Functions for saving and loading PLS data objects """ import h5py @@ -21,7 +19,7 @@ def save_results(fname, results): ---------- fname : str Filepath to where hdf5 file should be created and `results` stored - results : :obj:`pyls.PLSResults` + results : :obj:`pyls.structures.PLSResults` PLSResults object to be saved Returns @@ -75,7 +73,7 @@ def load_results(fname): Returns ------- - results : :obj:`pyls.PLSResults` + results : :obj:`pyls.structures.PLSResults` Loaded PLS results """ diff --git a/pyls/matlab/__init__.py b/pyls/matlab/__init__.py index 24ff372..c67a436 100644 --- a/pyls/matlab/__init__.py +++ b/pyls/matlab/__init__.py @@ -1,4 +1,8 @@ -__all__ = ['import_matlab_result'] +# -*- coding: utf-8 -*- +""" +Utilities for handling PLS results generated using the Matlab PLS toolbox +""" +__all__ = ['import_matlab_result'] from .io import import_matlab_result diff --git 
a/pyls/structures.py b/pyls/structures.py index 8b066f8..d5716bc 100644 --- a/pyls/structures.py +++ b/pyls/structures.py @@ -1,4 +1,7 @@ # -*- coding: utf-8 -*- +""" +Data structures to hold PLS inputs and results objects +""" from multiprocessing import cpu_count from textwrap import dedent @@ -100,7 +103,7 @@ all available processors. Default: None\ """), pls_results=dedent("""\ - results : :obj:`pyls.PLSResults` + results : :obj:`pyls.structures.PLSResults` Dictionary-like object containing results from the PLS analysis\ """), references=dedent("""\ diff --git a/pyls/tests/matlab.py b/pyls/tests/matlab.py index 0e52f56..a23fb35 100644 --- a/pyls/tests/matlab.py +++ b/pyls/tests/matlab.py @@ -120,9 +120,9 @@ def compare_python_matlab(python, matlab, *, atol=1e-4, corr=0.975, alpha=0.05, Parameters ---------- - python : :obj:`pyls.PLSResults` + python : :obj:`pyls.structures.PLSResults` PLSResults object generated from Python - matlab : :obj:`pyls.PLSResults` + matlab : :obj:`pyls.structures.PLSResults` PLSResults object generated from Matlab atol : float, optional Absolute tolerance permitted between `python` and `matlab` results diff --git a/pyls/types/__init__.py b/pyls/types/__init__.py index f634ae0..05e5626 100644 --- a/pyls/types/__init__.py +++ b/pyls/types/__init__.py @@ -1,4 +1,7 @@ # -*- coding: utf-8 -*- +""" +The primary PLS decomposition methods for use in conducting PLS analyses +""" __all__ = ['behavioral_pls', 'meancentered_pls'] diff --git a/pyls/types/meancentered.py b/pyls/types/meancentered.py index 18e6e52..4aa3c90 100644 --- a/pyls/types/meancentered.py +++ b/pyls/types/meancentered.py @@ -114,7 +114,7 @@ def run_pls(self, X, Y): Returns ------- - res : :obj:`pyls.PLSResults` + res : :obj:`pyls.structures.PLSResults` PLS results object """ From a2bcc06a802c3704b8955cbada61b6e8c36c50fd Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Tue, 22 Jan 2019 11:21:07 -0500 Subject: [PATCH 21/32] [REF] Set norm=False default in compute.xcorr() It 
doesn't really make sense to have norm=True since we almost never use it! --- pyls/compute.py | 4 ++-- pyls/tests/test_compute.py | 2 +- pyls/types/behavioral.py | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pyls/compute.py b/pyls/compute.py index 0a4ebe8..3228512 100644 --- a/pyls/compute.py +++ b/pyls/compute.py @@ -6,7 +6,7 @@ from pyls import utils -def xcorr(X, Y, norm=True, covariance=False): +def xcorr(X, Y, norm=False, covariance=False): """ Calculates the cross-covariance matrix of `X` and `Y` @@ -18,7 +18,7 @@ def xcorr(X, Y, norm=True, covariance=False): Input matrix, where `S` is samples and `T` is features. norm : bool, optional Whether to normalize `X` and `Y` (i.e., sum of squares = 1). Default: - True + False covariance : bool, optional Whether to calculate the cross-covariance matrix instead of the cross- correlation matrix. Default: False diff --git a/pyls/tests/test_compute.py b/pyls/tests/test_compute.py index c1518ac..efdcfc7 100644 --- a/pyls/tests/test_compute.py +++ b/pyls/tests/test_compute.py @@ -31,7 +31,7 @@ def test_xcorr(): xcorr = pyls.compute.xcorr(X, Y) assert xcorr.shape == (25, 200) - xcorr = pyls.compute.xcorr(X, Y, norm=False) + xcorr = pyls.compute.xcorr(X, Y, norm=True) assert xcorr.shape == (25, 200) with pytest.raises(ValueError): diff --git a/pyls/types/behavioral.py b/pyls/types/behavioral.py index 1c6ac94..ef2b8a8 100644 --- a/pyls/types/behavioral.py +++ b/pyls/types/behavioral.py @@ -53,7 +53,6 @@ def gen_covcorr(self, X, Y, groups, **kwargs): crosscov = [] for grp in groups.T.astype(bool): crosscov.append(compute.xcorr(X[grp], Y[grp], - norm=False, covariance=self.inputs.covariance)) return np.row_stack(crosscov) From d021cf26ba8aced79502ed60e8e23121ac5c2a0b Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Tue, 22 Jan 2019 15:32:55 -0500 Subject: [PATCH 22/32] [FIX] Error in example testing --- pyls/tests/test_examples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pyls/tests/test_examples.py b/pyls/tests/test_examples.py index ac84819..f38d2c4 100644 --- a/pyls/tests/test_examples.py +++ b/pyls/tests/test_examples.py @@ -31,7 +31,7 @@ def test_available_datasets(): @pytest.mark.parametrize(('dataset', 'keys'), [ ('linnerud', [ 'description', 'reference', 'urls', 'X', 'Y', 'n_perm', 'n_boot' - ]) + ]), ('mirchi_2018', [ 'description', 'reference', 'urls', 'X', 'Y', 'n_perm', 'n_boot', 'test_size', 'test_split', 'parcellation' From 5ba7796cc3f4443135428f7b53c31850cd7f4b27 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Fri, 25 Jan 2019 15:52:05 -0500 Subject: [PATCH 23/32] [DOC] Updates user guide Still a long ways to go... --- docs/api.rst | 23 ++-- docs/conf.py | 1 + docs/usage.rst | 13 +- docs/user_guide/behavioral.rst | 221 +++++++++++++++++++++++++------ docs/user_guide/matlab.rst | 8 -- docs/user_guide/meancentered.rst | 31 ++++- docs/user_guide/results.rst | 37 +++++- 7 files changed, 262 insertions(+), 72 deletions(-) delete mode 100644 docs/user_guide/matlab.rst diff --git a/docs/api.rst b/docs/api.rst index 82c2acf..edc25dc 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,4 +1,4 @@ -.. _api: +.. _ref_api: Reference API ============= @@ -7,9 +7,12 @@ This is the primary reference of ``pyls``. Please refer to the :ref:`user guide ` for more information on how to best implement these functions in your own workflows. -.. _decomp_ref: +.. contents:: **List of modules** + :local: -:mod:`pyls` - PLS Decompositions +.. _ref_decomp: + +:mod:`pyls` - PLS decompositions -------------------------------------- .. automodule:: pyls.types @@ -25,9 +28,9 @@ own workflows. behavioral_pls meancentered_pls -.. _results_ref: +.. _ref_results: -:mod:`pyls.structures` - PLS Results Objects +:mod:`pyls.structures` - PLS data structures -------------------------------------------- .. automodule:: pyls.structures @@ -47,10 +50,10 @@ own workflows. PLSCrossValidationResults PLSInputs -.. _io_ref: +.. 
_ref_io: -:mod:`pyls.io` - Data I/O -------------------------- +:mod:`pyls.io` - Data I/O functionality +--------------------------------------- .. automodule:: pyls.io :no-members: @@ -65,9 +68,9 @@ own workflows. save_results load_results -.. _matlab_ref: +.. _ref_matlab: -:mod:`pyls.matlab` - Matlab Compatibility +:mod:`pyls.matlab` - Matlab compatibility ----------------------------------------- .. automodule:: pyls.matlab diff --git a/docs/conf.py b/docs/conf.py index 898b8c8..17657e7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -70,6 +70,7 @@ # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' +highlight_language = 'python3' # -- Options for HTML output ------------------------------------------------- diff --git a/docs/usage.rst b/docs/usage.rst index 5259d1a..f8d633f 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -11,11 +11,10 @@ resource. This user guide will go through the basic statistical concepts of and detail the two types of PLS implemented in the current package -(:ref:`usage_behavioral` and :ref:`usage_meancentered`), demonstrate how to -interpret and use the results of a PLS analysis (:ref:`usage_results`), and -provide some information on the compatibility of this toolbox with the -`Matlab PLS toolbox `_ -(:ref:`usage_matlab`). +(:ref:`usage_behavioral` and :ref:`usage_meancentered`) and demonstrate how to +interpret and use the results of a PLS analysis (:ref:`usage_results`). If you +still have questions after going through that you can refer to the +:ref:`ref_api`. .. toctree:: :caption: Table of Contents @@ -24,7 +23,3 @@ provide some information on the compatibility of this toolbox with the user_guide/behavioral.rst user_guide/meancentered.rst user_guide/results.rst - user_guide/matlab.rst - -Note that you can always refer to the :ref:`api` if you have questions about a -specific function! 
diff --git a/docs/user_guide/behavioral.rst b/docs/user_guide/behavioral.rst index 4d45571..23fa78d 100644 --- a/docs/user_guide/behavioral.rst +++ b/docs/user_guide/behavioral.rst @@ -1,45 +1,67 @@ +.. testsetup:: + + import numpy as np + np.set_printoptions(suppress=True) + .. _usage_behavioral: Behavioral PLS ============== What we call behavioral PLS in the ``pyls`` package is actually the more -traditional form of PLS, which attempts to find shared information between two -sets of variables. However, as with all things, there are a number of -ever-so-slightly different kinds of behavioral PLS that exist in the wild, so -to be thorough we're going to briefly explain the exact flavor implemented. +traditional form of PLS (and is generally not prefixed with "behavioral" --- +more on that later). This form of PLS, at its core, attempts to find shared +information between two sets of features from a common set of samples. However, +as with all things, there are a number of ever-so-slightly different kinds of +PLS that exist in the wild, so to be thorough we're going to briefly explain +the exact flavor implemented here before diving into a more illustrative +example. + +What *exactly* is behavioral PLS? +--------------------------------- :py:func:`pyls.behavioral_pls` employs a symmetrical, singular value -decomposition (SVD) based form of PLS. It is sometimes referred to as -PLS-correlation or PLS-SVD. Notably, it is **not** the same as PLS regression. -That is, we are not assessing *dependent* relationships between sets of data, -but rather how the two sets generally covary. +decomposition (SVD) based form of PLS, and is sometimes referred to as +PLS-correlation (PLS-C) or PLS-SVD. Notably, it is **not** the same as PLS +regression (PLS-R). Indeed, you can think of the differences between PLS-C and +PLS-R similar to how you might consider the differences between a Pearson +correlation and a simple linear regression. 
Though this analogy is a bit of a +simplification, the primary difference to take away is that behavioral PLS +(PLS-C) does *not assess directional relationships between sets of data* (e.g., +X → Y), but rather how the two sets generally covary (e.g., X ↔ Y). -To understand this a bit more we can walk through a quick example. +To understand this a bit more we can walk through a detailed example. An exercise in calisthenics --------------------------- -Let's assume we have two matrices :math:`X` and :math:`Y`. For the sake of -working with something concrete we're going to use one of our example -datasets [1]_: +.. note:: + Descriptions of PLS are almost always accompanied by a litany of equations, + and for good reason: understanding PLS requires understanding the math + behind it. As such, this example is going to rely on these equations, but + will always do so in the context of real data. The hope is that this + approach will help make the more abstract mathematical concepts a bit more + concrete (and easier to apply to new data sets!). + +We'll start by loading the example dataset [1]_: .. doctest:: - >>> import pyls.examples - >>> data = pyls.examples.load_dataset('linnerud') - >>> data - PLSInputs(X, Y, n_perm, n_boot) + >>> from pyls.examples import load_dataset + >>> data = load_dataset('linnerud') -.. note:: +This is the same dataset as in :py:func:`sklearn.datasets.load_linnerud`; the +formatting has just been lightly modified to better suit our purposes. - This is the same dataset as :py:func:`sklearn.datasets.load_linnerud`; the - formatting has been lightly modified to better suit our purposes. - -Looking at our matrices, we see: +Our ``data`` object can be treated as a dictionary, containing all the +information necessary to run a PLS analysis. The keys can be accessed as +attributes, so we can take a quick look at our input matrices +:math:`\textbf{X}` and :math:`\textbf{Y}`: .. 
doctest:: + >>> data.keys() + dict_keys(['X', 'Y', 'n_perm', 'n_boot']) >>> data.X.shape (20, 3) >>> data.X.head() @@ -50,8 +72,10 @@ Looking at our matrices, we see: 3 12.0 105.0 37.0 4 13.0 155.0 58.0 -The rows of our :math:`X` matrix here represent subjects, and the columns -indicate different types of exercises these subjects were able to perform. +The rows of our :math:`\textbf{X}_{n \times p}` matrix here represent *n* +subjects, and the columns indicate *p* different types of exercises these +subjects were able to perform. So the first subject was able to do 5 chin-ups, +162 situps, and 60 jumping jacks. .. doctest:: @@ -65,38 +89,153 @@ indicate different types of exercises these subjects were able to perform. 3 162.0 35.0 62.0 4 189.0 35.0 46.0 -The rows of our :math:`Y` matrix *also* represent subjects (critically, the -same subjects as in :math:`X`), and the columns indicate physiological -measurements taken for each subject. We can use behavioral PLS to establish -whether a relationship exists between the measured exercise and physiological -variables. +The rows of our :math:`\textbf{Y}_{n \times q}` matrix *also* represent *n* +subjects (critically, the same subjects as in :math:`\textbf{X}`), and the +columns indicate *q* physiological measurements taken for each subject. That +same subject referenced above thus has a weight of 191 pounds, a 36 inch waist, +and a resting pulse of 50 beats per minute. + +Behavioral PLS will attempt to establish whether a relationship exists between +the exercises performed and these physiological variables. If we wanted to run +the full analysis right away, we could do so with: + +.. doctest:: + + >>> from pyls import behavioral_pls + >>> results = behavioral_pls(**data) + +If you're comfortable with the down-and-dirty of PLS and want to go ahead and +start understanding the ``results`` object, feel free to jump ahead to +:ref:`usage_results`. 
Otherwise, read on for more about what's happening behind +the scenes of :py:func:`~.behavioral_pls` The cross-covariance matrix --------------------------- -Behavioral PLS works by decomposing the cross-covariance matrix, :math:`R`, -generated from the input matrices, where :math:`R = Y^{T} \times X`. The -results of PLS are a bit easier to interpret when :math:`R` is the -cross-correlation matrix instead of the cross-covariance matrix, which means -that we should z-score each feature in :math:`X` and :math:`Y` before -multiplying them; this is done automatically by :py:func:`pyls.behavioral_pls` -(but can be turned off by passing the ``covariance=True`` parameter). +Behavioral PLS works by decomposing the cross-covariance matrix +:math:`\textbf{R}_{q \times p}` generated from the input matrices, where +:math:`\textbf{R} = \textbf{Y}^{T} \textbf{X}`. The results of PLS are a +bit easier to interpret when :math:`\textbf{R}` is the cross-correlation matrix +instead of the cross-covariance matrix, which means that we should z-score each +feature in :math:`\textbf{X}` and :math:`\textbf{Y}` before multiplying them; +this is done automatically by the :py:func:`~.behavioral_pls` function. -In our example, :math:`R` ends up being a 3 x 3 matrix. Note that we pass -``norm=False`` to the cross-correlation function +In our example, :math:`\textbf{R}` ends up being a 3 x 3 matrix: .. doctest:: >>> from pyls.compute import xcorr - >>> R = xcorr(data.X, data.Y, norm=False) + >>> R = xcorr(data.X, data.Y) >>> R array([[-0.38969365, -0.49308365, -0.22629556], [-0.55223213, -0.64559803, -0.19149937], [ 0.15064802, 0.22503808, 0.03493306]]) -Examining the first row, we can see that -0.3897 represents the correlation -between ``Chins`` and ``Weight`` across all the subjects, -0.4931 the -correlation between ``Situps`` and ``Weight``, and so on. +The :math:`q` rows of this matrix correspond to the physiological measurements +and the :math:`p` columns to the exercises. 
Examining the first row, we can see +that ``-0.38969365`` is the correlation between ``Weight`` and ``Chins`` across +all the subjects, ``-0.49308365`` the correlation between ``Weight`` and +``Situps``, and so on. + +Singular value decomposition +---------------------------- + +Once we have generated our correlation matrix :math:`\textbf{R}` we subject it +to a singular value decomposition, where :math:`\textbf{R} = \textbf{USV}^{T}`: + +.. doctest:: + + >>> from pyls.compute import svd + >>> U, S, V = svd(R) + >>> U.shape, S.shape, V.shape + ((3, 3), (3, 3), (3, 3)) + +The outputs of this decomposition are two arrays of left and right singular +vectors (:math:`\textbf{U}_{p \times l}` and :math:`\textbf{V}_{q \times l}`) +and a diagonal matrix of singular values (:math:`\textbf{S}_{l \times l}`). The +rows of :math:`\textbf{U}` correspond to the exercises from our input matrix +:math:`\textbf{X}`, and the rows of :math:`\textbf{V}` correspond to the +physiological measurements from our input matrix :math:`\textbf{Y}`. The +columns of :math:`\textbf{U}` and :math:`\textbf{V}`, on the other hand, +represent new dimensions or components that have been "discovered" in the data. + +The :math:`i^{th}` columns of :math:`\textbf{U}` and :math:`\textbf{V}` weigh +the contributions of these exercises and physiological measurements, +respectively. Taken together, the :math:`i^{th}` left and right singular +vectors and singular value represent a *latent variable*, a multivariate +pattern that weighs the original exercise and physiological measurements such +that they maximally covary with each other. + +The :math:`i^{th}` singular value is proportional to the total +exercise-physiology covariance accounted for by the latent variable. The +effect size (:math:`\eta`) associated with a particular latent variable can be +estimated as the ratio of the squared singular value (:math:`\sigma`) to the +sum of all the squared singular values: + +.. 
math:: + + \eta_{i} = \sigma_{i}^{2} \big/ \sum \limits_{j=1}^{l} \sigma_{j}^{2} + +We can use the helper function :py:func:`pyls.compute.varexp` to calculate this +for us: + +.. doctest:: + + >>> from pyls.compute import varexp + >>> varexp(S)[0, 0] + 0.99471333682479335 + +Taking a look at the variance explained, we see that a whopping ~99.5% of the +covariance between the exercises and physiological measurements in +:math:`\textbf{X}` and :math:`\textbf{Y}` are explained by this latent +variable, suggesting that the relationship between these variable can be +effectively explained by a single dimension. + +Examining the weights from the singular vectors: + +.. doctest:: + + >>> U[:, 0] + array([ 0.61330742, 0.7469717 , 0.25668519]) + >>> V[:, 0] + array([-0.58989118, -0.77134059, 0.23887675]) + +we see that all the exercises (``U[:, 0]``) are positively weighted, but that +the physiological measurements (``V[:, 0]``) are split, with ``Weight`` and +``Waist`` measurements negatively weighted and ``Pulse`` positively weighted. +(Note that the order of the weights is the same as the order of the original +columns in our :math:`\textbf{X}` and :math:`\textbf{Y}` matrices.) Taken +together this suggests that, for the subjects in this dataset, individuals who +completed more of a given exercise tended to: + +1. Complete more of the other exercises, and +2. Have a lower weight, smaller waist, and higher heart rate. + +It is also worth examining how correlated the projections of the original +variables on this latent variable are. To do that, we can multiply the original +data matrices by the relevant singular vectors and then correlate the results: + +.. 
doctest:: + + >>> from scipy.stats import pearsonr + >>> XU = np.dot(data.X, U) + >>> YV = np.dot(data.Y, V) + >>> pearsonr(XU[:, 0], YV[:, 0]) + (0.48997247845503833, 0.028304653097330421) + +The correlation value of this latent variable (~ ``0.49`` ) suggests that our +interpretation of the singular vectors weights, above, is moderately true. + +Latent variable significance testing +------------------------------------ + +How can we check that these latent variables are significant? That is, that +the likelihood of them explaining *this much* variance is greater than chance +(to some prespecified alpha)? + +Reliability of the singular vectors +----------------------------------- + .. [1] Tenenhaus, M. (1998). La régression PLS: théorie et pratique. Editions technip. diff --git a/docs/user_guide/matlab.rst b/docs/user_guide/matlab.rst deleted file mode 100644 index 052c7a3..0000000 --- a/docs/user_guide/matlab.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. _usage_matlab: - -Matlab compatibility -==================== - -``pyls`` supports loading PLS results that were generated by the `Matlab PLS -toolbox `_. - diff --git a/docs/user_guide/meancentered.rst b/docs/user_guide/meancentered.rst index 3f1810f..6bad1b4 100644 --- a/docs/user_guide/meancentered.rst +++ b/docs/user_guide/meancentered.rst @@ -3,11 +3,38 @@ Mean-centered PLS ================= -In contrast to behavioral PLS, mean-centered_PLS doesn't aim to find +In contrast to behavioral PLS, mean-centered PLS doesn't aim to find relationships between two sets of variables. Instead, it tries to find relationships between *groupings* in a single set of variables. Indeed, you can think of it almost like a multivariate t-test or ANOVA (depending on how many -groups you have!). +groups you have). An oenological example ---------------------- + +.. 
doctest:: + + >>> from pyls.examples import load_dataset + >>> data = load_dataset('wine') + +This is the same dataset as in :py:func:`sklearn.datasets.load_wine`; the +formatting has just been lightly modified to better suit our purposes. + +Our ``data`` object can be treated as a dictionary, containing all the +information necessary to run a PLS analysis. The keys can be accessed as +attributes, so we can take a quick look at our input matrix: + +.. doctest:: + + >>> data.keys() + dict_keys(['X', 'n_perm', 'n_boot', 'groups']) + >>> data.X.shape + (178, 13) + >>> data.X.columns + Index(['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', + 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', + 'proanthocyanins', 'color_intensity', 'hue', + 'od280/od315_of_diluted_wines', 'proline'], + dtype='object') + >>> data.groups + [59, 71, 48] diff --git a/docs/user_guide/results.rst b/docs/user_guide/results.rst index 0673d8c..e6f57a2 100644 --- a/docs/user_guide/results.rst +++ b/docs/user_guide/results.rst @@ -1,4 +1,37 @@ .. _usage_results: -The ``PLSResults`` data object -============================== +PLS Results +=========== + +So you ran a PLS analysis and got some results. Congratulations! The easy part +is done. 🙃 Interpreting (trying to interpret) the results of a PLS +analysis---similar to interpreting the results of a PCA or factor analysis or +CCA or any other complex decomposition---can be difficult. The ``pyls`` package +contains some functions, tools, and data structures to try and help. + +The :py:class:`~.structures.PLSResults` data structure is, at its core, a +Python dictionary that is designed to contain all possible results from any of +the analyses available in :py:mod:`pyls.types`. Let's generate a small example +results object to play around with. We'll use the dataset from the +:ref:`usage_behavioral` example: + +.. 
doctest:: + + >>> from pyls.examples import load_dataset + >>> data = load_dataset('linnerud') + +We can generate the results file by running the behavioral PLS analysis again. +We pass the ``verbose=False`` flag to suppress the progress bar that would +normally be displayed: + +.. doctest:: + + >>> from pyls import behavioral_pls + >>> results = behavioral_pls(**data, verbose=False) + >>> results + PLSResults(u, s, v, brainscores, behavscores, behavcorr, permres, bootres, cvres, inputs) + +Printing the ``results`` object gives us a helpful view of some of the +different outputs available to us. While we won't go into detail about all of +these (see the :ref:`ref_api` for info on those), we'll touch on a few of the +potentially more confusing ones. From 8efb6ab6cfea7f4580e896e8df48150de0d3d690 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Fri, 25 Jan 2019 15:52:32 -0500 Subject: [PATCH 24/32] [ENH] Edits default n_split for pls types --- pyls/structures.py | 3 +-- pyls/types/behavioral.py | 2 +- pyls/types/meancentered.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pyls/structures.py b/pyls/structures.py index d5716bc..17b084b 100644 --- a/pyls/structures.py +++ b/pyls/structures.py @@ -58,8 +58,7 @@ Default: 5000 n_split : int, optional Number of split-half resamples to assess during permutation testing. - This also controls the number of train/test splits examined during - cross-validation if :attr:`test_size` is not zero. 
Default: 100\ + Default: 0\ """), cross_val=dedent("""\ test_split : int, optional diff --git a/pyls/types/behavioral.py b/pyls/types/behavioral.py index ef2b8a8..8f216af 100644 --- a/pyls/types/behavioral.py +++ b/pyls/types/behavioral.py @@ -254,7 +254,7 @@ def run_pls(self, X, Y): # let's make it a function def behavioral_pls(X, Y, *, groups=None, n_cond=1, n_perm=5000, n_boot=5000, - n_split=100, test_size=0.25, test_split=100, + n_split=0, test_size=0.25, test_split=100, covariance=False, rotate=True, ci=95, seed=None, verbose=True, n_proc=None, **kwargs): pls = BehavioralPLS(X=X, Y=Y, groups=groups, n_cond=n_cond, diff --git a/pyls/types/meancentered.py b/pyls/types/meancentered.py index 4aa3c90..66f767d 100644 --- a/pyls/types/meancentered.py +++ b/pyls/types/meancentered.py @@ -154,7 +154,7 @@ def run_pls(self, X, Y): def meancentered_pls(X, *, groups=None, n_cond=1, mean_centering=0, - n_perm=5000, n_boot=5000, n_split=100, rotate=True, ci=95, + n_perm=5000, n_boot=5000, n_split=0, rotate=True, ci=95, seed=None, verbose=True, n_proc=None, **kwargs): pls = MeanCenteredPLS(X=X, groups=groups, n_cond=n_cond, mean_centering=mean_centering, From b37bd9c178c9880b5b65fd2314a9f8cf361bba45 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Fri, 25 Jan 2019 15:53:07 -0500 Subject: [PATCH 25/32] [ENH] Moves SVD out of base.py --- pyls/base.py | 17 +------ pyls/compute.py | 126 +++++++++++++++++++++++++++--------------------- 2 files changed, 74 insertions(+), 69 deletions(-) diff --git a/pyls/base.py b/pyls/base.py index bd89310..22b3782 100644 --- a/pyls/base.py +++ b/pyls/base.py @@ -3,7 +3,6 @@ import gc import warnings import numpy as np -from sklearn.utils.extmath import randomized_svd from sklearn.utils.validation import check_random_state from . 
import compute, structures, utils @@ -426,21 +425,9 @@ def svd(self, X, Y, groups=None, seed=None): # generate cross-covariance matrix and determine # of components crosscov = self.gen_covcorr(X, Y, groups=groups) - n_comp = min(crosscov.shape) - - # run most computationally efficient SVD - if crosscov.shape[0] <= crosscov.shape[1]: - U, d, V = randomized_svd(crosscov.T, - n_components=n_comp, - random_state=check_random_state(seed)) - V = V.T - else: - V, d, U = randomized_svd(crosscov, - n_components=n_comp, - random_state=check_random_state(seed)) - U = U.T + U, d, V = compute.svd(crosscov) - return U, np.diag(d), V + return U, d, V def bootstrap(self, X, Y, seed=None): """ diff --git a/pyls/compute.py b/pyls/compute.py index 3228512..6be8147 100644 --- a/pyls/compute.py +++ b/pyls/compute.py @@ -1,11 +1,56 @@ # -*- coding: utf-8 -*- import numpy as np +from scipy.stats import zscore, zmap from sklearn.utils.extmath import randomized_svd -from sklearn.utils.validation import check_array, check_X_y +from sklearn.utils.validation import check_X_y, check_random_state from pyls import utils +def svd(crosscov, n_components=None, seed=None): + """ + Calculates the SVD of `crosscov` and returns singular vectors/values + + Parameters + ---------- + crosscov : (B, T) array_like + Cross-covariance (or cross-correlation) matrix to be decomposed + n_components : int, optional + Number of components to retain from decomposition + seed : {int, :obj:`numpy.random.RandomState`, None}, optional + Seed for random number generation. 
Default: None + + Returns + ------- + U : (B, L) `numpy.ndarray` + Left singular vectors from singular value decomposition + d : (L, L) `numpy.ndarray` + Diagonal array of singular values from singular value decomposition + V : (J, L) `numpy.ndarray` + Right singular vectors from singular value decomposition + """ + + seed = check_random_state(seed) + + if n_components is None: + n_components = min(crosscov.shape) + elif not isinstance(n_components, int): + raise TypeError('Provided `n_components` {} must be of type int' + .format(n_components)) + + # run most computationally efficient SVD + if crosscov.shape[0] <= crosscov.shape[1]: + U, d, V = randomized_svd(crosscov.T, n_components=n_components, + random_state=seed, transpose=False) + V = V.T + else: + V, d, U = randomized_svd(crosscov, n_components=n_components, + random_state=seed, transpose=False) + U = U.T + + return U, np.diag(d), V + + def xcorr(X, Y, norm=False, covariance=False): """ Calculates the cross-covariance matrix of `X` and `Y` @@ -32,68 +77,18 @@ def xcorr(X, Y, norm=False, covariance=False): X, Y = check_X_y(X, Y, multi_output=True) if not covariance: - Xn, Yn = zscore(X), zscore(Y) + Xn, Yn = zscore(X, ddof=1), zscore(Y, ddof=1) else: Xn, Yn = X - X.mean(0, keepdims=True), Y - Y.mean(0, keepdims=True) if norm: Xn, Yn = normalize(Xn), normalize(Yn) + xprod = (Yn.T @ Xn) / (len(Xn) - 1) return xprod -def zscore(data, axis=0, ddof=1, comp=None): - """ - Z-scores `X` by subtracting mean and dividing by standard deviation - - Effectively the same as `np.nan_to_num(scipy.stats.zscore(X))` but - handles DivideByZero without issuing annoying warnings. - - Parameters - ---------- - data : (N, ...) array_like - Data to be z-scored - axis : int, optional - Axis along which to z-score. Default: 0 - ddof : int, optional - Delta degrees of freedom. The divisor used in calculations is - `M - ddof`, where `M` is the number of elements along `axis` in - `comp`. Default: 1 - comp : (M, ...) 
array_like - Distribution to z-score `data`. Should have same dimension as `data` - along every axis except `axis`. If not provided, `data` will be used. - Default: None - - Returns - ------- - zarr : (N, ...) `numpy.ndarray` - Z-scored version of `data` - """ - - data = check_array(data, ensure_2d=False, allow_nd=True) - - # check if z-score against another distribution or self - if comp is not None: - comp = check_array(comp, ensure_2d=False, allow_nd=True) - else: - comp = data - - avg = comp.mean(axis=axis, keepdims=True) - stdev = comp.std(axis=axis, ddof=ddof, keepdims=True) - - # avoid DivideByZero errors - zeros = stdev == 0 - if np.any(zeros): - avg[zeros] = 0 - stdev[zeros] = 1 - - zarr = (data - avg) / stdev - zarr[np.repeat(zeros, zarr.shape[axis], axis=axis)] = 0 - - return zarr - - def normalize(X, axis=0): """ Normalizes `X` along `axis` @@ -145,7 +140,7 @@ def rescale_test(X_train, X_test, Y_train, U, V): Behavioral matrix, where `S2` is observations and `T` is features """ - X_resc = zscore(X_test, comp=X_train, axis=0, ddof=1) + X_resc = zmap(X_test, compare=X_train, ddof=1) Y_pred = (X_resc @ U @ V.T) + Y_train.mean(axis=0, keepdims=True) return Y_pred @@ -390,3 +385,26 @@ def efficient_corr(x, y): corr[corr < -1] = -1 return corr + + +def varexp(singular): + """ + Calculates the variance explained by values in `singular` + + Parameters + ---------- + singular : (L, L) array_like + Singular values from singular value decomposition + + Returns + ------- + varexp : (L, L) `numpy.ndarray` + Variance explained + """ + + if singular.ndim != 2: + raise ValueError('Provided `singular` array must be a square diagonal ' + 'matrix, not array of shape {}' + .format(singular.shape)) + + return np.diag(np.diag(singular)**2 / np.sum(np.diag(singular)**2)) From 4d1b421f16985772c4bf96e8913be874c75767e2 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Fri, 25 Jan 2019 15:53:20 -0500 Subject: [PATCH 26/32] [TEST] No more manual zscore function --- 
pyls/tests/test_compute.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pyls/tests/test_compute.py b/pyls/tests/test_compute.py index efdcfc7..7868385 100644 --- a/pyls/tests/test_compute.py +++ b/pyls/tests/test_compute.py @@ -7,15 +7,6 @@ rs = np.random.RandomState(1234) -def test_zscore(): - out = pyls.compute.zscore([[1]] * 10) - assert np.allclose(out, 0) - - out = pyls.compute.zscore(rs.rand(10, 10)) - assert out.shape == (10, 10) - assert not np.allclose(out, 0) - - def test_normalize(): X = rs.rand(10, 10) out = pyls.compute.normalize(X, axis=0) From 7c87234c6f9722883e48faf16a5d335d30080d22 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Fri, 19 Apr 2019 14:23:07 -0400 Subject: [PATCH 27/32] [STY] Minor compute.xcorr update Independently zscores (not scipy.stats.zscore) to retain input data structures (e.g., pandas dataframes). Good for documentation demonstration purposes to show cross-correlation matrices. Minor update to compute.efficient_corr to use np.clip, as well. 
--- pyls/compute.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pyls/compute.py b/pyls/compute.py index 6be8147..ce687ba 100644 --- a/pyls/compute.py +++ b/pyls/compute.py @@ -31,6 +31,7 @@ def svd(crosscov, n_components=None, seed=None): """ seed = check_random_state(seed) + crosscov = np.asanyarray(crosscov) if n_components is None: n_components = min(crosscov.shape) @@ -74,10 +75,14 @@ def xcorr(X, Y, norm=False, covariance=False): Cross-covariance of `X` and `Y` """ - X, Y = check_X_y(X, Y, multi_output=True) + check_X_y(X, Y, multi_output=True) + # we could just use scipy.stats zscore but if we do this we retain the + # original data structure; if pandas dataframes were given, a dataframe + # will be returned if not covariance: - Xn, Yn = zscore(X, ddof=1), zscore(Y, ddof=1) + Xn = (X - X.mean(axis=0)) / X.std(axis=0, ddof=1) + Yn = (Y - Y.mean(axis=0)) / Y.std(axis=0, ddof=1) else: Xn, Yn = X - X.mean(0, keepdims=True), Y - Y.mean(0, keepdims=True) @@ -381,8 +386,7 @@ def efficient_corr(x, y): corr = np.sum(zscore(x, ddof=1) * zscore(y, ddof=1), axis=0) / (len(x) - 1) # fix rounding errors - corr[corr > 1] = 1 - corr[corr < -1] = -1 + corr = np.clip(corr, -1, 1) return corr From 7a905a5e2b2584b828203695da2f9e40b7822c4d Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Fri, 19 Apr 2019 14:24:49 -0400 Subject: [PATCH 28/32] [DOC] Behavioral PLS user guide update Still very much a WIP, but...coming along to a degree? 
--- docs/user_guide/behavioral.rst | 88 ++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 36 deletions(-) diff --git a/docs/user_guide/behavioral.rst b/docs/user_guide/behavioral.rst index 23fa78d..23eb5da 100644 --- a/docs/user_guide/behavioral.rst +++ b/docs/user_guide/behavioral.rst @@ -9,26 +9,30 @@ Behavioral PLS ============== What we call behavioral PLS in the ``pyls`` package is actually the more -traditional form of PLS (and is generally not prefixed with "behavioral" --- -more on that later). This form of PLS, at its core, attempts to find shared -information between two sets of features from a common set of samples. However, -as with all things, there are a number of ever-so-slightly different kinds of -PLS that exist in the wild, so to be thorough we're going to briefly explain -the exact flavor implemented here before diving into a more illustrative -example. - -What *exactly* is behavioral PLS? ---------------------------------- - -:py:func:`pyls.behavioral_pls` employs a symmetrical, singular value -decomposition (SVD) based form of PLS, and is sometimes referred to as -PLS-correlation (PLS-C) or PLS-SVD. Notably, it is **not** the same as PLS -regression (PLS-R). Indeed, you can think of the differences between PLS-C and -PLS-R similar to how you might consider the differences between a Pearson -correlation and a simple linear regression. Though this analogy is a bit of a -simplification, the primary difference to take away is that behavioral PLS -(PLS-C) does *not assess directional relationships between sets of data* (e.g., -X → Y), but rather how the two sets generally covary (e.g., X ↔ Y). +traditional form of PLS (and is generally not prefixed with "behavioral"). This +form of PLS, at its core, attempts to find shared information between two sets +of features derived from a common set of samples. 
However, as with all things, +there are a number of ever-so-slightly different kinds of PLS that exist in the +wild, so to be thorough we're going to briefly explain the exact flavor +implemented here before diving into a more illustrative example. + +What *exactly* is "behavioral PLS"? +----------------------------------- + +**Technical answer**: :py:func:`pyls.behavioral_pls` employs a symmetrical, +singular value decomposition (SVD) based form of PLS, and is sometimes referred +to as PLS-correlation (PLS-C), PLS-SVD, or, infrequently, EZ-PLS. Notably, it +is **not** the same as PLS regression (PLS-R). + +**Less technical answer**: :py:func:`pyls.behavioral_pls` is like performing a +correlation when you have two datasets each with multiple features. + +You can think of the differences between PLS-C and PLS-R similar to how you +might consider the differences between a Pearson correlation and a simple +linear regression. Though this analogy is a bit of a simplification, the +primary difference to take away is that behavioral PLS (PLS-C) does *not* +*assess directional relationships between sets of data* (e.g., X → Y), but +rather looks at how the two sets generally covary (e.g., X ↔ Y). To understand this a bit more we can walk through a detailed example. @@ -37,11 +41,12 @@ An exercise in calisthenics .. note:: Descriptions of PLS are almost always accompanied by a litany of equations, - and for good reason: understanding PLS requires understanding the math - behind it. As such, this example is going to rely on these equations, but - will always do so in the context of real data. The hope is that this - approach will help make the more abstract mathematical concepts a bit more - concrete (and easier to apply to new data sets!). + and for good reason: understanding how to interpret the results of a PLS + requires at least a cursory understanding of the math behind it. 
As such, + this example is going to rely on these equations, but will always do so in + the context of real data. The hope is that this approach will help make the + more abstract mathematical concepts a bit more concrete (and easier to + apply to new data sets!). We'll start by loading the example dataset [1]_: @@ -93,7 +98,7 @@ The rows of our :math:`\textbf{Y}_{n \times q}` matrix *also* represent *n* subjects (critically, the same subjects as in :math:`\textbf{X}`), and the columns indicate *q* physiological measurements taken for each subject. That same subject referenced above thus has a weight of 191 pounds, a 36 inch waist, -and a resting pulse of 50 beats per minute. +and a pulse of 50 beats per minute. Behavioral PLS will attempt to establish whether a relationship exists between the exercises performed and these physiological variables. If we wanted to run @@ -127,14 +132,15 @@ In our example, :math:`\textbf{R}` ends up being a 3 x 3 matrix: >>> from pyls.compute import xcorr >>> R = xcorr(data.X, data.Y) >>> R - array([[-0.38969365, -0.49308365, -0.22629556], - [-0.55223213, -0.64559803, -0.19149937], - [ 0.15064802, 0.22503808, 0.03493306]]) + Chins Situps Jumps + Weight -0.389694 -0.493084 -0.226296 + Waist -0.552232 -0.645598 -0.191499 + Pulse 0.150648 0.225038 0.034933 The :math:`q` rows of this matrix correspond to the physiological measurements and the :math:`p` columns to the exercises. Examining the first row, we can see -that ``-0.38969365`` is the correlation between ``Weight`` and ``Chins`` across -all the subjects, ``-0.49308365`` the correlation between ``Weight`` and +that ``-0.389694`` is the correlation between ``Weight`` and ``Chins`` across +all the subjects, ``-0.493084`` the correlation between ``Weight`` and ``Situps``, and so on. Singular value decomposition @@ -159,6 +165,8 @@ physiological measurements from our input matrix :math:`\textbf{Y}`. 
The columns of :math:`\textbf{U}` and :math:`\textbf{V}`, on the other hand, represent new dimensions or components that have been "discovered" in the data. + + The :math:`i^{th}` columns of :math:`\textbf{U}` and :math:`\textbf{V}` weigh the contributions of these exercises and physiological measurements, respectively. Taken together, the :math:`i^{th}` left and right singular @@ -223,19 +231,27 @@ data matrices by the relevant singular vectors and then correlate the results: >>> pearsonr(XU[:, 0], YV[:, 0]) (0.48997247845503833, 0.028304653097330421) -The correlation value of this latent variable (~ ``0.49`` ) suggests that our -interpretation of the singular vectors weights, above, is moderately true. +The correlation value of this latent variable (~``0.49``) suggests that our +interpretation of the singular vectors weights, above, is only *somewhat* +accurate. We can think of this correlation (ranging from -1 to 1) as a proxy +for the question: "how often is this interpretation of the singular vectors +true?" Correlations closer to -1 indicate that the interpretation is largely +inaccurate across subjects, whereas correlations closer to 1 indicate the +interpretation is largely accurate across subjects. Latent variable significance testing ------------------------------------ -How can we check that these latent variables are significant? That is, that -the likelihood of them explaining *this much* variance is greater than chance -(to some prespecified alpha)? +How can we check that these latent variables are significant? That is, how can +we check that the likelihood of each latent variable explaining as much +variance as it does is greater than we would expect at random? + +COMING SOON Reliability of the singular vectors ----------------------------------- +COMING SOON .. [1] Tenenhaus, M. (1998). La régression PLS: théorie et pratique. Editions technip. 
From b51f0029881b4e1e1877bd273db3eea437103afe Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Mon, 23 Sep 2019 12:01:55 -0400 Subject: [PATCH 29/32] [DOC] Better doc handling + user guide updates Some minor maintenance that makes working with the documentation a bit easier (e.g., 'make clean html') and some updates to the user guide! --- .gitignore | 2 +- docs/Makefile | 8 +++++- docs/api.rst | 39 +++++++++++++++------------- docs/index.rst | 25 ++++++++---------- docs/installation.rst | 3 ++- docs/requirements.txt | 2 +- docs/usage.rst | 4 ++- docs/user_guide/behavioral.rst | 47 ++++++++++++++++++++++++---------- 8 files changed, 79 insertions(+), 51 deletions(-) diff --git a/.gitignore b/.gitignore index 9068f25..2a5d4e2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ plsc/ .vscode/ -docs/_generated/ +docs/generated/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/docs/Makefile b/docs/Makefile index 8bc7c80..0451d6f 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -14,7 +14,13 @@ help: .PHONY: help Makefile +# For getting rid of generated docs before re-building +clean: + rm -rf $(BUILDDIR)/* auto_examples/ generated/ + +.PHONY: clean + # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/api.rst b/docs/api.rst index edc25dc..c001fac 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,7 +1,10 @@ .. _ref_api: +.. currentmodule:: pyls + +------------- Reference API -============= +------------- This is the primary reference of ``pyls``. Please refer to the :ref:`user guide ` for more information on how to best implement these functions in your @@ -23,10 +26,10 @@ own workflows. .. 
autosummary:: :template: function.rst - :toctree: _generated/ + :toctree: generated/ - behavioral_pls - meancentered_pls + pyls.behavioral_pls + pyls.meancentered_pls .. _ref_results: @@ -41,14 +44,14 @@ own workflows. .. autosummary:: :template: class.rst - :toctree: _generated/ + :toctree: generated/ - PLSResults - PLSPermResults - PLSBootResults - PLSSplitHalfResults - PLSCrossValidationResults - PLSInputs + pyls.structures.PLSResults + pyls.structures.PLSPermResults + pyls.structures.PLSBootResults + pyls.structures.PLSSplitHalfResults + pyls.structures.PLSCrossValidationResults + pyls.structures.PLSInputs .. _ref_io: @@ -59,14 +62,14 @@ own workflows. :no-members: :no-inherited-members: -.. currentmodule:: pyls.io +.. currentmodule:: pyls .. autosummary:: :template: function.rst - :toctree: _generated/ + :toctree: generated/ - save_results - load_results + pyls.save_results + pyls.load_results .. _ref_matlab: @@ -77,10 +80,10 @@ own workflows. :no-members: :no-inherited-members: -.. currentmodule:: pyls.matlab +.. currentmodule:: pyls .. autosummary:: :template: function.rst - :toctree: _generated/ + :toctree: generated/ - import_matlab_result + pyls.import_matlab_result diff --git a/docs/index.rst b/docs/index.rst index 0e3f4ed..9c59070 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -2,22 +2,19 @@ pyls: Partial Least Squares in Python ===================================== -.. only:: html and epub - - .. image:: https://travis-ci.org/rmarkello/pyls.svg?branch=master - :target: https://travis-ci.org/rmarkello/pyls - .. image:: https://circleci.com/gh/rmarkello/pyls.svg?style=shield - :target: https://circleci.com/gh/rmarkello/pyls - .. image:: https://codecov.io/gh/rmarkello/pyls/branch/master/graph/badge.svg - :target: https://codecov.io/gh/rmarkello/pyls - .. image:: https://readthedocs.org/projects/pyls/badge/?version=latest - :target: http://pyls.readthedocs.io/en/latest - .. 
image:: http://img.shields.io/badge/License-GPL%202.0-blue.svg - :target: https://opensource.org/licenses/GPL-2.0 - +.. image:: https://travis-ci.org/rmarkello/pyls.svg?branch=master + :target: https://travis-ci.org/rmarkello/pyls +.. image:: https://circleci.com/gh/rmarkello/pyls.svg?style=shield + :target: https://circleci.com/gh/rmarkello/pyls +.. image:: https://codecov.io/gh/rmarkello/pyls/branch/master/graph/badge.svg + :target: https://codecov.io/gh/rmarkello/pyls +.. image:: https://readthedocs.org/projects/pyls/badge/?version=latest + :target: http://pyls.readthedocs.io/en/latest +.. image:: http://img.shields.io/badge/License-GPL%202.0-blue.svg + :target: https://opensource.org/licenses/GPL-2.0 .. toctree:: - :maxdepth: 1 + :maxdepth: 2 installation usage diff --git a/docs/installation.rst b/docs/installation.rst index d7c18a4..a4552e1 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -1,7 +1,8 @@ .. _installation_setup: +---------------------- Installation and setup -====================== +---------------------- .. _basic_installation: diff --git a/docs/requirements.txt b/docs/requirements.txt index ed30bc9..803b1f3 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,3 @@ -r ../requirements.txt -sphinx>=1.2 +sphinx>=2.0 sphinx_rtd_theme diff --git a/docs/usage.rst b/docs/usage.rst index f8d633f..01bb76d 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -1,7 +1,8 @@ .. _usage: +---------- User guide -========== +---------- Partial least squares (PLS) is a multivariate statistical technique that aims to find shared information between two sets of variables. If you're unfamiliar @@ -19,6 +20,7 @@ still have questions after going through that you can refer to the .. 
toctree:: :caption: Table of Contents :numbered: + :maxdepth: 2 user_guide/behavioral.rst user_guide/meancentered.rst diff --git a/docs/user_guide/behavioral.rst b/docs/user_guide/behavioral.rst index 23eb5da..ef50d4e 100644 --- a/docs/user_guide/behavioral.rst +++ b/docs/user_guide/behavioral.rst @@ -8,6 +8,13 @@ Behavioral PLS ============== +Running a behavioral PLS using ``pyls`` is as simple as: + +.. code-block:: + + >>> import pyls + >>> out = pyls.behavioral_pls(X, Y) + What we call behavioral PLS in the ``pyls`` package is actually the more traditional form of PLS (and is generally not prefixed with "behavioral"). This form of PLS, at its core, attempts to find shared information between two sets @@ -16,8 +23,8 @@ there are a number of ever-so-slightly different kinds of PLS that exist in the wild, so to be thorough we're going to briefly explain the exact flavor implemented here before diving into a more illustrative example. -What *exactly* is "behavioral PLS"? ------------------------------------ +What *exactly* do we mean by "behavioral PLS"? +---------------------------------------------- **Technical answer**: :py:func:`pyls.behavioral_pls` employs a symmetrical, singular value decomposition (SVD) based form of PLS, and is sometimes referred @@ -25,14 +32,18 @@ to as PLS-correlation (PLS-C), PLS-SVD, or, infrequently, EZ-PLS. Notably, it is **not** the same as PLS regression (PLS-R). **Less technical answer**: :py:func:`pyls.behavioral_pls` is like performing a -correlation when you have two datasets each with multiple features. +principal components analysis (PCA) but when you have two related datasets, +each with multiple features. + +Differences from PLS regression (PLS-R) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You can think of the differences between PLS-C and PLS-R similar to how you might consider the differences between a Pearson correlation and a simple -linear regression. 
Though this analogy is a bit of a simplification, the -primary difference to take away is that behavioral PLS (PLS-C) does *not* -*assess directional relationships between sets of data* (e.g., X → Y), but -rather looks at how the two sets generally covary (e.g., X ↔ Y). +linear regression. Though this analogy is an over-simplification, the primary +difference to take away is that behavioral PLS (PLS-C) does *not assess* +*directional relationships between sets of data* (e.g., X → Y), but rather +looks at how the two sets generally covary (e.g., X ↔ Y). To understand this a bit more we can walk through a detailed example. @@ -165,7 +176,7 @@ physiological measurements from our input matrix :math:`\textbf{Y}`. The columns of :math:`\textbf{U}` and :math:`\textbf{V}`, on the other hand, represent new dimensions or components that have been "discovered" in the data. - +.. The :math:`i^{th}` columns of :math:`\textbf{U}` and :math:`\textbf{V}` weigh the contributions of these exercises and physiological measurements, @@ -231,7 +242,7 @@ data matrices by the relevant singular vectors and then correlate the results: >>> pearsonr(XU[:, 0], YV[:, 0]) (0.48997247845503833, 0.028304653097330421) -The correlation value of this latent variable (~``0.49``) suggests that our +The correlation value of this latent variable (~ ``0.49``) suggests that our interpretation of the singular vectors weights, above, is only *somewhat* accurate. We can think of this correlation (ranging from -1 to 1) as a proxy for the question: "how often is this interpretation of the singular vectors @@ -242,16 +253,24 @@ interpretation is largely accurate across subjects. Latent variable significance testing ------------------------------------ -How can we check that these latent variables are significant? That is, how can -we check that the likelihood of each latent variable explaining as much -variance as it does is greater than we would expect at random? 
+Scientists love null-hypothesis significance testing, so there's a strong urge +for researchers doing these sorts of analyses to want to find a way to +determine whether observed latent variables are significant(ly different from a +specified null model). The issue comes in determining what aspect of the latent +variables to test! + +With behavioral PLS we assess whether the **variance explained** by a given +latent variable is significantly different than would be expected by a null. +Importantly, that null is generated by re-computing the latent variables from +random permutations of the original data, generating a non-parametric +distribution of explained variances by which to measure "significance." -COMING SOON +.. Reliability of the singular vectors ----------------------------------- -COMING SOON + .. [1] Tenenhaus, M. (1998). La régression PLS: théorie et pratique. Editions technip. From a6a9e673dadf40e3c997026a3853a84365bbded7 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Wed, 25 Sep 2019 16:33:57 -0400 Subject: [PATCH 30/32] [REF] Import pyls.examples to main namespace --- pyls/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyls/__init__.py b/pyls/__init__.py index 4a63c08..be18e24 100644 --- a/pyls/__init__.py +++ b/pyls/__init__.py @@ -4,7 +4,7 @@ '__author__', '__description__', '__email__', '__license__', '__maintainer__', '__packagename__', '__url__', '__version__', 'behavioral_pls', 'meancentered_pls', 'import_matlab_result', - 'PLSInputs', 'PLSResults', 'save_results', 'load_results' + 'examples', 'PLSInputs', 'PLSResults', 'save_results', 'load_results' ] from ._version import get_versions @@ -21,6 +21,7 @@ __url__, ) +from . 
import examples
 from .io import load_results, save_results
 from .matlab import import_matlab_result
 from .structures import PLSInputs, PLSResults

From 1026f15008d851d57ba7c59b5097954a72ee5a70 Mon Sep 17 00:00:00 2001
From: Ross Markello
Date: Wed, 25 Sep 2019 16:34:07 -0400
Subject: [PATCH 31/32] [DOC] Minor text on docs landing page

---
 docs/index.rst | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/docs/index.rst b/docs/index.rst
index 9c59070..c43b5f0 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -2,6 +2,9 @@
 pyls: Partial Least Squares in Python
 =====================================
 
+This package provides a Python interface for performing partial least squares
+(PLS) analyses.
+
 .. image:: https://travis-ci.org/rmarkello/pyls.svg?branch=master
    :target: https://travis-ci.org/rmarkello/pyls
 .. image:: https://circleci.com/gh/rmarkello/pyls.svg?style=shield
@@ -13,9 +16,60 @@ pyls: Partial Least Squares in Python
 .. image:: http://img.shields.io/badge/License-GPL%202.0-blue.svg
    :target: https://opensource.org/licenses/GPL-2.0
 
+.. readme_installation:
+
+Installation requirements
+-------------------------
+Currently, ``pyls`` works with Python 3.5+ and requires a few dependencies:
+
+ - h5py
+ - numpy
+ - scikit-learn
+ - scipy, and
+ - tqdm
+
+For detailed information on how to install ``pyls``, including these
+dependencies, refer to our `installation instructions`_.
+
+.. readme_development:
+
+Development and getting involved
+--------------------------------
+
+If you've found a bug, are experiencing a problem, or have a question about
+using the package, please head on over to our `GitHub issues`_ and make a new
+issue with some information about it! Someone will try and get back to you
+as quickly as possible, though please note that the primary developer for
+``pyls`` (@rmarkello) is a graduate student so responses may take some time!
+ +If you're interested in getting involved in the project: welcome |sparkles|! +We're thrilled to welcome new contributors. You should start by reading our +`code of conduct`_; all activity on ``pyls`` should adhere to the CoC. After +that, take a look at our `contributing guidelines`_ so you're familiar with the +processes we (generally) try to follow when making changes to the repository! +Once you're ready to jump in head on over to our issues to see if there's +anything you might like to work on. + +.. readme_licensing: + +License Information +------------------- + +This codebase is licensed under the GNU General Public License, version 2. The +full license can be found in the `LICENSE`_ file in the ``pyls`` distribution. + +All trademarks referenced herein are property of their respective holders. + .. toctree:: :maxdepth: 2 installation usage api + +.. |sparkles| replace:: ✨ +.. _code of conduct: https://github.com/rmarkello/pyls/blob/master/CODE_OF_CONDUCT.md +.. _contributing guidelines: https://github.com/rmarkello/pyls/blob/master/CONTRIBUTING.md +.. _GitHub issues: https://github.com/rmarkello/pyls/issues +.. _installation instructions: https://pyls.readthedocs.io/en/latest/installation.html +.. 
_LICENSE: https://github.com/rmarkello/pyls/blob/master/LICENSE From 98aeb962ca7db76517ffc773011d8cca9fd79394 Mon Sep 17 00:00:00 2001 From: Ross Markello Date: Wed, 25 Sep 2019 16:56:40 -0400 Subject: [PATCH 32/32] [DOC] More docs updates [skip ci] --- docs/index.rst | 40 ++++++++++++++++++++++++++++++++-------- docs/installation.rst | 20 -------------------- docs/usage.rst | 11 +++++------ 3 files changed, 37 insertions(+), 34 deletions(-) delete mode 100644 docs/installation.rst diff --git a/docs/index.rst b/docs/index.rst index c43b5f0..8ffeef4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,4 +1,3 @@ -===================================== pyls: Partial Least Squares in Python ===================================== @@ -16,10 +15,11 @@ This package provides a Python interface for performing partial least squares .. image:: http://img.shields.io/badge/License-GPL%202.0-blue.svg :target: https://opensource.org/licenses/GPL-2.0 -.. readme_installation: +.. _readme_installation: Installation requirements ------------------------- + Currently, ``pyls`` works with Python 3.5+ and requires a few dependencies: - h5py @@ -28,10 +28,36 @@ Currently, ``pyls`` works with Python 3.5+ and requires a few dependencies: - scipy, and - tqdm -For detailed information on how to install ``pyls``, including these -dependencies, refer to our `installation instructions`_. +Assuming you have the correct version of Python installed, you can install +``pyls`` by opening a terminal and running the following: + +.. code-block:: bash + + git clone https://github.com/rmarkello/pyls.git + cd pyls + python setup.py install + +All relevant dependencies will be installed alongside the ``pyls`` module. + +.. _readme_quickstart: + +Quickstart +---------- + +There are a number of ways to use ``pyls``, depending on the type of analysis +you would like to perform. 
Assuming you have two matrices ``X`` and ``Y`` +representing different observations from a set of samples (i.e., subjects, +neurons, brain regions), you can run a simple analysis with: + +.. code-block:: python + + >>> import pyls + >>> results = pyls.behavioral_pls(X, Y) + +For detailed information on the different methods available and how to +interpret the results object, please refer to our :ref:`user guide `. -.. readme_development: +.. _readme_development: Development and getting involved -------------------------------- @@ -50,7 +76,7 @@ processes we (generally) try to follow when making changes to the repository! Once you're ready to jump in head on over to our issues to see if there's anything you might like to work on. -.. readme_licensing: +.. _readme_licensing: License Information ------------------- @@ -63,7 +89,6 @@ All trademarks referenced herein are property of their respective holders. .. toctree:: :maxdepth: 2 - installation usage api @@ -71,5 +96,4 @@ All trademarks referenced herein are property of their respective holders. .. _code of conduct: https://github.com/rmarkello/pyls/blob/master/CODE_OF_CONDUCT.md .. _contributing guidelines: https://github.com/rmarkello/pyls/blob/master/CONTRIBUTING.md .. _GitHub issues: https://github.com/rmarkello/pyls/issues -.. _installation instructions: https://pyls.readthedocs.io/en/latest/installation.html .. _LICENSE: https://github.com/rmarkello/pyls/blob/master/LICENSE diff --git a/docs/installation.rst b/docs/installation.rst deleted file mode 100644 index a4552e1..0000000 --- a/docs/installation.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. _installation_setup: - ----------------------- -Installation and setup ----------------------- - -.. _basic_installation: - -Basic installation --------------------- - -This package requires Python >= 3.5. Assuming you have the correct version of -Python installed, you can install ``pyls`` by opening a terminal and running -the following: - -.. 
code-block:: bash - - git clone https://github.com/rmarkello/pyls.git - cd pyls - python setup.py install diff --git a/docs/usage.rst b/docs/usage.rst index 01bb76d..176b74c 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -10,12 +10,11 @@ with PLS and are interested in a thorough (albeit quite technical) treatment, `Abdi et al., 2013 `_ is a good resource. -This user guide will go through the basic statistical concepts of and detail -the two types of PLS implemented in the current package -(:ref:`usage_behavioral` and :ref:`usage_meancentered`) and demonstrate how to -interpret and use the results of a PLS analysis (:ref:`usage_results`). If you -still have questions after going through that you can refer to the -:ref:`ref_api`. +This user guide will go through the basic statistical concepts of the two types +of PLS implemented in the current package (:ref:`usage_behavioral` and +:ref:`usage_meancentered`) and demonstrate how to interpret and use the results +of a PLS analysis (:ref:`usage_results`). If you still have questions after +going through this guide then you can refer to the :ref:`ref_api`! .. toctree:: :caption: Table of Contents