<!-- index.html -->

<!--
slidedeck: A modification of the Google IO 2012 HTML5 slide template
URL: https://github.com/rmcgibbo/slidedeck

Based on https://github.com/francescolaffi/elastic-google-io-slides, and
ultimately:

Google IO 2012 HTML5 Slide Template
Authors: Eric Bidelman <ebidel@gmail.com>
         Luke Mahe <lukem@google.com>
URL: https://code.google.com/p/io-2012-slides
-->
<!DOCTYPE html>
<html>
<head>
  <title>MSMBuilder3</title>
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="chrome=1">
  <!--<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0">-->
  <!--<meta name="viewport" content="width=device-width, initial-scale=1.0">-->
  <!--This one seems to work all the time, but really small on ipad-->
  <!--<meta name="viewport" content="initial-scale=0.4">-->
  <meta name="apple-mobile-web-app-capable" content="yes">
  
  <link rel="stylesheet" media="all" href="theme/css/default.css">
  <link rel="stylesheet" media="all" href="theme/css/custom.css">
  <link rel="stylesheet" media="only screen and (max-device-width: 480px)" href="theme/css/phone.css">
  <base target="_blank"> <!-- This amazingness opens all links in a new tab. -->
  <!-- RequireJS loader; data-main names the entry module (js/slides) that it loads first. -->
  <script data-main="js/slides" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.14/require.min.js"></script>


  <!-- MathJax support: the x-mathjax-config block is evaluated by MathJax once
       the library script below has loaded. -->
  <script type="text/x-mathjax-config">
  MathJax.Hub.Config({
    extensions: ["tex2jax.js"],
    jax: ["input/TeX", "output/HTML-CSS"],
    showProcessingMessages: false,
    tex2jax: {
      // Delimiters that mark inline and display math in the slide text.
      inlineMath: [ ['$','$'], ["\\(","\\)"] ],
      displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
      // Lets \$ be written for a literal dollar sign.
      processEscapes: true
    },
    TeX: {
      // Provides \definecolor, used by the color block later in this file.
      extensions: ["color.js"]
    },
    "HTML-CSS": { availableFonts: ["TeX"] }
  });
  </script>
  <!-- cdn.mathjax.org was retired in 2017; load a pinned 2.x release from cdnjs instead. -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
  <!-- Flow content is not allowed in <head>; the MathJax color definitions are the first child of <body> below. -->

</head>

<body style="opacity: 0">

<!-- Shared TeX color names (data, unknown, learned) defined once for the slide math.
     The container is not rendered; MathJax still scans its text for the definitions. -->
<div style="display: none">
\[
  \definecolor{data}{RGB}{18,110,213}
  \definecolor{unknown}{RGB}{217,86,16}
  \definecolor{learned}{RGB}{175,114,176}
\]
</div>

<slides class="layout-widescreen">
<slide class="title-slide segue nobackground">
  <hgroup class="auto-fadein">

    <h1> <div style="text-align: center; margin-left: 0em; margin-bottom: -.3em;"> <img height=300 src="http://msmbuilder.org/3.5.0/_static/logo.png"></div></h1>
    <h2></h2>
    <p> Carlos X. Hernández, Matthew P. Harrigan, M. Muneeb Sultan, Brooke E. Husic<br/> Updated: Jun. 23, 2016 (msmbuilder v3.5)</p>
  </hgroup>
</slide>


<slide  >
  
    <hgroup>
      <h2>Old-School Analysis of MD Data</h2>
      <h3></h3>
    </hgroup>
    <article ><ul>
<li>Analysis happens in walled gardens (Gromacs, Amber, VMD)</li>
<li>Exclusively command line interfaces, C and Fortran code</li>
<li>Duplication of statistical algorithms by non-experts (e.g. chemists, biologists)</li>
<li>Possible code maintainability issues?</li>
</ul></article>
 
</slide>

<slide>

    <hgroup>
      <h2>Jarvis Patrick Clustering in Gromacs</h2>
      <h3><a href="https://github.com/gromacs/gromacs/blob/master/src/gromacs/gmxana/gmx_cluster.cpp#L502">real code in gromacs</a></h3>
    </hgroup>
    <!-- The less-than sign in the C++ sample is written as &lt; so it is never read as the start of a tag. -->
    <article><pre class="prettyprint" data-lang="c++">

static void jarvis_patrick(int n1, real **mat, int M, int P,
real rmsdcut, t_clusters *clust) {
t_dist *row;
t_clustid *c;
int **nnb;
int i, j, k, cid, diff, max;
gmx_bool bChange;
real **mcpy = NULL;
if (rmsdcut &lt; 0) {
rmsdcut = 10000;
}
/* First we sort the entries in the RMSD matrix row by row.
* This gives us the nearest neighbor list.
*/
</pre></article>

</slide>

<slide  >
  
    <hgroup>
      <h2>Jarvis Patrick Clustering in Gromacs (Cont.)</h2>
      <h3></h3>
    </hgroup>
    <article ><pre class="prettyprint" data-lang="c++">

// Five more pages of this
// You get the idea

// Also, how do we even use this function?
static void jarvis_patrick(int n1, real **mat, int M, int P,
real rmsdcut, t_clusters *clust);
</pre></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>Enter Data Science</h2>
      <h3></h3>
    </hgroup>
    <article ><ul>
<li>Machine learning is mainstream now!</li>
<li>Thousands of experts are using machine learning approaches</li>
<li>Well-tested, performant, and facile implementations are available</li>
<li>Writing your own is not the way to go!<ul>
<li>E.g., is clustering <em>that</em> special and MD-specific such that
we need our own custom algorithms and implementations?</li>
</ul>
</li>
</ul></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>MSMBuilder3: Design</h2>
      <h3></h3>
    </hgroup>
    <article ><div style="float:right; margin-top:-100px">
<img src="figures/flow-chart.png" height="600">
</div>

<p>Builds on <a href="http://scikit-learn.org/stable/">scikit-learn</a> idioms:</p>
<ul>
<li>Everything is a <code>Model</code>.</li>
<li>Models are <code>fit()</code> on data.</li>
<li>Models learn <code>attributes_</code>.</li>
<li><code>Pipeline()</code> concatenates models.</li>
<li>Use best-practices (cross-validation)</li>
</ul>
<footer class="source">
<a href="http://msmbuilder.org/3.5.0/apipatterns.html">
http://msmbuilder.org/3.5.0/apipatterns.html
</a>
</footer></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>Everything is a <code>Model()</code>!</h2>
      <h3></h3>
    </hgroup>
    <article ><pre class="prettyprint" data-lang="python">

>>> import msmbuilder.cluster
>>> clusterer = msmbuilder.cluster.KMeans(n_clusters=4)

>>> import msmbuilder.decomposition
>>> tica = msmbuilder.decomposition.tICA(n_components=3)

>>> import msmbuilder.msm
>>> msm = msmbuilder.msm.MarkovStateModel()

</pre>

<p>Hyperparameters go in the constructor.</p>
<footer class="source">
Actually, everything is a <code>sklearn.base.BaseEstimator()</code>
</footer></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>Models <code>fit()</code> data!</h2>
      <h3></h3>
    </hgroup>
    <article ><pre class="prettyprint" data-lang="python">

>>> import msmbuilder.cluster

>>> trajectories = [np.random.normal(size=(100, 3))]

>>> clusterer = msmbuilder.cluster.KMeans(n_clusters=4, n_init=10)
>>> clusterer.fit(trajectories)

>>> clusterer.cluster_centers_

array([[-0.22340896,  1.0745301 , -0.40222902],
       [-0.25410827, -0.11611431,  0.95394687],
       [ 1.34302485,  0.14004818,  0.01130485],
       [-0.59773874, -0.82508303, -0.95703567]])


</pre>

<p>Estimated parameters <em>always</em> have trailing underscores!</p></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2><code>fit()</code> acts on lists of sequences</h2>
      <h3></h3>
    </hgroup>
    <article ><pre class="prettyprint" data-lang="python">

>>> import msmbuilder.msm

>>> trajectories = [np.array([0, 0, 0, 1, 1, 1, 0, 0])]

>>> msm = msmbuilder.msm.MarkovStateModel()
>>> msm.fit(trajectories)

>>> msm.transmat_

array([[ 0.75      ,  0.25      ],
       [ 0.33333333,  0.66666667]])

</pre>

<p>This is different from sklearn, which uses 2D arrays.</p></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>Models <code>transform()</code> data!</h2>
      <h3></h3>
    </hgroup>
    <article ><pre class="prettyprint" data-lang="python">

>>> import msmbuilder.cluster

>>> trajectories = [np.random.normal(size=(100, 3))]

>>> clusterer = msmbuilder.cluster.KMeans(n_clusters=8, n_init=10)
>>> clusterer.fit(trajectories)
>>> Y = clusterer.transform(trajectories)

[array([5, 6, 6, 0, 5, 5, 1, 6, 1, 7, 5, 7, 4, 2, 2, 2, 5, 3, 0, 0, 1, 3, 0,
        5, 5, 0, 4, 0, 0, 3, 4, 7, 3, 5, 5, 5, 6, 1, 1, 0, 0, 7, 4, 4, 2, 6,
        1, 4, 2, 0, 2, 4, 4, 5, 2, 6, 3, 2, 0, 6, 3, 0, 7, 7, 7, 0, 0, 0, 3,
        3, 2, 7, 6, 7, 2, 5, 1, 0, 3, 6, 3, 2, 0, 5, 0, 3, 4, 2, 5, 4, 1, 5,
        5, 4, 3, 3, 7, 2, 1, 4], dtype=int32)]

</pre>

<p>Moving the data-items from one "space" / representation into another.</p></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2><code>Pipeline()</code> concatenates models!</h2>
      <h3></h3>
    </hgroup>
    <article ><pre class="prettyprint" data-lang="python">

>>> import msmbuilder.cluster, msmbuilder.msm
>>> from sklearn.pipeline import Pipeline

>>> trajectories = [np.random.normal(size=(100, 3))]

>>> clusterer = msmbuilder.cluster.KMeans(n_clusters=2, n_init=10)
>>> msm = msmbuilder.msm.MarkovStateModel()
>>> pipeline = Pipeline([("clusterer", clusterer), ("msm", msm)])

>>> pipeline.fit(trajectories)
>>> msm.transmat_

array([[ 0.53703704,  0.46296296],
       [ 0.53333333,  0.46666667]])

</pre>

<p>Data "flows" through transformations in the pipeline.</p></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>Loading Trajectories</h2>
      <h3></h3>
    </hgroup>
    <article ><p>You can use MDTraj to load your trajectory files</p>
<pre class="prettyprint" data-lang="python">

>>> import glob
>>> import mdtraj as md

>>> filenames = glob.glob("./Trajectories/ala_*.h5")
>>> trajectories = [md.load(filename) for filename in filenames]

</pre>

<footer class="source">
Note: for big datasets, you can get fancy with <code><a href="http://mdtraj.org/latest/api/generated/mdtraj.iterload.html">md.iterload</a></code>.
</footer></article>
 
</slide>

<slide>

    <hgroup>
      <h2>Featurization</h2>
      <h3></h3>
    </hgroup>
    <article><p>Featurizers wrap MDTraj functions via the <code>transform()</code> function</p>
<!-- Figure floated to the right of the code sample; attribute values are quoted so the path can never absorb a trailing slash. -->
<div style="float:right;">
<img src="figures/rama.png" height="225" alt="Ramachandran (phi/psi) plot">
</div>

<pre class="prettyprint" style="width:75%" data-lang="python">

>>> from msmbuilder.featurizer import DihedralFeaturizer
>>> from matplotlib.pyplot import hexbin, plot

>>> featurizer = DihedralFeaturizer(
...        ["phi", "psi"], sincos=False)
>>> X = featurizer.transform(trajectories)
>>> phi, psi = np.rad2deg(np.concatenate(X).T)

>>> hexbin(phi, psi)
</pre>

<footer class="source">
<a href="http://msmbuilder.org/3.5.0/featurization.html">
http://msmbuilder.org/3.5.0/featurization.html
</a>
</footer></article>

</slide>

<slide  >
  
    <hgroup>
      <h2>Featurization (Cont.)</h2>
      <h3></h3>
    </hgroup>
    <article ><p>You can even combine featurizers with <code>FeatureSelector</code></p>
<pre class="prettyprint" data-lang="python">

>>> from msmbuilder.featurizer import DihedralFeaturizer, ContactFeaturizer
>>> from msmbuilder.feature_selection import FeatureSelector

>>> dihedrals = DihedralFeaturizer(
...         ["phi", "psi"], sincos=True)
>>> contacts = ContactFeaturizer(scheme='ca')
>>> featurizer = FeatureSelector([('dihedrals', dihedrals),
...                               ('contacts', contacts)])
>>> X = featurizer.transform(trajectories)

</pre>

<footer class="source">
<a href="http://msmbuilder.org/3.5.0/featurization.html">
http://msmbuilder.org/3.5.0/featurization.html
</a>
</footer></article>
 
</slide>

<slide>

    <hgroup>
      <h2>Preprocessing</h2>
      <h3></h3>
    </hgroup>
    <article><p>Preprocessors normalize/whiten your data</p>
<!-- The sample fits the scaler before transforming; a freshly constructed scaler has no learned statistics to apply. -->
<pre class="prettyprint" data-lang="python">

>>> from msmbuilder.preprocessing import RobustScaler
>>> scaler = RobustScaler()

>>> Y = scaler.fit_transform(X)
</pre>

<p>This is essential when combining different featurizers!</p>
<p>Also check out <code>MinMaxScaler</code> and <code>StandardScaler</code></p></article>

</slide>

<slide  >
  
    <hgroup>
      <h2>Decomposition</h2>
      <h3></h3>
    </hgroup>
    <article ><p>Reduce the dimensionality of your data</p>
<div style="float:right;">
<img width=275 src="http://msmbuilder.org/3.5.0/_images/tica_vs_pca.png"/>
<figcaption style="font-size: 50%; text-align: center;">
tICA finds the slowest degrees<br>of freedom in time-series data
</figcaption>
</div>

<pre class="prettyprint" style="width:75%"  data-lang="python">

>>> from msmbuilder.decomposition import tICA

>>> tica = tICA(n_components=2, lagtime=5)
>>> Y = tica.fit_transform(X)
</pre>

<p>Also check out <code>PCA</code> and <code>SparseTICA</code></p>
<footer class="source">
<a href="http://msmbuilder.org/3.5.0/decomposition.html">
http://msmbuilder.org/3.5.0/decomposition.html
</a>
</footer></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>Markov State Models</h2>
      <h3></h3>
    </hgroup>
    <article ><p>We offer two main flavors of MSM:</p>
<ul>
<li><code>MarkovStateModel</code> - Fits a first-order Markov model to a discrete-time integer labeled timeseries.</li>
<li><code>ContinuousTimeMSM</code> - Estimates a continuous rate matrix from discrete-time integer labeled timeseries.</li>
</ul>
<p>Each has a Bayesian version, which estimates the error associated with the model.</p>
<footer class="source">
MarkovStateModel: <a href="http://msmbuilder.org/3.5.0/msm.html">
http://msmbuilder.org/3.5.0/msm.html
</a>
<br>
ContinuousTimeMSM: <a href="http://msmbuilder.org/3.5.0/ratematrix.html">
http://msmbuilder.org/3.5.0/ratematrix.html
</a>
</footer></article>
 
</slide>

<slide>

    <hgroup>
      <h2>Hidden Markov Models</h2>
      <h3></h3>
    </hgroup>
    <!-- Illustration floated right; the caption states the analogy the image is meant to convey. -->
    <article><div style="float: right;">
<img src="http://msmbuilder.org/3.5.0/_images/kde-vs-histogram.png" height="225" alt="Kernel density estimate compared with a histogram">
<figcaption style="font-size: 50%; text-align: center;">KDE is to histogram as HMM is to MSM</figcaption>
</div>

<p>We also offer two types of HMMs:</p>
<ul>
<li><code>GaussianHMM</code> - Reversible Gaussian Hidden Markov Model with L1-Fusion Regularization</li>
<li><code>VonMisesHMM</code> - Hidden Markov Model with von Mises Emissions</li>
</ul>
<p>HMMs are great for macrostate modeling!</p>
<footer class="source">
<a href="http://msmbuilder.org/3.5.0/hmm.html">
http://msmbuilder.org/3.5.0/hmm.html
</a>
</footer></article>

</slide>

<slide  >
  
    <hgroup>
      <h2>Cross-Validation</h2>
      <h3></h3>
    </hgroup>
    <article ><pre class="prettyprint" data-lang="python">

from sklearn.cross_validation import ShuffleSplit

cv = ShuffleSplit(len(trajectories), n_iter=5, test_size=0.5)

for fold, (train_index, test_index) in enumerate(cv):
    train_data = [trajectories[i] for i in train_index]
    test_data = [trajectories[i] for i in test_index]
    model.fit(train_data)
    model.score(test_data)
</pre>

<p>Also check out scikit-learn's <code>KFold</code>, <code>GridSearchCV</code> and <code>RandomizedSearchCV</code>.</p>
<footer class="source">
If you'd like to see how CV can be done with MSMs, see
<a href="http://msmbuilder.org/3.5.0/gmrq.html">
http://msmbuilder.org/3.5.0/gmrq.html
</a>
</footer></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>Command-line Tools</h2>
      <h3></h3>
    </hgroup>
    <article ><p>We also offer an easy-to-use CLI for the API-averse</p>
<pre class="prettyprint" data-lang="shell">

$ msmb DihedralFeaturizer --top my_protein.pdb --trjs "*.xtc" \
    --transformed diheds --out featurizer.pkl

$ msmb tICA -i diheds/ --out tica_model.pkl \
    --transformed tica_trajs.h5 --n_components 4

$ msmb MiniBatchKMeans -i tica_trajs.h5 \
    --transformed labeled_trajs.h5 --n_clusters 100

$ msmb MarkovStateModel -i labeled_trajs.h5 \
   --out msm.pkl --lag_time 1

</pre>

<footer class="source">
<a href="http://msmbuilder.org/3.5.0/examples/Intro/Intro.cmd.html">
http://msmbuilder.org/3.5.0/examples/Intro/Intro.cmd.html
</a>
</footer></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>Related Projects</h2>
      <h3></h3>
    </hgroup>
    <article ><p>We also maintain:</p>
<ul>
<li><a href="http://github.com/msmbuilder/osprey"><b>Osprey</b></a> - machine learning hyperparameter optimization</li>
<li><a href="http://github.com/msmbuilder/mdentropy"><b>MDEntropy</b></a> - entropy calculations for MD data</li>
</ul></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>Osprey</h2>
      <h3></h3>
    </hgroup>
    <article ><p>Fully-automated, large-scale hyperparameter optimization</p>
<div style="text-align: center">
<img height=225 style="padding-bottom: 1em;" src="http://msmbuilder.org/osprey/development/_static/osprey.svg"/>
<figcaption>http://github.com/msmbuilder/osprey</figcaption>
</div>

<footer class="source">
Not just for MSMs!
</footer></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>Osprey: Estimator</h2>
      <h3></h3>
    </hgroup>
    <article ><p>Define your model</p>
<pre class="prettyprint" data-lang="yaml">

estimator:
  # The model/estimator to be fit.

  eval_scope: msmbuilder
  eval: |
    Pipeline([
            ('featurizer', DihedralFeaturizer(types=['phi', 'psi'])),
            ('scaler', RobustScaler()),
            ('tica', tICA(n_components=2)),
            ('cluster', MiniBatchKMeans()),
            ('msm', MarkovStateModel(n_timescales=5, verbose=False)),
    ])

</pre></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>Osprey: Search Strategy</h2>
      <h3></h3>
    </hgroup>
    <article ><p>Choose how to search over your hyperparameter space</p>
<pre class="prettyprint" data-lang="yaml">

strategy:

    name: gp  # or random, grid, hyperopt_tpe
    params:
      seeds: 50
</pre></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>Osprey: Search Space</h2>
      <h3></h3>
    </hgroup>
    <article ><p>Select which hyperparameters to optimize</p>
<pre class="prettyprint" data-lang="yaml">

search_space:
  featurizer__types:
    choices:
      - ['phi', 'psi']
      - ['phi', 'psi', 'chi1']
    type: enum

  cluster__n_clusters:
    min: 2
    max: 1000
    type: int
    warp: log # search over log-space
</pre></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>Osprey: Cross-Validation</h2>
      <h3></h3>
    </hgroup>
    <article ><p>Pick your favorite cross-validator</p>
<pre class="prettyprint" data-lang="yaml">

cv:
  name: shufflesplit # Or kfold, loo, stratifiedshufflesplit, stratifiedkfold, fixed
  params:
    n_iter: 5
    test_size: 0.5
</pre></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>Osprey: Dataset Loader</h2>
      <h3></h3>
    </hgroup>
    <article ><p>Load your data, no matter what file type</p>
<pre class="prettyprint" data-lang="yaml">

dataset_loader:
  # specification of the dataset on which to train the models.
  name: mdtraj # Or msmbuilder, numpy, filename, joblib, sklearn_dataset, hdf5
  params:
    trajectories: ./fs_peptide/trajectory-*.xtc
    topology: ./fs_peptide/fs-peptide.pdb
    stride: 100
</pre></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>Osprey: Trials</h2>
      <h3></h3>
    </hgroup>
    <article ><p>Save to a single SQL-like database, run on as many clusters as you'd like*</p>
<pre class="prettyprint" data-lang="yaml">

trials:
  # path to a database in which the results of each hyperparameter fit
  # are stored. Any SQL database is supported, but we recommend using
  # SQLite, which is simple and stores the results in a file on disk.
  uri: sqlite:///osprey-trials.db
</pre>

<footer class="source">
*you'll still need to copy your data to each cluster, however
</footer></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>Osprey: Running a Job</h2>
      <h3></h3>
    </hgroup>
    <article ><p>Simple command-line interface, easy to run on any cluster</p>
<pre class="prettyprint" data-lang="SHELL">

$ osprey worker -n 100 config.yaml
...
----------------------------------------------------------------------
Beginning iteration                                           10 / 100
----------------------------------------------------------------------
Loading trials database: sqlite:///trials.db...
History contains: 9 trials
Choosing next hyperparameters with gp...
  {'tica__n_components': 2, 'tica__lag_time': 180, 'cluster__n_clusters': 36}
(gp took 0.000 s)
...
Success! Model score = 4.214510
(best score so far   = 4.593165)
</pre></article>
 
</slide>

<slide  >
  
    <hgroup>
      <h2>Osprey: Real-Time Analytics</h2>
      <h3></h3>
    </hgroup>
    <article ><p>Osprey also makes it easy to create interactive dashboards</p>
<pre class="prettyprint" data-lang="SHELL">

$ osprey plot config.yaml
</pre>

<div style="text-align: center">
<img height=350 style="padding-top: .5em;" src="figures/osprey-dash.png"/>
<img height=350 style="padding-top: .5em;" src="figures/osprey-dash2.png"/>
</div></article>
 
</slide>


<slide class="thank-you-slide segue nobackground">
  <!-- <aside class="gdbar right"><img src="images/google_developers_icon_128.png"></aside> -->
  <article class="flexbox vleft auto-fadein">
    <h2></h2>
    <p></p>
  </article>
  <!-- Empty contact placeholder marked with data-config-contact; presumably filled in by the deck script (confirm in js/slides). -->
  <p data-config-contact class="auto-fadein"></p>
</slide>

<slide class="backdrop"></slide>

</slides>

<script>
// Classic Google Analytics (ga.js) bootstrap.
// Commands are queued on the global _gaq array; an existing queue is reused.
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-XXXXXXXX-1']);
_gaq.push(['_trackPageview']);

(function() {
  // Build the ga.js URL from the host matching the page's protocol.
  var host = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www');

  var tracker = document.createElement('script');
  tracker.type = 'text/javascript';
  tracker.async = true;
  tracker.src = host + '.google-analytics.com/ga.js';

  // Insert the tracker ahead of the first <script> element in the document.
  var firstScript = document.getElementsByTagName('script')[0];
  firstScript.parentNode.insertBefore(tracker, firstScript);
})();
</script>

<!--[if IE]>
  <script src="http://ajax.googleapis.com/ajax/libs/chrome-frame/1/CFInstall.min.js"></script>
  <script>CFInstall.check({mode: 'overlay'});</script>
<![endif]-->
</body>
</html>