From b6f20ab9d1f0e6e5c4335c83f4eb32c1c7c6762c Mon Sep 17 00:00:00 2001
From: Lorenz
Date: Fri, 25 Oct 2019 16:25:00 +0200
Subject: [PATCH 01/14] Approximate bimodal Gaussian with Gaussian distribution

---
 prototype.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 prototype.py

diff --git a/prototype.py b/prototype.py
new file mode 100644
index 0000000..70ae0ad
--- /dev/null
+++ b/prototype.py
@@ -0,0 +1,75 @@
+import math
+import os
+import torch
+import torch.distributions.constraints as constraints
+import pyro
+from pyro.optim import Adam
+from pyro.infer import SVI, Trace_ELBO, config_enumerate
+import pyro.distributions as dist
+from pyro.infer.autoguide import AutoDelta
+from pyro import poutine
+
+# this is for running the notebook in our testing framework
+smoke_test = ('CI' in os.environ)
+n_steps = 2 if smoke_test else 2000
+pyro.set_rng_seed(2)
+
+# enable validation (e.g. validate parameters of distributions)
+assert pyro.__version__.startswith('0.4.1')
+pyro.enable_validation(True)
+
+# clear the param store in case we're in a REPL
+pyro.clear_param_store()
+
+data = torch.tensor([0., 1., 10., 11., 12.])
+
+K = 2 # Fixed number of components.
+
+@config_enumerate
+def model(data):
+    # Global variables.
+    weights = torch.tensor([0.4, 0.6])
+    scale = pyro.sample('scale', dist.LogNormal(0., 2.))
+    with pyro.plate('components', K):
+        locs = pyro.sample('locs', dist.Normal(0., 10.))
+
+    print("actual locs= {}".format(locs.data.numpy()))
+    print("actual scale= {}".format(scale.data.numpy()))
+
+    with pyro.plate('data', len(data)):
+        # Local variables.
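+        # Each data point draws its own mixture assignment; conditioned on
+        # it, the observation comes from the corresponding Normal component.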
+        assignment = pyro.sample('assignment', dist.Categorical(weights))
+        pyro.sample('obs', dist.Normal(locs[assignment], scale), obs=data)
+
+# global_guide = AutoDelta(poutine.block(model, expose=['locs', 'scale']))
+
+
+def guide(data):
+    scale_q = pyro.param('scale_q', torch.tensor(1.),
+    constraint=constraints.positive)
+
+    locs_q = pyro.param('locs_q', torch.tensor(5.),
+    constraint=constraints.positive)
+
+    pyro.sample('obs', dist.Normal(locs_q, scale_q))
+
+
+
+# setup the optimizer
+adam_params = {"lr": 0.0005, "betas": (0.90, 0.999)}
+optimizer = Adam(adam_params)
+
+# setup the inference algorithm
+svi = SVI(model, guide, optimizer, loss=Trace_ELBO())
+
+# do gradient steps
+for step in range(n_steps):
+    svi.step(data)
+    if step % 100 == 0:
+        print('.', end='')
+
+scale = pyro.param("scale_q").item()
+locs = pyro.param("locs_q").item()
+print('locs = {}'.format(locs))
+print('scale = {}'.format(scale))

From 5ead11ce7fdd6b151c88d7dcabb69782e3e6cfff Mon Sep 17 00:00:00 2001
From: Lorenz
Date: Fri, 25 Oct 2019 18:17:32 +0200
Subject: [PATCH 02/14] Add relbo draft

---
 prototype.py | 66 ++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 61 insertions(+), 5 deletions(-)

diff --git a/prototype.py b/prototype.py
index 70ae0ad..874e4f4 100644
--- a/prototype.py
+++ b/prototype.py
@@ -8,6 +8,7 @@ import pyro.distributions as dist
 from pyro.infer.autoguide import AutoDelta
 from pyro import poutine
+from pyro.poutine import trace, replay, block
 
 # this is for running the notebook in our testing framework
 smoke_test = ('CI' in os.environ)
@@ -33,9 +34,6 @@ def model(data):
     with pyro.plate('components', K):
         locs = pyro.sample('locs', dist.Normal(0., 10.))
 
-    print("actual locs= {}".format(locs.data.numpy()))
-    print("actual scale= {}".format(scale.data.numpy()))
-
     with pyro.plate('data', len(data)):
         # Local variables.
         assignment = pyro.sample('assignment', dist.Categorical(weights))
@@ -55,19 +53,77 @@ def guide(data):
     pyro.sample('obs', dist.Normal(locs_q, scale_q))
 
 
+def approximation(data, components, weights):
+    assignment = pyro.sample('assignment', dist.Categorical(weights))
+    sample = components[assignment](data)
+
+def dummy_approximation(data):
+    scale_a = pyro.param('scale_a', torch.tensor(2.),
+    constraint=constraints.positive)
+
+    locs_a = pyro.param('locs_a', torch.tensor(10.),
+    constraint=constraints.positive)
+    sample = pyro.sample('obs', dist.Normal(locs_a, scale_a))
+
+    # scale_a = pyro.param("scale_a").item()
+    # locs_a = pyro.param("locs_a").item()
+
+    # print("Loc and scale of approx, before svi: {0} {1}".format(locs_a, scale_a))
+
+
+def relbo(model, guide, *args, **kwargs):
+
+    approximation = kwargs.pop('approximation', None)
+    # Run the guide with the arguments passed to SVI.step() and trace the execution,
+    # i.e. record all the calls to Pyro primitives like sample() and param().
+    guide_trace = trace(guide).get_trace(*args, **kwargs)
+    # Now run the model with the same arguments and trace the execution. Because
+    # model is being run with replay, whenever we encounter a sample site in the
+    # model, instead of sampling from the corresponding distribution in the model,
+    # we instead reuse the corresponding sample from the guide. In probabilistic
+    # terms, this means our loss is constructed as an expectation w.r.t. the joint
+    # distribution defined by the guide.
+    model_trace = trace(replay(model, guide_trace)).get_trace(*args, **kwargs)
+    approximation_trace = trace(replay(block(approximation, expose=["obs"]), guide_trace)).get_trace(*args, **kwargs)
+    # We will accumulate the various terms of the ELBO in `elbo`.
+    elbo = 0.
+    # Loop over all the sample sites in the model and add the corresponding
+    # log p(z) term to the ELBO. Note that this will also include any observed
+    # data, i.e. sample sites with the keyword `obs=...`.
+    elbo = elbo + model_trace.log_prob_sum()
+    # Loop over all the sample sites in the guide and add the corresponding
+    # -log q(z) term to the ELBO.
+    elbo = elbo - guide_trace.log_prob_sum()
+    elbo = elbo - approximation_trace.log_prob_sum()
+    print(approximation_trace.log_prob_sum())
+    # Return (-elbo) since by convention we do gradient descent on a loss and
+    # the ELBO is a lower bound that needs to be maximized.
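+    # In symbols, with s the new component being fitted, q_t the current
+    # mixture approximation, and all expectations taken under s (the guide),
+    # this computes the residual ELBO
+    #     RELBO(s) = E_s[log p(x, z)] - E_s[log s(z)] - E_s[log q_t(z)],
+    # i.e. the usual ELBO plus a cross-entropy penalty that pushes the new
+    # component away from mass already covered by q_t.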
+    return -elbo
+
+
 # setup the optimizer
 adam_params = {"lr": 0.0005, "betas": (0.90, 0.999)}
 optimizer = Adam(adam_params)
 
 # setup the inference algorithm
-svi = SVI(model, guide, optimizer, loss=Trace_ELBO())
+svi = SVI(model, guide, optimizer, loss=relbo)
 
 # do gradient steps
+
+wrapped_approximation = approximation(data, components=[guide, guide], weights=torch.tensor([0.4, 0.6]))
+
 for step in range(n_steps):
-    svi.step(data)
+    svi.step(data, approximation=dummy_approximation)
     if step % 100 == 0:
         print('.', end='')
 
+print(pyro.param("locs_q"))
+scale = pyro.param("scale_q").item()
+locs = pyro.param("locs_q").item()
+lifted_component = poutine.lift(guide)
+components = [lifted_component, lifted_component]
+
+
 scale = pyro.param("scale_q").item()
 locs = pyro.param("locs_q").item()
 print('locs = {}'.format(locs))

From eaa43677929b2d42536d918d25478407c2f8d099 Mon Sep 17 00:00:00 2001
From: Lorenz
Date: Fri, 25 Oct 2019 18:41:56 +0200
Subject: [PATCH 03/14] Add first experiments with approximation

---
 prototype.py | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/prototype.py b/prototype.py
index 874e4f4..303492e 100644
--- a/prototype.py
+++ b/prototype.py
@@ -52,10 +52,31 @@ def guide(data):
     pyro.sample('obs', dist.Normal(locs_q, scale_q))
 
 
+def component_1(data):
+    scale_q = pyro.param('scale_q', torch.tensor(1.),
+    constraint=constraints.positive)
+
+    locs_q = pyro.param('locs_q', torch.tensor(5.),
+    constraint=constraints.positive)
+
+    pyro.sample('obs', dist.Normal(locs_q, scale_q))
+
+def component_2(data):
+    scale_q = pyro.param('scale_q', torch.tensor(2.),
+    constraint=constraints.positive)
+
+    locs_q = pyro.param('locs_q', torch.tensor(10.),
+    constraint=constraints.positive)
+
+    pyro.sample('obs', dist.Normal(locs_q, scale_q))
+
 
 
-def approximation(data, components, weights):
+def approximation(data):
+    weights = torch.tensor([0.4, 0.6])
     assignment = pyro.sample('assignment', dist.Categorical(weights))
-    sample = components[assignment](data)
+    components = [component_1, component_2]
+    components[assignment](data)
 
 def dummy_approximation(data):
     scale_a = pyro.param('scale_a', torch.tensor(2.),
@@ -110,10 +131,9 @@ def relbo(model, guide, *args, **kwargs):
 
 # do gradient steps
 
-wrapped_approximation = approximation(data, components=[guide, guide], weights=torch.tensor([0.4, 0.6]))
 
 for step in range(n_steps):
-    svi.step(data, approximation=dummy_approximation)
+    svi.step(data, approximation=approximation)
     if step % 100 == 0:
         print('.', end='')

From cdc0e8767251bf4ad34e31cfb9fcc09983e153ea Mon Sep 17 00:00:00 2001
From: Lorenz
Date: Sat, 26 Oct 2019 17:47:19 +0200
Subject: [PATCH 04/14] Add first working boosting BBVI prototype

---
 prototype.py | 132 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 80 insertions(+), 52 deletions(-)

diff --git a/prototype.py b/prototype.py
index 303492e..5b9cf12 100644
--- a/prototype.py
+++ b/prototype.py
@@ -9,10 +9,14 @@ import pyro.distributions as dist
 from pyro.infer.autoguide import AutoDelta
 from pyro import poutine
 from pyro.poutine import trace, replay, block
+from functools import partial
+import numpy as np
+import scipy.stats
+from matplotlib import pyplot
 
 # this is for running the notebook in our testing framework
 smoke_test = ('CI' in os.environ)
-n_steps = 2 if smoke_test else 2000
+n_steps = 2 if smoke_test else 1500
 pyro.set_rng_seed(2)
 
 # enable validation (e.g. validate parameters of distributions)
@@ -22,7 +26,7 @@
 # clear the param store in case we're in a REPL
 pyro.clear_param_store()
 
-data = torch.tensor([0., 1., 10., 11., 12.])
+data = torch.tensor([0., 1., 2., 0, 0.5, 1.5, 10., 11., 12., 10.6, 11.8, 12.2])
 
 K = 2 # Fixed number of components.
 
@@ -42,47 +46,26 @@ def model(data):
 
 # global_guide = AutoDelta(poutine.block(model, expose=['locs', 'scale']))
 
 
-def guide(data):
-    scale_q = pyro.param('scale_q', torch.tensor(1.),
+def guide(data, index):
+    scale_q = pyro.param('scale_{}'.format(index), torch.tensor(1.),
     constraint=constraints.positive)
 
-    locs_q = pyro.param('locs_q', torch.tensor(5.),
-    constraint=constraints.positive)
-
-    pyro.sample('obs', dist.Normal(locs_q, scale_q))
-
-
-def component_1(data):
-    scale_q = pyro.param('scale_q', torch.tensor(1.),
-    constraint=constraints.positive)
-
-    locs_q = pyro.param('locs_q', torch.tensor(5.),
-    constraint=constraints.positive)
-
-    pyro.sample('obs', dist.Normal(locs_q, scale_q))
-
-def component_2(data):
-    scale_q = pyro.param('scale_q', torch.tensor(2.),
-    constraint=constraints.positive)
-
-    locs_q = pyro.param('locs_q', torch.tensor(10.),
+    locs_q = pyro.param('locs_{}'.format(index), torch.tensor(5.),
     constraint=constraints.positive)
 
     pyro.sample('obs', dist.Normal(locs_q, scale_q))
 
 
-def approximation(data):
-    weights = torch.tensor([0.4, 0.6])
+def approximation(data, components, weights):
     assignment = pyro.sample('assignment', dist.Categorical(weights))
-    components = [component_1, component_2]
     components[assignment](data)
 
 def dummy_approximation(data):
-    scale_a = pyro.param('scale_a', torch.tensor(2.),
+    scale_a = pyro.param('scale_a', torch.tensor(10000.),
     constraint=constraints.positive)
 
-    locs_a = pyro.param('locs_a', torch.tensor(10.),
+    locs_a = pyro.param('locs_a', torch.tensor(0.),
     constraint=constraints.positive)
     sample = pyro.sample('obs', dist.Normal(locs_a, scale_a))
 
@@ -116,7 +99,7 @@ def relbo(model, guide, *args, **kwargs):
     # -log q(z) term to the ELBO.
     elbo = elbo - guide_trace.log_prob_sum()
     elbo = elbo - approximation_trace.log_prob_sum()
-    print(approximation_trace.log_prob_sum())
+    # print(approximation_trace.log_prob_sum())
     # Return (-elbo) since by convention we do gradient descent on a loss and
     # the ELBO is a lower bound that needs to be maximized.
     return -elbo
 
@@ -126,26 +109,71 @@
 adam_params = {"lr": 0.0005, "betas": (0.90, 0.999)}
 optimizer = Adam(adam_params)
 
-# setup the inference algorithm
-svi = SVI(model, guide, optimizer, loss=relbo)
-
-# do gradient steps
-
-for step in range(n_steps):
-    svi.step(data, approximation=approximation)
-    if step % 100 == 0:
-        print('.', end='')
-
-print(pyro.param("locs_q"))
-scale = pyro.param("scale_q").item()
-locs = pyro.param("locs_q").item()
-lifted_component = poutine.lift(guide)
-components = [lifted_component, lifted_component]
-
-
-scale = pyro.param("scale_q").item()
-locs = pyro.param("locs_q").item()
-print('locs = {}'.format(locs))
-print('scale = {}'.format(scale))
+# wrapped_guide = partial(guide, index=0)
+# svi = SVI(model, wrapped_guide, optimizer, loss=Trace_ELBO())
+# # do gradient steps
+# losses = []
+# for step in range(n_steps):
+#     loss = svi.step(data)
+#     losses.append(loss)
+#     if step % 100 == 0:
+#         print('.', end='')
+
+n_iterations = 2
+
+initial_approximation = dummy_approximation
+components = [initial_approximation]
+weights = torch.tensor([1.])
+wrapped_approximation = partial(approximation, components=components, weights=weights)
+
+locs = [0]
+scales = [0]
+
+for t in range(1, n_iterations + 1):
+    # setup the inference algorithm
+    wrapped_guide = partial(guide, index=t)
+    svi = SVI(model, wrapped_guide, optimizer, loss=relbo)
+    # do gradient steps
+    losses = []
+    for step in range(n_steps):
+        loss = svi.step(data, approximation=wrapped_approximation)
+        losses.append(loss)
+        if step % 100 == 0:
+            print('.', end='')
+
+    #pyplot.scatter(range(len(losses)), losses)
+    #pyplot.show()
+    components.append(wrapped_guide)
+    new_weight = 2 / (t + 1)
+
+    weights = weights * (1-new_weight)
+    weights = torch.cat((weights, torch.tensor([new_weight])))
+
+    wrapped_approximation = partial(approximation, components=components, weights=weights)
+
+
+    scale = pyro.param("scale_{}".format(t)).item()
+    scales.append(scale)
+    loc = pyro.param("locs_{}".format(t)).item()
+    locs.append(loc)
+    print('locs = {}'.format(loc))
+    print('scale = {}'.format(scale))
+
+print(weights)
+print(locs)
+print(scales)
+
+X = np.arange(-3,18,0.1)
+Y1 = weights[1].item() * scipy.stats.norm.pdf((X - locs[1]) / scales[1])
+Y2 = weights[2].item() * scipy.stats.norm.pdf((X - locs[2]) / scales[2])
+#Y3 = weights[3].item() * scipy.stats.norm.pdf((X - locs[3]) / scales[3])
+
+pyplot.figure(figsize=(10, 4), dpi=100).set_facecolor('white')
+pyplot.plot(X, Y1, 'r-')
+pyplot.plot(X, Y2, 'b-')
+pyplot.plot(X, Y1 + Y2, 'k--')
+pyplot.plot(data.data.numpy(), np.zeros(len(data)), 'k*')
+pyplot.title('Density of two-component mixture model')
+pyplot.ylabel('probability density');
+pyplot.show()
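
The boosting loop in patch 04 leaves the fitted guides in `components` and their
mixture weights in `weights`. A minimal sketch of how samples could be drawn from
that boosted approximation -- the helper name `sample_from_mixture` is introduced
here for illustration and does not appear in the patches:

    import torch
    import pyro.distributions as dist
    from pyro.poutine import trace

    def sample_from_mixture(data, components, weights, n_samples=1000):
        # Pick a component per draw, run that guide once, and read off the
        # value it sampled at its 'obs' site.
        draws = []
        for _ in range(n_samples):
            k = dist.Categorical(weights).sample().item()
            guide_trace = trace(components[k]).get_trace(data)
            draws.append(guide_trace.nodes['obs']['value'])
        return torch.stack(draws)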
From 4a4ab298467eea30756819cb5392395ec925b5b5 Mon Sep 17 00:00:00 2001
From: Lorenz
Date: Tue, 29 Oct 2019 09:51:31 +0100
Subject: [PATCH 05/14] Add requirements.txt

---
 requirements.txt | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..cbb56f3
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,36 @@
+absl-py==0.8.1
+astor==0.8.0
+certifi==2019.9.11
+cycler==0.10.0
+gast==0.2.2
+google-pasta==0.1.7
+graphviz==0.13
+grpcio==1.24.1
+h5py==2.10.0
+joblib==0.14.0
+joypy==0.2.1
+Keras-Applications==1.0.8
+Keras-Preprocessing==1.1.0
+kiwisolver==1.1.0
+Markdown==3.1.1
+matplotlib==3.1.1
+numpy==1.17.2
+opt-einsum==3.1.0
+pandas==0.25.1
+protobuf==3.10.0
+pyparsing==2.4.2
+pyro-ppl==0.4.1
+python-dateutil==2.8.0
+pytz==2019.3
+scikit-learn==0.21.3
+scipy==1.3.1
+seaborn==0.9.0
+six==1.12.0
+tensorboard==1.14.0
+tensorflow==1.14.0
+tensorflow-estimator==1.14.0
+termcolor==1.1.0
+torch==1.3.0
+tqdm==4.36.1
+Werkzeug==0.16.0
+wrapt==1.11.2

From 41068fef1ebdcafa45dbc59080d34bc806b9a09a Mon Sep 17 00:00:00 2001
From: Lorenz Kuhn
Date: Tue, 29 Oct 2019 17:57:43 +0100
Subject: [PATCH 06/14] Reduce requirement specification

---
 requirements.txt | 34 ----------------------------------
 1 file changed, 34 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index cbb56f3..829dd8a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,36 +1,2 @@
-absl-py==0.8.1
-astor==0.8.0
-certifi==2019.9.11
-cycler==0.10.0
-gast==0.2.2
-google-pasta==0.1.7
-graphviz==0.13
-grpcio==1.24.1
-h5py==2.10.0
-joblib==0.14.0
-joypy==0.2.1
-Keras-Applications==1.0.8
-Keras-Preprocessing==1.1.0
-kiwisolver==1.1.0
-Markdown==3.1.1
-matplotlib==3.1.1
-numpy==1.17.2
-opt-einsum==3.1.0
-pandas==0.25.1
-protobuf==3.10.0
-pyparsing==2.4.2
 pyro-ppl==0.4.1
-python-dateutil==2.8.0
-pytz==2019.3
-scikit-learn==0.21.3
-scipy==1.3.1
-seaborn==0.9.0
-six==1.12.0
-tensorboard==1.14.0
-tensorflow==1.14.0
-tensorflow-estimator==1.14.0
-termcolor==1.1.0
 torch==1.3.0
-tqdm==4.36.1
-Werkzeug==0.16.0
-wrapt==1.11.2

From 70589cd9dba139a394084931c77290c075fb95a7 Mon Sep 17 00:00:00 2001
From: Lorenz
Date: Wed, 30 Oct 2019 10:00:26 +0100
Subject: [PATCH 07/14] Add more logging features

---
 prototype.py | 214 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 128 insertions(+), 86 deletions(-)

diff --git a/prototype.py b/prototype.py
index 5b9cf12..d104ae5 100644
--- a/prototype.py
+++ b/prototype.py
@@ -3,7 +3,7 @@
 import torch
 import torch.distributions.constraints as constraints
 import pyro
-from pyro.optim import Adam
+from pyro.optim import Adam, SGD
 from pyro.infer import SVI, Trace_ELBO, config_enumerate
 import pyro.distributions as dist
 from pyro.infer.autoguide import AutoDelta
@@ -13,10 +13,15 @@
 import numpy as np
 import scipy.stats
 from matplotlib import pyplot
+from pyro.infer.autoguide import AutoDelta
+from collections import defaultdict
+
+PRINT_INTERMEDIATE_LATENT_VALUES = False
+PRINT_TRACES = False
 
 # this is for running the notebook in our testing framework
 smoke_test = ('CI' in os.environ)
-n_steps = 2 if smoke_test else 1500
+n_steps = 2 if smoke_test else 1200
 pyro.set_rng_seed(2)
 
 # enable validation (e.g. validate parameters of distributions)
@@ -28,12 +33,18 @@
 
 data = torch.tensor([0., 1., 2., 0, 0.5, 1.5, 10., 11., 12., 10.6, 11.8, 12.2])
 
+n = torch.distributions.Normal(torch.tensor([2.0]), torch.tensor([1.0]))
+m = torch.distributions.Normal(torch.tensor([10.0]), torch.tensor([1.0]))
+
+data = torch.cat((n.sample((60,)), m.sample((40,))))
+
+
 K = 2 # Fixed number of components.
 
 @config_enumerate
 def model(data):
-    # Global variables.
-    weights = torch.tensor([0.4, 0.6])
+    weights = pyro.sample('weights', dist.Dirichlet(0.5 * torch.ones(K)))
     scale = pyro.sample('scale', dist.LogNormal(0., 2.))
     with pyro.plate('components', K):
         locs = pyro.sample('locs', dist.Normal(0., 10.))
@@ -43,8 +54,6 @@ def model(data):
         assignment = pyro.sample('assignment', dist.Categorical(weights))
         pyro.sample('obs', dist.Normal(locs[assignment], scale), obs=data)
 
-# global_guide = AutoDelta(poutine.block(model, expose=['locs', 'scale']))
-
 
 def guide(data, index):
     scale_q = pyro.param('scale_{}'.format(index), torch.tensor(1.),
@@ -56,24 +65,18 @@ def guide(data, index):
 
     pyro.sample('obs', dist.Normal(locs_q, scale_q))
 
-
+@config_enumerate
 def approximation(data, components, weights):
     assignment = pyro.sample('assignment', dist.Categorical(weights))
     components[assignment](data)
 
+
 def dummy_approximation(data):
-    scale_a = pyro.param('scale_a', torch.tensor(10000.),
-    constraint=constraints.positive)
 
-    locs_a = pyro.param('locs_a', torch.tensor(0.),
-    constraint=constraints.positive)
+    scale_a = torch.tensor([1.])
+    locs_a = torch.tensor([20.])
     sample = pyro.sample('obs', dist.Normal(locs_a, scale_a))
 
-    # scale_a = pyro.param("scale_a").item()
-    # locs_a = pyro.param("locs_a").item()
-
-    # print("Loc and scale of approx, before svi: {0} {1}".format(locs_a, scale_a))
-
 
 def relbo(model, guide, *args, **kwargs):
 
@@ -94,86 +97,125 @@ def relbo(model, guide, *args, **kwargs):
     # Loop over all the sample sites in the model and add the corresponding
     # log p(z) term to the ELBO. Note that this will also include any observed
     # data, i.e. sample sites with the keyword `obs=...`.
-    elbo = elbo + model_trace.log_prob_sum()
+    elbo = elbo + model_trace.log_prob_sum()
     # Loop over all the sample sites in the guide and add the corresponding
     # -log q(z) term to the ELBO.
     elbo = elbo - guide_trace.log_prob_sum()
     elbo = elbo - approximation_trace.log_prob_sum()
-    # print(approximation_trace.log_prob_sum())
 
+
     # Return (-elbo) since by convention we do gradient descent on a loss and
     # the ELBO is a lower bound that needs to be maximized.
+    if elbo < 10e-8 and PRINT_TRACES:
+        print('Guide trace')
+        print(guide_trace.log_prob_sum())
+        print('Model trace')
+        print(model_trace.log_prob_sum())
+        print('Approximation trace')
+        print(approximation_trace.log_prob_sum())
     return -elbo
 
+def boosting_bbvi():
+    # setup the optimizer
+    adam_params = {"lr": 0.0005, "betas": (0.90, 0.999)}
+    optimizer = Adam(adam_params)
 
-# setup the optimizer
-adam_params = {"lr": 0.0005, "betas": (0.90, 0.999)}
-optimizer = Adam(adam_params)
-
-
-# wrapped_guide = partial(guide, index=0)
-# svi = SVI(model, wrapped_guide, optimizer, loss=Trace_ELBO())
-# # do gradient steps
-# losses = []
-# for step in range(n_steps):
-#     loss = svi.step(data)
-#     losses.append(loss)
-#     if step % 100 == 0:
-#         print('.', end='')
-
-n_iterations = 2
-
-initial_approximation = dummy_approximation
-components = [initial_approximation]
-weights = torch.tensor([1.])
-wrapped_approximation = partial(approximation, components=components, weights=weights)
-
-locs = [0]
-scales = [0]
-
-for t in range(1, n_iterations + 1):
-    # setup the inference algorithm
-    wrapped_guide = partial(guide, index=t)
-    svi = SVI(model, wrapped_guide, optimizer, loss=relbo)
-    # do gradient steps
-    losses = []
-    for step in range(n_steps):
-        loss = svi.step(data, approximation=wrapped_approximation)
-        losses.append(loss)
-        if step % 100 == 0:
-            print('.', end='')
-
-    #pyplot.scatter(range(len(losses)), losses)
-    #pyplot.show()
-    components.append(wrapped_guide)
-    new_weight = 2 / (t + 1)
-
-    weights = weights * (1-new_weight)
-    weights = torch.cat((weights, torch.tensor([new_weight])))
-
-    wrapped_approximation = partial(approximation, components=components, weights=weights)
-
-
-    scale = pyro.param("scale_{}".format(t)).item()
-    scales.append(scale)
-    loc = pyro.param("locs_{}".format(t)).item()
-    locs.append(loc)
-    print('locs = {}'.format(loc))
-    print('scale = {}'.format(scale))
+    n_iterations = 2
+
+    initial_approximation = dummy_approximation
+    components = [initial_approximation]
+    weights = torch.tensor([1.])
+    wrapped_approximation = partial(approximation, components=components, weights=weights)
+
+    locs = [0]
+    scales = [0]
+
+    gradient_norms = defaultdict(list)
+    for t in range(1, n_iterations + 1):
+        # setup the inference algorithm
+        wrapped_guide = partial(guide, index=t)
+        svi = SVI(model, wrapped_guide, optimizer, loss=relbo)
+        # do gradient steps
+        losses = []
+        # Register hooks to monitor gradient norms.
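+        # Running the guide once here creates its pyro.params in the param
+        # store, so that the gradient-norm hooks below can be attached before
+        # the first optimization step.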
+        wrapped_guide(data)
+        for name, value in pyro.get_param_store().named_parameters():
+            if not name in gradient_norms:
+                value.register_hook(lambda g, name=name: gradient_norms[name].append(g.norm().item()))
+
+        for step in range(n_steps):
+            loss = svi.step(data, approximation=wrapped_approximation)
+            losses.append(loss)
+            if (loss > 1000 and PRINT_INTERMEDIATE_LATENT_VALUES):
+                print('Loss: {}'.format(loss))
+                scale = pyro.param("scale_{}".format(t)).item()
+                loc = pyro.param("locs_{}".format(t)).item()
+                print('locs = {}'.format(loc))
+                print('scale = {}'.format(scale))
+
+            if step % 100 == 0:
+                print('.', end=' ')
+
+
+        pyplot.plot(range(len(losses)), losses)
+        pyplot.xlabel('Update Steps')
+        pyplot.ylabel('-ELBO')
+        pyplot.title('-ELBO against time for component {}'.format(t));
+        pyplot.show()
+
+        components.append(wrapped_guide)
+        new_weight = 2 / (t + 1)
+
+        weights = weights * (1-new_weight)
+        weights = torch.cat((weights, torch.tensor([new_weight])))
+
+        wrapped_approximation = partial(approximation, components=components, weights=weights)
+
+
+        scale = pyro.param("scale_{}".format(t)).item()
+        scales.append(scale)
+        loc = pyro.param("locs_{}".format(t)).item()
+        locs.append(loc)
+        print('locs = {}'.format(loc))
+        print('scale = {}'.format(scale))
+
+    pyplot.figure(figsize=(10,4), dpi=100).set_facecolor('white')
+    for name, grad_norms in gradient_norms.items():
+        pyplot.plot(grad_norms, label=name)
+    pyplot.xlabel('iters')
+    pyplot.ylabel('gradient norm')
+    pyplot.yscale('log')
+    pyplot.legend(loc='best')
+    pyplot.title('Gradient norms during SVI');
+    pyplot.show()
+
+
+    for t in range(1, n_iterations + 1):
+        scale = pyro.param("scale_{}".format(t)).item()
+        loc = pyro.param("locs_{}".format(t)).item()
+        print('locs = {}'.format(loc))
+        print('scale = {}'.format(scale))
+
+
+    print(weights)
+    print(locs)
+    print(scales)
+
+    X = np.arange(-3,18,0.1)
+    Y1 = weights[1].item() * scipy.stats.norm.pdf((X - locs[1]) / scales[1])
+    Y2 = weights[2].item() * scipy.stats.norm.pdf((X - locs[2]) / scales[2])
+    #Y3 = weights[3].item() * scipy.stats.norm.pdf((X - locs[3]) / scales[3])
+
+    pyplot.figure(figsize=(10, 4), dpi=100).set_facecolor('white')
+    pyplot.plot(X, Y1, 'r-')
+    pyplot.plot(X, Y2, 'b-')
+    pyplot.plot(X, Y1 + Y2, 'k--')
+    pyplot.plot(data.data.numpy(), np.zeros(len(data)), 'k*')
+    pyplot.title('Density of two-component mixture model')
+    pyplot.ylabel('probability density');
+    pyplot.show()
+
+
+if __name__ == '__main__':
+
+    boosting_bbvi()
 
-print(weights)
-print(locs)
-print(scales)
-
-X = np.arange(-3,18,0.1)
-Y1 = weights[1].item() * scipy.stats.norm.pdf((X - locs[1]) / scales[1])
-Y2 = weights[2].item() * scipy.stats.norm.pdf((X - locs[2]) / scales[2])
-#Y3 = weights[3].item() * scipy.stats.norm.pdf((X - locs[3]) / scales[3])
-
-pyplot.figure(figsize=(10, 4), dpi=100).set_facecolor('white')
-pyplot.plot(X, Y1, 'r-')
-pyplot.plot(X, Y2, 'b-')
-pyplot.plot(X, Y1 + Y2, 'k--')
-pyplot.plot(data.data.numpy(), np.zeros(len(data)), 'k*')
-pyplot.title('Density of two-component mixture model')
-pyplot.ylabel('probability density');
-pyplot.show()
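
Each iteration above mixes the new component in with weight 2 / (t + 1) and
rescales the existing weights by 1 - 2 / (t + 1), a fixed step-size rule in the
spirit of Frank-Wolfe updates. A standalone sketch of just this schedule (the
helper name is chosen here for illustration):

    import torch

    def weight_schedule(n_iterations):
        # Mirrors the updates in boosting_bbvi(): the initial dummy component
        # starts at weight 1; component t enters with weight 2 / (t + 1).
        weights = torch.tensor([1.])
        for t in range(1, n_iterations + 1):
            new_weight = 2 / (t + 1)
            weights = weights * (1 - new_weight)
            weights = torch.cat((weights, torch.tensor([new_weight])))
        return weights

    print(weight_schedule(2))  # tensor([0.0000, 0.3333, 0.6667])

Note that the dummy approximation's weight drops to zero after the first
iteration, so it no longer contributes to the mixture.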
From 9374edb51b9704d8d3b3b23fc61808dfbc9e1a36 Mon Sep 17 00:00:00 2001
From: Gideon Dresdner
Date: Wed, 30 Oct 2019 12:23:38 +0100
Subject: [PATCH 08/14] wip, clean up deps

---
 environment.yml | 76 +++++++++++++++++++++++++++++++++++++++++++++++++
 install         |  3 ++
 prototype.py    |  1 +
 3 files changed, 80 insertions(+)
 create mode 100644 environment.yml
 create mode 100644 install
 mode change 100644 => 100755 prototype.py

diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..5c07a77
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,76 @@
+name: bbbvipyro
+channels:
+  - pytorch
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - blas=1.0=mkl
+  - ca-certificates=2019.10.16=0
+  - certifi=2019.9.11=py37_0
+  - cffi=1.13.0=py37h2e261b9_0
+  - cudatoolkit=10.1.168=0
+  - cycler=0.10.0=py37_0
+  - dbus=1.13.12=h746ee38_0
+  - expat=2.2.6=he6710b0_0
+  - fontconfig=2.13.0=h9420a91_0
+  - freetype=2.9.1=h8a8886c_1
+  - glib=2.56.2=hd408876_0
+  - gst-plugins-base=1.14.0=hbbd80ab_1
+  - gstreamer=1.14.0=hb453b48_1
+  - icu=58.2=h9c2bf20_1
+  - intel-openmp=2019.4=243
+  - jpeg=9b=h024ee3a_2
+  - kiwisolver=1.1.0=py37he6710b0_0
+  - libedit=3.1.20181209=hc058e9b_0
+  - libffi=3.2.1=hd88cf55_4
+  - libgcc-ng=9.1.0=hdf63c60_0
+  - libgfortran-ng=7.3.0=hdf63c60_0
+  - libpng=1.6.37=hbc83047_0
+  - libstdcxx-ng=9.1.0=hdf63c60_0
+  - libtiff=4.0.10=h2733197_2
+  - libuuid=1.0.3=h1bed415_2
+  - libxcb=1.13=h1bed415_1
+  - libxml2=2.9.9=hea5a465_1
+  - matplotlib=3.1.1=py37h5429711_0
+  - mkl=2019.4=243
+  - mkl-service=2.3.0=py37he904b0f_0
+  - mkl_fft=1.0.14=py37ha843d7b_0
+  - mkl_random=1.1.0=py37hd6b4f25_0
+  - ncurses=6.1=he6710b0_1
+  - ninja=1.9.0=py37hfd86e86_0
+  - numpy=1.17.2=py37haad9e8e_0
+  - numpy-base=1.17.2=py37hde5b4d6_0
+  - olefile=0.46=py37_0
+  - openssl=1.1.1d=h7b6447c_3
+  - pcre=8.43=he6710b0_0
+  - pillow=6.2.0=py37h34e0f95_0
+  - pip=19.3.1=py37_0
+  - pycparser=2.19=py37_0
+  - pyparsing=2.4.2=py_0
+  - pyqt=5.9.2=py37h05f1152_2
+  - python=3.7.4=h265db76_1
+  - python-dateutil=2.8.0=py37_0
+  - pytorch=1.3.0=py3.7_cuda10.1.243_cudnn7.6.3_0
+  - pytz=2019.3=py_0
+  - qt=5.9.7=h5867ecd_1
+  - readline=7.0=h7b6447c_5
+  - scipy=1.3.1=py37h7c811a0_0
+  - setuptools=41.4.0=py37_0
+  - sip=4.19.8=py37hf484d3e_0
+  - six=1.12.0=py37_0
+  - sqlite=3.30.1=h7b6447c_0
+  - tk=8.6.8=hbc83047_0
+  - torchvision=0.4.1=py37_cu101
+  - tornado=6.0.3=py37h7b6447c_0
+  - wheel=0.33.6=py37_0
+  - xz=5.2.4=h14c3975_4
+  - zlib=1.2.11=h7b6447c_3
+  - zstd=1.3.7=h0b5b093_0
+  - pip:
+    - opt-einsum==3.1.0
+    - pyro-api==0.1.1
+    - pyro-ppl==0.5.1
+    - python-graphviz==0.13
+    - tqdm==4.36.1
+prefix: /cluster/home/dgideon/software/anaconda3/envs/bbbvipyro
+

diff --git a/install b/install
new file mode 100644
index 0000000..3af76c9
--- /dev/null
+++ b/install
@@ -0,0 +1,3 @@
+conda install -y pytorch torchvision -c pytorch
+conda install -y scipy numpy matplotlib
+pip install pyro-ppl

diff --git a/prototype.py b/prototype.py
old mode 100644
new mode 100755
index 5b9cf12..9d55cc1
--- a/prototype.py
+++ b/prototype.py
@@ -20,6 +20,7 @@
 pyro.set_rng_seed(2)
 
 # enable validation (e.g. validate parameters of distributions)
+print(pyro.__version__) # returns 0.5.something. Why is this bad? #TODO(gideon)
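+# (environment.yml pins pyro-ppl 0.5.1 while the assertion below still expects 0.4.1, so it fails under the new environment -- presumably what this TODO is about.)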
 assert pyro.__version__.startswith('0.4.1')
 pyro.enable_validation(True)

From f6eb79c52b8b1ab551c4f25eb89c2bbe060685c9 Mon Sep 17 00:00:00 2001
From: Gideon Dresdner
Date: Wed, 30 Oct 2019 14:23:43 +0100
Subject: [PATCH 09/14] reproducible environment

---
 environment.yml | 74 ++++---------------------------------------------
 install         |  3 --
 prototype.py    |  2 --
 3 files changed, 6 insertions(+), 73 deletions(-)
 delete mode 100644 install

diff --git a/environment.yml b/environment.yml
index 5c07a77..360dce0 100644
--- a/environment.yml
+++ b/environment.yml
@@ -3,74 +3,12 @@ channels:
   - pytorch
   - defaults
 dependencies:
-  - _libgcc_mutex=0.1=main
-  - blas=1.0=mkl
-  - ca-certificates=2019.10.16=0
-  - certifi=2019.9.11=py37_0
-  - cffi=1.13.0=py37h2e261b9_0
-  - cudatoolkit=10.1.168=0
-  - cycler=0.10.0=py37_0
-  - dbus=1.13.12=h746ee38_0
-  - expat=2.2.6=he6710b0_0
-  - fontconfig=2.13.0=h9420a91_0
-  - freetype=2.9.1=h8a8886c_1
-  - glib=2.56.2=hd408876_0
-  - gst-plugins-base=1.14.0=hbbd80ab_1
-  - gstreamer=1.14.0=hb453b48_1
-  - icu=58.2=h9c2bf20_1
-  - intel-openmp=2019.4=243
-  - jpeg=9b=h024ee3a_2
-  - kiwisolver=1.1.0=py37he6710b0_0
-  - libedit=3.1.20181209=hc058e9b_0
-  - libffi=3.2.1=hd88cf55_4
-  - libgcc-ng=9.1.0=hdf63c60_0
-  - libgfortran-ng=7.3.0=hdf63c60_0
-  - libpng=1.6.37=hbc83047_0
-  - libstdcxx-ng=9.1.0=hdf63c60_0
-  - libtiff=4.0.10=h2733197_2
-  - libuuid=1.0.3=h1bed415_2
-  - libxcb=1.13=h1bed415_1
-  - libxml2=2.9.9=hea5a465_1
-  - matplotlib=3.1.1=py37h5429711_0
-  - mkl=2019.4=243
-  - mkl-service=2.3.0=py37he904b0f_0
-  - mkl_fft=1.0.14=py37ha843d7b_0
-  - mkl_random=1.1.0=py37hd6b4f25_0
-  - ncurses=6.1=he6710b0_1
-  - ninja=1.9.0=py37hfd86e86_0
-  - numpy=1.17.2=py37haad9e8e_0
-  - numpy-base=1.17.2=py37hde5b4d6_0
-  - olefile=0.46=py37_0
-  - openssl=1.1.1d=h7b6447c_3
-  - pcre=8.43=he6710b0_0
-  - pillow=6.2.0=py37h34e0f95_0
-  - pip=19.3.1=py37_0
-  - pycparser=2.19=py37_0
-  - pyparsing=2.4.2=py_0
-  - pyqt=5.9.2=py37h05f1152_2
-  - python=3.7.4=h265db76_1
-  - python-dateutil=2.8.0=py37_0
-  - pytorch=1.3.0=py3.7_cuda10.1.243_cudnn7.6.3_0
-  - pytz=2019.3=py_0
-  - qt=5.9.7=h5867ecd_1
-  - readline=7.0=h7b6447c_5
-  - scipy=1.3.1=py37h7c811a0_0
-  - setuptools=41.4.0=py37_0
-  - sip=4.19.8=py37hf484d3e_0
-  - six=1.12.0=py37_0
-  - sqlite=3.30.1=h7b6447c_0
-  - tk=8.6.8=hbc83047_0
-  - torchvision=0.4.1=py37_cu101
-  - tornado=6.0.3=py37h7b6447c_0
-  - wheel=0.33.6=py37_0
-  - xz=5.2.4=h14c3975_4
-  - zlib=1.2.11=h7b6447c_3
-  - zstd=1.3.7=h0b5b093_0
+  - matplotlib
+  - numpy
+  - scipy
+  - torchvision
+  - pytorch
+  - pip
   - pip:
-    - opt-einsum==3.1.0
-    - pyro-api==0.1.1
     - pyro-ppl==0.5.1
-    - python-graphviz==0.13
-    - tqdm==4.36.1
 prefix: /cluster/home/dgideon/software/anaconda3/envs/bbbvipyro
-

diff --git a/install b/install
deleted file mode 100644
index 3af76c9..0000000
--- a/install
+++ /dev/null
@@ -1,3 +0,0 @@
-conda install -y pytorch torchvision -c pytorch
-conda install -y scipy numpy matplotlib
-pip install pyro-ppl

diff --git a/prototype.py b/prototype.py
index 9d55cc1..c1604bf 100755
--- a/prototype.py
+++ b/prototype.py
@@ -20,8 +20,6 @@
 pyro.set_rng_seed(2)
 
 # enable validation (e.g. validate parameters of distributions)
-print(pyro.__version__) # returns 0.5.something. Why is this bad? #TODO(gideon)
-assert pyro.__version__.startswith('0.4.1')
 pyro.enable_validation(True)
 
 # clear the param store in case we're in a REPL
From 7f005c2a3994378441f70d1d39ead69697981833 Mon Sep 17 00:00:00 2001
From: Lorenz
Date: Sat, 9 Nov 2019 18:00:46 +0100
Subject: [PATCH 10/14] Add first draft of bimodal_posterior example

---
 bimodal_posterior.py | 160 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 160 insertions(+)
 create mode 100644 bimodal_posterior.py

diff --git a/bimodal_posterior.py b/bimodal_posterior.py
new file mode 100644
index 0000000..163a84e
--- /dev/null
+++ b/bimodal_posterior.py
@@ -0,0 +1,160 @@
+import math
+import os
+import torch
+import torch.distributions.constraints as constraints
+import pyro
+from pyro.optim import Adam, SGD
+from pyro.infer import SVI, Trace_ELBO, config_enumerate
+import pyro.distributions as dist
+from pyro.infer.autoguide import AutoDelta
+from pyro import poutine
+from pyro.poutine import trace, replay, block
+from functools import partial
+import numpy as np
+import scipy.stats
+from pyro.infer.autoguide import AutoDelta
+from collections import defaultdict
+import matplotlib
+from matplotlib import pyplot
+
+PRINT_INTERMEDIATE_LATENT_VALUES = True
+PRINT_TRACES = False
+
+# this is for running the notebook in our testing framework
+smoke_test = ('CI' in os.environ)
+n_steps = 2 if smoke_test else 2000
+pyro.set_rng_seed(2)
+
+# enable validation (e.g. validate parameters of distributions)
+pyro.enable_validation(True)
+
+# clear the param store in case we're in a REPL
+pyro.clear_param_store()
+
+data = torch.tensor([4.0, 4.2, 3.9, 4.1, 3.8, 3.5, 4.3])
+
+def guide(data, index):
+    variance_q = pyro.param('variance_{}'.format(index), torch.tensor([1.0]), constraints.positive)
+    mu_q = pyro.param('mu_{}'.format(index), torch.tensor([1.0]))
+    pyro.sample("mu", dist.Normal(mu_q, variance_q))
+
+@config_enumerate
+def model(data):
+    # Global variables.
+    prior_mu = torch.tensor([0.])
+    prior_variance = torch.tensor([5.])
+    mu = pyro.sample('mu', dist.Normal(prior_mu, prior_variance))
+    variance = torch.tensor([1.])
+
+    for i in pyro.plate('data', len(data)):
+        # Local variables.
+        pyro.sample('obs_{}'.format(i), dist.Normal(mu*mu, variance), obs=data[i])
+
+@config_enumerate
+def approximation(data, components, weights):
+    for i in pyro.plate('data', len(data)):
+        assignment = pyro.sample('assignment_{}'.format(i), dist.Categorical(weights))
+        distribution = components[assignment](data)
+
+def dummy_approximation(data):
+    variance_q = pyro.param('variance_0', torch.tensor([1.0]), constraints.positive)
+    mu_q = pyro.param('mu_0', torch.tensor([20.0]))
+    pyro.sample("mu", dist.Normal(mu_q, variance_q))
+
+def relbo(model, guide, *args, **kwargs):
+
+    approximation = kwargs.pop('approximation', None)
+    # Run the guide with the arguments passed to SVI.step() and trace the execution,
+    # i.e. record all the calls to Pyro primitives like sample() and param().
+    #print("enter relbo")
+    guide_trace = trace(guide).get_trace(*args, **kwargs)
+    #print(guide_trace.nodes['obs_1'])
+    model_trace = trace(replay(model, guide_trace)).get_trace(*args, **kwargs)
+    #print(model_trace.nodes['obs_1'])
+
+
+    approximation_trace = trace(replay(block(approximation, expose_fn= lambda site: 'obs_' in site['name']), guide_trace)).get_trace(*args, **kwargs)
+    # We will accumulate the various terms of the ELBO in `elbo`.
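+    # Note: the manual log-prob sum below is computed but immediately
+    # overwritten by the TraceEnum_ELBO-based loss, which marginalizes the
+    # enumerated discrete sites correctly.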
+
+    elbo = model_trace.log_prob_sum() - guide_trace.log_prob_sum() - approximation_trace.log_prob_sum()
+    loss_fn = pyro.infer.TraceEnum_ELBO(max_plate_nesting=1).differentiable_loss(model,
+                                                                                 guide,
+                                                                                 *args, **kwargs)
+
+    elbo = -loss_fn - approximation_trace.log_prob_sum()
+    # Return (-elbo) since by convention we do gradient descent on a loss and
+    # the ELBO is a lower bound that needs to be maximized.
+
+    return -elbo
+
+
+def boosting_bbvi():
+    n_iterations = 2
+
+    initial_approximation = dummy_approximation
+    components = [initial_approximation]
+    weights = torch.tensor([1.])
+    wrapped_approximation = partial(approximation, components=components,
+                                    weights=weights)
+
+    locs = [0]
+    scales = [0]
+
+    gradient_norms = defaultdict(list)
+    for t in range(1, n_iterations + 1):
+        # setup the inference algorithm
+        wrapped_guide = partial(guide, index=t)
+        # do gradient steps
+        losses = []
+        # Register hooks to monitor gradient norms.
+        wrapped_guide(data)
+        print(pyro.get_param_store().named_parameters())
+
+        adam_params = {"lr": 0.0005, "betas": (0.90, 0.999)}
+        optimizer = Adam(adam_params)
+        for name, value in pyro.get_param_store().named_parameters():
+            if not name in gradient_norms:
+                value.register_hook(lambda g, name=name: gradient_norms[name].append(g.norm().item()))
+
+        svi = SVI(model, wrapped_guide, optimizer, loss=relbo)
+        for step in range(n_steps):
+            loss = svi.step(data, approximation=wrapped_approximation)
+            losses.append(loss)
+
+            if PRINT_INTERMEDIATE_LATENT_VALUES:
+                print('Loss: {}'.format(loss))
+                variance = pyro.param("variance_{}".format(t)).item()
+                mu = pyro.param("mu_{}".format(t)).item()
+                print('mu = {}'.format(mu))
+                print('variance = {}'.format(variance))
+
+            if step % 100 == 0:
+                print('.', end=' ')
+
+        pyplot.plot(range(len(losses)), losses)
+        pyplot.xlabel('Update Steps')
+        pyplot.ylabel('-ELBO')
+        pyplot.title('-ELBO against time for component {}'.format(t));
+        pyplot.show()
+
+        components.append(wrapped_guide)
+        new_weight = 2 / (t + 1)
+
+        weights = weights * (1-new_weight)
+        weights = torch.cat((weights, torch.tensor([new_weight])))
+
+        wrapped_approximation = partial(approximation, components=components, weights=weights)
+
+    pyplot.figure(figsize=(10, 4), dpi=100).set_facecolor('white')
+    for name, grad_norms in gradient_norms.items():
+        pyplot.plot(grad_norms, label=name)
+    pyplot.xlabel('iters')
+    pyplot.ylabel('gradient norm')
+    # pyplot.yscale('log')
+    pyplot.legend(loc='best')
+    pyplot.title('Gradient norms during SVI');
+    pyplot.show()
+
+
+if __name__ == '__main__':
+    boosting_bbvi()
\ No newline at end of file
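
In bimodal_posterior.py the likelihood is obs_i ~ Normal(mu^2, 1) with prior
mu ~ Normal(0, 5), so the exact posterior over mu is symmetric with modes near
+/-2 (since mean(data) is about 3.97, close to mu^2). A quick standalone check
of that target density, useful for judging the fitted mixture (names here are
chosen for illustration):

    import numpy as np
    import scipy.stats

    data = np.array([4.0, 4.2, 3.9, 4.1, 3.8, 3.5, 4.3])

    def unnormalized_posterior(mu):
        # prior mu ~ Normal(0, 5) times likelihood obs_i ~ Normal(mu**2, 1),
        # matching the model in bimodal_posterior.py
        prior = scipy.stats.norm.pdf(mu, loc=0., scale=5.)
        likelihood = np.prod(scipy.stats.norm.pdf(data, loc=mu**2, scale=1.))
        return prior * likelihood

    mus = np.arange(-3., 3., 0.01)
    densities = np.array([unnormalized_posterior(mu) for mu in mus])
    print(mus[densities.argmax()])  # close to +/-2, the two posterior modes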
From 53d6a25c878f8f0bfa6d95cf7e28a8924dc5204d Mon Sep 17 00:00:00 2001
From: Lorenz
Date: Sat, 9 Nov 2019 19:00:19 +0100
Subject: [PATCH 11/14] Fix exposure of approximation parameters

---
 bimodal_posterior.py | 40 ++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/bimodal_posterior.py b/bimodal_posterior.py
index 163a84e..6876743 100644
--- a/bimodal_posterior.py
+++ b/bimodal_posterior.py
@@ -22,7 +22,7 @@
 # this is for running the notebook in our testing framework
 smoke_test = ('CI' in os.environ)
-n_steps = 2 if smoke_test else 2000
+n_steps = 2 if smoke_test else 5000
 pyro.set_rng_seed(2)
 
 # enable validation (e.g. validate parameters of distributions)
@@ -35,7 +35,7 @@
 
 def guide(data, index):
     variance_q = pyro.param('variance_{}'.format(index), torch.tensor([1.0]), constraints.positive)
-    mu_q = pyro.param('mu_{}'.format(index), torch.tensor([1.0]))
+    mu_q = pyro.param('mu_{}'.format(index), torch.tensor([-1.0]))
     pyro.sample("mu", dist.Normal(mu_q, variance_q))
 
 @config_enumerate
@@ -52,13 +52,12 @@ def model(data):
 
 @config_enumerate
 def approximation(data, components, weights):
-    for i in pyro.plate('data', len(data)):
-        assignment = pyro.sample('assignment_{}'.format(i), dist.Categorical(weights))
-        distribution = components[assignment](data)
+    assignment = pyro.sample('assignment', dist.Categorical(weights))
+    distribution = components[assignment](data)
 
 def dummy_approximation(data):
     variance_q = pyro.param('variance_0', torch.tensor([1.0]), constraints.positive)
-    mu_q = pyro.param('mu_0', torch.tensor([20.0]))
+    mu_q = pyro.param('mu_0', torch.tensor([1.0]))
     pyro.sample("mu", dist.Normal(mu_q, variance_q))
 
 def relbo(model, guide, *args, **kwargs):
@@ -73,7 +72,7 @@ def relbo(model, guide, *args, **kwargs):
     #print(model_trace.nodes['obs_1'])
 
 
-    approximation_trace = trace(replay(block(approximation, expose_fn= lambda site: 'obs_' in site['name']), guide_trace)).get_trace(*args, **kwargs)
+    approximation_trace = trace(replay(block(approximation, expose=['mu']), guide_trace)).get_trace(*args, **kwargs)
     # We will accumulate the various terms of the ELBO in `elbo`.
 
     elbo = model_trace.log_prob_sum() - guide_trace.log_prob_sum() - approximation_trace.log_prob_sum()
     loss_fn = pyro.infer.TraceEnum_ELBO(max_plate_nesting=1).differentiable_loss(model,
                                                                                  guide,
                                                                                  *args, **kwargs)
 
+    # print(loss_fn)
+    # print(approximation_trace.log_prob_sum())
     elbo = -loss_fn - approximation_trace.log_prob_sum()
     # Return (-elbo) since by convention we do gradient descent on a loss and
     # the ELBO is a lower bound that needs to be maximized.
@@ -110,7 +111,7 @@ def boosting_bbvi():
         wrapped_guide(data)
         print(pyro.get_param_store().named_parameters())
 
-        adam_params = {"lr": 0.0005, "betas": (0.90, 0.999)}
+        adam_params = {"lr": 0.005, "betas": (0.90, 0.999)}
         optimizer = Adam(adam_params)
         for name, value in pyro.get_param_store().named_parameters():
             if not name in gradient_norms:
@@ -145,6 +146,13 @@ def boosting_bbvi():
 
         wrapped_approximation = partial(approximation, components=components, weights=weights)
 
+        scale = pyro.param("variance_{}".format(t)).item()
+        scales.append(scale)
+        loc = pyro.param("mu_{}".format(t)).item()
+        locs.append(loc)
+        print('mu = {}'.format(loc))
+        print('variance = {}'.format(scale))
+
     pyplot.figure(figsize=(10, 4), dpi=100).set_facecolor('white')
     for name, grad_norms in gradient_norms.items():
         pyplot.plot(grad_norms, label=name)
@@ -155,6 +163,22 @@ def boosting_bbvi():
     pyplot.title('Gradient norms during SVI');
     pyplot.show()
 
+    print(weights)
+    print(locs)
+    print(scales)
+
+    X = np.arange(-10, 10, 0.1)
+    Y1 = weights[1].item() * scipy.stats.norm.pdf((X - locs[1]) / scales[1])
+    Y2 = weights[2].item() * scipy.stats.norm.pdf((X - locs[2]) / scales[2])
+
+    pyplot.figure(figsize=(10, 4), dpi=100).set_facecolor('white')
+    pyplot.plot(X, Y1, 'r-')
+    pyplot.plot(X, Y2, 'b-')
+    pyplot.plot(X, Y1 + Y2, 'k--')
+    pyplot.plot(data.data.numpy(), np.zeros(len(data)), 'k*')
+    pyplot.title('Approximation of posterior over mu')
+    pyplot.ylabel('probability density');
+    pyplot.show()
 
 if __name__ == '__main__':
     boosting_bbvi()
\ No newline at end of file

From 32950ef1842730a71ff4209cbd9ec15e2cdec323 Mon Sep 17 00:00:00 2001
From: Lorenz
Date: Sat, 9 Nov 2019 19:13:34 +0100
Subject: [PATCH 12/14] Decrease learning rate

---
 bimodal_posterior.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bimodal_posterior.py b/bimodal_posterior.py
index 6876743..c0d2173 100644
--- a/bimodal_posterior.py
+++ b/bimodal_posterior.py
@@ -35,7 +35,7 @@
 
 def guide(data, index):
     variance_q = pyro.param('variance_{}'.format(index), torch.tensor([1.0]), constraints.positive)
-    mu_q = pyro.param('mu_{}'.format(index), torch.tensor([-1.0]))
+    mu_q = pyro.param('mu_{}'.format(index), torch.tensor([1.0]))
     pyro.sample("mu", dist.Normal(mu_q, variance_q))
 
 @config_enumerate
@@ -57,7 +57,7 @@ def approximation(data, components, weights):
 
 def dummy_approximation(data):
     variance_q = pyro.param('variance_0', torch.tensor([1.0]), constraints.positive)
-    mu_q = pyro.param('mu_0', torch.tensor([1.0]))
+    mu_q = pyro.param('mu_0', torch.tensor([20.0]))
     pyro.sample("mu", dist.Normal(mu_q, variance_q))
 
 def relbo(model, guide, *args, **kwargs):
@@ -111,7 +111,7 @@ def boosting_bbvi():
         wrapped_guide(data)
         print(pyro.get_param_store().named_parameters())
 
-        adam_params = {"lr": 0.005, "betas": (0.90, 0.999)}
+        adam_params = {"lr": 0.001, "betas": (0.90, 0.999)}
         optimizer = Adam(adam_params)
         for name, value in pyro.get_param_store().named_parameters():
             if not name in gradient_norms:

From 565353e881f02eb444fd6d0755e2a55fb791f1cf Mon Sep 17 00:00:00 2001
From: Lorenz
Date: Sat, 9 Nov 2019 19:15:56 +0100
Subject: [PATCH 13/14] Add comment

---
 bimodal_posterior.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/bimodal_posterior.py b/bimodal_posterior.py
index c0d2173..f2e6ee7 100644
--- a/bimodal_posterior.py
+++ b/bimodal_posterior.py
@@ -75,7 +75,9 @@ def relbo(model, guide, *args, **kwargs):
 
     approximation_trace = trace(replay(block(approximation, expose=['mu']), guide_trace)).get_trace(*args, **kwargs)
     # We will accumulate the various terms of the ELBO in `elbo`.
-    elbo = model_trace.log_prob_sum() - guide_trace.log_prob_sum() - approximation_trace.log_prob_sum()
+    # This is how we computed the ELBO before using TraceEnum_ELBO:
+    # elbo = model_trace.log_prob_sum() - guide_trace.log_prob_sum() - approximation_trace.log_prob_sum()
+
     loss_fn = pyro.infer.TraceEnum_ELBO(max_plate_nesting=1).differentiable_loss(model,
                                                                                  guide,
                                                                                  *args, **kwargs)

From f416629f3aaa5f55f24bc18c5578d25b08e7e69f Mon Sep 17 00:00:00 2001
From: Lorenz
Date: Sat, 9 Nov 2019 19:33:03 +0100
Subject: [PATCH 14/14] Increase learning rate

---
 bimodal_posterior.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bimodal_posterior.py b/bimodal_posterior.py
index f2e6ee7..de92785 100644
--- a/bimodal_posterior.py
+++ b/bimodal_posterior.py
@@ -77,7 +77,7 @@ def relbo(model, guide, *args, **kwargs):
     # This is how we computed the ELBO before using TraceEnum_ELBO:
     # elbo = model_trace.log_prob_sum() - guide_trace.log_prob_sum() - approximation_trace.log_prob_sum()
-    
+
     loss_fn = pyro.infer.TraceEnum_ELBO(max_plate_nesting=1).differentiable_loss(model,
                                                                                  guide,
                                                                                  *args, **kwargs)
@@ -113,7 +113,7 @@ def boosting_bbvi():
         wrapped_guide(data)
         print(pyro.get_param_store().named_parameters())
 
-        adam_params = {"lr": 0.001, "betas": (0.90, 0.999)}
+        adam_params = {"lr": 0.002, "betas": (0.90, 0.999)}
         optimizer = Adam(adam_params)
         for name, value in pyro.get_param_store().named_parameters():
             if not name in gradient_norms: