use arviz and hdi instead of percentile (a usage sketch follows below, before the file diffs)
clip the proposals in de and de_snooker to [0, 1] instead of wrapping with modulo
don't remove outliers for kde generation (this should have been removed a while ago: since the generation is not random, the outliers are not really outlying)
read locationFrom (this was a bug; locationFrom was missing for MCMC parameter matching)
bumped version number to 0.8.9
Immudzen committed Sep 22, 2021
1 parent 072d54c commit 2a13351
Showing 7 changed files with 42 additions and 35 deletions.
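
The core change in this commit replaces percentile-based credible intervals with arviz's highest density interval (HDI), with the mean kept as the point estimate. Below is a minimal sketch of the difference on a synthetic chain; the (walkers, steps, parameters) shape and the fake data are assumptions for illustration, not CADETMatch output.

```python
# Minimal sketch (not CADETMatch code): old percentile interval vs. new arviz HDI.
import numpy
import arviz

rng = numpy.random.default_rng(0)
chain = rng.normal(size=(16, 500, 3))            # synthetic chain: 16 walkers, 500 steps, 3 parameters
chain_flat = chain.reshape(-1, chain.shape[2])   # collapse walkers and steps

# Old approach: 5th / 50th / 95th percentiles of the flattened chain
lb_old, mid_old, ub_old = numpy.percentile(chain_flat, [5, 50, 95], 0)

# New approach: 90% highest density interval; for ndarray input arviz.hdi
# returns an array of shape (n_params, 2) holding the lower and upper bounds
hdi = arviz.hdi(chain, hdi_prob=0.9)
lb_new, ub_new = hdi[:, 0], hdi[:, 1]
mid_new = numpy.mean(chain_flat, axis=0)         # the mean now serves as the point estimate

print(numpy.column_stack([lb_old, ub_old]))
print(numpy.column_stack([lb_new, ub_new]))
```

Unlike a symmetric percentile interval, the HDI is the narrowest region that contains the requested probability mass, which can matter for skewed posteriors.
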
CADETMatch.pyproj: 2 changes (1 addition, 1 deletion)
@@ -12,7 +12,7 @@
<ProjectTypeGuids>{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
<LaunchProvider>Standard Python launcher</LaunchProvider>
<InterpreterId>CondaEnv|CondaEnv|pymoo_devel</InterpreterId>
<CommandLineArguments>"C:\Users\kosh\Downloads\Cadet-Match-GIEX Question-20210903T074409Z-001\Cadet-Match-GIEX Question\Match_Test.json" 1</CommandLineArguments>
<CommandLineArguments>"F:\cadet_release_test\search\mcmc\stage2\non.json" 12</CommandLineArguments>
<EnableNativeCodeDebugging>False</EnableNativeCodeDebugging>
<SuppressConfigureTestFrameworkPrompt>true</SuppressConfigureTestFrameworkPrompt>
<InterpreterArguments>
CADETMatch/de.py: 2 changes (1 addition, 1 deletion)
@@ -49,5 +49,5 @@ def get_proposal(self, s, c, random):
random.shuffle(w)
g = np.diff(w, axis=0) * self.g0 + f[i]
q[i] = s[i] + g
- q[i] = q[i] % 1
+ q = np.clip(q, 0, 1)
return q, np.zeros(Ns, dtype=np.float64)
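
The old code wrapped out-of-bounds proposals with `% 1`, which can teleport a point to the opposite side of the normalized [0, 1] parameter range; the new code clips them to the boundary instead (the same change is applied in de_snooker.py below). A toy illustration with made-up values:

```python
# Toy illustration (made-up values): modulo wrapping vs. clipping of a proposal vector.
import numpy as np

q = np.array([0.25, 1.20, -0.10])   # two entries fall outside [0, 1]

wrapped = q % 1                     # old behaviour: 1.20 -> ~0.20, -0.10 -> ~0.90 (jumps to the far side)
clipped = np.clip(q, 0, 1)          # new behaviour: 1.20 -> 1.00, -0.10 -> 0.00 (pinned at the boundary)

print(wrapped)   # roughly [0.25 0.2  0.9 ]
print(clipped)   # roughly [0.25 1.   0.  ]
```
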
CADETMatch/de_snooker.py: 2 changes (1 addition, 1 deletion)
@@ -43,6 +43,6 @@ def get_proposal(self, s, c, random):
norm = np.linalg.norm(delta)
u = delta / np.sqrt(norm)
q[i] = s[i] + u * self.gammas * (np.dot(u, z1) - np.dot(u, z2))
- q[i] = q[i] % 1
metropolis[i] = np.log(np.linalg.norm(q[i] - z)) - np.log(norm)
+ q = np.clip(q, 0, 1)
return q, 0.5 * (ndim - 1.0) * metropolis
CADETMatch/kde_generator.py: 20 changes (5 additions, 15 deletions)
@@ -495,40 +495,30 @@ def generate_synthetic_error(cache):

scores = numpy.array(scores_all)

- keep_idx = keep_data(scores)

- kept = int(numpy.sum(keep_idx))
- removed = int(len(scores) - kept)

dir_base = cache.settings.get("resultsDirBase")
file = dir_base / "kde_data.h5"

kde_data = H5()
kde_data.filename = file.as_posix()

- kde_data.root.kept = kept
- kde_data.root.removed = removed

- kde_data.root.scores = scores[keep_idx, :]

- kde_data.root.original.scores = scores
+ kde_data.root.scores = scores

for output_name, output in outputs_all.items():
- kde_data.root[output_name] = numpy.array(output)[keep_idx, :]
+ kde_data.root[output_name] = numpy.array(output)

for time_name, time in times.items():
kde_data.root["%s_time" % time_name] = time

for name, experiment in errors_all.items():
for error_name, error_value in experiment.items():
- kde_data.root.errors[name][error_name] = error_value[keep_idx, :]
+ kde_data.root.errors[name][error_name] = error_value

for key, value in uv_store_all.items():
- kde_data.root.uv_store[key] = numpy.array(value)[keep_idx, :]
+ kde_data.root.uv_store[key] = numpy.array(value)

kde_data.save(lock=True)

- return scores[keep_idx, :]
+ return scores

return None

CADETMatch/search/mcmc.py: 45 changes (30 additions, 15 deletions)
@@ -26,6 +26,7 @@
import CADETMatch.util as util
import CADETMatch.sub as sub
import CADETMatch.pop as pop
+ import arviz

name = "MCMC"

@@ -208,7 +209,10 @@ def converged_bounds(chain, length, error_level):
temp_chain_flat = temp_chain.reshape(
temp_chain_shape[0] * temp_chain_shape[1], temp_chain_shape[2]
)
- lb_5, mid_50, ub_95 = numpy.percentile(temp_chain_flat, [5, 50, 95], 0)
+ hdi = arviz.hdi(temp_chain, hdi_prob=0.9)
+ lb_5 = hdi[:,0]
+ ub_95 = hdi[:,1]
+ mid_50 = numpy.mean(temp_chain_flat, axis=0)

lb.append(lb_5)
ub.append(ub_95)
@@ -665,13 +669,16 @@ def sampler_auto_bounds(cache, checkpoint, sampler, checkpointFile, mcmc_store):


def process_interval(cache, mcmc_store, interval_chain, interval_chain_transform):
- mean = numpy.mean(interval_chain_transform, 0)
- labels = [5, 10, 50, 90, 95]
- percentile = numpy.percentile(interval_chain_transform, labels, 0)
+ hdi = arviz.hdi(interval_chain_transform, hdi_prob=0.9)
+ lb_5 = hdi[:,0]
+ ub_95 = hdi[:,1]
+ mid_50 = numpy.mean(flatten(interval_chain_transform), axis=0)

- mcmc_store.root.percentile["mean"] = mean
- for idx, label in enumerate(labels):
- mcmc_store.root.percentile["percentile_%s" % label] = percentile[idx, :]
+ hdi_stat = numpy.vstack([lb_5, mid_50, ub_95])[:, numpy.newaxis, :]

+ mcmc_store.root.percentile["mean"] = mid_50
+ mcmc_store.root.percentile["lb_hdi_90"] = lb_5
+ mcmc_store.root.percentile["ub_hdi_90"] = ub_95

flat_interval = interval(interval_chain, cache)
flat_interval_transform = interval(interval_chain_transform, cache)
@@ -724,7 +731,7 @@ def process_sampler_run_write(cache, mcmc_store):

interval_chain = chain_flat
interval_chain_transform = chain_flat_transform
- process_interval(cache, mcmc_store, interval_chain, interval_chain_transform)
+ process_interval(cache, mcmc_store, chain, chain_transform)


def sampler_run(cache, checkpoint, sampler, checkpointFile, mcmc_store):
@@ -769,9 +776,16 @@ def sampler_run(cache, checkpoint, sampler, checkpointFile, mcmc_store):
run_chain = addChain(run_chain, p[:, numpy.newaxis, :])
run_probability = addChain(run_probability, ln_prob[:, numpy.newaxis])

+ hdi = arviz.hdi(run_chain, hdi_prob=0.9)
+ lb_5 = hdi[:,0]
+ ub_95 = hdi[:,1]
+ mid_50 = numpy.mean(flatten(run_chain), axis=0)

+ hdi_stat = numpy.vstack([lb_5, mid_50, ub_95])[:, numpy.newaxis, :]

run_chain_stat = addChain(
run_chain_stat,
- numpy.percentile(flatten(run_chain), [5, 50, 95], 0)[:, numpy.newaxis, :],
+ hdi_stat,
)

multiprocessing.get_logger().info(
@@ -1249,14 +1263,15 @@ def writeMCMC(cache, mcmc_store, process_mcmc_store):
mcmc_store.save(lock=True)


- def interval(flat_chain, cache):
- mean = numpy.mean(flat_chain, 0)

- percentile = numpy.percentile(flat_chain, [5, 10, 50, 90, 95], 0)
+ def interval(chain, cache):
+ hdi = arviz.hdi(chain, hdi_prob=0.9)
+ lb_5 = hdi[:,0]
+ ub_95 = hdi[:,1]
+ mid_50 = numpy.mean(flatten(chain), axis=0)

- data = numpy.vstack((mean, percentile)).transpose()
+ hdi_stat = numpy.vstack([lb_5, mid_50, ub_95])

- pd = pandas.DataFrame(data, columns=["mean", "5", "10", "50", "90", "95"])
+ pd = pandas.DataFrame(hdi_stat.transpose(), columns=["lb_hdi_90", "mean", "ub_hdi_90"])
pd.insert(0, "name", cache.parameter_headers_actual)
pd.set_index("name")
return pd
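
The rewritten interval() reports the 90% HDI bounds around the mean instead of a row of percentiles. Below is a standalone sketch of the resulting summary table; flatten() is assumed to collapse walkers and steps (a plain reshape stands in for it), and the parameter names are made up in place of cache.parameter_headers_actual.

```python
# Standalone sketch of the new summary table (illustrative, not the CADETMatch function itself).
import numpy
import pandas
import arviz

chain = numpy.random.default_rng(1).normal(size=(16, 500, 2))    # synthetic chain, 2 parameters

hdi = arviz.hdi(chain, hdi_prob=0.9)                              # shape (n_params, 2)
lb_5, ub_95 = hdi[:, 0], hdi[:, 1]
mid_50 = numpy.mean(chain.reshape(-1, chain.shape[2]), axis=0)    # stand-in for flatten(chain)

hdi_stat = numpy.vstack([lb_5, mid_50, ub_95])                    # shape (3, n_params)

table = pandas.DataFrame(hdi_stat.transpose(), columns=["lb_hdi_90", "mean", "ub_hdi_90"])
table.insert(0, "name", ["ka", "kd"])                             # stand-in for cache.parameter_headers_actual
print(table)
```
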
CADETMatch/util.py: 4 changes (3 additions, 1 deletion)
@@ -522,7 +522,9 @@ def update_json_mcmc(settings):
new_parameters = settings["parameters_mcmc"]

for new, prior in zip(new_parameters, prior_parameters):
if new["location"].split("/")[-1] == prior["location"].split("/")[-1]:
ok_location = "location" in new and new["location"].split("/")[-1] == prior["location"].split("/")[-1]
ok_location_from = "locationFrom" in new and new["locationFrom"].split("/")[-1] == prior["locationFrom"].split("/")[-1]
if ok_location or ok_location_from:
#update just the location data everthing else needs to remain the same
for key, value in new.items():
if key not in keep:
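
The util.py change lets update_json_mcmc match a new MCMC parameter entry to its prior entry by the last path component of either location or locationFrom, where previously only location was read. Below is a standalone sketch of that matching rule; the dictionaries and the CADET-style path are made-up examples, and, as in the original loop, a matched pair is assumed to carry the same key.

```python
# Sketch of the matching rule added in update_json_mcmc (illustrative helper, not a library function).
def same_parameter(new, prior):
    ok_location = "location" in new and new["location"].split("/")[-1] == prior["location"].split("/")[-1]
    ok_location_from = "locationFrom" in new and new["locationFrom"].split("/")[-1] == prior["locationFrom"].split("/")[-1]
    return ok_location or ok_location_from

# Made-up parameter entries that carry locationFrom instead of location
new = {"locationFrom": "/input/model/unit_001/adsorption/SMA_KA"}
prior = {"locationFrom": "/input/model/unit_001/adsorption/SMA_KA"}
print(same_parameter(new, prior))   # True
```
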
CADETMatch/version.py: 2 changes (1 addition, 1 deletion)
@@ -18,5 +18,5 @@
__email__ = "[email protected]"
__license__ = "GNU General Public License v3 (GPLv3)"
__copyright__ = "2020 %s" % __author__
__version__ = "0.8.8"
__version__ = "0.8.9"
__uri__ = "https://github.com/modsim/CADET-Match"
