From f2f968a30ff17c98892ef4e61c9b6e069cf27424 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 16 Jul 2023 20:53:30 -0700 Subject: [PATCH 01/57] Added frequent directions module --- btx/processing/freqdir.py | 434 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 434 insertions(+) create mode 100644 btx/processing/freqdir.py diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py new file mode 100644 index 000000000..10e162a67 --- /dev/null +++ b/btx/processing/freqdir.py @@ -0,0 +1,434 @@ +import os, csv, argparse + +import numpy as np +from mpi4py import MPI + +from matplotlib import pyplot as plt +from matplotlib import colors + +from btx.misc.shortcuts import TaskTimer + +from btx.interfaces.ipsana import ( + PsanaInterface, + bin_data, + bin_pixel_index_map, + retrieve_pixel_index_map, + assemble_image_stack_batch, +) + +########################################### +#John Imports +from numpy import zeros, sqrt, dot, diag +from numpy.linalg import svd, LinAlgError +from scipy.linalg import svd as scipy_svd +import numpy as np + +import time + +from datetime import datetime +currRun = datetime.now().strftime("%y%m%d%H%M%S") + +import cProfile, sys + +############################################# + +class FreqDir: + + """Frequent Directions.""" + + def __init__( + self, + john_start, + tot_imgs, + ell, + alpha, + exp, + run, + det_type, + batch_size=10, + downsample=False, + bin_factor=2, + output_dir="", + ): + + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = john_start + tot_imgs*self.rank//self.size + + self.downsample = downsample + self.bin_factor = bin_factor + self.output_dir = output_dir + + ( + self.num_images, + _, + self.batch_size, + self.num_features, + ) = self.set_params(tot_imgs, ell, batch_size, bin_factor) + + self.task_durations = dict({}) + + self.num_incorporated_images = 0 + self.outliers, self.pc_data = [], [] + + self.d = self.num_features + self.ell = ell + self.m = 2*self.ell + self.sketch = zeros( (self.m, self.d) ) + self.nextZeroRow = 0 + self.alpha = alpha + + self.dataseen = [] + + self.noImgsToProcess = tot_imgs//self.size + + print("MY RANK IS: {}".format(self.rank)) + + if self.rank==0: + self.totPSI = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.totPSI.counter = john_start + + def get_params(self): + """ + Method to retrieve iPCA params. + + Returns + ------- + num_incorporated_images : int + number of images used to build model + num_components : int + number of components maintained in model + batch_size : int + batch size used in model updates + num_features : int + dimensionality of incorporated images + """ + return ( + self.num_incorporated_images, + self.ell, + self.batch_size, + self.num_features, + ) + + def set_params(self, num_images, num_components, batch_size, bin_factor): + """ + Method to initialize iPCA parameters. + + Parameters + ---------- + num_images : int + Desired number of images to incorporate into model. + num_components : int + Desired number of components for model to maintain. + batch_size : int + Desired size of image block to be incorporated into model at each update. + bin_factor : int + Factor to bin data by. + + Returns + ------- + num_images : int + Number of images to incorporate into model. + num_components : int + Number of components for model to maintain. + batch_size : int + Size of image block to be incorporated into model at each update. 
+ num_features : int + Number of features (dimension) in each image. + """ + max_events = self.psi.max_events + downsample = self.downsample + + num_images = min(num_images, max_events) if num_images != -1 else max_events + num_components = min(num_components, num_images) + batch_size = min(batch_size, num_images) + + # set d + det_shape = self.psi.det.shape() + num_features = np.prod(det_shape).astype(int) + + if downsample: + if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: + print("Invalid bin factor, toggled off downsampling.") + self.downsample = False + else: + num_features = int(num_features / bin_factor**2) + + return num_images, num_components, batch_size, num_features + + def run(self): + """ + Perform iPCA on run subject to initialization parameters. + """ + # update model with remaining batches + + for batch in range(0,self.noImgsToProcess,self.batch_size): + self.fetch_and_update_model(self.batch_size) + + + def get_formatted_images(self, n): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. + + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + + bin_factor = self.bin_factor + downsample = self.downsample + + # may have to rewrite eventually when number of images becomes large, + # i.e. streamed setting, either that or downsample aggressively + imgs = self.psi.get_images(n, assemble=False) + + if downsample: + imgs = bin_data(imgs, bin_factor) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + + num_valid_imgs, p, x, y = imgs.shape + formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + + return formatted_imgs + + def fetch_and_update_model(self, n): + """ + Fetch images and update model. 
+ + Parameters + ---------- + n : int + number of images to incorporate + """ + + img_batch = self.get_formatted_images(n) + + + self.john_update_model(img_batch) + + + def john_update_model(self, X): + """ + Update matrix sketch with new batch of observations + """ + +# pr = cProfile.Profile() +# pr.enable() + + _, numIncorp = X.shape + n = self.num_incorporated_images + q = self.ell + + with TaskTimer(self.task_durations, "total update"): + + if self.rank == 0: + print( + "Factoring {m} sample{s} into {n} sample, {q} component model...".format( + m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q + ) + ) + for row in X.T: + if self.nextZeroRow >= self.m: + self.john_rotate() + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + self.num_incorporated_images += 1 + if self.rank==0: + self.dataseen.append(row) + +# if self.rank==0: +# print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') + +# pr.disable() +# # Dump results: +# # - for binary dump +# pr.dump_stats('logging/{0}_rank_{1}.prof'.format(currRun, self.rank)) +# # - for text dump +# with open( 'logging/{0}_rank_{1}.txt'.format(currRun, self.rank), 'w') as output_file: +# sys.stdout = output_file +# pr.print_stats( sort='time' ) +# sys.stdout = sys.__stdout__ + + + def john_rotate(self): + try: + [_,s,Vt] = svd(self.sketch , full_matrices=False) + except LinAlgError as err: + [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) + + if len(s) >= self.ell: + sCopy = s.copy() + + sShrunk = s[:self.ell]**2 - s[self.ell-1]**2 + #John: Explicitly set this value to be 0, since sometimes it is negative + # or even turns to NaN due to roundoff error + sShrunk[-1] = 0 + sShrunk = sqrt(sShrunk) + + sShrunk[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] + + self.sketch[:self.ell:,:] = dot(diag(sShrunk), Vt[:self.ell,:]) + self.sketch[self.ell:,:] = 0 + self.nextZeroRow = self.ell + else: + self.sketch[:len(s),:] = dot(diag(s), Vt[:len(s),:]) + self.sketch[len(s):,:] = 0 + self.nextZeroRow = len(s) + + def john_reconstructionError(self): + matrixCentered = np.array(self.dataseen) + matSketch = self.sketch + k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT) + G = U[:,:k] + UA, SA, VtA = np.linalg.svd(matrixCenteredT) + UAk = UA[:,:k] + SAk = np.diag(SA[:k]) + VtAk = VtA[:k] + Ak = UAk @ SAk @ VtAk + return (np.linalg.norm( + matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( + (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) + + def lowMemoryReconstructionErrorUnscaled(self): + matrixCentered = np.array(self.dataseen) + matSketch = self.sketch + k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT) + G = U[:,:k] + return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) + + def estimFrobNormSquared(addMe, arrs, its): + no_rows = arrs[-1].shape[1] + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + sumMe = 0 + for j in range(its): + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + v_addMe = addMe @ v_hat + for arr in arrs[::-1]: + v_hat = arr @ v_hat + sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 + return sumMe/its*no_rows + + + def gatherFreqDirs(self): + sendbuf = self.sketch[:self.ell,:] + recvbuf = None + if self.rank == 0: + recvbuf = np.empty( + [self.size, self.ell, self.d], dtype=np.float64) + self.comm.Gather(sendbuf, recvbuf, root=0) + if self.rank==0: + origMatSketch = self.sketch.copy() + for j in 
range(1, self.size): + for row in recvbuf[j]: + if(np.any(row)): + if self.nextZeroRow >= self.m: + self.john_rotate() + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + toReturn = self.sketch.copy() + self.sketch = origMatSketch + print(toReturn) + return toReturn + else: + return + +def parse_input(): + """ + Parse command line input. + """ + + parser = argparse.ArgumentParser() + parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) + parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) + parser.add_argument( + "-d", + "--det_type", + help="Detector name, e.g epix10k2M or jungfrau4M.", + required=True, + type=str, + ) + parser.add_argument( + "--start_offset", + help="Run index of first image to be incorporated into iPCA model.", + required=False, + type=int, + ) + parser.add_argument( + "--num_components", + help="Number of principal components to compute and maintain.", + required=False, + type=int, + ) + parser.add_argument( + "--batch_size", + help="Size of image batch incorporated in each model update.", + required=False, + type=int, + ) + parser.add_argument( + "--num_images", + help="Total number of images to be incorporated into model.", + required=False, + type=int, + ) + parser.add_argument( + "--output_dir", + help="Path to output directory for recording task duration data.", + required=False, + type=str, + ) + parser.add_argument( + "--priming", + help="Initialize model with PCA.", + required=False, + action="store_true", + ) + parser.add_argument( + "--downsample", + help="Enable downsampling of images.", + required=False, + action="store_true", + ) + parser.add_argument( + "--bin_factor", + help="Bin factor if using downsizing.", + required=False, + type=int, + ) + + return parser.parse_args() + + +if __name__ == "__main__": + + params = parse_input() + kwargs = {k: v for k, v in vars(params).items() if v is not None} + + pipca = PiPCA(**kwargs) + pipca.run() + pipca.get_outliers() + From 7a589c3760b7ba8d28df557730c5f26701ea210f Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 18 Jul 2023 10:31:31 -0700 Subject: [PATCH 02/57] Checkpoint. Not sure what has been changed, but parallel FD should be working other than segfault high core issue. 
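
Editor's note: patch 01 above and this patch introduce the same FreqDir class (in freqdir.py and rankAd_freqdir.py). Its core is the Frequent Directions update in john_update_model() and john_rotate(): each image row is appended to a 2*ell x d buffer, and whenever the buffer fills, an SVD-based shrinkage compresses it back to ell non-zero rows. The following is a minimal, self-contained sketch of that loop for reference; the function names, the handling of alpha, and the random test data are illustrative assumptions, not part of the btx module.

    import numpy as np
    from scipy.linalg import svd as scipy_svd

    def fd_rotate(sketch, ell, alpha):
        """Compress a full (2*ell, d) sketch back down to ell non-zero rows."""
        _, s, Vt = scipy_svd(sketch, full_matrices=False)
        if len(s) < ell:
            sketch[:len(s), :] = np.diag(s) @ Vt[:len(s), :]
            sketch[len(s):, :] = 0.0
            return len(s)
        shrunk = s[:ell] ** 2 - s[ell - 1] ** 2
        shrunk[-1] = 0.0                       # guard against round-off going negative
        shrunk = np.sqrt(shrunk)
        keep = int(ell * (1 - alpha))          # leave the top directions unshrunk
        shrunk[:keep] = s[:keep]
        sketch[:ell, :] = np.diag(shrunk) @ Vt[:ell, :]
        sketch[ell:, :] = 0.0
        return ell                             # index of the next writable row

    def fd_append(sketch, next_zero_row, row, ell, alpha=0.0):
        """Insert one observation, rotating first if the buffer is full."""
        if next_zero_row >= sketch.shape[0]:
            next_zero_row = fd_rotate(sketch, ell, alpha)
        sketch[next_zero_row, :] = row
        return next_zero_row + 1

    # usage: stream 1000 random 64-dimensional observations through an ell=16 sketch
    ell, d = 16, 64
    sketch, nzr = np.zeros((2 * ell, d)), 0
    for row in np.random.default_rng(0).normal(size=(1000, d)):
        nzr = fd_append(sketch, nzr, row, ell, alpha=0.2)

With alpha = 1 every retained direction is shrunk, which is the original Frequent Directions rule; smaller alpha leaves the top (1 - alpha)*ell singular values untouched, matching the parameterized variant from Ghashami, Desai, and Phillips (ESA 2014) that is cited later in this series.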
--- btx/processing/rankAd_freqdir.py | 434 +++++++++++++++++++++++++++++++ 1 file changed, 434 insertions(+) create mode 100644 btx/processing/rankAd_freqdir.py diff --git a/btx/processing/rankAd_freqdir.py b/btx/processing/rankAd_freqdir.py new file mode 100644 index 000000000..10e162a67 --- /dev/null +++ b/btx/processing/rankAd_freqdir.py @@ -0,0 +1,434 @@ +import os, csv, argparse + +import numpy as np +from mpi4py import MPI + +from matplotlib import pyplot as plt +from matplotlib import colors + +from btx.misc.shortcuts import TaskTimer + +from btx.interfaces.ipsana import ( + PsanaInterface, + bin_data, + bin_pixel_index_map, + retrieve_pixel_index_map, + assemble_image_stack_batch, +) + +########################################### +#John Imports +from numpy import zeros, sqrt, dot, diag +from numpy.linalg import svd, LinAlgError +from scipy.linalg import svd as scipy_svd +import numpy as np + +import time + +from datetime import datetime +currRun = datetime.now().strftime("%y%m%d%H%M%S") + +import cProfile, sys + +############################################# + +class FreqDir: + + """Frequent Directions.""" + + def __init__( + self, + john_start, + tot_imgs, + ell, + alpha, + exp, + run, + det_type, + batch_size=10, + downsample=False, + bin_factor=2, + output_dir="", + ): + + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = john_start + tot_imgs*self.rank//self.size + + self.downsample = downsample + self.bin_factor = bin_factor + self.output_dir = output_dir + + ( + self.num_images, + _, + self.batch_size, + self.num_features, + ) = self.set_params(tot_imgs, ell, batch_size, bin_factor) + + self.task_durations = dict({}) + + self.num_incorporated_images = 0 + self.outliers, self.pc_data = [], [] + + self.d = self.num_features + self.ell = ell + self.m = 2*self.ell + self.sketch = zeros( (self.m, self.d) ) + self.nextZeroRow = 0 + self.alpha = alpha + + self.dataseen = [] + + self.noImgsToProcess = tot_imgs//self.size + + print("MY RANK IS: {}".format(self.rank)) + + if self.rank==0: + self.totPSI = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.totPSI.counter = john_start + + def get_params(self): + """ + Method to retrieve iPCA params. + + Returns + ------- + num_incorporated_images : int + number of images used to build model + num_components : int + number of components maintained in model + batch_size : int + batch size used in model updates + num_features : int + dimensionality of incorporated images + """ + return ( + self.num_incorporated_images, + self.ell, + self.batch_size, + self.num_features, + ) + + def set_params(self, num_images, num_components, batch_size, bin_factor): + """ + Method to initialize iPCA parameters. + + Parameters + ---------- + num_images : int + Desired number of images to incorporate into model. + num_components : int + Desired number of components for model to maintain. + batch_size : int + Desired size of image block to be incorporated into model at each update. + bin_factor : int + Factor to bin data by. + + Returns + ------- + num_images : int + Number of images to incorporate into model. + num_components : int + Number of components for model to maintain. + batch_size : int + Size of image block to be incorporated into model at each update. + num_features : int + Number of features (dimension) in each image. 
+ """ + max_events = self.psi.max_events + downsample = self.downsample + + num_images = min(num_images, max_events) if num_images != -1 else max_events + num_components = min(num_components, num_images) + batch_size = min(batch_size, num_images) + + # set d + det_shape = self.psi.det.shape() + num_features = np.prod(det_shape).astype(int) + + if downsample: + if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: + print("Invalid bin factor, toggled off downsampling.") + self.downsample = False + else: + num_features = int(num_features / bin_factor**2) + + return num_images, num_components, batch_size, num_features + + def run(self): + """ + Perform iPCA on run subject to initialization parameters. + """ + # update model with remaining batches + + for batch in range(0,self.noImgsToProcess,self.batch_size): + self.fetch_and_update_model(self.batch_size) + + + def get_formatted_images(self, n): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. + + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + + bin_factor = self.bin_factor + downsample = self.downsample + + # may have to rewrite eventually when number of images becomes large, + # i.e. streamed setting, either that or downsample aggressively + imgs = self.psi.get_images(n, assemble=False) + + if downsample: + imgs = bin_data(imgs, bin_factor) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + + num_valid_imgs, p, x, y = imgs.shape + formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + + return formatted_imgs + + def fetch_and_update_model(self, n): + """ + Fetch images and update model. 
+ + Parameters + ---------- + n : int + number of images to incorporate + """ + + img_batch = self.get_formatted_images(n) + + + self.john_update_model(img_batch) + + + def john_update_model(self, X): + """ + Update matrix sketch with new batch of observations + """ + +# pr = cProfile.Profile() +# pr.enable() + + _, numIncorp = X.shape + n = self.num_incorporated_images + q = self.ell + + with TaskTimer(self.task_durations, "total update"): + + if self.rank == 0: + print( + "Factoring {m} sample{s} into {n} sample, {q} component model...".format( + m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q + ) + ) + for row in X.T: + if self.nextZeroRow >= self.m: + self.john_rotate() + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + self.num_incorporated_images += 1 + if self.rank==0: + self.dataseen.append(row) + +# if self.rank==0: +# print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') + +# pr.disable() +# # Dump results: +# # - for binary dump +# pr.dump_stats('logging/{0}_rank_{1}.prof'.format(currRun, self.rank)) +# # - for text dump +# with open( 'logging/{0}_rank_{1}.txt'.format(currRun, self.rank), 'w') as output_file: +# sys.stdout = output_file +# pr.print_stats( sort='time' ) +# sys.stdout = sys.__stdout__ + + + def john_rotate(self): + try: + [_,s,Vt] = svd(self.sketch , full_matrices=False) + except LinAlgError as err: + [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) + + if len(s) >= self.ell: + sCopy = s.copy() + + sShrunk = s[:self.ell]**2 - s[self.ell-1]**2 + #John: Explicitly set this value to be 0, since sometimes it is negative + # or even turns to NaN due to roundoff error + sShrunk[-1] = 0 + sShrunk = sqrt(sShrunk) + + sShrunk[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] + + self.sketch[:self.ell:,:] = dot(diag(sShrunk), Vt[:self.ell,:]) + self.sketch[self.ell:,:] = 0 + self.nextZeroRow = self.ell + else: + self.sketch[:len(s),:] = dot(diag(s), Vt[:len(s),:]) + self.sketch[len(s):,:] = 0 + self.nextZeroRow = len(s) + + def john_reconstructionError(self): + matrixCentered = np.array(self.dataseen) + matSketch = self.sketch + k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT) + G = U[:,:k] + UA, SA, VtA = np.linalg.svd(matrixCenteredT) + UAk = UA[:,:k] + SAk = np.diag(SA[:k]) + VtAk = VtA[:k] + Ak = UAk @ SAk @ VtAk + return (np.linalg.norm( + matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( + (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) + + def lowMemoryReconstructionErrorUnscaled(self): + matrixCentered = np.array(self.dataseen) + matSketch = self.sketch + k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT) + G = U[:,:k] + return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) + + def estimFrobNormSquared(addMe, arrs, its): + no_rows = arrs[-1].shape[1] + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + sumMe = 0 + for j in range(its): + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + v_addMe = addMe @ v_hat + for arr in arrs[::-1]: + v_hat = arr @ v_hat + sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 + return sumMe/its*no_rows + + + def gatherFreqDirs(self): + sendbuf = self.sketch[:self.ell,:] + recvbuf = None + if self.rank == 0: + recvbuf = np.empty( + [self.size, self.ell, self.d], dtype=np.float64) + self.comm.Gather(sendbuf, recvbuf, root=0) + if self.rank==0: + origMatSketch = self.sketch.copy() + for j in 
range(1, self.size): + for row in recvbuf[j]: + if(np.any(row)): + if self.nextZeroRow >= self.m: + self.john_rotate() + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + toReturn = self.sketch.copy() + self.sketch = origMatSketch + print(toReturn) + return toReturn + else: + return + +def parse_input(): + """ + Parse command line input. + """ + + parser = argparse.ArgumentParser() + parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) + parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) + parser.add_argument( + "-d", + "--det_type", + help="Detector name, e.g epix10k2M or jungfrau4M.", + required=True, + type=str, + ) + parser.add_argument( + "--start_offset", + help="Run index of first image to be incorporated into iPCA model.", + required=False, + type=int, + ) + parser.add_argument( + "--num_components", + help="Number of principal components to compute and maintain.", + required=False, + type=int, + ) + parser.add_argument( + "--batch_size", + help="Size of image batch incorporated in each model update.", + required=False, + type=int, + ) + parser.add_argument( + "--num_images", + help="Total number of images to be incorporated into model.", + required=False, + type=int, + ) + parser.add_argument( + "--output_dir", + help="Path to output directory for recording task duration data.", + required=False, + type=str, + ) + parser.add_argument( + "--priming", + help="Initialize model with PCA.", + required=False, + action="store_true", + ) + parser.add_argument( + "--downsample", + help="Enable downsampling of images.", + required=False, + action="store_true", + ) + parser.add_argument( + "--bin_factor", + help="Bin factor if using downsizing.", + required=False, + type=int, + ) + + return parser.parse_args() + + +if __name__ == "__main__": + + params = parse_input() + kwargs = {k: v for k, v in vars(params).items() if v is not None} + + pipca = PiPCA(**kwargs) + pipca.run() + pipca.get_outliers() + From 0a1df812cb8efdf29ab2308fff8d0e05a17d85e2 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Wed, 19 Jul 2023 13:44:05 -0700 Subject: [PATCH 03/57] Updated freqdir --- btx/processing/freqdir.py | 183 +++++++++++++++++++++----------------- 1 file changed, 101 insertions(+), 82 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 10e162a67..7561daf0f 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -28,13 +28,11 @@ from datetime import datetime currRun = datetime.now().strftime("%y%m%d%H%M%S") -import cProfile, sys - ############################################# class FreqDir: - """Frequent Directions.""" + """Parallel Frequent Directions.""" def __init__( self, @@ -45,7 +43,6 @@ def __init__( exp, run, det_type, - batch_size=10, downsample=False, bin_factor=2, output_dir="", @@ -65,14 +62,12 @@ def __init__( ( self.num_images, _, - self.batch_size, self.num_features, - ) = self.set_params(tot_imgs, ell, batch_size, bin_factor) + ) = self.set_params(tot_imgs, ell, bin_factor) self.task_durations = dict({}) self.num_incorporated_images = 0 - self.outliers, self.pc_data = [], [] self.d = self.num_features self.ell = ell @@ -81,41 +76,11 @@ def __init__( self.nextZeroRow = 0 self.alpha = alpha - self.dataseen = [] - self.noImgsToProcess = tot_imgs//self.size - print("MY RANK IS: {}".format(self.rank)) - - if self.rank==0: - self.totPSI = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.totPSI.counter = john_start - - def get_params(self): - """ - Method to 
retrieve iPCA params. - - Returns - ------- - num_incorporated_images : int - number of images used to build model - num_components : int - number of components maintained in model - batch_size : int - batch size used in model updates - num_features : int - dimensionality of incorporated images - """ - return ( - self.num_incorporated_images, - self.ell, - self.batch_size, - self.num_features, - ) - - def set_params(self, num_images, num_components, batch_size, bin_factor): + def set_params(self, num_images, num_components, bin_factor): """ - Method to initialize iPCA parameters. + Method to initialize FreqDir parameters. Parameters ---------- @@ -123,8 +88,6 @@ def set_params(self, num_images, num_components, batch_size, bin_factor): Desired number of images to incorporate into model. num_components : int Desired number of components for model to maintain. - batch_size : int - Desired size of image block to be incorporated into model at each update. bin_factor : int Factor to bin data by. @@ -134,17 +97,15 @@ def set_params(self, num_images, num_components, batch_size, bin_factor): Number of images to incorporate into model. num_components : int Number of components for model to maintain. - batch_size : int - Size of image block to be incorporated into model at each update. num_features : int Number of features (dimension) in each image. """ + max_events = self.psi.max_events downsample = self.downsample num_images = min(num_images, max_events) if num_images != -1 else max_events num_components = min(num_components, num_images) - batch_size = min(batch_size, num_images) # set d det_shape = self.psi.det.shape() @@ -157,18 +118,17 @@ def set_params(self, num_images, num_components, batch_size, bin_factor): else: num_features = int(num_features / bin_factor**2) - return num_images, num_components, batch_size, num_features + return num_images, num_components, num_features def run(self): """ - Perform iPCA on run subject to initialization parameters. + Perform frequent directions matrix sketching + on run subject to initialization parameters. """ - # update model with remaining batches - + for batch in range(0,self.noImgsToProcess,self.batch_size): self.fetch_and_update_model(self.batch_size) - def get_formatted_images(self, n): """ Fetch n - x image segments from run, where x is the number of 'dead' images. 
@@ -226,11 +186,13 @@ def fetch_and_update_model(self, n): def john_update_model(self, X): """ Update matrix sketch with new batch of observations + + Parameters + ---------- + X: ndarray + data to update matrix sketch with """ -# pr = cProfile.Profile() -# pr.enable() - _, numIncorp = X.shape n = self.num_incorporated_images q = self.ell @@ -249,41 +211,44 @@ def john_update_model(self, X): self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 self.num_incorporated_images += 1 - if self.rank==0: - self.dataseen.append(row) # if self.rank==0: # print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') - -# pr.disable() -# # Dump results: -# # - for binary dump -# pr.dump_stats('logging/{0}_rank_{1}.prof'.format(currRun, self.rank)) -# # - for text dump -# with open( 'logging/{0}_rank_{1}.txt'.format(currRun, self.rank), 'w') as output_file: -# sys.stdout = output_file -# pr.print_stats( sort='time' ) -# sys.stdout = sys.__stdout__ - def john_rotate(self): - try: - [_,s,Vt] = svd(self.sketch , full_matrices=False) - except LinAlgError as err: - [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) + """ + Apply Frequent Directions Algorithm to + current matrix sketch and adjoined buffer + + Notes + ----- + Based on [1] and [2]. + + [1] Frequent Directions: Simple and Deterministic Matrix + Sketching Mina Ghashami, Edo Liberty, Jeff M. Phillips, and + David P. Woodruff SIAM Journal on Computing 2016 45:5, 1762-1792 + + [2] Ghashami, M., Desai, A., Phillips, J.M. (2014). Improved + Practical Matrix Sketching with Guarantees. In: Schulz, A.S., + Wagner, D. (eds) Algorithms - ESA 2014. ESA 2014. Lecture Notes + in Computer Science, vol 8737. Springer, Berlin, Heidelberg. + https://doi.org/10.1007/978-3-662-44777-2_39 + """ + + [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) if len(s) >= self.ell: sCopy = s.copy() - sShrunk = s[:self.ell]**2 - s[self.ell-1]**2 + toShrink = s[:self.ell]**2 - s[self.ell-1]**2 #John: Explicitly set this value to be 0, since sometimes it is negative # or even turns to NaN due to roundoff error - sShrunk[-1] = 0 - sShrunk = sqrt(sShrunk) + toShrink[-1] = 0 + toShrink = sqrt(toShrink) - sShrunk[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] + toShrink[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] - self.sketch[:self.ell:,:] = dot(diag(sShrunk), Vt[:self.ell,:]) + self.sketch[:self.ell:,:] = dot(diag(toShrink), Vt[:self.ell,:]) self.sketch[self.ell:,:] = 0 self.nextZeroRow = self.ell else: @@ -291,8 +256,17 @@ def john_rotate(self): self.sketch[len(s):,:] = 0 self.nextZeroRow = len(s) - def john_reconstructionError(self): - matrixCentered = np.array(self.dataseen) + def john_reconstructionError(self, matrixCentered): + """ + Compute the reconstruction error of the matrix sketch + against given data + + Parameters + ---------- + matrixCentered: ndarray + Data to compare matrix sketch to + """ + matSketch = self.sketch k = 10 matrixCenteredT = matrixCentered.T @@ -308,8 +282,18 @@ def john_reconstructionError(self): matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) - def lowMemoryReconstructionErrorUnscaled(self): - matrixCentered = np.array(self.dataseen) + def lowMemoryReconstructionErrorUnscaled(self, matrixCentered): + """ + Compute the low memory reconstruction error of the matrix sketch + against given data. This si the same as john_reconstructionError, + but estimates the norm computation and does not scale by the matrix. 
+ + Parameters + ---------- + matrixCentered: ndarray + Data to compare matrix sketch to + """ + matSketch = self.sketch k = 10 matrixCenteredT = matrixCentered.T @@ -318,7 +302,39 @@ def lowMemoryReconstructionErrorUnscaled(self): G = U[:,:k] return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) - def estimFrobNormSquared(addMe, arrs, its): + def estimFrobNormSquared(self, addMe, arrs, its): + """ + Estimate the Frobenius Norm of product of arrs matrices + plus addME matrix using its iterations. + + Parameters + ---------- + arrs: list of ndarray + Matrices to multiply together + + addMe: ndarray + Matrix to add to others + + its: int + Number of iterations to average over + + Returns + ------- + sumMe/its*no_rows : float + Estimate of frobenius norm of produce + of arrs matrices plus addMe matrix + + Notes + ----- + Frobenius estimation is the expected value of matrix + multiplied by random vector from multivariate normal distribution + based on [1]. + + [1] Norm and Trace Estimation with Random Rank-one Vectors + Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix + Analysis and Applications 2021 42:1, 202-223 + """ + no_rows = arrs[-1].shape[1] v = np.random.normal(size=no_rows) v_hat = v / np.linalg.norm(v) @@ -334,6 +350,12 @@ def estimFrobNormSquared(addMe, arrs, its): def gatherFreqDirs(self): + """ + Gather local matrix sketches to root node and + merge local sketches together. + """ + + self.comm.Barrier() sendbuf = self.sketch[:self.ell,:] recvbuf = None if self.rank == 0: @@ -351,7 +373,6 @@ def gatherFreqDirs(self): self.nextZeroRow += 1 toReturn = self.sketch.copy() self.sketch = origMatSketch - print(toReturn) return toReturn else: return @@ -430,5 +451,3 @@ def parse_input(): pipca = PiPCA(**kwargs) pipca.run() - pipca.get_outliers() - From 851487786911e2ec62ba50de324f71ce3ae9360e Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Wed, 19 Jul 2023 13:46:12 -0700 Subject: [PATCH 04/57] Checkpoint --- btx/processing/rankAd_freqdir.py | 434 ------------------------------- 1 file changed, 434 deletions(-) delete mode 100644 btx/processing/rankAd_freqdir.py diff --git a/btx/processing/rankAd_freqdir.py b/btx/processing/rankAd_freqdir.py deleted file mode 100644 index 10e162a67..000000000 --- a/btx/processing/rankAd_freqdir.py +++ /dev/null @@ -1,434 +0,0 @@ -import os, csv, argparse - -import numpy as np -from mpi4py import MPI - -from matplotlib import pyplot as plt -from matplotlib import colors - -from btx.misc.shortcuts import TaskTimer - -from btx.interfaces.ipsana import ( - PsanaInterface, - bin_data, - bin_pixel_index_map, - retrieve_pixel_index_map, - assemble_image_stack_batch, -) - -########################################### -#John Imports -from numpy import zeros, sqrt, dot, diag -from numpy.linalg import svd, LinAlgError -from scipy.linalg import svd as scipy_svd -import numpy as np - -import time - -from datetime import datetime -currRun = datetime.now().strftime("%y%m%d%H%M%S") - -import cProfile, sys - -############################################# - -class FreqDir: - - """Frequent Directions.""" - - def __init__( - self, - john_start, - tot_imgs, - ell, - alpha, - exp, - run, - det_type, - batch_size=10, - downsample=False, - bin_factor=2, - output_dir="", - ): - - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() - - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = john_start + tot_imgs*self.rank//self.size - - self.downsample = downsample - 
self.bin_factor = bin_factor - self.output_dir = output_dir - - ( - self.num_images, - _, - self.batch_size, - self.num_features, - ) = self.set_params(tot_imgs, ell, batch_size, bin_factor) - - self.task_durations = dict({}) - - self.num_incorporated_images = 0 - self.outliers, self.pc_data = [], [] - - self.d = self.num_features - self.ell = ell - self.m = 2*self.ell - self.sketch = zeros( (self.m, self.d) ) - self.nextZeroRow = 0 - self.alpha = alpha - - self.dataseen = [] - - self.noImgsToProcess = tot_imgs//self.size - - print("MY RANK IS: {}".format(self.rank)) - - if self.rank==0: - self.totPSI = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.totPSI.counter = john_start - - def get_params(self): - """ - Method to retrieve iPCA params. - - Returns - ------- - num_incorporated_images : int - number of images used to build model - num_components : int - number of components maintained in model - batch_size : int - batch size used in model updates - num_features : int - dimensionality of incorporated images - """ - return ( - self.num_incorporated_images, - self.ell, - self.batch_size, - self.num_features, - ) - - def set_params(self, num_images, num_components, batch_size, bin_factor): - """ - Method to initialize iPCA parameters. - - Parameters - ---------- - num_images : int - Desired number of images to incorporate into model. - num_components : int - Desired number of components for model to maintain. - batch_size : int - Desired size of image block to be incorporated into model at each update. - bin_factor : int - Factor to bin data by. - - Returns - ------- - num_images : int - Number of images to incorporate into model. - num_components : int - Number of components for model to maintain. - batch_size : int - Size of image block to be incorporated into model at each update. - num_features : int - Number of features (dimension) in each image. - """ - max_events = self.psi.max_events - downsample = self.downsample - - num_images = min(num_images, max_events) if num_images != -1 else max_events - num_components = min(num_components, num_images) - batch_size = min(batch_size, num_images) - - # set d - det_shape = self.psi.det.shape() - num_features = np.prod(det_shape).astype(int) - - if downsample: - if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: - print("Invalid bin factor, toggled off downsampling.") - self.downsample = False - else: - num_features = int(num_features / bin_factor**2) - - return num_images, num_components, batch_size, num_features - - def run(self): - """ - Perform iPCA on run subject to initialization parameters. - """ - # update model with remaining batches - - for batch in range(0,self.noImgsToProcess,self.batch_size): - self.fetch_and_update_model(self.batch_size) - - - def get_formatted_images(self, n): - """ - Fetch n - x image segments from run, where x is the number of 'dead' images. - - Parameters - ---------- - n : int - number of images to retrieve - start_index : int - start index of subsection of data to retrieve - end_index : int - end index of subsection of data to retrieve - - Returns - ------- - ndarray, shape (end_index-start_index, n-x) - n-x retrieved image segments of dimension end_index-start_index - """ - - bin_factor = self.bin_factor - downsample = self.downsample - - # may have to rewrite eventually when number of images becomes large, - # i.e. 
streamed setting, either that or downsample aggressively - imgs = self.psi.get_images(n, assemble=False) - - if downsample: - imgs = bin_data(imgs, bin_factor) - - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] - - num_valid_imgs, p, x, y = imgs.shape - formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - - return formatted_imgs - - def fetch_and_update_model(self, n): - """ - Fetch images and update model. - - Parameters - ---------- - n : int - number of images to incorporate - """ - - img_batch = self.get_formatted_images(n) - - - self.john_update_model(img_batch) - - - def john_update_model(self, X): - """ - Update matrix sketch with new batch of observations - """ - -# pr = cProfile.Profile() -# pr.enable() - - _, numIncorp = X.shape - n = self.num_incorporated_images - q = self.ell - - with TaskTimer(self.task_durations, "total update"): - - if self.rank == 0: - print( - "Factoring {m} sample{s} into {n} sample, {q} component model...".format( - m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q - ) - ) - for row in X.T: - if self.nextZeroRow >= self.m: - self.john_rotate() - self.sketch[self.nextZeroRow,:] = row - self.nextZeroRow += 1 - self.num_incorporated_images += 1 - if self.rank==0: - self.dataseen.append(row) - -# if self.rank==0: -# print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') - -# pr.disable() -# # Dump results: -# # - for binary dump -# pr.dump_stats('logging/{0}_rank_{1}.prof'.format(currRun, self.rank)) -# # - for text dump -# with open( 'logging/{0}_rank_{1}.txt'.format(currRun, self.rank), 'w') as output_file: -# sys.stdout = output_file -# pr.print_stats( sort='time' ) -# sys.stdout = sys.__stdout__ - - - def john_rotate(self): - try: - [_,s,Vt] = svd(self.sketch , full_matrices=False) - except LinAlgError as err: - [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) - - if len(s) >= self.ell: - sCopy = s.copy() - - sShrunk = s[:self.ell]**2 - s[self.ell-1]**2 - #John: Explicitly set this value to be 0, since sometimes it is negative - # or even turns to NaN due to roundoff error - sShrunk[-1] = 0 - sShrunk = sqrt(sShrunk) - - sShrunk[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] - - self.sketch[:self.ell:,:] = dot(diag(sShrunk), Vt[:self.ell,:]) - self.sketch[self.ell:,:] = 0 - self.nextZeroRow = self.ell - else: - self.sketch[:len(s),:] = dot(diag(s), Vt[:len(s),:]) - self.sketch[len(s):,:] = 0 - self.nextZeroRow = len(s) - - def john_reconstructionError(self): - matrixCentered = np.array(self.dataseen) - matSketch = self.sketch - k = 10 - matrixCenteredT = matrixCentered.T - matSketchT = matSketch.T - U, S, Vt = np.linalg.svd(matSketchT) - G = U[:,:k] - UA, SA, VtA = np.linalg.svd(matrixCenteredT) - UAk = UA[:,:k] - SAk = np.diag(SA[:k]) - VtAk = VtA[:k] - Ak = UAk @ SAk @ VtAk - return (np.linalg.norm( - matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( - (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) - - def lowMemoryReconstructionErrorUnscaled(self): - matrixCentered = np.array(self.dataseen) - matSketch = self.sketch - k = 10 - matrixCenteredT = matrixCentered.T - matSketchT = matSketch.T - U, S, Vt = np.linalg.svd(matSketchT) - G = U[:,:k] - return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) - - def estimFrobNormSquared(addMe, arrs, its): - no_rows = arrs[-1].shape[1] - v = np.random.normal(size=no_rows) - v_hat = v / np.linalg.norm(v) - sumMe = 0 - for j in range(its): - v = np.random.normal(size=no_rows) - v_hat = v / 
np.linalg.norm(v) - v_addMe = addMe @ v_hat - for arr in arrs[::-1]: - v_hat = arr @ v_hat - sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 - return sumMe/its*no_rows - - - def gatherFreqDirs(self): - sendbuf = self.sketch[:self.ell,:] - recvbuf = None - if self.rank == 0: - recvbuf = np.empty( - [self.size, self.ell, self.d], dtype=np.float64) - self.comm.Gather(sendbuf, recvbuf, root=0) - if self.rank==0: - origMatSketch = self.sketch.copy() - for j in range(1, self.size): - for row in recvbuf[j]: - if(np.any(row)): - if self.nextZeroRow >= self.m: - self.john_rotate() - self.sketch[self.nextZeroRow,:] = row - self.nextZeroRow += 1 - toReturn = self.sketch.copy() - self.sketch = origMatSketch - print(toReturn) - return toReturn - else: - return - -def parse_input(): - """ - Parse command line input. - """ - - parser = argparse.ArgumentParser() - parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) - parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) - parser.add_argument( - "-d", - "--det_type", - help="Detector name, e.g epix10k2M or jungfrau4M.", - required=True, - type=str, - ) - parser.add_argument( - "--start_offset", - help="Run index of first image to be incorporated into iPCA model.", - required=False, - type=int, - ) - parser.add_argument( - "--num_components", - help="Number of principal components to compute and maintain.", - required=False, - type=int, - ) - parser.add_argument( - "--batch_size", - help="Size of image batch incorporated in each model update.", - required=False, - type=int, - ) - parser.add_argument( - "--num_images", - help="Total number of images to be incorporated into model.", - required=False, - type=int, - ) - parser.add_argument( - "--output_dir", - help="Path to output directory for recording task duration data.", - required=False, - type=str, - ) - parser.add_argument( - "--priming", - help="Initialize model with PCA.", - required=False, - action="store_true", - ) - parser.add_argument( - "--downsample", - help="Enable downsampling of images.", - required=False, - action="store_true", - ) - parser.add_argument( - "--bin_factor", - help="Bin factor if using downsizing.", - required=False, - type=int, - ) - - return parser.parse_args() - - -if __name__ == "__main__": - - params = parse_input() - kwargs = {k: v for k, v in vars(params).items() if v is not None} - - pipca = PiPCA(**kwargs) - pipca.run() - pipca.get_outliers() - From f11a31f8287e5c6e2cf1126e6d326e63cba65274 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 20 Jul 2023 13:17:18 -0700 Subject: [PATCH 05/57] Fixed gather command. 
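
Editor's note: the patch below reworks gatherFreqDirs() so that each non-root rank sends its ell sketch rows to rank 0 with comm.Send, and rank 0 folds the received rows into its own sketch using the same rotation applied during streaming. Frequent Directions sketches are mergeable, so the result approximates a sketch of the full distributed data set. The snippet below illustrates the merge without MPI; it concatenates the local sketches and applies a single shrinkage step rather than the row-by-row insertion used in the patch, the names are illustrative, and it assumes d >= ell.

    import numpy as np
    from scipy.linalg import svd as scipy_svd

    def merge_local_sketches(local_sketches, ell):
        """Merge per-rank (ell, d) FD sketches into a single (ell, d) sketch."""
        stacked = np.vstack(local_sketches)                # (n_ranks * ell, d)
        _, s, Vt = scipy_svd(stacked, full_matrices=False)
        shrunk = np.sqrt(np.maximum(s[:ell] ** 2 - s[ell - 1] ** 2, 0.0))
        return np.diag(shrunk) @ Vt[:ell, :]

    # usage: two fake "rank-local" sketches, ell = 8 rows each over d = 32 features
    rng = np.random.default_rng(0)
    local_sketches = [rng.normal(size=(8, 32)) for _ in range(2)]
    merged = merge_local_sketches(local_sketches, ell=8)   # shape (8, 32)

Receiving and folding one ell x d buffer at a time, as the patch does, keeps rank 0's peak memory near its fixed 2*ell x d working buffer instead of holding all local sketches at once.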
--- btx/processing/OLDfreqdir.py | 464 ++++++++++++++++++++++++++++++++++ btx/processing/freqdir.py | 100 ++++++-- btx/processing/rankAdaptFD.py | 464 ++++++++++++++++++++++++++++++++++ 3 files changed, 1008 insertions(+), 20 deletions(-) create mode 100644 btx/processing/OLDfreqdir.py create mode 100644 btx/processing/rankAdaptFD.py diff --git a/btx/processing/OLDfreqdir.py b/btx/processing/OLDfreqdir.py new file mode 100644 index 000000000..430ea4d22 --- /dev/null +++ b/btx/processing/OLDfreqdir.py @@ -0,0 +1,464 @@ +import os, csv, argparse + +import numpy as np +from mpi4py import MPI + +from matplotlib import pyplot as plt +from matplotlib import colors + +from btx.misc.shortcuts import TaskTimer + +from btx.interfaces.ipsana import ( + PsanaInterface, + bin_data, + bin_pixel_index_map, + retrieve_pixel_index_map, + assemble_image_stack_batch, +) + +########################################### +#John Imports +from numpy import zeros, sqrt, dot, diag +from numpy.linalg import svd, LinAlgError +from scipy.linalg import svd as scipy_svd +import numpy as np + +import time + +from datetime import datetime +currRun = datetime.now().strftime("%y%m%d%H%M%S") + +############################################# + +class FreqDir: + + """Parallel Frequent Directions.""" + + def __init__( + self, + john_start, + tot_imgs, + ell, + alpha, + exp, + run, + det_type, + downsample=False, + bin_factor=2, + output_dir="", + ): + + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = john_start + tot_imgs*self.rank//self.size + + self.downsample = downsample + self.bin_factor = bin_factor + self.output_dir = output_dir + + ( + self.num_images, + _, + self.num_features, + ) = self.set_params(tot_imgs, ell, bin_factor) + + self.task_durations = dict({}) + + self.num_incorporated_images = 0 + + self.d = self.num_features + self.ell = ell + self.m = 2*self.ell + self.sketch = zeros( (self.m, self.d) ) + self.nextZeroRow = 0 + self.alpha = alpha + + self.noImgsToProcess = tot_imgs//self.size + + def set_params(self, num_images, num_components, bin_factor): + """ + Method to initialize FreqDir parameters. + + Parameters + ---------- + num_images : int + Desired number of images to incorporate into model. + num_components : int + Desired number of components for model to maintain. + bin_factor : int + Factor to bin data by. + + Returns + ------- + num_images : int + Number of images to incorporate into model. + num_components : int + Number of components for model to maintain. + num_features : int + Number of features (dimension) in each image. + """ + + max_events = self.psi.max_events + downsample = self.downsample + + num_images = min(num_images, max_events) if num_images != -1 else max_events + num_components = min(num_components, num_images) + + # set d + det_shape = self.psi.det.shape() + num_features = np.prod(det_shape).astype(int) + + if downsample: + if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: + print("Invalid bin factor, toggled off downsampling.") + self.downsample = False + else: + num_features = int(num_features / bin_factor**2) + + return num_images, num_components, num_features + + def run(self): + """ + Perform frequent directions matrix sketching + on run subject to initialization parameters. 
+ """ + + for batch in range(0,self.noImgsToProcess,self.ell): + self.fetch_and_update_model(self.ell) + + self.comm.Barrier() + + def get_formatted_images(self, n): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. + + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + + bin_factor = self.bin_factor + downsample = self.downsample + + # may have to rewrite eventually when number of images becomes large, + # i.e. streamed setting, either that or downsample aggressively + imgs = self.psi.get_images(n, assemble=False) + + if downsample: + imgs = bin_data(imgs, bin_factor) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + + num_valid_imgs, p, x, y = imgs.shape + formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + + return formatted_imgs + + def fetch_and_update_model(self, n): + """ + Fetch images and update model. + + Parameters + ---------- + n : int + number of images to incorporate + """ + + img_batch = self.get_formatted_images(n) + + + self.john_update_model(img_batch) + + + def john_update_model(self, X): + """ + Update matrix sketch with new batch of observations + + Parameters + ---------- + X: ndarray + data to update matrix sketch with + """ + + _, numIncorp = X.shape + n = self.num_incorporated_images + q = self.ell + + with TaskTimer(self.task_durations, "total update"): + + if self.rank == 0: + print( + "Factoring {m} sample{s} into {n} sample, {q} component model...".format( + m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q + ) + ) + for row in X.T: + if self.nextZeroRow >= self.m: + self.john_rotate() + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + self.num_incorporated_images += 1 +# if self.rank==0: +# print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') + + def john_rotate(self): + """ + Apply Frequent Directions Algorithm to + current matrix sketch and adjoined buffer + + Notes + ----- + Based on [1] and [2]. + + [1] Frequent Directions: Simple and Deterministic Matrix + Sketching Mina Ghashami, Edo Liberty, Jeff M. Phillips, and + David P. Woodruff SIAM Journal on Computing 2016 45:5, 1762-1792 + + [2] Ghashami, M., Desai, A., Phillips, J.M. (2014). Improved + Practical Matrix Sketching with Guarantees. In: Schulz, A.S., + Wagner, D. (eds) Algorithms - ESA 2014. ESA 2014. Lecture Notes + in Computer Science, vol 8737. Springer, Berlin, Heidelberg. 
+ https://doi.org/10.1007/978-3-662-44777-2_39 + """ + + try: + [_,s,Vt] = svd(self.sketch , full_matrices=False) + except LinAlgError as err: + [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) + + if len(s) >= self.ell: + sCopy = s.copy() + + toShrink = s[:self.ell]**2 - s[self.ell-1]**2 + #John: Explicitly set this value to be 0, since sometimes it is negative + # or even turns to NaN due to roundoff error + toShrink[-1] = 0 + toShrink = sqrt(toShrink) + + toShrink[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] + + self.sketch[:self.ell:,:] = dot(diag(toShrink), Vt[:self.ell,:]) + self.sketch[self.ell:,:] = 0 + self.nextZeroRow = self.ell + else: + self.sketch[:len(s),:] = dot(diag(s), Vt[:len(s),:]) + self.sketch[len(s):,:] = 0 + self.nextZeroRow = len(s) + + def john_reconstructionError(self, matrixCentered): + """ + Compute the reconstruction error of the matrix sketch + against given data + + Parameters + ---------- + matrixCentered: ndarray + Data to compare matrix sketch to + """ + + matSketch = self.sketch + k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT) + G = U[:,:k] + UA, SA, VtA = np.linalg.svd(matrixCenteredT) + UAk = UA[:,:k] + SAk = np.diag(SA[:k]) + VtAk = VtA[:k] + Ak = UAk @ SAk @ VtAk + return (np.linalg.norm( + matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( + (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) + + def lowMemoryReconstructionErrorUnscaled(self, matrixCentered): + """ + Compute the low memory reconstruction error of the matrix sketch + against given data. This si the same as john_reconstructionError, + but estimates the norm computation and does not scale by the matrix. + + Parameters + ---------- + matrixCentered: ndarray + Data to compare matrix sketch to + """ + + matSketch = self.sketch + k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT) + G = U[:,:k] + return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) + + def estimFrobNormSquared(self, addMe, arrs, its): + """ + Estimate the Frobenius Norm of product of arrs matrices + plus addME matrix using its iterations. + + Parameters + ---------- + arrs: list of ndarray + Matrices to multiply together + + addMe: ndarray + Matrix to add to others + + its: int + Number of iterations to average over + + Returns + ------- + sumMe/its*no_rows : float + Estimate of frobenius norm of produce + of arrs matrices plus addMe matrix + + Notes + ----- + Frobenius estimation is the expected value of matrix + multiplied by random vector from multivariate normal distribution + based on [1]. + + [1] Norm and Trace Estimation with Random Rank-one Vectors + Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix + Analysis and Applications 2021 42:1, 202-223 + """ + + no_rows = arrs[-1].shape[1] + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + sumMe = 0 + for j in range(its): + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + v_addMe = addMe @ v_hat + for arr in arrs[::-1]: + v_hat = arr @ v_hat + sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 + return sumMe/its*no_rows + + + def gatherFreqDirs(self): + print("STARTING GATHER") + """ + Gather local matrix sketches to root node and + merge local sketches together. 
+ """ + + self.comm.Barrier() + sendbuf = self.sketch[:self.ell,:] + recvbuf = None + if self.rank == 0: + recvbuf = np.empty( + [self.size, self.ell, self.d], dtype=np.double) + self.comm.Gather(sendbuf, recvbuf, root=0) + print("{} FINISHED GATHER".format(self.rank)) + if self.rank==0: + origMatSketch = self.sketch.copy() + origNextZeroRow = self.nextZeroRow + self.nextZeroRow = self.ell + print("BUFFER SHAPE: ", recvbuf.shape) + for j in range(1, self.size): + print("CURRENT BUFFER: ", j) + print(recvbuf[j]) + for row in recvbuf[j]: + if(np.any(row)): + if self.nextZeroRow >= self.m: + self.john_rotate() + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + toReturn = self.sketch.copy() + self.sketch = origMatSketch + return toReturn + else: + return + +def parse_input(): + """ + Parse command line input. + """ + + parser = argparse.ArgumentParser() + parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) + parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) + parser.add_argument( + "-d", + "--det_type", + help="Detector name, e.g epix10k2M or jungfrau4M.", + required=True, + type=str, + ) + parser.add_argument( + "--start_offset", + help="Run index of first image to be incorporated into iPCA model.", + required=False, + type=int, + ) + parser.add_argument( + "--num_components", + help="Number of principal components to compute and maintain.", + required=False, + type=int, + ) + parser.add_argument( + "--batch_size", + help="Size of image batch incorporated in each model update.", + required=False, + type=int, + ) + parser.add_argument( + "--num_images", + help="Total number of images to be incorporated into model.", + required=False, + type=int, + ) + parser.add_argument( + "--output_dir", + help="Path to output directory for recording task duration data.", + required=False, + type=str, + ) + parser.add_argument( + "--priming", + help="Initialize model with PCA.", + required=False, + action="store_true", + ) + parser.add_argument( + "--downsample", + help="Enable downsampling of images.", + required=False, + action="store_true", + ) + parser.add_argument( + "--bin_factor", + help="Bin factor if using downsizing.", + required=False, + type=int, + ) + + return parser.parse_args() + + +if __name__ == "__main__": + + params = parse_input() + kwargs = {k: v for k, v in vars(params).items() if v is not None} + + pipca = PiPCA(**kwargs) + pipca.run() diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 7561daf0f..8d0f93ef9 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -126,8 +126,10 @@ def run(self): on run subject to initialization parameters. """ - for batch in range(0,self.noImgsToProcess,self.batch_size): - self.fetch_and_update_model(self.batch_size) + for batch in range(0,self.noImgsToProcess,self.ell): + self.fetch_and_update_model(self.ell) + + self.comm.Barrier() def get_formatted_images(self, n): """ @@ -211,14 +213,13 @@ def john_update_model(self, X): self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 self.num_incorporated_images += 1 - # if self.rank==0: # print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') def john_rotate(self): - """ - Apply Frequent Directions Algorithm to - current matrix sketch and adjoined buffer + """ + Apply Frequent Directions Algorithm to + current matrix sketch and adjoined buffer Notes ----- @@ -233,9 +234,12 @@ def john_rotate(self): Wagner, D. (eds) Algorithms - ESA 2014. ESA 2014. 
Lecture Notes in Computer Science, vol 8737. Springer, Berlin, Heidelberg. https://doi.org/10.1007/978-3-662-44777-2_39 - """ + """ - [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) + try: + [_,s,Vt] = svd(self.sketch , full_matrices=False) + except LinAlgError as err: + [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) if len(s) >= self.ell: sCopy = s.copy() @@ -350,32 +354,88 @@ def estimFrobNormSquared(self, addMe, arrs, its): def gatherFreqDirs(self): + print("STARTING GATHER") """ Gather local matrix sketches to root node and merge local sketches together. """ - - self.comm.Barrier() - sendbuf = self.sketch[:self.ell,:] - recvbuf = None +# self.sketch = np.random.rand(self.sketch.shape[0], self.sketch.shape[1]) + sendbuf = self.ell + buffSizes = np.array(self.comm.gather(sendbuf, root=0)) if self.rank == 0: - recvbuf = np.empty( - [self.size, self.ell, self.d], dtype=np.float64) - self.comm.Gather(sendbuf, recvbuf, root=0) + print("BUFF SIZES: ", buffSizes) +# data = [np.array((), dtype=np.double) for _ in range(self.size)] +# data[self.rank] = self.sketch[:self.ell, :].copy() +# if self.rank == 0: +# sizes_memory = (self.d)*buffSizes +# offsets = np.zeros(self.size) +# offsets[1:] = np.cumsum(sizes_memory)[:-1] +# +# data_out = None +# recvbuf = None +# if self.rank == 0: +# # data_out = np.empty((np.sum(buffSizes), fd.d), dtype=np.float32) +# data_out = np.empty((np.sum(buffSizes), self.d)) +# recvbuf=[data_out, sizes_memory.tolist(), offsets.tolist(), MPI.DOUBLE] +# +# self.comm.Barrier() +# self.comm.Gatherv(data[self.rank],recvbuf = recvbuf, root=0) +# self.comm.Barrier() +# print("{} FINISHED GATHERV".format(self.rank)) + if self.rank==0: origMatSketch = self.sketch.copy() - for j in range(1, self.size): - for row in recvbuf[j]: + origNextZeroRow = self.nextZeroRow + self.nextZeroRow = self.ell + counter = 0 + for proc in range(1, self.size): + bufferMe = np.empty(self.ell*self.d, dtype=np.double) + self.comm.Recv(bufferMe, source=proc, tag=13) + bufferMe = np.reshape(bufferMe, (self.ell, self.d)) + for row in bufferMe: if(np.any(row)): if self.nextZeroRow >= self.m: self.john_rotate() - self.sketch[self.nextZeroRow,:] = row - self.nextZeroRow += 1 + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + counter += 1 + print("DATA PROCESSED: {}".format(counter)) toReturn = self.sketch.copy() + print("COMPLETED MERGE PROCESS: ", toReturn) self.sketch = origMatSketch return toReturn else: - return + bufferMe = self.sketch[:self.ell, :].copy().flatten() + self.comm.Send(bufferMe, dest=0, tag=13) + return + +# self.comm.Barrier() +# sendbuf = self.sketch[:self.ell,:] +# recvbuf = None +# if self.rank == 0: +# recvbuf = np.empty( +# [self.size, self.ell, self.d], dtype=np.float32) +# self.comm.Gather(sendbuf, recvbuf, root=0) +# print("{} FINISHED GATHER".format(self.rank)) +# if self.rank==0: +# origMatSketch = self.sketch.copy() +# origNextZeroRow = self.nextZeroRow +# self.nextZeroRow = self.ell +# print("BUFFER SHAPE: ", recvbuf.shape) +# for j in range(1, self.size): +# print("CURRENT BUFFER: ", j) +# print(recvbuf[j]) +# for row in recvbuf[j]: +# if(np.any(row)): +# if self.nextZeroRow >= self.m: +# self.john_rotate() +# self.sketch[self.nextZeroRow,:] = row +# self.nextZeroRow += 1 +# toReturn = self.sketch.copy() +# self.sketch = origMatSketch +# return toReturn +# else: +# return def parse_input(): """ diff --git a/btx/processing/rankAdaptFD.py b/btx/processing/rankAdaptFD.py new file mode 100644 index 000000000..16575469d --- /dev/null +++ 
b/btx/processing/rankAdaptFD.py @@ -0,0 +1,464 @@ +import os, csv, argparse + +import numpy as np +from mpi4py import MPI + +from matplotlib import pyplot as plt +from matplotlib import colors + +from btx.misc.shortcuts import TaskTimer + +from btx.interfaces.ipsana import ( + PsanaInterface, + bin_data, + bin_pixel_index_map, + retrieve_pixel_index_map, + assemble_image_stack_batch, +) + +########################################### +#John Imports +from numpy import zeros, sqrt, dot, diag +from numpy.linalg import svd, LinAlgError +from scipy.linalg import svd as scipy_svd +import numpy as np + +import time + +from datetime import datetime +currRun = datetime.now().strftime("%y%m%d%H%M%S") + +############################################# + +class FreqDir: + + """Parallel Frequent Directions.""" + + def __init__( + self, + john_start, + tot_imgs, + ell, + alpha, + exp, + run, + det_type, + downsample=False, + bin_factor=2, + output_dir="", + ): + + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = john_start + tot_imgs*self.rank//self.size + + self.downsample = downsample + self.bin_factor = bin_factor + self.output_dir = output_dir + + ( + self.num_images, + _, + self.num_features, + ) = self.set_params(tot_imgs, ell, bin_factor) + + self.task_durations = dict({}) + + self.num_incorporated_images = 0 + + self.d = self.num_features + self.ell = ell + self.m = 2*self.ell + self.sketch = zeros( (self.m, self.d) ) + self.nextZeroRow = 0 + self.alpha = alpha + + self.noImgsToProcess = tot_imgs//self.size + + def set_params(self, num_images, num_components, bin_factor): + """ + Method to initialize FreqDir parameters. + + Parameters + ---------- + num_images : int + Desired number of images to incorporate into model. + num_components : int + Desired number of components for model to maintain. + bin_factor : int + Factor to bin data by. + + Returns + ------- + num_images : int + Number of images to incorporate into model. + num_components : int + Number of components for model to maintain. + num_features : int + Number of features (dimension) in each image. + """ + + max_events = self.psi.max_events + downsample = self.downsample + + num_images = min(num_images, max_events) if num_images != -1 else max_events + num_components = min(num_components, num_images) + + # set d + det_shape = self.psi.det.shape() + num_features = np.prod(det_shape).astype(int) + + if downsample: + if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: + print("Invalid bin factor, toggled off downsampling.") + self.downsample = False + else: + num_features = int(num_features / bin_factor**2) + + return num_images, num_components, num_features + + def run(self): + """ + Perform frequent directions matrix sketching + on run subject to initialization parameters. + """ + + for batch in range(0,self.noImgsToProcess,self.ell): + self.fetch_and_update_model(self.ell) + + self.comm.Barrier() + + def get_formatted_images(self, n): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. 
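# A quick standalone illustration (not part of the patch) of the per-rank work
# split set up in __init__ above: each of `size` ranks starts its PsanaInterface
# counter at john_start + tot_imgs*rank//size and processes tot_imgs//size
# images, so consecutive ranks cover disjoint, nearly equal slices of the run.
# The names john_start/tot_imgs mirror the constructor arguments; everything
# else below is illustrative only.

def partition_run(john_start, tot_imgs, size):
    """Return (start, count) per rank for a run of tot_imgs images."""
    slices = []
    for rank in range(size):
        start = john_start + tot_imgs * rank // size
        count = tot_imgs // size
        slices.append((start, count))
    return slices

# Example: 1000 images split over 4 ranks, starting at image 50
# -> [(50, 250), (300, 250), (550, 250), (800, 250)]
print(partition_run(50, 1000, 4))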
+ + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + + bin_factor = self.bin_factor + downsample = self.downsample + + # may have to rewrite eventually when number of images becomes large, + # i.e. streamed setting, either that or downsample aggressively + imgs = self.psi.get_images(n, assemble=False) + + if downsample: + imgs = bin_data(imgs, bin_factor) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + + num_valid_imgs, p, x, y = imgs.shape + formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + + return formatted_imgs + + def fetch_and_update_model(self, n): + """ + Fetch images and update model. + + Parameters + ---------- + n : int + number of images to incorporate + """ + + img_batch = self.get_formatted_images(n) + + + self.john_update_model(img_batch) + + + def john_update_model(self, X): + """ + Update matrix sketch with new batch of observations + + Parameters + ---------- + X: ndarray + data to update matrix sketch with + """ + + _, numIncorp = X.shape + n = self.num_incorporated_images + q = self.ell + + with TaskTimer(self.task_durations, "total update"): + + if self.rank == 0: + print( + "Factoring {m} sample{s} into {n} sample, {q} component model...".format( + m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q + ) + ) + for row in X.T: + if self.nextZeroRow >= self.m: + self.john_rotate() + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + self.num_incorporated_images += 1 +# if self.rank==0: +# print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') + + def john_rotate(self): + """ + Apply Frequent Directions Algorithm to + current matrix sketch and adjoined buffer + + Notes + ----- + Based on [1] and [2]. + + [1] Frequent Directions: Simple and Deterministic Matrix + Sketching Mina Ghashami, Edo Liberty, Jeff M. Phillips, and + David P. Woodruff SIAM Journal on Computing 2016 45:5, 1762-1792 + + [2] Ghashami, M., Desai, A., Phillips, J.M. (2014). Improved + Practical Matrix Sketching with Guarantees. In: Schulz, A.S., + Wagner, D. (eds) Algorithms - ESA 2014. ESA 2014. Lecture Notes + in Computer Science, vol 8737. Springer, Berlin, Heidelberg. 
+ https://doi.org/10.1007/978-3-662-44777-2_39 + """ + + try: + [_,s,Vt] = svd(self.sketch , full_matrices=False) + except LinAlgError as err: + [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) + + if len(s) >= self.ell: + sCopy = s.copy() + + toShrink = s[:self.ell]**2 - s[self.ell-1]**2 + #John: Explicitly set this value to be 0, since sometimes it is negative + # or even turns to NaN due to roundoff error + toShrink[-1] = 0 + toShrink = sqrt(toShrink) + + toShrink[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] + + self.sketch[:self.ell:,:] = dot(diag(toShrink), Vt[:self.ell,:]) + self.sketch[self.ell:,:] = 0 + self.nextZeroRow = self.ell + else: + self.sketch[:len(s),:] = dot(diag(s), Vt[:len(s),:]) + self.sketch[len(s):,:] = 0 + self.nextZeroRow = len(s) + + def john_reconstructionError(self, matrixCentered): + """ + Compute the reconstruction error of the matrix sketch + against given data + + Parameters + ---------- + matrixCentered: ndarray + Data to compare matrix sketch to + """ + + matSketch = self.sketch + k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT) + G = U[:,:k] + UA, SA, VtA = np.linalg.svd(matrixCenteredT) + UAk = UA[:,:k] + SAk = np.diag(SA[:k]) + VtAk = VtA[:k] + Ak = UAk @ SAk @ VtAk + return (np.linalg.norm( + matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( + (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) + + def lowMemoryReconstructionErrorUnscaled(self, matrixCentered): + """ + Compute the low memory reconstruction error of the matrix sketch + against given data. This si the same as john_reconstructionError, + but estimates the norm computation and does not scale by the matrix. + + Parameters + ---------- + matrixCentered: ndarray + Data to compare matrix sketch to + """ + + matSketch = self.sketch + k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT) + G = U[:,:k] + return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) + + def estimFrobNormSquared(self, addMe, arrs, its): + """ + Estimate the Frobenius Norm of product of arrs matrices + plus addME matrix using its iterations. + + Parameters + ---------- + arrs: list of ndarray + Matrices to multiply together + + addMe: ndarray + Matrix to add to others + + its: int + Number of iterations to average over + + Returns + ------- + sumMe/its*no_rows : float + Estimate of frobenius norm of produce + of arrs matrices plus addMe matrix + + Notes + ----- + Frobenius estimation is the expected value of matrix + multiplied by random vector from multivariate normal distribution + based on [1]. + + [1] Norm and Trace Estimation with Random Rank-one Vectors + Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix + Analysis and Applications 2021 42:1, 202-223 + """ + + no_rows = arrs[-1].shape[1] + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + sumMe = 0 + for j in range(its): + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + v_addMe = addMe @ v_hat + for arr in arrs[::-1]: + v_hat = arr @ v_hat + sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 + return sumMe/its*no_rows + + + def gatherFreqDirs(self): + print("STARTING GATHER") + """ + Gather local matrix sketches to root node and + merge local sketches together. 
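# A note on estimFrobNormSquared above: it relies on the fact that for a unit
# vector v_hat drawn uniformly from the sphere in R^n, the expectation of
# n * ||M @ v_hat||**2 equals ||M||_F**2, so a handful of random probes gives a
# cheap norm estimate without ever forming M.  The standalone NumPy sketch
# below (illustrative only; none of these names come from the patch) applies
# this to the projection residual that the reconstruction-error methods measure.

import numpy as np

def estim_frob_norm_squared(apply_M, n_cols, its=10, rng=None):
    """Estimate ||M||_F^2 given only a routine apply_M(v) returning M @ v."""
    rng = np.random.default_rng() if rng is None else rng
    total = 0.0
    for _ in range(its):
        v = rng.normal(size=n_cols)
        v_hat = v / np.linalg.norm(v)      # uniform direction on the unit sphere
        total += np.linalg.norm(apply_M(v_hat)) ** 2
    return total / its * n_cols

# Example: estimate ||A - G G^T A||_F^2, applying the product one factor at a
# time (as the class method does) instead of materializing the residual matrix.
rng = np.random.default_rng(0)
A = rng.normal(size=(500, 200))
U, _, _ = np.linalg.svd(A + 0.1 * rng.normal(size=A.shape), full_matrices=False)
G = U[:, :10]                              # top-10 directions of a noisy "sketch"

def apply_residual(v):
    """Apply (A - G G^T A) to a vector v, one factor at a time."""
    av = A @ v
    return av - G @ (G.T @ av)

approx = estim_frob_norm_squared(apply_residual, A.shape[1], its=50, rng=rng)
exact = np.linalg.norm(A - G @ (G.T @ A), 'fro') ** 2
print(f"estimate {approx:.1f} vs exact {exact:.1f}")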
+ """ + + self.comm.Barrier() + sendbuf = self.sketch[:self.ell,:] + recvbuf = None + if self.rank == 0: + recvbuf = np.empty( + [self.size, self.ell, self.d], dtype=np.float32) + self.comm.Gather(sendbuf, recvbuf, root=0) + print("{} FINISHED GATHER".format(self.rank)) + if self.rank==0: + origMatSketch = self.sketch.copy() + origNextZeroRow = self.nextZeroRow + self.nextZeroRow = self.ell + print("BUFFER SHAPE: ", recvbuf.shape) + for j in range(1, self.size): + print("CURRENT BUFFER: ", j) + print(recvbuf[j]) + for row in recvbuf[j]: + if(np.any(row)): + if self.nextZeroRow >= self.m: + self.john_rotate() + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + toReturn = self.sketch.copy() + self.sketch = origMatSketch + return toReturn + else: + return + +def parse_input(): + """ + Parse command line input. + """ + + parser = argparse.ArgumentParser() + parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) + parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) + parser.add_argument( + "-d", + "--det_type", + help="Detector name, e.g epix10k2M or jungfrau4M.", + required=True, + type=str, + ) + parser.add_argument( + "--start_offset", + help="Run index of first image to be incorporated into iPCA model.", + required=False, + type=int, + ) + parser.add_argument( + "--num_components", + help="Number of principal components to compute and maintain.", + required=False, + type=int, + ) + parser.add_argument( + "--batch_size", + help="Size of image batch incorporated in each model update.", + required=False, + type=int, + ) + parser.add_argument( + "--num_images", + help="Total number of images to be incorporated into model.", + required=False, + type=int, + ) + parser.add_argument( + "--output_dir", + help="Path to output directory for recording task duration data.", + required=False, + type=str, + ) + parser.add_argument( + "--priming", + help="Initialize model with PCA.", + required=False, + action="store_true", + ) + parser.add_argument( + "--downsample", + help="Enable downsampling of images.", + required=False, + action="store_true", + ) + parser.add_argument( + "--bin_factor", + help="Bin factor if using downsizing.", + required=False, + type=int, + ) + + return parser.parse_args() + + +if __name__ == "__main__": + + params = parse_input() + kwargs = {k: v for k, v in vars(params).items() if v is not None} + + pipca = PiPCA(**kwargs) + pipca.run() From be8d31264df9624029824423548010ab6fa0e896 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 20 Jul 2023 19:48:30 -0700 Subject: [PATCH 06/57] checkpoint --- btx/processing/freqdir.py | 8 +- btx/processing/rankAdaptFD.py | 464 ---------------------------------- 2 files changed, 4 insertions(+), 468 deletions(-) delete mode 100644 btx/processing/rankAdaptFD.py diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 8d0f93ef9..5bf1e8e3d 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -354,7 +354,7 @@ def estimFrobNormSquared(self, addMe, arrs, its): def gatherFreqDirs(self): - print("STARTING GATHER") +# print("STARTING GATHER") """ Gather local matrix sketches to root node and merge local sketches together. 
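# The hunks around this point replace the collective Gather of sketches with a
# gather of sketch sizes plus explicit Send/Recv, feeding each received row back
# through the shrink step.  Conceptually, merging Frequent Directions sketches
# is just sketching their concatenation; the standalone example below (plain FD
# shrink, i.e. the alpha = 1 case of john_rotate, written independently of the
# class) shows that idea on random data split between two "ranks".

import numpy as np

def fd_shrink(stacked, ell):
    """Reduce a (>= ell)-row matrix to an ell-row Frequent Directions sketch."""
    _, s, Vt = np.linalg.svd(stacked, full_matrices=False)
    delta = s[ell - 1] ** 2 if len(s) >= ell else 0.0
    s_shrunk = np.sqrt(np.maximum(s[:ell] ** 2 - delta, 0.0))
    return np.diag(s_shrunk) @ Vt[:ell, :]

def merge_sketches(sketch_a, sketch_b, ell):
    """Merge two local ell x d sketches into one ell x d sketch."""
    return fd_shrink(np.vstack([sketch_a, sketch_b]), ell)

rng = np.random.default_rng(1)
X1, X2 = rng.normal(size=(60, 40)), rng.normal(size=(60, 40))
ell = 8
merged = merge_sketches(fd_shrink(X1, ell), fd_shrink(X2, ell), ell)
print(merged.shape)   # (8, 40)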
@@ -362,8 +362,8 @@ def gatherFreqDirs(self): # self.sketch = np.random.rand(self.sketch.shape[0], self.sketch.shape[1]) sendbuf = self.ell buffSizes = np.array(self.comm.gather(sendbuf, root=0)) - if self.rank == 0: - print("BUFF SIZES: ", buffSizes) +# if self.rank == 0: +# print("BUFF SIZES: ", buffSizes) # data = [np.array((), dtype=np.double) for _ in range(self.size)] # data[self.rank] = self.sketch[:self.ell, :].copy() # if self.rank == 0: @@ -399,7 +399,7 @@ def gatherFreqDirs(self): self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 counter += 1 - print("DATA PROCESSED: {}".format(counter)) +# print("DATA PROCESSED: {}".format(counter)) toReturn = self.sketch.copy() print("COMPLETED MERGE PROCESS: ", toReturn) self.sketch = origMatSketch diff --git a/btx/processing/rankAdaptFD.py b/btx/processing/rankAdaptFD.py deleted file mode 100644 index 16575469d..000000000 --- a/btx/processing/rankAdaptFD.py +++ /dev/null @@ -1,464 +0,0 @@ -import os, csv, argparse - -import numpy as np -from mpi4py import MPI - -from matplotlib import pyplot as plt -from matplotlib import colors - -from btx.misc.shortcuts import TaskTimer - -from btx.interfaces.ipsana import ( - PsanaInterface, - bin_data, - bin_pixel_index_map, - retrieve_pixel_index_map, - assemble_image_stack_batch, -) - -########################################### -#John Imports -from numpy import zeros, sqrt, dot, diag -from numpy.linalg import svd, LinAlgError -from scipy.linalg import svd as scipy_svd -import numpy as np - -import time - -from datetime import datetime -currRun = datetime.now().strftime("%y%m%d%H%M%S") - -############################################# - -class FreqDir: - - """Parallel Frequent Directions.""" - - def __init__( - self, - john_start, - tot_imgs, - ell, - alpha, - exp, - run, - det_type, - downsample=False, - bin_factor=2, - output_dir="", - ): - - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() - - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = john_start + tot_imgs*self.rank//self.size - - self.downsample = downsample - self.bin_factor = bin_factor - self.output_dir = output_dir - - ( - self.num_images, - _, - self.num_features, - ) = self.set_params(tot_imgs, ell, bin_factor) - - self.task_durations = dict({}) - - self.num_incorporated_images = 0 - - self.d = self.num_features - self.ell = ell - self.m = 2*self.ell - self.sketch = zeros( (self.m, self.d) ) - self.nextZeroRow = 0 - self.alpha = alpha - - self.noImgsToProcess = tot_imgs//self.size - - def set_params(self, num_images, num_components, bin_factor): - """ - Method to initialize FreqDir parameters. - - Parameters - ---------- - num_images : int - Desired number of images to incorporate into model. - num_components : int - Desired number of components for model to maintain. - bin_factor : int - Factor to bin data by. - - Returns - ------- - num_images : int - Number of images to incorporate into model. - num_components : int - Number of components for model to maintain. - num_features : int - Number of features (dimension) in each image. 
- """ - - max_events = self.psi.max_events - downsample = self.downsample - - num_images = min(num_images, max_events) if num_images != -1 else max_events - num_components = min(num_components, num_images) - - # set d - det_shape = self.psi.det.shape() - num_features = np.prod(det_shape).astype(int) - - if downsample: - if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: - print("Invalid bin factor, toggled off downsampling.") - self.downsample = False - else: - num_features = int(num_features / bin_factor**2) - - return num_images, num_components, num_features - - def run(self): - """ - Perform frequent directions matrix sketching - on run subject to initialization parameters. - """ - - for batch in range(0,self.noImgsToProcess,self.ell): - self.fetch_and_update_model(self.ell) - - self.comm.Barrier() - - def get_formatted_images(self, n): - """ - Fetch n - x image segments from run, where x is the number of 'dead' images. - - Parameters - ---------- - n : int - number of images to retrieve - start_index : int - start index of subsection of data to retrieve - end_index : int - end index of subsection of data to retrieve - - Returns - ------- - ndarray, shape (end_index-start_index, n-x) - n-x retrieved image segments of dimension end_index-start_index - """ - - bin_factor = self.bin_factor - downsample = self.downsample - - # may have to rewrite eventually when number of images becomes large, - # i.e. streamed setting, either that or downsample aggressively - imgs = self.psi.get_images(n, assemble=False) - - if downsample: - imgs = bin_data(imgs, bin_factor) - - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] - - num_valid_imgs, p, x, y = imgs.shape - formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - - return formatted_imgs - - def fetch_and_update_model(self, n): - """ - Fetch images and update model. - - Parameters - ---------- - n : int - number of images to incorporate - """ - - img_batch = self.get_formatted_images(n) - - - self.john_update_model(img_batch) - - - def john_update_model(self, X): - """ - Update matrix sketch with new batch of observations - - Parameters - ---------- - X: ndarray - data to update matrix sketch with - """ - - _, numIncorp = X.shape - n = self.num_incorporated_images - q = self.ell - - with TaskTimer(self.task_durations, "total update"): - - if self.rank == 0: - print( - "Factoring {m} sample{s} into {n} sample, {q} component model...".format( - m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q - ) - ) - for row in X.T: - if self.nextZeroRow >= self.m: - self.john_rotate() - self.sketch[self.nextZeroRow,:] = row - self.nextZeroRow += 1 - self.num_incorporated_images += 1 -# if self.rank==0: -# print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') - - def john_rotate(self): - """ - Apply Frequent Directions Algorithm to - current matrix sketch and adjoined buffer - - Notes - ----- - Based on [1] and [2]. - - [1] Frequent Directions: Simple and Deterministic Matrix - Sketching Mina Ghashami, Edo Liberty, Jeff M. Phillips, and - David P. Woodruff SIAM Journal on Computing 2016 45:5, 1762-1792 - - [2] Ghashami, M., Desai, A., Phillips, J.M. (2014). Improved - Practical Matrix Sketching with Guarantees. In: Schulz, A.S., - Wagner, D. (eds) Algorithms - ESA 2014. ESA 2014. Lecture Notes - in Computer Science, vol 8737. Springer, Berlin, Heidelberg. 
- https://doi.org/10.1007/978-3-662-44777-2_39 - """ - - try: - [_,s,Vt] = svd(self.sketch , full_matrices=False) - except LinAlgError as err: - [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) - - if len(s) >= self.ell: - sCopy = s.copy() - - toShrink = s[:self.ell]**2 - s[self.ell-1]**2 - #John: Explicitly set this value to be 0, since sometimes it is negative - # or even turns to NaN due to roundoff error - toShrink[-1] = 0 - toShrink = sqrt(toShrink) - - toShrink[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] - - self.sketch[:self.ell:,:] = dot(diag(toShrink), Vt[:self.ell,:]) - self.sketch[self.ell:,:] = 0 - self.nextZeroRow = self.ell - else: - self.sketch[:len(s),:] = dot(diag(s), Vt[:len(s),:]) - self.sketch[len(s):,:] = 0 - self.nextZeroRow = len(s) - - def john_reconstructionError(self, matrixCentered): - """ - Compute the reconstruction error of the matrix sketch - against given data - - Parameters - ---------- - matrixCentered: ndarray - Data to compare matrix sketch to - """ - - matSketch = self.sketch - k = 10 - matrixCenteredT = matrixCentered.T - matSketchT = matSketch.T - U, S, Vt = np.linalg.svd(matSketchT) - G = U[:,:k] - UA, SA, VtA = np.linalg.svd(matrixCenteredT) - UAk = UA[:,:k] - SAk = np.diag(SA[:k]) - VtAk = VtA[:k] - Ak = UAk @ SAk @ VtAk - return (np.linalg.norm( - matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( - (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) - - def lowMemoryReconstructionErrorUnscaled(self, matrixCentered): - """ - Compute the low memory reconstruction error of the matrix sketch - against given data. This si the same as john_reconstructionError, - but estimates the norm computation and does not scale by the matrix. - - Parameters - ---------- - matrixCentered: ndarray - Data to compare matrix sketch to - """ - - matSketch = self.sketch - k = 10 - matrixCenteredT = matrixCentered.T - matSketchT = matSketch.T - U, S, Vt = np.linalg.svd(matSketchT) - G = U[:,:k] - return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) - - def estimFrobNormSquared(self, addMe, arrs, its): - """ - Estimate the Frobenius Norm of product of arrs matrices - plus addME matrix using its iterations. - - Parameters - ---------- - arrs: list of ndarray - Matrices to multiply together - - addMe: ndarray - Matrix to add to others - - its: int - Number of iterations to average over - - Returns - ------- - sumMe/its*no_rows : float - Estimate of frobenius norm of produce - of arrs matrices plus addMe matrix - - Notes - ----- - Frobenius estimation is the expected value of matrix - multiplied by random vector from multivariate normal distribution - based on [1]. - - [1] Norm and Trace Estimation with Random Rank-one Vectors - Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix - Analysis and Applications 2021 42:1, 202-223 - """ - - no_rows = arrs[-1].shape[1] - v = np.random.normal(size=no_rows) - v_hat = v / np.linalg.norm(v) - sumMe = 0 - for j in range(its): - v = np.random.normal(size=no_rows) - v_hat = v / np.linalg.norm(v) - v_addMe = addMe @ v_hat - for arr in arrs[::-1]: - v_hat = arr @ v_hat - sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 - return sumMe/its*no_rows - - - def gatherFreqDirs(self): - print("STARTING GATHER") - """ - Gather local matrix sketches to root node and - merge local sketches together. 
- """ - - self.comm.Barrier() - sendbuf = self.sketch[:self.ell,:] - recvbuf = None - if self.rank == 0: - recvbuf = np.empty( - [self.size, self.ell, self.d], dtype=np.float32) - self.comm.Gather(sendbuf, recvbuf, root=0) - print("{} FINISHED GATHER".format(self.rank)) - if self.rank==0: - origMatSketch = self.sketch.copy() - origNextZeroRow = self.nextZeroRow - self.nextZeroRow = self.ell - print("BUFFER SHAPE: ", recvbuf.shape) - for j in range(1, self.size): - print("CURRENT BUFFER: ", j) - print(recvbuf[j]) - for row in recvbuf[j]: - if(np.any(row)): - if self.nextZeroRow >= self.m: - self.john_rotate() - self.sketch[self.nextZeroRow,:] = row - self.nextZeroRow += 1 - toReturn = self.sketch.copy() - self.sketch = origMatSketch - return toReturn - else: - return - -def parse_input(): - """ - Parse command line input. - """ - - parser = argparse.ArgumentParser() - parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) - parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) - parser.add_argument( - "-d", - "--det_type", - help="Detector name, e.g epix10k2M or jungfrau4M.", - required=True, - type=str, - ) - parser.add_argument( - "--start_offset", - help="Run index of first image to be incorporated into iPCA model.", - required=False, - type=int, - ) - parser.add_argument( - "--num_components", - help="Number of principal components to compute and maintain.", - required=False, - type=int, - ) - parser.add_argument( - "--batch_size", - help="Size of image batch incorporated in each model update.", - required=False, - type=int, - ) - parser.add_argument( - "--num_images", - help="Total number of images to be incorporated into model.", - required=False, - type=int, - ) - parser.add_argument( - "--output_dir", - help="Path to output directory for recording task duration data.", - required=False, - type=str, - ) - parser.add_argument( - "--priming", - help="Initialize model with PCA.", - required=False, - action="store_true", - ) - parser.add_argument( - "--downsample", - help="Enable downsampling of images.", - required=False, - action="store_true", - ) - parser.add_argument( - "--bin_factor", - help="Bin factor if using downsizing.", - required=False, - type=int, - ) - - return parser.parse_args() - - -if __name__ == "__main__": - - params = parse_input() - kwargs = {k: v for k, v in vars(params).items() if v is not None} - - pipca = PiPCA(**kwargs) - pipca.run() From 7b26c0eadc2bc87edec1fd8a4086a97f06fe2088 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Fri, 21 Jul 2023 06:12:32 -0700 Subject: [PATCH 07/57] Added rank adaptive and tree merge. 
Both have not been tested yet --- .gitignore | 4 +- btx/processing/freqdir.py | 203 ++++++++++++++++++++++++-------------- 2 files changed, 130 insertions(+), 77 deletions(-) diff --git a/.gitignore b/.gitignore index bfec190cb..8d2dadd1c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,6 @@ adhoc/ __pycache__/ # cli -tmp yaml -tutorial/*-tmp.yaml \ No newline at end of file +tutorial/*-tmp.yaml + +*.h5 diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 5bf1e8e3d..2720a0717 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -28,6 +28,7 @@ from datetime import datetime currRun = datetime.now().strftime("%y%m%d%H%M%S") +import h5py ############################################# class FreqDir: @@ -43,31 +44,41 @@ def __init__( exp, run, det_type, + rankAdapt, + merger=False, + mergerFeatures=0, downsample=False, bin_factor=2, output_dir="", ): + self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = john_start + tot_imgs*self.rank//self.size + if not merger: + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = john_start + tot_imgs*self.rank//self.size - self.downsample = downsample - self.bin_factor = bin_factor - self.output_dir = output_dir + self.downsample = downsample + self.bin_factor = bin_factor + self.output_dir = output_dir - ( - self.num_images, - _, - self.num_features, - ) = self.set_params(tot_imgs, ell, bin_factor) + ( + self.num_images, + _, + self.num_features, + ) = self.set_params(tot_imgs, ell, bin_factor) - self.task_durations = dict({}) + self.task_durations = dict({}) - self.num_incorporated_images = 0 + self.num_incorporated_images = 0 + else: + #JOHN: NEED TO IMPROVE. CURRENTLY, NEED TO MANUALLY SET d, WHICH IS UNACCEPTABLE. + self.num_features = mergerFeatures + self.task_durations = dict({}) + self.num_incorporated_images = 0 self.d = self.num_features self.ell = ell @@ -78,6 +89,9 @@ def __init__( self.noImgsToProcess = tot_imgs//self.size + self.rankAdapt = rankAdapt + self.increaseEll = False + def set_params(self, num_images, num_components, bin_factor): """ Method to initialize FreqDir parameters. 
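# Simplified, standalone illustration of the rank-adaptive control flow that the
# next hunk adds to john_update_model (not the patch's exact code): rows are
# appended to a 2*ell buffer; whenever it fills, the buffer is shrunk back to
# ell rows, and if too much of the discarded half's energy falls outside the
# retained subspace, ell is enlarged before later shrinks.  The 0.08 error
# threshold and the +10 rank increment mirror the values in the patch; the
# `relative_error` helper is a deliberately plain stand-in for the class's
# low-memory reconstruction-error estimate.

import numpy as np

def relative_error(sketch_rows, data):
    """Fraction of `data`'s Frobenius norm left outside the sketch's row span."""
    _, _, Vt = np.linalg.svd(sketch_rows, full_matrices=False)
    resid = data - data @ Vt.T @ Vt
    return np.linalg.norm(resid) / max(np.linalg.norm(data), 1e-12)

def rank_adaptive_sketch(stream, ell=10, grow=10, err_tol=0.08):
    d = stream.shape[1]
    sketch = np.zeros((2 * ell, d))
    next_zero = 0
    for row in stream:
        if next_zero >= sketch.shape[0]:
            discarded = sketch[ell:, :].copy()        # rows consumed by the shrink
            _, s, Vt = np.linalg.svd(sketch, full_matrices=False)
            shrunk = np.sqrt(np.maximum(s[:ell] ** 2 - s[ell - 1] ** 2, 0.0))
            sketch[:ell] = np.diag(shrunk) @ Vt[:ell]
            sketch[ell:] = 0.0
            next_zero = ell
            if relative_error(sketch[:ell], discarded) > err_tol:
                ell += grow                            # grow the sketch rank
                sketch = np.vstack([sketch, np.zeros((2 * grow, d))])
        sketch[next_zero] = row
        next_zero += 1
    return sketch[:ell]

print(rank_adaptive_sketch(np.random.default_rng(2).normal(size=(300, 50))).shape)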
@@ -196,23 +210,36 @@ def john_update_model(self, X): """ _, numIncorp = X.shape - n = self.num_incorporated_images - q = self.ell - +# n = self.num_incorporated_images +# q = self.ell +# with TaskTimer(self.task_durations, "total update"): - if self.rank == 0: - print( - "Factoring {m} sample{s} into {n} sample, {q} component model...".format( - m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q - ) - ) +# if self.rank == 0: +# print( +# "Factoring {m} sample{s} into {n} sample, {q} component model...".format( +# m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q +# ) +# ) for row in X.T: + canRankAdapt = numIncorp > (self.ell + 15) if self.nextZeroRow >= self.m: - self.john_rotate() + if self.increaseEll and canRankAdapt and self.rankAdapt: + self.ell = self.ell + 10 + self.m = 2*self.ell + self.sketch = np.vstack((*self.sketch, np.zeros((20, self.d)))) + self.increaseEll = False + else: + copyBatch = self.sketch[self.ell:,:].copy() + self.john_rotate() + if canRankAdapt and self.rankAdapt: + reconError = self.lowMemoryReconstructionErrorUnscaled(copyBatch) + if (np.sqrt(reconError) > 0.08): + self.increaseEll = True self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 self.num_incorporated_images += 1 + numIncorp -= 1 # if self.rank==0: # print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') @@ -354,34 +381,12 @@ def estimFrobNormSquared(self, addMe, arrs, its): def gatherFreqDirs(self): -# print("STARTING GATHER") """ Gather local matrix sketches to root node and merge local sketches together. """ -# self.sketch = np.random.rand(self.sketch.shape[0], self.sketch.shape[1]) sendbuf = self.ell - buffSizes = np.array(self.comm.gather(sendbuf, root=0)) -# if self.rank == 0: -# print("BUFF SIZES: ", buffSizes) -# data = [np.array((), dtype=np.double) for _ in range(self.size)] -# data[self.rank] = self.sketch[:self.ell, :].copy() -# if self.rank == 0: -# sizes_memory = (self.d)*buffSizes -# offsets = np.zeros(self.size) -# offsets[1:] = np.cumsum(sizes_memory)[:-1] -# -# data_out = None -# recvbuf = None -# if self.rank == 0: -# # data_out = np.empty((np.sum(buffSizes), fd.d), dtype=np.float32) -# data_out = np.empty((np.sum(buffSizes), self.d)) -# recvbuf=[data_out, sizes_memory.tolist(), offsets.tolist(), MPI.DOUBLE] -# -# self.comm.Barrier() -# self.comm.Gatherv(data[self.rank],recvbuf = recvbuf, root=0) -# self.comm.Barrier() -# print("{} FINISHED GATHERV".format(self.rank)) + buffSizes = np.array(self.comm.allgather(sendbuf)) if self.rank==0: origMatSketch = self.sketch.copy() @@ -389,9 +394,9 @@ def gatherFreqDirs(self): self.nextZeroRow = self.ell counter = 0 for proc in range(1, self.size): - bufferMe = np.empty(self.ell*self.d, dtype=np.double) + bufferMe = np.empty(buffSizes[self.rank]*self.d, dtype=np.double) self.comm.Recv(bufferMe, source=proc, tag=13) - bufferMe = np.reshape(bufferMe, (self.ell, self.d)) + bufferMe = np.reshape(bufferMe, (buffSizes[self.rank], self.d)) for row in bufferMe: if(np.any(row)): if self.nextZeroRow >= self.m: @@ -407,35 +412,81 @@ def gatherFreqDirs(self): else: bufferMe = self.sketch[:self.ell, :].copy().flatten() self.comm.Send(bufferMe, dest=0, tag=13) - return - -# self.comm.Barrier() -# sendbuf = self.sketch[:self.ell,:] -# recvbuf = None -# if self.rank == 0: -# recvbuf = np.empty( -# [self.size, self.ell, self.d], dtype=np.float32) -# self.comm.Gather(sendbuf, recvbuf, root=0) -# print("{} FINISHED GATHER".format(self.rank)) -# if self.rank==0: -# origMatSketch = self.sketch.copy() -# origNextZeroRow = self.nextZeroRow -# 
self.nextZeroRow = self.ell -# print("BUFFER SHAPE: ", recvbuf.shape) -# for j in range(1, self.size): -# print("CURRENT BUFFER: ", j) -# print(recvbuf[j]) -# for row in recvbuf[j]: -# if(np.any(row)): -# if self.nextZeroRow >= self.m: -# self.john_rotate() -# self.sketch[self.nextZeroRow,:] = row -# self.nextZeroRow += 1 -# toReturn = self.sketch.copy() -# self.sketch = origMatSketch -# return toReturn -# else: -# return + return + + def get(self): + return self.sketch[:self.ell, :] + +class MergeTree: + + """Frequent Directions Merging Object.""" + + def __init__(self, divBy, readFile, dataSetName): + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + + self.divBy = divBy + + with h5py.File(readFile, 'r') as hf: + self.data = hf[dataSetName][:] + + print("AOIDJWOIJA", self.rank, self.data.shape) + + self.fd = FreqDir(0, 0, rankAdapt=False, exp='0', run='0', det_type='0', ell=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1]) + + self.fd.d = self.data.shape[1] + + sendbuf = self.data.shape[0] + self.buffSizes = np.array(self.comm.allgather(sendbuf)) + print(self.buffSizes) + + #JOHN: MUST CHECK THAT THIS ACTION ONLY FILLS UP THE SKETCH WITH THE CURRENT SKETCH FROM THE DATA + self.fd.john_update_model(self.data) + + + def merge(self): + + """ + Merge Frequent Direction Components in a tree-like fashion. + Returns + ------- + finalSketch : ndarray + Merged matrix sketch of cumulative data + + + """ + powerNum = 1 + while(powerNum < self.size): + powerNum = powerNum * self.divBy + if powerNum != size: + raise ValueError('NUMBER OF CORES WOULD LEAD TO INBALANCED MERGE TREE. ENDING PROGRAM.') + return + + level = 0 + while((self.divBy ** level) < self.size): + jump = self.divBy ** level + if(self.rank%jump ==0): + root = self.rank - (self.rank%(jump*self.divBy)) + grouping = [j for j in range(root, root + jump*self.divBy, jump)] + print(grouping) +# if self.rank==root: +# for proc in grouping[1:]: +# bufferMe = np.empty(self.data.shape[0] * self.data.shape[1], dtype=np.double) +# comm.Recv(bufferMe, source=proc, tag=17) +# bufferMe = np.reshape(bufferMe, (self.data.shape[0], self.data.shape[1])) +# self.fd.john_update_model(bufferMe.T) +# print(level, data) +# else: +# bufferMe = self.fd.get().copy().flatten() +# comm.Send(bufferMe, dest=root, tag=17) + level += 1 + if self.rank==0: + finalSketch = self.fd.get() + return finalSketch + else: + return + def parse_input(): """ From b8270aa7ac51582dc97463487b8bdd9f0f79bd95 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Fri, 21 Jul 2023 07:48:53 -0700 Subject: [PATCH 08/57] Parallel Rank Adaptive and Merge Tree appear to run. Can't say for sure the sketches are useful until I implement something substantial --- btx/processing/freqdir.py | 95 +++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 39 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 2720a0717..6ff071b34 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -56,6 +56,7 @@ def __init__( self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() + self.merger = merger if not merger: self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) @@ -75,7 +76,7 @@ def __init__( self.num_incorporated_images = 0 else: - #JOHN: NEED TO IMPROVE. CURRENTLY, NEED TO MANUALLY SET d, WHICH IS UNACCEPTABLE. + #JOHN: NEED TO IMPROVE. THIS IS WACK. 
self.num_features = mergerFeatures self.task_durations = dict({}) self.num_incorporated_images = 0 @@ -140,10 +141,9 @@ def run(self): on run subject to initialization parameters. """ - for batch in range(0,self.noImgsToProcess,self.ell): - self.fetch_and_update_model(self.ell) - - self.comm.Barrier() + for batch in range(0,self.noImgsToProcess,self.ell*10): +# print("aodijwaoij 1") + self.fetch_and_update_model(self.ell*10) def get_formatted_images(self, n): """ @@ -193,9 +193,11 @@ def fetch_and_update_model(self, n): number of images to incorporate """ +# print("aodijwaoij 2") img_batch = self.get_formatted_images(n) +# print("aodijwaoij 3") self.john_update_model(img_batch) @@ -209,39 +211,51 @@ def john_update_model(self, X): data to update matrix sketch with """ +# print("aodijwaoij 4") _, numIncorp = X.shape -# n = self.num_incorporated_images -# q = self.ell -# - with TaskTimer(self.task_durations, "total update"): + origNumIncorp = numIncorp + n = self.num_incorporated_images + q = self.ell -# if self.rank == 0: -# print( -# "Factoring {m} sample{s} into {n} sample, {q} component model...".format( -# m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q -# ) -# ) + with TaskTimer(self.task_durations, "total update"): +# print("aodijwaoij 5") + + if self.rank==0 and not self.merger: + print( + "Factoring {m} sample{s} into {n} sample, {q} component model...".format( + m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q + ) + ) +# print("aodijwaoij 5") for row in X.T: +# print(self.rank, " aodijwaoij 6") canRankAdapt = numIncorp > (self.ell + 15) +# print(self.rank,"CAN RANK ADAPT", canRankAdapt, numIncorp, self.ell+15) if self.nextZeroRow >= self.m: +# print(self.rank, " aodijwaoij 7") if self.increaseEll and canRankAdapt and self.rankAdapt: +# print(self.rank, " aodijwaoij 8") self.ell = self.ell + 10 self.m = 2*self.ell self.sketch = np.vstack((*self.sketch, np.zeros((20, self.d)))) self.increaseEll = False + print("INCREASING RANK OF PROCESS {} TO {}".format(self.rank, self.ell)) else: +# print(self.rank, " aodijwaoij 9") copyBatch = self.sketch[self.ell:,:].copy() self.john_rotate() +# print(self.rank, " aodijwaoij 9.25") if canRankAdapt and self.rankAdapt: - reconError = self.lowMemoryReconstructionErrorUnscaled(copyBatch) - if (np.sqrt(reconError) > 0.08): +# print(self.rank, " aodijwaoij 9.5") + reconError = np.sqrt(self.lowMemoryReconstructionErrorUnscaled(copyBatch)) +# print("ITERATION {} - RECON ERROR OF RANK {}: {}".format(origNumIncorp - numIncorp, self.rank, reconError)) + if (reconError > 0.08): self.increaseEll = True +# print(self.rank, " aodijwaoij 10") self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 self.num_incorporated_images += 1 numIncorp -= 1 -# if self.rank==0: -# print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') def john_rotate(self): """ @@ -270,7 +284,8 @@ def john_rotate(self): if len(s) >= self.ell: sCopy = s.copy() - + + #JOHN: I think actually this should be ell+1 and ell. We lose a component otherwise. 
toShrink = s[:self.ell]**2 - s[self.ell-1]**2 #John: Explicitly set this value to be 0, since sometimes it is negative # or even turns to NaN due to roundoff error @@ -325,13 +340,16 @@ def lowMemoryReconstructionErrorUnscaled(self, matrixCentered): Data to compare matrix sketch to """ +# print("{} COMPUTING ERROR".format(self.rank)) matSketch = self.sketch k = 10 matrixCenteredT = matrixCentered.T matSketchT = matSketch.T - U, S, Vt = np.linalg.svd(matSketchT) + U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) G = U[:,:k] - return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) +# print("{} FINISHED COMPUTING ERROR".format(self.rank)) + return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 10)/ + np.linalg.norm(matrixCenteredT, 'fro')**2) def estimFrobNormSquared(self, addMe, arrs, its): """ @@ -417,6 +435,11 @@ def gatherFreqDirs(self): def get(self): return self.sketch[:self.ell, :] + def write(self): + with h5py.File('h5writes/{}_{}.h5'.format(currRun, self.rank), 'w') as hf: + hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) + + class MergeTree: """Frequent Directions Merging Object.""" @@ -431,18 +454,14 @@ def __init__(self, divBy, readFile, dataSetName): with h5py.File(readFile, 'r') as hf: self.data = hf[dataSetName][:] - print("AOIDJWOIJA", self.rank, self.data.shape) - self.fd = FreqDir(0, 0, rankAdapt=False, exp='0', run='0', det_type='0', ell=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1]) - self.fd.d = self.data.shape[1] - sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) print(self.buffSizes) #JOHN: MUST CHECK THAT THIS ACTION ONLY FILLS UP THE SKETCH WITH THE CURRENT SKETCH FROM THE DATA - self.fd.john_update_model(self.data) + self.fd.john_update_model(self.data.T) def merge(self): @@ -459,7 +478,7 @@ def merge(self): powerNum = 1 while(powerNum < self.size): powerNum = powerNum * self.divBy - if powerNum != size: + if powerNum != self.size: raise ValueError('NUMBER OF CORES WOULD LEAD TO INBALANCED MERGE TREE. ENDING PROGRAM.') return @@ -469,17 +488,15 @@ def merge(self): if(self.rank%jump ==0): root = self.rank - (self.rank%(jump*self.divBy)) grouping = [j for j in range(root, root + jump*self.divBy, jump)] - print(grouping) -# if self.rank==root: -# for proc in grouping[1:]: -# bufferMe = np.empty(self.data.shape[0] * self.data.shape[1], dtype=np.double) -# comm.Recv(bufferMe, source=proc, tag=17) -# bufferMe = np.reshape(bufferMe, (self.data.shape[0], self.data.shape[1])) -# self.fd.john_update_model(bufferMe.T) -# print(level, data) -# else: -# bufferMe = self.fd.get().copy().flatten() -# comm.Send(bufferMe, dest=root, tag=17) + if self.rank==root: + for proc in grouping[1:]: + bufferMe = np.empty(self.buffSizes[proc] * self.data.shape[1], dtype=np.double) + self.comm.Recv(bufferMe, source=proc, tag=17) + bufferMe = np.reshape(bufferMe, (self.buffSizes[proc], self.data.shape[1])) + self.fd.john_update_model(bufferMe.T) + else: + bufferMe = self.fd.get().copy().flatten() + self.comm.Send(bufferMe, dest=root, tag=17) level += 1 if self.rank==0: finalSketch = self.fd.get() From d5b8abde6d1aae43bbcdbe44682b1ada9717e29c Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Fri, 21 Jul 2023 10:47:42 -0700 Subject: [PATCH 09/57] Fixed treemerge and parallel rank adaptive FD. Things seem to work. Need to verify run with application. 
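For reference, the communication pattern that MergeTree.merge() sets up (introduced in the previous patch and exercised here) can be pictured without running MPI: the number of ranks must be a power of divBy, and at each level the surviving group roots receive and absorb the sketches of the other members of their group. The standalone snippet below is illustrative only and needs no mpi4py; it just prints that schedule.

# Standalone illustration (no MPI required) of the merge-tree schedule used by
# MergeTree.merge(): `size` must be a power of `div_by`, and at each level the
# group roots absorb the sketches of the other group members.
def merge_schedule(size, div_by):
    power = 1
    while power < size:
        power *= div_by
    if power != size:
        raise ValueError("size must be a power of div_by for a balanced tree")
    level, jump = 0, 1
    while jump < size:
        for rank in range(0, size, jump):
            root = rank - rank % (jump * div_by)
            if rank == root:
                senders = list(range(root + jump, root + jump * div_by, jump))
                print(f"level {level}: rank {root} receives from {senders}")
        level += 1
        jump *= div_by
    print(f"rank 0 holds the fully merged sketch after {level} levels")

merge_schedule(size=8, div_by=2)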
--- btx/processing/freqdir.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 6ff071b34..0b42defa8 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -141,9 +141,9 @@ def run(self): on run subject to initialization parameters. """ - for batch in range(0,self.noImgsToProcess,self.ell*10): + for batch in range(0,self.noImgsToProcess,self.ell*6): # print("aodijwaoij 1") - self.fetch_and_update_model(self.ell*10) + self.fetch_and_update_model(self.ell*6) def get_formatted_images(self, n): """ @@ -458,7 +458,8 @@ def __init__(self, divBy, readFile, dataSetName): sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) - print(self.buffSizes) + if self.rank==0): + print(self.buffSizes) #JOHN: MUST CHECK THAT THIS ACTION ONLY FILLS UP THE SKETCH WITH THE CURRENT SKETCH FROM THE DATA self.fd.john_update_model(self.data.T) @@ -504,6 +505,9 @@ def merge(self): else: return + def write(self): + self.fd.write() + def parse_input(): """ From c204c3f4f5cee6725c6aafb5ced7c0fbad71975c Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 25 Jul 2023 12:54:54 -0700 Subject: [PATCH 10/57] Separated FreqDir to create Merge Tree and also projection module. Save intermediate results h5 files. --- btx/processing/freqdir.py | 285 +++++++++++++++++++++++++++++++++++++- 1 file changed, 279 insertions(+), 6 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 0b42defa8..b948f26e6 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -29,6 +29,11 @@ currRun = datetime.now().strftime("%y%m%d%H%M%S") import h5py + +from PIL import Image + +#writeDirec = "/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/" +writeDirec = "h5writes/" ############################################# class FreqDir: @@ -192,11 +197,8 @@ def fetch_and_update_model(self, n): n : int number of images to incorporate """ - # print("aodijwaoij 2") img_batch = self.get_formatted_images(n) - - # print("aodijwaoij 3") self.john_update_model(img_batch) @@ -436,8 +438,11 @@ def get(self): return self.sketch[:self.ell, :] def write(self): - with h5py.File('h5writes/{}_{}.h5'.format(currRun, self.rank), 'w') as hf: + filename = writeDirec + '{}_sketch_{}.h5'.format(currRun, self.rank) + with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) + self.comm.Barrier() + return filename class MergeTree: @@ -458,7 +463,7 @@ def __init__(self, divBy, readFile, dataSetName): sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) - if self.rank==0): + if self.rank==0: print(self.buffSizes) #JOHN: MUST CHECK THAT THIS ACTION ONLY FILLS UP THE SKETCH WITH THE CURRENT SKETCH FROM THE DATA @@ -506,8 +511,276 @@ def merge(self): return def write(self): - self.fd.write() + filename = writeDirec + '{}_merge.h5'.format(currRun) + if self.rank==0: + with h5py.File(filename, 'w') as hf: + hf.create_dataset("sketch", data=self.fd.sketch[:self.fd.ell, :]) + self.comm.Barrier() + return filename + +class ApplyCompression: + """Compute principal components of matrix sketch and apply to sketched data""" + + def __init__( + self, + john_start, + tot_imgs, + ell, + alpha, + exp, + run, + det_type, + rankAdapt, + readFile, dataSetName, + merger=False, + mergerFeatures=0, + downsample=False, + bin_factor=2, + output_dir="" + ): + + + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = 
self.comm.Get_size() + self.merger = merger + + if not merger: + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = john_start + tot_imgs*self.rank//self.size + + self.downsample = downsample + self.bin_factor = bin_factor + self.output_dir = output_dir + + ( + self.num_images, + _, + self.num_features, + ) = self.set_params(tot_imgs, ell, bin_factor) + + self.task_durations = dict({}) + + self.num_incorporated_images = 0 + else: + #JOHN: NEED TO IMPROVE. THIS IS WACK. + self.num_features = mergerFeatures + self.task_durations = dict({}) + self.num_incorporated_images = 0 + + self.d = self.num_features + self.ell = ell + self.m = 2*self.ell + self.sketch = zeros( (self.m, self.d) ) + self.nextZeroRow = 0 + self.alpha = alpha + + self.noImgsToProcess = tot_imgs//self.size + + self.rankAdapt = rankAdapt + self.increaseEll = False + + + with h5py.File(readFile, 'r') as hf: + self.data = hf[dataSetName][:] + + U, S, Vt = np.linalg.svd(self.data, full_matrices=False) + self.components = Vt + + self.processedData = None + self.smallImgs = None + + self.imageIndicesProcessed = [] + + + def set_params(self, num_images, num_components, bin_factor): + """ + Method to initialize FreqDir parameters. + + Parameters + ---------- + num_images : int + Desired number of images to incorporate into model. + num_components : int + Desired number of components for model to maintain. + bin_factor : int + Factor to bin data by. + + Returns + ------- + num_images : int + Number of images to incorporate into model. + num_components : int + Number of components for model to maintain. + num_features : int + Number of features (dimension) in each image. + """ + + max_events = self.psi.max_events + downsample = self.downsample + + num_images = min(num_images, max_events) if num_images != -1 else max_events + num_components = min(num_components, num_images) + + # set d + det_shape = self.psi.det.shape() + num_features = np.prod(det_shape).astype(int) + + if downsample: + if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: + print("Invalid bin factor, toggled off downsampling.") + self.downsample = False + else: + num_features = int(num_features / bin_factor**2) + + return num_images, num_components, num_features + + def run(self): + """ + Retrieve sketch, project images onto new coordinates. Save new coordinates to h5 file. + """ + for batch in range(0,self.noImgsToProcess,self.ell*6): + startCounter = self.psi.counter + self.fetch_and_update_model(self.ell*6) + self.imageIndicesProcessed.append((startCounter, self.psi.counter)) + + +# def get_formatted_images(self, n): +# """ +# Fetch n - x image segments from run, where x is the number of 'dead' images. +# +# Parameters +# ---------- +# n : int +# number of images to retrieve +# start_index : int +# start index of subsection of data to retrieve +# end_index : int +# end index of subsection of data to retrieve +# +# Returns +# ------- +# ndarray, shape (end_index-start_index, n-x) +# n-x retrieved image segments of dimension end_index-start_index +# """ +# +# bin_factor = self.bin_factor +# downsample = self.downsample +# +# # may have to rewrite eventually when number of images becomes large, +# # i.e. 
streamed setting, either that or downsample aggressively +# imgs = self.psi.get_images(n, assemble=False) +# print(imgs.shape) +# +# toSaveImgs = bin_data(imgs, bin_factor) +# if downsample: +# imgs = bin_data(imgs, bin_factor) +# +# toSaveImgs = toSaveImgs[ +# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] +# ] +# imgs = imgs[ +# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] +# ] +# +# num_valid_imgs, p, x, y = imgs.shape +# toSave_num_valid_imgs, toSave_p, toSave_x, toSave_y = toSaveImgs.shape +# +# formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T +# toSave_formatted_imgs = np.reshape(toSaveImgs, (toSave_num_valid_imgs, toSave_p * toSave_x * toSave_y)).T +# print(toSave_formatted_imgs.shape) +# +# return (formatted_imgs,toSave_formatted_imgs) + + def get_formatted_images(self, n): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. + + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + + bin_factor = self.bin_factor + downsample = self.downsample + + # may have to rewrite eventually when number of images becomes large, + # i.e. streamed setting, either that or downsample aggressively + imgs = self.psi.get_images(n, assemble=False) + + if downsample: + imgs = bin_data(imgs, bin_factor) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + num_valid_imgs, p, x, y = imgs.shape + formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + + return formatted_imgs + + def assembleImgsToSave(self, imgs): + pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) + + saveMe = [] + for img in imgs.T: + imgRe = np.reshape(img, self.psi.det.shape()) + imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) + #saveMe.append(np.array(Image.fromarray(imgRe, mode='L').resize((150, 150), Image.Resampling.BICUBIC))) + saveMe.append(np.array(Image.fromarray(imgRe).resize((150, 150)))) + saveMe = np.array(saveMe) + return saveMe + +# print("IMGS TO SAVE SHAPE: ", imgs.shape) +# saveMe = [] +# for img in imgs: +# saveMe.append(np.array(Image.fromarray(img, mode='L').resize((150, 150), Image.Resampling.BICUBIC))) +# saveMe = np.array(saveMe) +# print("RESIZED IMGS TO SAVE SHAPE: ", saveMe.shape) +# return saveMe + + + def fetch_and_update_model(self, n): + """ + Fetch images and update model. 
+ + Parameters + ---------- + n : int + number of images to incorporate + """ + img_batch = self.get_formatted_images(n) + toSave_img_batch = self.assembleImgsToSave(img_batch) + if self.smallImgs is None: + self.smallImgs = toSave_img_batch + else: + self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) + self.john_apply_compression(img_batch) + + def john_apply_compression(self, X): + if self.processedData is None: + self.processedData = np.dot(X.T, self.components.T) + else: + self.processedData = np.vstack((self.processedData, np.dot(X.T, self.components.T))) + + def write(self): + filename = writeDirec + '{}_ProjectedData_{}.h5'.format(currRun, self.rank) + with h5py.File(filename, 'w') as hf: + hf.create_dataset("ProjectedData", data=self.processedData) + hf.create_dataset("SmallImages", data=self.smallImgs) + self.comm.Barrier() + return filename def parse_input(): """ From 07d624e52b844210d9e13ba854e757399f057bf1 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 27 Jul 2023 18:31:48 -0700 Subject: [PATCH 11/57] Checkpoint --- btx/processing/freqdir.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index b948f26e6..d1e49f54f 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -32,8 +32,8 @@ from PIL import Image -#writeDirec = "/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/" -writeDirec = "h5writes/" +writeDirec = "/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/" +#writeDirec = "h5writes/" ############################################# class FreqDir: From 1f801ef9d5942c4643c1feed8b844eaeb1a91a59 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Fri, 28 Jul 2023 10:23:14 -0700 Subject: [PATCH 12/57] Cleaned up code --- btx/processing/freqdir.py | 207 ++++++++------------------------------ 1 file changed, 42 insertions(+), 165 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index d1e49f54f..77d7f3367 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -98,6 +98,8 @@ def __init__( self.rankAdapt = rankAdapt self.increaseEll = False + self.imgsTracked = [] + def set_params(self, num_images, num_components, bin_factor): """ Method to initialize FreqDir parameters. 
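# For reference, the projection that ApplyCompression.john_apply_compression
# (introduced above in PATCH 10) performs reduces to a single matrix product:
# the right singular vectors of the merged ell x d sketch serve as principal
# directions, and each flattened image becomes its ell coordinates in that
# basis.  Standalone NumPy sketch below; all names and sizes are illustrative.

import numpy as np

rng = np.random.default_rng(3)
d, ell, n_imgs = 400, 6, 25
sketch = rng.normal(size=(ell, d))            # stands in for the merged sketch
_, _, components = np.linalg.svd(sketch, full_matrices=False)   # ell x d

X = rng.normal(size=(d, n_imgs))              # images stored column-wise, as above
projected = X.T @ components.T                # n_imgs x ell coordinates
print(projected.shape)                        # (25, 6)

# Reconstruction back into pixel space, e.g. to inspect compression quality:
reconstructed = projected @ components        # n_imgs x d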
@@ -147,7 +149,6 @@ def run(self): """ for batch in range(0,self.noImgsToProcess,self.ell*6): -# print("aodijwaoij 1") self.fetch_and_update_model(self.ell*6) def get_formatted_images(self, n): @@ -168,6 +169,7 @@ def get_formatted_images(self, n): ndarray, shape (end_index-start_index, n-x) n-x retrieved image segments of dimension end_index-start_index """ + self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) bin_factor = self.bin_factor downsample = self.downsample @@ -197,9 +199,7 @@ def fetch_and_update_model(self, n): n : int number of images to incorporate """ -# print("aodijwaoij 2") img_batch = self.get_formatted_images(n) -# print("aodijwaoij 3") self.john_update_model(img_batch) @@ -212,48 +212,33 @@ def john_update_model(self, X): X: ndarray data to update matrix sketch with """ - -# print("aodijwaoij 4") _, numIncorp = X.shape origNumIncorp = numIncorp n = self.num_incorporated_images q = self.ell - with TaskTimer(self.task_durations, "total update"): -# print("aodijwaoij 5") - if self.rank==0 and not self.merger: print( "Factoring {m} sample{s} into {n} sample, {q} component model...".format( m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q ) ) -# print("aodijwaoij 5") for row in X.T: -# print(self.rank, " aodijwaoij 6") canRankAdapt = numIncorp > (self.ell + 15) -# print(self.rank,"CAN RANK ADAPT", canRankAdapt, numIncorp, self.ell+15) if self.nextZeroRow >= self.m: -# print(self.rank, " aodijwaoij 7") if self.increaseEll and canRankAdapt and self.rankAdapt: -# print(self.rank, " aodijwaoij 8") self.ell = self.ell + 10 self.m = 2*self.ell self.sketch = np.vstack((*self.sketch, np.zeros((20, self.d)))) self.increaseEll = False print("INCREASING RANK OF PROCESS {} TO {}".format(self.rank, self.ell)) else: -# print(self.rank, " aodijwaoij 9") copyBatch = self.sketch[self.ell:,:].copy() self.john_rotate() -# print(self.rank, " aodijwaoij 9.25") if canRankAdapt and self.rankAdapt: -# print(self.rank, " aodijwaoij 9.5") reconError = np.sqrt(self.lowMemoryReconstructionErrorUnscaled(copyBatch)) -# print("ITERATION {} - RECON ERROR OF RANK {}: {}".format(origNumIncorp - numIncorp, self.rank, reconError)) if (reconError > 0.08): self.increaseEll = True -# print(self.rank, " aodijwaoij 10") self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 self.num_incorporated_images += 1 @@ -278,15 +263,12 @@ def john_rotate(self): in Computer Science, vol 8737. Springer, Berlin, Heidelberg. https://doi.org/10.1007/978-3-662-44777-2_39 """ - try: [_,s,Vt] = svd(self.sketch , full_matrices=False) except LinAlgError as err: [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) - if len(s) >= self.ell: sCopy = s.copy() - #JOHN: I think actually this should be ell+1 and ell. We lose a component otherwise. toShrink = s[:self.ell]**2 - s[self.ell-1]**2 #John: Explicitly set this value to be 0, since sometimes it is negative @@ -314,7 +296,6 @@ def john_reconstructionError(self, matrixCentered): matrixCentered: ndarray Data to compare matrix sketch to """ - matSketch = self.sketch k = 10 matrixCenteredT = matrixCentered.T @@ -330,7 +311,7 @@ def john_reconstructionError(self, matrixCentered): matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) - def lowMemoryReconstructionErrorUnscaled(self, matrixCentered): + def lowMemoryReconstructionError(self, matrixCentered): """ Compute the low memory reconstruction error of the matrix sketch against given data. 
This si the same as john_reconstructionError, @@ -341,15 +322,12 @@ def lowMemoryReconstructionErrorUnscaled(self, matrixCentered): matrixCentered: ndarray Data to compare matrix sketch to """ - -# print("{} COMPUTING ERROR".format(self.rank)) matSketch = self.sketch k = 10 matrixCenteredT = matrixCentered.T matSketchT = matSketch.T U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) G = U[:,:k] -# print("{} FINISHED COMPUTING ERROR".format(self.rank)) return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 10)/ np.linalg.norm(matrixCenteredT, 'fro')**2) @@ -385,7 +363,6 @@ def estimFrobNormSquared(self, addMe, arrs, its): Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix Analysis and Applications 2021 42:1, 202-223 """ - no_rows = arrs[-1].shape[1] v = np.random.normal(size=no_rows) v_hat = v / np.linalg.norm(v) @@ -400,14 +377,13 @@ def estimFrobNormSquared(self, addMe, arrs, its): return sumMe/its*no_rows - def gatherFreqDirs(self): + def gatherFreqDirsSerial(self): """ Gather local matrix sketches to root node and merge local sketches together. """ sendbuf = self.ell buffSizes = np.array(self.comm.allgather(sendbuf)) - if self.rank==0: origMatSketch = self.sketch.copy() origNextZeroRow = self.nextZeroRow @@ -424,7 +400,6 @@ def gatherFreqDirs(self): self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 counter += 1 -# print("DATA PROCESSED: {}".format(counter)) toReturn = self.sketch.copy() print("COMPLETED MERGE PROCESS: ", toReturn) self.sketch = origMatSketch @@ -435,12 +410,19 @@ def gatherFreqDirs(self): return def get(self): + """ + Fetch matrix sketch + """ return self.sketch[:self.ell, :] def write(self): + """ + Write matrix sketch to h5 file. + """ filename = writeDirec + '{}_sketch_{}.h5'.format(currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) + hf.create_dataset("imgsTracked", data=self.imgsTracked) self.comm.Barrier() return filename @@ -511,6 +493,9 @@ def merge(self): return def write(self): + """ + Write merged matrix sketch to h5 file + """ filename = writeDirec + '{}_merge.h5'.format(currRun) if self.rank==0: with h5py.File(filename, 'w') as hf: @@ -519,7 +504,7 @@ def write(self): return filename class ApplyCompression: - """Compute principal components of matrix sketch and apply to sketched data""" + """Compute principal components of matrix sketch and apply to data""" def __init__( self, @@ -644,54 +629,6 @@ def run(self): self.fetch_and_update_model(self.ell*6) self.imageIndicesProcessed.append((startCounter, self.psi.counter)) - -# def get_formatted_images(self, n): -# """ -# Fetch n - x image segments from run, where x is the number of 'dead' images. -# -# Parameters -# ---------- -# n : int -# number of images to retrieve -# start_index : int -# start index of subsection of data to retrieve -# end_index : int -# end index of subsection of data to retrieve -# -# Returns -# ------- -# ndarray, shape (end_index-start_index, n-x) -# n-x retrieved image segments of dimension end_index-start_index -# """ -# -# bin_factor = self.bin_factor -# downsample = self.downsample -# -# # may have to rewrite eventually when number of images becomes large, -# # i.e. 
streamed setting, either that or downsample aggressively -# imgs = self.psi.get_images(n, assemble=False) -# print(imgs.shape) -# -# toSaveImgs = bin_data(imgs, bin_factor) -# if downsample: -# imgs = bin_data(imgs, bin_factor) -# -# toSaveImgs = toSaveImgs[ -# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] -# ] -# imgs = imgs[ -# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] -# ] -# -# num_valid_imgs, p, x, y = imgs.shape -# toSave_num_valid_imgs, toSave_p, toSave_x, toSave_y = toSaveImgs.shape -# -# formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T -# toSave_formatted_imgs = np.reshape(toSaveImgs, (toSave_num_valid_imgs, toSave_p * toSave_x * toSave_y)).T -# print(toSave_formatted_imgs.shape) -# -# return (formatted_imgs,toSave_formatted_imgs) - def get_formatted_images(self, n): """ Fetch n - x image segments from run, where x is the number of 'dead' images. @@ -731,6 +668,19 @@ def get_formatted_images(self, n): return formatted_imgs def assembleImgsToSave(self, imgs): + """ + Form the images from psana pixel index map and downsample images. + + Parameters + ---------- + imgs: ndarray + images to downsample + + Notes + ----- + There is no need to use a for loop here, since assemble_image_stack_batch + works on batches of images, and reshape can as well. + """ pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) saveMe = [] @@ -742,23 +692,14 @@ def assembleImgsToSave(self, imgs): saveMe = np.array(saveMe) return saveMe -# print("IMGS TO SAVE SHAPE: ", imgs.shape) -# saveMe = [] -# for img in imgs: -# saveMe.append(np.array(Image.fromarray(img, mode='L').resize((150, 150), Image.Resampling.BICUBIC))) -# saveMe = np.array(saveMe) -# print("RESIZED IMGS TO SAVE SHAPE: ", saveMe.shape) -# return saveMe - - def fetch_and_update_model(self, n): """ - Fetch images and update model. + Fetch and downsample data, apply projection algorithm Parameters ---------- n : int - number of images to incorporate + number of images to process """ img_batch = self.get_formatted_images(n) toSave_img_batch = self.assembleImgsToSave(img_batch) @@ -769,90 +710,26 @@ def fetch_and_update_model(self, n): self.john_apply_compression(img_batch) def john_apply_compression(self, X): + """ + Project data X onto matrix sketch space. + + Parameters + ---------- + X: ndarray + data to project + """ if self.processedData is None: self.processedData = np.dot(X.T, self.components.T) else: self.processedData = np.vstack((self.processedData, np.dot(X.T, self.components.T))) def write(self): + """ + Write projected data and downsampled data to h5 file + """ filename = writeDirec + '{}_ProjectedData_{}.h5'.format(currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("ProjectedData", data=self.processedData) hf.create_dataset("SmallImages", data=self.smallImgs) self.comm.Barrier() return filename - -def parse_input(): - """ - Parse command line input. 
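Note on the projection step: john_apply_compression above reduces each image to its coordinates in the sketch's row space; the components come from an SVD of the sketch and the projection is a single matrix product. A minimal standalone illustration of that step (the array names and sizes below are made up for the example, not taken from the patch):

    import numpy as np

    rng = np.random.default_rng(0)
    sketch = rng.normal(size=(25, 400))      # stand-in for the ell x d matrix sketch
    X = rng.normal(size=(400, 8))            # a batch of 8 flattened images, one per column

    # The right singular vectors of the sketch play the role of principal components.
    _, _, Vt = np.linalg.svd(sketch, full_matrices=False)
    projected = X.T @ Vt.T                   # shape (8, 25): one row of coordinates per image
    print(projected.shape)

The module applies the same product batch by batch and stacks the results into processedData before writing them to the h5 file.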
- """ - - parser = argparse.ArgumentParser() - parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) - parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) - parser.add_argument( - "-d", - "--det_type", - help="Detector name, e.g epix10k2M or jungfrau4M.", - required=True, - type=str, - ) - parser.add_argument( - "--start_offset", - help="Run index of first image to be incorporated into iPCA model.", - required=False, - type=int, - ) - parser.add_argument( - "--num_components", - help="Number of principal components to compute and maintain.", - required=False, - type=int, - ) - parser.add_argument( - "--batch_size", - help="Size of image batch incorporated in each model update.", - required=False, - type=int, - ) - parser.add_argument( - "--num_images", - help="Total number of images to be incorporated into model.", - required=False, - type=int, - ) - parser.add_argument( - "--output_dir", - help="Path to output directory for recording task duration data.", - required=False, - type=str, - ) - parser.add_argument( - "--priming", - help="Initialize model with PCA.", - required=False, - action="store_true", - ) - parser.add_argument( - "--downsample", - help="Enable downsampling of images.", - required=False, - action="store_true", - ) - parser.add_argument( - "--bin_factor", - help="Bin factor if using downsizing.", - required=False, - type=int, - ) - - return parser.parse_args() - - -if __name__ == "__main__": - - params = parse_input() - kwargs = {k: v for k, v in vars(params).items() if v is not None} - - pipca = PiPCA(**kwargs) - pipca.run() From 222ba0b437eedc4849ae08f67e8a483c760a8a9d Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 30 Jul 2023 15:21:19 -0700 Subject: [PATCH 13/57] Checkpoint. 
I don't think any significant changes have been made --- btx/processing/OLDfreqdir.py | 464 ----------------------------------- btx/processing/freqdir.py | 19 +- 2 files changed, 12 insertions(+), 471 deletions(-) delete mode 100644 btx/processing/OLDfreqdir.py diff --git a/btx/processing/OLDfreqdir.py b/btx/processing/OLDfreqdir.py deleted file mode 100644 index 430ea4d22..000000000 --- a/btx/processing/OLDfreqdir.py +++ /dev/null @@ -1,464 +0,0 @@ -import os, csv, argparse - -import numpy as np -from mpi4py import MPI - -from matplotlib import pyplot as plt -from matplotlib import colors - -from btx.misc.shortcuts import TaskTimer - -from btx.interfaces.ipsana import ( - PsanaInterface, - bin_data, - bin_pixel_index_map, - retrieve_pixel_index_map, - assemble_image_stack_batch, -) - -########################################### -#John Imports -from numpy import zeros, sqrt, dot, diag -from numpy.linalg import svd, LinAlgError -from scipy.linalg import svd as scipy_svd -import numpy as np - -import time - -from datetime import datetime -currRun = datetime.now().strftime("%y%m%d%H%M%S") - -############################################# - -class FreqDir: - - """Parallel Frequent Directions.""" - - def __init__( - self, - john_start, - tot_imgs, - ell, - alpha, - exp, - run, - det_type, - downsample=False, - bin_factor=2, - output_dir="", - ): - - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() - - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = john_start + tot_imgs*self.rank//self.size - - self.downsample = downsample - self.bin_factor = bin_factor - self.output_dir = output_dir - - ( - self.num_images, - _, - self.num_features, - ) = self.set_params(tot_imgs, ell, bin_factor) - - self.task_durations = dict({}) - - self.num_incorporated_images = 0 - - self.d = self.num_features - self.ell = ell - self.m = 2*self.ell - self.sketch = zeros( (self.m, self.d) ) - self.nextZeroRow = 0 - self.alpha = alpha - - self.noImgsToProcess = tot_imgs//self.size - - def set_params(self, num_images, num_components, bin_factor): - """ - Method to initialize FreqDir parameters. - - Parameters - ---------- - num_images : int - Desired number of images to incorporate into model. - num_components : int - Desired number of components for model to maintain. - bin_factor : int - Factor to bin data by. - - Returns - ------- - num_images : int - Number of images to incorporate into model. - num_components : int - Number of components for model to maintain. - num_features : int - Number of features (dimension) in each image. - """ - - max_events = self.psi.max_events - downsample = self.downsample - - num_images = min(num_images, max_events) if num_images != -1 else max_events - num_components = min(num_components, num_images) - - # set d - det_shape = self.psi.det.shape() - num_features = np.prod(det_shape).astype(int) - - if downsample: - if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: - print("Invalid bin factor, toggled off downsampling.") - self.downsample = False - else: - num_features = int(num_features / bin_factor**2) - - return num_images, num_components, num_features - - def run(self): - """ - Perform frequent directions matrix sketching - on run subject to initialization parameters. 
- """ - - for batch in range(0,self.noImgsToProcess,self.ell): - self.fetch_and_update_model(self.ell) - - self.comm.Barrier() - - def get_formatted_images(self, n): - """ - Fetch n - x image segments from run, where x is the number of 'dead' images. - - Parameters - ---------- - n : int - number of images to retrieve - start_index : int - start index of subsection of data to retrieve - end_index : int - end index of subsection of data to retrieve - - Returns - ------- - ndarray, shape (end_index-start_index, n-x) - n-x retrieved image segments of dimension end_index-start_index - """ - - bin_factor = self.bin_factor - downsample = self.downsample - - # may have to rewrite eventually when number of images becomes large, - # i.e. streamed setting, either that or downsample aggressively - imgs = self.psi.get_images(n, assemble=False) - - if downsample: - imgs = bin_data(imgs, bin_factor) - - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] - - num_valid_imgs, p, x, y = imgs.shape - formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - - return formatted_imgs - - def fetch_and_update_model(self, n): - """ - Fetch images and update model. - - Parameters - ---------- - n : int - number of images to incorporate - """ - - img_batch = self.get_formatted_images(n) - - - self.john_update_model(img_batch) - - - def john_update_model(self, X): - """ - Update matrix sketch with new batch of observations - - Parameters - ---------- - X: ndarray - data to update matrix sketch with - """ - - _, numIncorp = X.shape - n = self.num_incorporated_images - q = self.ell - - with TaskTimer(self.task_durations, "total update"): - - if self.rank == 0: - print( - "Factoring {m} sample{s} into {n} sample, {q} component model...".format( - m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q - ) - ) - for row in X.T: - if self.nextZeroRow >= self.m: - self.john_rotate() - self.sketch[self.nextZeroRow,:] = row - self.nextZeroRow += 1 - self.num_incorporated_images += 1 -# if self.rank==0: -# print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') - - def john_rotate(self): - """ - Apply Frequent Directions Algorithm to - current matrix sketch and adjoined buffer - - Notes - ----- - Based on [1] and [2]. - - [1] Frequent Directions: Simple and Deterministic Matrix - Sketching Mina Ghashami, Edo Liberty, Jeff M. Phillips, and - David P. Woodruff SIAM Journal on Computing 2016 45:5, 1762-1792 - - [2] Ghashami, M., Desai, A., Phillips, J.M. (2014). Improved - Practical Matrix Sketching with Guarantees. In: Schulz, A.S., - Wagner, D. (eds) Algorithms - ESA 2014. ESA 2014. Lecture Notes - in Computer Science, vol 8737. Springer, Berlin, Heidelberg. 
- https://doi.org/10.1007/978-3-662-44777-2_39 - """ - - try: - [_,s,Vt] = svd(self.sketch , full_matrices=False) - except LinAlgError as err: - [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) - - if len(s) >= self.ell: - sCopy = s.copy() - - toShrink = s[:self.ell]**2 - s[self.ell-1]**2 - #John: Explicitly set this value to be 0, since sometimes it is negative - # or even turns to NaN due to roundoff error - toShrink[-1] = 0 - toShrink = sqrt(toShrink) - - toShrink[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] - - self.sketch[:self.ell:,:] = dot(diag(toShrink), Vt[:self.ell,:]) - self.sketch[self.ell:,:] = 0 - self.nextZeroRow = self.ell - else: - self.sketch[:len(s),:] = dot(diag(s), Vt[:len(s),:]) - self.sketch[len(s):,:] = 0 - self.nextZeroRow = len(s) - - def john_reconstructionError(self, matrixCentered): - """ - Compute the reconstruction error of the matrix sketch - against given data - - Parameters - ---------- - matrixCentered: ndarray - Data to compare matrix sketch to - """ - - matSketch = self.sketch - k = 10 - matrixCenteredT = matrixCentered.T - matSketchT = matSketch.T - U, S, Vt = np.linalg.svd(matSketchT) - G = U[:,:k] - UA, SA, VtA = np.linalg.svd(matrixCenteredT) - UAk = UA[:,:k] - SAk = np.diag(SA[:k]) - VtAk = VtA[:k] - Ak = UAk @ SAk @ VtAk - return (np.linalg.norm( - matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( - (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) - - def lowMemoryReconstructionErrorUnscaled(self, matrixCentered): - """ - Compute the low memory reconstruction error of the matrix sketch - against given data. This si the same as john_reconstructionError, - but estimates the norm computation and does not scale by the matrix. - - Parameters - ---------- - matrixCentered: ndarray - Data to compare matrix sketch to - """ - - matSketch = self.sketch - k = 10 - matrixCenteredT = matrixCentered.T - matSketchT = matSketch.T - U, S, Vt = np.linalg.svd(matSketchT) - G = U[:,:k] - return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) - - def estimFrobNormSquared(self, addMe, arrs, its): - """ - Estimate the Frobenius Norm of product of arrs matrices - plus addME matrix using its iterations. - - Parameters - ---------- - arrs: list of ndarray - Matrices to multiply together - - addMe: ndarray - Matrix to add to others - - its: int - Number of iterations to average over - - Returns - ------- - sumMe/its*no_rows : float - Estimate of frobenius norm of produce - of arrs matrices plus addMe matrix - - Notes - ----- - Frobenius estimation is the expected value of matrix - multiplied by random vector from multivariate normal distribution - based on [1]. - - [1] Norm and Trace Estimation with Random Rank-one Vectors - Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix - Analysis and Applications 2021 42:1, 202-223 - """ - - no_rows = arrs[-1].shape[1] - v = np.random.normal(size=no_rows) - v_hat = v / np.linalg.norm(v) - sumMe = 0 - for j in range(its): - v = np.random.normal(size=no_rows) - v_hat = v / np.linalg.norm(v) - v_addMe = addMe @ v_hat - for arr in arrs[::-1]: - v_hat = arr @ v_hat - sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 - return sumMe/its*no_rows - - - def gatherFreqDirs(self): - print("STARTING GATHER") - """ - Gather local matrix sketches to root node and - merge local sketches together. 
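Note on the norm estimator: estimFrobNormSquared above (kept essentially unchanged in the new freqdir.py) estimates a Frobenius norm without forming the matrix explicitly, using the fact that for a random unit vector v the expectation of ||M v||^2 times the dimension equals ||M||_F^2. A small self-contained check of that estimator (matrix size and iteration count are arbitrary):

    import numpy as np

    def estimate_frob_norm_squared(matvec, dim, its, rng):
        # Average ||M v||^2 over random unit vectors v and rescale by dim;
        # the expectation of this quantity is ||M||_F^2.
        total = 0.0
        for _ in range(its):
            v = rng.normal(size=dim)
            v /= np.linalg.norm(v)
            total += np.linalg.norm(matvec(v)) ** 2
        return total / its * dim

    rng = np.random.default_rng(0)
    M = rng.normal(size=(50, 200))
    approx = estimate_frob_norm_squared(lambda v: M @ v, dim=200, its=1000, rng=rng)
    exact = np.linalg.norm(M, 'fro') ** 2
    print(approx, exact)   # the estimate converges to the exact value as its grows

In the module, the matrix being estimated is the residual A^T - G G^T A^T, supplied implicitly through the addMe argument and the list of factors in arrs.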
- """ - - self.comm.Barrier() - sendbuf = self.sketch[:self.ell,:] - recvbuf = None - if self.rank == 0: - recvbuf = np.empty( - [self.size, self.ell, self.d], dtype=np.double) - self.comm.Gather(sendbuf, recvbuf, root=0) - print("{} FINISHED GATHER".format(self.rank)) - if self.rank==0: - origMatSketch = self.sketch.copy() - origNextZeroRow = self.nextZeroRow - self.nextZeroRow = self.ell - print("BUFFER SHAPE: ", recvbuf.shape) - for j in range(1, self.size): - print("CURRENT BUFFER: ", j) - print(recvbuf[j]) - for row in recvbuf[j]: - if(np.any(row)): - if self.nextZeroRow >= self.m: - self.john_rotate() - self.sketch[self.nextZeroRow,:] = row - self.nextZeroRow += 1 - toReturn = self.sketch.copy() - self.sketch = origMatSketch - return toReturn - else: - return - -def parse_input(): - """ - Parse command line input. - """ - - parser = argparse.ArgumentParser() - parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) - parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) - parser.add_argument( - "-d", - "--det_type", - help="Detector name, e.g epix10k2M or jungfrau4M.", - required=True, - type=str, - ) - parser.add_argument( - "--start_offset", - help="Run index of first image to be incorporated into iPCA model.", - required=False, - type=int, - ) - parser.add_argument( - "--num_components", - help="Number of principal components to compute and maintain.", - required=False, - type=int, - ) - parser.add_argument( - "--batch_size", - help="Size of image batch incorporated in each model update.", - required=False, - type=int, - ) - parser.add_argument( - "--num_images", - help="Total number of images to be incorporated into model.", - required=False, - type=int, - ) - parser.add_argument( - "--output_dir", - help="Path to output directory for recording task duration data.", - required=False, - type=str, - ) - parser.add_argument( - "--priming", - help="Initialize model with PCA.", - required=False, - action="store_true", - ) - parser.add_argument( - "--downsample", - help="Enable downsampling of images.", - required=False, - action="store_true", - ) - parser.add_argument( - "--bin_factor", - help="Bin factor if using downsizing.", - required=False, - type=int, - ) - - return parser.parse_args() - - -if __name__ == "__main__": - - params = parse_input() - kwargs = {k: v for k, v in vars(params).items() if v is not None} - - pipca = PiPCA(**kwargs) - pipca.run() diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 77d7f3367..1683d73a6 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -32,7 +32,6 @@ from PIL import Image -writeDirec = "/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/" #writeDirec = "h5writes/" ############################################# @@ -50,13 +49,15 @@ def __init__( run, det_type, rankAdapt, + writeDirec, merger=False, mergerFeatures=0, downsample=False, bin_factor=2, - output_dir="", + output_dir="" ): + self.writeDirec = writeDirec self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() @@ -419,7 +420,7 @@ def write(self): """ Write matrix sketch to h5 file. 
""" - filename = writeDirec + '{}_sketch_{}.h5'.format(currRun, self.rank) + filename = self.writeDirec + '{}_sketch_{}.h5'.format(currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) hf.create_dataset("imgsTracked", data=self.imgsTracked) @@ -431,7 +432,7 @@ class MergeTree: """Frequent Directions Merging Object.""" - def __init__(self, divBy, readFile, dataSetName): + def __init__(self, divBy, readFile, dataSetName, writeDirec): self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() @@ -441,7 +442,7 @@ def __init__(self, divBy, readFile, dataSetName): with h5py.File(readFile, 'r') as hf: self.data = hf[dataSetName][:] - self.fd = FreqDir(0, 0, rankAdapt=False, exp='0', run='0', det_type='0', ell=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1]) + self.fd = FreqDir(0, 0, rankAdapt=False, exp='0', run='0', det_type='0', ell=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], writeDirec=writeDirec) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) @@ -451,6 +452,8 @@ def __init__(self, divBy, readFile, dataSetName): #JOHN: MUST CHECK THAT THIS ACTION ONLY FILLS UP THE SKETCH WITH THE CURRENT SKETCH FROM THE DATA self.fd.john_update_model(self.data.T) + self.writeDirec = writeDirec + def merge(self): @@ -496,7 +499,7 @@ def write(self): """ Write merged matrix sketch to h5 file """ - filename = writeDirec + '{}_merge.h5'.format(currRun) + filename = self.writeDirec + '{}_merge.h5'.format(currRun) if self.rank==0: with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.fd.sketch[:self.fd.ell, :]) @@ -517,6 +520,7 @@ def __init__( det_type, rankAdapt, readFile, dataSetName, + writeDirec, merger=False, mergerFeatures=0, downsample=False, @@ -524,6 +528,7 @@ def __init__( output_dir="" ): + self.writeDirec = writeDirec self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() @@ -727,7 +732,7 @@ def write(self): """ Write projected data and downsampled data to h5 file """ - filename = writeDirec + '{}_ProjectedData_{}.h5'.format(currRun, self.rank) + filename = self.writeDirec + '{}_ProjectedData_{}.h5'.format(currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("ProjectedData", data=self.processedData) hf.create_dataset("SmallImages", data=self.smallImgs) From fb68d4b76c944c4209d77330acdbf43d67c2d055 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 3 Aug 2023 04:16:24 -0700 Subject: [PATCH 14/57] Refactored code and addressed many of the code review comments. 
--- btx/processing/freqdir.py | 440 ++++++++++++++++++-------------------- 1 file changed, 209 insertions(+), 231 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 1683d73a6..4a4ccdebc 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1,6 +1,10 @@ import os, csv, argparse import numpy as np +from numpy import zeros, sqrt, dot, diag +from numpy.linalg import svd, LinAlgError +from scipy.linalg import svd as scipy_svd + from mpi4py import MPI from matplotlib import pyplot as plt @@ -16,92 +20,130 @@ assemble_image_stack_batch, ) -########################################### -#John Imports -from numpy import zeros, sqrt, dot, diag -from numpy.linalg import svd, LinAlgError -from scipy.linalg import svd as scipy_svd -import numpy as np - import time -from datetime import datetime -currRun = datetime.now().strftime("%y%m%d%H%M%S") - import h5py - from PIL import Image -#writeDirec = "h5writes/" -############################################# class FreqDir: - """Parallel Frequent Directions.""" + """ + Parallel Frequent Directions. + + Based on [1] and [2]. Frequent Directions is a matrix sketching algorithm used to + approximate large data sets. The basic goal of matrix sketching is to process an + n x d matrix A to somehow represent a matrix B so that ||A-B|| or covariance error + is small. Frequent Directions provably acheives a spectral bound on covariance + error and greatly outperforms comparable existing sketching techniques. It acheives + similar runtime and performance to incremental SVD as well. + + In this module we implement the frequent directions algorithm. This is the first of + three modules in this data processing pipeline, and it produces a sketch of a subset + of the data into an h5 file. The "Merge Tree" module will be responsible for merging + each of the sketches together, parallelizing the process, and the apply compression + algorithm will be responsible for using the full matrix sketch projecting the + original data to low dimensional space for data exploration. + + One novel feature of this implementation is the rank adaption feature: users have the + ability to select the approximate reconstruction error they want the sketch to operate + over, and the algorithm will adjust the rank of the sketch to meet this error bound + as data streams in. The module also gives users the ability to perform the sketching + process over thresholded and non-zero image data. + + [1] Frequent Directions: Simple and Deterministic Matrix + Sketching Mina Ghashami, Edo Liberty, Jeff M. Phillips, and + David P. Woodruff SIAM Journal on Computing 2016 45:5, 1762-1792 + + [2] Ghashami, M., Desai, A., Phillips, J.M. (2014). Improved + Practical Matrix Sketching with Guarantees. In: Schulz, A.S., + Wagner, D. (eds) Algorithms - ESA 2014. ESA 2014. Lecture Notes + in Computer Science, vol 8737. Springer, Berlin, Heidelberg. 
+ https://doi.org/10.1007/978-3-662-44777-2_39 + + Attributes: + start_offset: starting index of images to process + total_imgs: total number of images to process + ell: number of components of matrix sketch + alpha: proportion of components to not rotate in frequent directions algorithm + exp, run, det_type: experiment properties + rankAdapt: indicates whether to perform rank adaptive FD + increaseEll: internal variable indicating whether ell should be increased for rank adaption + dir: directory to write output + merger: indicates whether object will be used to merge other FD objects + mergerFeatures: used if merger is true and indicates number of features of local matrix sketches + downsample, bin: whether data should be downsampled and by how much + threshold: whether data should be thresholded (zero if less than threshold amount) + normalizeIntensity: whether data should be normalized to have total intensity of one + noZeroIntensity: whether data with low total intensity should be discarded + d: number of features (pixels) in data + m: internal frequent directions variable recording total number of components used in algorithm + sketch: numpy array housing current matrix sketch + mean: geometric mean of data processed + num_incorporated_images: number of images processed so far + imgsTracked: indices of images processed so far + currRun: Current datetime used to identify run + """ def __init__( self, - john_start, - tot_imgs, - ell, - alpha, + start_offset, + total_imgs, exp, run, det_type, - rankAdapt, - writeDirec, + dir, + currRun, + ell=0, + alpha=0, + rankAdapt=False, merger=False, mergerFeatures=0, downsample=False, bin_factor=2, - output_dir="" + threshold=False, + normalizeIntensity=False, + noZeroIntensity=False, ): - self.writeDirec = writeDirec - self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() - self.merger = merger + self.currRun = currRun + + self.merger = merger if not merger: self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = john_start + tot_imgs*self.rank//self.size - + self.psi.counter = start_offset + total_imgs*self.rank//self.size self.downsample = downsample self.bin_factor = bin_factor - self.output_dir = output_dir - ( self.num_images, - _, self.num_features, - ) = self.set_params(tot_imgs, ell, bin_factor) - - self.task_durations = dict({}) - - self.num_incorporated_images = 0 + ) = self.set_params(total_imgs, bin_factor) else: - #JOHN: NEED TO IMPROVE. THIS IS WACK. self.num_features = mergerFeatures - self.task_durations = dict({}) - self.num_incorporated_images = 0 + self.task_durations = dict({}) + self.num_incorporated_images = 0 + self.dir = dir self.d = self.num_features self.ell = ell self.m = 2*self.ell self.sketch = zeros( (self.m, self.d) ) self.nextZeroRow = 0 self.alpha = alpha - - self.noImgsToProcess = tot_imgs//self.size + self.mean = None + self.imgsTracked = [] self.rankAdapt = rankAdapt self.increaseEll = False + self.threshold = threshold + self.noZeroIntensity = noZeroIntensity + self.normalizeIntensity=normalizeIntensity - self.imgsTracked = [] - - def set_params(self, num_images, num_components, bin_factor): + def set_params(self, num_images, bin_factor): """ Method to initialize FreqDir parameters. @@ -109,8 +151,6 @@ def set_params(self, num_images, num_components, bin_factor): ---------- num_images : int Desired number of images to incorporate into model. - num_components : int - Desired number of components for model to maintain. 
bin_factor : int Factor to bin data by. @@ -118,8 +158,6 @@ def set_params(self, num_images, num_components, bin_factor): ------- num_images : int Number of images to incorporate into model. - num_components : int - Number of components for model to maintain. num_features : int Number of features (dimension) in each image. """ @@ -128,7 +166,6 @@ def set_params(self, num_images, num_components, bin_factor): downsample = self.downsample num_images = min(num_images, max_events) if num_images != -1 else max_events - num_components = min(num_components, num_images) # set d det_shape = self.psi.det.shape() @@ -141,7 +178,7 @@ def set_params(self, num_images, num_components, bin_factor): else: num_features = int(num_features / bin_factor**2) - return num_images, num_components, num_features + return num_images, num_features def run(self): """ @@ -149,8 +186,9 @@ def run(self): on run subject to initialization parameters. """ - for batch in range(0,self.noImgsToProcess,self.ell*6): - self.fetch_and_update_model(self.ell*6) + noImgsToProcess = self.num_images//self.size + for batch in range(0,noImgsToProcess,self.ell*3): + self.fetch_and_update_model(self.ell*3) def get_formatted_images(self, n): """ @@ -187,9 +225,24 @@ def get_formatted_images(self, n): ] num_valid_imgs, p, x, y = imgs.shape - formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - return formatted_imgs + img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + nimg_batch = [] + for img in img_batch: + if self.threshold: + secondQuartile = np.sort(img)[-1]//4 + nimg = (img>secondQuartile)*img + else: + nimg = img + currIntensity = np.sum(nimg.flatten()) + if self.noZeroIntensity and currIntensity<1000: + continue + else: + if currIntensity>10000 and self.normalizeIntensity: + nimg_batch.append(nimg/currIntensity) + else: + nimg_batch.append(nimg) + return np.array(nimg_batch) def fetch_and_update_model(self, n): """ @@ -201,10 +254,15 @@ def fetch_and_update_model(self, n): number of images to incorporate """ img_batch = self.get_formatted_images(n) - self.john_update_model(img_batch) + if self.mean is None: + self.mean = np.sum(img_batch, axis=0)/(img_batch.shape[0]) + else: + self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch, axis=0))/( + self.num_incorporated_images + (img_batch.shape[0])) + self.update_model(img_batch - self.mean) - def john_update_model(self, X): + def update_model(self, X): """ Update matrix sketch with new batch of observations @@ -215,16 +273,15 @@ def john_update_model(self, X): """ _, numIncorp = X.shape origNumIncorp = numIncorp - n = self.num_incorporated_images - q = self.ell with TaskTimer(self.task_durations, "total update"): if self.rank==0 and not self.merger: print( "Factoring {m} sample{s} into {n} sample, {q} component model...".format( - m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q + m=numIncorp, s="s" if numIncorp > 1 else "", n=self.num_incorporated_images, q=self.ell ) ) for row in X.T: + canRankAdapt = numIncorp > (self.ell + 15) if self.nextZeroRow >= self.m: if self.increaseEll and canRankAdapt and self.rankAdapt: @@ -235,7 +292,7 @@ def john_update_model(self, X): print("INCREASING RANK OF PROCESS {} TO {}".format(self.rank, self.ell)) else: copyBatch = self.sketch[self.ell:,:].copy() - self.john_rotate() + self.rotate() if canRankAdapt and self.rankAdapt: reconError = np.sqrt(self.lowMemoryReconstructionErrorUnscaled(copyBatch)) if (reconError > 0.08): @@ -245,7 +302,7 @@ def john_update_model(self, X): self.num_incorporated_images += 1 
numIncorp -= 1 - def john_rotate(self): + def rotate(self): """ Apply Frequent Directions Algorithm to current matrix sketch and adjoined buffer @@ -287,7 +344,7 @@ def john_rotate(self): self.sketch[len(s):,:] = 0 self.nextZeroRow = len(s) - def john_reconstructionError(self, matrixCentered): + def reconstructionError(self, matrixCentered): """ Compute the reconstruction error of the matrix sketch against given data @@ -296,6 +353,11 @@ def john_reconstructionError(self, matrixCentered): ---------- matrixCentered: ndarray Data to compare matrix sketch to + + Returns + ------- + float, + Data subtracted by data projected onto sketched space, scaled by minimum theoretical sketch """ matSketch = self.sketch k = 10 @@ -315,13 +377,18 @@ def john_reconstructionError(self, matrixCentered): def lowMemoryReconstructionError(self, matrixCentered): """ Compute the low memory reconstruction error of the matrix sketch - against given data. This si the same as john_reconstructionError, + against given data. This si the same as reconstructionError, but estimates the norm computation and does not scale by the matrix. Parameters ---------- matrixCentered: ndarray Data to compare matrix sketch to + + Returns + ------- + float, + Data subtracted by data projected onto sketched space, scaled by matrix elements """ matSketch = self.sketch k = 10 @@ -351,7 +418,7 @@ def estimFrobNormSquared(self, addMe, arrs, its): Returns ------- sumMe/its*no_rows : float - Estimate of frobenius norm of produce + Estimate of frobenius norm of product of arrs matrices plus addMe matrix Notes @@ -397,12 +464,11 @@ def gatherFreqDirsSerial(self): for row in bufferMe: if(np.any(row)): if self.nextZeroRow >= self.m: - self.john_rotate() + self.rotate() self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 counter += 1 toReturn = self.sketch.copy() - print("COMPLETED MERGE PROCESS: ", toReturn) self.sketch = origMatSketch return toReturn else: @@ -420,10 +486,12 @@ def write(self): """ Write matrix sketch to h5 file. 
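Note on the core loop: putting the pieces above together, the heart of the sketcher is the buffer-then-rotate cycle of update_model and rotate: rows are appended into the lower half of the 2*ell-row array, and whenever it fills up an SVD orthogonalizes the rows and shrinks their norms. A stripped-down, single-process version of that loop (no psana, MPI, alpha parameter, or rank adaption; ell and the toy data are illustrative):

    import numpy as np

    def frequent_directions(A, ell):
        # Maintain an ell-row sketch B whose Gram matrix B.T @ B approximates A.T @ A.
        n, d = A.shape
        B = np.zeros((2 * ell, d))
        next_zero = 0
        for row in A:
            if next_zero >= 2 * ell:
                _, s, Vt = np.linalg.svd(B, full_matrices=False)
                # Shrink every direction by the ell-th squared singular value,
                # clamping at zero to guard against round-off, as rotate() does.
                shrunk = np.sqrt(np.maximum(s[:ell] ** 2 - s[ell - 1] ** 2, 0.0))
                B[:ell] = np.diag(shrunk) @ Vt[:ell]
                B[ell:] = 0.0
                next_zero = ell
            B[next_zero] = row
            next_zero += 1
        return B[:ell]

    A = np.random.default_rng(0).normal(size=(1000, 50))
    B = frequent_directions(A, ell=10)       # 10 x 50 sketch of a 1000 x 50 stream

The shrinkage index matches the module's rotate(), which subtracts s[ell-1]**2 from the leading squared singular values.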
""" - filename = self.writeDirec + '{}_sketch_{}.h5'.format(currRun, self.rank) + filename = self.dir + '{}_sketch_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) - hf.create_dataset("imgsTracked", data=self.imgsTracked) + hf.create_dataset("mean", data=self.mean) + hf.create_dataset("imgsTracked", data=np.array(self.imgsTracked)) + hf["sketch"].attrs["numImgsIncorp"] = self.num_incorporated_images self.comm.Barrier() return filename @@ -432,7 +500,7 @@ class MergeTree: """Frequent Directions Merging Object.""" - def __init__(self, divBy, readFile, dataSetName, writeDirec): + def __init__(self, divBy, readFile, dir, allWriteDirecs, currRun): self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() @@ -440,32 +508,37 @@ def __init__(self, divBy, readFile, dataSetName, writeDirec): self.divBy = divBy with h5py.File(readFile, 'r') as hf: - self.data = hf[dataSetName][:] + self.data = hf["sketch"][:] - self.fd = FreqDir(0, 0, rankAdapt=False, exp='0', run='0', det_type='0', ell=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], writeDirec=writeDirec) + self.fd = FreqDir(0, 0, currRun = currRun, rankAdapt=False, exp='0', run='0', det_type='0', ell=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], dir=dir) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) if self.rank==0: print(self.buffSizes) - #JOHN: MUST CHECK THAT THIS ACTION ONLY FILLS UP THE SKETCH WITH THE CURRENT SKETCH FROM THE DATA - self.fd.john_update_model(self.data.T) + self.fd.update_model(self.data.T) - self.writeDirec = writeDirec + self.dir = dir + self.allWriteDirecs = allWriteDirecs - def merge(self): + self.fullMean = None + self.fullNumIncorp = 0 + self.fullImgsTracked = [] + + self.currRun = currRun + + def merge(self): """ Merge Frequent Direction Components in a tree-like fashion. 
Returns ------- finalSketch : ndarray Merged matrix sketch of cumulative data - - """ + powerNum = 1 while(powerNum < self.size): powerNum = powerNum * self.divBy @@ -484,14 +557,25 @@ def merge(self): bufferMe = np.empty(self.buffSizes[proc] * self.data.shape[1], dtype=np.double) self.comm.Recv(bufferMe, source=proc, tag=17) bufferMe = np.reshape(bufferMe, (self.buffSizes[proc], self.data.shape[1])) - self.fd.john_update_model(bufferMe.T) + self.fd.update_model(bufferMe.T) else: bufferMe = self.fd.get().copy().flatten() self.comm.Send(bufferMe, dest=root, tag=17) level += 1 if self.rank==0: - finalSketch = self.fd.get() - return finalSketch + fullLen = len(self.allWriteDirecs) + for readMe in self.allWriteDirecs: + with h5py.File(readMe, 'r') as hf: + if self.fullMean is None: + self.fullMean = hf["mean"][:] + self.fullNumIncorp = hf["sketch"].attrs["numImgsIncorp"] + self.fullImgsTracked = hf["imgsTracked"][:] + else: + self.fullMean = (self.fullMean*self.fullNumIncorp + hf["mean"][:])/(self.fullNumIncorp + + hf["sketch"].attrs["numImgsIncorp"]) + self.fullNumIncorp += hf["sketch"].attrs["numImgsIncorp"] + self.fullImgsTracked = np.vstack((self.fullImgsTracked, hf["imgsTracked"][:])) + return self.fd.get() else: return @@ -499,10 +583,13 @@ def write(self): """ Write merged matrix sketch to h5 file """ - filename = self.writeDirec + '{}_merge.h5'.format(currRun) + filename = self.dir + '{}_merge.h5'.format(self.currRun) if self.rank==0: with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.fd.sketch[:self.fd.ell, :]) + hf.create_dataset("mean", data=self.fullMean) + hf["sketch"].attrs["numImgsIncorp"] = self.fullNumIncorp + hf.create_dataset("imgsTracked", data=self.fullImgsTracked) self.comm.Barrier() return filename @@ -511,68 +598,46 @@ class ApplyCompression: def __init__( self, - john_start, - tot_imgs, - ell, - alpha, + start_offset, + total_imgs, exp, run, det_type, - rankAdapt, - readFile, dataSetName, - writeDirec, - merger=False, - mergerFeatures=0, + readFile, + dir, + batchSize, + threshold, + noZeroIntensity, + normalizeIntensity, + currRun, downsample=False, - bin_factor=2, - output_dir="" + bin_factor=2 ): - self.writeDirec = writeDirec + self.dir = dir self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() - self.merger = merger - - if not merger: - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = john_start + tot_imgs*self.rank//self.size - - self.downsample = downsample - self.bin_factor = bin_factor - self.output_dir = output_dir - - ( - self.num_images, - _, - self.num_features, - ) = self.set_params(tot_imgs, ell, bin_factor) - - self.task_durations = dict({}) - - self.num_incorporated_images = 0 - else: - #JOHN: NEED TO IMPROVE. THIS IS WACK. 
- self.num_features = mergerFeatures - self.task_durations = dict({}) - self.num_incorporated_images = 0 - self.d = self.num_features - self.ell = ell - self.m = 2*self.ell - self.sketch = zeros( (self.m, self.d) ) - self.nextZeroRow = 0 - self.alpha = alpha + self.total_imgs = total_imgs - self.noImgsToProcess = tot_imgs//self.size + self.currRun = currRun - self.rankAdapt = rankAdapt - self.increaseEll = False + self.imgGrabber = FreqDir(start_offset=start_offset,total_imgs=total_imgs, currRun = currRun, + exp=exp,run=run,det_type=det_type,dir="", downsample=downsample, bin_factor=bin_factor, + threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity) + self.batchSize = batchSize + ( + self.num_images, + self.num_features + ) = self.imgGrabber.set_params(total_imgs, bin_factor) + self.num_incorporated_images = 0 with h5py.File(readFile, 'r') as hf: - self.data = hf[dataSetName][:] + self.data = hf["sketch"][:] + self.mean = hf["mean"][:] U, S, Vt = np.linalg.svd(self.data, full_matrices=False) self.components = Vt @@ -583,94 +648,30 @@ def __init__( self.imageIndicesProcessed = [] - def set_params(self, num_images, num_components, bin_factor): - """ - Method to initialize FreqDir parameters. - - Parameters - ---------- - num_images : int - Desired number of images to incorporate into model. - num_components : int - Desired number of components for model to maintain. - bin_factor : int - Factor to bin data by. - - Returns - ------- - num_images : int - Number of images to incorporate into model. - num_components : int - Number of components for model to maintain. - num_features : int - Number of features (dimension) in each image. - """ - - max_events = self.psi.max_events - downsample = self.downsample - - num_images = min(num_images, max_events) if num_images != -1 else max_events - num_components = min(num_components, num_images) - - # set d - det_shape = self.psi.det.shape() - num_features = np.prod(det_shape).astype(int) - - if downsample: - if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: - print("Invalid bin factor, toggled off downsampling.") - self.downsample = False - else: - num_features = int(num_features / bin_factor**2) - - return num_images, num_components, num_features - def run(self): """ Retrieve sketch, project images onto new coordinates. Save new coordinates to h5 file. """ - for batch in range(0,self.noImgsToProcess,self.ell*6): - startCounter = self.psi.counter - self.fetch_and_update_model(self.ell*6) - self.imageIndicesProcessed.append((startCounter, self.psi.counter)) - - def get_formatted_images(self, n): - """ - Fetch n - x image segments from run, where x is the number of 'dead' images. 
+ noImgsToProcess = self.num_images//self.size + for batch in range(0,noImgsToProcess,self.batchSize): + self.fetch_and_process_data() - Parameters - ---------- - n : int - number of images to retrieve - start_index : int - start index of subsection of data to retrieve - end_index : int - end index of subsection of data to retrieve - Returns - ------- - ndarray, shape (end_index-start_index, n-x) - n-x retrieved image segments of dimension end_index-start_index + def fetch_and_process_data(self): """ + Fetch and downsample data, apply projection algorithm + """ + startCounter = self.imgGrabber.psi.counter + img_batch = self.imgGrabber.get_formatted_images(self.batchSize) + self.imageIndicesProcessed.append((startCounter, self.imgGrabber.psi.counter)) - bin_factor = self.bin_factor - downsample = self.downsample - - # may have to rewrite eventually when number of images becomes large, - # i.e. streamed setting, either that or downsample aggressively - imgs = self.psi.get_images(n, assemble=False) - - if downsample: - imgs = bin_data(imgs, bin_factor) - - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] - - num_valid_imgs, p, x, y = imgs.shape - formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + toSave_img_batch = self.assembleImgsToSave(img_batch) - return formatted_imgs + if self.smallImgs is None: + self.smallImgs = toSave_img_batch + else: + self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) + self.apply_compression(img_batch - self.mean) def assembleImgsToSave(self, imgs): """ @@ -680,41 +681,18 @@ def assembleImgsToSave(self, imgs): ---------- imgs: ndarray images to downsample - - Notes - ----- - There is no need to use a for loop here, since assemble_image_stack_batch - works on batches of images, and reshape can as well. """ - pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) + pixel_index_map = retrieve_pixel_index_map(self.imgGrabber.psi.det.geometry(self.imgGrabber.psi.run)) saveMe = [] for img in imgs.T: - imgRe = np.reshape(img, self.psi.det.shape()) + imgRe = np.reshape(img, self.imgGrabber.psi.det.shape()) imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) - #saveMe.append(np.array(Image.fromarray(imgRe, mode='L').resize((150, 150), Image.Resampling.BICUBIC))) saveMe.append(np.array(Image.fromarray(imgRe).resize((150, 150)))) saveMe = np.array(saveMe) return saveMe - def fetch_and_update_model(self, n): - """ - Fetch and downsample data, apply projection algorithm - - Parameters - ---------- - n : int - number of images to process - """ - img_batch = self.get_formatted_images(n) - toSave_img_batch = self.assembleImgsToSave(img_batch) - if self.smallImgs is None: - self.smallImgs = toSave_img_batch - else: - self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) - self.john_apply_compression(img_batch) - - def john_apply_compression(self, X): + def apply_compression(self, X): """ Project data X onto matrix sketch space. 
@@ -732,7 +710,7 @@ def write(self): """ Write projected data and downsampled data to h5 file """ - filename = self.writeDirec + '{}_ProjectedData_{}.h5'.format(currRun, self.rank) + filename = self.dir + '{}_ProjectedData_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("ProjectedData", data=self.processedData) hf.create_dataset("SmallImages", data=self.smallImgs) From c0f4a4290d506a9556d486acb6dce6081b8528d2 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 3 Aug 2023 14:23:23 -0700 Subject: [PATCH 15/57] Addressed more pull comments, fixed mean bug, and added more documentation. --- btx/processing/freqdir.py | 45 ++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 4a4ccdebc..81e0f208c 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -255,16 +255,36 @@ def fetch_and_update_model(self, n): """ img_batch = self.get_formatted_images(n) if self.mean is None: - self.mean = np.sum(img_batch, axis=0)/(img_batch.shape[0]) + self.mean = np.sum(img_batch.T, axis=0)/(img_batch.shape[1]) else: - self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch, axis=0))/( - self.num_incorporated_images + (img_batch.shape[0])) - self.update_model(img_batch - self.mean) + self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch.T, axis=0))/( + self.num_incorporated_images + (img_batch.shape[1])) + print("DATA IS NOW SHAPE: ", img_batch.shape) + print("SKETCH IS SHAPE: ", self.sketch.shape) + print("MEAN IS SHAPE: ", self.mean.shape) + self.update_model((img_batch.T - self.mean).T) def update_model(self, X): """ - Update matrix sketch with new batch of observations + Update matrix sketch with new batch of observations. + + The matrix sketch array is of size 2*ell. The first ell rows maintained + represent the current matrix sketch. The next ell rows form a buffer. + Each row of the data is added to the buffer until ell rows have been + accumulated. Then, we apply the rotate function to the buffer, which + incorporates the buffer data into the matrix sketch. + + Following the rotation step, it is checked if rank adaption is enabled. Then, + is checked if there is enough data to perform one full rotation/shrinkage + step. Without this check, one runs the risk of having zero rows in the + sketch, which is innaccurate in representing the data one has seen. + If one can increase the rank, the increaseEll flag is raised, and once sufficient + data has been accumulated in the buffer, the sketch and buffer size is increased. + This happens when we check if increaseEll, canRankAdapt, and rankAdapt are all true, + whereby we check if we should be increasing the rank due to high error, we + have sufficient incoming data to do so (to avoid zero rows in the matrix sketch), + and the user would like for the rank to be adaptive, respectively. Parameters ---------- @@ -304,8 +324,14 @@ def update_model(self, X): def rotate(self): """ - Apply Frequent Directions Algorithm to - current matrix sketch and adjoined buffer + Apply Frequent Directions rotation/shrinkage step to current matrix sketch and adjoined buffer. + + The Frequent Directions algorithm is inspired by the well known Misra Gries Frequent Items + algorithm. The Frequent Items problem is informally as follows: given a sequence of items, find the items which occur most frequently. 
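For concreteness, a toy version of the frequent-items pass mentioned here, ahead of the docstring's own description below (the stream contents and the value of k are made up for the example):

    def misra_gries(stream, k):
        # Keep at most k counters; when a new item arrives and no slot is free,
        # decrement all counters and drop any that reach zero.
        counters = {}
        for item in stream:
            if item in counters:
                counters[item] += 1
            elif len(counters) < k:
                counters[item] = 1
            else:
                for key in list(counters):
                    counters[key] -= 1
                    if counters[key] == 0:
                        del counters[key]
        return counters

    print(misra_gries("abacabadacab", k=2))   # only 'a' survives; rare items are pruned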
The Misra Gries Frequent Items algorithm maintains a dictionary of <= k items and counts. For each item in a sequence, if the item is in the dictionary, increase its count. if the item is not in the dictionary and the size of the dictionary is <= k, then add the item with a count of 1 to the dictionary. Otherwise, decrease all counts in the dictionary by 1 and remove any items with 0 count. Every item which occurs more than n/k times is guaranteed to appear in the output array. + + The Frequent Directions Algorithm works in an analogous way for vectors: in the same way that Frequent Items periodically deletes ell different elements, Frequent Directions periodically "shrinks? ell orthogonal vectors by roughly the same amount. To do so, at each step: 1) Data is appended to the matrix sketch (whereby the last ell rows form a buffer and are zeroed at the start of the algorithm and after each rotation). 2) Matrix Sketch is rotated from left via SVD so that its rows are orthogonal and in descending magnitude order. 3) Norm of sketch rows are shrunk so that the smallest direction is set to 0. + + This function performs the rotation and shrinkage step by performing SVD and left multiplying by the unitary U matrix, followed by a subtraction. This particular implementation follows the alpha FD algorithm, which only performs the shrinkage step on the first alpha rows of the sketch, which has been shown to perform better than vanilla FD in [2]. Notes ----- @@ -638,6 +664,9 @@ def __init__( with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] self.mean = hf["mean"][:] + + print("NEW DATA IS SHAPE: ", self.data.shape) + print("NEW MEAN IS SHAPE: ", self.mean.shape) U, S, Vt = np.linalg.svd(self.data, full_matrices=False) self.components = Vt @@ -671,7 +700,7 @@ def fetch_and_process_data(self): self.smallImgs = toSave_img_batch else: self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) - self.apply_compression(img_batch - self.mean) + self.apply_compression((img_batch.T - self.mean).T) def assembleImgsToSave(self, imgs): """ From a05ca8b8095d7c74f17006bb6801a3f71230f954 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 3 Aug 2023 20:29:30 -0700 Subject: [PATCH 16/57] Added additional documentation for MergeTree and ApplyCompression modules --- btx/processing/freqdir.py | 77 +++++++++++++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 11 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 81e0f208c..da970a751 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -61,7 +61,8 @@ class FreqDir: in Computer Science, vol 8737. Springer, Berlin, Heidelberg. 
https://doi.org/10.1007/978-3-662-44777-2_39 - Attributes: + Attributes + ---------- start_offset: starting index of images to process total_imgs: total number of images to process ell: number of components of matrix sketch @@ -72,7 +73,7 @@ class FreqDir: dir: directory to write output merger: indicates whether object will be used to merge other FD objects mergerFeatures: used if merger is true and indicates number of features of local matrix sketches - downsample, bin: whether data should be downsampled and by how much + downsample, bin_factor: whether data should be downsampled and by how much threshold: whether data should be thresholded (zero if less than threshold amount) normalizeIntensity: whether data should be normalized to have total intensity of one noZeroIntensity: whether data with low total intensity should be discarded @@ -259,9 +260,6 @@ def fetch_and_update_model(self, n): else: self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch.T, axis=0))/( self.num_incorporated_images + (img_batch.shape[1])) - print("DATA IS NOW SHAPE: ", img_batch.shape) - print("SKETCH IS SHAPE: ", self.sketch.shape) - print("MEAN IS SHAPE: ", self.mean.shape) self.update_model((img_batch.T - self.mean).T) @@ -474,7 +472,12 @@ def estimFrobNormSquared(self, addMe, arrs, its): def gatherFreqDirsSerial(self): """ Gather local matrix sketches to root node and - merge local sketches together. + merge local sketches together in a serial fashion. + + Returns + ------- + toReturn : ndarray + Sketch of all data processed by all cores """ sendbuf = self.ell buffSizes = np.array(self.comm.allgather(sendbuf)) @@ -505,12 +508,22 @@ def gatherFreqDirsSerial(self): def get(self): """ Fetch matrix sketch + + Returns + ------- + self.sketch[:self.ell,:] : ndarray + Sketch of data locally processed """ return self.sketch[:self.ell, :] def write(self): """ Write matrix sketch to h5 file. + + Returns + ------- + filename : string + Name of h5 file where sketch, mean of data, and indices of data processed is written """ filename = self.dir + '{}_sketch_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: @@ -524,7 +537,28 @@ def write(self): class MergeTree: - """Frequent Directions Merging Object.""" + """ + Class used to efficiently merge Frequent Directions Matrix Sketches + + The Frequent Directions matrix sketch has the special property that it is a mergeable + summary. This means it can be merged easily and retain the same theoretical guarantees + by stacking two sketches ontop of one another and applying the algorithm again. + + We can perform this merging process in a tree-like fashion in order to merge any + number of sketches in log number of applications of the frequent directions algorithm. + + The class is designed to take in local sketches of data from h5 files produced by + the FreqDir class (where local refers to the fact that a subset of the total number + of images has been processed by the algorithm in a single core and saved to its own h5 file). 
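The mergeable-summary property described above boils down to stacking two sketches and re-running the shrinkage step. A compact illustration of a single pairwise merge (array names and sizes are invented; the real class routes the stacking through update_model and MPI instead):

    import numpy as np

    def merge_sketches(B1, B2, ell):
        # Stack two ell x d sketches and shrink back down to ell rows.
        stacked = np.vstack((B1, B2))
        _, s, Vt = np.linalg.svd(stacked, full_matrices=False)
        shrunk = np.sqrt(np.maximum(s[:ell] ** 2 - s[ell - 1] ** 2, 0.0))
        return np.diag(shrunk) @ Vt[:ell]

    rng = np.random.default_rng(1)
    B1 = rng.normal(size=(10, 100))           # local sketch from one rank
    B2 = rng.normal(size=(10, 100))           # local sketch from another rank
    merged = merge_sketches(B1, B2, ell=10)   # still 10 x 100, usable for further merges

Because the result has the same shape and guarantees as its inputs, the same operation can be applied level by level up the tree, which is what lets the merge finish in a logarithmic number of rounds.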
+ + Attributes + ---------- + divBy: Factor to merge by at each step: number of sketches must be a power of divBy + readFile: File name of local sketch for this particular core to process + dir: directory to write output + allWriteDirecs: all file names of local sketches + currRun: Current datetime used to identify run + """ def __init__(self, divBy, readFile, dir, allWriteDirecs, currRun): self.comm = MPI.COMM_WORLD @@ -620,7 +654,31 @@ def write(self): return filename class ApplyCompression: - """Compute principal components of matrix sketch and apply to data""" + """ + Compute principal components of matrix sketch and apply to data + + Attributes + ---------- + start_offset: starting index of images to process + total_imgs: total number of images to process + exp, run, det_type: experiment properties + dir: directory to write output + downsample, bin_factor: whether data should be downsampled and by how much + threshold: whether data should be thresholded (zero if less than threshold amount) + normalizeIntensity: whether data should be normalized to have total intensity of one + noZeroIntensity: whether data with low total intensity should be discarded + readFile: H5 file with matrix sketch + batchSize: Number of images to process at each iteration + data: numpy array housing current matrix sketch + mean: geometric mean of data processed + num_incorporated_images: number of images processed so far + imgageIndicesProcessed: indices of images processed so far + currRun: Current datetime used to identify run + imgGrabber: FD object used solely to retrieve data from psana + components: Principal Components of matrix sketch + processedData: Data projected onto matrix sketch range + smallImages: Downsampled images for visualization purposes + """ def __init__( self, @@ -665,9 +723,6 @@ def __init__( self.data = hf["sketch"][:] self.mean = hf["mean"][:] - print("NEW DATA IS SHAPE: ", self.data.shape) - print("NEW MEAN IS SHAPE: ", self.mean.shape) - U, S, Vt = np.linalg.svd(self.data, full_matrices=False) self.components = Vt From 832226d7645dcb03b0114686205bee06c2a9c508 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 8 Aug 2023 03:14:06 -0700 Subject: [PATCH 17/57] Checkpoint. 
Not sure what has been changed --- btx/processing/freqdir.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index da970a751..b4b2e1efb 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -229,7 +229,7 @@ def get_formatted_images(self, n): img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T nimg_batch = [] - for img in img_batch: + for img in img_batch.T: if self.threshold: secondQuartile = np.sort(img)[-1]//4 nimg = (img>secondQuartile)*img @@ -243,7 +243,7 @@ def get_formatted_images(self, n): nimg_batch.append(nimg/currIntensity) else: nimg_batch.append(nimg) - return np.array(nimg_batch) + return np.array(nimg_batch).T def fetch_and_update_model(self, n): """ @@ -575,7 +575,7 @@ def __init__(self, divBy, readFile, dir, allWriteDirecs, currRun): sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) if self.rank==0: - print(self.buffSizes) + print("BUFFER SIZES: ", self.buffSizes) self.fd.update_model(self.data.T) From 2a298ba781aff5a0ae4a796ebffe05dbfaac27c8 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Wed, 9 Aug 2023 08:49:59 -0700 Subject: [PATCH 18/57] Checkpoint --- btx/processing/freqdir.py | 62 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index b4b2e1efb..c3c18fa75 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -24,7 +24,8 @@ import h5py from PIL import Image - +import random +import heapq class FreqDir: @@ -84,6 +85,7 @@ class FreqDir: num_incorporated_images: number of images processed so far imgsTracked: indices of images processed so far currRun: Current datetime used to identify run + samplingFactor: Proportion of batch data to process based on Priority Sampling Algorithm """ def __init__( @@ -105,6 +107,7 @@ def __init__( threshold=False, normalizeIntensity=False, noZeroIntensity=False, + samplingFactor=1.0 ): self.comm = MPI.COMM_WORLD @@ -144,6 +147,8 @@ def __init__( self.noZeroIntensity = noZeroIntensity self.normalizeIntensity=normalizeIntensity + self.samplingFactor = samplingFactor + def set_params(self, num_images, bin_factor): """ Method to initialize FreqDir parameters. 
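Note on samplingFactor: it hooks into the priority-sampling reduction whose helper classes (PrioritySampling and CustomPriorityQueue) appear further down in this patch. Each incoming row gets priority ||row||^2 / u with u drawn uniformly from (0, 1), and only the highest-priority rows of a batch are kept. A minimal version of that selection without the heap bookkeeping (names and sizes are illustrative):

    import numpy as np

    def priority_sample(rows, keep, rng):
        # Priority of row i is w_i / u_i with w_i = ||row_i||^2 and u_i ~ Uniform(0, 1);
        # rows with large norm almost always survive, small rows survive occasionally.
        priorities = np.array([np.linalg.norm(r) ** 2 / rng.random() for r in rows])
        chosen = np.sort(np.argsort(priorities)[-keep:])
        return rows[chosen]

    rng = np.random.default_rng(2)
    batch = rng.normal(size=(300, 64))                    # 300 candidate rows, 64 features each
    reduced = priority_sample(batch, keep=100, rng=rng)   # 100 x 64 subset fed to the sketch

As in the module, the kept rows are passed on unweighted; the commented-out reweighting line in CustomPriorityQueue.get hints at the variant that rescales each survivor by its sampling probability.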
@@ -188,8 +193,8 @@ def run(self): """ noImgsToProcess = self.num_images//self.size - for batch in range(0,noImgsToProcess,self.ell*3): - self.fetch_and_update_model(self.ell*3) + for batch in range(0,noImgsToProcess,int(self.ell*3//self.samplingFactor)): + self.fetch_and_update_model(int(self.ell*3//self.samplingFactor)) def get_formatted_images(self, n): """ @@ -255,6 +260,15 @@ def fetch_and_update_model(self, n): number of images to incorporate """ img_batch = self.get_formatted_images(n) + + if self.samplingFactor <1: + print("PRE PSAMP REDUCTION SHAPE: ", img_batch.shape) + psamp = PrioritySampling(int(n*self.samplingFactor), self.d) + for row in img_batch.T: + psamp.update(row) + img_batch = np.array(psamp.sketch.get()).T + print("PSAMP REDUCTION SHAPE: ", img_batch.shape) + if self.mean is None: self.mean = np.sum(img_batch.T, axis=0)/(img_batch.shape[1]) else: @@ -800,3 +814,45 @@ def write(self): hf.create_dataset("SmallImages", data=self.smallImgs) self.comm.Barrier() return filename + + +class CustomPriorityQueue: + def __init__(self, max_size): + self.queue = [] + self.index = 0 # To handle items with the same priority + self.max_size = max_size + + def push(self, item, priority, origWeight): + if len(self.queue) >= self.max_size: + self.pop() # Remove the lowest-priority item if queue is full + heapq.heappush(self.queue, (priority, self.index, (item, priority, origWeight))) + self.index += 1 + + def pop(self): + return heapq.heappop(self.queue)[-1] + + def is_empty(self): + return len(self.queue) == 0 + + def size(self): + return len(self.queue) + + def get(self): + ret = [] + while self.queue: + curr = heapq.heappop(self.queue)[-1] + #ret.append(curr[0]*max(curr[1], curr[2])/curr[2]) + ret.append(curr[0]) + return ret + +class PrioritySampling: + def __init__(self, ell, d): + self.ell = ell + self.d = d + self.sketch = CustomPriorityQueue(self.ell) + + def update(self, vec): + ui = random.random() + wi = np.linalg.norm(vec)**2 + pi = wi/ui + self.sketch.push(vec, pi, wi) From d321fab0f08240e4a23ef671bda8027c5e7c499e Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 10 Aug 2023 16:06:36 -0700 Subject: [PATCH 19/57] Added priority sampling in previous commit. Addressed minor comments in code review in this commit. --- btx/processing/freqdir.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index c3c18fa75..4780f85dd 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -116,8 +116,7 @@ def __init__( self.currRun = currRun - self.merger = merger - if not merger: + if not self.merger: self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) self.psi.counter = start_offset + total_imgs*self.rank//self.size self.downsample = downsample @@ -270,7 +269,7 @@ def fetch_and_update_model(self, n): print("PSAMP REDUCTION SHAPE: ", img_batch.shape) if self.mean is None: - self.mean = np.sum(img_batch.T, axis=0)/(img_batch.shape[1]) + self.mean = np.mean(img_batch, axis=1) else: self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch.T, axis=0))/( self.num_incorporated_images + (img_batch.shape[1])) From ed9e9eaff2ad38acef8ca371f3f0f5c0da6a69b9 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Wed, 16 Aug 2023 13:12:04 -0700 Subject: [PATCH 20/57] Checkpoint. Didn't really change anythnig other than initial steps towards parent class (reverting old functions to pipca versions). 
--- btx/processing/dimRed.py | 228 ++++++++++++++++++++++++++++++++++++++ btx/processing/freqdir.py | 65 ++++++++++- 2 files changed, 288 insertions(+), 5 deletions(-) create mode 100644 btx/processing/dimRed.py diff --git a/btx/processing/dimRed.py b/btx/processing/dimRed.py new file mode 100644 index 000000000..37d410b96 --- /dev/null +++ b/btx/processing/dimRed.py @@ -0,0 +1,228 @@ +import os, csv, argparse + +import numpy as np +from mpi4py import MPI + +from matplotlib import pyplot as plt +from matplotlib import colors + +import holoviews as hv +hv.extension('bokeh') +from holoviews.streams import Params + +import panel as pn +import panel.widgets as pnw + +from btx.misc.shortcuts import TaskTimer + +from btx.interfaces.ipsana import ( + PsanaInterface, + bin_data, + bin_pixel_index_map, + retrieve_pixel_index_map, + assemble_image_stack_batch, +) + +class DimRed: + + """Parallelized Incremental Principal Component Analysis.""" + + def __init__( + self, + exp, + run, + det_type, + start_offset=0, + num_images=10, + num_components=10, + batch_size=10, + priming=False, + downsample=False, + bin_factor=2, + output_dir="", + ): + + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = start_offset + self.start_offset = start_offset + + self.priming = priming + self.downsample = downsample + self.bin_factor = bin_factor + self.output_dir = output_dir + + ( + self.num_images, + self.num_components, + self.batch_size, + self.num_features, + ) = self.set_params(num_images, num_components, batch_size, bin_factor) + + self.split_indices, self.split_counts = distribute_indices_over_ranks( + self.num_features, self.size + ) + + self.task_durations = dict({}) + + self.num_incorporated_images = 0 + self.outliers, self.pc_data = [], [] + + def get_params(self): + """ + Method to retrieve iPCA params. + + Returns + ------- + num_incorporated_images : int + number of images used to build model + num_components : int + number of components maintained in model + batch_size : int + batch size used in model updates + num_features : int + dimensionality of incorporated images + """ + return ( + self.num_incorporated_images, + self.num_components, + self.batch_size, + self.num_features, + ) + + def set_params(self, num_images, num_components, batch_size, bin_factor): + """ + Method to initialize iPCA parameters. + + Parameters + ---------- + num_images : int + Desired number of images to incorporate into model. + num_components : int + Desired number of components for model to maintain. + batch_size : int + Desired size of image block to be incorporated into model at each update. + bin_factor : int + Factor to bin data by. + + Returns + ------- + num_images : int + Number of images to incorporate into model. + num_components : int + Number of components for model to maintain. + batch_size : int + Size of image block to be incorporated into model at each update. + num_features : int + Number of features (dimension) in each image. 
+ """ + max_events = self.psi.max_events + downsample = self.downsample + + num_images = min(num_images, max_events) if num_images != -1 else max_events + num_components = min(num_components, num_images) + batch_size = min(batch_size, num_images) + + # set d + det_shape = self.psi.det.shape() + num_features = np.prod(det_shape).astype(int) + + if downsample: + if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: + print("Invalid bin factor, toggled off downsampling.") + self.downsample = False + else: + num_features = int(num_features / bin_factor**2) + + return num_images, num_components, batch_size, num_features + + + def display_dashboard(self): + """ + Displays a pipca dashboard with a PC plot and intensity heatmap. + """ + + start_img = self.start_offset + + # Create PC dictionary and widgets + PCs = {f'PC{i}' : v for i, v in enumerate(self.pc_data, start=1)} + PC_options = list(PCs) + + PCx = pnw.Select(name='X-Axis', value='PC1', options=PC_options) + PCy = pnw.Select(name='Y-Axis', value='PC2', options=PC_options) + widgets_scatter = pn.WidgetBox(PCx, PCy, width=100) + + tap_source = None + posxy = hv.streams.Tap(source=tap_source, x=0, y=0) + + # Create PC scatter plot + @pn.depends(PCx.param.value, PCy.param.value) + def create_scatter(PCx, PCy): + img_index_arr = np.arange(start_img, start_img + len(PCs[PCx])) + scatter_data = {**PCs, 'Image': img_index_arr} + + opts = dict(width=400, height=300, color='Image', cmap='rainbow', + colorbar=True, show_grid=True, toolbar='above', tools=['hover']) + scatter = hv.Points(scatter_data, kdims=[PCx, PCy], vdims=['Image'], + label="%s vs %s" % (PCx.title(), PCy.title())).opts(**opts) + + posxy.source = scatter + return scatter + + # Define function to compute heatmap based on tap location + def tap_heatmap(x, y, pcx, pcy): + # Finds the index of image closest to the tap location + img_source = None + min_diff = None + square_diff = None + + for i, (xv, yv) in enumerate(zip(PCs[pcx], PCs[pcy])): + square_diff = (x - xv) ** 2 + (y - yv) ** 2 + if (min_diff is None or square_diff < min_diff): + min_diff = square_diff + img_source = i + + # Downsample so heatmap is at most 100 x 100 + counter = self.psi.counter + self.psi.counter = start_img + img_source + img = self.psi.get_images(1) + _, x_pixels, y_pixels = img.shape + self.psi.counter = counter + + max_pixels = 100 + bin_factor_x = int(x_pixels / max_pixels) + bin_factor_y = int(y_pixels / max_pixels) + + while x_pixels % bin_factor_x != 0: + bin_factor_x += 1 + while y_pixels % bin_factor_y != 0: + bin_factor_y += 1 + + img = img.reshape((x_pixels, y_pixels)) + binned_img = img.reshape(int(x_pixels / bin_factor_x), + bin_factor_x, + int(y_pixels / bin_factor_y), + bin_factor_y).mean(-1).mean(1) + + # Creates hm_data array for heatmap + bin_x_pixels, bin_y_pixels = binned_img.shape + rows = np.tile(np.arange(bin_x_pixels).reshape((bin_x_pixels, 1)), bin_y_pixels).flatten() + cols = np.tile(np.arange(bin_y_pixels), bin_x_pixels) + + hm_data = np.stack((rows, cols, binned_img.flatten())) + hm_data = hm_data.T.reshape((bin_x_pixels * bin_y_pixels, 3)) + + opts = dict(width=400, height=300, cmap='plasma', colorbar=True, toolbar='above') + heatmap = hv.HeatMap(hm_data, label="Image %s" % (start_img+img_source)).aggregate(function=np.mean).opts(**opts) + + return heatmap + + # Connect the Tap stream to the tap_heatmap callback + stream1 = [posxy] + stream2 = Params.from_params({'pcx': PCx.param.value, 'pcy': PCy.param.value}) + tap_dmap = hv.DynamicMap(tap_heatmap, 
streams=stream1+stream2) + + return pn.Row(widgets_scatter, create_scatter, tap_dmap).servable('Cross-selector') diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 4780f85dd..9efe12c1e 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -107,7 +107,9 @@ def __init__( threshold=False, normalizeIntensity=False, noZeroIntensity=False, - samplingFactor=1.0 + samplingFactor=1.0, + num_components=10, + batch_size = 10 ): self.comm = MPI.COMM_WORLD @@ -116,15 +118,22 @@ def __init__( self.currRun = currRun + self.merger = merger if not self.merger: self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) self.psi.counter = start_offset + total_imgs*self.rank//self.size self.downsample = downsample self.bin_factor = bin_factor +# ( +# self.num_images, +# self.num_features, +# ) = self.set_params(total_imgs, bin_factor) ( - self.num_images, - self.num_features, - ) = self.set_params(total_imgs, bin_factor) + self.num_images, + self.num_components, + self.batch_size, + self.num_features, + ) = self.set_params(total_imgs, num_components, batch_size, bin_factor) else: self.num_features = mergerFeatures self.task_durations = dict({}) @@ -148,7 +157,53 @@ def __init__( self.samplingFactor = samplingFactor - def set_params(self, num_images, bin_factor): + def set_params(self, num_images, num_components, batch_size, bin_factor): + """ + Method to initialize iPCA parameters. + + Parameters + ---------- + num_images : int + Desired number of images to incorporate into model. + num_components : int + Desired number of components for model to maintain. + batch_size : int + Desired size of image block to be incorporated into model at each update. + bin_factor : int + Factor to bin data by. + + Returns + ------- + num_images : int + Number of images to incorporate into model. + num_components : int + Number of components for model to maintain. + batch_size : int + Size of image block to be incorporated into model at each update. + num_features : int + Number of features (dimension) in each image. + """ + max_events = self.psi.max_events + downsample = self.downsample + + num_images = min(num_images, max_events) if num_images != -1 else max_events + num_components = min(num_components, num_images) + batch_size = min(batch_size, num_images) + + # set d + det_shape = self.psi.det.shape() + num_features = np.prod(det_shape).astype(int) + + if downsample: + if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: + print("Invalid bin factor, toggled off downsampling.") + self.downsample = False + else: + num_features = int(num_features / bin_factor**2) + + return num_images, num_components, batch_size, num_features + + def OLD_set_params(self, num_images, bin_factor): """ Method to initialize FreqDir parameters. From 79894e5251049320ebe3f544500442224954e1d8 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 20 Aug 2023 12:12:44 -0700 Subject: [PATCH 21/57] Set up parent class for dimension reduction called DimRed. Shared functions are housed here for Frequent Directions and PIPCA module. ALso appropriately modified FD and PIPCA code. Fixed indexing issue, removed means, zeroed negative values and fixed overflowing issues. Other nice changes. 
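This commit moves the shared psana/MPI setup and parameter handling into a DimRed parent class, with FreqDir and PiPCA as subclasses that call super().__init__ and then keep only their own state (the FD sketch buffer, the iPCA factors). The following toy, self-contained sketch only illustrates that layout; the class names are stand-ins and the feature dimension is fixed rather than taken from the detector shape, unlike the real classes in the diffs below, which also wire up PsanaInterface, MPI rank/size, and downsampling.

# Toy illustration of the parent/child split introduced by this commit; names are hypothetical.
import numpy as np

class DimRedSketch:                                   # stands in for btx.processing.dimRed.DimRed
    def __init__(self, num_images, num_components, batch_size):
        self.num_images = num_images
        self.num_components = min(num_components, num_images)
        self.batch_size = min(batch_size, num_images)
        self.num_incorporated_images = 0

class FreqDirSketch(DimRedSketch):                    # stands in for FreqDir
    def __init__(self, num_images, num_components, batch_size, num_features=128):
        super().__init__(num_images, num_components, batch_size)
        self.ell = self.num_components                # sketch rank
        self.sketch = np.zeros((2 * self.ell, num_features))   # 2l x d buffer

fd = FreqDirSketch(num_images=100, num_components=10, batch_size=10)
print(fd.num_components, fd.sketch.shape)             # 10 (20, 128)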
--- btx/processing/dimRed.py | 85 ++-- btx/processing/freqdir.py | 225 ++++------- btx/processing/pipca.py | 143 +++++-- btx/processing/pipcaOLD.py | 790 +++++++++++++++++++++++++++++++++++++ 4 files changed, 1051 insertions(+), 192 deletions(-) create mode 100644 btx/processing/pipcaOLD.py diff --git a/btx/processing/dimRed.py b/btx/processing/dimRed.py index 37d410b96..06ff78c32 100644 --- a/btx/processing/dimRed.py +++ b/btx/processing/dimRed.py @@ -39,7 +39,7 @@ def __init__( priming=False, downsample=False, bin_factor=2, - output_dir="", + output_dir="" ): self.comm = MPI.COMM_WORLD @@ -139,90 +139,127 @@ def set_params(self, num_images, num_components, batch_size, bin_factor): return num_images, num_components, batch_size, num_features - def display_dashboard(self): """ Displays a pipca dashboard with a PC plot and intensity heatmap. """ - + start_img = self.start_offset - + # Create PC dictionary and widgets PCs = {f'PC{i}' : v for i, v in enumerate(self.pc_data, start=1)} PC_options = list(PCs) - + PCx = pnw.Select(name='X-Axis', value='PC1', options=PC_options) PCy = pnw.Select(name='Y-Axis', value='PC2', options=PC_options) widgets_scatter = pn.WidgetBox(PCx, PCy, width=100) - + tap_source = None posxy = hv.streams.Tap(source=tap_source, x=0, y=0) - + # Create PC scatter plot @pn.depends(PCx.param.value, PCy.param.value) def create_scatter(PCx, PCy): img_index_arr = np.arange(start_img, start_img + len(PCs[PCx])) scatter_data = {**PCs, 'Image': img_index_arr} - - opts = dict(width=400, height=300, color='Image', cmap='rainbow', + + opts = dict(width=400, height=300, color='Image', cmap='rainbow', colorbar=True, show_grid=True, toolbar='above', tools=['hover']) - scatter = hv.Points(scatter_data, kdims=[PCx, PCy], vdims=['Image'], + scatter = hv.Points(scatter_data, kdims=[PCx, PCy], vdims=['Image'], label="%s vs %s" % (PCx.title(), PCy.title())).opts(**opts) - + posxy.source = scatter return scatter - + # Define function to compute heatmap based on tap location def tap_heatmap(x, y, pcx, pcy): # Finds the index of image closest to the tap location img_source = None min_diff = None square_diff = None - - for i, (xv, yv) in enumerate(zip(PCs[pcx], PCs[pcy])): + + for i, (xv, yv) in enumerate(zip(PCs[pcx], PCs[pcy])): square_diff = (x - xv) ** 2 + (y - yv) ** 2 if (min_diff is None or square_diff < min_diff): min_diff = square_diff img_source = i - + # Downsample so heatmap is at most 100 x 100 counter = self.psi.counter self.psi.counter = start_img + img_source img = self.psi.get_images(1) _, x_pixels, y_pixels = img.shape self.psi.counter = counter - + max_pixels = 100 bin_factor_x = int(x_pixels / max_pixels) bin_factor_y = int(y_pixels / max_pixels) - + while x_pixels % bin_factor_x != 0: bin_factor_x += 1 while y_pixels % bin_factor_y != 0: bin_factor_y += 1 - + img = img.reshape((x_pixels, y_pixels)) binned_img = img.reshape(int(x_pixels / bin_factor_x), bin_factor_x, int(y_pixels / bin_factor_y), bin_factor_y).mean(-1).mean(1) - + # Creates hm_data array for heatmap bin_x_pixels, bin_y_pixels = binned_img.shape rows = np.tile(np.arange(bin_x_pixels).reshape((bin_x_pixels, 1)), bin_y_pixels).flatten() cols = np.tile(np.arange(bin_y_pixels), bin_x_pixels) - + hm_data = np.stack((rows, cols, binned_img.flatten())) hm_data = hm_data.T.reshape((bin_x_pixels * bin_y_pixels, 3)) - + opts = dict(width=400, height=300, cmap='plasma', colorbar=True, toolbar='above') heatmap = hv.HeatMap(hm_data, label="Image %s" % (start_img+img_source)).aggregate(function=np.mean).opts(**opts) - + 
return heatmap - + # Connect the Tap stream to the tap_heatmap callback stream1 = [posxy] stream2 = Params.from_params({'pcx': PCx.param.value, 'pcy': PCy.param.value}) tap_dmap = hv.DynamicMap(tap_heatmap, streams=stream1+stream2) - + return pn.Row(widgets_scatter, create_scatter, tap_dmap).servable('Cross-selector') + + +def distribute_indices_over_ranks(d, size): + """ + + Parameters + ---------- + d : int + total number of dimensions + size : int + number of ranks in world + + Returns + ------- + split_indices : ndarray, shape (size+1 x 1) + division indices between ranks + split_counts : ndarray, shape (size x 1) + number of dimensions allocated per rank + """ + + total_indices = 0 + split_indices, split_counts = [0], [] + + for r in range(size): + num_per_rank = d // size + if r < (d % size): + num_per_rank += 1 + + split_counts.append(num_per_rank) + + total_indices += num_per_rank + split_indices.append(total_indices) + + split_indices = np.array(split_indices) + split_counts = np.array(split_counts) + + return split_indices, split_counts + diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 9efe12c1e..f81df0065 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1,3 +1,8 @@ +import sys +sys.path.append("/sdf/home/w/winnicki/btx/") +from btx.processing.dimRed import * + + import os, csv, argparse import numpy as np @@ -27,7 +32,7 @@ import random import heapq -class FreqDir: +class FreqDir(DimRed): """ Parallel Frequent Directions. @@ -65,13 +70,13 @@ class FreqDir: Attributes ---------- start_offset: starting index of images to process - total_imgs: total number of images to process + num_imgs: total number of images to process ell: number of components of matrix sketch alpha: proportion of components to not rotate in frequent directions algorithm exp, run, det_type: experiment properties rankAdapt: indicates whether to perform rank adaptive FD increaseEll: internal variable indicating whether ell should be increased for rank adaption - dir: directory to write output + output_dir: directory to write output merger: indicates whether object will be used to merge other FD objects mergerFeatures: used if merger is true and indicates number of features of local matrix sketches downsample, bin_factor: whether data should be downsampled and by how much @@ -91,13 +96,12 @@ class FreqDir: def __init__( self, start_offset, - total_imgs, + num_imgs, exp, run, det_type, - dir, + output_dir, currRun, - ell=0, alpha=0, rankAdapt=False, merger=False, @@ -109,39 +113,29 @@ def __init__( noZeroIntensity=False, samplingFactor=1.0, num_components=10, - batch_size = 10 + batch_size = 10, + priming=False ): - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() + super().__init__(exp=exp, run=run, det_type=det_type, start_offset=start_offset, + num_images=num_imgs, num_components=num_components, batch_size=batch_size, priming=priming, + downsample=downsample, bin_factor=bin_factor, output_dir=output_dir) + + self.psi.counter = start_offset + self.num_images*self.rank//self.size self.currRun = currRun + self.output_dir = output_dir + self.merger = merger - if not self.merger: - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = start_offset + total_imgs*self.rank//self.size - self.downsample = downsample - self.bin_factor = bin_factor -# ( -# self.num_images, -# self.num_features, -# ) = self.set_params(total_imgs, bin_factor) - ( - self.num_images, - self.num_components, - 
self.batch_size, - self.num_features, - ) = self.set_params(total_imgs, num_components, batch_size, bin_factor) - else: + + if self.merger: self.num_features = mergerFeatures - self.task_durations = dict({}) + self.num_incorporated_images = 0 - self.dir = dir self.d = self.num_features - self.ell = ell + self.ell = num_components self.m = 2*self.ell self.sketch = zeros( (self.m, self.d) ) self.nextZeroRow = 0 @@ -157,89 +151,6 @@ def __init__( self.samplingFactor = samplingFactor - def set_params(self, num_images, num_components, batch_size, bin_factor): - """ - Method to initialize iPCA parameters. - - Parameters - ---------- - num_images : int - Desired number of images to incorporate into model. - num_components : int - Desired number of components for model to maintain. - batch_size : int - Desired size of image block to be incorporated into model at each update. - bin_factor : int - Factor to bin data by. - - Returns - ------- - num_images : int - Number of images to incorporate into model. - num_components : int - Number of components for model to maintain. - batch_size : int - Size of image block to be incorporated into model at each update. - num_features : int - Number of features (dimension) in each image. - """ - max_events = self.psi.max_events - downsample = self.downsample - - num_images = min(num_images, max_events) if num_images != -1 else max_events - num_components = min(num_components, num_images) - batch_size = min(batch_size, num_images) - - # set d - det_shape = self.psi.det.shape() - num_features = np.prod(det_shape).astype(int) - - if downsample: - if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: - print("Invalid bin factor, toggled off downsampling.") - self.downsample = False - else: - num_features = int(num_features / bin_factor**2) - - return num_images, num_components, batch_size, num_features - - def OLD_set_params(self, num_images, bin_factor): - """ - Method to initialize FreqDir parameters. - - Parameters - ---------- - num_images : int - Desired number of images to incorporate into model. - bin_factor : int - Factor to bin data by. - - Returns - ------- - num_images : int - Number of images to incorporate into model. - num_features : int - Number of features (dimension) in each image. 
- """ - - max_events = self.psi.max_events - downsample = self.downsample - - num_images = min(num_images, max_events) if num_images != -1 else max_events - - # set d - det_shape = self.psi.det.shape() - num_features = np.prod(det_shape).astype(int) - - if downsample: - if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: - print("Invalid bin factor, toggled off downsampling.") - self.downsample = False - else: - num_features = int(num_features / bin_factor**2) - - return num_images, num_features - def run(self): """ Perform frequent directions matrix sketching @@ -247,8 +158,8 @@ def run(self): """ noImgsToProcess = self.num_images//self.size - for batch in range(0,noImgsToProcess,int(self.ell*3//self.samplingFactor)): - self.fetch_and_update_model(int(self.ell*3//self.samplingFactor)) + for batch in range(0,noImgsToProcess,int(self.ell*2//self.samplingFactor)): + self.fetch_and_update_model(int(self.ell*2//self.samplingFactor)) def get_formatted_images(self, n): """ @@ -287,6 +198,10 @@ def get_formatted_images(self, n): num_valid_imgs, p, x, y = imgs.shape img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + + #JOHN NEW ADDITION 08 20 2023 08 55 + img_batch[img_batch<0] = 0 + nimg_batch = [] for img in img_batch.T: if self.threshold: @@ -294,16 +209,43 @@ def get_formatted_images(self, n): nimg = (img>secondQuartile)*img else: nimg = img - currIntensity = np.sum(nimg.flatten()) - if self.noZeroIntensity and currIntensity<1000: + + currIntensity = np.sum(nimg.flatten(), dtype=np.double) + if self.noZeroIntensity and currIntensity<50000: continue else: - if currIntensity>10000 and self.normalizeIntensity: + if currIntensity>=50000 and self.normalizeIntensity: nimg_batch.append(nimg/currIntensity) else: nimg_batch.append(nimg) return np.array(nimg_batch).T + ########################################################################### + + #How to use these functions: call each of them on the image. Append the result if it is not "None" to nimg_batch. + def intensityFunc_threshold(img): + if img is None: + return img + else: + secondQuartile = np.sort(img)[-1]//4 + return (img>secondQuartile)*img + + def intensityFunc_removeZeroIntensity(img, currIntensity): + if currIntensity<50000: + return None + else: + return img + + def intensityFunc_normalizeIntensity(img, currIntensity): + if img is None: + return img + + if currIntensity<50000: + return img + else: + return img/currIntensity + ########################################################################### + def fetch_and_update_model(self, n): """ Fetch images and update model. 
@@ -324,11 +266,15 @@ def fetch_and_update_model(self, n): print("PSAMP REDUCTION SHAPE: ", img_batch.shape) if self.mean is None: - self.mean = np.mean(img_batch, axis=1) +# self.mean = np.mean(img_batch, axis=1) + self.mean = np.sum(img_batch, axis=1, dtype=np.double)/(img_batch.shape[1]) else: - self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch.T, axis=0))/( +# self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch.T, axis=0))/( +# self.num_incorporated_images + (img_batch.shape[1])) + self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch, axis=1, dtype=np.double))/( self.num_incorporated_images + (img_batch.shape[1])) - self.update_model((img_batch.T - self.mean).T) +# self.update_model((img_batch.T - self.mean).T) + self.update_model(img_batch) def update_model(self, X): @@ -593,7 +539,7 @@ def write(self): filename : string Name of h5 file where sketch, mean of data, and indices of data processed is written """ - filename = self.dir + '{}_sketch_{}.h5'.format(self.currRun, self.rank) + filename = self.output_dir + '{}_sketch_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) hf.create_dataset("mean", data=self.mean) @@ -628,7 +574,7 @@ class MergeTree: currRun: Current datetime used to identify run """ - def __init__(self, divBy, readFile, dir, allWriteDirecs, currRun): + def __init__(self, exp, run, det_type, divBy, readFile, output_dir, allWriteDirecs, currRun): self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() @@ -638,16 +584,17 @@ def __init__(self, divBy, readFile, dir, allWriteDirecs, currRun): with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] - self.fd = FreqDir(0, 0, currRun = currRun, rankAdapt=False, exp='0', run='0', det_type='0', ell=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], dir=dir) + self.fd = FreqDir(0, 0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, priming=False) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) if self.rank==0: print("BUFFER SIZES: ", self.buffSizes) + print(self.data.T.shape) self.fd.update_model(self.data.T) - self.dir = dir + self.output_dir = output_dir self.allWriteDirecs = allWriteDirecs @@ -711,7 +658,7 @@ def write(self): """ Write merged matrix sketch to h5 file """ - filename = self.dir + '{}_merge.h5'.format(self.currRun) + filename = self.output_dir + '{}_merge.h5'.format(self.currRun) if self.rank==0: with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.fd.sketch[:self.fd.ell, :]) @@ -728,7 +675,7 @@ class ApplyCompression: Attributes ---------- start_offset: starting index of images to process - total_imgs: total number of images to process + num_imgs: total number of images to process exp, run, det_type: experiment properties dir: directory to write output downsample, bin_factor: whether data should be downsampled and by how much @@ -751,12 +698,12 @@ class ApplyCompression: def __init__( self, start_offset, - total_imgs, + num_imgs, exp, run, det_type, readFile, - dir, + output_dir, batchSize, threshold, noZeroIntensity, @@ -766,25 +713,24 @@ def __init__( bin_factor=2 ): - self.dir = dir + self.output_dir = output_dir self.comm = 
MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() - self.total_imgs = total_imgs + self.num_imgs = num_imgs self.currRun = currRun - self.imgGrabber = FreqDir(start_offset=start_offset,total_imgs=total_imgs, currRun = currRun, - exp=exp,run=run,det_type=det_type,dir="", downsample=downsample, bin_factor=bin_factor, - threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity) + self.imgGrabber = FreqDir(start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, + exp=exp,run=run,det_type=det_type,output_dir="", downsample=downsample, bin_factor=bin_factor, + threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False) self.batchSize = batchSize - ( - self.num_images, - self.num_features - ) = self.imgGrabber.set_params(total_imgs, bin_factor) + self.num_images = self.imgGrabber.num_images + self.num_features = self.imgGrabber.num_features + self.num_incorporated_images = 0 with h5py.File(readFile, 'r') as hf: @@ -823,7 +769,8 @@ def fetch_and_process_data(self): self.smallImgs = toSave_img_batch else: self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) - self.apply_compression((img_batch.T - self.mean).T) +# self.apply_compression((img_batch.T - self.mean).T) + self.apply_compression(img_batch) def assembleImgsToSave(self, imgs): """ @@ -862,7 +809,7 @@ def write(self): """ Write projected data and downsampled data to h5 file """ - filename = self.dir + '{}_ProjectedData_{}.h5'.format(self.currRun, self.rank) + filename = self.output_dir + '{}_ProjectedData_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("ProjectedData", data=self.processedData) hf.create_dataset("SmallImages", data=self.smallImgs) diff --git a/btx/processing/pipca.py b/btx/processing/pipca.py index 5ce47ea8f..6c401be6f 100644 --- a/btx/processing/pipca.py +++ b/btx/processing/pipca.py @@ -1,3 +1,7 @@ +import sys +sys.path.append("/sdf/home/w/winnicki/btx/") +from btx.processing.dimRed import * + import os, csv, argparse import numpy as np @@ -6,6 +10,13 @@ from matplotlib import pyplot as plt from matplotlib import colors +import holoviews as hv +hv.extension('bokeh') +from holoviews.streams import Params + +import panel as pn +import panel.widgets as pnw + from btx.misc.shortcuts import TaskTimer from btx.interfaces.ipsana import ( @@ -16,7 +27,7 @@ assemble_image_stack_batch, ) -class PiPCA: +class PiPCA(DimRed): """Parallelized Incremental Principal Component Analysis.""" @@ -34,35 +45,20 @@ def __init__( bin_factor=2, output_dir="", ): - - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() - - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = start_offset - - self.priming = priming - self.downsample = downsample - self.bin_factor = bin_factor - self.output_dir = output_dir - - ( - self.num_images, - self.num_components, - self.batch_size, - self.num_features, - ) = self.set_params(num_images, num_components, batch_size, bin_factor) - - self.split_indices, self.split_counts = distribute_indices_over_ranks( - self.num_features, self.size + super().__init__( + exp=exp, + run=run, + det_type=det_type, + start_offset=start_offset, + num_images=num_images, + num_components=num_components, + batch_size=batch_size, + priming=priming, + downsample=downsample, + bin_factor=bin_factor, + output_dir=output_dir, ) - self.task_durations = dict({}) - - 
self.num_incorporated_images = 0 - self.outliers, self.pc_data = [], [] - def get_params(self): """ Method to retrieve iPCA params. @@ -161,6 +157,8 @@ def run(self): # update model with remaining batches for batch_size in batch_sizes: self.fetch_and_update_model(batch_size) + + print("Model complete") def get_formatted_images(self, n, start_index, end_index): """ @@ -278,7 +276,7 @@ def update_model(self, X): with TaskTimer(self.task_durations, "record pc data"): if n > 0: - self.record_loadings(X, 5) + self.record_loadings(X, q) with TaskTimer(self.task_durations, "update mean and variance"): mu_n = self.mu @@ -672,6 +670,93 @@ def display_image(self, idx, output_dir="", save_image=False): plt.show() + + def display_dashboard(self): + """ + Displays a pipca dashboard with a PC plot and intensity heatmap. + """ + + start_img = self.start_offset + + # Create PC dictionary and widgets + PCs = {f'PC{i}' : v for i, v in enumerate(self.pc_data, start=1)} + PC_options = list(PCs) + + PCx = pnw.Select(name='X-Axis', value='PC1', options=PC_options) + PCy = pnw.Select(name='Y-Axis', value='PC2', options=PC_options) + widgets_scatter = pn.WidgetBox(PCx, PCy, width=100) + + tap_source = None + posxy = hv.streams.Tap(source=tap_source, x=0, y=0) + + # Create PC scatter plot + @pn.depends(PCx.param.value, PCy.param.value) + def create_scatter(PCx, PCy): + img_index_arr = np.arange(start_img, start_img + len(PCs[PCx])) + scatter_data = {**PCs, 'Image': img_index_arr} + + opts = dict(width=400, height=300, color='Image', cmap='rainbow', + colorbar=True, show_grid=True, toolbar='above', tools=['hover']) + scatter = hv.Points(scatter_data, kdims=[PCx, PCy], vdims=['Image'], + label="%s vs %s" % (PCx.title(), PCy.title())).opts(**opts) + + posxy.source = scatter + return scatter + + # Define function to compute heatmap based on tap location + def tap_heatmap(x, y, pcx, pcy): + # Finds the index of image closest to the tap location + img_source = None + min_diff = None + square_diff = None + + for i, (xv, yv) in enumerate(zip(PCs[pcx], PCs[pcy])): + square_diff = (x - xv) ** 2 + (y - yv) ** 2 + if (min_diff is None or square_diff < min_diff): + min_diff = square_diff + img_source = i + + # Downsample so heatmap is at most 100 x 100 + counter = self.psi.counter + self.psi.counter = start_img + img_source + img = self.psi.get_images(1) + _, x_pixels, y_pixels = img.shape + self.psi.counter = counter + + max_pixels = 100 + bin_factor_x = int(x_pixels / max_pixels) + bin_factor_y = int(y_pixels / max_pixels) + + while x_pixels % bin_factor_x != 0: + bin_factor_x += 1 + while y_pixels % bin_factor_y != 0: + bin_factor_y += 1 + + img = img.reshape((x_pixels, y_pixels)) + binned_img = img.reshape(int(x_pixels / bin_factor_x), + bin_factor_x, + int(y_pixels / bin_factor_y), + bin_factor_y).mean(-1).mean(1) + + # Creates hm_data array for heatmap + bin_x_pixels, bin_y_pixels = binned_img.shape + rows = np.tile(np.arange(bin_x_pixels).reshape((bin_x_pixels, 1)), bin_y_pixels).flatten() + cols = np.tile(np.arange(bin_y_pixels), bin_x_pixels) + + hm_data = np.stack((rows, cols, binned_img.flatten())) + hm_data = hm_data.T.reshape((bin_x_pixels * bin_y_pixels, 3)) + + opts = dict(width=400, height=300, cmap='plasma', colorbar=True, toolbar='above') + heatmap = hv.HeatMap(hm_data, label="Image %s" % (start_img+img_source)).aggregate(function=np.mean).opts(**opts) + + return heatmap + + # Connect the Tap stream to the tap_heatmap callback + stream1 = [posxy] + stream2 = Params.from_params({'pcx': PCx.param.value, 'pcy': 
PCy.param.value}) + tap_dmap = hv.DynamicMap(tap_heatmap, streams=stream1+stream2) + + return pn.Row(widgets_scatter, create_scatter, tap_dmap).servable('Cross-selector') def distribute_indices_over_ranks(d, size): """ diff --git a/btx/processing/pipcaOLD.py b/btx/processing/pipcaOLD.py new file mode 100644 index 000000000..5ce47ea8f --- /dev/null +++ b/btx/processing/pipcaOLD.py @@ -0,0 +1,790 @@ +import os, csv, argparse + +import numpy as np +from mpi4py import MPI + +from matplotlib import pyplot as plt +from matplotlib import colors + +from btx.misc.shortcuts import TaskTimer + +from btx.interfaces.ipsana import ( + PsanaInterface, + bin_data, + bin_pixel_index_map, + retrieve_pixel_index_map, + assemble_image_stack_batch, +) + +class PiPCA: + + """Parallelized Incremental Principal Component Analysis.""" + + def __init__( + self, + exp, + run, + det_type, + start_offset=0, + num_images=10, + num_components=10, + batch_size=10, + priming=False, + downsample=False, + bin_factor=2, + output_dir="", + ): + + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = start_offset + + self.priming = priming + self.downsample = downsample + self.bin_factor = bin_factor + self.output_dir = output_dir + + ( + self.num_images, + self.num_components, + self.batch_size, + self.num_features, + ) = self.set_params(num_images, num_components, batch_size, bin_factor) + + self.split_indices, self.split_counts = distribute_indices_over_ranks( + self.num_features, self.size + ) + + self.task_durations = dict({}) + + self.num_incorporated_images = 0 + self.outliers, self.pc_data = [], [] + + def get_params(self): + """ + Method to retrieve iPCA params. + + Returns + ------- + num_incorporated_images : int + number of images used to build model + num_components : int + number of components maintained in model + batch_size : int + batch size used in model updates + num_features : int + dimensionality of incorporated images + """ + return ( + self.num_incorporated_images, + self.num_components, + self.batch_size, + self.num_features, + ) + + def set_params(self, num_images, num_components, batch_size, bin_factor): + """ + Method to initialize iPCA parameters. + + Parameters + ---------- + num_images : int + Desired number of images to incorporate into model. + num_components : int + Desired number of components for model to maintain. + batch_size : int + Desired size of image block to be incorporated into model at each update. + bin_factor : int + Factor to bin data by. + + Returns + ------- + num_images : int + Number of images to incorporate into model. + num_components : int + Number of components for model to maintain. + batch_size : int + Size of image block to be incorporated into model at each update. + num_features : int + Number of features (dimension) in each image. 
+ """ + max_events = self.psi.max_events + downsample = self.downsample + + num_images = min(num_images, max_events) if num_images != -1 else max_events + num_components = min(num_components, num_images) + batch_size = min(batch_size, num_images) + + # set d + det_shape = self.psi.det.shape() + num_features = np.prod(det_shape).astype(int) + + if downsample: + if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: + print("Invalid bin factor, toggled off downsampling.") + self.downsample = False + else: + num_features = int(num_features / bin_factor**2) + + return num_images, num_components, batch_size, num_features + + def run(self): + """ + Perform iPCA on run subject to initialization parameters. + """ + m = self.batch_size + num_images = self.num_images + + # initialize and prime model, if specified + if self.priming: + img_batch = self.get_formatted_images( + self.num_components, 0, self.num_features + ) + self.prime_model(img_batch) + else: + self.U = np.zeros((self.split_counts[self.rank], self.num_components)) + self.S = np.ones(self.num_components) + self.mu = np.zeros((self.split_counts[self.rank], 1)) + self.total_variance = np.zeros((self.split_counts[self.rank], 1)) + + # divide remaining number of images into batches + # will become redundant in a streaming setting, need to change + rem_imgs = num_images - self.num_incorporated_images + batch_sizes = np.array( + [m] * np.floor(rem_imgs / m).astype(int) + + ([rem_imgs % m] if rem_imgs % m else []) + ) + + # update model with remaining batches + for batch_size in batch_sizes: + self.fetch_and_update_model(batch_size) + + def get_formatted_images(self, n, start_index, end_index): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. + + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + + bin_factor = self.bin_factor + downsample = self.downsample + + # may have to rewrite eventually when number of images becomes large, + # i.e. streamed setting, either that or downsample aggressively + imgs = self.psi.get_images(n, assemble=False) + + if downsample: + imgs = bin_data(imgs, bin_factor) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + + num_valid_imgs, p, x, y = imgs.shape + formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + + return formatted_imgs[start_index:end_index, :] + + def prime_model(self, X): + """ + Initialize model on sample of data using batch PCA. + + Parameters + ---------- + X : ndarray, shape (d x n) + set of n (d x 1) observations + """ + + d, n = X.shape + + if self.rank == 0: + print(f"Priming model with {n} samples...") + + + mu_full, total_variance_full = self.calculate_sample_mean_and_variance(X) + + self.mu = mu_full[self.split_indices[self.rank]:self.split_indices[self.rank+1]] + self.total_variance = total_variance_full[self.split_indices[self.rank]:self.split_indices[self.rank+1]] + + centered_data = X - np.tile(mu_full, n) + + U, self.S, _ = np.linalg.svd(centered_data, full_matrices=False) + self.U = U[self.split_indices[self.rank]:self.split_indices[self.rank+1], :] + + self.num_incorporated_images += n + + def fetch_and_update_model(self, n): + """ + Fetch images and update model. 
+ + Parameters + ---------- + n : int + number of images to incorporate + """ + + rank = self.rank + start_index, end_index = self.split_indices[rank], self.split_indices[rank + 1] + + img_batch = self.get_formatted_images(n, start_index, end_index) + + self.update_model(img_batch) + + def update_model(self, X): + """ + Update model with new batch of observations using iPCA. + + Parameters + ---------- + X : ndarray, shape (d x m) + batch of m (d x 1) observations + + Notes + ----- + Implementation of iPCA algorithm from [1]. + + References + ---------- + [1] Ross DA, Lim J, Lin RS, Yang MH. Incremental learning for robust visual tracking. + International journal of computer vision. 2008 May;77(1):125-41. + """ + _, m = X.shape + n = self.num_incorporated_images + q = self.num_components + + with TaskTimer(self.task_durations, "total update"): + + if self.rank == 0: + print( + "Factoring {m} sample{s} into {n} sample, {q} component model...".format( + m=m, s="s" if m > 1 else "", n=n, q=q + ) + ) + + with TaskTimer(self.task_durations, "record pc data"): + if n > 0: + self.record_loadings(X, 5) + + with TaskTimer(self.task_durations, "update mean and variance"): + mu_n = self.mu + mu_m, s_m = self.calculate_sample_mean_and_variance(X) + + self.total_variance = self.update_sample_variance( + self.total_variance, s_m, mu_n, mu_m, n, m + ) + self.mu = self.update_sample_mean(mu_n, mu_m, n, m) + + with TaskTimer( + self.task_durations, "center data and compute augment vector" + ): + X_centered = X - np.tile(mu_m, m) + mean_augment_vector = np.sqrt(n * m / (n + m)) * (mu_m - mu_n) + + X_augmented = np.hstack((X_centered, mean_augment_vector)) + + with TaskTimer(self.task_durations, "first matrix product U@S"): + US = self.U @ np.diag(self.S) + + with TaskTimer(self.task_durations, "QR concatenate"): + A = np.hstack((US, X_augmented)) + + with TaskTimer(self.task_durations, "parallel QR"): + Q_r, U_tilde, S_tilde = self.parallel_qr(A) + + with TaskTimer(self.task_durations, "compute local U_prime"): + self.U = Q_r @ U_tilde[:, :q] + self.S = S_tilde[:q] + + self.num_incorporated_images += m + + + def calculate_sample_mean_and_variance(self, imgs): + """ + Compute the sample mean and variance of a flattened stack of n images. + + Parameters + ---------- + imgs : ndarray, shape (d x n) + horizonally stacked batch of flattened images + + Returns + ------- + mu_m : ndarray, shape (d x 1) + mean of imgs + su_m : ndarray, shape (d x 1) + sample variance of imgs (1 dof) + """ + d, m = imgs.shape + + mu_m = np.reshape(np.mean(imgs, axis=1), (d, 1)) + s_m = np.zeros((d, 1)) + + if m > 1: + s_m = np.reshape(np.var(imgs, axis=1, ddof=1), (d, 1)) + + return mu_m, s_m + + def parallel_qr(self, A): + """ + Perform parallelized qr factorization on input matrix A. + + Parameters + ---------- + A : ndarray, shape (_ x q+m+1) + Input data to be factorized. + + Returns + ------- + q_fin : ndarray, shape (_, q+m+1) + Q_{r,1} from TSQR algorithm, where r = self.rank + 1 + U_tilde : ndarray, shape (q+m+1, q+m+1) + Q_{r,2} from TSQR algorithm, where r = self.rank + 1 + S_tilde : ndarray, shape (q+m+1) + R_tilde from TSQR algorithm, where r = self.rank + 1 + + Notes + ----- + Parallel QR algorithm implemented from [1], with additional elements from [2] + sprinkled in to record elements for iPCA using SVD, etc. + + References + ---------- + [1] Benson AR, Gleich DF, Demmel J. Direct QR factorizations for tall-and-skinny + matrices in MapReduce architectures. 
In2013 IEEE international conference on + big data 2013 Oct 6 (pp. 264-272). IEEE. + + [2] Ross DA, Lim J, Lin RS, Yang MH. Incremental learning for robust visual tracking. + International journal of computer vision. 2008 May;77(1):125-41. + + [3] Maulik, R., & Mengaldo, G. (2021, November). PyParSVD: A streaming, distributed and + randomized singular-value-decomposition library. In 2021 7th International Workshop on + Data Analysis and Reduction for Big Scientific Data (DRBSD-7) (pp. 19-25). IEEE. + """ + _, x = A.shape + q = self.num_components + m = x - q - 1 + + with TaskTimer(self.task_durations, "qr - local qr"): + Q_r1, R_r = np.linalg.qr(A, mode="reduced") + + self.comm.Barrier() + + with TaskTimer(self.task_durations, "qr - r_tot gather"): + if self.rank == 0: + R = np.empty((self.size * (q + m + 1), q + m + 1)) + else: + R = None + + self.comm.Gather(R_r, R, root=0) + + if self.rank == 0: + with TaskTimer(self.task_durations, "qr - global qr"): + Q_2, R_tilde = np.linalg.qr(R, mode="reduced") + + with TaskTimer(self.task_durations, "qr - global svd"): + U_tilde, S_tilde, _ = np.linalg.svd(R_tilde) + else: + U_tilde = np.empty((q + m + 1, q + m + 1)) + S_tilde = np.empty(q + m + 1) + Q_2 = None + + self.comm.Barrier() + + with TaskTimer(self.task_durations, "qr - scatter q_tot"): + Q_r2 = np.empty((q + m + 1, q + m + 1)) + self.comm.Scatter(Q_2, Q_r2, root=0) + + with TaskTimer(self.task_durations, "qr - local matrix build"): + Q_r = Q_r1 @ Q_r2 + + self.comm.Barrier() + + with TaskTimer(self.task_durations, "qr - bcast S_tilde"): + self.comm.Bcast(S_tilde, root=0) + + self.comm.Barrier() + + with TaskTimer(self.task_durations, "qr - bcast U_tilde"): + self.comm.Bcast(U_tilde, root=0) + + return Q_r, U_tilde, S_tilde + + def update_sample_mean(self, mu_n, mu_m, n, m): + """ + Compute combined mean of two blocks of data. + + Parameters + ---------- + mu_n : ndarray, shape (d x 1) + mean of first block of data + mu_m : ndarray, shape (d x 1) + mean of second block of data + n : int + number of observations in first block of data + m : int + number of observations in second block of data + + Returns + ------- + mu_nm : ndarray, shape (d x 1) + combined mean of both blocks of input data + """ + mu_nm = mu_m + + if n != 0: + mu_nm = (1 / (n + m)) * (n * mu_n + m * mu_m) + + return mu_nm + + def update_sample_variance(self, s_n, s_m, mu_n, mu_m, n, m): + """ + Compute combined sample variance of two blocks + of data described by input parameters. + + Parameters + ---------- + s_n : ndarray, shape (d x 1) + sample variance of first block of data + s_m : ndarray, shape (d x 1) + sample variance of second block of data + mu_n : ndarray, shape (d x 1) + mean of first block of data + mu_m : ndarray, shape (d x 1) + mean of second block of data + n : int + number of observations in first block of data + m : int + number of observations in second block of data + + Returns + ------- + s_nm : ndarray, shape (d x 1) + combined sample variance of both blocks of data described by input + parameters + """ + s_nm = s_m + + if n != 0: + s_nm = (((n - 1) * s_n + (m - 1) * s_m) + + (n * m * (mu_n - mu_m) ** 2) / (n + m)) / (n + m - 1) + + return s_nm + + def get_model(self): + """ + Method to retrieve model parameters. + + Returns + ------- + U_tot : ndarray, shape (d x q) + iPCA principal axes from model. + S_tot : ndarray, shape (1 x q) + iPCA singular values from model. + mu_tot : ndarray, shape (1 x d) + Data mean computed from all input images. 
+ var_tot : ndarray, shape (1 x d) + Sample data variance computed from all input images. + """ + if self.rank == 0: + U_tot = np.empty(self.num_features * self.num_components) + mu_tot = np.empty((self.num_features, 1)) + var_tot = np.empty((self.num_features, 1)) + else: + U_tot, mu_tot, var_tot = None, None, None + + start_indices = self.split_indices[:-1] + + self.comm.Gatherv( + self.U.flatten(), + [ + U_tot, + self.split_counts * self.num_components, + start_indices * self.num_components, + MPI.DOUBLE, + ], + root=0, + ) + + if self.rank == 0: + U_tot = np.reshape(U_tot, (self.num_features, self.num_components)) + + self.comm.Gatherv( + self.mu, + [ + mu_tot, + self.split_counts * self.num_components, + start_indices, + MPI.DOUBLE, + ], + root=0, + ) + self.comm.Gatherv( + self.total_variance, + [ + var_tot, + self.split_counts * self.num_components, + start_indices, + MPI.DOUBLE, + ], + root=0, + ) + + S_tot = self.S + + return U_tot, S_tot, mu_tot, var_tot + + def get_outliers(self): + """ + Method to retrieve and print outliers on root process. + """ + + if self.rank == 0: + print(self.outliers) + + def record_loadings(self, X, q_sig): + """ + Method to store all loadings, ΣV^T, from present batch using past + model iteration. + + Parameters + ---------- + X : ndarray, shape (_ x m) + Local subdivision of current image data batch. + + q_sig : int + The q_sig components used in generating the loadings for + """ + _, m = X.shape + n, d = self.num_incorporated_images, self.num_features + + start_indices = self.split_indices[:-1] + + U, _, mu, _ = self.get_model() + + if self.rank == 0: + X_tot = np.empty((d, m)) + else: + X_tot = None + + self.comm.Gatherv( + X.flatten(), + [ + X_tot, + self.split_counts * m, + start_indices * m, + MPI.DOUBLE, + ], + root=0, + ) + + if self.rank == 0: + + X_tot = np.reshape(X_tot, (d, m)) + cb = X_tot - np.tile(mu, (1, m)) + + pcs = U.T @ cb + self.pc_data = ( + np.concatenate((self.pc_data, pcs), axis=1) + if len(self.pc_data) + else pcs + ) + + pc_dist = np.linalg.norm(pcs[:q_sig], axis=0) + std = np.std(pc_dist) + mu = np.mean(pc_dist) + + batch_outliers = np.where(np.abs(pc_dist - mu) > std)[0] + n - m + + self.outliers = ( + np.concatenate((self.outliers, batch_outliers), axis=0) + if len(self.outliers) + else batch_outliers + ) + + def display_image(self, idx, output_dir="", save_image=False): + """ + Method to retrieve single image from run subject to model binning constraints. + + Parameters + ---------- + idx : int + Run index of image to be retrieved. 
+ output_dir : str, optional + File path to output directory, by default "" + save_image : bool, optional + Whether to save image to file, by default False + """ + + U, S, mu, var = self.get_model() + + if self.rank != 0: + return + + bin_factor = 1 + if self.downsample: + bin_factor = self.bin_factor + + n, q, m, d = self.get_params() + + a, b, c = self.psi.det.shape() + b = int(b / bin_factor) + c = int(c / bin_factor) + + fig, ax = plt.subplots(1) + + counter = self.psi.counter + self.psi.counter = idx + img = self.get_formatted_images(1, 0, d) + self.psi.counter = counter + + img = img - mu + img = np.reshape(img, (a, b, c)) + + pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) + binned_pim = bin_pixel_index_map(pixel_index_map, bin_factor) + + img = assemble_image_stack_batch(img, binned_pim) + + vmax = np.max(img.flatten()) + ax.imshow( + img, + norm=colors.SymLogNorm(linthresh=1.0, linscale=1.0, vmin=0, vmax=vmax), + interpolation=None + ) + + if save_image: + plt.savefig(output_dir) + + plt.show() + + +def distribute_indices_over_ranks(d, size): + """ + + Parameters + ---------- + d : int + total number of dimensions + size : int + number of ranks in world + + Returns + ------- + split_indices : ndarray, shape (size+1 x 1) + division indices between ranks + split_counts : ndarray, shape (size x 1) + number of dimensions allocated per rank + """ + + total_indices = 0 + split_indices, split_counts = [0], [] + + for r in range(size): + num_per_rank = d // size + if r < (d % size): + num_per_rank += 1 + + split_counts.append(num_per_rank) + + total_indices += num_per_rank + split_indices.append(total_indices) + + split_indices = np.array(split_indices) + split_counts = np.array(split_counts) + + return split_indices, split_counts + + +#### for command line use ### + + +def parse_input(): + """ + Parse command line input. 
+ """ + + parser = argparse.ArgumentParser() + parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) + parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) + parser.add_argument( + "-d", + "--det_type", + help="Detector name, e.g epix10k2M or jungfrau4M.", + required=True, + type=str, + ) + parser.add_argument( + "--start_offset", + help="Run index of first image to be incorporated into iPCA model.", + required=False, + type=int, + ) + parser.add_argument( + "--num_components", + help="Number of principal components to compute and maintain.", + required=False, + type=int, + ) + parser.add_argument( + "--batch_size", + help="Size of image batch incorporated in each model update.", + required=False, + type=int, + ) + parser.add_argument( + "--num_images", + help="Total number of images to be incorporated into model.", + required=False, + type=int, + ) + parser.add_argument( + "--output_dir", + help="Path to output directory for recording task duration data.", + required=False, + type=str, + ) + parser.add_argument( + "--priming", + help="Initialize model with PCA.", + required=False, + action="store_true", + ) + parser.add_argument( + "--downsample", + help="Enable downsampling of images.", + required=False, + action="store_true", + ) + parser.add_argument( + "--bin_factor", + help="Bin factor if using downsizing.", + required=False, + type=int, + ) + + return parser.parse_args() + + +if __name__ == "__main__": + + params = parse_input() + kwargs = {k: v for k, v in vars(params).items() if v is not None} + + pipca = PiPCA(**kwargs) + pipca.run() + pipca.get_outliers() From 2c57322fc1dc7cd21c0100db0a0ce4fef14ea84a Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Wed, 23 Aug 2023 15:08:38 -0700 Subject: [PATCH 22/57] Cleaned up code and added documentation where appropriate. --- btx/processing/dimRed.py | 6 ++--- btx/processing/freqdir.py | 55 ++++++++++++++++++++++++++++++++------- 2 files changed, 48 insertions(+), 13 deletions(-) diff --git a/btx/processing/dimRed.py b/btx/processing/dimRed.py index 06ff78c32..0bd1db85d 100644 --- a/btx/processing/dimRed.py +++ b/btx/processing/dimRed.py @@ -25,7 +25,7 @@ class DimRed: - """Parallelized Incremental Principal Component Analysis.""" + """Dimension Reduction Parent Class.""" def __init__( self, @@ -73,7 +73,7 @@ def __init__( def get_params(self): """ - Method to retrieve iPCA params. + Method to retrieve dimension reduction parameters. Returns ------- @@ -95,7 +95,7 @@ def get_params(self): def set_params(self, num_images, num_components, batch_size, bin_factor): """ - Method to initialize iPCA parameters. + Method to initialize dimension reduction parameters. Parameters ---------- diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index f81df0065..2d2125620 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -35,7 +35,7 @@ class FreqDir(DimRed): """ - Parallel Frequent Directions. + Parallel Rank Adaptive Frequent Directions. Based on [1] and [2]. Frequent Directions is a matrix sketching algorithm used to approximate large data sets. 
The basic goal of matrix sketching is to process an @@ -199,13 +199,16 @@ def get_formatted_images(self, n): img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - #JOHN NEW ADDITION 08 20 2023 08 55 img_batch[img_batch<0] = 0 nimg_batch = [] for img in img_batch.T: if self.threshold: - secondQuartile = np.sort(img)[-1]//4 +# secondQuartile = np.sort(img)[-1]//4 +# secondQuartile = np.mean(img) +# secondQuartile = np.median(img) +# secondQuartile = np.partition(img, -len(img)//4)[-len(img)//4] + secondQuartile = np.quantile(img, 0.85) nimg = (img>secondQuartile)*img else: nimg = img @@ -258,16 +261,13 @@ def fetch_and_update_model(self, n): img_batch = self.get_formatted_images(n) if self.samplingFactor <1: - print("PRE PSAMP REDUCTION SHAPE: ", img_batch.shape) psamp = PrioritySampling(int(n*self.samplingFactor), self.d) for row in img_batch.T: psamp.update(row) img_batch = np.array(psamp.sketch.get()).T - print("PSAMP REDUCTION SHAPE: ", img_batch.shape) if self.mean is None: -# self.mean = np.mean(img_batch, axis=1) - self.mean = np.sum(img_batch, axis=1, dtype=np.double)/(img_batch.shape[1]) + self.mean = np.mean(img_batch, axis=1) else: # self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch.T, axis=0))/( # self.num_incorporated_images + (img_batch.shape[1])) @@ -321,7 +321,7 @@ def update_model(self, X): self.m = 2*self.ell self.sketch = np.vstack((*self.sketch, np.zeros((20, self.d)))) self.increaseEll = False - print("INCREASING RANK OF PROCESS {} TO {}".format(self.rank, self.ell)) + print("Increasing rank of process {} to {}".format(self.rank, self.ell)) else: copyBatch = self.sketch[self.ell:,:].copy() self.rotate() @@ -539,13 +539,14 @@ def write(self): filename : string Name of h5 file where sketch, mean of data, and indices of data processed is written """ + self.comm.barrier() filename = self.output_dir + '{}_sketch_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) hf.create_dataset("mean", data=self.mean) hf.create_dataset("imgsTracked", data=np.array(self.imgsTracked)) hf["sketch"].attrs["numImgsIncorp"] = self.num_incorporated_images - self.comm.Barrier() + self.comm.barrier() return filename @@ -690,6 +691,8 @@ class ApplyCompression: imgageIndicesProcessed: indices of images processed so far currRun: Current datetime used to identify run imgGrabber: FD object used solely to retrieve data from psana + grabberToSaveImages: FD object used solely to retrieve + non-downsampled data for thumbnail generation components: Principal Components of matrix sketch processedData: Data projected onto matrix sketch range smallImages: Downsampled images for visualization purposes @@ -726,6 +729,9 @@ def __init__( self.imgGrabber = FreqDir(start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, exp=exp,run=run,det_type=det_type,output_dir="", downsample=downsample, bin_factor=bin_factor, threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False) + self.grabberToSaveImages = FreqDir(start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, + exp=exp,run=run,det_type=det_type,output_dir="", downsample=False, bin_factor=0, + threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False) self.batchSize = batchSize self.num_images = self.imgGrabber.num_images @@ -763,7 +769,7 @@ def fetch_and_process_data(self): img_batch = self.imgGrabber.get_formatted_images(self.batchSize) 
self.imageIndicesProcessed.append((startCounter, self.imgGrabber.psi.counter)) - toSave_img_batch = self.assembleImgsToSave(img_batch) + toSave_img_batch = self.assembleImgsToSave(self.grabberToSaveImages.get_formatted_images(self.batchSize)) if self.smallImgs is None: self.smallImgs = toSave_img_batch @@ -818,6 +824,11 @@ def write(self): class CustomPriorityQueue: + """ + Custom Priority Queue. + + Maintains a priority queue of items based on user-inputted priority for said items. + """ def __init__(self, max_size): self.queue = [] self.index = 0 # To handle items with the same priority @@ -847,6 +858,30 @@ def get(self): return ret class PrioritySampling: + """ + Priority Sampling. + + Based on [1] and [2]. Frequent Directions is a sampling algorithm that, + given a high-volume stream of weighted items, creates a generic sample + of a certain limited size that can later be used to estimate the total + weight of arbitrary subsets. In our case, we use Priority Sampling to + generate a matrix sketch based, sampling rows of our data using the + 2-norm as weights. Priority Sampling "first assigns each element i a random + number u_i ∈ Unif(0, 1). This implies a priority p_i = w_i/u_i , based + on its weight w_i (which for matrix rows w_i = ||a||_i^2). We then simply + retain the l rows with largest priorities, using a priority queue of size l." + + [1] Nick Duffield, Carsten Lund, and Mikkel Thorup. 2007. Priority sampling for + estimation of arbitrary subset sums. J. ACM 54, 6 (December 2007), 32–es. + https://doi.org/10.1145/1314690.1314696 + + Attributes + ---------- + ell: Number of components to keep + d: Number of features of each datapoint + sketch: Matrix Sketch maintained by Priority Queue + + """ def __init__(self, ell, d): self.ell = ell self.d = d From a3a25ed68573023809b4b52b67c617fab1e63f01 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Mon, 28 Aug 2023 08:59:43 -0700 Subject: [PATCH 23/57] Added UMAP visualization and wrapper class to FD module. Set up parent class for FD and PIPCA called dimension reduction (dimRed). Made various bug fixes. 
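A minimal standalone sketch of the priority-sampling rule described in the PrioritySampling docstring added in the patch above, assuming a fixed-size min-heap keyed on the priority p_i = w_i/u_i with w_i = ||a_i||^2; the function name and interface below are illustrative only, not the module's API:

import heapq
import numpy as np

def priority_sample(rows, ell, seed=0):
    """Keep the ell rows with the largest priorities p_i = ||a_i||^2 / u_i."""
    rng = np.random.default_rng(seed)
    heap = []  # min-heap of (priority, index, row); the lowest-priority row is evicted first
    for i, row in enumerate(rows):
        u = rng.uniform()                     # u_i ~ Unif(0, 1)
        p = np.dot(row, row) / u              # p_i = w_i / u_i with w_i = ||a_i||^2
        item = (p, i, row)
        if len(heap) < ell:
            heapq.heappush(heap, item)
        else:
            heapq.heappushpop(heap, item)     # drop the current lowest-priority row
    return np.array([row for _, _, row in heap])

# Illustrative use: keep 16 of 1000 random rows.
sampled = priority_sample(np.random.default_rng(1).standard_normal((1000, 8)), ell=16)

Keeping the ell largest priorities in a min-heap makes each update O(log ell), which is the role the CustomPriorityQueue class plays in the module.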
--- btx/processing/freqdir.py | 541 ++++++++++++++++++++++++- btx/processing/pipcaOLD.py | 790 ------------------------------------- 2 files changed, 533 insertions(+), 798 deletions(-) delete mode 100644 btx/processing/pipcaOLD.py diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 2d2125620..90914f42e 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1,14 +1,21 @@ import sys sys.path.append("/sdf/home/w/winnicki/btx/") from btx.processing.dimRed import * - import os, csv, argparse +import math +import time +import random +from collections import Counter +import h5py import numpy as np from numpy import zeros, sqrt, dot, diag from numpy.linalg import svd, LinAlgError from scipy.linalg import svd as scipy_svd +import pandas as pd +from sklearn.neighbors import NearestNeighbors +import heapq from mpi4py import MPI @@ -25,12 +32,23 @@ assemble_image_stack_batch, ) -import time - -import h5py from PIL import Image -import random -import heapq +from io import BytesIO +import base64 + +from datetime import datetime + +import umap +import hdbscan + +from matplotlib import colors +import matplotlib as mpl +from matplotlib import cm + +from bokeh.plotting import figure, show, output_file, save +from bokeh.models import HoverTool, CategoricalColorMapper, LinearColorMapper, ColumnDataSource, CustomJS, Slider, RangeSlider, Toggle, RadioButtonGroup, Range1d, Label +from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3 +from bokeh.layouts import column, row class FreqDir(DimRed): @@ -208,7 +226,7 @@ def get_formatted_images(self, n): # secondQuartile = np.mean(img) # secondQuartile = np.median(img) # secondQuartile = np.partition(img, -len(img)//4)[-len(img)//4] - secondQuartile = np.quantile(img, 0.85) + secondQuartile = np.quantile(img, 0.93) nimg = (img>secondQuartile)*img else: nimg = img @@ -793,7 +811,7 @@ def assembleImgsToSave(self, imgs): for img in imgs.T: imgRe = np.reshape(img, self.imgGrabber.psi.det.shape()) imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) - saveMe.append(np.array(Image.fromarray(imgRe).resize((150, 150)))) + saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) saveMe = np.array(saveMe) return saveMe @@ -892,3 +910,510 @@ def update(self, vec): wi = np.linalg.norm(vec)**2 pi = wi/ui self.sketch.push(vec, pi, wi) + + + + +class visualizeFD: + """ + Visualize FD Dimension Reduction using UMAP and DBSCAN + """ + def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, userGroupings): + self.inputFile = inputFile + self.outputFile = outputFile + output_file(filename=outputFile, title="Static HTML file") + self.viewResults = None + self.numImgsToUse = numImgsToUse + self.nprocs = nprocs + self.includeABOD = includeABOD + self.userGroupings = userGroupings + + def embeddable_image(self, data): + img_data = np.uint8(cm.jet(data/max(data.flatten()))*255) +# image = Image.fromarray(img_data, mode='RGBA').resize((75, 75), Image.Resampling.BICUBIC) + image = Image.fromarray(img_data, mode='RGBA') + buffer = BytesIO() + image.save(buffer, format='png') + for_encoding = buffer.getvalue() + return 'data:image/png;base64,' + base64.b64encode(for_encoding).decode('utf-8') + + def random_unique_numbers_from_range(self, start, end, count): + all_numbers = list(range(start, end + 1)) + random.shuffle(all_numbers) + return all_numbers[:count] + + def euclidean_distance(self, p1, p2): + return np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2) + + def compute_medoid(self, points): 
+ min_total_distance = float('inf') + medoid = None + for i, point in enumerate(points): + total_distance = 0 + for other_point in points: + total_distance += self.euclidean_distance(point, other_point) + if total_distance < min_total_distance: + min_total_distance = total_distance + medoid = point + return medoid + + def genMedoids(self, medoidLabels, clusterPoints): + dictMe = {} + for j in set(medoidLabels): + dictMe[j] = [] + for index, class_name in enumerate(medoidLabels): + dictMe[class_name].append((index, clusterPoints[index, 0], clusterPoints[index, 1])) + medoid_lst = [] + for k, v in dictMe.items(): + lst = [(x[1], x[2]) for x in v] + medoid_point = self.compute_medoid(lst) + for test_index, test_point in enumerate(lst): + if math.isclose(test_point[0],medoid_point[0]) and math.isclose(test_point[1], medoid_point[1]): + fin_ind = test_index + medoid_lst.append((k, v[fin_ind][0])) + return medoid_lst + + def relabel_to_closest_zero(self, labels): + unique_labels = sorted(set(labels)) + relabel_dict = {label: new_label for new_label, label in enumerate(unique_labels)} + relabeled = [relabel_dict[label] for label in labels] + return relabeled + + def regABOD(self, pts): + abofs = [] + for a in range(len(pts)): + test_list = [x for x in range(len(pts)) if x != a] + otherPts = [(d, e) for idx, d in enumerate(test_list) for e in test_list[idx + 1:]] + outlier_factors = [] + for b, c in otherPts: + apt = pts[a] + bpt = pts[b] + cpt = pts[c] + ab = bpt - apt + ac = cpt - apt + outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) + abofs.append(np.var(np.array(outlier_factors))) + return abofs + + def fastABOD(self, pts, nsamples): + nbrs = NearestNeighbors(n_neighbors=nsamples, algorithm='ball_tree').fit(pts) + k_inds = nbrs.kneighbors(pts)[1] + abofs = [] + count = 0 + for a in range(len(pts)): + test_list = k_inds[a][1:] + otherPts = [(d, e) for idx, d in enumerate(test_list) for e in test_list[idx + 1:]] + outlier_factors = [] + for (b, c) in otherPts: + apt = pts[a] + bpt = pts[b] + cpt = pts[c] + ab = bpt - apt + ac = cpt - apt + if math.isclose(np.linalg.norm(ab), 0.0) or math.isclose(np.linalg.norm(ac), 0.0): + count += 1 + continue + outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) + abofs.append(np.var(np.array(outlier_factors))) + return abofs + + def getOutliers(self, lst, divBy): + lstCopy = lst.copy() + lstCopy.sort() + quart10 = lstCopy[len(lstCopy)//divBy] + outlierInds = [] + notOutlierInds = [] + for j in range(len(lst)): + if lst[j] +
+            [... lines garbled in extraction: the end of getOutliers through the start of genHTML; the surviving HoverTool tooltip fragments read "Cluster #" and "@cluster" ...]
+ + """)) + plot_figure.circle( + 'x', + 'y', + source=datasource, + color=dict(field='ptColor', transform=color_mapping), + line_alpha=0.6, + fill_alpha=0.6, + size='medoidBold', + legend_field='cluster' + ) + plot_figure.sizing_mode = 'scale_both' + plot_figure.legend.location = "bottom_right" + plot_figure.legend.title = "Clusters" + + vals = [x for x in self.newLabels] + trueSource = ColumnDataSource(data=dict(vals = vals)) + hist, maxCount = self.genHist(vals, max(vals)) + left, right = self.genLeftRight(max(vals)) + histsource = ColumnDataSource(data=dict(hist=hist, left=left, right=right)) + p = figure(width=2000, height=450, toolbar_location=None, + title="Histogram Testing") + p.quad(source=histsource, top='hist', bottom=0, left='left', right='right', + fill_color='skyblue', line_color="white") + p.y_range = Range1d(0, maxCount) + p.x_range = Range1d(0, max(vals)+1) + p.xaxis.axis_label = "Cluster Label" + p.yaxis.axis_label = "Count" + + indexCDS = ColumnDataSource(dict( + index=[*range(0, self.numImgsToUse, 10)] + ) + ) + cols = RangeSlider(title="ET", + start=0, + end=self.numImgsToUse, + value=(0, self.numImgsToUse-1), + step=1, sizing_mode="stretch_width") + callback = CustomJS(args=dict(cols=cols, trueSource = trueSource, + histsource = histsource, datasource=datasource, indexCDS=indexCDS), code=""" + function countNumbersAtIndices(numbers, startInd, endInd, smallestVal, largestVal) { + let counts = new Array(largestVal-smallestVal); for (let i=0; i= index[index.length - 1]) { + cb_obj.label = '► Play'; + slider.value = [0, slider_val1-slider_val0]; + cb_obj.active = false; + clearInterval(looop); + } + else if(slider_val1 !== index[index.length - 1]){ + slider.value = [index.filter((item) => item > slider_val0)[0], index.filter((item) => item > slider_val1)[0]]; + } + else { + clearInterval(looop); + } + } + if(cb_obj.active == false){ + cb_obj.label = '► Play'; + clearInterval(looop); + } + else { + cb_obj.label = '❚❚ Pause'; + var looop = setInterval(check_and_iterate, 0.1, indexCDS.data['index']); + }; + """) + toggl.js_on_change('active',toggl_js) + + LABELS = ["DBSCAN Clustering", "Anomaly Detection"] + radio_button_group = RadioButtonGroup(labels=LABELS, active=0) + radioGroup_js = CustomJS(args=dict(datasource=datasource), code=""" + console.log(datasource.data.ptColor) + const x = datasource.data.x + const y = datasource.data.y + const image = datasource.data.image + const medoidBold = datasource.data.medoidBold + const cluster = datasource.data.cluster + const anomDet = datasource.data.anomDet + + let ptColor = null + + if (cb_obj.active==0){ + ptColor = cluster + } + else{ + ptColor = anomDet + } + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet} + """) + radio_button_group.js_on_change("active", radioGroup_js) + + self.viewResults = column(plot_figure, p, imgsPlot, row(cols, toggl, radio_button_group)) + + def fullVisualize(self): + self.genUMAP() + self.genABOD() + self.genLabels() + self.genHTML() + + def updateLabels(self): + self.genLabels() + self.genHTML() + + def userSave(self): + save(self.viewResults) + + def userShow(self): + from IPython.display import display, HTML + display(HTML("")) + display(HTML("")) + display(HTML("")) + display(HTML("")) + from bokeh.io import output_notebook + output_notebook() + show(self.viewResults) + + +class WrapperFullFD: + """ + Frequent Directions Data Processing Wrapper Class. 
+ """ + def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_components, alpha, rankAdapt, downsample, bin_factor, threshold, normalizeIntensity, noZeroIntensity, samplingFactor, priming, divBy, batchSize): + self.currRun = datetime.now().strftime("%y%m%d%H%M") + self.start_offset = start_offset + self.num_imgs = num_imgs + self.exp = exp + self.run = run + self.det_type = det_type + self.writeToHere = writeToHere + self.num_components=num_components + self.alpha = alpha + self.rankAdapt = rankAdapt + self.downsample=downsample + self.bin_factor= bin_factor + self.threshold= threshold + self.normalizeIntensity=normalizeIntensity + self.noZeroIntensity=noZeroIntensity + self.samplingFactor=samplingFactor + self.priming=priming + self.divBy = divBy + self.batchSize = batchSize + + def runMe(self): + stfull = time.perf_counter() + + #SKETCHING STEP + ########################################################################################## + freqDir = FreqDir(start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, + det_type=self.det_type, output_dir=self.writeToHere, num_components=self.num_components, alpha=self.alpha, rankAdapt=self.rankAdapt, + merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, + threshold=self.threshold, normalizeIntensity=self.normalizeIntensity, noZeroIntensity=self.noZeroIntensity, + currRun = self.currRun, samplingFactor=self.samplingFactor, priming=self.priming) + print("STARTING SKETCHING") + st = time.perf_counter() + freqDir.run() + localSketchFilename = freqDir.write() + et = time.perf_counter() + print("Estimated time for frequent directions rank {0}/{1}: {2}".format(freqDir.rank, freqDir.size, et - st)) + + #MERGING STEP + ########################################################################################## + if freqDir.rank<10: + fullSketchFilename = localSketchFilename[:-4] + else: + fullSketchFilename = localSketchFilename[:-5] + allNames = [] + for j in range(freqDir.size): + allNames.append(fullSketchFilename + str(j) + ".h5") + mergeTree = MergeTree(exp=self.exp, run=self.run, det_type=self.det_type, divBy=self.divBy, readFile = localSketchFilename, + output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun) + #mergeTree = MergeTree(divBy=2, readFile = localSketchFilename, + # dir=writeToHere, allWriteDirecs=allNames, currRun = currRun) + + st = time.perf_counter() + mergeTree.merge() + mergedSketchFilename = mergeTree.write() + et = time.perf_counter() + print("Estimated time merge tree for rank {0}/{1}: {2}".format(freqDir.rank, freqDir.size, et - st)) + + + + #PROJECTION STEP + ########################################################################################## + appComp = ApplyCompression(start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, + det_type=self.det_type, readFile = mergedSketchFilename, output_dir = self.writeToHere, + batchSize=self.batchSize, threshold=self.threshold, normalizeIntensity=self.normalizeIntensity, noZeroIntensity=self.noZeroIntensity, + downsample=self.downsample, bin_factor=self.bin_factor, currRun = self.currRun) + st = time.perf_counter() + appComp.run() + appComp.write() + et = time.perf_counter() + print("Estimated time projection for rank {0}/{1}: {2}".format(appComp.rank, appComp.size, et - st)) + + + etfull = time.perf_counter() + print("Estimated full processing time for rank {0}/{1}: {2}".format(appComp.rank, appComp.size, etfull - stfull)) + 
########################################################################################## + + if freqDir.rank==0: + st = time.perf_counter() + visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), + outputFile="./UMAPVis_{}.html".format(self.currRun), + numImgsToUse=self.num_imgs, + nprocs=freqDir.size, + userGroupings=[], + includeABOD=True) + visMe.fullVisualize() + visMe.userSave() + et = time.perf_counter() + print("UMAP HTML Generation Processing time: {}".format(et - st)) + print("TOTAL PROCESING TIME: {}".format(et - stfull)) + + diff --git a/btx/processing/pipcaOLD.py b/btx/processing/pipcaOLD.py deleted file mode 100644 index 5ce47ea8f..000000000 --- a/btx/processing/pipcaOLD.py +++ /dev/null @@ -1,790 +0,0 @@ -import os, csv, argparse - -import numpy as np -from mpi4py import MPI - -from matplotlib import pyplot as plt -from matplotlib import colors - -from btx.misc.shortcuts import TaskTimer - -from btx.interfaces.ipsana import ( - PsanaInterface, - bin_data, - bin_pixel_index_map, - retrieve_pixel_index_map, - assemble_image_stack_batch, -) - -class PiPCA: - - """Parallelized Incremental Principal Component Analysis.""" - - def __init__( - self, - exp, - run, - det_type, - start_offset=0, - num_images=10, - num_components=10, - batch_size=10, - priming=False, - downsample=False, - bin_factor=2, - output_dir="", - ): - - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() - - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = start_offset - - self.priming = priming - self.downsample = downsample - self.bin_factor = bin_factor - self.output_dir = output_dir - - ( - self.num_images, - self.num_components, - self.batch_size, - self.num_features, - ) = self.set_params(num_images, num_components, batch_size, bin_factor) - - self.split_indices, self.split_counts = distribute_indices_over_ranks( - self.num_features, self.size - ) - - self.task_durations = dict({}) - - self.num_incorporated_images = 0 - self.outliers, self.pc_data = [], [] - - def get_params(self): - """ - Method to retrieve iPCA params. - - Returns - ------- - num_incorporated_images : int - number of images used to build model - num_components : int - number of components maintained in model - batch_size : int - batch size used in model updates - num_features : int - dimensionality of incorporated images - """ - return ( - self.num_incorporated_images, - self.num_components, - self.batch_size, - self.num_features, - ) - - def set_params(self, num_images, num_components, batch_size, bin_factor): - """ - Method to initialize iPCA parameters. - - Parameters - ---------- - num_images : int - Desired number of images to incorporate into model. - num_components : int - Desired number of components for model to maintain. - batch_size : int - Desired size of image block to be incorporated into model at each update. - bin_factor : int - Factor to bin data by. - - Returns - ------- - num_images : int - Number of images to incorporate into model. - num_components : int - Number of components for model to maintain. - batch_size : int - Size of image block to be incorporated into model at each update. - num_features : int - Number of features (dimension) in each image. 
- """ - max_events = self.psi.max_events - downsample = self.downsample - - num_images = min(num_images, max_events) if num_images != -1 else max_events - num_components = min(num_components, num_images) - batch_size = min(batch_size, num_images) - - # set d - det_shape = self.psi.det.shape() - num_features = np.prod(det_shape).astype(int) - - if downsample: - if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: - print("Invalid bin factor, toggled off downsampling.") - self.downsample = False - else: - num_features = int(num_features / bin_factor**2) - - return num_images, num_components, batch_size, num_features - - def run(self): - """ - Perform iPCA on run subject to initialization parameters. - """ - m = self.batch_size - num_images = self.num_images - - # initialize and prime model, if specified - if self.priming: - img_batch = self.get_formatted_images( - self.num_components, 0, self.num_features - ) - self.prime_model(img_batch) - else: - self.U = np.zeros((self.split_counts[self.rank], self.num_components)) - self.S = np.ones(self.num_components) - self.mu = np.zeros((self.split_counts[self.rank], 1)) - self.total_variance = np.zeros((self.split_counts[self.rank], 1)) - - # divide remaining number of images into batches - # will become redundant in a streaming setting, need to change - rem_imgs = num_images - self.num_incorporated_images - batch_sizes = np.array( - [m] * np.floor(rem_imgs / m).astype(int) - + ([rem_imgs % m] if rem_imgs % m else []) - ) - - # update model with remaining batches - for batch_size in batch_sizes: - self.fetch_and_update_model(batch_size) - - def get_formatted_images(self, n, start_index, end_index): - """ - Fetch n - x image segments from run, where x is the number of 'dead' images. - - Parameters - ---------- - n : int - number of images to retrieve - start_index : int - start index of subsection of data to retrieve - end_index : int - end index of subsection of data to retrieve - - Returns - ------- - ndarray, shape (end_index-start_index, n-x) - n-x retrieved image segments of dimension end_index-start_index - """ - - bin_factor = self.bin_factor - downsample = self.downsample - - # may have to rewrite eventually when number of images becomes large, - # i.e. streamed setting, either that or downsample aggressively - imgs = self.psi.get_images(n, assemble=False) - - if downsample: - imgs = bin_data(imgs, bin_factor) - - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] - - num_valid_imgs, p, x, y = imgs.shape - formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - - return formatted_imgs[start_index:end_index, :] - - def prime_model(self, X): - """ - Initialize model on sample of data using batch PCA. - - Parameters - ---------- - X : ndarray, shape (d x n) - set of n (d x 1) observations - """ - - d, n = X.shape - - if self.rank == 0: - print(f"Priming model with {n} samples...") - - - mu_full, total_variance_full = self.calculate_sample_mean_and_variance(X) - - self.mu = mu_full[self.split_indices[self.rank]:self.split_indices[self.rank+1]] - self.total_variance = total_variance_full[self.split_indices[self.rank]:self.split_indices[self.rank+1]] - - centered_data = X - np.tile(mu_full, n) - - U, self.S, _ = np.linalg.svd(centered_data, full_matrices=False) - self.U = U[self.split_indices[self.rank]:self.split_indices[self.rank+1], :] - - self.num_incorporated_images += n - - def fetch_and_update_model(self, n): - """ - Fetch images and update model. 
- - Parameters - ---------- - n : int - number of images to incorporate - """ - - rank = self.rank - start_index, end_index = self.split_indices[rank], self.split_indices[rank + 1] - - img_batch = self.get_formatted_images(n, start_index, end_index) - - self.update_model(img_batch) - - def update_model(self, X): - """ - Update model with new batch of observations using iPCA. - - Parameters - ---------- - X : ndarray, shape (d x m) - batch of m (d x 1) observations - - Notes - ----- - Implementation of iPCA algorithm from [1]. - - References - ---------- - [1] Ross DA, Lim J, Lin RS, Yang MH. Incremental learning for robust visual tracking. - International journal of computer vision. 2008 May;77(1):125-41. - """ - _, m = X.shape - n = self.num_incorporated_images - q = self.num_components - - with TaskTimer(self.task_durations, "total update"): - - if self.rank == 0: - print( - "Factoring {m} sample{s} into {n} sample, {q} component model...".format( - m=m, s="s" if m > 1 else "", n=n, q=q - ) - ) - - with TaskTimer(self.task_durations, "record pc data"): - if n > 0: - self.record_loadings(X, 5) - - with TaskTimer(self.task_durations, "update mean and variance"): - mu_n = self.mu - mu_m, s_m = self.calculate_sample_mean_and_variance(X) - - self.total_variance = self.update_sample_variance( - self.total_variance, s_m, mu_n, mu_m, n, m - ) - self.mu = self.update_sample_mean(mu_n, mu_m, n, m) - - with TaskTimer( - self.task_durations, "center data and compute augment vector" - ): - X_centered = X - np.tile(mu_m, m) - mean_augment_vector = np.sqrt(n * m / (n + m)) * (mu_m - mu_n) - - X_augmented = np.hstack((X_centered, mean_augment_vector)) - - with TaskTimer(self.task_durations, "first matrix product U@S"): - US = self.U @ np.diag(self.S) - - with TaskTimer(self.task_durations, "QR concatenate"): - A = np.hstack((US, X_augmented)) - - with TaskTimer(self.task_durations, "parallel QR"): - Q_r, U_tilde, S_tilde = self.parallel_qr(A) - - with TaskTimer(self.task_durations, "compute local U_prime"): - self.U = Q_r @ U_tilde[:, :q] - self.S = S_tilde[:q] - - self.num_incorporated_images += m - - - def calculate_sample_mean_and_variance(self, imgs): - """ - Compute the sample mean and variance of a flattened stack of n images. - - Parameters - ---------- - imgs : ndarray, shape (d x n) - horizonally stacked batch of flattened images - - Returns - ------- - mu_m : ndarray, shape (d x 1) - mean of imgs - su_m : ndarray, shape (d x 1) - sample variance of imgs (1 dof) - """ - d, m = imgs.shape - - mu_m = np.reshape(np.mean(imgs, axis=1), (d, 1)) - s_m = np.zeros((d, 1)) - - if m > 1: - s_m = np.reshape(np.var(imgs, axis=1, ddof=1), (d, 1)) - - return mu_m, s_m - - def parallel_qr(self, A): - """ - Perform parallelized qr factorization on input matrix A. - - Parameters - ---------- - A : ndarray, shape (_ x q+m+1) - Input data to be factorized. - - Returns - ------- - q_fin : ndarray, shape (_, q+m+1) - Q_{r,1} from TSQR algorithm, where r = self.rank + 1 - U_tilde : ndarray, shape (q+m+1, q+m+1) - Q_{r,2} from TSQR algorithm, where r = self.rank + 1 - S_tilde : ndarray, shape (q+m+1) - R_tilde from TSQR algorithm, where r = self.rank + 1 - - Notes - ----- - Parallel QR algorithm implemented from [1], with additional elements from [2] - sprinkled in to record elements for iPCA using SVD, etc. - - References - ---------- - [1] Benson AR, Gleich DF, Demmel J. Direct QR factorizations for tall-and-skinny - matrices in MapReduce architectures. 
In2013 IEEE international conference on - big data 2013 Oct 6 (pp. 264-272). IEEE. - - [2] Ross DA, Lim J, Lin RS, Yang MH. Incremental learning for robust visual tracking. - International journal of computer vision. 2008 May;77(1):125-41. - - [3] Maulik, R., & Mengaldo, G. (2021, November). PyParSVD: A streaming, distributed and - randomized singular-value-decomposition library. In 2021 7th International Workshop on - Data Analysis and Reduction for Big Scientific Data (DRBSD-7) (pp. 19-25). IEEE. - """ - _, x = A.shape - q = self.num_components - m = x - q - 1 - - with TaskTimer(self.task_durations, "qr - local qr"): - Q_r1, R_r = np.linalg.qr(A, mode="reduced") - - self.comm.Barrier() - - with TaskTimer(self.task_durations, "qr - r_tot gather"): - if self.rank == 0: - R = np.empty((self.size * (q + m + 1), q + m + 1)) - else: - R = None - - self.comm.Gather(R_r, R, root=0) - - if self.rank == 0: - with TaskTimer(self.task_durations, "qr - global qr"): - Q_2, R_tilde = np.linalg.qr(R, mode="reduced") - - with TaskTimer(self.task_durations, "qr - global svd"): - U_tilde, S_tilde, _ = np.linalg.svd(R_tilde) - else: - U_tilde = np.empty((q + m + 1, q + m + 1)) - S_tilde = np.empty(q + m + 1) - Q_2 = None - - self.comm.Barrier() - - with TaskTimer(self.task_durations, "qr - scatter q_tot"): - Q_r2 = np.empty((q + m + 1, q + m + 1)) - self.comm.Scatter(Q_2, Q_r2, root=0) - - with TaskTimer(self.task_durations, "qr - local matrix build"): - Q_r = Q_r1 @ Q_r2 - - self.comm.Barrier() - - with TaskTimer(self.task_durations, "qr - bcast S_tilde"): - self.comm.Bcast(S_tilde, root=0) - - self.comm.Barrier() - - with TaskTimer(self.task_durations, "qr - bcast U_tilde"): - self.comm.Bcast(U_tilde, root=0) - - return Q_r, U_tilde, S_tilde - - def update_sample_mean(self, mu_n, mu_m, n, m): - """ - Compute combined mean of two blocks of data. - - Parameters - ---------- - mu_n : ndarray, shape (d x 1) - mean of first block of data - mu_m : ndarray, shape (d x 1) - mean of second block of data - n : int - number of observations in first block of data - m : int - number of observations in second block of data - - Returns - ------- - mu_nm : ndarray, shape (d x 1) - combined mean of both blocks of input data - """ - mu_nm = mu_m - - if n != 0: - mu_nm = (1 / (n + m)) * (n * mu_n + m * mu_m) - - return mu_nm - - def update_sample_variance(self, s_n, s_m, mu_n, mu_m, n, m): - """ - Compute combined sample variance of two blocks - of data described by input parameters. - - Parameters - ---------- - s_n : ndarray, shape (d x 1) - sample variance of first block of data - s_m : ndarray, shape (d x 1) - sample variance of second block of data - mu_n : ndarray, shape (d x 1) - mean of first block of data - mu_m : ndarray, shape (d x 1) - mean of second block of data - n : int - number of observations in first block of data - m : int - number of observations in second block of data - - Returns - ------- - s_nm : ndarray, shape (d x 1) - combined sample variance of both blocks of data described by input - parameters - """ - s_nm = s_m - - if n != 0: - s_nm = (((n - 1) * s_n + (m - 1) * s_m) - + (n * m * (mu_n - mu_m) ** 2) / (n + m)) / (n + m - 1) - - return s_nm - - def get_model(self): - """ - Method to retrieve model parameters. - - Returns - ------- - U_tot : ndarray, shape (d x q) - iPCA principal axes from model. - S_tot : ndarray, shape (1 x q) - iPCA singular values from model. - mu_tot : ndarray, shape (1 x d) - Data mean computed from all input images. 
- var_tot : ndarray, shape (1 x d) - Sample data variance computed from all input images. - """ - if self.rank == 0: - U_tot = np.empty(self.num_features * self.num_components) - mu_tot = np.empty((self.num_features, 1)) - var_tot = np.empty((self.num_features, 1)) - else: - U_tot, mu_tot, var_tot = None, None, None - - start_indices = self.split_indices[:-1] - - self.comm.Gatherv( - self.U.flatten(), - [ - U_tot, - self.split_counts * self.num_components, - start_indices * self.num_components, - MPI.DOUBLE, - ], - root=0, - ) - - if self.rank == 0: - U_tot = np.reshape(U_tot, (self.num_features, self.num_components)) - - self.comm.Gatherv( - self.mu, - [ - mu_tot, - self.split_counts * self.num_components, - start_indices, - MPI.DOUBLE, - ], - root=0, - ) - self.comm.Gatherv( - self.total_variance, - [ - var_tot, - self.split_counts * self.num_components, - start_indices, - MPI.DOUBLE, - ], - root=0, - ) - - S_tot = self.S - - return U_tot, S_tot, mu_tot, var_tot - - def get_outliers(self): - """ - Method to retrieve and print outliers on root process. - """ - - if self.rank == 0: - print(self.outliers) - - def record_loadings(self, X, q_sig): - """ - Method to store all loadings, ΣV^T, from present batch using past - model iteration. - - Parameters - ---------- - X : ndarray, shape (_ x m) - Local subdivision of current image data batch. - - q_sig : int - The q_sig components used in generating the loadings for - """ - _, m = X.shape - n, d = self.num_incorporated_images, self.num_features - - start_indices = self.split_indices[:-1] - - U, _, mu, _ = self.get_model() - - if self.rank == 0: - X_tot = np.empty((d, m)) - else: - X_tot = None - - self.comm.Gatherv( - X.flatten(), - [ - X_tot, - self.split_counts * m, - start_indices * m, - MPI.DOUBLE, - ], - root=0, - ) - - if self.rank == 0: - - X_tot = np.reshape(X_tot, (d, m)) - cb = X_tot - np.tile(mu, (1, m)) - - pcs = U.T @ cb - self.pc_data = ( - np.concatenate((self.pc_data, pcs), axis=1) - if len(self.pc_data) - else pcs - ) - - pc_dist = np.linalg.norm(pcs[:q_sig], axis=0) - std = np.std(pc_dist) - mu = np.mean(pc_dist) - - batch_outliers = np.where(np.abs(pc_dist - mu) > std)[0] + n - m - - self.outliers = ( - np.concatenate((self.outliers, batch_outliers), axis=0) - if len(self.outliers) - else batch_outliers - ) - - def display_image(self, idx, output_dir="", save_image=False): - """ - Method to retrieve single image from run subject to model binning constraints. - - Parameters - ---------- - idx : int - Run index of image to be retrieved. 
- output_dir : str, optional - File path to output directory, by default "" - save_image : bool, optional - Whether to save image to file, by default False - """ - - U, S, mu, var = self.get_model() - - if self.rank != 0: - return - - bin_factor = 1 - if self.downsample: - bin_factor = self.bin_factor - - n, q, m, d = self.get_params() - - a, b, c = self.psi.det.shape() - b = int(b / bin_factor) - c = int(c / bin_factor) - - fig, ax = plt.subplots(1) - - counter = self.psi.counter - self.psi.counter = idx - img = self.get_formatted_images(1, 0, d) - self.psi.counter = counter - - img = img - mu - img = np.reshape(img, (a, b, c)) - - pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) - binned_pim = bin_pixel_index_map(pixel_index_map, bin_factor) - - img = assemble_image_stack_batch(img, binned_pim) - - vmax = np.max(img.flatten()) - ax.imshow( - img, - norm=colors.SymLogNorm(linthresh=1.0, linscale=1.0, vmin=0, vmax=vmax), - interpolation=None - ) - - if save_image: - plt.savefig(output_dir) - - plt.show() - - -def distribute_indices_over_ranks(d, size): - """ - - Parameters - ---------- - d : int - total number of dimensions - size : int - number of ranks in world - - Returns - ------- - split_indices : ndarray, shape (size+1 x 1) - division indices between ranks - split_counts : ndarray, shape (size x 1) - number of dimensions allocated per rank - """ - - total_indices = 0 - split_indices, split_counts = [0], [] - - for r in range(size): - num_per_rank = d // size - if r < (d % size): - num_per_rank += 1 - - split_counts.append(num_per_rank) - - total_indices += num_per_rank - split_indices.append(total_indices) - - split_indices = np.array(split_indices) - split_counts = np.array(split_counts) - - return split_indices, split_counts - - -#### for command line use ### - - -def parse_input(): - """ - Parse command line input. 
- """ - - parser = argparse.ArgumentParser() - parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) - parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) - parser.add_argument( - "-d", - "--det_type", - help="Detector name, e.g epix10k2M or jungfrau4M.", - required=True, - type=str, - ) - parser.add_argument( - "--start_offset", - help="Run index of first image to be incorporated into iPCA model.", - required=False, - type=int, - ) - parser.add_argument( - "--num_components", - help="Number of principal components to compute and maintain.", - required=False, - type=int, - ) - parser.add_argument( - "--batch_size", - help="Size of image batch incorporated in each model update.", - required=False, - type=int, - ) - parser.add_argument( - "--num_images", - help="Total number of images to be incorporated into model.", - required=False, - type=int, - ) - parser.add_argument( - "--output_dir", - help="Path to output directory for recording task duration data.", - required=False, - type=str, - ) - parser.add_argument( - "--priming", - help="Initialize model with PCA.", - required=False, - action="store_true", - ) - parser.add_argument( - "--downsample", - help="Enable downsampling of images.", - required=False, - action="store_true", - ) - parser.add_argument( - "--bin_factor", - help="Bin factor if using downsizing.", - required=False, - type=int, - ) - - return parser.parse_args() - - -if __name__ == "__main__": - - params = parse_input() - kwargs = {k: v for k, v in vars(params).items() if v is not None} - - pipca = PiPCA(**kwargs) - pipca.run() - pipca.get_outliers() From b3b57b6c67895cc3ebd6afadad7da14da4b23f7c Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 31 Aug 2023 15:14:38 -0700 Subject: [PATCH 24/57] Changed data grabbing to only retrieve data once at the beginning. Also made a number of bug fixes. There still is expected to be bug in this version where too much accessing h5 causes h5 file not found error due to h5py not having parallel h5py configured. UMAP is back to working correctly though. 
--- btx/processing/freqdir.py | 664 ++++++++++++++++++++++++++++---------- 1 file changed, 498 insertions(+), 166 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 90914f42e..00d520bac 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -15,6 +15,7 @@ from scipy.linalg import svd as scipy_svd import pandas as pd from sklearn.neighbors import NearestNeighbors +from sklearn.metrics.pairwise import euclidean_distances import heapq from mpi4py import MPI @@ -35,11 +36,13 @@ from PIL import Image from io import BytesIO import base64 +import tables from datetime import datetime import umap import hdbscan +from sklearn.cluster import OPTICS, cluster_optics_dbscan from matplotlib import colors import matplotlib as mpl @@ -50,6 +53,9 @@ from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3 from bokeh.layouts import column, row +import cProfile +import string + class FreqDir(DimRed): """ @@ -113,6 +119,9 @@ class FreqDir(DimRed): def __init__( self, + comm, + rank, + size, start_offset, num_imgs, exp, @@ -120,6 +129,7 @@ def __init__( det_type, output_dir, currRun, + imgData, alpha=0, rankAdapt=False, merger=False, @@ -139,6 +149,10 @@ def __init__( num_images=num_imgs, num_components=num_components, batch_size=batch_size, priming=priming, downsample=downsample, bin_factor=bin_factor, output_dir=output_dir) + self.comm = comm + self.rank= rank + self.size = size + self.psi.counter = start_offset + self.num_images*self.rank//self.size self.currRun = currRun @@ -169,6 +183,8 @@ def __init__( self.samplingFactor = samplingFactor + self.imgData = imgData + def run(self): """ Perform frequent directions matrix sketching @@ -176,70 +192,102 @@ def run(self): """ noImgsToProcess = self.num_images//self.size - for batch in range(0,noImgsToProcess,int(self.ell*2//self.samplingFactor)): - self.fetch_and_update_model(int(self.ell*2//self.samplingFactor)) - - def get_formatted_images(self, n): - """ - Fetch n - x image segments from run, where x is the number of 'dead' images. - - Parameters - ---------- - n : int - number of images to retrieve - start_index : int - start index of subsection of data to retrieve - end_index : int - end index of subsection of data to retrieve - - Returns - ------- - ndarray, shape (end_index-start_index, n-x) - n-x retrieved image segments of dimension end_index-start_index - """ - self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) - - bin_factor = self.bin_factor - downsample = self.downsample + for currInd, batch in enumerate(range(0,noImgsToProcess,int(self.ell*2//self.samplingFactor))): + self.fetch_and_update_model(int(self.ell*2//self.samplingFactor), currInd) - # may have to rewrite eventually when number of images becomes large, - # i.e. 
streamed setting, either that or downsample aggressively - imgs = self.psi.get_images(n, assemble=False) - - if downsample: - imgs = bin_data(imgs, bin_factor) - - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] - - num_valid_imgs, p, x, y = imgs.shape - - img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - - img_batch[img_batch<0] = 0 - - nimg_batch = [] - for img in img_batch.T: - if self.threshold: -# secondQuartile = np.sort(img)[-1]//4 -# secondQuartile = np.mean(img) -# secondQuartile = np.median(img) -# secondQuartile = np.partition(img, -len(img)//4)[-len(img)//4] - secondQuartile = np.quantile(img, 0.93) - nimg = (img>secondQuartile)*img - else: - nimg = img - - currIntensity = np.sum(nimg.flatten(), dtype=np.double) - if self.noZeroIntensity and currIntensity<50000: - continue - else: - if currIntensity>=50000 and self.normalizeIntensity: - nimg_batch.append(nimg/currIntensity) - else: - nimg_batch.append(nimg) - return np.array(nimg_batch).T + def elu(self,x): + if x > 0: + return x + else: + return 0.01*(math.exp(x)-1) + +# def get_formatted_images(self, n, includeUnformatted=False): +# """ +# Fetch n - x image segments from run, where x is the number of 'dead' images. +# +# Parameters +# ---------- +# n : int +# number of images to retrieve +# start_index : int +# start index of subsection of data to retrieve +# end_index : int +# end index of subsection of data to retrieve +# +# Returns +# ------- +# ndarray, shape (end_index-start_index, n-x) +# n-x retrieved image segments of dimension end_index-start_index +# """ +# self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) +# # may have to rewrite eventually when number of images becomes large, +# # i.e. streamed setting, either that or downsample aggressively +# imgs = self.psi.get_images(n, assemble=False) +# +# if includeUnformatted: +# imgsCopy = imgs.copy() +# imgsCopy = imgsCopy[ +# [i for i in range(imgsCopy.shape[0]) if not np.isnan(imgsCopy[i : i + 1]).any()] +# ] +# num_valid_imgsCopy, p, x, y = imgsCopy.shape +# img_batchCopy = np.reshape(imgsCopy, (num_valid_imgsCopy, p * x * y)).T +# img_batchCopy[img_batchCopy<0] = 0 +# nimg_batchCopy = [] +# for img in img_batchCopy.T: +# if self.threshold: +# # secondQuartile = np.sort(img)[-1]//4 +# # secondQuartile = np.mean(img) +# # secondQuartile = np.median(img) +# # secondQuartile = np.partition(img, -len(img)//4)[-len(img)//4] +# secondQuartile = np.quantile(img, 0.93) +# nimg = (img>secondQuartile)*img +# # elu_v = np.vectorize(self.elu) +# # nimg = elu_v(img-secondQuartile)+secondQuartile +# else: +# nimg = img +# currIntensity = np.sum(nimg.flatten(), dtype=np.double) +# if self.noZeroIntensity and currIntensity<50000: +# continue +# else: +# if currIntensity>=50000 and self.normalizeIntensity: +# nimg_batchCopy.append(nimg/currIntensity) +# else: +# nimg_batchCopy.append(nimg) +# +# if self.downsample: +# imgs = bin_data(imgs, self.bin_factor) +# imgs = imgs[ +# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] +# ] +# num_valid_imgs, p, x, y = imgs.shape +# img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T +# img_batch[img_batch<0] = 0 +# nimg_batch = [] +# for img in img_batch.T: +# if self.threshold: +## secondQuartile = np.sort(img)[-1]//4 +## secondQuartile = np.mean(img) +## secondQuartile = np.median(img) +## secondQuartile = np.partition(img, -len(img)//4)[-len(img)//4] +# secondQuartile = np.quantile(img, 0.93) +# nimg = (img>secondQuartile)*img +## elu_v = 
np.vectorize(self.elu) +## nimg = elu_v(img-secondQuartile)+secondQuartile +# else: +# nimg = img +# +# currIntensity = np.sum(nimg.flatten(), dtype=np.double) +# if self.noZeroIntensity and currIntensity<50000: +# continue +# else: +# if currIntensity>=50000 and self.normalizeIntensity: +# nimg_batch.append(nimg/currIntensity) +# else: +# nimg_batch.append(nimg) +# if includeUnformatted: +# return (np.array(nimg_batch).T, np.array(nimg_batchCopy).T) +# else: +# return np.array(nimg_batch).T ########################################################################### @@ -267,7 +315,7 @@ def intensityFunc_normalizeIntensity(img, currIntensity): return img/currIntensity ########################################################################### - def fetch_and_update_model(self, n): + def fetch_and_update_model(self, n, currInd): """ Fetch images and update model. @@ -276,7 +324,9 @@ def fetch_and_update_model(self, n): n : int number of images to incorporate """ - img_batch = self.get_formatted_images(n) +# img_batch = self.get_formatted_images(n) + img_batch = self.imgData[currInd] +# print("1414oiioqdca", img_batch.shape) if self.samplingFactor <1: psamp = PrioritySampling(int(n*self.samplingFactor), self.d) @@ -321,7 +371,7 @@ def update_model(self, X): X: ndarray data to update matrix sketch with """ - _, numIncorp = X.shape + _, numIncorp = X.shape origNumIncorp = numIncorp with TaskTimer(self.task_durations, "total update"): if self.rank==0 and not self.merger: @@ -331,7 +381,6 @@ def update_model(self, X): ) ) for row in X.T: - canRankAdapt = numIncorp > (self.ell + 15) if self.nextZeroRow >= self.m: if self.increaseEll and canRankAdapt and self.rankAdapt: @@ -377,28 +426,24 @@ def rotate(self): in Computer Science, vol 8737. Springer, Berlin, Heidelberg. https://doi.org/10.1007/978-3-662-44777-2_39 """ - try: - [_,s,Vt] = svd(self.sketch , full_matrices=False) - except LinAlgError as err: - [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) - if len(s) >= self.ell: - sCopy = s.copy() + [_,S,Vt] = np.linalg.svd(self.sketch , full_matrices=False) + ssize = S.shape[0] + if ssize >= self.ell: + sCopy = S.copy() #JOHN: I think actually this should be ell+1 and ell. We lose a component otherwise. 
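#            (Illustration of the note above, not part of the patch: with 0-based indexing,
#             shrinking by the squared (ell+1)-th singular value keeps ell nonzero directions,
#
#                 delta = S[self.ell] ** 2
#                 toShrink = np.sqrt(np.maximum(S[:self.ell] ** 2 - delta, 0.0))
#
#             whereas subtracting S[self.ell-1]**2 also zeroes the ell-th direction.)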
- toShrink = s[:self.ell]**2 - s[self.ell-1]**2 + toShrink = S[:self.ell]**2 - S[self.ell-1]**2 #John: Explicitly set this value to be 0, since sometimes it is negative # or even turns to NaN due to roundoff error toShrink[-1] = 0 toShrink = sqrt(toShrink) - toShrink[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] - self.sketch[:self.ell:,:] = dot(diag(toShrink), Vt[:self.ell,:]) self.sketch[self.ell:,:] = 0 self.nextZeroRow = self.ell else: - self.sketch[:len(s),:] = dot(diag(s), Vt[:len(s),:]) - self.sketch[len(s):,:] = 0 - self.nextZeroRow = len(s) + self.sketch[:ssize,:] = diag(s) @ Vt[:ssize,:] + self.sketch[ssize:,:] = 0 + self.nextZeroRow = ssize def reconstructionError(self, matrixCentered): """ @@ -564,6 +609,8 @@ def write(self): hf.create_dataset("mean", data=self.mean) hf.create_dataset("imgsTracked", data=np.array(self.imgsTracked)) hf["sketch"].attrs["numImgsIncorp"] = self.num_incorporated_images + tables.file._open_files.close_all() + print("CREATED FILE: ", filename) self.comm.barrier() return filename @@ -593,24 +640,25 @@ class MergeTree: currRun: Current datetime used to identify run """ - def __init__(self, exp, run, det_type, divBy, readFile, output_dir, allWriteDirecs, currRun): - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() + def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output_dir, allWriteDirecs, currRun): + self.comm = comm + self.rank = rank + self.size = size self.divBy = divBy with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] + tables.file._open_files.close_all() - self.fd = FreqDir(0, 0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, priming=False) + self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, priming=False, imgData = None) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) if self.rank==0: print("BUFFER SIZES: ", self.buffSizes) - print(self.data.T.shape) +# print(self.data.shape) self.fd.update_model(self.data.T) self.output_dir = output_dir @@ -651,6 +699,8 @@ def merge(self): bufferMe = np.empty(self.buffSizes[proc] * self.data.shape[1], dtype=np.double) self.comm.Recv(bufferMe, source=proc, tag=17) bufferMe = np.reshape(bufferMe, (self.buffSizes[proc], self.data.shape[1])) +# print("BUFFERME SHAPE", bufferMe.shape) +# self.fd.update_model(np.hstack((bufferMe.T, np.zeros((bufferMe.shape[1]))))) self.fd.update_model(bufferMe.T) else: bufferMe = self.fd.get().copy().flatten() @@ -669,6 +719,7 @@ def merge(self): + hf["sketch"].attrs["numImgsIncorp"]) self.fullNumIncorp += hf["sketch"].attrs["numImgsIncorp"] self.fullImgsTracked = np.vstack((self.fullImgsTracked, hf["imgsTracked"][:])) + tables.file._open_files.close_all() return self.fd.get() else: return @@ -677,6 +728,7 @@ def write(self): """ Write merged matrix sketch to h5 file """ +# print("IMAGES TRACKED: ", self.fullNumIncorp, " ******* ", self.fullImgsTracked) filename = self.output_dir + '{}_merge.h5'.format(self.currRun) if self.rank==0: with h5py.File(filename, 'w') as hf: @@ -684,7 +736,9 @@ def write(self): 
hf.create_dataset("mean", data=self.fullMean) hf["sketch"].attrs["numImgsIncorp"] = self.fullNumIncorp hf.create_dataset("imgsTracked", data=self.fullImgsTracked) - self.comm.Barrier() + print("CREATED FILE: ", filename) + tables.file._open_files.close_all() + self.comm.barrier() return filename class ApplyCompression: @@ -718,6 +772,9 @@ class ApplyCompression: def __init__( self, + comm, + rank, + size, start_offset, num_imgs, exp, @@ -730,36 +787,39 @@ def __init__( noZeroIntensity, normalizeIntensity, currRun, + imgData, + thumbnailData, downsample=False, bin_factor=2 ): - self.output_dir = output_dir + self.comm = comm + self.rank = rank + self.size= size - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() + self.output_dir = output_dir self.num_imgs = num_imgs self.currRun = currRun - self.imgGrabber = FreqDir(start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, - exp=exp,run=run,det_type=det_type,output_dir="", downsample=downsample, bin_factor=bin_factor, - threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False) - self.grabberToSaveImages = FreqDir(start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, - exp=exp,run=run,det_type=det_type,output_dir="", downsample=False, bin_factor=0, - threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False) - self.batchSize = batchSize - - self.num_images = self.imgGrabber.num_images - self.num_features = self.imgGrabber.num_features +# self.imgGrabber = FreqDir(comm=comm, rank=rank, size=size, start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, +# exp=exp,run=run,det_type=det_type,output_dir="", downsample=downsample, bin_factor=bin_factor, +# threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False, imgData = None) +# self.grabberToSaveImages = FreqDir(comm=comm, rank=rank, size=size, start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, +# exp=exp,run=run,det_type=det_type,output_dir="", downsample=False, bin_factor=0, +# threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False, imgData = None) +# self.batchSize = batchSize self.num_incorporated_images = 0 + print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile, os.path.isfile(readFile))) + while(not os.path.isfile(readFile)): + print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile, self.rank)) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] self.mean = hf["mean"][:] + tables.file._open_files.close_all() U, S, Vt = np.linalg.svd(self.data, full_matrices=False) self.components = Vt @@ -769,25 +829,45 @@ def __init__( self.imageIndicesProcessed = [] + self.imgData = imgData + self.thumbnailData = thumbnailData + def run(self): """ Retrieve sketch, project images onto new coordinates. Save new coordinates to h5 file. 
""" - noImgsToProcess = self.num_images//self.size - for batch in range(0,noImgsToProcess,self.batchSize): - self.fetch_and_process_data() + noImgsToProcess = self.num_imgs//self.size +# for currInd, batch in enumerate(range(0,noImgsToProcess,self.batchSize)): + for currInd in range(len(self.imgData)): + self.fetch_and_process_data(currInd) +# print("RANK {} IS DONE".format(self.rank)) +# self.fetch_and_process_data() - def fetch_and_process_data(self): + def fetch_and_process_data(self, currInd): """ Fetch and downsample data, apply projection algorithm """ - startCounter = self.imgGrabber.psi.counter - img_batch = self.imgGrabber.get_formatted_images(self.batchSize) - self.imageIndicesProcessed.append((startCounter, self.imgGrabber.psi.counter)) +# startCounter = self.imgGrabber.psi.counter - toSave_img_batch = self.assembleImgsToSave(self.grabberToSaveImages.get_formatted_images(self.batchSize)) +# stimggrab = time.perf_counter() +# img_batch,img_batchUnformatted = self.imgGrabber.get_formatted_images(self.batchSize,includeUnformatted=True) +# img_batch = self.imgGrabber.get_formatted_images(self.batchSize) +# self.imageIndicesProcessed.append((startCounter, self.imgGrabber.psi.counter)) +# etimggrab = time.perf_counter() +# print("{} Image Grab TIME: ".format(self.rank), etimggrab - stimggrab) + +# stassemble = time.perf_counter() +# toSave_img_batch = self.assembleImgsToSave(self.grabberToSaveImages.get_formatted_images(self.batchSize)) +# toSave_img_batch = self.assembleImgsToSave(img_batchUnformatted) +# etassemble = time.perf_counter() +# print("{} Assemble TIME: ".format(self.rank), etassemble - stassemble) + +# stassemble = time.perf_counter() + + img_batch = self.imgData[currInd] + toSave_img_batch = self.thumbnailData[currInd] if self.smallImgs is None: self.smallImgs = toSave_img_batch @@ -795,25 +875,52 @@ def fetch_and_process_data(self): self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) # self.apply_compression((img_batch.T - self.mean).T) self.apply_compression(img_batch) +# etassemble = time.perf_counter() +# print("{} Apply Compression TIME: ".format(self.rank), etassemble - stassemble) + + +# noImgsToProcess = self.num_images//self.size +# startCounter = self.imgGrabber.psi.counter +# img_batch = self.imgGrabber.get_formatted_images(noImgsToProcess) +# self.imageIndicesProcessed.append((startCounter, self.imgGrabber.psi.counter)) +# st_compress = time.perf_counter() +# self.apply_compression(img_batch) +# et_compress = time.perf_counter() +# print("COMPRESSION TIME: ", et_compress - st_compress#) +# +# st_assemble = time.perf_counter() +# toSave_img_batch = self.assembleImgsToSave(self.grabberToSaveImages.get_formatted_images(noImgsToProcess)) +# if self.smallImgs is None: +# self.smallImgs = toSave_img_batch +# else: +# self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) +# et_assemble = time.perf_counter() +# print("ASSEMBLE TIME: ", et_assemble-st_assemble) + + +# def assembleImgsToSave(self, imgs): +# """ +# Form the images from psana pixel index map and downsample images. 
+# +# Parameters +# ---------- +# imgs: ndarray +# images to downsample +# """ +# pixel_index_map = retrieve_pixel_index_map(self.imgGrabber.psi.det.geometry(self.imgGrabber.psi.run)) +# +# saveMe = [] +# for img in imgs.T: +# imgRe = np.reshape(img, self.imgGrabber.psi.det.shape()) +# imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) +# saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) +# return np.array(saveMe) +## imgsRe = np.reshape(imgs.T, (imgs.shape[1], +## self.imgGrabber.psi.det.shape()[0], +## self.imgGrabber.psi.det.shape()[1], +## self.imgGrabber.psi.det.shape()[2])) +## return assemble_image_stack_batch(imgsRe, pixel_index_map) - def assembleImgsToSave(self, imgs): - """ - Form the images from psana pixel index map and downsample images. - - Parameters - ---------- - imgs: ndarray - images to downsample - """ - pixel_index_map = retrieve_pixel_index_map(self.imgGrabber.psi.det.geometry(self.imgGrabber.psi.run)) - - saveMe = [] - for img in imgs.T: - imgRe = np.reshape(img, self.imgGrabber.psi.det.shape()) - imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) - saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) - saveMe = np.array(saveMe) - return saveMe def apply_compression(self, X): """ @@ -837,7 +944,9 @@ def write(self): with h5py.File(filename, 'w') as hf: hf.create_dataset("ProjectedData", data=self.processedData) hf.create_dataset("SmallImages", data=self.smallImgs) - self.comm.Barrier() + tables.file._open_files.close_all() + print("CREATED FILE: ", filename) + self.comm.barrier() return filename @@ -942,20 +1051,23 @@ def random_unique_numbers_from_range(self, start, end, count): random.shuffle(all_numbers) return all_numbers[:count] - def euclidean_distance(self, p1, p2): - return np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2) +# def euclidean_distance(self, p1, p2): +# return np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2) + +# def compute_medoid(self, points): +# min_total_distance = float('inf') +# medoid = None +# for i, point in enumerate(points): +# total_distance = 0 +# for other_point in points: +# total_distance += self.euclidean_distance(point, other_point) +# if total_distance < min_total_distance: +# min_total_distance = total_distance +# medoid = point +# return medoid def compute_medoid(self, points): - min_total_distance = float('inf') - medoid = None - for i, point in enumerate(points): - total_distance = 0 - for other_point in points: - total_distance += self.euclidean_distance(point, other_point) - if total_distance < min_total_distance: - min_total_distance = total_distance - medoid = point - return medoid + return points[np.argmin(euclidean_distances(points).sum(axis=0))] def genMedoids(self, medoidLabels, clusterPoints): dictMe = {} @@ -1043,9 +1155,12 @@ def genLeftRight(self, endClass): return [*range(endClass+1)], [*range(1, endClass+2)] def genUMAP(self): + for dirval in os.listdir(self.inputFile[:-26]): + print("ITEM IN DIRECTORY:", dirval) imgs = None projections = None for currRank in range(self.nprocs): + print("GETTING CURRENT RANK: ", currRank) with h5py.File(self.inputFile+"_"+str(currRank)+".h5", 'r') as hf: if imgs is None: imgs = hf["SmallImages"][:] @@ -1053,15 +1168,18 @@ def genUMAP(self): else: imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) + tables.file._open_files.close_all() intensities = [] for img in imgs: intensities.append(np.sum(img.flatten())) intensities = np.array(intensities) - 
self.imgs = imgs[:self.numImgsToUse] - self.projections = projections[:self.numImgsToUse] - self.intensities = intensities[:self.numImgsToUse] + skipMe = 4 + self.imgs = imgs[:self.numImgsToUse:skipMe] + self.projections = projections[:self.numImgsToUse:skipMe] + self.intensities = intensities[:self.numImgsToUse:skipMe] + self.numImgsToUse = int(self.numImgsToUse/skipMe) if len(self.imgs)!= self.numImgsToUse: raise TypeError("NUMBER OF IMAGES REQUESTED ({}) EXCEEDS NUMBER OF DATA POINTS PROVIDED ({})".format(len(self.imgs), self.numImgsToUse)) @@ -1076,12 +1194,21 @@ def genUMAP(self): min_samples=int(self.numImgsToUse*0.75//40), min_cluster_size=int(self.numImgsToUse//40), ).fit_predict(self.clusterable_embedding) - exclusionList = np.array([]) self.clustered = np.isin(self.labels, exclusionList, invert=True) + self.opticsClust = OPTICS(min_samples=150, xi=0.05, min_cluster_size=0.05) + self.opticsClust.fit(self.clusterable_embedding) + self.opticsLabels = cluster_optics_dbscan( + reachability=self.opticsClust.reachability_, + core_distances=self.opticsClust.core_distances_, + ordering=self.opticsClust.ordering_, + eps=2, + ) + self.experData_df = pd.DataFrame({'x':self.clusterable_embedding[self.clustered, 0],'y':self.clusterable_embedding[self.clustered, 1]}) self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) + self.experData_df['imgind'] = np.arange(self.numImgsToUse) def genABOD(self): if self.includeABOD: @@ -1130,6 +1257,18 @@ def genLabels(self): medoidBold.append(4) self.experData_df['medoidBold'] = medoidBold + opticsNewLabels = [] + for j in self.opticsLabels[self.clustered]: + doneChecking = False + for grouping in self.userGroupings: + if j in grouping and not doneChecking: + opticsNewLabels.append(min(grouping)) + doneChecking=True + if not doneChecking: + opticsNewLabels.append(j) + opticsNewLabels = list(np.array(opticsNewLabels) + 1) + self.opticsNewLabels = np.array(self.relabel_to_closest_zero(opticsNewLabels)) + def genHTML(self): datasource = ColumnDataSource(self.experData_df) color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) @@ -1146,6 +1285,7 @@ def genHTML(self):
             Cluster #
             @cluster
+            @imgind
""")) @@ -1178,7 +1318,7 @@ def genHTML(self): p.yaxis.axis_label = "Count" indexCDS = ColumnDataSource(dict( - index=[*range(0, self.numImgsToUse, 10)] + index=[*range(0, self.numImgsToUse, 2)] ) ) cols = RangeSlider(title="ET", @@ -1218,7 +1358,8 @@ def genHTML(self): const cluster = datasource.data.cluster const ptColor = datasource.data.ptColor const anomDet = datasource.data.anomDet - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet} + const imgind = datasource.data.imgind + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind} """) cols.js_on_change('value', callback) @@ -1272,9 +1413,37 @@ def genHTML(self): """) toggl.js_on_change('active',toggl_js) - LABELS = ["DBSCAN Clustering", "Anomaly Detection"] + reachabilityDiag = figure( + title='OPTICS Reachability Diag', + tools=('pan, wheel_zoom, reset'), + width = 2000, height = 400 + ) + + space = np.arange(self.numImgsToUse) + reachability = self.opticsClust.reachability_[self.opticsClust.ordering_] + + opticsData_df = pd.DataFrame({'x':space,'y':reachability}) + opticsData_df['cluster'] = [str(x) for x in self.opticsNewLabels] + opticsData_df['ptColor'] = [x for x in opticsData_df['cluster']] + color_mapping2 = CategoricalColorMapper(factors=[str(x) for x in list(set(self.opticsNewLabels))], + palette=Category20[20]) + opticssource = ColumnDataSource(opticsData_df) + + reachabilityDiag.circle( + 'x', + 'y', + source=opticssource, + color=dict(field='ptColor', transform=color_mapping2), + line_alpha=0.6, + fill_alpha=0.6, + legend_field='cluster' + ) + reachabilityDiag.line([0, len(opticsData_df['ptColor'])], [2, 2], line_width=2, color="black", line_dash="dashed") + reachabilityDiag.y_range = Range1d(-1, 10) + + LABELS = ["DBSCAN Clustering", "OPTICS Clustering", "Anomaly Detection"] radio_button_group = RadioButtonGroup(labels=LABELS, active=0) - radioGroup_js = CustomJS(args=dict(datasource=datasource), code=""" + radioGroup_js = CustomJS(args=dict(datasource=datasource, opticssource=opticssource), code=""" console.log(datasource.data.ptColor) const x = datasource.data.x const y = datasource.data.y @@ -1282,26 +1451,37 @@ def genHTML(self): const medoidBold = datasource.data.medoidBold const cluster = datasource.data.cluster const anomDet = datasource.data.anomDet + const imgind = datasource.data.imgind + + const opticsClust = opticssource.data.cluster let ptColor = null if (cb_obj.active==0){ ptColor = cluster } + else if (cb_obj.active==1){ + ptColor = opticsClust + } else{ ptColor = anomDet } - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet} + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind} """) radio_button_group.js_on_change("active", radioGroup_js) - self.viewResults = column(plot_figure, p, imgsPlot, row(cols, toggl, radio_button_group)) + self.viewResults = column(plot_figure, p, imgsPlot, row(cols, toggl, radio_button_group), reachabilityDiag) def fullVisualize(self): + print("here 4") self.genUMAP() + print("here 5") self.genABOD() + print("here 6") self.genLabels() + print("here 7") self.genHTML() + print("here 8") def updateLabels(self): self.genLabels() @@ -1320,13 +1500,32 @@ def userShow(self): output_notebook() show(self.viewResults) +def profile(filename=None, comm=MPI.COMM_WORLD): + def prof_decorator(f): + def wrap_f(*args, **kwargs): + pr = cProfile.Profile() + pr.enable() + result = f(*args, **kwargs) + pr.disable() + + if filename is None: + pr.print_stats() + else: + filename_r = filename + 
".{}".format(comm.rank) + pr.dump_stats(filename_r) + + return result + return wrap_f + return prof_decorator + +def id_generator(size=6, chars=string.ascii_uppercase + string.digits): + return ''.join(random.choice(chars) for _ in range(size)) class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. """ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_components, alpha, rankAdapt, downsample, bin_factor, threshold, normalizeIntensity, noZeroIntensity, samplingFactor, priming, divBy, batchSize): - self.currRun = datetime.now().strftime("%y%m%d%H%M") self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1346,22 +1545,146 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.divBy = divBy self.batchSize = batchSize + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = self.start_offset + self.num_imgs*self.rank//self.size + self.imgsTracked = [] + + if self.rank==0: + self.currRun = datetime.now().strftime("%y%m%d%H%M%S") + else: + self.currRun = None + self.currRun = self.comm.bcast(self.currRun, root=0) + + def assembleImgsToSave(self, imgs): + """ + Form the images from psana pixel index map and downsample images. + + Parameters + ---------- + imgs: ndarray + images to downsample + """ + pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) + + saveMe = [] + for img in imgs: + imgRe = np.reshape(img, self.psi.det.shape()) + imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) + saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) + return np.array(saveMe) +# imgsRe = np.reshape(imgs.T, (imgs.shape[1], +# self.imgGrabber.psi.det.shape()[0], +# self.imgGrabber.psi.det.shape()[1], +# self.imgGrabber.psi.det.shape()[2])) +# return assemble_image_stack_batch(imgsRe, pixel_index_map) + + def get_formatted_images(self, startInd, n, includeThumbnails=False): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. 
+ + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + self.psi.counter = startInd + self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) + + imgs = self.psi.get_images(n, assemble=False) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + num_valid_imgs, p, x, y = imgs.shape + img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + img_batch[img_batch<0] = 0 + nimg_batch = [] + for img in img_batch.T: + if self.threshold: + secondQuartile = np.quantile(img, 0.93) + nimg = (img>secondQuartile)*img +# elu_v = np.vectorize(self.elu) +# nimg = elu_v(img-secondQuartile)+secondQuartile + else: + nimg = img + + currIntensity = np.sum(nimg.flatten(), dtype=np.double) +# print("RANK: {} ***** INTENSITY: {}".format(self.rank, currIntensity)) + if self.noZeroIntensity and currIntensity< (self.bin_factor**2) * 50000: + continue + else: + if currIntensity>=(self.bin_factor**2) * 50000 and self.normalizeIntensity: +# if not self.normalizeIntensity: + nimg_batch.append(nimg/currIntensity) + else: +# nimg_batch.append(nimg) + nimg_batch.append(np.zeros(nimg.shape)) + nimg_batch = np.array(nimg_batch) + if self.downsample: + binned_imgs = bin_data(np.reshape(nimg_batch,(num_valid_imgs, p, x, y)), self.bin_factor) + binned_num_valid_imgs, binned_p, binned_x, binned_y = binned_imgs.shape + binned_imgs = np.reshape(binned_imgs, (binned_num_valid_imgs, binned_p * binned_x * binned_y)).T +# print(binned_imgs.shape) + else: + binned_imgs = nimg_batch.T + if includeThumbnails: + return (binned_imgs, self.assembleImgsToSave(np.reshape(nimg_batch, (num_valid_imgs, p, x, y)))) + else: + return binned_imgs + + @profile(filename="fullFD_profile") def runMe(self): stfull = time.perf_counter() + #DATA RETRIEVAL STEP + ########################################################################################## + self.fullImgData = [] + self.fullThumbnailData = [] + noImgsToProcess = self.num_imgs//self.size + startingPoint = self.start_offset + self.num_imgs*self.rank//self.size + batchSize = int(self.num_components*2//self.samplingFactor) + for batch in range(0, noImgsToProcess, batchSize): + startInd = startingPoint+batch + binned_imgs, thumbnails = self.get_formatted_images(startInd, batchSize, includeThumbnails=True) +# print("aodijwaodijaodij", binned_imgs.shape, thumbnails.shape) + self.fullImgData.append(binned_imgs) + self.fullThumbnailData.append(thumbnails) + print(self.imgsTracked) + + filenameTest0 = random.randint(0, 10) + filenameTest0 = self.comm.allgather(filenameTest0) + print("TEST 0: ", self.rank, filenameTest0) + #SKETCHING STEP ########################################################################################## - freqDir = FreqDir(start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, + freqDir = FreqDir(comm= self.comm, rank=self.rank, size = self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, det_type=self.det_type, output_dir=self.writeToHere, num_components=self.num_components, alpha=self.alpha, rankAdapt=self.rankAdapt, merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, threshold=self.threshold, 
normalizeIntensity=self.normalizeIntensity, noZeroIntensity=self.noZeroIntensity, - currRun = self.currRun, samplingFactor=self.samplingFactor, priming=self.priming) - print("STARTING SKETCHING") + currRun = self.currRun, samplingFactor=self.samplingFactor, priming=self.priming, imgData = self.fullImgData) + print("STARTING SKETCHING FOR {}".format(self.currRun)) st = time.perf_counter() freqDir.run() localSketchFilename = freqDir.write() et = time.perf_counter() - print("Estimated time for frequent directions rank {0}/{1}: {2}".format(freqDir.rank, freqDir.size, et - st)) + print("Estimated time for frequent directions rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) + + filenameTest1 = random.randint(0, 10) + filenameTest1 = self.comm.allgather(filenameTest1) + print("TEST 1: ", self.rank, filenameTest1) #MERGING STEP ########################################################################################## @@ -1372,37 +1695,44 @@ def runMe(self): allNames = [] for j in range(freqDir.size): allNames.append(fullSketchFilename + str(j) + ".h5") - mergeTree = MergeTree(exp=self.exp, run=self.run, det_type=self.det_type, divBy=self.divBy, readFile = localSketchFilename, + mergeTree = MergeTree(comm=self.comm, rank=self.rank, size=self.size, exp=self.exp, run=self.run, det_type=self.det_type, divBy=self.divBy, readFile = localSketchFilename, output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun) #mergeTree = MergeTree(divBy=2, readFile = localSketchFilename, # dir=writeToHere, allWriteDirecs=allNames, currRun = currRun) - st = time.perf_counter() mergeTree.merge() mergedSketchFilename = mergeTree.write() et = time.perf_counter() - print("Estimated time merge tree for rank {0}/{1}: {2}".format(freqDir.rank, freqDir.size, et - st)) - + print("Estimated time merge tree for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) + filenameTest2 = random.randint(0, 10) + filenameTest2 = self.comm.allgather(filenameTest2) + print("TEST 2: ", self.rank, filenameTest2) #PROJECTION STEP ########################################################################################## - appComp = ApplyCompression(start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, + appComp = ApplyCompression(comm=self.comm, rank = self.rank, size=self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, det_type=self.det_type, readFile = mergedSketchFilename, output_dir = self.writeToHere, batchSize=self.batchSize, threshold=self.threshold, normalizeIntensity=self.normalizeIntensity, noZeroIntensity=self.noZeroIntensity, - downsample=self.downsample, bin_factor=self.bin_factor, currRun = self.currRun) + downsample=self.downsample, bin_factor=self.bin_factor, currRun = self.currRun, imgData = self.fullImgData, thumbnailData = self.fullThumbnailData) st = time.perf_counter() appComp.run() appComp.write() et = time.perf_counter() - print("Estimated time projection for rank {0}/{1}: {2}".format(appComp.rank, appComp.size, et - st)) - + print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) + print("Estimated full processing time for rank {0}/{1}: {2}".format(self.rank, self.size, et - stfull)) + + self.comm.barrier() + self.comm.Barrier() + filenameTest3 = random.randint(0, 10) + filenameTest3 = self.comm.allgather(filenameTest3) + print("TEST 3: ", self.rank, filenameTest3) - etfull = time.perf_counter() - print("Estimated full processing time for rank {0}/{1}: {2}".format(appComp.rank, 
appComp.size, etfull - stfull)) ########################################################################################## - - if freqDir.rank==0: + + + if self.rank==0: + print("here 1") st = time.perf_counter() visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), outputFile="./UMAPVis_{}.html".format(self.currRun), @@ -1410,7 +1740,9 @@ def runMe(self): nprocs=freqDir.size, userGroupings=[], includeABOD=True) + print("here 2") visMe.fullVisualize() + print("here 3") visMe.userSave() et = time.perf_counter() print("UMAP HTML Generation Processing time: {}".format(et - st)) From e75809ca4aeebf0642d0c9d08342e01ab1ae7037 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Fri, 1 Sep 2023 19:32:10 -0700 Subject: [PATCH 25/57] 124 hz officially a achieved. Permission denied and h5 truncated, h5 file signature found error resolved (sleep for a couple seconds and a bunch of allgather statements seems to have fixed the issue. --- btx/processing/freqdir.py | 155 ++++++++++++++++++++------------------ 1 file changed, 83 insertions(+), 72 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 00d520bac..f3c4eb0c4 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -36,7 +36,6 @@ from PIL import Image from io import BytesIO import base64 -import tables from datetime import datetime @@ -172,7 +171,7 @@ def __init__( self.sketch = zeros( (self.m, self.d) ) self.nextZeroRow = 0 self.alpha = alpha - self.mean = None +# self.mean = None self.imgsTracked = [] self.rankAdapt = rankAdapt @@ -325,8 +324,9 @@ def fetch_and_update_model(self, n, currInd): number of images to incorporate """ # img_batch = self.get_formatted_images(n) - img_batch = self.imgData[currInd] -# print("1414oiioqdca", img_batch.shape) + print("a90wjufipoamfoawfa09opi", self.imgData.shape) + img_batch = self.imgData[:, currInd*n:currInd*(n+1)] + print("1414oiioqdca", img_batch.shape) if self.samplingFactor <1: psamp = PrioritySampling(int(n*self.samplingFactor), self.d) @@ -334,13 +334,13 @@ def fetch_and_update_model(self, n, currInd): psamp.update(row) img_batch = np.array(psamp.sketch.get()).T - if self.mean is None: - self.mean = np.mean(img_batch, axis=1) - else: -# self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch.T, axis=0))/( +# if self.mean is None: +# self.mean = np.mean(img_batch, axis=1) +# else: +## self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch.T, axis=0))/( +## self.num_incorporated_images + (img_batch.shape[1])) +# self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch, axis=1, dtype=np.double))/( # self.num_incorporated_images + (img_batch.shape[1])) - self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch, axis=1, dtype=np.double))/( - self.num_incorporated_images + (img_batch.shape[1])) # self.update_model((img_batch.T - self.mean).T) self.update_model(img_batch) @@ -606,11 +606,10 @@ def write(self): filename = self.output_dir + '{}_sketch_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) - hf.create_dataset("mean", data=self.mean) +# hf.create_dataset("mean", data=self.mean) hf.create_dataset("imgsTracked", data=np.array(self.imgsTracked)) hf["sketch"].attrs["numImgsIncorp"] = self.num_incorporated_images - tables.file._open_files.close_all() - print("CREATED FILE: ", filename) + print(self.rank, "CREATED FILE: ", filename) 
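# Editorial sketch, not part of the patch: the truncated-file / "file signature
# not found" errors described in this commit happen when a reader rank opens one
# of the per-rank sketch files before the writing rank has finished flushing it.
# The patches work around this with fixed time.sleep() calls and extra allgather
# round-trips before the reads; a bounded retry on open is another way the
# reading side (e.g. the merge step) could tolerate the race.  `path`,
# `max_tries`, and `delay` below are hypothetical names used only here.
import time
import h5py

def open_sketch_with_retry(path, max_tries=10, delay=1.0):
    # Keep retrying while the file is still missing or only partially written.
    for attempt in range(max_tries):
        try:
            return h5py.File(path, 'r')
        except OSError:
            time.sleep(delay)
    raise OSError("could not open {} after {} tries".format(path, max_tries))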
self.comm.barrier() return filename @@ -647,16 +646,16 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output self.divBy = divBy + time.sleep(5) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] - tables.file._open_files.close_all() self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, priming=False, imgData = None) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) - if self.rank==0: - print("BUFFER SIZES: ", self.buffSizes) +# if self.rank==0: +# print("BUFFER SIZES: ", self.buffSizes) # print(self.data.shape) self.fd.update_model(self.data.T) @@ -711,15 +710,14 @@ def merge(self): for readMe in self.allWriteDirecs: with h5py.File(readMe, 'r') as hf: if self.fullMean is None: - self.fullMean = hf["mean"][:] +# self.fullMean = hf["mean"][:] self.fullNumIncorp = hf["sketch"].attrs["numImgsIncorp"] self.fullImgsTracked = hf["imgsTracked"][:] else: - self.fullMean = (self.fullMean*self.fullNumIncorp + hf["mean"][:])/(self.fullNumIncorp - + hf["sketch"].attrs["numImgsIncorp"]) +# self.fullMean = (self.fullMean*self.fullNumIncorp + hf["mean"][:])/(self.fullNumIncorp +# + hf["sketch"].attrs["numImgsIncorp"]) self.fullNumIncorp += hf["sketch"].attrs["numImgsIncorp"] self.fullImgsTracked = np.vstack((self.fullImgsTracked, hf["imgsTracked"][:])) - tables.file._open_files.close_all() return self.fd.get() else: return @@ -730,14 +728,19 @@ def write(self): """ # print("IMAGES TRACKED: ", self.fullNumIncorp, " ******* ", self.fullImgsTracked) filename = self.output_dir + '{}_merge.h5'.format(self.currRun) + if self.rank==0: - with h5py.File(filename, 'w') as hf: - hf.create_dataset("sketch", data=self.fd.sketch[:self.fd.ell, :]) - hf.create_dataset("mean", data=self.fullMean) - hf["sketch"].attrs["numImgsIncorp"] = self.fullNumIncorp - hf.create_dataset("imgsTracked", data=self.fullImgsTracked) - print("CREATED FILE: ", filename) - tables.file._open_files.close_all() + for ind in range(self.size): + filename2 = filename[:-3] + "_"+str(ind)+".h5" + with h5py.File(filename2, 'w') as hf: + hf.create_dataset("sketch", data=self.fd.sketch[:self.fd.ell, :]) +# hf.create_dataset("mean", data=self.fullMean) + hf["sketch"].attrs["numImgsIncorp"] = self.fullNumIncorp + hf.create_dataset("imgsTracked", data=self.fullImgsTracked) +# print("CREATED FILE: ", filename2) + self.comm.send(filename2, dest=ind, tag=ind) + else: + print("RECEIVED FILE NAME: ", self.comm.recv(source=0, tag=self.rank)) self.comm.barrier() return filename @@ -813,13 +816,15 @@ def __init__( self.num_incorporated_images = 0 - print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile, os.path.isfile(readFile))) - while(not os.path.isfile(readFile)): - print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile, self.rank)) - with h5py.File(readFile, 'r') as hf: + readFile2 = readFile[:-3] + "_"+str(self.rank)+".h5" + +# print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) +# while(not os.path.isfile(readFile2)): +# print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) + time.sleep(5) + with h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] - self.mean = hf["mean"][:] - 
tables.file._open_files.close_all() +# self.mean = hf["mean"][:] U, S, Vt = np.linalg.svd(self.data, full_matrices=False) self.components = Vt @@ -837,10 +842,10 @@ def run(self): """ Retrieve sketch, project images onto new coordinates. Save new coordinates to h5 file. """ - noImgsToProcess = self.num_imgs//self.size +# noImgsToProcess = self.num_imgs//self.size # for currInd, batch in enumerate(range(0,noImgsToProcess,self.batchSize)): - for currInd in range(len(self.imgData)): - self.fetch_and_process_data(currInd) +# for currInd in range(len(self.imgData)): + self.fetch_and_process_data(0) # print("RANK {} IS DONE".format(self.rank)) # self.fetch_and_process_data() @@ -866,8 +871,8 @@ def fetch_and_process_data(self, currInd): # stassemble = time.perf_counter() - img_batch = self.imgData[currInd] - toSave_img_batch = self.thumbnailData[currInd] + img_batch = self.imgData + toSave_img_batch = self.thumbnailData if self.smallImgs is None: self.smallImgs = toSave_img_batch @@ -944,8 +949,7 @@ def write(self): with h5py.File(filename, 'w') as hf: hf.create_dataset("ProjectedData", data=self.processedData) hf.create_dataset("SmallImages", data=self.smallImgs) - tables.file._open_files.close_all() - print("CREATED FILE: ", filename) +# print("CREATED FILE: ", filename) self.comm.barrier() return filename @@ -1155,12 +1159,12 @@ def genLeftRight(self, endClass): return [*range(endClass+1)], [*range(1, endClass+2)] def genUMAP(self): - for dirval in os.listdir(self.inputFile[:-26]): - print("ITEM IN DIRECTORY:", dirval) +# for dirval in os.listdir(self.inputFile[:-26]): +# print("ITEM IN DIRECTORY:", dirval) imgs = None projections = None for currRank in range(self.nprocs): - print("GETTING CURRENT RANK: ", currRank) +# print("GETTING CURRENT RANK: ", currRank) with h5py.File(self.inputFile+"_"+str(currRank)+".h5", 'r') as hf: if imgs is None: imgs = hf["SmallImages"][:] @@ -1168,14 +1172,13 @@ def genUMAP(self): else: imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) - tables.file._open_files.close_all() intensities = [] for img in imgs: intensities.append(np.sum(img.flatten())) intensities = np.array(intensities) - skipMe = 4 + skipMe = 8 self.imgs = imgs[:self.numImgsToUse:skipMe] self.projections = projections[:self.numImgsToUse:skipMe] self.intensities = intensities[:self.numImgsToUse:skipMe] @@ -1199,12 +1202,15 @@ def genUMAP(self): self.opticsClust = OPTICS(min_samples=150, xi=0.05, min_cluster_size=0.05) self.opticsClust.fit(self.clusterable_embedding) - self.opticsLabels = cluster_optics_dbscan( - reachability=self.opticsClust.reachability_, - core_distances=self.opticsClust.core_distances_, - ordering=self.opticsClust.ordering_, - eps=2, - ) +# self.opticsLabels = cluster_optics_dbscan( +# reachability=self.opticsClust.reachability_, +# core_distances=self.opticsClust.core_distances_, +# ordering=self.opticsClust.ordering_, +# eps=2, +# ) + +# self.opticsLabels = self.opticsClust.labels_[self.opticsClust.ordering_] + self.opticsLabels = self.opticsClust.labels_ self.experData_df = pd.DataFrame({'x':self.clusterable_embedding[self.clustered, 0],'y':self.clusterable_embedding[self.clustered, 1]}) self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) @@ -1419,8 +1425,10 @@ def genHTML(self): width = 2000, height = 400 ) - space = np.arange(self.numImgsToUse) - reachability = self.opticsClust.reachability_[self.opticsClust.ordering_] +# space = 
np.arange(self.numImgsToUse) + space = np.arange(self.numImgsToUse)[self.opticsClust.ordering_] +# reachability = self.opticsClust.reachability_[self.opticsClust.ordering_] + reachability = self.opticsClust.reachability_ opticsData_df = pd.DataFrame({'x':space,'y':reachability}) opticsData_df['cluster'] = [str(x) for x in self.opticsNewLabels] @@ -1473,15 +1481,15 @@ def genHTML(self): self.viewResults = column(plot_figure, p, imgsPlot, row(cols, toggl, radio_button_group), reachabilityDiag) def fullVisualize(self): - print("here 4") +# print("here 4") self.genUMAP() - print("here 5") +# print("here 5") self.genABOD() - print("here 6") +# print("here 6") self.genLabels() - print("here 7") +# print("here 7") self.genHTML() - print("here 8") +# print("here 8") def updateLabels(self): self.genLabels() @@ -1602,6 +1610,7 @@ def get_formatted_images(self, startInd, n, includeThumbnails=False): """ self.psi.counter = startInd self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) + print(self.imgsTracked) imgs = self.psi.get_images(n, assemble=False) @@ -1651,18 +1660,20 @@ def runMe(self): #DATA RETRIEVAL STEP ########################################################################################## - self.fullImgData = [] - self.fullThumbnailData = [] - noImgsToProcess = self.num_imgs//self.size - startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - batchSize = int(self.num_components*2//self.samplingFactor) - for batch in range(0, noImgsToProcess, batchSize): - startInd = startingPoint+batch - binned_imgs, thumbnails = self.get_formatted_images(startInd, batchSize, includeThumbnails=True) +# self.fullImgData = [] +# self.fullThumbnailData = [] +# noImgsToProcess = self.num_imgs//self.size +# batchSize = int(self.num_components*2//self.samplingFactor) +# for batch in range(0, noImgsToProcess, batchSize): +# startInd = startingPoint+batch +# binned_imgs, thumbnails = self.get_formatted_images(startInd, batchSize, includeThumbnails=True) # print("aodijwaodijaodij", binned_imgs.shape, thumbnails.shape) - self.fullImgData.append(binned_imgs) - self.fullThumbnailData.append(thumbnails) - print(self.imgsTracked) +# self.fullImgData.append(binned_imgs) +# self.fullThumbnailData.append(thumbnails) +# print(self.imgsTracked) + + startingPoint = self.start_offset + self.num_imgs*self.rank//self.size + self.fullImgData, self.fullThumbnailData = self.get_formatted_images(startingPoint, self.num_imgs//self.size, includeThumbnails=True) filenameTest0 = random.randint(0, 10) filenameTest0 = self.comm.allgather(filenameTest0) @@ -1732,7 +1743,7 @@ def runMe(self): if self.rank==0: - print("here 1") +# print("here 1") st = time.perf_counter() visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), outputFile="./UMAPVis_{}.html".format(self.currRun), @@ -1740,9 +1751,9 @@ def runMe(self): nprocs=freqDir.size, userGroupings=[], includeABOD=True) - print("here 2") +# print("here 2") visMe.fullVisualize() - print("here 3") +# print("here 3") visMe.userSave() et = time.perf_counter() print("UMAP HTML Generation Processing time: {}".format(et - st)) From 697723eb3414f567ea1fb231f9993bbcdc6cec3a Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 3 Sep 2023 12:07:01 -0700 Subject: [PATCH 26/57] Everything working and produces 120 hz processing speed. 
Added modularity to image processing (again), fixed optics labeling bug, added visualization parameters, background color of images is now dbscan cluster color --- btx/processing/freqdir.py | 237 ++++++++++++++++++++++++-------------- 1 file changed, 149 insertions(+), 88 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index f3c4eb0c4..6605df459 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -288,31 +288,6 @@ def elu(self,x): # else: # return np.array(nimg_batch).T - ########################################################################### - - #How to use these functions: call each of them on the image. Append the result if it is not "None" to nimg_batch. - def intensityFunc_threshold(img): - if img is None: - return img - else: - secondQuartile = np.sort(img)[-1]//4 - return (img>secondQuartile)*img - - def intensityFunc_removeZeroIntensity(img, currIntensity): - if currIntensity<50000: - return None - else: - return img - - def intensityFunc_normalizeIntensity(img, currIntensity): - if img is None: - return img - - if currIntensity<50000: - return img - else: - return img/currIntensity - ########################################################################### def fetch_and_update_model(self, n, currInd): """ @@ -646,7 +621,7 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output self.divBy = divBy - time.sleep(5) + time.sleep(30) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] @@ -821,7 +796,7 @@ def __init__( # print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) # while(not os.path.isfile(readFile2)): # print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) - time.sleep(5) + time.sleep(30) with h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] # self.mean = hf["mean"][:] @@ -1031,7 +1006,9 @@ class visualizeFD: """ Visualize FD Dimension Reduction using UMAP and DBSCAN """ - def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, userGroupings): + def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, userGroupings, + skipSize, umap_n_neighbors, umap_random_state, hdbscan_min_samples, hdbscan_min_cluster_size, + optics_min_samples, optics_xi, optics_min_cluster_size): self.inputFile = inputFile self.outputFile = outputFile output_file(filename=outputFile, title="Static HTML file") @@ -1040,6 +1017,14 @@ def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, use self.nprocs = nprocs self.includeABOD = includeABOD self.userGroupings = userGroupings + self.skipSize = skipSize + self.umap_n_neighbors = umap_n_neighbors + self.umap_random_state = umap_random_state + self.hdbscan_min_samples=hdbscan_min_samples + self.hdbscan_min_cluster_size=hdbscan_min_cluster_size + self.optics_min_samples=optics_min_samples + self.optics_xi = optics_xi + self.optics_min_cluster_size = optics_min_cluster_size def embeddable_image(self, data): img_data = np.uint8(cm.jet(data/max(data.flatten()))*255) @@ -1178,29 +1163,29 @@ def genUMAP(self): intensities.append(np.sum(img.flatten())) intensities = np.array(intensities) - skipMe = 8 - self.imgs = imgs[:self.numImgsToUse:skipMe] - self.projections = projections[:self.numImgsToUse:skipMe] - self.intensities = intensities[:self.numImgsToUse:skipMe] - self.numImgsToUse = int(self.numImgsToUse/skipMe) + self.imgs = imgs[:self.numImgsToUse:self.skipSize] + self.projections = 
projections[:self.numImgsToUse:self.skipSize] + self.intensities = intensities[:self.numImgsToUse:self.skipSize] + + self.numImgsToUse = int(self.numImgsToUse/self.skipSize) if len(self.imgs)!= self.numImgsToUse: raise TypeError("NUMBER OF IMAGES REQUESTED ({}) EXCEEDS NUMBER OF DATA POINTS PROVIDED ({})".format(len(self.imgs), self.numImgsToUse)) self.clusterable_embedding = umap.UMAP( - n_neighbors=self.numImgsToUse//40, + n_neighbors=self.umap_n_neighbors, + random_state=self.umap_random_state, n_components=2, - random_state=42 ).fit_transform(self.projections) self.labels = hdbscan.HDBSCAN( - min_samples=int(self.numImgsToUse*0.75//40), - min_cluster_size=int(self.numImgsToUse//40), + min_samples = self.hdbscan_min_samples, + min_cluster_size = self.hdbscan_min_cluster_size ).fit_predict(self.clusterable_embedding) exclusionList = np.array([]) self.clustered = np.isin(self.labels, exclusionList, invert=True) - self.opticsClust = OPTICS(min_samples=150, xi=0.05, min_cluster_size=0.05) + self.opticsClust = OPTICS(min_samples=self.optics_min_samples, xi=self.optics_xi, min_cluster_size=self.optics_min_cluster_size) self.opticsClust.fit(self.clusterable_embedding) # self.opticsLabels = cluster_optics_dbscan( # reachability=self.opticsClust.reachability_, @@ -1214,7 +1199,7 @@ def genUMAP(self): self.experData_df = pd.DataFrame({'x':self.clusterable_embedding[self.clustered, 0],'y':self.clusterable_embedding[self.clustered, 1]}) self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) - self.experData_df['imgind'] = np.arange(self.numImgsToUse) + self.experData_df['imgind'] = np.arange(self.numImgsToUse)*self.skipSize def genABOD(self): if self.includeABOD: @@ -1253,6 +1238,7 @@ def genLabels(self): self.newLabels = np.array(self.relabel_to_closest_zero(newLabels)) self.experData_df['cluster'] = [str(x) for x in self.newLabels[self.clustered]] self.experData_df['ptColor'] = [x for x in self.experData_df['cluster']] + self.experData_df['backgroundColor'] = [Category20[20][x] for x in self.newLabels] medoid_lst = self.genMedoids(self.newLabels, self.clusterable_embedding) self.medoidInds = [x[1] for x in medoid_lst] medoidBold = [] @@ -1284,14 +1270,17 @@ def genHTML(self): width = 2000, height = 600 ) plot_figure.add_tools(HoverTool(tooltips=""" -
-            Cluster #
-            @cluster
-            @imgind
+            Cluster #
+            @cluster
+            Image #
+            @imgind
""")) @@ -1365,13 +1354,14 @@ def genHTML(self): const ptColor = datasource.data.ptColor const anomDet = datasource.data.anomDet const imgind = datasource.data.imgind - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind} + const backgroundColor = datasource.data.backgroundColor + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor} """) cols.js_on_change('value', callback) imgsPlot = figure(width=2000, height=150, toolbar_location=None) - imgsPlot.image(image=[self.imgs[imgind][::-1] for imgind in self.medoidInds], + imgsPlot.image(image=[self.imgs[imgindMe][::-1] for imgindMe in self.medoidInds], x=[0.25+xind for xind in range(len(self.medoidInds))], y=0, dw=0.5, dh=1, @@ -1396,10 +1386,10 @@ def genHTML(self): clearInterval(looop); } else if(slider_val1 >= index[index.length - 1]) { - cb_obj.label = '► Play'; +// cb_obj.label = '► Play'; slider.value = [0, slider_val1-slider_val0]; - cb_obj.active = false; - clearInterval(looop); +// cb_obj.active = false; +// clearInterval(looop); } else if(slider_val1 !== index[index.length - 1]){ slider.value = [index.filter((item) => item > slider_val0)[0], index.filter((item) => item > slider_val1)[0]]; @@ -1425,13 +1415,14 @@ def genHTML(self): width = 2000, height = 400 ) -# space = np.arange(self.numImgsToUse) - space = np.arange(self.numImgsToUse)[self.opticsClust.ordering_] + space = np.arange(self.numImgsToUse) +# space = np.arange(self.numImgsToUse)[self.opticsClust.ordering_] # reachability = self.opticsClust.reachability_[self.opticsClust.ordering_] reachability = self.opticsClust.reachability_ opticsData_df = pd.DataFrame({'x':space,'y':reachability}) - opticsData_df['cluster'] = [str(x) for x in self.opticsNewLabels] + opticsData_df['clusterForScatterPlot'] = [str(x) for x in self.opticsNewLabels] + opticsData_df['cluster'] = [str(x) for x in self.opticsNewLabels[self.opticsClust.ordering_]] opticsData_df['ptColor'] = [x for x in opticsData_df['cluster']] color_mapping2 = CategoricalColorMapper(factors=[str(x) for x in list(set(self.opticsNewLabels))], palette=Category20[20]) @@ -1460,8 +1451,9 @@ def genHTML(self): const cluster = datasource.data.cluster const anomDet = datasource.data.anomDet const imgind = datasource.data.imgind + const backgroundColor = datasource.data.backgroundColor - const opticsClust = opticssource.data.cluster + const opticsClust = opticssource.data.clusterForScatterPlot let ptColor = null @@ -1474,7 +1466,7 @@ def genHTML(self): else{ ptColor = anomDet } - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind} + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor} """) radio_button_group.js_on_change("active", radioGroup_js) @@ -1533,7 +1525,7 @@ class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. 
""" - def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_components, alpha, rankAdapt, downsample, bin_factor, threshold, normalizeIntensity, noZeroIntensity, samplingFactor, priming, divBy, batchSize): + def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_components, alpha, rankAdapt, downsample, bin_factor, threshold, normalizeIntensity, noZeroIntensity, samplingFactor, priming, divBy, batchSize, thresholdQuantile): self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1552,6 +1544,7 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.priming=priming self.divBy = divBy self.batchSize = batchSize + self.thresholdQuantile = thresholdQuantile self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() @@ -1567,6 +1560,8 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.currRun = None self.currRun = self.comm.bcast(self.currRun, root=0) + self.imageProcessor = FD_ImageProcessing(minIntensity=(self.bin_factor**2)*50000, thresholdQuantile=self.thresholdQuantile, eluAlpha=0.01) + def assembleImgsToSave(self, imgs): """ Form the images from psana pixel index map and downsample images. @@ -1617,31 +1612,47 @@ def get_formatted_images(self, startInd, n, includeThumbnails=False): imgs = imgs[ [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] ] - num_valid_imgs, p, x, y = imgs.shape + if len(imgs.shape)==4: + num_valid_imgs, p, x, y = imgs.shape + else: + p = 1 + num_valid_imgs, x, y = imgs.shape img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T img_batch[img_batch<0] = 0 nimg_batch = [] for img in img_batch.T: - if self.threshold: - secondQuartile = np.quantile(img, 0.93) - nimg = (img>secondQuartile)*img -# elu_v = np.vectorize(self.elu) -# nimg = elu_v(img-secondQuartile)+secondQuartile - else: - nimg = img - + nimg = img currIntensity = np.sum(nimg.flatten(), dtype=np.double) -# print("RANK: {} ***** INTENSITY: {}".format(self.rank, currIntensity)) - if self.noZeroIntensity and currIntensity< (self.bin_factor**2) * 50000: - continue - else: - if currIntensity>=(self.bin_factor**2) * 50000 and self.normalizeIntensity: -# if not self.normalizeIntensity: - nimg_batch.append(nimg/currIntensity) - else: -# nimg_batch.append(nimg) - nimg_batch.append(np.zeros(nimg.shape)) + if self.threshold: + nimg = self.imageProcessor.threshold(nimg) + if self.noZeroIntensity: + nimg = self.imageProcessor.removeZeroIntensity(nimg, currIntensity) + if self.normalizeIntensity: + nimg = self.imageProcessor.normalizeIntensity(nimg, currIntensity) + if nimg is not None: + nimg_batch.append(nimg) nimg_batch = np.array(nimg_batch) +# self.imageProcessor.normalizeIntensity(self.imageProcessor(removeZeroIntensity(self.imageProcessor.threshold(img)) +# if self.threshold: +# secondQuartile = np.quantile(img, self.thresholdQuantile) +# nimg = (img>secondQuartile)*img +## elu_v = np.vectorize(self.elu) +## nimg = elu_v(img-secondQuartile)+secondQuartile +# else: +# nimg = img +# +# currIntensity = np.sum(nimg.flatten(), dtype=np.double) +## print("RANK: {} ***** INTENSITY: {}".format(self.rank, currIntensity)) +# if self.noZeroIntensity and currIntensity< (self.bin_factor**2) * 50000: +# continue +# else: +# if currIntensity>=(self.bin_factor**2) * 50000 and self.normalizeIntensity: +## if not self.normalizeIntensity: +# nimg_batch.append(nimg/currIntensity) +# else: +## nimg_batch.append(nimg) +# nimg_batch.append(np.zeros(nimg.shape)) +# 
nimg_batch = np.array(nimg_batch) if self.downsample: binned_imgs = bin_data(np.reshape(nimg_batch,(num_valid_imgs, p, x, y)), self.bin_factor) binned_num_valid_imgs, binned_p, binned_x, binned_y = binned_imgs.shape @@ -1675,9 +1686,9 @@ def runMe(self): startingPoint = self.start_offset + self.num_imgs*self.rank//self.size self.fullImgData, self.fullThumbnailData = self.get_formatted_images(startingPoint, self.num_imgs//self.size, includeThumbnails=True) - filenameTest0 = random.randint(0, 10) - filenameTest0 = self.comm.allgather(filenameTest0) - print("TEST 0: ", self.rank, filenameTest0) +# filenameTest0 = random.randint(0, 10) +# filenameTest0 = self.comm.allgather(filenameTest0) +# print("TEST 0: ", self.rank, filenameTest0) #SKETCHING STEP ########################################################################################## @@ -1693,9 +1704,9 @@ def runMe(self): et = time.perf_counter() print("Estimated time for frequent directions rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) - filenameTest1 = random.randint(0, 10) - filenameTest1 = self.comm.allgather(filenameTest1) - print("TEST 1: ", self.rank, filenameTest1) +# filenameTest1 = random.randint(0, 10) +# filenameTest1 = self.comm.allgather(filenameTest1) +# print("TEST 1: ", self.rank, filenameTest1) #MERGING STEP ########################################################################################## @@ -1716,9 +1727,9 @@ def runMe(self): et = time.perf_counter() print("Estimated time merge tree for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) - filenameTest2 = random.randint(0, 10) - filenameTest2 = self.comm.allgather(filenameTest2) - print("TEST 2: ", self.rank, filenameTest2) +# filenameTest2 = random.randint(0, 10) +# filenameTest2 = self.comm.allgather(filenameTest2) +# print("TEST 2: ", self.rank, filenameTest2) #PROJECTION STEP ########################################################################################## @@ -1735,9 +1746,9 @@ def runMe(self): self.comm.barrier() self.comm.Barrier() - filenameTest3 = random.randint(0, 10) - filenameTest3 = self.comm.allgather(filenameTest3) - print("TEST 3: ", self.rank, filenameTest3) +# filenameTest3 = random.randint(0, 10) +# filenameTest3 = self.comm.allgather(filenameTest3) +# print("TEST 3: ", self.rank, filenameTest3) ########################################################################################## @@ -1745,12 +1756,21 @@ def runMe(self): if self.rank==0: # print("here 1") st = time.perf_counter() + + skipSize = 8 + numImgsToUse = int(self.num_imgs/skipSize) visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), outputFile="./UMAPVis_{}.html".format(self.currRun), numImgsToUse=self.num_imgs, - nprocs=freqDir.size, + nprocs=self.size, userGroupings=[], - includeABOD=True) + includeABOD=True, + skipSize = skipSize, + umap_n_neighbors=numImgsToUse//40, + umap_random_state=42, + hdbscan_min_samples=int(numImgsToUse*0.75//40), + hdbscan_min_cluster_size=int(numImgsToUse//40), + optics_min_samples=150, optics_xi = 0.05, optics_min_cluster_size = 0.05) # print("here 2") visMe.fullVisualize() # print("here 3") @@ -1759,4 +1779,45 @@ def runMe(self): print("UMAP HTML Generation Processing time: {}".format(et - st)) print("TOTAL PROCESING TIME: {}".format(et - stfull)) +class FD_ImageProcessing: + #How to use these functions: call each of them on the image. Append the result if it is not "None" to nimg_batch. 
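    # A minimal usage sketch for the chain described above (editorial example,
    # not part of the patch; `proc` and `img_batch` are hypothetical names and
    # the constructor arguments are placeholder values).  Each helper either
    # returns a processed image or None, so the batch is rebuilt by appending
    # only the non-None results:
    #
    #     proc = FD_ImageProcessing(minIntensity=50000, thresholdQuantile=0.93, eluAlpha=0.01)
    #     nimg_batch = []
    #     for img in img_batch.T:
    #         currIntensity = np.sum(img.flatten(), dtype=np.double)
    #         nimg = proc.threshold(img)
    #         nimg = proc.removeZeroIntensity(nimg, currIntensity)
    #         nimg = proc.normalizeIntensity(nimg, currIntensity)
    #         if nimg is not None:
    #             nimg_batch.append(nimg)
    #     nimg_batch = np.array(nimg_batch)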
+ def __init__(self, minIntensity, thresholdQuantile, eluAlpha): + self.minIntensity = minIntensity + self.thresholdQuantile = thresholdQuantile + self.eluAlpha = eluAlpha + + def elu(self,x): + if x > 0: + return x + else: + return self.eluAlpha*(math.exp(x)-1) + + def eluThreshold(self, img): + if img is None: + return img + else: + elu_v = np.vectorize(self.elu) + secondQuartile = np.quantile(img, self.thresholdQuantile) + return(elu_v(img-secondQuartile)+secondQuartile) + + + def threshold(self, img): + if img is None: + return img + else: + secondQuartile = np.quantile(img, self.thresholdQuantile) + return (img>secondQuartile)*img + def removeZeroIntensity(self, img, currIntensity): + if currIntensity Date: Sun, 3 Sep 2023 16:21:51 -0700 Subject: [PATCH 27/57] Fixed UMAP html --- btx/processing/freqdir.py | 93 +++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 42 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 6605df459..7ebb67140 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1215,6 +1215,7 @@ def genABOD(self): else: outlierLabels.append(str(0)) self.experData_df['anomDet'] = outlierLabels + self.experData_df['anom_backgroundColor'] = [Category20[20][int(x)] for x in outlierLabels] def setUserGroupings(self, userGroupings): """ @@ -1238,6 +1239,7 @@ def genLabels(self): self.newLabels = np.array(self.relabel_to_closest_zero(newLabels)) self.experData_df['cluster'] = [str(x) for x in self.newLabels[self.clustered]] self.experData_df['ptColor'] = [x for x in self.experData_df['cluster']] + self.experData_df['dbscan_backgroundColor'] = [Category20[20][x] for x in self.newLabels] self.experData_df['backgroundColor'] = [Category20[20][x] for x in self.newLabels] medoid_lst = self.genMedoids(self.newLabels, self.clusterable_embedding) self.medoidInds = [x[1] for x in medoid_lst] @@ -1260,6 +1262,7 @@ def genLabels(self): opticsNewLabels.append(j) opticsNewLabels = list(np.array(opticsNewLabels) + 1) self.opticsNewLabels = np.array(self.relabel_to_closest_zero(opticsNewLabels)) + self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels[self.opticsClust.ordering_]] def genHTML(self): datasource = ColumnDataSource(self.experData_df) @@ -1270,17 +1273,19 @@ def genHTML(self): width = 2000, height = 600 ) plot_figure.add_tools(HoverTool(tooltips=""" -
-
- +
+
+
-
- Cluster # - @cluster -
-
- Image # - @imgind +
+
+ Cluster + @cluster +
+
+ Image + @imgind +
""")) @@ -1355,7 +1360,10 @@ def genHTML(self): const anomDet = datasource.data.anomDet const imgind = datasource.data.imgind const backgroundColor = datasource.data.backgroundColor - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor} + const dbscan_backgroundColor = datasource.data.dbscan_backgroundColor + const anom_backgroundColor = datasource.data.anom_backgroundColor + const optics_backgroundColor = datasource.data.optics_backgroundColor + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor, dbscan_backgroundColor, anom_backgroundColor, optics_backgroundColor} """) cols.js_on_change('value', callback) @@ -1414,12 +1422,8 @@ def genHTML(self): tools=('pan, wheel_zoom, reset'), width = 2000, height = 400 ) - space = np.arange(self.numImgsToUse) -# space = np.arange(self.numImgsToUse)[self.opticsClust.ordering_] -# reachability = self.opticsClust.reachability_[self.opticsClust.ordering_] reachability = self.opticsClust.reachability_ - opticsData_df = pd.DataFrame({'x':space,'y':reachability}) opticsData_df['clusterForScatterPlot'] = [str(x) for x in self.opticsNewLabels] opticsData_df['cluster'] = [str(x) for x in self.opticsNewLabels[self.opticsClust.ordering_]] @@ -1427,7 +1431,6 @@ def genHTML(self): color_mapping2 = CategoricalColorMapper(factors=[str(x) for x in list(set(self.opticsNewLabels))], palette=Category20[20]) opticssource = ColumnDataSource(opticsData_df) - reachabilityDiag.circle( 'x', 'y', @@ -1451,22 +1454,28 @@ def genHTML(self): const cluster = datasource.data.cluster const anomDet = datasource.data.anomDet const imgind = datasource.data.imgind - const backgroundColor = datasource.data.backgroundColor + const dbscan_backgroundColor = datasource.data.dbscan_backgroundColor + const anom_backgroundColor = datasource.data.anom_backgroundColor + const optics_backgroundColor = datasource.data.optics_backgroundColor const opticsClust = opticssource.data.clusterForScatterPlot let ptColor = null + let backgroundColor = null if (cb_obj.active==0){ ptColor = cluster + backgroundColor = dbscan_backgroundColor } else if (cb_obj.active==1){ ptColor = opticsClust + backgroundColor = optics_backgroundColor } else{ ptColor = anomDet + backgroundColor = anom_backgroundColor } - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor} + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor, dbscan_backgroundColor, anom_backgroundColor, optics_backgroundColor} """) radio_button_group.js_on_change("active", radioGroup_js) @@ -1753,31 +1762,31 @@ def runMe(self): ########################################################################################## - if self.rank==0: +# if self.rank==0: # print("here 1") - st = time.perf_counter() - - skipSize = 8 - numImgsToUse = int(self.num_imgs/skipSize) - visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), - outputFile="./UMAPVis_{}.html".format(self.currRun), - numImgsToUse=self.num_imgs, - nprocs=self.size, - userGroupings=[], - includeABOD=True, - skipSize = skipSize, - umap_n_neighbors=numImgsToUse//40, - umap_random_state=42, - hdbscan_min_samples=int(numImgsToUse*0.75//40), - hdbscan_min_cluster_size=int(numImgsToUse//40), - optics_min_samples=150, optics_xi = 0.05, optics_min_cluster_size = 0.05) -# print("here 2") - visMe.fullVisualize() -# print("here 3") - visMe.userSave() - et = 
time.perf_counter() - print("UMAP HTML Generation Processing time: {}".format(et - st)) - print("TOTAL PROCESING TIME: {}".format(et - stfull)) +# st = time.perf_counter() +# +# skipSize = 8 +# numImgsToUse = int(self.num_imgs/skipSize) +# visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), +# outputFile="./UMAPVis_{}.html".format(self.currRun), +# numImgsToUse=self.num_imgs, +# nprocs=self.size, +# userGroupings=[], +# includeABOD=True, +# skipSize = skipSize, +# umap_n_neighbors=numImgsToUse//40, +# umap_random_state=42, +# hdbscan_min_samples=int(numImgsToUse*0.75//40), +# hdbscan_min_cluster_size=int(numImgsToUse//40), +# optics_min_samples=150, optics_xi = 0.05, optics_min_cluster_size = 0.05) +## print("here 2") +# visMe.fullVisualize() +## print("here 3") +# visMe.userSave() +# et = time.perf_counter() +# print("UMAP HTML Generation Processing time: {}".format(et - st)) +# print("TOTAL PROCESING TIME: {}".format(et - stfull)) class FD_ImageProcessing: #How to use these functions: call each of them on the image. Append the result if it is not "None" to nimg_batch. From 4692ae352b55d8abbea9ad0467da5428937bc5ee Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 3 Sep 2023 21:07:26 -0700 Subject: [PATCH 28/57] Fixed img range processed tracking and counting bug. Also moved data loading into its own class. --- btx/processing/freqdir.py | 318 +++++++++++++++++++++++++------------- 1 file changed, 210 insertions(+), 108 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 7ebb67140..a9e985e89 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -129,6 +129,7 @@ def __init__( output_dir, currRun, imgData, + imgsTracked, alpha=0, rankAdapt=False, merger=False, @@ -152,8 +153,6 @@ def __init__( self.rank= rank self.size = size - self.psi.counter = start_offset + self.num_images*self.rank//self.size - self.currRun = currRun self.output_dir = output_dir @@ -172,7 +171,7 @@ def __init__( self.nextZeroRow = 0 self.alpha = alpha # self.mean = None - self.imgsTracked = [] + self.imgsTracked = imgsTracked self.rankAdapt = rankAdapt self.increaseEll = False @@ -625,7 +624,7 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] - self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, priming=False, imgData = None) + self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, priming=False, imgData = None, imgsTracked=None) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) @@ -684,8 +683,9 @@ def merge(self): fullLen = len(self.allWriteDirecs) for readMe in self.allWriteDirecs: with h5py.File(readMe, 'r') as hf: - if self.fullMean is None: +# if self.fullMean is None: # self.fullMean = hf["mean"][:] + if self.fullNumIncorp==0: self.fullNumIncorp = hf["sketch"].attrs["numImgsIncorp"] self.fullImgsTracked = hf["imgsTracked"][:] else: @@ -714,6 +714,7 @@ def 
write(self): hf.create_dataset("imgsTracked", data=self.fullImgsTracked) # print("CREATED FILE: ", filename2) self.comm.send(filename2, dest=ind, tag=ind) + print("aodiwjaomwdklmduhi22adjdqoi2jd", self.fullImgsTracked) else: print("RECEIVED FILE NAME: ", self.comm.recv(source=0, tag=self.rank)) self.comm.barrier() @@ -1273,16 +1274,16 @@ def genHTML(self): width = 2000, height = 600 ) plot_figure.add_tools(HoverTool(tooltips=""" -
             Cluster
             @cluster
             Image
             @imgind
@@ -1571,108 +1572,110 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.imageProcessor = FD_ImageProcessing(minIntensity=(self.bin_factor**2)*50000, thresholdQuantile=self.thresholdQuantile, eluAlpha=0.01) - def assembleImgsToSave(self, imgs): - """ - Form the images from psana pixel index map and downsample images. - - Parameters - ---------- - imgs: ndarray - images to downsample - """ - pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) - - saveMe = [] - for img in imgs: - imgRe = np.reshape(img, self.psi.det.shape()) - imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) - saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) - return np.array(saveMe) -# imgsRe = np.reshape(imgs.T, (imgs.shape[1], -# self.imgGrabber.psi.det.shape()[0], -# self.imgGrabber.psi.det.shape()[1], -# self.imgGrabber.psi.det.shape()[2])) -# return assemble_image_stack_batch(imgsRe, pixel_index_map) - - def get_formatted_images(self, startInd, n, includeThumbnails=False): - """ - Fetch n - x image segments from run, where x is the number of 'dead' images. - - Parameters - ---------- - n : int - number of images to retrieve - start_index : int - start index of subsection of data to retrieve - end_index : int - end index of subsection of data to retrieve - - Returns - ------- - ndarray, shape (end_index-start_index, n-x) - n-x retrieved image segments of dimension end_index-start_index - """ - self.psi.counter = startInd - self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) - print(self.imgsTracked) + self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, start_offset=start_offset, num_imgs=num_imgs, threshold=threshold, noZeroIntensity=noZeroIntensity, normalizeIntensity=normalizeIntensity, downsample=downsample, bin_factor=bin_factor, thresholdQuantile=thresholdQuantile) - imgs = self.psi.get_images(n, assemble=False) - - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] - if len(imgs.shape)==4: - num_valid_imgs, p, x, y = imgs.shape - else: - p = 1 - num_valid_imgs, x, y = imgs.shape - img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - img_batch[img_batch<0] = 0 - nimg_batch = [] - for img in img_batch.T: - nimg = img - currIntensity = np.sum(nimg.flatten(), dtype=np.double) - if self.threshold: - nimg = self.imageProcessor.threshold(nimg) - if self.noZeroIntensity: - nimg = self.imageProcessor.removeZeroIntensity(nimg, currIntensity) - if self.normalizeIntensity: - nimg = self.imageProcessor.normalizeIntensity(nimg, currIntensity) - if nimg is not None: - nimg_batch.append(nimg) - nimg_batch = np.array(nimg_batch) -# self.imageProcessor.normalizeIntensity(self.imageProcessor(removeZeroIntensity(self.imageProcessor.threshold(img)) -# if self.threshold: -# secondQuartile = np.quantile(img, self.thresholdQuantile) -# nimg = (img>secondQuartile)*img -## elu_v = np.vectorize(self.elu) -## nimg = elu_v(img-secondQuartile)+secondQuartile -# else: -# nimg = img +# def assembleImgsToSave(self, imgs): +# """ +# Form the images from psana pixel index map and downsample images. 
+# +# Parameters +# ---------- +# imgs: ndarray +# images to downsample +# """ +# pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) # +# saveMe = [] +# for img in imgs: +# imgRe = np.reshape(img, self.psi.det.shape()) +# imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) +# saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) +# return np.array(saveMe) +## imgsRe = np.reshape(imgs.T, (imgs.shape[1], +## self.imgGrabber.psi.det.shape()[0], +## self.imgGrabber.psi.det.shape()[1], +## self.imgGrabber.psi.det.shape()[2])) +## return assemble_image_stack_batch(imgsRe, pixel_index_map) +# +# def get_formatted_images(self, startInd, n, includeThumbnails=False): +# """ +# Fetch n - x image segments from run, where x is the number of 'dead' images. +# +# Parameters +# ---------- +# n : int +# number of images to retrieve +# start_index : int +# start index of subsection of data to retrieve +# end_index : int +# end index of subsection of data to retrieve +# +# Returns +# ------- +# ndarray, shape (end_index-start_index, n-x) +# n-x retrieved image segments of dimension end_index-start_index +# """ +# self.psi.counter = startInd +# self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) +# print(self.imgsTracked) +# +# imgs = self.psi.get_images(n, assemble=False) +# +# imgs = imgs[ +# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] +# ] +# if len(imgs.shape)==4: +# num_valid_imgs, p, x, y = imgs.shape +# else: +# p = 1 +# num_valid_imgs, x, y = imgs.shape +# img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T +# img_batch[img_batch<0] = 0 +# nimg_batch = [] +# for img in img_batch.T: +# nimg = img # currIntensity = np.sum(nimg.flatten(), dtype=np.double) -## print("RANK: {} ***** INTENSITY: {}".format(self.rank, currIntensity)) -# if self.noZeroIntensity and currIntensity< (self.bin_factor**2) * 50000: -# continue -# else: -# if currIntensity>=(self.bin_factor**2) * 50000 and self.normalizeIntensity: -## if not self.normalizeIntensity: -# nimg_batch.append(nimg/currIntensity) -# else: -## nimg_batch.append(nimg) -# nimg_batch.append(np.zeros(nimg.shape)) +# if self.threshold: +# nimg = self.imageProcessor.threshold(nimg) +# if self.noZeroIntensity: +# nimg = self.imageProcessor.removeZeroIntensity(nimg, currIntensity) +# if self.normalizeIntensity: +# nimg = self.imageProcessor.normalizeIntensity(nimg, currIntensity) +# if nimg is not None: +# nimg_batch.append(nimg) # nimg_batch = np.array(nimg_batch) - if self.downsample: - binned_imgs = bin_data(np.reshape(nimg_batch,(num_valid_imgs, p, x, y)), self.bin_factor) - binned_num_valid_imgs, binned_p, binned_x, binned_y = binned_imgs.shape - binned_imgs = np.reshape(binned_imgs, (binned_num_valid_imgs, binned_p * binned_x * binned_y)).T -# print(binned_imgs.shape) - else: - binned_imgs = nimg_batch.T - if includeThumbnails: - return (binned_imgs, self.assembleImgsToSave(np.reshape(nimg_batch, (num_valid_imgs, p, x, y)))) - else: - return binned_imgs +## self.imageProcessor.normalizeIntensity(self.imageProcessor(removeZeroIntensity(self.imageProcessor.threshold(img)) +## if self.threshold: +## secondQuartile = np.quantile(img, self.thresholdQuantile) +## nimg = (img>secondQuartile)*img +### elu_v = np.vectorize(self.elu) +### nimg = elu_v(img-secondQuartile)+secondQuartile +## else: +## nimg = img +## +## currIntensity = np.sum(nimg.flatten(), dtype=np.double) +### print("RANK: {} ***** INTENSITY: {}".format(self.rank, currIntensity)) +## if 
self.noZeroIntensity and currIntensity< (self.bin_factor**2) * 50000: +## continue +## else: +## if currIntensity>=(self.bin_factor**2) * 50000 and self.normalizeIntensity: +### if not self.normalizeIntensity: +## nimg_batch.append(nimg/currIntensity) +## else: +### nimg_batch.append(nimg) +## nimg_batch.append(np.zeros(nimg.shape)) +## nimg_batch = np.array(nimg_batch) +# if self.downsample: +# binned_imgs = bin_data(np.reshape(nimg_batch,(num_valid_imgs, p, x, y)), self.bin_factor) +# binned_num_valid_imgs, binned_p, binned_x, binned_y = binned_imgs.shape +# binned_imgs = np.reshape(binned_imgs, (binned_num_valid_imgs, binned_p * binned_x * binned_y)).T +## print(binned_imgs.shape) +# else: +# binned_imgs = nimg_batch.T +# if includeThumbnails: +# return (binned_imgs, self.assembleImgsToSave(np.reshape(nimg_batch, (num_valid_imgs, p, x, y)))) +# else: +# return binned_imgs @profile(filename="fullFD_profile") def runMe(self): @@ -1693,7 +1696,7 @@ def runMe(self): # print(self.imgsTracked) startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - self.fullImgData, self.fullThumbnailData = self.get_formatted_images(startingPoint, self.num_imgs//self.size, includeThumbnails=True) + self.fullImgData, self.fullThumbnailData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, includeThumbnails=True) # filenameTest0 = random.randint(0, 10) # filenameTest0 = self.comm.allgather(filenameTest0) @@ -1705,7 +1708,7 @@ def runMe(self): det_type=self.det_type, output_dir=self.writeToHere, num_components=self.num_components, alpha=self.alpha, rankAdapt=self.rankAdapt, merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, threshold=self.threshold, normalizeIntensity=self.normalizeIntensity, noZeroIntensity=self.noZeroIntensity, - currRun = self.currRun, samplingFactor=self.samplingFactor, priming=self.priming, imgData = self.fullImgData) + currRun = self.currRun, samplingFactor=self.samplingFactor, priming=self.priming, imgData = self.fullImgData, imgsTracked = self.imgsTracked) print("STARTING SKETCHING FOR {}".format(self.currRun)) st = time.perf_counter() freqDir.run() @@ -1830,3 +1833,102 @@ def normalizeIntensity(self, img, currIntensity): return np.zeros(img.shape) else: return img/currIntensity + + +class DataRetriever: + def __init__(self, exp, det_type, run, start_offset, num_imgs, threshold, noZeroIntensity, normalizeIntensity, downsample, bin_factor, thresholdQuantile): + self.exp = exp + self.det_type = det_type + self.run = run + self.start_offset = start_offset + self.num_imgs = num_imgs + self.threshold = threshold + self.noZeroIntensity = noZeroIntensity + self.normalizeIntensity = normalizeIntensity + self.downsample = downsample + self.bin_factor = bin_factor + self.thresholdQuantile = thresholdQuantile + self.imgsTracked = [] + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = self.start_offset + + self.imageProcessor = FD_ImageProcessing(minIntensity=(self.bin_factor**2)*50000, thresholdQuantile=self.thresholdQuantile, eluAlpha=0.01) + + + def assembleImgsToSave(self, imgs): + """ + Form the images from psana pixel index map and downsample images. 
+ + Parameters + ---------- + imgs: ndarray + images to downsample + """ + pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) + + saveMe = [] + for img in imgs: + imgRe = np.reshape(img, self.psi.det.shape()) + imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) + saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) + return np.array(saveMe) + + def get_formatted_images(self, startInd, n, includeThumbnails=False): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. + + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + self.psi.counter = startInd + self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) + print(self.imgsTracked) + + imgs = self.psi.get_images(n, assemble=False) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + if len(imgs.shape)==4: + num_valid_imgs, p, x, y = imgs.shape + else: + p = 1 + num_valid_imgs, x, y = imgs.shape + img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + img_batch[img_batch<0] = 0 + nimg_batch = [] + for img in img_batch.T: + nimg = img + currIntensity = np.sum(nimg.flatten(), dtype=np.double) + if self.threshold: + nimg = self.imageProcessor.threshold(nimg) + if self.noZeroIntensity: + nimg = self.imageProcessor.removeZeroIntensity(nimg, currIntensity) + if self.normalizeIntensity: + nimg = self.imageProcessor.normalizeIntensity(nimg, currIntensity) + if nimg is not None: + nimg_batch.append(nimg) + nimg_batch = np.array(nimg_batch) + if self.downsample: + binned_imgs = bin_data(np.reshape(nimg_batch,(num_valid_imgs, p, x, y)), self.bin_factor) + binned_num_valid_imgs, binned_p, binned_x, binned_y = binned_imgs.shape + binned_imgs = np.reshape(binned_imgs, (binned_num_valid_imgs, binned_p * binned_x * binned_y)).T +# print(binned_imgs.shape) + else: + binned_imgs = nimg_batch.T + if includeThumbnails: + return (binned_imgs, self.assembleImgsToSave(np.reshape(nimg_batch, (num_valid_imgs, p, x, y))), self.imgsTracked) + else: + return (binned_imgs, self.imgsTracked) From aa1b561b6c479b91cc48f86a0ef049ef1249eebc Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 3 Sep 2023 22:14:29 -0700 Subject: [PATCH 29/57] Cleaned up code. Produces 130hz with nice clustering. --- btx/processing/freqdir.py | 436 ++++---------------------------------- 1 file changed, 43 insertions(+), 393 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index a9e985e89..71a0bd6d0 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -188,106 +188,10 @@ def run(self): Perform frequent directions matrix sketching on run subject to initialization parameters. """ - noImgsToProcess = self.num_images//self.size for currInd, batch in enumerate(range(0,noImgsToProcess,int(self.ell*2//self.samplingFactor))): self.fetch_and_update_model(int(self.ell*2//self.samplingFactor), currInd) - def elu(self,x): - if x > 0: - return x - else: - return 0.01*(math.exp(x)-1) - -# def get_formatted_images(self, n, includeUnformatted=False): -# """ -# Fetch n - x image segments from run, where x is the number of 'dead' images. 
-# -# Parameters -# ---------- -# n : int -# number of images to retrieve -# start_index : int -# start index of subsection of data to retrieve -# end_index : int -# end index of subsection of data to retrieve -# -# Returns -# ------- -# ndarray, shape (end_index-start_index, n-x) -# n-x retrieved image segments of dimension end_index-start_index -# """ -# self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) -# # may have to rewrite eventually when number of images becomes large, -# # i.e. streamed setting, either that or downsample aggressively -# imgs = self.psi.get_images(n, assemble=False) -# -# if includeUnformatted: -# imgsCopy = imgs.copy() -# imgsCopy = imgsCopy[ -# [i for i in range(imgsCopy.shape[0]) if not np.isnan(imgsCopy[i : i + 1]).any()] -# ] -# num_valid_imgsCopy, p, x, y = imgsCopy.shape -# img_batchCopy = np.reshape(imgsCopy, (num_valid_imgsCopy, p * x * y)).T -# img_batchCopy[img_batchCopy<0] = 0 -# nimg_batchCopy = [] -# for img in img_batchCopy.T: -# if self.threshold: -# # secondQuartile = np.sort(img)[-1]//4 -# # secondQuartile = np.mean(img) -# # secondQuartile = np.median(img) -# # secondQuartile = np.partition(img, -len(img)//4)[-len(img)//4] -# secondQuartile = np.quantile(img, 0.93) -# nimg = (img>secondQuartile)*img -# # elu_v = np.vectorize(self.elu) -# # nimg = elu_v(img-secondQuartile)+secondQuartile -# else: -# nimg = img -# currIntensity = np.sum(nimg.flatten(), dtype=np.double) -# if self.noZeroIntensity and currIntensity<50000: -# continue -# else: -# if currIntensity>=50000 and self.normalizeIntensity: -# nimg_batchCopy.append(nimg/currIntensity) -# else: -# nimg_batchCopy.append(nimg) -# -# if self.downsample: -# imgs = bin_data(imgs, self.bin_factor) -# imgs = imgs[ -# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] -# ] -# num_valid_imgs, p, x, y = imgs.shape -# img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T -# img_batch[img_batch<0] = 0 -# nimg_batch = [] -# for img in img_batch.T: -# if self.threshold: -## secondQuartile = np.sort(img)[-1]//4 -## secondQuartile = np.mean(img) -## secondQuartile = np.median(img) -## secondQuartile = np.partition(img, -len(img)//4)[-len(img)//4] -# secondQuartile = np.quantile(img, 0.93) -# nimg = (img>secondQuartile)*img -## elu_v = np.vectorize(self.elu) -## nimg = elu_v(img-secondQuartile)+secondQuartile -# else: -# nimg = img -# -# currIntensity = np.sum(nimg.flatten(), dtype=np.double) -# if self.noZeroIntensity and currIntensity<50000: -# continue -# else: -# if currIntensity>=50000 and self.normalizeIntensity: -# nimg_batch.append(nimg/currIntensity) -# else: -# nimg_batch.append(nimg) -# if includeUnformatted: -# return (np.array(nimg_batch).T, np.array(nimg_batchCopy).T) -# else: -# return np.array(nimg_batch).T - - def fetch_and_update_model(self, n, currInd): """ Fetch images and update model. 
@@ -297,10 +201,7 @@ def fetch_and_update_model(self, n, currInd): n : int number of images to incorporate """ -# img_batch = self.get_formatted_images(n) - print("a90wjufipoamfoawfa09opi", self.imgData.shape) img_batch = self.imgData[:, currInd*n:currInd*(n+1)] - print("1414oiioqdca", img_batch.shape) if self.samplingFactor <1: psamp = PrioritySampling(int(n*self.samplingFactor), self.d) @@ -620,7 +521,7 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output self.divBy = divBy - time.sleep(30) + time.sleep(10) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] @@ -628,10 +529,7 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) -# if self.rank==0: -# print("BUFFER SIZES: ", self.buffSizes) -# print(self.data.shape) self.fd.update_model(self.data.T) self.output_dir = output_dir @@ -672,9 +570,8 @@ def merge(self): bufferMe = np.empty(self.buffSizes[proc] * self.data.shape[1], dtype=np.double) self.comm.Recv(bufferMe, source=proc, tag=17) bufferMe = np.reshape(bufferMe, (self.buffSizes[proc], self.data.shape[1])) -# print("BUFFERME SHAPE", bufferMe.shape) -# self.fd.update_model(np.hstack((bufferMe.T, np.zeros((bufferMe.shape[1]))))) - self.fd.update_model(bufferMe.T) + self.fd.update_model(np.hstack((bufferMe.T, np.zeros((bufferMe.shape[1],1))))) +# self.fd.update_model(bufferMe.T) else: bufferMe = self.fd.get().copy().flatten() self.comm.Send(bufferMe, dest=root, tag=17) @@ -701,7 +598,6 @@ def write(self): """ Write merged matrix sketch to h5 file """ -# print("IMAGES TRACKED: ", self.fullNumIncorp, " ******* ", self.fullImgsTracked) filename = self.output_dir + '{}_merge.h5'.format(self.currRun) if self.rank==0: @@ -712,11 +608,9 @@ def write(self): # hf.create_dataset("mean", data=self.fullMean) hf["sketch"].attrs["numImgsIncorp"] = self.fullNumIncorp hf.create_dataset("imgsTracked", data=self.fullImgsTracked) -# print("CREATED FILE: ", filename2) self.comm.send(filename2, dest=ind, tag=ind) - print("aodiwjaomwdklmduhi22adjdqoi2jd", self.fullImgsTracked) else: - print("RECEIVED FILE NAME: ", self.comm.recv(source=0, tag=self.rank)) + print("{} RECEIVED FILE NAME: {}".format(self.rank, self.comm.recv(source=0, tag=self.rank))) self.comm.barrier() return filename @@ -782,14 +676,6 @@ def __init__( self.currRun = currRun -# self.imgGrabber = FreqDir(comm=comm, rank=rank, size=size, start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, -# exp=exp,run=run,det_type=det_type,output_dir="", downsample=downsample, bin_factor=bin_factor, -# threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False, imgData = None) -# self.grabberToSaveImages = FreqDir(comm=comm, rank=rank, size=size, start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, -# exp=exp,run=run,det_type=det_type,output_dir="", downsample=False, bin_factor=0, -# threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False, imgData = None) -# self.batchSize = batchSize - self.num_incorporated_images = 0 readFile2 = readFile[:-3] + "_"+str(self.rank)+".h5" @@ -797,7 +683,7 @@ def __init__( # print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) # while(not os.path.isfile(readFile2)): # print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) - time.sleep(30) + time.sleep(10) with 
h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] # self.mean = hf["mean"][:] @@ -816,92 +702,15 @@ def __init__( def run(self): """ - Retrieve sketch, project images onto new coordinates. Save new coordinates to h5 file. - """ -# noImgsToProcess = self.num_imgs//self.size -# for currInd, batch in enumerate(range(0,noImgsToProcess,self.batchSize)): -# for currInd in range(len(self.imgData)): - self.fetch_and_process_data(0) -# print("RANK {} IS DONE".format(self.rank)) -# self.fetch_and_process_data() - + Retrieve sketch, project images onto new coordinates. Save new coordinates to h5 file. - def fetch_and_process_data(self, currInd): - """ - Fetch and downsample data, apply projection algorithm + Note: If-Else statement is from previous/future work enabling streaming processing. """ -# startCounter = self.imgGrabber.psi.counter - -# stimggrab = time.perf_counter() -# img_batch,img_batchUnformatted = self.imgGrabber.get_formatted_images(self.batchSize,includeUnformatted=True) -# img_batch = self.imgGrabber.get_formatted_images(self.batchSize) -# self.imageIndicesProcessed.append((startCounter, self.imgGrabber.psi.counter)) -# etimggrab = time.perf_counter() -# print("{} Image Grab TIME: ".format(self.rank), etimggrab - stimggrab) - -# stassemble = time.perf_counter() -# toSave_img_batch = self.assembleImgsToSave(self.grabberToSaveImages.get_formatted_images(self.batchSize)) -# toSave_img_batch = self.assembleImgsToSave(img_batchUnformatted) -# etassemble = time.perf_counter() -# print("{} Assemble TIME: ".format(self.rank), etassemble - stassemble) - -# stassemble = time.perf_counter() - - img_batch = self.imgData - toSave_img_batch = self.thumbnailData - if self.smallImgs is None: - self.smallImgs = toSave_img_batch + self.smallImgs = self.thumbnailData else: - self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) -# self.apply_compression((img_batch.T - self.mean).T) - self.apply_compression(img_batch) -# etassemble = time.perf_counter() -# print("{} Apply Compression TIME: ".format(self.rank), etassemble - stassemble) - - -# noImgsToProcess = self.num_images//self.size -# startCounter = self.imgGrabber.psi.counter -# img_batch = self.imgGrabber.get_formatted_images(noImgsToProcess) -# self.imageIndicesProcessed.append((startCounter, self.imgGrabber.psi.counter)) -# st_compress = time.perf_counter() -# self.apply_compression(img_batch) -# et_compress = time.perf_counter() -# print("COMPRESSION TIME: ", et_compress - st_compress#) -# -# st_assemble = time.perf_counter() -# toSave_img_batch = self.assembleImgsToSave(self.grabberToSaveImages.get_formatted_images(noImgsToProcess)) -# if self.smallImgs is None: -# self.smallImgs = toSave_img_batch -# else: -# self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) -# et_assemble = time.perf_counter() -# print("ASSEMBLE TIME: ", et_assemble-st_assemble) - - -# def assembleImgsToSave(self, imgs): -# """ -# Form the images from psana pixel index map and downsample images. 
-# -# Parameters -# ---------- -# imgs: ndarray -# images to downsample -# """ -# pixel_index_map = retrieve_pixel_index_map(self.imgGrabber.psi.det.geometry(self.imgGrabber.psi.run)) -# -# saveMe = [] -# for img in imgs.T: -# imgRe = np.reshape(img, self.imgGrabber.psi.det.shape()) -# imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) -# saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) -# return np.array(saveMe) -## imgsRe = np.reshape(imgs.T, (imgs.shape[1], -## self.imgGrabber.psi.det.shape()[0], -## self.imgGrabber.psi.det.shape()[1], -## self.imgGrabber.psi.det.shape()[2])) -## return assemble_image_stack_batch(imgsRe, pixel_index_map) - + self.smallImgs = np.concatenate((self.smallImgs, self.thumbnailData), axis=0) + self.apply_compression(self.imgData) def apply_compression(self, X): """ @@ -1041,21 +850,6 @@ def random_unique_numbers_from_range(self, start, end, count): random.shuffle(all_numbers) return all_numbers[:count] -# def euclidean_distance(self, p1, p2): -# return np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2) - -# def compute_medoid(self, points): -# min_total_distance = float('inf') -# medoid = None -# for i, point in enumerate(points): -# total_distance = 0 -# for other_point in points: -# total_distance += self.euclidean_distance(point, other_point) -# if total_distance < min_total_distance: -# min_total_distance = total_distance -# medoid = point -# return medoid - def compute_medoid(self, points): return points[np.argmin(euclidean_distances(points).sum(axis=0))] @@ -1145,12 +939,9 @@ def genLeftRight(self, endClass): return [*range(endClass+1)], [*range(1, endClass+2)] def genUMAP(self): -# for dirval in os.listdir(self.inputFile[:-26]): -# print("ITEM IN DIRECTORY:", dirval) imgs = None projections = None for currRank in range(self.nprocs): -# print("GETTING CURRENT RANK: ", currRank) with h5py.File(self.inputFile+"_"+str(currRank)+".h5", 'r') as hf: if imgs is None: imgs = hf["SmallImages"][:] @@ -1447,7 +1238,6 @@ def genHTML(self): LABELS = ["DBSCAN Clustering", "OPTICS Clustering", "Anomaly Detection"] radio_button_group = RadioButtonGroup(labels=LABELS, active=0) radioGroup_js = CustomJS(args=dict(datasource=datasource, opticssource=opticssource), code=""" - console.log(datasource.data.ptColor) const x = datasource.data.x const y = datasource.data.y const image = datasource.data.image @@ -1483,15 +1273,10 @@ def genHTML(self): self.viewResults = column(plot_figure, p, imgsPlot, row(cols, toggl, radio_button_group), reachabilityDiag) def fullVisualize(self): -# print("here 4") self.genUMAP() -# print("here 5") self.genABOD() -# print("here 6") self.genLabels() -# print("here 7") self.genHTML() -# print("here 8") def updateLabels(self): self.genLabels() @@ -1574,134 +1359,13 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, start_offset=start_offset, num_imgs=num_imgs, threshold=threshold, noZeroIntensity=noZeroIntensity, normalizeIntensity=normalizeIntensity, downsample=downsample, bin_factor=bin_factor, thresholdQuantile=thresholdQuantile) -# def assembleImgsToSave(self, imgs): -# """ -# Form the images from psana pixel index map and downsample images. 
-# -# Parameters -# ---------- -# imgs: ndarray -# images to downsample -# """ -# pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) -# -# saveMe = [] -# for img in imgs: -# imgRe = np.reshape(img, self.psi.det.shape()) -# imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) -# saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) -# return np.array(saveMe) -## imgsRe = np.reshape(imgs.T, (imgs.shape[1], -## self.imgGrabber.psi.det.shape()[0], -## self.imgGrabber.psi.det.shape()[1], -## self.imgGrabber.psi.det.shape()[2])) -## return assemble_image_stack_batch(imgsRe, pixel_index_map) -# -# def get_formatted_images(self, startInd, n, includeThumbnails=False): -# """ -# Fetch n - x image segments from run, where x is the number of 'dead' images. -# -# Parameters -# ---------- -# n : int -# number of images to retrieve -# start_index : int -# start index of subsection of data to retrieve -# end_index : int -# end index of subsection of data to retrieve -# -# Returns -# ------- -# ndarray, shape (end_index-start_index, n-x) -# n-x retrieved image segments of dimension end_index-start_index -# """ -# self.psi.counter = startInd -# self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) -# print(self.imgsTracked) -# -# imgs = self.psi.get_images(n, assemble=False) -# -# imgs = imgs[ -# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] -# ] -# if len(imgs.shape)==4: -# num_valid_imgs, p, x, y = imgs.shape -# else: -# p = 1 -# num_valid_imgs, x, y = imgs.shape -# img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T -# img_batch[img_batch<0] = 0 -# nimg_batch = [] -# for img in img_batch.T: -# nimg = img -# currIntensity = np.sum(nimg.flatten(), dtype=np.double) -# if self.threshold: -# nimg = self.imageProcessor.threshold(nimg) -# if self.noZeroIntensity: -# nimg = self.imageProcessor.removeZeroIntensity(nimg, currIntensity) -# if self.normalizeIntensity: -# nimg = self.imageProcessor.normalizeIntensity(nimg, currIntensity) -# if nimg is not None: -# nimg_batch.append(nimg) -# nimg_batch = np.array(nimg_batch) -## self.imageProcessor.normalizeIntensity(self.imageProcessor(removeZeroIntensity(self.imageProcessor.threshold(img)) -## if self.threshold: -## secondQuartile = np.quantile(img, self.thresholdQuantile) -## nimg = (img>secondQuartile)*img -### elu_v = np.vectorize(self.elu) -### nimg = elu_v(img-secondQuartile)+secondQuartile -## else: -## nimg = img -## -## currIntensity = np.sum(nimg.flatten(), dtype=np.double) -### print("RANK: {} ***** INTENSITY: {}".format(self.rank, currIntensity)) -## if self.noZeroIntensity and currIntensity< (self.bin_factor**2) * 50000: -## continue -## else: -## if currIntensity>=(self.bin_factor**2) * 50000 and self.normalizeIntensity: -### if not self.normalizeIntensity: -## nimg_batch.append(nimg/currIntensity) -## else: -### nimg_batch.append(nimg) -## nimg_batch.append(np.zeros(nimg.shape)) -## nimg_batch = np.array(nimg_batch) -# if self.downsample: -# binned_imgs = bin_data(np.reshape(nimg_batch,(num_valid_imgs, p, x, y)), self.bin_factor) -# binned_num_valid_imgs, binned_p, binned_x, binned_y = binned_imgs.shape -# binned_imgs = np.reshape(binned_imgs, (binned_num_valid_imgs, binned_p * binned_x * binned_y)).T -## print(binned_imgs.shape) -# else: -# binned_imgs = nimg_batch.T -# if includeThumbnails: -# return (binned_imgs, self.assembleImgsToSave(np.reshape(nimg_batch, (num_valid_imgs, p, x, y)))) -# else: -# return binned_imgs - - @profile(filename="fullFD_profile") 
+# @profile(filename="fullFD_profile") def runMe(self): stfull = time.perf_counter() - #DATA RETRIEVAL STEP - ########################################################################################## -# self.fullImgData = [] -# self.fullThumbnailData = [] -# noImgsToProcess = self.num_imgs//self.size -# batchSize = int(self.num_components*2//self.samplingFactor) -# for batch in range(0, noImgsToProcess, batchSize): -# startInd = startingPoint+batch -# binned_imgs, thumbnails = self.get_formatted_images(startInd, batchSize, includeThumbnails=True) -# print("aodijwaodijaodij", binned_imgs.shape, thumbnails.shape) -# self.fullImgData.append(binned_imgs) -# self.fullThumbnailData.append(thumbnails) -# print(self.imgsTracked) - startingPoint = self.start_offset + self.num_imgs*self.rank//self.size self.fullImgData, self.fullThumbnailData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, includeThumbnails=True) -# filenameTest0 = random.randint(0, 10) -# filenameTest0 = self.comm.allgather(filenameTest0) -# print("TEST 0: ", self.rank, filenameTest0) - #SKETCHING STEP ########################################################################################## freqDir = FreqDir(comm= self.comm, rank=self.rank, size = self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, @@ -1709,17 +1373,13 @@ def runMe(self): merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, threshold=self.threshold, normalizeIntensity=self.normalizeIntensity, noZeroIntensity=self.noZeroIntensity, currRun = self.currRun, samplingFactor=self.samplingFactor, priming=self.priming, imgData = self.fullImgData, imgsTracked = self.imgsTracked) - print("STARTING SKETCHING FOR {}".format(self.currRun)) + print("{} STARTING SKETCHING FOR {}".format(self.rank, self.currRun)) st = time.perf_counter() freqDir.run() localSketchFilename = freqDir.write() et = time.perf_counter() print("Estimated time for frequent directions rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) -# filenameTest1 = random.randint(0, 10) -# filenameTest1 = self.comm.allgather(filenameTest1) -# print("TEST 1: ", self.rank, filenameTest1) - #MERGING STEP ########################################################################################## if freqDir.rank<10: @@ -1731,18 +1391,12 @@ def runMe(self): allNames.append(fullSketchFilename + str(j) + ".h5") mergeTree = MergeTree(comm=self.comm, rank=self.rank, size=self.size, exp=self.exp, run=self.run, det_type=self.det_type, divBy=self.divBy, readFile = localSketchFilename, output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun) - #mergeTree = MergeTree(divBy=2, readFile = localSketchFilename, - # dir=writeToHere, allWriteDirecs=allNames, currRun = currRun) st = time.perf_counter() mergeTree.merge() mergedSketchFilename = mergeTree.write() et = time.perf_counter() print("Estimated time merge tree for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) -# filenameTest2 = random.randint(0, 10) -# filenameTest2 = self.comm.allgather(filenameTest2) -# print("TEST 2: ", self.rank, filenameTest2) - #PROJECTION STEP ########################################################################################## appComp = ApplyCompression(comm=self.comm, rank = self.rank, size=self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, @@ -1755,44 +1409,42 @@ def runMe(self): et = time.perf_counter() print("Estimated time 
projection for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) print("Estimated full processing time for rank {0}/{1}: {2}".format(self.rank, self.size, et - stfull)) - + self.comm.barrier() self.comm.Barrier() -# filenameTest3 = random.randint(0, 10) -# filenameTest3 = self.comm.allgather(filenameTest3) -# print("TEST 3: ", self.rank, filenameTest3) + filenameTest3 = random.randint(0, 10) + filenameTest3 = self.comm.allgather(filenameTest3) + print("TEST 3: ", self.rank, filenameTest3) + #UMAP STEP ########################################################################################## - - -# if self.rank==0: -# print("here 1") -# st = time.perf_counter() -# -# skipSize = 8 -# numImgsToUse = int(self.num_imgs/skipSize) -# visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), -# outputFile="./UMAPVis_{}.html".format(self.currRun), -# numImgsToUse=self.num_imgs, -# nprocs=self.size, -# userGroupings=[], -# includeABOD=True, -# skipSize = skipSize, -# umap_n_neighbors=numImgsToUse//40, -# umap_random_state=42, -# hdbscan_min_samples=int(numImgsToUse*0.75//40), -# hdbscan_min_cluster_size=int(numImgsToUse//40), -# optics_min_samples=150, optics_xi = 0.05, optics_min_cluster_size = 0.05) -## print("here 2") -# visMe.fullVisualize() -## print("here 3") -# visMe.userSave() -# et = time.perf_counter() -# print("UMAP HTML Generation Processing time: {}".format(et - st)) -# print("TOTAL PROCESING TIME: {}".format(et - stfull)) + if self.rank==0: + print("here 1") + st = time.perf_counter() + + skipSize = 8 + numImgsToUse = int(self.num_imgs/skipSize) + visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), + outputFile="./UMAPVis_{}.html".format(self.currRun), + numImgsToUse=self.num_imgs, + nprocs=self.size, + userGroupings=[], + includeABOD=True, + skipSize = skipSize, + umap_n_neighbors=numImgsToUse//40, + umap_random_state=42, + hdbscan_min_samples=int(numImgsToUse*0.75//40), + hdbscan_min_cluster_size=int(numImgsToUse//40), + optics_min_samples=150, optics_xi = 0.05, optics_min_cluster_size = 0.05) +# print("here 2") + visMe.fullVisualize() +# print("here 3") + visMe.userSave() + et = time.perf_counter() + print("UMAP HTML Generation Processing time: {}".format(et - st)) + print("TOTAL PROCESING TIME: {}".format(et - stfull)) class FD_ImageProcessing: - #How to use these functions: call each of them on the image. Append the result if it is not "None" to nimg_batch. 
def __init__(self, minIntensity, thresholdQuantile, eluAlpha): self.minIntensity = minIntensity self.thresholdQuantile = thresholdQuantile @@ -1812,7 +1464,6 @@ def eluThreshold(self, img): secondQuartile = np.quantile(img, self.thresholdQuantile) return(elu_v(img-secondQuartile)+secondQuartile) - def threshold(self, img): if img is None: return img @@ -1894,7 +1545,7 @@ def get_formatted_images(self, startInd, n, includeThumbnails=False): """ self.psi.counter = startInd self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) - print(self.imgsTracked) + print("Images tracked:", self.imgsTracked) imgs = self.psi.get_images(n, assemble=False) @@ -1925,7 +1576,6 @@ def get_formatted_images(self, startInd, n, includeThumbnails=False): binned_imgs = bin_data(np.reshape(nimg_batch,(num_valid_imgs, p, x, y)), self.bin_factor) binned_num_valid_imgs, binned_p, binned_x, binned_y = binned_imgs.shape binned_imgs = np.reshape(binned_imgs, (binned_num_valid_imgs, binned_p * binned_x * binned_y)).T -# print(binned_imgs.shape) else: binned_imgs = nimg_batch.T if includeThumbnails: From 7b182bd4f1913fd20e89d9eed9cc40e8dccb97bb Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 5 Sep 2023 21:02:18 -0700 Subject: [PATCH 30/57] Cleaned up code and made it so that you don't need to evenly divide everything for things to work --- btx/processing/freqdir.py | 179 ++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 106 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 71a0bd6d0..e741a53e8 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -130,23 +130,18 @@ def __init__( currRun, imgData, imgsTracked, - alpha=0, - rankAdapt=False, - merger=False, - mergerFeatures=0, - downsample=False, - bin_factor=2, - threshold=False, - normalizeIntensity=False, - noZeroIntensity=False, - samplingFactor=1.0, - num_components=10, - batch_size = 10, - priming=False + alpha, + rankAdapt, + merger, + mergerFeatures, + downsample, + bin_factor, + samplingFactor, + num_components, ): super().__init__(exp=exp, run=run, det_type=det_type, start_offset=start_offset, - num_images=num_imgs, num_components=num_components, batch_size=batch_size, priming=priming, + num_images=num_imgs, num_components=num_components, batch_size=0, priming=False, downsample=downsample, bin_factor=bin_factor, output_dir=output_dir) self.comm = comm @@ -171,44 +166,27 @@ def __init__( self.nextZeroRow = 0 self.alpha = alpha # self.mean = None - self.imgsTracked = imgsTracked self.rankAdapt = rankAdapt self.increaseEll = False - self.threshold = threshold - self.noZeroIntensity = noZeroIntensity - self.normalizeIntensity=normalizeIntensity self.samplingFactor = samplingFactor self.imgData = imgData + self.imgsTracked = imgsTracked def run(self): """ Perform frequent directions matrix sketching on run subject to initialization parameters. """ - noImgsToProcess = self.num_images//self.size - for currInd, batch in enumerate(range(0,noImgsToProcess,int(self.ell*2//self.samplingFactor))): - self.fetch_and_update_model(int(self.ell*2//self.samplingFactor), currInd) - - def fetch_and_update_model(self, n, currInd): - """ - Fetch images and update model. 
- - Parameters - ---------- - n : int - number of images to incorporate - """ - img_batch = self.imgData[:, currInd*n:currInd*(n+1)] - + img_batch = self.imgData if self.samplingFactor <1: - psamp = PrioritySampling(int(n*self.samplingFactor), self.d) + psamp = PrioritySampling(int((img_batch.shape[1])*self.samplingFactor), self.d) for row in img_batch.T: psamp.update(row) img_batch = np.array(psamp.sketch.get()).T - + self.update_model(img_batch) # if self.mean is None: # self.mean = np.mean(img_batch, axis=1) # else: @@ -217,8 +195,6 @@ def fetch_and_update_model(self, n, currInd): # self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch, axis=1, dtype=np.double))/( # self.num_incorporated_images + (img_batch.shape[1])) # self.update_model((img_batch.T - self.mean).T) - self.update_model(img_batch) - def update_model(self, X): """ @@ -521,11 +497,11 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output self.divBy = divBy - time.sleep(10) + time.sleep(15) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] - self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, priming=False, imgData = None, imgsTracked=None) + self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = None, imgsTracked=None, downsample=False, bin_factor=1, samplingFactor=1) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) @@ -629,7 +605,6 @@ class ApplyCompression: normalizeIntensity: whether data should be normalized to have total intensity of one noZeroIntensity: whether data with low total intensity should be discarded readFile: H5 file with matrix sketch - batchSize: Number of images to process at each iteration data: numpy array housing current matrix sketch mean: geometric mean of data processed num_incorporated_images: number of images processed so far @@ -655,15 +630,9 @@ def __init__( det_type, readFile, output_dir, - batchSize, - threshold, - noZeroIntensity, - normalizeIntensity, currRun, imgData, thumbnailData, - downsample=False, - bin_factor=2 ): self.comm = comm @@ -683,7 +652,7 @@ def __init__( # print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) # while(not os.path.isfile(readFile2)): # print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) - time.sleep(10) + time.sleep(15) with h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] # self.mean = hf["mean"][:] @@ -1165,7 +1134,7 @@ def genHTML(self): x=[0.25+xind for xind in range(len(self.medoidInds))], y=0, dw=0.5, dh=1, - palette="Plasma256", level="image") + palette="Turbo256", level="image") imgsPlot.axis.visible = False imgsPlot.grid.visible = False for xind in range(len(self.medoidInds)): @@ -1320,7 +1289,7 @@ class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. 
""" - def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_components, alpha, rankAdapt, downsample, bin_factor, threshold, normalizeIntensity, noZeroIntensity, samplingFactor, priming, divBy, batchSize, thresholdQuantile): + def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_components, alpha, rankAdapt, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile): self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1333,12 +1302,13 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.downsample=downsample self.bin_factor= bin_factor self.threshold= threshold + self.eluThreshold = eluThreshold + self.eluAlpha = eluAlpha self.normalizeIntensity=normalizeIntensity self.noZeroIntensity=noZeroIntensity + self.minIntensity = minIntensity self.samplingFactor=samplingFactor - self.priming=priming self.divBy = divBy - self.batchSize = batchSize self.thresholdQuantile = thresholdQuantile self.comm = MPI.COMM_WORLD @@ -1355,24 +1325,22 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.currRun = None self.currRun = self.comm.bcast(self.currRun, root=0) - self.imageProcessor = FD_ImageProcessing(minIntensity=(self.bin_factor**2)*50000, thresholdQuantile=self.thresholdQuantile, eluAlpha=0.01) - - self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, start_offset=start_offset, num_imgs=num_imgs, threshold=threshold, noZeroIntensity=noZeroIntensity, normalizeIntensity=normalizeIntensity, downsample=downsample, bin_factor=bin_factor, thresholdQuantile=thresholdQuantile) + self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile) + self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) # @profile(filename="fullFD_profile") def runMe(self): stfull = time.perf_counter() startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - self.fullImgData, self.fullThumbnailData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, includeThumbnails=True) + self.fullImgData, self.fullThumbnailData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size) #SKETCHING STEP ########################################################################################## freqDir = FreqDir(comm= self.comm, rank=self.rank, size = self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, det_type=self.det_type, output_dir=self.writeToHere, num_components=self.num_components, alpha=self.alpha, rankAdapt=self.rankAdapt, merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, - threshold=self.threshold, normalizeIntensity=self.normalizeIntensity, noZeroIntensity=self.noZeroIntensity, - currRun = self.currRun, samplingFactor=self.samplingFactor, priming=self.priming, imgData = self.fullImgData, imgsTracked = self.imgsTracked) + currRun = self.currRun, samplingFactor=self.samplingFactor, imgData = self.fullImgData, 
imgsTracked = self.imgsTracked) print("{} STARTING SKETCHING FOR {}".format(self.rank, self.currRun)) st = time.perf_counter() freqDir.run() @@ -1399,10 +1367,7 @@ def runMe(self): #PROJECTION STEP ########################################################################################## - appComp = ApplyCompression(comm=self.comm, rank = self.rank, size=self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, - det_type=self.det_type, readFile = mergedSketchFilename, output_dir = self.writeToHere, - batchSize=self.batchSize, threshold=self.threshold, normalizeIntensity=self.normalizeIntensity, noZeroIntensity=self.noZeroIntensity, - downsample=self.downsample, bin_factor=self.bin_factor, currRun = self.currRun, imgData = self.fullImgData, thumbnailData = self.fullThumbnailData) + appComp = ApplyCompression(comm=self.comm, rank = self.rank, size=self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run,det_type=self.det_type, readFile = mergedSketchFilename, output_dir = self.writeToHere, currRun = self.currRun, imgData = self.fullImgData, thumbnailData = self.fullThumbnailData) st = time.perf_counter() appComp.run() appComp.write() @@ -1445,10 +1410,25 @@ def runMe(self): print("TOTAL PROCESING TIME: {}".format(et - stfull)) class FD_ImageProcessing: - def __init__(self, minIntensity, thresholdQuantile, eluAlpha): + def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile): + self.threshold = threshold + self.eluThreshold = eluThreshold + self.eluAlpha = eluAlpha + self.noZeroIntensity = noZeroIntensity + self.normalizeIntensity = normalizeIntensity self.minIntensity = minIntensity self.thresholdQuantile = thresholdQuantile - self.eluAlpha = eluAlpha + + def processImg(self, nimg, currIntensity): + if self.threshold: + nimg = self.thresholdFunc(nimg) + if self.eluThreshold: + nimg = self.eluThresholdFunc(nimg) + if self.noZeroIntensity: + nimg = self.removeZeroIntensityFunc(nimg, currIntensity) + if self.normalizeIntensity: + nimg = self.normalizeIntensityFunc(nimg, currIntensity) + return nimg def elu(self,x): if x > 0: @@ -1456,7 +1436,7 @@ def elu(self,x): else: return self.eluAlpha*(math.exp(x)-1) - def eluThreshold(self, img): + def eluThresholdFunc(self, img): if img is None: return img else: @@ -1464,48 +1444,42 @@ def eluThreshold(self, img): secondQuartile = np.quantile(img, self.thresholdQuantile) return(elu_v(img-secondQuartile)+secondQuartile) - def threshold(self, img): + def thresholdFunc(self, img): if img is None: return img else: secondQuartile = np.quantile(img, self.thresholdQuantile) return (img>secondQuartile)*img - def removeZeroIntensity(self, img, currIntensity): + def removeZeroIntensityFunc(self, img, currIntensity): if currIntensity Date: Thu, 14 Sep 2023 10:14:17 -0700 Subject: [PATCH 31/57] Checkpoint. 
Not sure what changed --- btx/processing/freqdir.py | 173 ++++++++++++++++++++++++-------------- 1 file changed, 109 insertions(+), 64 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index e741a53e8..318fc5904 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -132,6 +132,7 @@ def __init__( imgsTracked, alpha, rankAdapt, + rankAdaptMinError, merger, mergerFeatures, downsample, @@ -168,6 +169,7 @@ def __init__( # self.mean = None self.rankAdapt = rankAdapt + self.rankAdaptMinError = rankAdaptMinError self.increaseEll = False self.samplingFactor = samplingFactor @@ -244,8 +246,8 @@ def update_model(self, X): copyBatch = self.sketch[self.ell:,:].copy() self.rotate() if canRankAdapt and self.rankAdapt: - reconError = np.sqrt(self.lowMemoryReconstructionErrorUnscaled(copyBatch)) - if (reconError > 0.08): + reconError = np.sqrt(self.lowMemoryReconstructionErrorScaled(copyBatch)) + if (reconError > self.rankAdaptMinError): self.increaseEll = True self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 @@ -326,11 +328,12 @@ def reconstructionError(self, matrixCentered): matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) - def lowMemoryReconstructionError(self, matrixCentered): + def lowMemoryReconstructionErrorScaled(self, matrixCentered): """ Compute the low memory reconstruction error of the matrix sketch - against given data. This si the same as reconstructionError, - but estimates the norm computation and does not scale by the matrix. + against given data. This is the same as reconstructionError, + but estimates the norm computation and does not scale by the + minimum projection matrix, but rather by the matrix norm itself. Parameters ---------- @@ -348,7 +351,7 @@ def lowMemoryReconstructionError(self, matrixCentered): matSketchT = matSketch.T U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) G = U[:,:k] - return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 10)/ + return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 50)/ np.linalg.norm(matrixCenteredT, 'fro')**2) def estimFrobNormSquared(self, addMe, arrs, its): @@ -501,10 +504,11 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] - self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = None, imgsTracked=None, downsample=False, bin_factor=1, samplingFactor=1) + self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, rankAdaptMinError=1, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = None, imgsTracked=None, downsample=False, bin_factor=1, samplingFactor=1) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) + print(self.buffSizes) self.fd.update_model(self.data.T) @@ -787,7 +791,7 @@ class visualizeFD: """ def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, userGroupings, skipSize, umap_n_neighbors, umap_random_state, hdbscan_min_samples, hdbscan_min_cluster_size, - optics_min_samples, optics_xi, optics_min_cluster_size): + 
optics_min_samples, optics_xi, optics_min_cluster_size, outlierQuantile): self.inputFile = inputFile self.outputFile = outputFile output_file(filename=outputFile, title="Static HTML file") @@ -804,6 +808,7 @@ def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, use self.optics_min_samples=optics_min_samples self.optics_xi = optics_xi self.optics_min_cluster_size = optics_min_cluster_size + self.outlierQuantile = outlierQuantile def embeddable_image(self, data): img_data = np.uint8(cm.jet(data/max(data.flatten()))*255) @@ -877,22 +882,32 @@ def fastABOD(self, pts, nsamples): ac = cpt - apt if math.isclose(np.linalg.norm(ab), 0.0) or math.isclose(np.linalg.norm(ac), 0.0): count += 1 + print("TOO CLOSE") continue outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) - abofs.append(np.var(np.array(outlier_factors))) + print("CURRENT POINT: ", pts[a], test_list, outlier_factors, np.var(np.array(outlier_factors))) + if(len(outlier_factors)==0): + abofs.append(np.inf) + else: + abofs.append(np.var(np.array(outlier_factors))) return abofs - def getOutliers(self, lst, divBy): - lstCopy = lst.copy() - lstCopy.sort() - quart10 = lstCopy[len(lstCopy)//divBy] + def getOutliers(self, lst): +# lstCopy = lst.copy() +# lstCopy.sort() +# quart10 = lstCopy[len(lstCopy)//divBy] + + lstQuant = np.quantile(np.array(lst), self.outlierQuantile) + print("AIDJWOIJDAOWIDJWAOIDJAWOIDWJA", lstQuant, lst) outlierInds = [] notOutlierInds = [] for j in range(len(lst)): - if lst[j]lstQuant: outlierInds.append(j) else: notOutlierInds.append(j) + print("OUTLIER INDS: ", outlierInds) + print("NOT OUTLIER INDS: ", notOutlierInds) return np.array(outlierInds), np.array(notOutlierInds) def genHist(self, vals, endClass): @@ -937,6 +952,7 @@ def genUMAP(self): n_neighbors=self.umap_n_neighbors, random_state=self.umap_random_state, n_components=2, + min_dist=0.25, ).fit_transform(self.projections) self.labels = hdbscan.HDBSCAN( @@ -948,15 +964,13 @@ def genUMAP(self): self.opticsClust = OPTICS(min_samples=self.optics_min_samples, xi=self.optics_xi, min_cluster_size=self.optics_min_cluster_size) self.opticsClust.fit(self.clusterable_embedding) -# self.opticsLabels = cluster_optics_dbscan( -# reachability=self.opticsClust.reachability_, -# core_distances=self.opticsClust.core_distances_, -# ordering=self.opticsClust.ordering_, -# eps=2, -# ) - -# self.opticsLabels = self.opticsClust.labels_[self.opticsClust.ordering_] - self.opticsLabels = self.opticsClust.labels_ + self.opticsLabels = cluster_optics_dbscan( + reachability=self.opticsClust.reachability_, + core_distances=self.opticsClust.core_distances_, + ordering=self.opticsClust.ordering_, + eps=2.5, + ) +# self.opticsLabels = self.opticsClust.labels_ self.experData_df = pd.DataFrame({'x':self.clusterable_embedding[self.clustered, 0],'y':self.clusterable_embedding[self.clustered, 1]}) self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) @@ -965,7 +979,7 @@ def genUMAP(self): def genABOD(self): if self.includeABOD: abod = self.fastABOD(self.projections, 10) - outliers, notOutliers = self.getOutliers(abod, 10) + outliers, notOutliers = self.getOutliers(abod) else: outliers = [] notOutliers = [] @@ -1023,7 +1037,8 @@ def genLabels(self): opticsNewLabels.append(j) opticsNewLabels = list(np.array(opticsNewLabels) + 1) self.opticsNewLabels = np.array(self.relabel_to_closest_zero(opticsNewLabels)) - self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in 
self.opticsNewLabels[self.opticsClust.ordering_]] +# self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels[self.opticsClust.ordering_]] + self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels] def genHTML(self): datasource = ColumnDataSource(self.experData_df) @@ -1184,7 +1199,8 @@ def genHTML(self): width = 2000, height = 400 ) space = np.arange(self.numImgsToUse) - reachability = self.opticsClust.reachability_ + reachability = self.opticsClust.reachability_[self.opticsClust.ordering_] +# reachability = self.opticsClust.reachability_ opticsData_df = pd.DataFrame({'x':space,'y':reachability}) opticsData_df['clusterForScatterPlot'] = [str(x) for x in self.opticsNewLabels] opticsData_df['cluster'] = [str(x) for x in self.opticsNewLabels[self.opticsClust.ordering_]] @@ -1289,7 +1305,7 @@ class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. """ - def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_components, alpha, rankAdapt, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile): + def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile): self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1299,6 +1315,7 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.num_components=num_components self.alpha = alpha self.rankAdapt = rankAdapt + self.rankAdaptMinError = rankAdaptMinError self.downsample=downsample self.bin_factor= bin_factor self.threshold= threshold @@ -1318,6 +1335,7 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) self.psi.counter = self.start_offset + self.num_imgs*self.rank//self.size self.imgsTracked = [] + self.grabImgSteps = grabImgSteps if self.rank==0: self.currRun = datetime.now().strftime("%y%m%d%H%M%S") @@ -1333,12 +1351,12 @@ def runMe(self): stfull = time.perf_counter() startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - self.fullImgData, self.fullThumbnailData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size) + self.fullImgData, self.fullThumbnailData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps) #SKETCHING STEP ########################################################################################## freqDir = FreqDir(comm= self.comm, rank=self.rank, size = self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, - det_type=self.det_type, output_dir=self.writeToHere, num_components=self.num_components, alpha=self.alpha, rankAdapt=self.rankAdapt, + det_type=self.det_type, output_dir=self.writeToHere, num_components=self.num_components, alpha=self.alpha, rankAdapt=self.rankAdapt, rankAdaptMinError = self.rankAdaptMinError, merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, currRun = self.currRun, samplingFactor=self.samplingFactor, imgData = self.fullImgData, imgsTracked = self.imgsTracked) print("{} STARTING 
SKETCHING FOR {}".format(self.rank, self.currRun)) @@ -1400,7 +1418,8 @@ def runMe(self): umap_random_state=42, hdbscan_min_samples=int(numImgsToUse*0.75//40), hdbscan_min_cluster_size=int(numImgsToUse//40), - optics_min_samples=150, optics_xi = 0.05, optics_min_cluster_size = 0.05) + optics_min_samples=150, optics_xi = 0.05, optics_min_cluster_size = 0.05, + outlierQuantile=0.3) # print("here 2") visMe.fullVisualize() # print("here 3") @@ -1473,7 +1492,6 @@ def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, t self.run = run self.downsample = downsample self.bin_factor = bin_factor - self.imgsTracked = [] self.thumbnailHeight = thumbnailHeight self.thumbnailWidth = thumbnailWidth @@ -1499,7 +1517,23 @@ def assembleImgsToSave(self, imgs): saveMe.append(np.array(Image.fromarray(imgRe).resize((self.thumbnailHeight, self.thumbnailWidth)))) return np.array(saveMe) - def get_formatted_images(self, startInd, n): + def split_range(self, start, end, num_tuples): + if start==end: + raise ValueError('Range processing error: start value equals end value, which leads to no images processed.') + return + total_elements = end - start + batch_size = total_elements // num_tuples + tuples = [] + for i in range(num_tuples - 1): + batch_start = start + i * batch_size + batch_end = batch_start + batch_size + tuples.append((batch_start, batch_end)) + last_batch_start = start + (num_tuples - 1) * batch_size + last_batch_end = end + tuples.append((last_batch_start, last_batch_end)) + return tuples + + def get_formatted_images(self, startInd, n, num_steps): """ Fetch n - x image segments from run, where x is the number of 'dead' images. @@ -1517,35 +1551,46 @@ def get_formatted_images(self, startInd, n): ndarray, shape (end_index-start_index, n-x) n-x retrieved image segments of dimension end_index-start_index """ - self.psi.counter = startInd - self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) - print("Images tracked:", self.imgsTracked) - - imgs = self.psi.get_images(n, assemble=False) - - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] - thumbnails = self.assembleImgsToSave(imgs) - - if self.downsample: - imgs = bin_data(imgs, self.bin_factor) - num_valid_imgs, p, x, y = imgs.shape - img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - img_batch[img_batch<0] = 0 - - num_valid_thumbnails, tx, ty = thumbnails.shape - thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T - - nimg_batch = [] - nthumbnail_batch = [] - for img, thumbnail in zip(img_batch.T, thumbnail_batch.T): - currIntensity = np.sum(img.flatten(), dtype=np.double) - nimg = self.imageProcessor.processImg(img, currIntensity) - nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) - if nimg is not None: - nimg_batch.append(nimg) - nthumbnail_batch.append(nthumbnail) - nimg_batch = np.array(nimg_batch).T - nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) - return (nimg_batch, nthumbnail_batch, self.imgsTracked) + fullimgs = None + fullthumbnails = None + imgsTracked = [] + runs = self.split_range(startInd, startInd+n, num_steps) + for runStart, runEnd in runs: + self.psi.counter = runStart + imgsTracked.append((runStart, runEnd)) + + imgs = self.psi.get_images(runEnd-runStart, assemble=False) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + thumbnails = self.assembleImgsToSave(imgs) + + if self.downsample: + imgs = bin_data(imgs, 
self.bin_factor) + num_valid_imgs, p, x, y = imgs.shape + img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + img_batch[img_batch<0] = 0 + + num_valid_thumbnails, tx, ty = thumbnails.shape + thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T + + nimg_batch = [] + nthumbnail_batch = [] + for img, thumbnail in zip(img_batch.T, thumbnail_batch.T): + currIntensity = np.sum(img.flatten(), dtype=np.double) + nimg = self.imageProcessor.processImg(img, currIntensity) + nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) + if nimg is not None: + nimg_batch.append(nimg) + nthumbnail_batch.append(nthumbnail) + nimg_batch = np.array(nimg_batch).T + nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) + if fullimgs is None: + fullimgs = nimg_batch + fullthumbnails = nthumbnail_batch + else: + fullimgs = np.hstack((fullimgs, nimg_batch)) + fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) + print("Images tracked:", imgsTracked) + return (fullimgs, fullthumbnails, imgsTracked) From 07c3ee8ef4727bfb807ca7ef8b68110df713cdf3 Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Thu, 14 Sep 2023 14:10:13 -0700 Subject: [PATCH 32/57] disabling common mode correction in FredDir DataRetriever. Seems to be a time bottleneck --- btx/interfaces/ipsana.py | 17 +++++++++++++---- btx/processing/freqdir.py | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/btx/interfaces/ipsana.py b/btx/interfaces/ipsana.py index b3f7cedf6..cb5f53e1f 100644 --- a/btx/interfaces/ipsana.py +++ b/btx/interfaces/ipsana.py @@ -11,7 +11,8 @@ class PsanaInterface: def __init__(self, exp, run, det_type, event_receiver=None, event_code=None, event_logic=True, - ffb_mode=False, track_timestamps=False, calibdir=None): + ffb_mode=False, track_timestamps=False, calibdir=None, + no_cmod=False): self.exp = exp # experiment name, str self.hutch = exp[:3] # hutch name, str self.run = run # run number, int @@ -21,10 +22,10 @@ def __init__(self, exp, run, det_type, self.event_receiver = event_receiver # 'evr0' or 'evr1', str self.event_code = event_code # event code, int self.event_logic = event_logic # bool, if True, retain events with event_code; if False, keep all other events - self.set_up(det_type, ffb_mode, calibdir) + self.set_up(det_type, ffb_mode, calibdir, no_cmod) self.counter = 0 - def set_up(self, det_type, ffb_mode, calibdir=None): + def set_up(self, det_type, ffb_mode, calibdir=None, no_cmod=False): """ Instantiate DataSource and Detector objects; use the run functionality to retrieve all psana.EventTimes. @@ -37,6 +38,8 @@ def set_up(self, det_type, ffb_mode, calibdir=None): if True, set up in an FFB-compatible style calibdir: str directory to alternative calibration files + no_cmod: bool + if True, deactivate common mode detector correction """ ds_args=f'exp={self.exp}:run={self.run}:idx' if ffb_mode: @@ -52,16 +55,19 @@ def set_up(self, det_type, ffb_mode, calibdir=None): if calibdir is not None: setOption('psana.calib_dir', calibdir) self._calib_data_available() + self.no_cmod = no_cmod def _calib_data_available(self): """ Check whether calibration data is available. 
""" self.calibrate = True + self.no_cmod = no_cmod evt = self.runner.event(self.times[0]) if (self.det.pedestals(evt) is None) or (self.det.gain(evt) is None): logger.warning("Warning: calibration data unavailable, returning uncalibrated data") self.calibrate = False + self.no_cmod = True def turn_calibration_off(self): """ @@ -361,7 +367,10 @@ def get_images(self, num_images, assemble=True): img = self.det.image(evt=evt) else: if self.calibrate: - img = self.det.calib(evt=evt) + cmod = self.det.common_mode(evt=evt) + if self.no_cmod: + cmod[1] = 0 + img = self.det.calib(evt=evt, cmpars=cmod) else: img = self.det.raw(evt=evt) if self.det_type == 'epix10k2M': diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 318fc5904..194210e91 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1495,7 +1495,7 @@ def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, t self.thumbnailHeight = thumbnailHeight self.thumbnailWidth = thumbnailWidth - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type, no_cmod=True) self.imageProcessor = imageProcessor From 6f5fa51b56c18c14e01a66cb8bf7a34ca4b6a58d Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Thu, 14 Sep 2023 14:12:53 -0700 Subject: [PATCH 33/57] typo --- btx/interfaces/ipsana.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/btx/interfaces/ipsana.py b/btx/interfaces/ipsana.py index cb5f53e1f..e2ae4a52a 100644 --- a/btx/interfaces/ipsana.py +++ b/btx/interfaces/ipsana.py @@ -62,12 +62,10 @@ def _calib_data_available(self): Check whether calibration data is available. """ self.calibrate = True - self.no_cmod = no_cmod evt = self.runner.event(self.times[0]) if (self.det.pedestals(evt) is None) or (self.det.gain(evt) is None): logger.warning("Warning: calibration data unavailable, returning uncalibrated data") self.calibrate = False - self.no_cmod = True def turn_calibration_off(self): """ From 999a1f9330d9511247d517ec330d1e3f99dbc4c6 Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Thu, 14 Sep 2023 14:20:50 -0700 Subject: [PATCH 34/57] fixed cmpars behavior to disable common mode correction if requested --- btx/interfaces/ipsana.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/btx/interfaces/ipsana.py b/btx/interfaces/ipsana.py index e2ae4a52a..f404a276b 100644 --- a/btx/interfaces/ipsana.py +++ b/btx/interfaces/ipsana.py @@ -365,10 +365,10 @@ def get_images(self, num_images, assemble=True): img = self.det.image(evt=evt) else: if self.calibrate: - cmod = self.det.common_mode(evt=evt) + cmpars = None if self.no_cmod: - cmod[1] = 0 - img = self.det.calib(evt=evt, cmpars=cmod) + cmpars = 0 + img = self.det.calib(evt=evt, cmpars) else: img = self.det.raw(evt=evt) if self.det_type == 'epix10k2M': From b4551fe108d2b3f37996f6bef33c12dd5ca40642 Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Thu, 14 Sep 2023 14:51:39 -0700 Subject: [PATCH 35/57] created FD sketch tasks and workflow. 
--- btx/processing/freqdir.py | 5 +++-- dags/frequent_direction_sketch.py | 27 ++++++++++++++++++++++++ scripts/tasks.py | 34 +++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 dags/frequent_direction_sketch.py diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 194210e91..0aedbd523 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1305,7 +1305,7 @@ class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. """ - def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile): + def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile): self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1399,6 +1399,7 @@ def runMe(self): filenameTest3 = self.comm.allgather(filenameTest3) print("TEST 3: ", self.rank, filenameTest3) + def visualizeMe(self): #UMAP STEP ########################################################################################## if self.rank==0: @@ -1426,7 +1427,7 @@ def runMe(self): visMe.userSave() et = time.perf_counter() print("UMAP HTML Generation Processing time: {}".format(et - st)) - print("TOTAL PROCESING TIME: {}".format(et - stfull)) + #print("TOTAL PROCESING TIME: {}".format(et - stfull)) class FD_ImageProcessing: def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile): diff --git a/dags/frequent_direction_sketch.py b/dags/frequent_direction_sketch.py new file mode 100644 index 000000000..02049a8d0 --- /dev/null +++ b/dags/frequent_direction_sketch.py @@ -0,0 +1,27 @@ +from datetime import datetime +import os +from airflow import DAG +from plugins.jid import JIDSlurmOperator + +# DAG SETUP +description='BTX frequent direction sketch DAG' +dag_name = os.path.splitext(os.path.basename(__file__))[0] + +dag = DAG( + dag_name, + start_date=datetime( 2022,4,1 ), + schedule_interval=None, + description=description, + ) + + +# Tasks SETUP +task_id='draw_sketch' +draw_sketch = JIDSlurmOperator(task_id=task_id, dag=dag) + +task_id='show_sketch' +show_sketch = JIDSlurmOperator(task_id = task_id, dag=dag) + + +# Draw the DAG +draw_sketch >> show_sketch \ No newline at end of file diff --git a/scripts/tasks.py b/scripts/tasks.py index 3908f4d3e..6e82f3689 100755 --- a/scripts/tasks.py +++ b/scripts/tasks.py @@ -509,3 +509,37 @@ def timetool_correct(config): logger.info('No model found! Will return the nominal delay uncorrected!') tt.timetool_correct(run, nominal, model, figs) + +def draw_sketch(config): + from btx.processing.freqdir import WrapperFullFD + setup = config.setup + task = config.draw_sketch + """ Perform Frequent Direction Sketching on run. 
""" + taskdir = os.path.join(setup.root_dir, 'draw_sketch') + os.makedirs(taskdir, exist_ok=True) + fd = WrapperFullFD(exp=setup.exp, run=setup.run, det_type=setup.det_type, + task.start_offset, task.num_imgs, task.writeToHere, task.grabImgSteps, + task.num_components, task.alpha, task.rankAdapt, task.rankAdaptMinError, + task.downsample, task.bin_factor, task.threshold, task.eluThreshold, + task.eluAlpha, task.normalizeIntensity, task.noZeroIntensity, + task.minIntensity, task.samplingFactor, task.divBy, task.thresholdQuantile) + logger.info(f'Performing Frequent Direction Sketching for run {setup.run} of {setup.exp}...') + fd.runMe() + logger.debug('Done!') + +def show_sketch(): + from btx.processing.freqdir import WrapperFullFD + setup = config.setup + task = config.show_sketch + """ Display Sketch. """ + taskdir = os.path.join(setup.root_dir, 'show_sketch') + os.makedirs(taskdir, exist_ok=True) + fd = WrapperFullFD(exp=setup.exp, run=setup.run, det_type=setup.det_type, + task.start_offset, task.num_imgs, task.writeToHere, task.grabImgSteps, + task.num_components, task.alpha, task.rankAdapt, task.rankAdaptMinError, + task.downsample, task.bin_factor, task.threshold, task.eluThreshold, + task.eluAlpha, task.normalizeIntensity, task.noZeroIntensity, + task.minIntensity, task.samplingFactor, task.divBy, task.thresholdQuantile) + logger.info(f'Display Sketch for run {setup.run} of {setup.exp}...') + fd.visualizeMe() + logger.debug('Done!') \ No newline at end of file From 71748dadc2aca149f16518a9978037a2b511623d Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Thu, 14 Sep 2023 14:55:31 -0700 Subject: [PATCH 36/57] created FD sketch tasks and workflow. --- scripts/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/tasks.py b/scripts/tasks.py index 6e82f3689..7c276b81a 100755 --- a/scripts/tasks.py +++ b/scripts/tasks.py @@ -515,7 +515,7 @@ def draw_sketch(config): setup = config.setup task = config.draw_sketch """ Perform Frequent Direction Sketching on run. """ - taskdir = os.path.join(setup.root_dir, 'draw_sketch') + taskdir = os.path.join(setup.root_dir, 'sketch') os.makedirs(taskdir, exist_ok=True) fd = WrapperFullFD(exp=setup.exp, run=setup.run, det_type=setup.det_type, task.start_offset, task.num_imgs, task.writeToHere, task.grabImgSteps, @@ -532,7 +532,7 @@ def show_sketch(): setup = config.setup task = config.show_sketch """ Display Sketch. """ - taskdir = os.path.join(setup.root_dir, 'show_sketch') + taskdir = os.path.join(setup.root_dir, 'sketch') os.makedirs(taskdir, exist_ok=True) fd = WrapperFullFD(exp=setup.exp, run=setup.run, det_type=setup.det_type, task.start_offset, task.num_imgs, task.writeToHere, task.grabImgSteps, From d0b8545b1d2954de24ca270ee417dc5b6e61f599 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Fri, 15 Sep 2023 12:05:18 -0700 Subject: [PATCH 37/57] Moved psana. Resolved cmpar bug. 430hz processing time not including visualization step. 
--- btx/interfaces/ipsana.py | 4 ++-- btx/processing/dimRed.py | 10 +++++++--- btx/processing/freqdir.py | 29 +++++++++++++++-------------- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/btx/interfaces/ipsana.py b/btx/interfaces/ipsana.py index f404a276b..6570a30d9 100644 --- a/btx/interfaces/ipsana.py +++ b/btx/interfaces/ipsana.py @@ -367,8 +367,8 @@ def get_images(self, num_images, assemble=True): if self.calibrate: cmpars = None if self.no_cmod: - cmpars = 0 - img = self.det.calib(evt=evt, cmpars) + cmpars = [0,0,0] + img = self.det.calib(evt=evt, cmpars=cmpars) else: img = self.det.raw(evt=evt) if self.det_type == 'epix10k2M': diff --git a/btx/processing/dimRed.py b/btx/processing/dimRed.py index 0bd1db85d..493ecc1e8 100644 --- a/btx/processing/dimRed.py +++ b/btx/processing/dimRed.py @@ -39,15 +39,19 @@ def __init__( priming=False, downsample=False, bin_factor=2, - output_dir="" + output_dir="", + psi=None ): self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = start_offset + if psi is None: + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = start_offset + else: + self.psi = psi self.start_offset = start_offset self.priming = priming diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 0aedbd523..dab7adec8 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -138,12 +138,13 @@ def __init__( downsample, bin_factor, samplingFactor, - num_components, + num_components, + psi, ): super().__init__(exp=exp, run=run, det_type=det_type, start_offset=start_offset, num_images=num_imgs, num_components=num_components, batch_size=0, priming=False, - downsample=downsample, bin_factor=bin_factor, output_dir=output_dir) + downsample=downsample, bin_factor=bin_factor, output_dir=output_dir, psi=psi) self.comm = comm self.rank= rank @@ -456,7 +457,7 @@ def write(self): filename : string Name of h5 file where sketch, mean of data, and indices of data processed is written """ - self.comm.barrier() +# self.comm.barrier() filename = self.output_dir + '{}_sketch_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) @@ -493,18 +494,18 @@ class MergeTree: currRun: Current datetime used to identify run """ - def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output_dir, allWriteDirecs, currRun): + def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output_dir, allWriteDirecs, currRun, psi): self.comm = comm self.rank = rank self.size = size self.divBy = divBy - time.sleep(15) +# time.sleep(5) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] - self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, rankAdaptMinError=1, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = None, imgsTracked=None, downsample=False, bin_factor=1, samplingFactor=1) + self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, rankAdaptMinError=1, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = None, imgsTracked=None, downsample=False, 
bin_factor=1, samplingFactor=1, psi=psi) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) @@ -656,7 +657,7 @@ def __init__( # print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) # while(not os.path.isfile(readFile2)): # print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) - time.sleep(15) +# time.sleep(5) with h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] # self.mean = hf["mean"][:] @@ -1358,7 +1359,7 @@ def runMe(self): freqDir = FreqDir(comm= self.comm, rank=self.rank, size = self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, det_type=self.det_type, output_dir=self.writeToHere, num_components=self.num_components, alpha=self.alpha, rankAdapt=self.rankAdapt, rankAdaptMinError = self.rankAdaptMinError, merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, - currRun = self.currRun, samplingFactor=self.samplingFactor, imgData = self.fullImgData, imgsTracked = self.imgsTracked) + currRun = self.currRun, samplingFactor=self.samplingFactor, imgData = self.fullImgData, imgsTracked = self.imgsTracked, psi=self.psi) print("{} STARTING SKETCHING FOR {}".format(self.rank, self.currRun)) st = time.perf_counter() freqDir.run() @@ -1376,7 +1377,7 @@ def runMe(self): for j in range(freqDir.size): allNames.append(fullSketchFilename + str(j) + ".h5") mergeTree = MergeTree(comm=self.comm, rank=self.rank, size=self.size, exp=self.exp, run=self.run, det_type=self.det_type, divBy=self.divBy, readFile = localSketchFilename, - output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun) + output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun, psi=self.psi) st = time.perf_counter() mergeTree.merge() mergedSketchFilename = mergeTree.write() @@ -1393,11 +1394,11 @@ def runMe(self): print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) print("Estimated full processing time for rank {0}/{1}: {2}".format(self.rank, self.size, et - stfull)) - self.comm.barrier() - self.comm.Barrier() - filenameTest3 = random.randint(0, 10) - filenameTest3 = self.comm.allgather(filenameTest3) - print("TEST 3: ", self.rank, filenameTest3) +# self.comm.barrier() +# self.comm.Barrier() +# filenameTest3 = random.randint(0, 10) +# filenameTest3 = self.comm.allgather(filenameTest3) +# print("TEST 3: ", self.rank, filenameTest3) def visualizeMe(self): #UMAP STEP From 8dd6ccc11c3c468ddb67faf02559f720ced800ff Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 17 Sep 2023 14:58:10 -0700 Subject: [PATCH 38/57] Fixed reconstruction error. Removed double psana initialization. Moved thumbnail generation outside of sketching. Other small updates. 
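
The reconstruction-error fix replaces the rank-one norm estimator with a
Gaussian test-matrix estimate: for a matrix M and a k-column matrix Omega of
i.i.d. N(0,1) entries, E||M Omega||_F^2 = k ||M||_F^2, so
sqrt(1/k) * ||M Omega||_F estimates ||M||_F. estimFrobNormJ applies this to
the projection residual without ever forming it densely. A self-contained
sketch of the idea (simplified; the in-tree version threads Omega through a
list of factors):

    import numpy as np

    def estimate_residual_fro(A, G, k=20, seed=0):
        """Estimate ||A - G @ G.T @ A||_F for data A (d x n) and an
        orthonormal basis G (d x ell) from the sketch's left singular vectors."""
        rng = np.random.default_rng(seed)
        omega = rng.standard_normal((A.shape[1], k))  # Gaussian test matrix
        y = A @ omega                                 # residual applied to Omega,
        y_hat = G @ (G.T @ y)                         # evaluated in two cheap pieces
        return np.sqrt(1.0 / k) * np.linalg.norm(y - y_hat, "fro")

Dividing by ||A||_F, as lowMemoryReconstructionErrorScaled does, gives the
relative error that feeds the rank-adaptation check against rankAdaptMinError.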
--- btx/processing/freqdir.py | 369 +++++++++++++++++++++++++------------- 1 file changed, 240 insertions(+), 129 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index dab7adec8..70402e87a 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -248,6 +248,7 @@ def update_model(self, X): self.rotate() if canRankAdapt and self.rankAdapt: reconError = np.sqrt(self.lowMemoryReconstructionErrorScaled(copyBatch)) + print("RANK ADAPT RECON ERROR: ", reconError) if (reconError > self.rankAdaptMinError): self.increaseEll = True self.sketch[self.nextZeroRow,:] = row @@ -329,76 +330,96 @@ def reconstructionError(self, matrixCentered): matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) - def lowMemoryReconstructionErrorScaled(self, matrixCentered): - """ - Compute the low memory reconstruction error of the matrix sketch - against given data. This is the same as reconstructionError, - but estimates the norm computation and does not scale by the - minimum projection matrix, but rather by the matrix norm itself. - - Parameters - ---------- - matrixCentered: ndarray - Data to compare matrix sketch to +# def lowMemoryReconstructionErrorScaled(self, matrixCentered): +# """ +# Compute the low memory reconstruction error of the matrix sketch +# against given data. This is the same as reconstructionError, +# but estimates the norm computation and does not scale by the +# minimum projection matrix, but rather by the matrix norm itself. +# +# Parameters +# ---------- +# matrixCentered: ndarray +# Data to compare matrix sketch to +# +# Returns +# ------- +# float, +# Data subtracted by data projected onto sketched space, scaled by matrix elements +# """ +# matSketch = self.sketch[:self.ell, :] +# print("RANK ADAPTIVE SHAPE:",matrixCentered.shape, matSketch.shape) +## k = 10 +# matrixCenteredT = matrixCentered.T +# matSketchT = matSketch.T +# U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) +## G = U[:,:k] +# G = U +# return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 50)/ +# np.linalg.norm(matrixCenteredT, 'fro')**2) - Returns - ------- - float, - Data subtracted by data projected onto sketched space, scaled by matrix elements - """ - matSketch = self.sketch - k = 10 + def lowMemoryReconstructionErrorScaled(self, matrixCentered): + matSketch = self.sketch[:self.ell, :] matrixCenteredT = matrixCentered.T matSketchT = matSketch.T U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) - G = U[:,:k] - return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 50)/ - np.linalg.norm(matrixCenteredT, 'fro')**2) - - def estimFrobNormSquared(self, addMe, arrs, its): - """ - Estimate the Frobenius Norm of product of arrs matrices - plus addME matrix using its iterations. - - Parameters - ---------- - arrs: list of ndarray - Matrices to multiply together + G = U + return self.estimFrobNormJ(matrixCenteredT, [G,G.T,matrixCenteredT], 20)/np.linalg.norm(matrixCenteredT, 'fro') - addMe: ndarray - Matrix to add to others - - its: int - Number of iterations to average over - - Returns - ------- - sumMe/its*no_rows : float - Estimate of frobenius norm of product - of arrs matrices plus addMe matrix - - Notes - ----- - Frobenius estimation is the expected value of matrix - multiplied by random vector from multivariate normal distribution - based on [1]. 
- - [1] Norm and Trace Estimation with Random Rank-one Vectors - Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix - Analysis and Applications 2021 42:1, 202-223 - """ - no_rows = arrs[-1].shape[1] - v = np.random.normal(size=no_rows) - v_hat = v / np.linalg.norm(v) + def estimFrobNormJ(self, addMe, arrs, k): + m, n = addMe.shape + randMat = np.random.normal(0, 1, size=(n, k)) + minusMe = addMe @ randMat sumMe = 0 - for j in range(its): - v = np.random.normal(size=no_rows) - v_hat = v / np.linalg.norm(v) - v_addMe = addMe @ v_hat - for arr in arrs[::-1]: - v_hat = arr @ v_hat - sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 - return sumMe/its*no_rows + for arr in arrs[::-1]: + randMat = arr @ randMat + sumMe += math.sqrt(1/k) * np.linalg.norm(randMat - minusMe, 'fro') + return sumMe + +# def estimFrobNormSquared(self, addMe, arrs, its): +# """ +# Estimate the Frobenius Norm of product of arrs matrices +# plus addME matrix using its iterations. +# +# Parameters +# ---------- +# arrs: list of ndarray +# Matrices to multiply together +# +# addMe: ndarray +# Matrix to add to others +# +# its: int +# Number of iterations to average over +# +# Returns +# ------- +# sumMe/its*no_rows : float +# Estimate of frobenius norm of product +# of arrs matrices plus addMe matrix +# +# Notes +# ----- +# Frobenius estimation is the expected value of matrix +# multiplied by random vector from multivariate normal distribution +# based on [1]. +# +# [1] Norm and Trace Estimation with Random Rank-one Vectors +# Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix +# Analysis and Applications 2021 42:1, 202-223 +# """ +# no_rows = arrs[-1].shape[1] +# v = np.random.normal(size=no_rows) +# v_hat = v / np.linalg.norm(v) +# sumMe = 0 +# for j in range(its): +# v = np.random.normal(size=no_rows) +# v_hat = v / np.linalg.norm(v) +# v_addMe = addMe @ v_hat +# for arr in arrs[::-1]: +# v_hat = arr @ v_hat +# sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 +# return sumMe/its*no_rows def gatherFreqDirsSerial(self): @@ -509,7 +530,7 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) - print(self.buffSizes) +# print(self.buffSizes) self.fd.update_model(self.data.T) @@ -620,7 +641,6 @@ class ApplyCompression: non-downsampled data for thumbnail generation components: Principal Components of matrix sketch processedData: Data projected onto matrix sketch range - smallImages: Downsampled images for visualization purposes """ def __init__( @@ -637,7 +657,6 @@ def __init__( output_dir, currRun, imgData, - thumbnailData, ): self.comm = comm @@ -657,7 +676,7 @@ def __init__( # print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) # while(not os.path.isfile(readFile2)): # print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) -# time.sleep(5) + time.sleep(5) with h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] # self.mean = hf["mean"][:] @@ -666,12 +685,10 @@ def __init__( self.components = Vt self.processedData = None - self.smallImgs = None self.imageIndicesProcessed = [] self.imgData = imgData - self.thumbnailData = thumbnailData def run(self): @@ -680,11 +697,8 @@ def run(self): Note: If-Else statement is from previous/future work enabling streaming processing. 
""" - if self.smallImgs is None: - self.smallImgs = self.thumbnailData - else: - self.smallImgs = np.concatenate((self.smallImgs, self.thumbnailData), axis=0) self.apply_compression(self.imgData) + return self.data def apply_compression(self, X): """ @@ -707,7 +721,6 @@ def write(self): filename = self.output_dir + '{}_ProjectedData_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("ProjectedData", data=self.processedData) - hf.create_dataset("SmallImages", data=self.smallImgs) # print("CREATED FILE: ", filename) self.comm.barrier() return filename @@ -883,10 +896,10 @@ def fastABOD(self, pts, nsamples): ac = cpt - apt if math.isclose(np.linalg.norm(ab), 0.0) or math.isclose(np.linalg.norm(ac), 0.0): count += 1 - print("TOO CLOSE") +# print("TOO CLOSE") continue outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) - print("CURRENT POINT: ", pts[a], test_list, outlier_factors, np.var(np.array(outlier_factors))) +# print("CURRENT POINT: ", pts[a], test_list, outlier_factors, np.var(np.array(outlier_factors))) if(len(outlier_factors)==0): abofs.append(np.inf) else: @@ -899,7 +912,7 @@ def getOutliers(self, lst): # quart10 = lstCopy[len(lstCopy)//divBy] lstQuant = np.quantile(np.array(lst), self.outlierQuantile) - print("AIDJWOIJDAOWIDJWAOIDJAWOIDWJA", lstQuant, lst) +# print("AIDJWOIJDAOWIDJWAOIDJAWOIDWJA", lstQuant, lst) outlierInds = [] notOutlierInds = [] for j in range(len(lst)): @@ -907,8 +920,8 @@ def getOutliers(self, lst): outlierInds.append(j) else: notOutlierInds.append(j) - print("OUTLIER INDS: ", outlierInds) - print("NOT OUTLIER INDS: ", notOutlierInds) +# print("OUTLIER INDS: ", outlierInds) +# print("NOT OUTLIER INDS: ", notOutlierInds) return np.array(outlierInds), np.array(notOutlierInds) def genHist(self, vals, endClass): @@ -1281,27 +1294,6 @@ def userShow(self): output_notebook() show(self.viewResults) -def profile(filename=None, comm=MPI.COMM_WORLD): - def prof_decorator(f): - def wrap_f(*args, **kwargs): - pr = cProfile.Profile() - pr.enable() - result = f(*args, **kwargs) - pr.disable() - - if filename is None: - pr.print_stats() - else: - filename_r = filename + ".{}".format(comm.rank) - pr.dump_stats(filename_r) - - return result - return wrap_f - return prof_decorator - -def id_generator(size=6, chars=string.ascii_uppercase + string.digits): - return ''.join(random.choice(chars) for _ in range(size)) - class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. @@ -1347,12 +1339,99 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile) self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) -# @profile(filename="fullFD_profile") +# def lowMemoryReconstructionErrorScaled(self, matrixCentered, matSketch): +# """ +# Compute the low memory reconstruction error of the matrix sketch +# against given data. This is the same as reconstructionError, +# but estimates the norm computation and does not scale by the +# minimum projection matrix, but rather by the matrix norm itself. 
+# +# Parameters +# ---------- +# matrixCentered: ndarray +# Data to compare matrix sketch to +# +# Returns +# ------- +# float, +# Data subtracted by data projected onto sketched space, scaled by matrix elements +# """ +## k = 10 +# matrixCenteredT = matrixCentered.T +# matSketchT = matSketch.T +# U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) +## G = U[:,:k] +# G = U +# return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 50)/ +# np.linalg.norm(matrixCenteredT, 'fro')**2) +# +# def estimFrobNormSquared(self, addMe, arrs, its): +# """ +# Estimate the Frobenius Norm of product of arrs matrices +# plus addME matrix using its iterations. +# +# Parameters +# ---------- +# arrs: list of ndarray +# Matrices to multiply together +# +# addMe: ndarray +# Matrix to add to others +## +# its: int +# Number of iterations to average over +# +# Returns +# ------- +# sumMe/its*no_rows : float +# Estimate of frobenius norm of product +# of arrs matrices plus addMe matrix +# +# Notes +# ----- +# Frobenius estimation is the expected value of matrix +# multiplied by random vector from multivariate normal distribution +# based on [1]. +# +# [1] Norm and Trace Estimation with Random Rank-one Vectors +# Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix +# Analysis and Applications 2021 42:1, 202-223 +# """ +# no_rows = arrs[-1].shape[1] +# v = np.random.normal(size=no_rows) +# v_hat = v / np.linalg.norm(v) +# sumMe = 0 +# for j in range(its): +# v = np.random.normal(size=no_rows) +# v_hat = v / np.linalg.norm(v) +# v_addMe = addMe @ v_hat +# for arr in arrs[::-1]: +# v_hat = arr @ v_hat +# sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 +# return sumMe/its*no_rows + + def lowMemoryReconstructionErrorScaled(self, matrixCentered, matSketch): + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) + G = U + return self.estimFrobNormJ(matrixCenteredT, [G,G.T,matrixCenteredT], 20)/np.linalg.norm(matrixCenteredT, 'fro') + + def estimFrobNormJ(self, addMe, arrs, k): + m, n = addMe.shape + randMat = np.random.normal(0, 1, size=(n, k)) + minusMe = addMe @ randMat + sumMe = 0 + for arr in arrs[::-1]: + randMat = arr @ randMat + sumMe += math.sqrt(1/k) * np.linalg.norm(randMat - minusMe, 'fro') + return sumMe + def runMe(self): stfull = time.perf_counter() startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - self.fullImgData, self.fullThumbnailData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps) + self.fullImgData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=False) #SKETCHING STEP ########################################################################################## @@ -1386,9 +1465,9 @@ def runMe(self): #PROJECTION STEP ########################################################################################## - appComp = ApplyCompression(comm=self.comm, rank = self.rank, size=self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run,det_type=self.det_type, readFile = mergedSketchFilename, output_dir = self.writeToHere, currRun = self.currRun, imgData = self.fullImgData, thumbnailData = self.fullThumbnailData) + appComp = ApplyCompression(comm=self.comm, rank = self.rank, size=self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, 
exp=self.exp, run=self.run,det_type=self.det_type, readFile = mergedSketchFilename, output_dir = self.writeToHere, currRun = self.currRun, imgData = self.fullImgData) st = time.perf_counter() - appComp.run() + self.matSketch = appComp.run() appComp.write() et = time.perf_counter() print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) @@ -1400,12 +1479,25 @@ def runMe(self): # filenameTest3 = self.comm.allgather(filenameTest3) # print("TEST 3: ", self.rank, filenameTest3) + def addThumbnailsToProjectH5(self): +# print("Gathering thumbnails") + startingPoint = self.start_offset + self.num_imgs*self.rank//self.size + _,self.fullThumbnailData,_ = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) + file_name = "/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData_{}.h5".format(self.currRun, self.rank) + f1 = h5py.File(file_name, 'r+') + f1.create_dataset("SmallImages", data=self.fullThumbnailData) + f1.close() + self.comm.barrier() + + def visualizeMe(self): + st = time.perf_counter() + self.addThumbnailsToProjectH5() #UMAP STEP ########################################################################################## if self.rank==0: - print("here 1") - st = time.perf_counter() + +# print("here 1") skipSize = 8 numImgsToUse = int(self.num_imgs/skipSize) @@ -1535,7 +1627,7 @@ def split_range(self, start, end, num_tuples): tuples.append((last_batch_start, last_batch_end)) return tuples - def get_formatted_images(self, startInd, n, num_steps): + def get_formatted_images(self, startInd, n, num_steps, getThumbnails): """ Fetch n - x image segments from run, where x is the number of 'dead' images. @@ -1566,7 +1658,8 @@ def get_formatted_images(self, startInd, n, num_steps): imgs = imgs[ [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] ] - thumbnails = self.assembleImgsToSave(imgs) + if getThumbnails: + thumbnails = self.assembleImgsToSave(imgs) if self.downsample: imgs = bin_data(imgs, self.bin_factor) @@ -1574,25 +1667,43 @@ def get_formatted_images(self, startInd, n, num_steps): img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T img_batch[img_batch<0] = 0 - num_valid_thumbnails, tx, ty = thumbnails.shape - thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T - - nimg_batch = [] - nthumbnail_batch = [] - for img, thumbnail in zip(img_batch.T, thumbnail_batch.T): - currIntensity = np.sum(img.flatten(), dtype=np.double) - nimg = self.imageProcessor.processImg(img, currIntensity) - nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) - if nimg is not None: - nimg_batch.append(nimg) - nthumbnail_batch.append(nthumbnail) - nimg_batch = np.array(nimg_batch).T - nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) - if fullimgs is None: - fullimgs = nimg_batch - fullthumbnails = nthumbnail_batch + if getThumbnails: + num_valid_thumbnails, tx, ty = thumbnails.shape + thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T + + if getThumbnails: + nimg_batch = [] + nthumbnail_batch = [] + for img, thumbnail in zip(img_batch.T, thumbnail_batch.T): + currIntensity = np.sum(img.flatten(), dtype=np.double) + nimg = self.imageProcessor.processImg(img, currIntensity) + nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) + if nimg is not None: + nimg_batch.append(nimg) + nthumbnail_batch.append(nthumbnail) + nimg_batch = 
np.array(nimg_batch).T + nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) + if fullimgs is None: + fullimgs = nimg_batch + fullthumbnails = nthumbnail_batch + else: + fullimgs = np.hstack((fullimgs, nimg_batch)) + fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) else: - fullimgs = np.hstack((fullimgs, nimg_batch)) - fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) + nimg_batch = [] + for img in img_batch.T: + currIntensity = np.sum(img.flatten(), dtype=np.double) + nimg = self.imageProcessor.processImg(img, currIntensity) + if nimg is not None: + nimg_batch.append(nimg) + nimg_batch = np.array(nimg_batch).T + if fullimgs is None: + fullimgs = nimg_batch + else: + fullimgs = np.hstack((fullimgs, nimg_batch)) + print("Images tracked:", imgsTracked) - return (fullimgs, fullthumbnails, imgsTracked) + if getThumbnails: + return (fullimgs, fullthumbnails, imgsTracked) + else: + return (fullimgs, imgsTracked) From b8f9ab4d7589d15c8329127c193d951b2d5dbfbc Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 24 Sep 2023 08:39:55 -0700 Subject: [PATCH 39/57] Checkpoint. I don't think there are any major changes. --- btx/processing/freqdir.py | 85 ++++++++++++++++++++++++++++++++------- scripts/tasks.py | 32 ++++++++++----- 2 files changed, 92 insertions(+), 25 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 70402e87a..b4b4fb5d0 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -140,12 +140,28 @@ def __init__( samplingFactor, num_components, psi, + usePSI ): - super().__init__(exp=exp, run=run, det_type=det_type, start_offset=start_offset, - num_images=num_imgs, num_components=num_components, batch_size=0, priming=False, - downsample=downsample, bin_factor=bin_factor, output_dir=output_dir, psi=psi) - +######################## + if usePSI: + super().__init__(exp=exp, run=run, det_type=det_type, start_offset=start_offset, + num_images=num_imgs, num_components=num_components, batch_size=0, priming=False, + downsample=downsample, bin_factor=bin_factor, output_dir=output_dir, psi=psi) + else: + self.start_offset = start_offset + self.downsample = False + self.bin_factor = 0 + self.output_dir = output_dir + self.num_components = num_components + self.num_features,self.num_images = imgData.shape + print("NUM IMAGES: ", self.num_images) + + self.task_durations = dict({}) + + self.num_incorporated_images = 0 + self.outliers, self.pc_data = [], [] +######################## self.comm = comm self.rank= rank self.size = size @@ -526,7 +542,7 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] - self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, rankAdaptMinError=1, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = None, imgsTracked=None, downsample=False, bin_factor=1, samplingFactor=1, psi=psi) + self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, rankAdaptMinError=1, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = None, imgsTracked=None, downsample=False, bin_factor=1, samplingFactor=1, psi=psi, usePSI=True) sendbuf = 
self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) @@ -798,7 +814,6 @@ def update(self, vec): - class visualizeFD: """ Visualize FD Dimension Reduction using UMAP and DBSCAN @@ -1298,7 +1313,7 @@ class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. """ - def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile): + def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, usePSI=True): self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1325,11 +1340,16 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = self.start_offset + self.num_imgs*self.rank//self.size self.imgsTracked = [] self.grabImgSteps = grabImgSteps + self.usePSI = usePSI + if usePSI: + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = self.start_offset + self.num_imgs*self.rank//self.size + else: + self.psi = None + if self.rank==0: self.currRun = datetime.now().strftime("%y%m%d%H%M%S") else: @@ -1427,18 +1447,43 @@ def estimFrobNormJ(self, addMe, arrs, k): sumMe += math.sqrt(1/k) * np.linalg.norm(randMat - minusMe, 'fro') return sumMe + def retrieveImages(self): + startingPoint = self.start_offset + self.num_imgs*self.rank//self.size + self.fullImgData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=False) + + def genSynthData(self): + self.fullImgData = np.random.rand(70000, 100000//self.size) + self.imgsTracked = [(0, self.rank)] + + def genDecayingSVD(self): + A = np.random.rand(matrixSize, matrixSize)\n + A = A.T @ A\n + eigVals, eigVecs = np.linalg.eig(A)\n + diag_entries = list(np.random.rand(matrixSize))\n + diag_entries.sort()\n + diag_entries = np.array(diag_entries[::-1])\n + D = np.diag(diag_entries) + np.eye(matrixSize)\n + return (eigVecs @ (D) @ eigVecs.T) + def runMe(self): + stfull = time.perf_counter() - startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - self.fullImgData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=False) + #DATA RETRIEVAL STEP + ########################################################################################## + if self.usePSI: + self.retrieveImages() + else: + self.genSynthData() + et = time.perf_counter() + print("Estimated time for data retrieval for rank {0}/{1}: {2}".format(self.rank, self.size, et - stfull)) #SKETCHING STEP ########################################################################################## freqDir = FreqDir(comm= self.comm, rank=self.rank, size = self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, det_type=self.det_type, output_dir=self.writeToHere, num_components=self.num_components, alpha=self.alpha, rankAdapt=self.rankAdapt, 
rankAdaptMinError = self.rankAdaptMinError, merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, - currRun = self.currRun, samplingFactor=self.samplingFactor, imgData = self.fullImgData, imgsTracked = self.imgsTracked, psi=self.psi) + currRun = self.currRun, samplingFactor=self.samplingFactor, imgData = self.fullImgData, imgsTracked = self.imgsTracked, psi=self.psi, usePSI=self.usePSI) print("{} STARTING SKETCHING FOR {}".format(self.rank, self.currRun)) st = time.perf_counter() freqDir.run() @@ -1489,7 +1534,6 @@ def addThumbnailsToProjectH5(self): f1.close() self.comm.barrier() - def visualizeMe(self): st = time.perf_counter() self.addThumbnailsToProjectH5() @@ -1649,25 +1693,35 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): fullthumbnails = None imgsTracked = [] runs = self.split_range(startInd, startInd+n, num_steps) + print(runs) for runStart, runEnd in runs: +# print("RETRIEVING: [", runStart, ":", runEnd,"]") self.psi.counter = runStart imgsTracked.append((runStart, runEnd)) +# print("getting images") imgs = self.psi.get_images(runEnd-runStart, assemble=False) +# print("Removing nan images") imgs = imgs[ [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] ] + if getThumbnails: +# print("Assembling thumbnails") thumbnails = self.assembleImgsToSave(imgs) if self.downsample: +# print("Downsampling images") imgs = bin_data(imgs, self.bin_factor) +# print("Flattening images") num_valid_imgs, p, x, y = imgs.shape img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T +# print("Image values less than 0 setting to 0") img_batch[img_batch<0] = 0 if getThumbnails: +# print("FLattening thumbnails") num_valid_thumbnails, tx, ty = thumbnails.shape thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T @@ -1693,16 +1747,19 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): nimg_batch = [] for img in img_batch.T: currIntensity = np.sum(img.flatten(), dtype=np.double) +# print("Starting image processing of size {}".format(img_batch.T.shape)) nimg = self.imageProcessor.processImg(img, currIntensity) if nimg is not None: nimg_batch.append(nimg) nimg_batch = np.array(nimg_batch).T +# print("hstacking") if fullimgs is None: + fullimgs = nimg_batch else: fullimgs = np.hstack((fullimgs, nimg_batch)) - print("Images tracked:", imgsTracked) +# print("Images tracked:", imgsTracked) if getThumbnails: return (fullimgs, fullthumbnails, imgsTracked) else: diff --git a/scripts/tasks.py b/scripts/tasks.py index 7c276b81a..0603a2498 100755 --- a/scripts/tasks.py +++ b/scripts/tasks.py @@ -518,11 +518,16 @@ def draw_sketch(config): taskdir = os.path.join(setup.root_dir, 'sketch') os.makedirs(taskdir, exist_ok=True) fd = WrapperFullFD(exp=setup.exp, run=setup.run, det_type=setup.det_type, - task.start_offset, task.num_imgs, task.writeToHere, task.grabImgSteps, - task.num_components, task.alpha, task.rankAdapt, task.rankAdaptMinError, - task.downsample, task.bin_factor, task.threshold, task.eluThreshold, - task.eluAlpha, task.normalizeIntensity, task.noZeroIntensity, - task.minIntensity, task.samplingFactor, task.divBy, task.thresholdQuantile) + start_offset=task.start_offset, num_imgs=task.num_imgs, + writeToHere=task.writeToHere, grabImgSteps=task.grabImgSteps, + num_components=task.num_components, alpha=task.alpha, + rankAdapt=task.rankAdapt, rankAdaptMinError=task.rankAdaptMinError, + downsample=task.downsample, bin_factor=task.bin_factor, + threshold=task.threshold, 
eluThreshold=task.eluThreshold, + eluAlpha=task.eluAlpha, normalizeIntensity=task.normalizeIntensity, + noZeroIntensity=task.noZeroIntensity, minIntensity=task.minIntensity, + samplingFactor=task.samplingFactor, divBy=task.divBy, + thresholdQuantile=task.thresholdQuantile) logger.info(f'Performing Frequent Direction Sketching for run {setup.run} of {setup.exp}...') fd.runMe() logger.debug('Done!') @@ -535,11 +540,16 @@ def show_sketch(): taskdir = os.path.join(setup.root_dir, 'sketch') os.makedirs(taskdir, exist_ok=True) fd = WrapperFullFD(exp=setup.exp, run=setup.run, det_type=setup.det_type, - task.start_offset, task.num_imgs, task.writeToHere, task.grabImgSteps, - task.num_components, task.alpha, task.rankAdapt, task.rankAdaptMinError, - task.downsample, task.bin_factor, task.threshold, task.eluThreshold, - task.eluAlpha, task.normalizeIntensity, task.noZeroIntensity, - task.minIntensity, task.samplingFactor, task.divBy, task.thresholdQuantile) + start_offset=task.start_offset, num_imgs=task.num_imgs, + writeToHere=task.writeToHere, grabImgSteps=task.grabImgSteps, + num_components=task.num_components, alpha=task.alpha, + rankAdapt=task.rankAdapt, rankAdaptMinError=task.rankAdaptMinError, + downsample=task.downsample, bin_factor=task.bin_factor, + threshold=task.threshold, eluThreshold=task.eluThreshold, + eluAlpha=task.eluAlpha, normalizeIntensity=task.normalizeIntensity, + noZeroIntensity=task.noZeroIntensity, minIntensity=task.minIntensity, + samplingFactor=task.samplingFactor, divBy=task.divBy, + thresholdQuantile=task.thresholdQuantile) logger.info(f'Display Sketch for run {setup.run} of {setup.exp}...') fd.visualizeMe() - logger.debug('Done!') \ No newline at end of file + logger.debug('Done!') From 068389651bdb21baabe7167982d29158c98b9ad0 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 3 Oct 2023 12:48:52 -0700 Subject: [PATCH 40/57] Checkpoint. Runs well and synthetic data fixed. 
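
The "synthetic data fixed" part adds compDecayingSVD alongside the
uniform-noise generator: it builds test data with an exponentially decaying
singular spectrum, so the sketch has genuine low-rank structure to recover.
A standalone sketch of the construction (sizes illustrative; the in-tree
version also perturbs the left factor and splits the samples across MPI
ranks):

    import numpy as np

    def decaying_svd_data(n_feats=512, n_samps=2048, seed=0):
        """Toy (n_feats x n_samps) matrix whose singular values fall off
        roughly like 2**(-16*j/n_feats); assumes n_samps >= n_feats."""
        rng = np.random.default_rng(seed)
        q1, _ = np.linalg.qr(rng.random((n_samps, n_feats)))  # orthonormal left factor
        q2, _ = np.linalg.qr(rng.random((n_feats, n_feats)))  # orthogonal right factor
        s = np.sort(rng.random(n_feats))[::-1]                # decreasing base spectrum
        s *= 2.0 ** (-16.0 * np.arange(1, n_feats + 1) / n_feats)
        return (q1 @ np.diag(s) @ q2).T                       # features x samples, like fullImgData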
--- btx/processing/freqdir.py | 227 +++++++++++++++++++++++++++++++------- 1 file changed, 190 insertions(+), 37 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index b4b4fb5d0..762ef85f0 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -308,10 +308,12 @@ def rotate(self): toShrink[-1] = 0 toShrink = sqrt(toShrink) toShrink[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] - self.sketch[:self.ell:,:] = dot(diag(toShrink), Vt[:self.ell,:]) + #self.sketch[:self.ell:,:] = dot(diag(toShrink), Vt[:self.ell,:]) #JOHN: Removed this extra colon 10/01/2023 + self.sketch[:self.ell,:] = dot(diag(toShrink), Vt[:self.ell,:]) self.sketch[self.ell:,:] = 0 self.nextZeroRow = self.ell else: + print(S.shape, self.ell) self.sketch[:ssize,:] = diag(s) @ Vt[:ssize,:] self.sketch[ssize:,:] = 0 self.nextZeroRow = ssize @@ -531,18 +533,18 @@ class MergeTree: currRun: Current datetime used to identify run """ - def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output_dir, allWriteDirecs, currRun, psi): + def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output_dir, allWriteDirecs, currRun, psi, usePSI): self.comm = comm self.rank = rank self.size = size self.divBy = divBy -# time.sleep(5) + time.sleep(10) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] - self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, rankAdaptMinError=1, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = None, imgsTracked=None, downsample=False, bin_factor=1, samplingFactor=1, psi=psi, usePSI=True) + self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, rankAdaptMinError=1, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = np.random.rand(2, 2), imgsTracked=None, downsample=False, bin_factor=1, samplingFactor=1, psi=psi, usePSI=usePSI) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) @@ -692,7 +694,7 @@ def __init__( # print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) # while(not os.path.isfile(readFile2)): # print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) - time.sleep(5) + time.sleep(10) with h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] # self.mean = hf["mean"][:] @@ -963,6 +965,8 @@ def genUMAP(self): imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) + print("AOIDWJOIAWDJ", len(imgs), len(projections)) + intensities = [] for img in imgs: intensities.append(np.sum(img.flatten())) @@ -981,7 +985,8 @@ def genUMAP(self): n_neighbors=self.umap_n_neighbors, random_state=self.umap_random_state, n_components=2, - min_dist=0.25, +# min_dist=0.25, + min_dist=0.1, ).fit_transform(self.projections) self.labels = hdbscan.HDBSCAN( @@ -1357,7 +1362,7 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.currRun = self.comm.bcast(self.currRun, root=0) self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = 
self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile) - self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) + self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 150, thumbnailWidth = 150) # def lowMemoryReconstructionErrorScaled(self, matrixCentered, matSketch): # """ @@ -1455,15 +1460,36 @@ def genSynthData(self): self.fullImgData = np.random.rand(70000, 100000//self.size) self.imgsTracked = [(0, self.rank)] - def genDecayingSVD(self): - A = np.random.rand(matrixSize, matrixSize)\n - A = A.T @ A\n - eigVals, eigVecs = np.linalg.eig(A)\n - diag_entries = list(np.random.rand(matrixSize))\n - diag_entries.sort()\n - diag_entries = np.array(diag_entries[::-1])\n - D = np.diag(diag_entries) + np.eye(matrixSize)\n - return (eigVecs @ (D) @ eigVecs.T) +# def genDecayingSVD(self): +# numFeats = 70000 +# numSamps = 100000//self.size +# A = np.random.rand(matrixSize, matrixSize) +## A = A.T @ A +# eigVals, eigVecs = np.linalg.eig(A) +# diag_entries = list(np.random.rand(matrixSize)) +## diag_entries.sort() +# multMe = np.ones(numSamps) +## diag_entries = np.array(diag_entries[::-1]) +# D = np.diag(diag_entries) + np.eye(matrixSize) +# return (eigVecs @ (D) @ eigVecs.T) + + def compDecayingSVD(self, seedMe, a, b): + numFeats = a + numSamps = b//self.size + perturbation = np.random.rand(numSamps, numFeats)*0.1 + np.random.seed(seedMe) + A1 = np.random.rand(numSamps, numFeats) + Q1, R1 = np.linalg.qr(A1) + Q1 = Q1 + perturbation + A2 = np.random.rand(numFeats, numFeats) #Modify + Q2, R2 = np.linalg.qr(A2) + S = list(np.random.rand(numFeats)) #Modify + S.sort() + S = S[::-1] + for j in range(len(S)): #Modify + S[j] = (2**(-16*(j+1)/len(S)))*S[j] + self.fullImgData = (Q1 @ np.diag(S) @ Q2).T + self.imgsTracked = [(0, numSamps)] def runMe(self): @@ -1471,12 +1497,13 @@ def runMe(self): #DATA RETRIEVAL STEP ########################################################################################## - if self.usePSI: - self.retrieveImages() - else: - self.genSynthData() - et = time.perf_counter() - print("Estimated time for data retrieval for rank {0}/{1}: {2}".format(self.rank, self.size, et - stfull)) +# if self.usePSI: +# self.retrieveImages() +# else: +# self.compDecayingSVD() +## self.genSynthData() +# et = time.perf_counter() +# print("Estimated time for data retrieval for rank {0}/{1}: {2}".format(self.rank, self.size, et - stfull)) #SKETCHING STEP ########################################################################################## @@ -1485,11 +1512,11 @@ def runMe(self): merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, currRun = self.currRun, samplingFactor=self.samplingFactor, imgData = self.fullImgData, imgsTracked = self.imgsTracked, psi=self.psi, usePSI=self.usePSI) print("{} STARTING SKETCHING FOR {}".format(self.rank, self.currRun)) - st = time.perf_counter() + st1 = time.perf_counter() freqDir.run() localSketchFilename = freqDir.write() - et = time.perf_counter() - print("Estimated time for frequent directions rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) + et1 = time.perf_counter() + print("Estimated time for frequent directions rank {0}/{1}: {2}".format(self.rank, self.size, 
et1 - st1)) #MERGING STEP ########################################################################################## @@ -1501,22 +1528,24 @@ def runMe(self): for j in range(freqDir.size): allNames.append(fullSketchFilename + str(j) + ".h5") mergeTree = MergeTree(comm=self.comm, rank=self.rank, size=self.size, exp=self.exp, run=self.run, det_type=self.det_type, divBy=self.divBy, readFile = localSketchFilename, - output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun, psi=self.psi) - st = time.perf_counter() + output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun, psi=self.psi, usePSI=self.usePSI) + st2 = time.perf_counter() mergeTree.merge() mergedSketchFilename = mergeTree.write() - et = time.perf_counter() - print("Estimated time merge tree for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) + et2 = time.perf_counter() + print("Estimated time merge tree for rank {0}/{1}: {2}".format(self.rank, self.size, et2 - st2)) #PROJECTION STEP ########################################################################################## appComp = ApplyCompression(comm=self.comm, rank = self.rank, size=self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run,det_type=self.det_type, readFile = mergedSketchFilename, output_dir = self.writeToHere, currRun = self.currRun, imgData = self.fullImgData) - st = time.perf_counter() + st3 = time.perf_counter() self.matSketch = appComp.run() appComp.write() - et = time.perf_counter() - print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) - print("Estimated full processing time for rank {0}/{1}: {2}".format(self.rank, self.size, et - stfull)) + et3 = time.perf_counter() + print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et3 - st3)) + print("Estimated full processing time for rank {0}/{1}: {2}, {3}".format(self.rank, self.size, (et1 + et2 + et3 - st1 - st2 - st3), et3 - stfull)) + + return (et1 + et2 + et3 - st1 - st2 - st3) # self.comm.barrier() # self.comm.Barrier() @@ -1528,7 +1557,7 @@ def addThumbnailsToProjectH5(self): # print("Gathering thumbnails") startingPoint = self.start_offset + self.num_imgs*self.rank//self.size _,self.fullThumbnailData,_ = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) - file_name = "/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData_{}.h5".format(self.currRun, self.rank) + file_name = self.writeToHere+"{}_ProjectedData_{}.h5".format(self.currRun, self.rank) f1 = h5py.File(file_name, 'r+') f1.create_dataset("SmallImages", data=self.fullThumbnailData) f1.close() @@ -1543,16 +1572,17 @@ def visualizeMe(self): # print("here 1") - skipSize = 8 + skipSize = 1 numImgsToUse = int(self.num_imgs/skipSize) - visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), + visMe = visualizeFD(inputFile=self.writeToHere+"{}_ProjectedData".format(self.currRun), outputFile="./UMAPVis_{}.html".format(self.currRun), numImgsToUse=self.num_imgs, nprocs=self.size, userGroupings=[], includeABOD=True, skipSize = skipSize, - umap_n_neighbors=numImgsToUse//40, +# umap_n_neighbors=numImgsToUse//40, + umap_n_neighbors=numImgsToUse//4000, umap_random_state=42, hdbscan_min_samples=int(numImgsToUse*0.75//40), hdbscan_min_cluster_size=int(numImgsToUse//40), @@ -1764,3 +1794,126 @@ def get_formatted_images(self, 
startInd, n, num_steps, getThumbnails): return (fullimgs, fullthumbnails, imgsTracked) else: return (fullimgs, imgsTracked) + + +class SinglePanelDataRetriever: + def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, thumbnailHeight, thumbnailWidth): + self.exp = exp + self.det_type = det_type + self.run = run + self.thumbnailHeight = thumbnailHeight + self.thumbnailWidth = thumbnailWidth + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + + self.imageProcessor = imageProcessor + + def split_range(self, start, end, num_tuples): + if start==end: + raise ValueError('Range processing error: start value equals end value, which leads to no images processed.') + return + total_elements = end - start + batch_size = total_elements // num_tuples + tuples = [] + for i in range(num_tuples - 1): + batch_start = start + i * batch_size + batch_end = batch_start + batch_size + tuples.append((batch_start, batch_end)) + last_batch_start = start + (num_tuples - 1) * batch_size + last_batch_end = end + tuples.append((last_batch_start, last_batch_end)) + return tuples + + def get_formatted_images(self, startInd, n, num_steps, getThumbnails): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. + + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + fullimgs = None + fullthumbnails = None + imgsTracked = [] + runs = self.split_range(startInd, startInd+n, num_steps) + print(runs) + for runStart, runEnd in runs: +# print("RETRIEVING: [", runStart, ":", runEnd,"]") + self.psi.counter = runStart + imgsTracked.append((runStart, runEnd)) + +# print("getting images") + imgs = self.psi.get_images(runEnd-runStart, assemble=False) + +# print("Removing nan images") + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + + if getThumbnails: + saveMe = [] + for img in imgs: + saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) + thumbnails = np.array(saveMe) + print("thumbaaowdijaoiajw", len(imgs), len(thumbnails)) + + num_valid_imgs, x, y = imgs.shape + img_batch = np.reshape(imgs, (num_valid_imgs, x * y)).T +# print("Image values less than 0 setting to 0") + img_batch[img_batch<0] = 0 + + if getThumbnails: +# print("FLattening thumbnails") + num_valid_thumbnails, tx, ty = thumbnails.shape + thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T + + if getThumbnails: + nimg_batch = [] + nthumbnail_batch = [] + for img, thumbnail in zip(img_batch.T, thumbnail_batch.T): + currIntensity = np.sum(img.flatten(), dtype=np.double) + nimg = self.imageProcessor.processImg(img, currIntensity) + nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) + if nimg is not None: + nimg_batch.append(nimg) + nthumbnail_batch.append(nthumbnail) + nimg_batch = np.array(nimg_batch).T + nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) + if fullimgs is None: + fullimgs = nimg_batch + fullthumbnails = nthumbnail_batch + else: + fullimgs = np.hstack((fullimgs, nimg_batch)) + fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) + else: + nimg_batch = [] + for img in img_batch.T: + currIntensity = np.sum(img.flatten(), 
dtype=np.double) +# print("Starting image processing of size {}".format(img_batch.T.shape)) + nimg = self.imageProcessor.processImg(img, currIntensity) + if nimg is not None: + nimg_batch.append(nimg) + nimg_batch = np.array(nimg_batch).T +# print("hstacking") + if fullimgs is None: + + fullimgs = nimg_batch + else: + fullimgs = np.hstack((fullimgs, nimg_batch)) + +# print("Images tracked:", imgsTracked) + if getThumbnails: + print(fullimgs.shape, fullthumbnails.shape, imgsTracked) + return (fullimgs, fullthumbnails, imgsTracked) + else: + return (fullimgs, imgsTracked) From 9ccfb95fce149dbf5f9cfd27b4654e9649e8257a Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 3 Oct 2023 14:32:54 -0700 Subject: [PATCH 41/57] Separated visualization from sketching file --- btx/processing/freqdir.py | 12 +- btx/processing/vizfreq.py | 544 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 551 insertions(+), 5 deletions(-) create mode 100644 btx/processing/vizfreq.py diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 762ef85f0..2ccd0278a 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -39,8 +39,8 @@ from datetime import datetime -import umap -import hdbscan +#import umap +#import hdbscan from sklearn.cluster import OPTICS, cluster_optics_dbscan from matplotlib import colors @@ -815,7 +815,7 @@ def update(self, vec): self.sketch.push(vec, pi, wi) - +''' class visualizeFD: """ Visualize FD Dimension Reduction using UMAP and DBSCAN @@ -1313,6 +1313,7 @@ def userShow(self): from bokeh.io import output_notebook output_notebook() show(self.viewResults) +''' class WrapperFullFD: """ @@ -1544,7 +1545,7 @@ def runMe(self): et3 = time.perf_counter() print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et3 - st3)) print("Estimated full processing time for rank {0}/{1}: {2}, {3}".format(self.rank, self.size, (et1 + et2 + et3 - st1 - st2 - st3), et3 - stfull)) - + self.addThumbnailsToProjectH5() return (et1 + et2 + et3 - st1 - st2 - st3) # self.comm.barrier() @@ -1562,7 +1563,7 @@ def addThumbnailsToProjectH5(self): f1.create_dataset("SmallImages", data=self.fullThumbnailData) f1.close() self.comm.barrier() - +''' def visualizeMe(self): st = time.perf_counter() self.addThumbnailsToProjectH5() @@ -1595,6 +1596,7 @@ def visualizeMe(self): et = time.perf_counter() print("UMAP HTML Generation Processing time: {}".format(et - st)) #print("TOTAL PROCESING TIME: {}".format(et - stfull)) +''' class FD_ImageProcessing: def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile): diff --git a/btx/processing/vizfreq.py b/btx/processing/vizfreq.py new file mode 100644 index 000000000..14340cc3f --- /dev/null +++ b/btx/processing/vizfreq.py @@ -0,0 +1,544 @@ +import sys +sys.path.append("/sdf/home/w/winnicki/btx/") +from btx.processing.dimRed import * + +import os, csv, argparse +import math +import time +import random +from collections import Counter +import h5py + +import numpy as np +from numpy import zeros, sqrt, dot, diag +from numpy.linalg import svd, LinAlgError +from scipy.linalg import svd as scipy_svd +import pandas as pd +from sklearn.neighbors import NearestNeighbors +from sklearn.metrics.pairwise import euclidean_distances +import heapq + +from mpi4py import MPI + +from matplotlib import pyplot as plt +from matplotlib import colors + +from PIL import Image +from io import BytesIO +import base64 + +from datetime import datetime + +import umap +import hdbscan +from 
sklearn.cluster import OPTICS, cluster_optics_dbscan + +from matplotlib import colors +import matplotlib as mpl +from matplotlib import cm + +from bokeh.plotting import figure, show, output_file, save +from bokeh.models import HoverTool, CategoricalColorMapper, LinearColorMapper, ColumnDataSource, CustomJS, Slider, RangeSlider, Toggle, RadioButtonGroup, Range1d, Label +from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3 +from bokeh.layouts import column, row + +import cProfile +import string + +class visualizeFD: + """ + Visualize FD Dimension Reduction using UMAP and DBSCAN + """ + def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, userGroupings, + skipSize, umap_n_neighbors, umap_random_state, hdbscan_min_samples, hdbscan_min_cluster_size, + optics_min_samples, optics_xi, optics_min_cluster_size, outlierQuantile): + self.inputFile = inputFile + self.outputFile = outputFile + output_file(filename=outputFile, title="Static HTML file") + self.viewResults = None + self.numImgsToUse = numImgsToUse + self.nprocs = nprocs + self.includeABOD = includeABOD + self.userGroupings = userGroupings + self.skipSize = skipSize + self.umap_n_neighbors = umap_n_neighbors + self.umap_random_state = umap_random_state + self.hdbscan_min_samples=hdbscan_min_samples + self.hdbscan_min_cluster_size=hdbscan_min_cluster_size + self.optics_min_samples=optics_min_samples + self.optics_xi = optics_xi + self.optics_min_cluster_size = optics_min_cluster_size + self.outlierQuantile = outlierQuantile + + def embeddable_image(self, data): + img_data = np.uint8(cm.jet(data/max(data.flatten()))*255) +# image = Image.fromarray(img_data, mode='RGBA').resize((75, 75), Image.Resampling.BICUBIC) + image = Image.fromarray(img_data, mode='RGBA') + buffer = BytesIO() + image.save(buffer, format='png') + for_encoding = buffer.getvalue() + return 'data:image/png;base64,' + base64.b64encode(for_encoding).decode('utf-8') + + def random_unique_numbers_from_range(self, start, end, count): + all_numbers = list(range(start, end + 1)) + random.shuffle(all_numbers) + return all_numbers[:count] + + def compute_medoid(self, points): + return points[np.argmin(euclidean_distances(points).sum(axis=0))] + + def genMedoids(self, medoidLabels, clusterPoints): + dictMe = {} + for j in set(medoidLabels): + dictMe[j] = [] + for index, class_name in enumerate(medoidLabels): + dictMe[class_name].append((index, clusterPoints[index, 0], clusterPoints[index, 1])) + medoid_lst = [] + for k, v in dictMe.items(): + lst = [(x[1], x[2]) for x in v] + medoid_point = self.compute_medoid(lst) + for test_index, test_point in enumerate(lst): + if math.isclose(test_point[0],medoid_point[0]) and math.isclose(test_point[1], medoid_point[1]): + fin_ind = test_index + medoid_lst.append((k, v[fin_ind][0])) + return medoid_lst + + def relabel_to_closest_zero(self, labels): + unique_labels = sorted(set(labels)) + relabel_dict = {label: new_label for new_label, label in enumerate(unique_labels)} + relabeled = [relabel_dict[label] for label in labels] + return relabeled + + def regABOD(self, pts): + abofs = [] + for a in range(len(pts)): + test_list = [x for x in range(len(pts)) if x != a] + otherPts = [(d, e) for idx, d in enumerate(test_list) for e in test_list[idx + 1:]] + outlier_factors = [] + for b, c in otherPts: + apt = pts[a] + bpt = pts[b] + cpt = pts[c] + ab = bpt - apt + ac = cpt - apt + outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) + 
abofs.append(np.var(np.array(outlier_factors))) + return abofs + + def fastABOD(self, pts, nsamples): + nbrs = NearestNeighbors(n_neighbors=nsamples, algorithm='ball_tree').fit(pts) + k_inds = nbrs.kneighbors(pts)[1] + abofs = [] + count = 0 + for a in range(len(pts)): + test_list = k_inds[a][1:] + otherPts = [(d, e) for idx, d in enumerate(test_list) for e in test_list[idx + 1:]] + outlier_factors = [] + for (b, c) in otherPts: + apt = pts[a] + bpt = pts[b] + cpt = pts[c] + ab = bpt - apt + ac = cpt - apt + if math.isclose(np.linalg.norm(ab), 0.0) or math.isclose(np.linalg.norm(ac), 0.0): + count += 1 +# print("TOO CLOSE") + continue + outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) +# print("CURRENT POINT: ", pts[a], test_list, outlier_factors, np.var(np.array(outlier_factors))) + if(len(outlier_factors)==0): + abofs.append(np.inf) + else: + abofs.append(np.var(np.array(outlier_factors))) + return abofs + + def getOutliers(self, lst): +# lstCopy = lst.copy() +# lstCopy.sort() +# quart10 = lstCopy[len(lstCopy)//divBy] + + lstQuant = np.quantile(np.array(lst), self.outlierQuantile) +# print("AIDJWOIJDAOWIDJWAOIDJAWOIDWJA", lstQuant, lst) + outlierInds = [] + notOutlierInds = [] + for j in range(len(lst)): + if lst[j]>lstQuant: + outlierInds.append(j) + else: + notOutlierInds.append(j) +# print("OUTLIER INDS: ", outlierInds) +# print("NOT OUTLIER INDS: ", notOutlierInds) + return np.array(outlierInds), np.array(notOutlierInds) + + def genHist(self, vals, endClass): + totNum = endClass + 1 + countVals = Counter(vals) + hist = [0]*(totNum) + for val in set(countVals): + hist[val] = countVals[val] + maxval = max(countVals.values()) + return hist, maxval + + def genLeftRight(self, endClass): + return [*range(endClass+1)], [*range(1, endClass+2)] + + def genUMAP(self): + imgs = None + projections = None + for currRank in range(self.nprocs): + with h5py.File(self.inputFile+"_"+str(currRank)+".h5", 'r') as hf: + if imgs is None: + imgs = hf["SmallImages"][:] + projections = hf["ProjectedData"][:] + else: + imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) + projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) + + print("AOIDWJOIAWDJ", len(imgs), len(projections)) + + intensities = [] + for img in imgs: + intensities.append(np.sum(img.flatten())) + intensities = np.array(intensities) + + self.imgs = imgs[:self.numImgsToUse:self.skipSize] + self.projections = projections[:self.numImgsToUse:self.skipSize] + self.intensities = intensities[:self.numImgsToUse:self.skipSize] + + self.numImgsToUse = int(self.numImgsToUse/self.skipSize) + + if len(self.imgs)!= self.numImgsToUse: + raise TypeError("NUMBER OF IMAGES REQUESTED ({}) EXCEEDS NUMBER OF DATA POINTS PROVIDED ({})".format(len(self.imgs), self.numImgsToUse)) + + self.clusterable_embedding = umap.UMAP( + n_neighbors=self.umap_n_neighbors, + random_state=self.umap_random_state, + n_components=2, +# min_dist=0.25, + min_dist=0.1, + ).fit_transform(self.projections) + + self.labels = hdbscan.HDBSCAN( + min_samples = self.hdbscan_min_samples, + min_cluster_size = self.hdbscan_min_cluster_size + ).fit_predict(self.clusterable_embedding) + exclusionList = np.array([]) + self.clustered = np.isin(self.labels, exclusionList, invert=True) + + self.opticsClust = OPTICS(min_samples=self.optics_min_samples, xi=self.optics_xi, min_cluster_size=self.optics_min_cluster_size) + self.opticsClust.fit(self.clusterable_embedding) + self.opticsLabels = cluster_optics_dbscan( + 
reachability=self.opticsClust.reachability_, + core_distances=self.opticsClust.core_distances_, + ordering=self.opticsClust.ordering_, + eps=2.5, + ) +# self.opticsLabels = self.opticsClust.labels_ + + self.experData_df = pd.DataFrame({'x':self.clusterable_embedding[self.clustered, 0],'y':self.clusterable_embedding[self.clustered, 1]}) + self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) + self.experData_df['imgind'] = np.arange(self.numImgsToUse)*self.skipSize + + def genABOD(self): + if self.includeABOD: + abod = self.fastABOD(self.projections, 10) + outliers, notOutliers = self.getOutliers(abod) + else: + outliers = [] + notOutliers = [] + outlierLabels = [] + for j in range(self.numImgsToUse): + if j in outliers: + outlierLabels.append(str(6)) + else: + outlierLabels.append(str(0)) + self.experData_df['anomDet'] = outlierLabels + self.experData_df['anom_backgroundColor'] = [Category20[20][int(x)] for x in outlierLabels] + + def setUserGroupings(self, userGroupings): + """ + Set User Grouping. An adjustment is made at the beginning of this function, + whereby 1 is added to each label. This is because internally, the clusters are stored + starting at -1 rather than 0. + """ + self.userGroupings = [[x-1 for x in grouping] for grouping in userGroupings] + + def genLabels(self): + newLabels = [] + for j in self.labels[self.clustered]: + doneChecking = False + for grouping in self.userGroupings: + if j in grouping and not doneChecking: + newLabels.append(min(grouping)) + doneChecking=True + if not doneChecking: + newLabels.append(j) + newLabels = list(np.array(newLabels) + 1) + self.newLabels = np.array(self.relabel_to_closest_zero(newLabels)) + self.experData_df['cluster'] = [str(x) for x in self.newLabels[self.clustered]] + self.experData_df['ptColor'] = [x for x in self.experData_df['cluster']] + self.experData_df['dbscan_backgroundColor'] = [Category20[20][x] for x in self.newLabels] + self.experData_df['backgroundColor'] = [Category20[20][x] for x in self.newLabels] + medoid_lst = self.genMedoids(self.newLabels, self.clusterable_embedding) + self.medoidInds = [x[1] for x in medoid_lst] + medoidBold = [] + for ind in range(self.numImgsToUse): + if ind in self.medoidInds: + medoidBold.append(12) + else: + medoidBold.append(4) + self.experData_df['medoidBold'] = medoidBold + + opticsNewLabels = [] + for j in self.opticsLabels[self.clustered]: + doneChecking = False + for grouping in self.userGroupings: + if j in grouping and not doneChecking: + opticsNewLabels.append(min(grouping)) + doneChecking=True + if not doneChecking: + opticsNewLabels.append(j) + opticsNewLabels = list(np.array(opticsNewLabels) + 1) + self.opticsNewLabels = np.array(self.relabel_to_closest_zero(opticsNewLabels)) +# self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels[self.opticsClust.ordering_]] + self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels] + + def genHTML(self): + datasource = ColumnDataSource(self.experData_df) + color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) + plot_figure = figure( + title='UMAP projection with DBSCAN clustering of the LCLS dataset', + tools=('pan, wheel_zoom, reset'), + width = 2000, height = 600 + ) + plot_figure.add_tools(HoverTool(tooltips=""" +
+                <div>
+                    <div>
+                        <img src='@image'/>
+                    </div>
+                    <div>
+                        <span>Cluster</span>
+                        <span>@cluster</span>
+                    </div>
+                    <div>
+                        <span>Image</span>
+                        <span>@imgind</span>
+                    </div>
+                </div>
+ """)) + plot_figure.circle( + 'x', + 'y', + source=datasource, + color=dict(field='ptColor', transform=color_mapping), + line_alpha=0.6, + fill_alpha=0.6, + size='medoidBold', + legend_field='cluster' + ) + plot_figure.sizing_mode = 'scale_both' + plot_figure.legend.location = "bottom_right" + plot_figure.legend.title = "Clusters" + + vals = [x for x in self.newLabels] + trueSource = ColumnDataSource(data=dict(vals = vals)) + hist, maxCount = self.genHist(vals, max(vals)) + left, right = self.genLeftRight(max(vals)) + histsource = ColumnDataSource(data=dict(hist=hist, left=left, right=right)) + p = figure(width=2000, height=450, toolbar_location=None, + title="Histogram Testing") + p.quad(source=histsource, top='hist', bottom=0, left='left', right='right', + fill_color='skyblue', line_color="white") + p.y_range = Range1d(0, maxCount) + p.x_range = Range1d(0, max(vals)+1) + p.xaxis.axis_label = "Cluster Label" + p.yaxis.axis_label = "Count" + + indexCDS = ColumnDataSource(dict( + index=[*range(0, self.numImgsToUse, 2)] + ) + ) + cols = RangeSlider(title="ET", + start=0, + end=self.numImgsToUse, + value=(0, self.numImgsToUse-1), + step=1, sizing_mode="stretch_width") + callback = CustomJS(args=dict(cols=cols, trueSource = trueSource, + histsource = histsource, datasource=datasource, indexCDS=indexCDS), code=""" + function countNumbersAtIndices(numbers, startInd, endInd, smallestVal, largestVal) { + let counts = new Array(largestVal-smallestVal); for (let i=0; i= index[index.length - 1]) { +// cb_obj.label = '► Play'; + slider.value = [0, slider_val1-slider_val0]; +// cb_obj.active = false; +// clearInterval(looop); + } + else if(slider_val1 !== index[index.length - 1]){ + slider.value = [index.filter((item) => item > slider_val0)[0], index.filter((item) => item > slider_val1)[0]]; + } + else { + clearInterval(looop); + } + } + if(cb_obj.active == false){ + cb_obj.label = '► Play'; + clearInterval(looop); + } + else { + cb_obj.label = '❚❚ Pause'; + var looop = setInterval(check_and_iterate, 0.1, indexCDS.data['index']); + }; + """) + toggl.js_on_change('active',toggl_js) + + reachabilityDiag = figure( + title='OPTICS Reachability Diag', + tools=('pan, wheel_zoom, reset'), + width = 2000, height = 400 + ) + space = np.arange(self.numImgsToUse) + reachability = self.opticsClust.reachability_[self.opticsClust.ordering_] +# reachability = self.opticsClust.reachability_ + opticsData_df = pd.DataFrame({'x':space,'y':reachability}) + opticsData_df['clusterForScatterPlot'] = [str(x) for x in self.opticsNewLabels] + opticsData_df['cluster'] = [str(x) for x in self.opticsNewLabels[self.opticsClust.ordering_]] + opticsData_df['ptColor'] = [x for x in opticsData_df['cluster']] + color_mapping2 = CategoricalColorMapper(factors=[str(x) for x in list(set(self.opticsNewLabels))], + palette=Category20[20]) + opticssource = ColumnDataSource(opticsData_df) + reachabilityDiag.circle( + 'x', + 'y', + source=opticssource, + color=dict(field='ptColor', transform=color_mapping2), + line_alpha=0.6, + fill_alpha=0.6, + legend_field='cluster' + ) + reachabilityDiag.line([0, len(opticsData_df['ptColor'])], [2, 2], line_width=2, color="black", line_dash="dashed") + reachabilityDiag.y_range = Range1d(-1, 10) + + LABELS = ["DBSCAN Clustering", "OPTICS Clustering", "Anomaly Detection"] + radio_button_group = RadioButtonGroup(labels=LABELS, active=0) + radioGroup_js = CustomJS(args=dict(datasource=datasource, opticssource=opticssource), code=""" + const x = datasource.data.x + const y = datasource.data.y + const image = 
datasource.data.image + const medoidBold = datasource.data.medoidBold + const cluster = datasource.data.cluster + const anomDet = datasource.data.anomDet + const imgind = datasource.data.imgind + const dbscan_backgroundColor = datasource.data.dbscan_backgroundColor + const anom_backgroundColor = datasource.data.anom_backgroundColor + const optics_backgroundColor = datasource.data.optics_backgroundColor + + const opticsClust = opticssource.data.clusterForScatterPlot + + let ptColor = null + let backgroundColor = null + + if (cb_obj.active==0){ + ptColor = cluster + backgroundColor = dbscan_backgroundColor + } + else if (cb_obj.active==1){ + ptColor = opticsClust + backgroundColor = optics_backgroundColor + } + else{ + ptColor = anomDet + backgroundColor = anom_backgroundColor + } + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor, dbscan_backgroundColor, anom_backgroundColor, optics_backgroundColor} + """) + radio_button_group.js_on_change("active", radioGroup_js) + + self.viewResults = column(plot_figure, p, imgsPlot, row(cols, toggl, radio_button_group), reachabilityDiag) + + def fullVisualize(self): + self.genUMAP() + self.genABOD() + self.genLabels() + self.genHTML() + + def updateLabels(self): + self.genLabels() + self.genHTML() + + def userSave(self): + save(self.viewResults) + + def userShow(self): + from IPython.display import display, HTML + display(HTML("")) + display(HTML("")) + display(HTML("")) + display(HTML("")) + from bokeh.io import output_notebook + output_notebook() + show(self.viewResults) From a64ccb0ae011eeea1423d3dedd4cbfd51b027087 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 3 Oct 2023 15:08:08 -0700 Subject: [PATCH 42/57] Reverted separation changes. --- btx/processing/freqdir.py | 11 +- btx/processing/vizfreq.py | 544 -------------------------------------- 2 files changed, 4 insertions(+), 551 deletions(-) delete mode 100644 btx/processing/vizfreq.py diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 2ccd0278a..d986adfed 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -39,8 +39,8 @@ from datetime import datetime -#import umap -#import hdbscan +import umap +import hdbscan from sklearn.cluster import OPTICS, cluster_optics_dbscan from matplotlib import colors @@ -815,7 +815,6 @@ def update(self, vec): self.sketch.push(vec, pi, wi) -''' class visualizeFD: """ Visualize FD Dimension Reduction using UMAP and DBSCAN @@ -1313,7 +1312,6 @@ def userShow(self): from bokeh.io import output_notebook output_notebook() show(self.viewResults) -''' class WrapperFullFD: """ @@ -1563,10 +1561,10 @@ def addThumbnailsToProjectH5(self): f1.create_dataset("SmallImages", data=self.fullThumbnailData) f1.close() self.comm.barrier() -''' + def visualizeMe(self): st = time.perf_counter() - self.addThumbnailsToProjectH5() +# self.addThumbnailsToProjectH5() #UMAP STEP ########################################################################################## if self.rank==0: @@ -1596,7 +1594,6 @@ def visualizeMe(self): et = time.perf_counter() print("UMAP HTML Generation Processing time: {}".format(et - st)) #print("TOTAL PROCESING TIME: {}".format(et - stfull)) -''' class FD_ImageProcessing: def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile): diff --git a/btx/processing/vizfreq.py b/btx/processing/vizfreq.py deleted file mode 100644 index 14340cc3f..000000000 --- a/btx/processing/vizfreq.py +++ /dev/null @@ -1,544 
+0,0 @@ -import sys -sys.path.append("/sdf/home/w/winnicki/btx/") -from btx.processing.dimRed import * - -import os, csv, argparse -import math -import time -import random -from collections import Counter -import h5py - -import numpy as np -from numpy import zeros, sqrt, dot, diag -from numpy.linalg import svd, LinAlgError -from scipy.linalg import svd as scipy_svd -import pandas as pd -from sklearn.neighbors import NearestNeighbors -from sklearn.metrics.pairwise import euclidean_distances -import heapq - -from mpi4py import MPI - -from matplotlib import pyplot as plt -from matplotlib import colors - -from PIL import Image -from io import BytesIO -import base64 - -from datetime import datetime - -import umap -import hdbscan -from sklearn.cluster import OPTICS, cluster_optics_dbscan - -from matplotlib import colors -import matplotlib as mpl -from matplotlib import cm - -from bokeh.plotting import figure, show, output_file, save -from bokeh.models import HoverTool, CategoricalColorMapper, LinearColorMapper, ColumnDataSource, CustomJS, Slider, RangeSlider, Toggle, RadioButtonGroup, Range1d, Label -from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3 -from bokeh.layouts import column, row - -import cProfile -import string - -class visualizeFD: - """ - Visualize FD Dimension Reduction using UMAP and DBSCAN - """ - def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, userGroupings, - skipSize, umap_n_neighbors, umap_random_state, hdbscan_min_samples, hdbscan_min_cluster_size, - optics_min_samples, optics_xi, optics_min_cluster_size, outlierQuantile): - self.inputFile = inputFile - self.outputFile = outputFile - output_file(filename=outputFile, title="Static HTML file") - self.viewResults = None - self.numImgsToUse = numImgsToUse - self.nprocs = nprocs - self.includeABOD = includeABOD - self.userGroupings = userGroupings - self.skipSize = skipSize - self.umap_n_neighbors = umap_n_neighbors - self.umap_random_state = umap_random_state - self.hdbscan_min_samples=hdbscan_min_samples - self.hdbscan_min_cluster_size=hdbscan_min_cluster_size - self.optics_min_samples=optics_min_samples - self.optics_xi = optics_xi - self.optics_min_cluster_size = optics_min_cluster_size - self.outlierQuantile = outlierQuantile - - def embeddable_image(self, data): - img_data = np.uint8(cm.jet(data/max(data.flatten()))*255) -# image = Image.fromarray(img_data, mode='RGBA').resize((75, 75), Image.Resampling.BICUBIC) - image = Image.fromarray(img_data, mode='RGBA') - buffer = BytesIO() - image.save(buffer, format='png') - for_encoding = buffer.getvalue() - return 'data:image/png;base64,' + base64.b64encode(for_encoding).decode('utf-8') - - def random_unique_numbers_from_range(self, start, end, count): - all_numbers = list(range(start, end + 1)) - random.shuffle(all_numbers) - return all_numbers[:count] - - def compute_medoid(self, points): - return points[np.argmin(euclidean_distances(points).sum(axis=0))] - - def genMedoids(self, medoidLabels, clusterPoints): - dictMe = {} - for j in set(medoidLabels): - dictMe[j] = [] - for index, class_name in enumerate(medoidLabels): - dictMe[class_name].append((index, clusterPoints[index, 0], clusterPoints[index, 1])) - medoid_lst = [] - for k, v in dictMe.items(): - lst = [(x[1], x[2]) for x in v] - medoid_point = self.compute_medoid(lst) - for test_index, test_point in enumerate(lst): - if math.isclose(test_point[0],medoid_point[0]) and math.isclose(test_point[1], medoid_point[1]): - fin_ind = test_index - medoid_lst.append((k, 
v[fin_ind][0])) - return medoid_lst - - def relabel_to_closest_zero(self, labels): - unique_labels = sorted(set(labels)) - relabel_dict = {label: new_label for new_label, label in enumerate(unique_labels)} - relabeled = [relabel_dict[label] for label in labels] - return relabeled - - def regABOD(self, pts): - abofs = [] - for a in range(len(pts)): - test_list = [x for x in range(len(pts)) if x != a] - otherPts = [(d, e) for idx, d in enumerate(test_list) for e in test_list[idx + 1:]] - outlier_factors = [] - for b, c in otherPts: - apt = pts[a] - bpt = pts[b] - cpt = pts[c] - ab = bpt - apt - ac = cpt - apt - outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) - abofs.append(np.var(np.array(outlier_factors))) - return abofs - - def fastABOD(self, pts, nsamples): - nbrs = NearestNeighbors(n_neighbors=nsamples, algorithm='ball_tree').fit(pts) - k_inds = nbrs.kneighbors(pts)[1] - abofs = [] - count = 0 - for a in range(len(pts)): - test_list = k_inds[a][1:] - otherPts = [(d, e) for idx, d in enumerate(test_list) for e in test_list[idx + 1:]] - outlier_factors = [] - for (b, c) in otherPts: - apt = pts[a] - bpt = pts[b] - cpt = pts[c] - ab = bpt - apt - ac = cpt - apt - if math.isclose(np.linalg.norm(ab), 0.0) or math.isclose(np.linalg.norm(ac), 0.0): - count += 1 -# print("TOO CLOSE") - continue - outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) -# print("CURRENT POINT: ", pts[a], test_list, outlier_factors, np.var(np.array(outlier_factors))) - if(len(outlier_factors)==0): - abofs.append(np.inf) - else: - abofs.append(np.var(np.array(outlier_factors))) - return abofs - - def getOutliers(self, lst): -# lstCopy = lst.copy() -# lstCopy.sort() -# quart10 = lstCopy[len(lstCopy)//divBy] - - lstQuant = np.quantile(np.array(lst), self.outlierQuantile) -# print("AIDJWOIJDAOWIDJWAOIDJAWOIDWJA", lstQuant, lst) - outlierInds = [] - notOutlierInds = [] - for j in range(len(lst)): - if lst[j]>lstQuant: - outlierInds.append(j) - else: - notOutlierInds.append(j) -# print("OUTLIER INDS: ", outlierInds) -# print("NOT OUTLIER INDS: ", notOutlierInds) - return np.array(outlierInds), np.array(notOutlierInds) - - def genHist(self, vals, endClass): - totNum = endClass + 1 - countVals = Counter(vals) - hist = [0]*(totNum) - for val in set(countVals): - hist[val] = countVals[val] - maxval = max(countVals.values()) - return hist, maxval - - def genLeftRight(self, endClass): - return [*range(endClass+1)], [*range(1, endClass+2)] - - def genUMAP(self): - imgs = None - projections = None - for currRank in range(self.nprocs): - with h5py.File(self.inputFile+"_"+str(currRank)+".h5", 'r') as hf: - if imgs is None: - imgs = hf["SmallImages"][:] - projections = hf["ProjectedData"][:] - else: - imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) - projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) - - print("AOIDWJOIAWDJ", len(imgs), len(projections)) - - intensities = [] - for img in imgs: - intensities.append(np.sum(img.flatten())) - intensities = np.array(intensities) - - self.imgs = imgs[:self.numImgsToUse:self.skipSize] - self.projections = projections[:self.numImgsToUse:self.skipSize] - self.intensities = intensities[:self.numImgsToUse:self.skipSize] - - self.numImgsToUse = int(self.numImgsToUse/self.skipSize) - - if len(self.imgs)!= self.numImgsToUse: - raise TypeError("NUMBER OF IMAGES REQUESTED ({}) EXCEEDS NUMBER OF DATA POINTS PROVIDED ({})".format(len(self.imgs), self.numImgsToUse)) - - 
self.clusterable_embedding = umap.UMAP( - n_neighbors=self.umap_n_neighbors, - random_state=self.umap_random_state, - n_components=2, -# min_dist=0.25, - min_dist=0.1, - ).fit_transform(self.projections) - - self.labels = hdbscan.HDBSCAN( - min_samples = self.hdbscan_min_samples, - min_cluster_size = self.hdbscan_min_cluster_size - ).fit_predict(self.clusterable_embedding) - exclusionList = np.array([]) - self.clustered = np.isin(self.labels, exclusionList, invert=True) - - self.opticsClust = OPTICS(min_samples=self.optics_min_samples, xi=self.optics_xi, min_cluster_size=self.optics_min_cluster_size) - self.opticsClust.fit(self.clusterable_embedding) - self.opticsLabels = cluster_optics_dbscan( - reachability=self.opticsClust.reachability_, - core_distances=self.opticsClust.core_distances_, - ordering=self.opticsClust.ordering_, - eps=2.5, - ) -# self.opticsLabels = self.opticsClust.labels_ - - self.experData_df = pd.DataFrame({'x':self.clusterable_embedding[self.clustered, 0],'y':self.clusterable_embedding[self.clustered, 1]}) - self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) - self.experData_df['imgind'] = np.arange(self.numImgsToUse)*self.skipSize - - def genABOD(self): - if self.includeABOD: - abod = self.fastABOD(self.projections, 10) - outliers, notOutliers = self.getOutliers(abod) - else: - outliers = [] - notOutliers = [] - outlierLabels = [] - for j in range(self.numImgsToUse): - if j in outliers: - outlierLabels.append(str(6)) - else: - outlierLabels.append(str(0)) - self.experData_df['anomDet'] = outlierLabels - self.experData_df['anom_backgroundColor'] = [Category20[20][int(x)] for x in outlierLabels] - - def setUserGroupings(self, userGroupings): - """ - Set User Grouping. An adjustment is made at the beginning of this function, - whereby 1 is added to each label. This is because internally, the clusters are stored - starting at -1 rather than 0. 
- """ - self.userGroupings = [[x-1 for x in grouping] for grouping in userGroupings] - - def genLabels(self): - newLabels = [] - for j in self.labels[self.clustered]: - doneChecking = False - for grouping in self.userGroupings: - if j in grouping and not doneChecking: - newLabels.append(min(grouping)) - doneChecking=True - if not doneChecking: - newLabels.append(j) - newLabels = list(np.array(newLabels) + 1) - self.newLabels = np.array(self.relabel_to_closest_zero(newLabels)) - self.experData_df['cluster'] = [str(x) for x in self.newLabels[self.clustered]] - self.experData_df['ptColor'] = [x for x in self.experData_df['cluster']] - self.experData_df['dbscan_backgroundColor'] = [Category20[20][x] for x in self.newLabels] - self.experData_df['backgroundColor'] = [Category20[20][x] for x in self.newLabels] - medoid_lst = self.genMedoids(self.newLabels, self.clusterable_embedding) - self.medoidInds = [x[1] for x in medoid_lst] - medoidBold = [] - for ind in range(self.numImgsToUse): - if ind in self.medoidInds: - medoidBold.append(12) - else: - medoidBold.append(4) - self.experData_df['medoidBold'] = medoidBold - - opticsNewLabels = [] - for j in self.opticsLabels[self.clustered]: - doneChecking = False - for grouping in self.userGroupings: - if j in grouping and not doneChecking: - opticsNewLabels.append(min(grouping)) - doneChecking=True - if not doneChecking: - opticsNewLabels.append(j) - opticsNewLabels = list(np.array(opticsNewLabels) + 1) - self.opticsNewLabels = np.array(self.relabel_to_closest_zero(opticsNewLabels)) -# self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels[self.opticsClust.ordering_]] - self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels] - - def genHTML(self): - datasource = ColumnDataSource(self.experData_df) - color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) - plot_figure = figure( - title='UMAP projection with DBSCAN clustering of the LCLS dataset', - tools=('pan, wheel_zoom, reset'), - width = 2000, height = 600 - ) - plot_figure.add_tools(HoverTool(tooltips=""" -
-                <div>
-                    <div>
-                        <img src='@image'/>
-                    </div>
-                    <div>
-                        <span>Cluster</span>
-                        <span>@cluster</span>
-                    </div>
-                    <div>
-                        <span>Image</span>
-                        <span>@imgind</span>
-                    </div>
-                </div>
- """)) - plot_figure.circle( - 'x', - 'y', - source=datasource, - color=dict(field='ptColor', transform=color_mapping), - line_alpha=0.6, - fill_alpha=0.6, - size='medoidBold', - legend_field='cluster' - ) - plot_figure.sizing_mode = 'scale_both' - plot_figure.legend.location = "bottom_right" - plot_figure.legend.title = "Clusters" - - vals = [x for x in self.newLabels] - trueSource = ColumnDataSource(data=dict(vals = vals)) - hist, maxCount = self.genHist(vals, max(vals)) - left, right = self.genLeftRight(max(vals)) - histsource = ColumnDataSource(data=dict(hist=hist, left=left, right=right)) - p = figure(width=2000, height=450, toolbar_location=None, - title="Histogram Testing") - p.quad(source=histsource, top='hist', bottom=0, left='left', right='right', - fill_color='skyblue', line_color="white") - p.y_range = Range1d(0, maxCount) - p.x_range = Range1d(0, max(vals)+1) - p.xaxis.axis_label = "Cluster Label" - p.yaxis.axis_label = "Count" - - indexCDS = ColumnDataSource(dict( - index=[*range(0, self.numImgsToUse, 2)] - ) - ) - cols = RangeSlider(title="ET", - start=0, - end=self.numImgsToUse, - value=(0, self.numImgsToUse-1), - step=1, sizing_mode="stretch_width") - callback = CustomJS(args=dict(cols=cols, trueSource = trueSource, - histsource = histsource, datasource=datasource, indexCDS=indexCDS), code=""" - function countNumbersAtIndices(numbers, startInd, endInd, smallestVal, largestVal) { - let counts = new Array(largestVal-smallestVal); for (let i=0; i= index[index.length - 1]) { -// cb_obj.label = '► Play'; - slider.value = [0, slider_val1-slider_val0]; -// cb_obj.active = false; -// clearInterval(looop); - } - else if(slider_val1 !== index[index.length - 1]){ - slider.value = [index.filter((item) => item > slider_val0)[0], index.filter((item) => item > slider_val1)[0]]; - } - else { - clearInterval(looop); - } - } - if(cb_obj.active == false){ - cb_obj.label = '► Play'; - clearInterval(looop); - } - else { - cb_obj.label = '❚❚ Pause'; - var looop = setInterval(check_and_iterate, 0.1, indexCDS.data['index']); - }; - """) - toggl.js_on_change('active',toggl_js) - - reachabilityDiag = figure( - title='OPTICS Reachability Diag', - tools=('pan, wheel_zoom, reset'), - width = 2000, height = 400 - ) - space = np.arange(self.numImgsToUse) - reachability = self.opticsClust.reachability_[self.opticsClust.ordering_] -# reachability = self.opticsClust.reachability_ - opticsData_df = pd.DataFrame({'x':space,'y':reachability}) - opticsData_df['clusterForScatterPlot'] = [str(x) for x in self.opticsNewLabels] - opticsData_df['cluster'] = [str(x) for x in self.opticsNewLabels[self.opticsClust.ordering_]] - opticsData_df['ptColor'] = [x for x in opticsData_df['cluster']] - color_mapping2 = CategoricalColorMapper(factors=[str(x) for x in list(set(self.opticsNewLabels))], - palette=Category20[20]) - opticssource = ColumnDataSource(opticsData_df) - reachabilityDiag.circle( - 'x', - 'y', - source=opticssource, - color=dict(field='ptColor', transform=color_mapping2), - line_alpha=0.6, - fill_alpha=0.6, - legend_field='cluster' - ) - reachabilityDiag.line([0, len(opticsData_df['ptColor'])], [2, 2], line_width=2, color="black", line_dash="dashed") - reachabilityDiag.y_range = Range1d(-1, 10) - - LABELS = ["DBSCAN Clustering", "OPTICS Clustering", "Anomaly Detection"] - radio_button_group = RadioButtonGroup(labels=LABELS, active=0) - radioGroup_js = CustomJS(args=dict(datasource=datasource, opticssource=opticssource), code=""" - const x = datasource.data.x - const y = datasource.data.y - const image = 
datasource.data.image - const medoidBold = datasource.data.medoidBold - const cluster = datasource.data.cluster - const anomDet = datasource.data.anomDet - const imgind = datasource.data.imgind - const dbscan_backgroundColor = datasource.data.dbscan_backgroundColor - const anom_backgroundColor = datasource.data.anom_backgroundColor - const optics_backgroundColor = datasource.data.optics_backgroundColor - - const opticsClust = opticssource.data.clusterForScatterPlot - - let ptColor = null - let backgroundColor = null - - if (cb_obj.active==0){ - ptColor = cluster - backgroundColor = dbscan_backgroundColor - } - else if (cb_obj.active==1){ - ptColor = opticsClust - backgroundColor = optics_backgroundColor - } - else{ - ptColor = anomDet - backgroundColor = anom_backgroundColor - } - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor, dbscan_backgroundColor, anom_backgroundColor, optics_backgroundColor} - """) - radio_button_group.js_on_change("active", radioGroup_js) - - self.viewResults = column(plot_figure, p, imgsPlot, row(cols, toggl, radio_button_group), reachabilityDiag) - - def fullVisualize(self): - self.genUMAP() - self.genABOD() - self.genLabels() - self.genHTML() - - def updateLabels(self): - self.genLabels() - self.genHTML() - - def userSave(self): - save(self.viewResults) - - def userShow(self): - from IPython.display import display, HTML - display(HTML("")) - display(HTML("")) - display(HTML("")) - display(HTML("")) - from bokeh.io import output_notebook - output_notebook() - show(self.viewResults) From 260fe9ac8431eecb6102e068b55a48e127551439 Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Tue, 3 Oct 2023 15:12:36 -0700 Subject: [PATCH 43/57] Removing UMAP and HDBSCAN dependency from freqdir module. Also made draw_sketch task able to run on more than one core. 
--- btx/processing/freqdir.py | 7 +++++-- scripts/elog_submit.sh | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index d986adfed..e0b2c2911 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -39,8 +39,8 @@ from datetime import datetime -import umap -import hdbscan +#import umap +#import hdbscan from sklearn.cluster import OPTICS, cluster_optics_dbscan from matplotlib import colors @@ -953,6 +953,9 @@ def genLeftRight(self, endClass): return [*range(endClass+1)], [*range(1, endClass+2)] def genUMAP(self): + import umap + import hdbscan + imgs = None projections = None for currRank in range(self.nprocs): diff --git a/scripts/elog_submit.sh b/scripts/elog_submit.sh index 5103fd33b..d19d45a40 100755 --- a/scripts/elog_submit.sh +++ b/scripts/elog_submit.sh @@ -131,7 +131,8 @@ CORES=${CORES:=1} if [ ${TASK} != 'find_peaks' ] &&\ [ ${TASK} != 'stream_analysis' ] &&\ [ ${TASK} != 'determine_cell' ] &&\ - [ ${TASK} != 'opt_geom' ]; then + [ ${TASK} != 'opt_geom' ] &&\ + [ ${TASK} != 'draw_sketch' ]; then CORES=1 fi From 0bbad2452aa6a69a4b634d53a13ee67264ee0936 Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Tue, 3 Oct 2023 15:41:08 -0700 Subject: [PATCH 44/57] attempt at only having psana dependency where needed in freqdir --- btx/processing/freqdir.py | 74 +++++++++++++-------------------------- scripts/tasks.py | 31 +++++++++------- 2 files changed, 42 insertions(+), 63 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index e0b2c2911..40f388b88 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -23,15 +23,15 @@ from matplotlib import pyplot as plt from matplotlib import colors -from btx.misc.shortcuts import TaskTimer - -from btx.interfaces.ipsana import ( - PsanaInterface, - bin_data, - bin_pixel_index_map, - retrieve_pixel_index_map, - assemble_image_stack_batch, -) +# from btx.misc.shortcuts import TaskTimer +# +# from btx.interfaces.ipsana import ( +# PsanaInterface, +# bin_data, +# bin_pixel_index_map, +# retrieve_pixel_index_map, +# assemble_image_stack_batch, +# ) from PIL import Image from io import BytesIO @@ -1320,6 +1320,7 @@ class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. 
""" + from btx.interfaces.ipsana import PsanaInterface def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, usePSI=True): self.start_offset = start_offset self.num_imgs = num_imgs @@ -1352,13 +1353,13 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.usePSI = usePSI if usePSI: - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi = self.PsanaInterface(exp=exp, run=run, det_type=det_type) self.psi.counter = self.start_offset + self.num_imgs*self.rank//self.size else: self.psi = None if self.rank==0: - self.currRun = datetime.now().strftime("%y%m%d%H%M%S") + self.currRun = run #datetime.now().strftime("%y%m%d%H%M%S") else: self.currRun = None self.currRun = self.comm.bcast(self.currRun, root=0) @@ -1565,39 +1566,6 @@ def addThumbnailsToProjectH5(self): f1.close() self.comm.barrier() - def visualizeMe(self): - st = time.perf_counter() -# self.addThumbnailsToProjectH5() - #UMAP STEP - ########################################################################################## - if self.rank==0: - -# print("here 1") - - skipSize = 1 - numImgsToUse = int(self.num_imgs/skipSize) - visMe = visualizeFD(inputFile=self.writeToHere+"{}_ProjectedData".format(self.currRun), - outputFile="./UMAPVis_{}.html".format(self.currRun), - numImgsToUse=self.num_imgs, - nprocs=self.size, - userGroupings=[], - includeABOD=True, - skipSize = skipSize, -# umap_n_neighbors=numImgsToUse//40, - umap_n_neighbors=numImgsToUse//4000, - umap_random_state=42, - hdbscan_min_samples=int(numImgsToUse*0.75//40), - hdbscan_min_cluster_size=int(numImgsToUse//40), - optics_min_samples=150, optics_xi = 0.05, optics_min_cluster_size = 0.05, - outlierQuantile=0.3) -# print("here 2") - visMe.fullVisualize() -# print("here 3") - visMe.userSave() - et = time.perf_counter() - print("UMAP HTML Generation Processing time: {}".format(et - st)) - #print("TOTAL PROCESING TIME: {}".format(et - stfull)) - class FD_ImageProcessing: def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile): self.threshold = threshold @@ -1656,6 +1624,12 @@ def normalizeIntensityFunc(self, img, currIntensity): class DataRetriever: + from btx.interfaces.ipsana import ( + PsanaInterface, + bin_data, + retrieve_pixel_index_map, + assemble_image_stack_batch, + ) def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, thumbnailHeight, thumbnailWidth): self.exp = exp self.det_type = det_type @@ -1665,7 +1639,7 @@ def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, t self.thumbnailHeight = thumbnailHeight self.thumbnailWidth = thumbnailWidth - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type, no_cmod=True) + self.psi = self.PsanaInterface(exp=exp, run=run, det_type=det_type, no_cmod=True) self.imageProcessor = imageProcessor @@ -1678,12 +1652,12 @@ def assembleImgsToSave(self, imgs): imgs: ndarray images to downsample """ - pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) + pixel_index_map = self.retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) saveMe = [] for img in imgs: imgRe = np.reshape(img, self.psi.det.shape()) - imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) + imgRe = 
self.assemble_image_stack_batch(imgRe, pixel_index_map) saveMe.append(np.array(Image.fromarray(imgRe).resize((self.thumbnailHeight, self.thumbnailWidth)))) return np.array(saveMe) @@ -1745,7 +1719,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): if self.downsample: # print("Downsampling images") - imgs = bin_data(imgs, self.bin_factor) + imgs = self.bin_data(imgs, self.bin_factor) # print("Flattening images") num_valid_imgs, p, x, y = imgs.shape img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T @@ -1786,7 +1760,6 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): nimg_batch = np.array(nimg_batch).T # print("hstacking") if fullimgs is None: - fullimgs = nimg_batch else: fullimgs = np.hstack((fullimgs, nimg_batch)) @@ -1799,6 +1772,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): class SinglePanelDataRetriever: + from btx.interfaces.ipsana import PsanaInterface def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, thumbnailHeight, thumbnailWidth): self.exp = exp self.det_type = det_type @@ -1806,7 +1780,7 @@ def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, t self.thumbnailHeight = thumbnailHeight self.thumbnailWidth = thumbnailWidth - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi = self.PsanaInterface(exp=exp, run=run, det_type=det_type) self.imageProcessor = imageProcessor diff --git a/scripts/tasks.py b/scripts/tasks.py index 0603a2498..cc25ff692 100755 --- a/scripts/tasks.py +++ b/scripts/tasks.py @@ -539,17 +539,22 @@ def show_sketch(): """ Display Sketch. """ taskdir = os.path.join(setup.root_dir, 'sketch') os.makedirs(taskdir, exist_ok=True) - fd = WrapperFullFD(exp=setup.exp, run=setup.run, det_type=setup.det_type, - start_offset=task.start_offset, num_imgs=task.num_imgs, - writeToHere=task.writeToHere, grabImgSteps=task.grabImgSteps, - num_components=task.num_components, alpha=task.alpha, - rankAdapt=task.rankAdapt, rankAdaptMinError=task.rankAdaptMinError, - downsample=task.downsample, bin_factor=task.bin_factor, - threshold=task.threshold, eluThreshold=task.eluThreshold, - eluAlpha=task.eluAlpha, normalizeIntensity=task.normalizeIntensity, - noZeroIntensity=task.noZeroIntensity, minIntensity=task.minIntensity, - samplingFactor=task.samplingFactor, divBy=task.divBy, - thresholdQuantile=task.thresholdQuantile) - logger.info(f'Display Sketch for run {setup.run} of {setup.exp}...') - fd.visualizeMe() + visMe = visualizeFD(inputFile=taskdir + "{}_ProjectedData".format(setup.run), + outputFile="./UMAPVis_{}.html".format(setup.run), + numImgsToUse=task.num_imgs, + nprocs=task.nprocs, + userGroupings=[], + includeABOD=True, + skipSize=task.skip_size, + # umap_n_neighbors=numImgsToUse//40, + umap_n_neighbors=task.num_imgs_to_use // 4000, + umap_random_state=42, + hdbscan_min_samples=int(task.num_imgs_to_use * 0.75 // 40), + hdbscan_min_cluster_size=int(task.num_imgs_to_use // 40), + optics_min_samples=150, optics_xi=0.05, optics_min_cluster_size=0.05, + outlierQuantile=0.3) + # print("here 2") + visMe.fullVisualize() + # print("here 3") + visMe.userSave() logger.debug('Done!') From 704a76e6abeb7e2e8f34c4b63d66d0265a4786e0 Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Wed, 4 Oct 2023 10:37:55 -0700 Subject: [PATCH 45/57] Drafting FD tasks. 
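For reference, the new parse_input()/main() entry point turns freqdir.py into a standalone script that the show_sketch task assembles a command for and submits through JobScheduler. A minimal sketch of such an invocation; the experiment, run, detector, and output-directory values below are placeholders, and only the flags defined in parse_input() are assumed to exist:

    python btx/processing/freqdir.py -e mfxp23120 -r 20 -d epix10k2M -o /path/to/sketch/ \
        --num_imgs=10000 --nprocs=8 --skip_size=1 --num_imgs_to_use=10000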
--- btx/interfaces/ischeduler.py | 2 ++ btx/processing/freqdir.py | 40 ++++++++++++++++++++++++++++++++++++ scripts/tasks.py | 39 ++++++++++++++++++----------------- 3 files changed, 62 insertions(+), 19 deletions(-) diff --git a/btx/interfaces/ischeduler.py b/btx/interfaces/ischeduler.py index 3d771c8e3..505eaa18b 100644 --- a/btx/interfaces/ischeduler.py +++ b/btx/interfaces/ischeduler.py @@ -118,6 +118,8 @@ def _write_dependencies(self, dependencies): if "xgandalf" in dependencies: dep_paths += "export PATH=/reg/g/cfel/crystfel/indexers/xgandalf/include/:$PATH\n" dep_paths += "export PATH=/reg/g/cfel/crystfel/indexers/xgandalf/include/eigen3/Eigen/:$PATH" + if "fdviz" in dependencies: + dep_paths += f"conda activate /sdf/group/lcls/ds/tools/conda_envs/johnw-ana-4.0.48-py3" dep_paths += "\n" with open(self.jobfile, 'a') as jfile: diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 40f388b88..ab21c6279 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1893,3 +1893,43 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): return (fullimgs, fullthumbnails, imgsTracked) else: return (fullimgs, imgsTracked) + +def main(): + """ + Perform Frequent Direction Visualization. + """ + params = parse_input() + os.makedirs(os.path.join(params.outdir, "figs"), exist_ok=True) + visMe = visualizeFD(inputFile=params.outdir + f"{params.run:04}_ProjectedData", + outputFile=params.outdir + f"figs/UMAPVis_{params.run:04}.html", + numImgsToUse=params.num_imgs, + nprocs=params.nprocs, + userGroupings=[], + includeABOD=True, + skipSize=params.skip_size, + umap_n_neighbors=params.num_imgs_to_use // 4000, + umap_random_state=42, + hdbscan_min_samples=int(params.num_imgs_to_use * 0.75 // 40), + hdbscan_min_cluster_size=int(params.num_imgs_to_use // 40), + optics_min_samples=150, optics_xi=0.05, optics_min_cluster_size=0.05, + outlierQuantile=0.3) + visMe.fullVisualize() + visMe.userSave() +def parse_input(): + """ + Parse command line input. + """ + parser = argparse.ArgumentParser() + parser.add_argument('-e', '--exp', help='Experiment name', required=True, type=str) + parser.add_argument('-r', '--run', help='Run number', required=True, type=int) + parser.add_argument('-d', '--det_type', help='Detector name, e.g epix10k2M or jungfrau4M', required=True, type=str) + parser.add_argument('-o', '--outdir', help='Output directory for powders and plots', required=True, type=str) + parser.add_argument('--num_imgs', help='Number of images to process, -1 for full run', required=False, default=-1, type=int) + parser.add_argument('--nprocs', help='Number of cores used for upstream analysis', required=False, type=int) + parser.add_argument('--skip_size', help='Skip size', required=False, type=int) + parser.add_argument('--num_imgs_to_use', help="Number of images to use", required=False, type=int) + + return parser.parse_args() + +if __name__ == '__main__': + main() diff --git a/scripts/tasks.py b/scripts/tasks.py index cc25ff692..2f5c12328 100755 --- a/scripts/tasks.py +++ b/scripts/tasks.py @@ -533,28 +533,29 @@ def draw_sketch(config): logger.debug('Done!') def show_sketch(): - from btx.processing.freqdir import WrapperFullFD + from btx.interfaces.ischeduler import JobScheduler setup = config.setup task = config.show_sketch """ Display Sketch. 
""" taskdir = os.path.join(setup.root_dir, 'sketch') os.makedirs(taskdir, exist_ok=True) - visMe = visualizeFD(inputFile=taskdir + "{}_ProjectedData".format(setup.run), - outputFile="./UMAPVis_{}.html".format(setup.run), - numImgsToUse=task.num_imgs, - nprocs=task.nprocs, - userGroupings=[], - includeABOD=True, - skipSize=task.skip_size, - # umap_n_neighbors=numImgsToUse//40, - umap_n_neighbors=task.num_imgs_to_use // 4000, - umap_random_state=42, - hdbscan_min_samples=int(task.num_imgs_to_use * 0.75 // 40), - hdbscan_min_cluster_size=int(task.num_imgs_to_use // 40), - optics_min_samples=150, optics_xi=0.05, optics_min_cluster_size=0.05, - outlierQuantile=0.3) - # print("here 2") - visMe.fullVisualize() - # print("here 3") - visMe.userSave() + script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../btx/processing/freqdir.py") + command = f"python {script_path}" + command += f" -e {setup.exp} -r {setup.run} -d {setup.det_type} -o {taskdir}" + if task.get('num_imgs') is not None: + command += f" --num_imgs={task.num_imgs}" + if task.get('nprocs') is not None: + command += f" --nprocs={task.nprocs}" + if task.get('skip_size') is not None: + command += f" --skip_size={task.skip_size}" + if task.get('num_imgs_to_use') is not None: + command += f" --num_imgs_to_use={task.num_imgs_to_use}" + js = JobScheduler(os.path.join(".", f'fd_{setup.run:04}.sh'), + queue=setup.queue, + ncores=task.ncores, + jobname=f'fd_{setup.run:04}') + js.write_header() + js.write_main(f"{command}\n", dependencies=['psana','fdviz']) + js.clean_up() + js.submit() logger.debug('Done!') From b4a7793825d179888581e8a5031fa9ee1b25358a Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Wed, 4 Oct 2023 22:11:02 -0700 Subject: [PATCH 46/57] Fixed bug in importing btx and other libraries. Minor other changes I think. 
--- btx/processing/freqdir.py | 46 ++++++++++++++++++++------------------- scripts/tasks.py | 1 + 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index ab21c6279..099f6efa3 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -819,6 +819,8 @@ class visualizeFD: """ Visualize FD Dimension Reduction using UMAP and DBSCAN """ + umap = __import__('umap') + hdbscan = __import__('hdbscan') def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, userGroupings, skipSize, umap_n_neighbors, umap_random_state, hdbscan_min_samples, hdbscan_min_cluster_size, optics_min_samples, optics_xi, optics_min_cluster_size, outlierQuantile): @@ -953,8 +955,6 @@ def genLeftRight(self, endClass): return [*range(endClass+1)], [*range(1, endClass+2)] def genUMAP(self): - import umap - import hdbscan imgs = None projections = None @@ -967,8 +967,6 @@ def genUMAP(self): imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) - print("AOIDWJOIAWDJ", len(imgs), len(projections)) - intensities = [] for img in imgs: intensities.append(np.sum(img.flatten())) @@ -983,7 +981,7 @@ def genUMAP(self): if len(self.imgs)!= self.numImgsToUse: raise TypeError("NUMBER OF IMAGES REQUESTED ({}) EXCEEDS NUMBER OF DATA POINTS PROVIDED ({})".format(len(self.imgs), self.numImgsToUse)) - self.clusterable_embedding = umap.UMAP( + self.clusterable_embedding = self.umap.UMAP( n_neighbors=self.umap_n_neighbors, random_state=self.umap_random_state, n_components=2, @@ -991,7 +989,7 @@ def genUMAP(self): min_dist=0.1, ).fit_transform(self.projections) - self.labels = hdbscan.HDBSCAN( + self.labels = self.hdbscan.HDBSCAN( min_samples = self.hdbscan_min_samples, min_cluster_size = self.hdbscan_min_cluster_size ).fit_predict(self.clusterable_embedding) @@ -1320,7 +1318,8 @@ class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. 
""" - from btx.interfaces.ipsana import PsanaInterface +# from btx.interfaces.ipsana import PsanaInterface + btx = __import__('btx') def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, usePSI=True): self.start_offset = start_offset self.num_imgs = num_imgs @@ -1353,7 +1352,7 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.usePSI = usePSI if usePSI: - self.psi = self.PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi = self.btx.interfaces.ipsana.PsanaInterface(exp=exp, run=run, det_type=det_type) self.psi.counter = self.start_offset + self.num_imgs*self.rank//self.size else: self.psi = None @@ -1365,7 +1364,8 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.currRun = self.comm.bcast(self.currRun, root=0) self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile) - self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 150, thumbnailWidth = 150) +# self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) + self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) # def lowMemoryReconstructionErrorScaled(self, matrixCentered, matSketch): # """ @@ -1624,12 +1624,13 @@ def normalizeIntensityFunc(self, img, currIntensity): class DataRetriever: - from btx.interfaces.ipsana import ( - PsanaInterface, - bin_data, - retrieve_pixel_index_map, - assemble_image_stack_batch, - ) + btx = __import__('btx') +# from btx.interfaces.ipsana import ( +# PsanaInterface, +# bin_data, +# retrieve_pixel_index_map, +# assemble_image_stack_batch, +# ) def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, thumbnailHeight, thumbnailWidth): self.exp = exp self.det_type = det_type @@ -1639,7 +1640,8 @@ def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, t self.thumbnailHeight = thumbnailHeight self.thumbnailWidth = thumbnailWidth - self.psi = self.PsanaInterface(exp=exp, run=run, det_type=det_type, no_cmod=True) +# self.psi = self.PsanaInterface(exp=exp, run=run, det_type=det_type, no_cmod=True) + self.psi = self.btx.interfaces.ipsana.PsanaInterface(exp=exp, run=run, det_type=det_type) self.imageProcessor = imageProcessor @@ -1652,12 +1654,12 @@ def assembleImgsToSave(self, imgs): imgs: ndarray images to downsample """ - pixel_index_map = self.retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) + pixel_index_map = self.btx.interfaces.ipsana.retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) saveMe = [] for img in imgs: imgRe = np.reshape(img, self.psi.det.shape()) - imgRe = self.assemble_image_stack_batch(imgRe, pixel_index_map) + imgRe = 
self.btx.interfaces.ipsana.assemble_image_stack_batch(imgRe, pixel_index_map) saveMe.append(np.array(Image.fromarray(imgRe).resize((self.thumbnailHeight, self.thumbnailWidth)))) return np.array(saveMe) @@ -1719,7 +1721,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): if self.downsample: # print("Downsampling images") - imgs = self.bin_data(imgs, self.bin_factor) + imgs = self.btx.interfaces.ipsana.bin_data(imgs, self.bin_factor) # print("Flattening images") num_valid_imgs, p, x, y = imgs.shape img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T @@ -1772,7 +1774,8 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): class SinglePanelDataRetriever: - from btx.interfaces.ipsana import PsanaInterface +# from btx.interfaces.ipsana import PsanaInterface + btx = __import__('btx') def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, thumbnailHeight, thumbnailWidth): self.exp = exp self.det_type = det_type @@ -1780,7 +1783,7 @@ def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, t self.thumbnailHeight = thumbnailHeight self.thumbnailWidth = thumbnailWidth - self.psi = self.PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi = self.btx.interfaces.ipsana.PsanaInterface(exp=exp, run=run, det_type=det_type) self.imageProcessor = imageProcessor @@ -1841,7 +1844,6 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): for img in imgs: saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) thumbnails = np.array(saveMe) - print("thumbaaowdijaoiajw", len(imgs), len(thumbnails)) num_valid_imgs, x, y = imgs.shape img_batch = np.reshape(imgs, (num_valid_imgs, x * y)).T diff --git a/scripts/tasks.py b/scripts/tasks.py index 2f5c12328..3239c02ca 100755 --- a/scripts/tasks.py +++ b/scripts/tasks.py @@ -529,6 +529,7 @@ def draw_sketch(config): samplingFactor=task.samplingFactor, divBy=task.divBy, thresholdQuantile=task.thresholdQuantile) logger.info(f'Performing Frequent Direction Sketching for run {setup.run} of {setup.exp}...') + fd.retrieveImages() fd.runMe() logger.debug('Done!') From 04195d8fa695e1c701b54123cf58967ab175151b Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Mon, 16 Oct 2023 22:17:12 -0700 Subject: [PATCH 47/57] Checkpoint --- btx/processing/freqdir.py | 81 +++++++++++++++++++++++++++++++-------- 1 file changed, 64 insertions(+), 17 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 099f6efa3..8dd6a3273 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -148,6 +148,7 @@ def __init__( super().__init__(exp=exp, run=run, det_type=det_type, start_offset=start_offset, num_images=num_imgs, num_components=num_components, batch_size=0, priming=False, downsample=downsample, bin_factor=bin_factor, output_dir=output_dir, psi=psi) + self.num_features,self.num_images = imgData.shape else: self.start_offset = start_offset self.downsample = False @@ -155,7 +156,6 @@ def __init__( self.output_dir = output_dir self.num_components = num_components self.num_features,self.num_images = imgData.shape - print("NUM IMAGES: ", self.num_images) self.task_durations = dict({}) @@ -313,7 +313,6 @@ def rotate(self): self.sketch[self.ell:,:] = 0 self.nextZeroRow = self.ell else: - print(S.shape, self.ell) self.sketch[:ssize,:] = diag(s) @ Vt[:ssize,:] self.sketch[ssize:,:] = 0 self.nextZeroRow = ssize @@ -842,6 +841,7 @@ def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, 
includeABOD, use self.optics_min_cluster_size = optics_min_cluster_size self.outlierQuantile = outlierQuantile + def embeddable_image(self, data): img_data = np.uint8(cm.jet(data/max(data.flatten()))*255) # image = Image.fromarray(img_data, mode='RGBA').resize((75, 75), Image.Resampling.BICUBIC) @@ -972,6 +972,9 @@ def genUMAP(self): intensities.append(np.sum(img.flatten())) intensities = np.array(intensities) + if self.numImgsToUse==-1: + self.numImgsToUse = len(imgs) + self.imgs = imgs[:self.numImgsToUse:self.skipSize] self.projections = projections[:self.numImgsToUse:self.skipSize] self.intensities = intensities[:self.numImgsToUse:self.skipSize] @@ -998,13 +1001,13 @@ def genUMAP(self): self.opticsClust = OPTICS(min_samples=self.optics_min_samples, xi=self.optics_xi, min_cluster_size=self.optics_min_cluster_size) self.opticsClust.fit(self.clusterable_embedding) - self.opticsLabels = cluster_optics_dbscan( - reachability=self.opticsClust.reachability_, - core_distances=self.opticsClust.core_distances_, - ordering=self.opticsClust.ordering_, - eps=2.5, - ) -# self.opticsLabels = self.opticsClust.labels_ +# self.opticsLabels = cluster_optics_dbscan( +# reachability=self.opticsClust.reachability_, +# core_distances=self.opticsClust.core_distances_, +# ordering=self.opticsClust.ordering_, +# eps=2.5, +# ) + self.opticsLabels = self.opticsClust.labels_ self.experData_df = pd.DataFrame({'x':self.clusterable_embedding[self.clustered, 0],'y':self.clusterable_embedding[self.clustered, 1]}) self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) @@ -1320,7 +1323,7 @@ class WrapperFullFD: """ # from btx.interfaces.ipsana import PsanaInterface btx = __import__('btx') - def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, usePSI=True): + def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, unitVar, usePSI=True): self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1342,6 +1345,7 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.samplingFactor=samplingFactor self.divBy = divBy self.thresholdQuantile = thresholdQuantile + self.unitVar = unitVar self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() @@ -1363,9 +1367,9 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.currRun = None self.currRun = self.comm.bcast(self.currRun, root=0) - self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile) -# self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) - self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, 
thumbnailHeight = 64, thumbnailWidth = 64) + self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar) + self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) +# self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) # def lowMemoryReconstructionErrorScaled(self, matrixCentered, matSketch): # """ @@ -1567,7 +1571,7 @@ def addThumbnailsToProjectH5(self): self.comm.barrier() class FD_ImageProcessing: - def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile): + def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile, unitVar): self.threshold = threshold self.eluThreshold = eluThreshold self.eluAlpha = eluAlpha @@ -1575,6 +1579,7 @@ def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalize self.normalizeIntensity = normalizeIntensity self.minIntensity = minIntensity self.thresholdQuantile = thresholdQuantile + self.unitVar = unitVar def processImg(self, nimg, currIntensity): if self.threshold: @@ -1585,6 +1590,8 @@ def processImg(self, nimg, currIntensity): nimg = self.removeZeroIntensityFunc(nimg, currIntensity) if self.normalizeIntensity: nimg = self.normalizeIntensityFunc(nimg, currIntensity) + if self.unitVar: + nimg = self.unitVarFunc(nimg, currIntensity) return nimg def elu(self,x): @@ -1622,6 +1629,35 @@ def normalizeIntensityFunc(self, img, currIntensity): else: return img/np.sum(img.flatten(), dtype=np.double) + def unitVarFunc(self, img, currIntensity): + if img is None or currIntensity Date: Fri, 20 Oct 2023 11:38:16 -0700 Subject: [PATCH 48/57] added some logger INFO to ischeduler --- btx/interfaces/ischeduler.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/btx/interfaces/ischeduler.py b/btx/interfaces/ischeduler.py index 505eaa18b..0dc027c21 100644 --- a/btx/interfaces/ischeduler.py +++ b/btx/interfaces/ischeduler.py @@ -124,8 +124,10 @@ def _write_dependencies(self, dependencies): with open(self.jobfile, 'a') as jfile: jfile.write(dep_paths) + logger.info(dep_paths) if 'SIT_PSDM_DATA' in os.environ: jfile.write(f"export SIT_PSDM_DATA={os.environ['SIT_PSDM_DATA']}\n") + logger.info(f"export SIT_PSDM_DATA={os.environ['SIT_PSDM_DATA']}\n") def write_main(self, application, dependencies=[]): """ Write application and source requested dependencies. """ @@ -135,6 +137,7 @@ def write_main(self, application, dependencies=[]): pythonpath = self._find_python_path() with open(self.jobfile, 'a') as jfile: jfile.write(application.replace("python", pythonpath)) + logger.info(application.replace("python", pythonpath)) def submit(self): """ Submit to queue. 
""" @@ -145,3 +148,4 @@ def clean_up(self): """ Add a line to delete submission file.""" with open(self.jobfile, 'a') as jfile: jfile.write(f"if [ -f {self.jobfile} ]; then rm -f {self.jobfile}; fi") + logger.info(f"if [ -f {self.jobfile} ]; then rm -f {self.jobfile}; fi") From c2b7ea55a24bf01b855c02d0add5db9430e95b24 Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Fri, 20 Oct 2023 11:53:28 -0700 Subject: [PATCH 49/57] when a conda environment is activated in write_dependencies (in ischeduler), the pythonpath needs to be given as well, otherwise python or mpirun defaults to the original environment. --- btx/interfaces/ischeduler.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/btx/interfaces/ischeduler.py b/btx/interfaces/ischeduler.py index 0dc027c21..fd5e6d094 100644 --- a/btx/interfaces/ischeduler.py +++ b/btx/interfaces/ischeduler.py @@ -41,11 +41,15 @@ def _data_systems_management(self): self.ana_conda_manage = f'{self.ana_conda_dir}conda1/manage/bin/' self.ana_conda_bin = f'{self.ana_conda_dir}conda1/inst/envs/ana-4.0.47-py3/bin/' + self.pythonpath = None def _find_python_path(self): """ Determine the relevant python path. """ pythonpath=None - possible_paths = [f"{self.ana_conda_bin}python"] + if self.pythonpath is None: + possible_paths = [f"{self.ana_conda_bin}python"] + else: + possible_paths = [f"{self.pythonpath}"] try: pythonpath = os.environ['WHICHPYTHON'] @@ -120,6 +124,7 @@ def _write_dependencies(self, dependencies): dep_paths += "export PATH=/reg/g/cfel/crystfel/indexers/xgandalf/include/eigen3/Eigen/:$PATH" if "fdviz" in dependencies: dep_paths += f"conda activate /sdf/group/lcls/ds/tools/conda_envs/johnw-ana-4.0.48-py3" + self.pythonpath = "/sdf/group/lcls/ds/tools/conda_envs/johnw-ana-4.0.48-py3/bin/python" dep_paths += "\n" with open(self.jobfile, 'a') as jfile: From 5de126856a5e37430b66897fa1629307c9cda002 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Wed, 8 Nov 2023 09:53:39 -0800 Subject: [PATCH 50/57] Some of these settings produced good visualization. I think you have to re-enable ROI and the things in the settings (I think no threshold, but throw away zeros and apply unit variance and possibly normalization). 
--- btx/processing/freqdir.py | 84 +++++++++++++++++++++++++++++++-------- scripts/tasks.py | 4 +- 2 files changed, 70 insertions(+), 18 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 8dd6a3273..8eaad726f 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -47,9 +47,11 @@ import matplotlib as mpl from matplotlib import cm +from bokeh.transform import linear_cmap +from bokeh.util.hex import hexbin from bokeh.plotting import figure, show, output_file, save from bokeh.models import HoverTool, CategoricalColorMapper, LinearColorMapper, ColumnDataSource, CustomJS, Slider, RangeSlider, Toggle, RadioButtonGroup, Range1d, Label -from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3 +from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3, Plasma256, Plasma11 from bokeh.layouts import column, row import cProfile @@ -958,14 +960,22 @@ def genUMAP(self): imgs = None projections = None + trueIntensities = None for currRank in range(self.nprocs): with h5py.File(self.inputFile+"_"+str(currRank)+".h5", 'r') as hf: if imgs is None: imgs = hf["SmallImages"][:] projections = hf["ProjectedData"][:] + trueIntensities = hf["TrueIntensities"][:] else: imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) + trueIntensities = np.concatenate((trueIntensities, hf["TrueIntensities"][:]), axis=0) + + for intensMe in trueIntensities: + print(intensMe) + if(np.isnan(intensMe)): + print("This is NAN") intensities = [] for img in imgs: @@ -1013,6 +1023,13 @@ def genUMAP(self): self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) self.experData_df['imgind'] = np.arange(self.numImgsToUse)*self.skipSize +# self.experData_df['trueIntensities'] = [str(int(math.abs(x)/max(np.abs(trueIntensities))*19)) for x in trueIntensities] + self.experData_df['trueIntensities'] = [5 for x in trueIntensities] +# self.experData_df['trueIntensities_backgroundColor'] = [Plasma256[int(math.abs(x)/max(np.abs(trueIntensities))*255)] for x in trueIntensities] + self.experData_df['trueIntensities_backgroundColor'] = [5 for x in trueIntensities] + print("aowdijaoidjwaoij", len(self.experData_df['trueIntensities']), self.experData_df['trueIntensities'], type(self.experData_df['trueIntensities'])) + print(trueIntensities) + def genABOD(self): if self.includeABOD: abod = self.fastABOD(self.projections, 10) @@ -1029,6 +1046,8 @@ def genABOD(self): self.experData_df['anomDet'] = outlierLabels self.experData_df['anom_backgroundColor'] = [Category20[20][int(x)] for x in outlierLabels] + print("2adwjiaomd", len(self.experData_df['anomDet']), self.experData_df['anomDet'], type(self.experData_df['anomDet'])) + def setUserGroupings(self, userGroupings): """ Set User Grouping. 
An adjustment is made at the beginning of this function, @@ -1079,12 +1098,19 @@ def genLabels(self): def genHTML(self): datasource = ColumnDataSource(self.experData_df) - color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) + #JOHN CHANGE 20231020 +# color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) + color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Plasma256[::16]) plot_figure = figure( title='UMAP projection with DBSCAN clustering of the LCLS dataset', tools=('pan, wheel_zoom, reset'), width = 2000, height = 600 ) + +# bins = hexbin(self.clusterable_embedding[self.clustered, 0], self.clusterable_embedding[self.clustered, 1], 0.5) +# plot_figure.hex_tile(q="q", r="r", size=0.5, line_color=None, source=bins, +# fill_color=linear_cmap('counts', 'Viridis256', 0, max(bins.counts))) + plot_figure.add_tools(HoverTool(tooltips="""
@@ -1171,12 +1197,14 @@ def genHTML(self): const cluster = datasource.data.cluster const ptColor = datasource.data.ptColor const anomDet = datasource.data.anomDet + const trueIntensities = datasource.data.trueIntensities const imgind = datasource.data.imgind const backgroundColor = datasource.data.backgroundColor const dbscan_backgroundColor = datasource.data.dbscan_backgroundColor const anom_backgroundColor = datasource.data.anom_backgroundColor const optics_backgroundColor = datasource.data.optics_backgroundColor - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor, dbscan_backgroundColor, anom_backgroundColor, optics_backgroundColor} + const trueIntensities_backgroundColor = datasource.data.trueIntensities_backgroundColor + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, trueIntensities, imgind, backgroundColor, dbscan_backgroundColor, anom_backgroundColor, optics_backgroundColor, trueIntensities_backgroundColor} """) cols.js_on_change('value', callback) @@ -1257,7 +1285,7 @@ def genHTML(self): reachabilityDiag.line([0, len(opticsData_df['ptColor'])], [2, 2], line_width=2, color="black", line_dash="dashed") reachabilityDiag.y_range = Range1d(-1, 10) - LABELS = ["DBSCAN Clustering", "OPTICS Clustering", "Anomaly Detection"] + LABELS = ["DBSCAN Clustering", "OPTICS Clustering", "Anomaly Detection", "True Intensity"] radio_button_group = RadioButtonGroup(labels=LABELS, active=0) radioGroup_js = CustomJS(args=dict(datasource=datasource, opticssource=opticssource), code=""" const x = datasource.data.x @@ -1266,10 +1294,12 @@ def genHTML(self): const medoidBold = datasource.data.medoidBold const cluster = datasource.data.cluster const anomDet = datasource.data.anomDet + const trueIntensities = datasource.data.trueIntensities const imgind = datasource.data.imgind const dbscan_backgroundColor = datasource.data.dbscan_backgroundColor const anom_backgroundColor = datasource.data.anom_backgroundColor const optics_backgroundColor = datasource.data.optics_backgroundColor + const trueIntensities_backgroundColor = datasource.data.trueIntensities_backgroundColor const opticsClust = opticssource.data.clusterForScatterPlot @@ -1284,21 +1314,29 @@ def genHTML(self): ptColor = opticsClust backgroundColor = optics_backgroundColor } - else{ + else if (cb_obj.active==2) { ptColor = anomDet backgroundColor = anom_backgroundColor } - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor, dbscan_backgroundColor, anom_backgroundColor, optics_backgroundColor} + else { + ptColor = trueIntensities + backgroundColor = trueIntensities_backgroundColor + } + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, trueIntensities, imgind, backgroundColor, dbscan_backgroundColor, anom_backgroundColor, optics_backgroundColor, trueIntensities_backgroundColor} """) radio_button_group.js_on_change("active", radioGroup_js) self.viewResults = column(plot_figure, p, imgsPlot, row(cols, toggl, radio_button_group), reachabilityDiag) + def genCSV(self): + self.experData_df.to_csv(self.outputFile[:-4]+"csv") + def fullVisualize(self): self.genUMAP() self.genABOD() self.genLabels() self.genHTML() + self.genCSV() def updateLabels(self): self.genLabels() @@ -1323,7 +1361,7 @@ class WrapperFullFD: """ # from btx.interfaces.ipsana import PsanaInterface btx = __import__('btx') - def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, 
rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, unitVar, usePSI=True): + def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, unitVar=False, usePSI=True): self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1563,10 +1601,11 @@ def runMe(self): def addThumbnailsToProjectH5(self): # print("Gathering thumbnails") startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - _,self.fullThumbnailData,_ = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) + _,self.fullThumbnailData,_,self.trueIntensitiesData = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) file_name = self.writeToHere+"{}_ProjectedData_{}.h5".format(self.currRun, self.rank) f1 = h5py.File(file_name, 'r+') f1.create_dataset("SmallImages", data=self.fullThumbnailData) + f1.create_dataset("TrueIntensities", data=np.array(self.trueIntensitiesData)) f1.close() self.comm.barrier() @@ -1861,7 +1900,8 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): fullthumbnails = None imgsTracked = [] runs = self.split_range(startInd, startInd+n, num_steps) - print(runs) + print(runs) + trueIntensities = [] for runStart, runEnd in runs: # print("RETRIEVING: [", runStart, ":", runEnd,"]") self.psi.counter = runStart @@ -1875,10 +1915,19 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] ] - jimgs = [] - for img in imgs: - jimgs.append(self.imageProcessor.centerImgFunc(self.imageProcessor.thresholdFunc(img),100,100)) - imgs = np.array(jimgs) + origTrueIntensities = [np.sum(img.flatten(), dtype=np.double) for img in imgs] + newTrueIntensities = [] + for j in origTrueIntensities: + if j>0: + newTrueIntensities.append(0) + else: + newTrueIntensities.append(np.log(j)) + origTrueIntensities = newTrueIntensities + +# jimgs = [] +# for img in imgs: +# jimgs.append(self.imageProcessor.centerImgFunc(self.imageProcessor.thresholdFunc(img),100,100)) +# imgs = np.array(jimgs) if getThumbnails: saveMe = [] @@ -1899,13 +1948,15 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): if getThumbnails: nimg_batch = [] nthumbnail_batch = [] - for img, thumbnail in zip(img_batch.T, thumbnail_batch.T): + ntrueIntensity_batch = [] + for img, thumbnail, trueIntens in zip(img_batch.T, thumbnail_batch.T, origTrueIntensities): currIntensity = np.sum(img.flatten(), dtype=np.double) nimg = self.imageProcessor.processImg(img, currIntensity) nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) if nimg is not None: nimg_batch.append(nimg) nthumbnail_batch.append(nthumbnail) + ntrueIntensity_batch.append(trueIntens) else: num_valid_thumbnails -= 1 num_valid_imgs -= 1 @@ -1917,6 +1968,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): elif len(nimg_batch)!=0: fullimgs = np.hstack((fullimgs, nimg_batch)) fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) + trueIntensities += ntrueIntensity_batch else: nimg_batch = [] for 
img in img_batch.T: @@ -1939,7 +1991,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): # print("Images tracked:", imgsTracked) if getThumbnails: print(fullimgs.shape, fullthumbnails.shape, imgsTracked) - return (fullimgs, fullthumbnails, imgsTracked) + return (fullimgs, fullthumbnails, imgsTracked, trueIntensities) else: return (fullimgs, imgsTracked) @@ -1949,7 +2001,7 @@ def main(): """ params = parse_input() os.makedirs(os.path.join(params.outdir, "figs"), exist_ok=True) - visMe = visualizeFD(inputFile=params.outdir + f"{params.run:04}_ProjectedData", + visMe = visualizeFD(inputFile=params.outdir + f"/{params.run:04}_ProjectedData", outputFile=params.outdir + f"figs/UMAPVis_{params.run:04}.html", numImgsToUse=params.num_imgs, nprocs=params.nprocs, diff --git a/scripts/tasks.py b/scripts/tasks.py index 3239c02ca..1708a7548 100755 --- a/scripts/tasks.py +++ b/scripts/tasks.py @@ -533,7 +533,7 @@ def draw_sketch(config): fd.runMe() logger.debug('Done!') -def show_sketch(): +def show_sketch(config): from btx.interfaces.ischeduler import JobScheduler setup = config.setup task = config.show_sketch @@ -553,7 +553,7 @@ def show_sketch(): command += f" --num_imgs_to_use={task.num_imgs_to_use}" js = JobScheduler(os.path.join(".", f'fd_{setup.run:04}.sh'), queue=setup.queue, - ncores=task.ncores, + ncores=task.nprocs, jobname=f'fd_{setup.run:04}') js.write_header() js.write_main(f"{command}\n", dependencies=['psana','fdviz']) From 64cabbf4fc5972bb45484fa1e504f6e2d0f15997 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Fri, 29 Dec 2023 15:09:01 -0800 Subject: [PATCH 51/57] Everything is working. It runs and produces beam profiles using scaling script and run and single panel stuff. --- btx/processing/freqdir.py | 548 ++++++++++++++++++++++++++++++++++---- 1 file changed, 495 insertions(+), 53 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 8eaad726f..bbf3a13be 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -23,15 +23,21 @@ from matplotlib import pyplot as plt from matplotlib import colors -# from btx.misc.shortcuts import TaskTimer -# -# from btx.interfaces.ipsana import ( -# PsanaInterface, -# bin_data, -# bin_pixel_index_map, -# retrieve_pixel_index_map, -# assemble_image_stack_batch, -# ) +########################## +########################## +#JOHN CHANGE BACK AFTER 12/15/2023 +from btx.misc.shortcuts import TaskTimer + +from btx.interfaces.ipsana import ( + PsanaInterface, + bin_data, + bin_pixel_index_map, + retrieve_pixel_index_map, + assemble_image_stack_batch, +) +########################## +########################## + from PIL import Image from io import BytesIO @@ -51,12 +57,15 @@ from bokeh.util.hex import hexbin from bokeh.plotting import figure, show, output_file, save from bokeh.models import HoverTool, CategoricalColorMapper, LinearColorMapper, ColumnDataSource, CustomJS, Slider, RangeSlider, Toggle, RadioButtonGroup, Range1d, Label +from bokeh.models import CustomJS, ColumnDataSource, Span, PreText from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3, Plasma256, Plasma11 from bokeh.layouts import column, row import cProfile import string +import cv2 + class FreqDir(DimRed): """ @@ -816,6 +825,7 @@ def update(self, vec): self.sketch.push(vec, pi, wi) + class visualizeFD: """ Visualize FD Dimension Reduction using UMAP and DBSCAN @@ -844,6 +854,278 @@ def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, use self.outlierQuantile = 
outlierQuantile + def retrieveCircularity(self, fullThumbnailData): + + def rotate_image(image, angle, center=None, scale=1.0): + (h, w) = image.shape[:2] + if center is None: + center = (w // 2, h // 2) + M = cv2.getRotationMatrix2D(center, angle, scale) + rotated = cv2.warpAffine(image, M, (w, h)) + return rotated + + def compute_properties(M): + # Calculate centroid + cx = int(M["m10"] / M["m00"]) + cy = int(M["m01"] / M["m00"]) + + # Calculate orientation + mu20 = M["mu20"] / M["m00"] + mu02 = M["mu02"] / M["m00"] + mu11 = M["mu11"] / M["m00"] + theta = 0.5 * np.arctan2(2 * mu11, mu20 - mu02) + + # Calculate eccentricity + a = 2 * np.sqrt(mu20 + mu02 + np.sqrt(4 * mu11**2 + (mu20 - mu02)**2)) + b = 2 * np.sqrt(mu20 + mu02 - np.sqrt(4 * mu11**2 + (mu20 - mu02)**2)) + eccentricity = np.sqrt(1 - (b / a) ** 2) + + return cx, cy, theta, eccentricity + + def reorientImg(nimg): + M = cv2.moments(nimg) + cx, cy , theta, eccentricity = compute_properties(M) + return rotate_image(nimg, angle=theta*180/math.pi) + + + def denoiseImg(image): + # Threshold the image to get a binary image + _, binary_image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY) + + # Perform connected component labeling + num_labels, labels_im = cv2.connectedComponents(binary_image) + + # Create a mask for components larger than the size threshold + mask = np.zeros_like(image, dtype=bool) + + size_threshold = 500 + + # Iterate through components and update the mask for large components + for label in range(1, num_labels): + if np.sum(labels_im == label) > size_threshold: + mask[labels_im == label] = True + + # Apply the mask to the original grayscale image + masked_image = np.zeros_like(image) + masked_image[mask] = image[mask] + return masked_image + + def center_and_crop_beam(image, threshold_value=127, blur_kernel=(5, 5), desired_size=224, min_contour_area=100): + """ + Centers and crops the main intensity pattern in an image. + + Parameters: + image (numpy.ndarray): The input image. + threshold_value (int): Threshold value for binary thresholding. + blur_kernel (tuple): Kernel size for Gaussian blur. + + Returns: + numpy.ndarray: The cropped image centered around the intensity pattern. 
+ """ + # Normalize or scale the image + if image.dtype != np.uint8: + # If the range of your image is known (e.g., 0 to 5), normalize accordingly + # image = ((image - image.min()) / (image.max() - image.min())) * 255 + + # If the range is not known, scale based on the current min and max + image = 255 * (image - image.min()) / (image.max() - image.min()) + image = image.astype(np.uint8) + + # print(image[100]) + + # Ensure the image is in grayscale + if len(image.shape) == 3: + image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + + # Step 1: Noise Reduction + blurred = cv2.GaussianBlur(image, blur_kernel, 0) + # blurred = image + + # Step 2: Thresholding + # _, thresh = cv2.threshold(blurred, threshold_value, 255, cv2.THRESH_BINARY) + _, thresh = cv2.threshold(blurred, 100, 255, cv2.THRESH_BINARY) + + + # # Step 3: Locate the Beam + # contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + # if not contours: + # return None # No contours found + + # # Step 4: Determine the Bounding Box + # beam = max(contours, key=cv2.contourArea) + # x, y, w, h = cv2.boundingRect(beam) + + # # Step 5: Centering and Cropping + # cropped = image[y:y+h, x:x+w] + # print(x, y, w, h) + + # Locate the Beam + contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + # contours = [c for c in contours if cv2.contourArea(c) > min_contour_area] + if not contours: + return None + + # Determine the Bounding Box + beam = max(contours, key=cv2.contourArea) + x, y, w, h = cv2.boundingRect(beam) + + # Find the center of the bounding box + center_x, center_y = x + w // 2, y + h // 2 + + # Define new bounding box dimensions + new_x = max(center_x - desired_size // 2, 0) + new_y = max(center_y - desired_size // 2, 0) + + # print("new x: ", x, new_x) + # print("new y: ", y, new_y) + + # Adjust if the new box extends beyond the original image + new_x = min(new_x, image.shape[1] - desired_size) + new_y = min(new_y, image.shape[0] - desired_size) + + # print("new x: ", x, new_x) + # print("new y: ", y, new_y) + + # Crop the image to the new bounding box + cropped = image[new_y:new_y + desired_size, new_x:new_x + desired_size] + + return cropped + + # threshVal = 100 + + nimgs = [] + nbws = [] + nbws1 = [] + contours = [] + contourImgs = [] + for j in range(len(fullThumbnailData)): + # currImg = (fullThumbnailData[j]*(255/np.max(fullThumbnailData[j]))).astype('uint8').copy() + # nimg = currImg + nimg = center_and_crop_beam(fullThumbnailData[j]) + # nimg = reorientImg(nimg) + if nimg is None: + continue + nimg = reorientImg(nimg) + nimg = denoiseImg(nimg) + nimgs.append(nimg) + # nbws.append(nimg) + # (thresh, im_bw) = cv2.threshold((fullThumbnailData[j]*(255/np.max(fullThumbnailData[j]))).astype('uint8').copy(), 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU) + # print(nimg) + # print(j, np.max(nimg)) + # np.set_printoptions(threshold=np.inf, linewidth=np.inf) + # print(nimg) + + (thresh, im_bw) = cv2.threshold(nimg, 0, 255, cv2.THRESH_BINARY) + nbws.append(im_bw.copy()) + (thresh1, im_bw1) = cv2.threshold(nimg, 0, 1, cv2.THRESH_BINARY) + nbws1.append(im_bw1.copy()) + + # # Assuming 'im' is your grayscale image + # # Apply Gaussian blur to the image + # blurred = cv2.GaussianBlur(im_bw, (5, 5), 0) + # # Apply binary thresholding on the blurred image + # _, binary = cv2.threshold(blurred, 127, 255, cv2.THRESH_BINARY) + # # Find contours + # contourList, hierarchy = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + # # Find the largest contour based on area + # 
largest_contour = max(contourList, key=cv2.contourArea) + # contours.append(largest_contour) + # canvas = np.zeros(im_bw.shape, dtype='uint8') + # # Draw the largest contour in white + # cv2.drawContours(canvas, [largest_contour], -1, (255), 1) + # contourImgs.append(canvas) + + # # nbws.append(cv2.GaussianBlur(nimg, (5, 5), 0)) + # # nbws.append(im_bw) + # # nbws.append((fullThumbnailData[j]*(255/np.max(fullThumbnailData[j]))).astype('uint8').copy() + + # # ind = 356 + + # # plt.imshow(nimgs[ind]) + # # plt.show() + + # # print(nbws1[ind][80]) + + # # Calculate moments + # M = cv2.moments(nbws1[ind]) + # # Zeroth moment is the area + # area = M['m00'] + # epsilon = 0.01 * cv2.arcLength(contours[ind], True) + # approx = cv2.approxPolyDP(contours[ind], epsilon, True) + # # Calculate the perimeter + # perimeter = cv2.arcLength(approx, True) + # # Calculate circularity using moments + # circularity = 4 * 3.14159 * area / (perimeter * perimeter) + # print(circularity) + + # # Calculate moments + # M = cv2.moments(nbws1[ind]) + # ncirc = (M['m00']**2)/(2*math.pi*(M['mu20'] + M['mu02'])) + # print(ncirc) + + circs = [] + ncircs = [] + + for ind in range(len(nbws)): + # # Calculate moments + # M = cv2.moments(nbws1[ind]) + # # Zeroth moment is the area + # area = M['m00'] + # epsilon = 0.01 * cv2.arcLength(contours[ind], True) + # approx = cv2.approxPolyDP(contours[ind], epsilon, True) + # # Calculate the perimeter + # perimeter = cv2.arcLength(approx, True) + # # Calculate circularity using moments + # circularity = 4 * 3.14159 * area / (perimeter * perimeter) + + # Calculate moments + M = cv2.moments(nbws[ind]) + try: + ncirc = (M['m00']**2)/(2*math.pi*(M['mu20'] + M['mu02'])) + except: + ncirc = 1 + + # circs.append(circularity) + ncircs.append(ncirc) + + sorted_indices = np.argsort(ncircs) + sorted_arrays = [nimgs[i] for i in sorted_indices] + sorted_full = [fullThumbnailData[i] for i in sorted_indices] + + # import matplotlib.pyplot as plt + # import numpy as np + + # # Assuming 'images' is your list of 16 NumPy array images + # # For demonstration, creating 16 random 8x8 grayscale images + # images = [j for j in sorted_arrays[::len(sorted_arrays)//16]] + + # # Create a 4x4 grid of subplots + # fig, axs = plt.subplots(4, 4, figsize=(10, 10)) + + # # Flatten the array of axes for easy iteration + # axs = axs.ravel() + + # # Plot each image and add text + # for i in range(16): + # axs[i].imshow(images[i], cmap='jet', vmin=0, vmax=255) # Assuming grayscale images + # axs[i].text(50, 5, f"Image {i+1}", color='white', ha='center', va='center') + # axs[i].axis('off') # Turn off axis + + # plt.tight_layout() # Adjust subplots to fit into the figure area. 
+ # plt.show() + + # ind=23 + # nimg = center_and_crop_beam(fullThumbnailData[40]) + # # plt.imshow(fullThumbnailData[ind]) + # plt.imshow(nimg) + # plt.show() + + bigOrSmall = [1 if j>len(sorted_arrays)*10//16 else 0 for j in sorted_indices] +# np.savez(saveDir+'circularityImgs_{}.npz'.format(currRun), **{f'array_{i}': arr for i, arr in enumerate(nimgs)}, labels=bigOrSmall) + + return ncircs + + def embeddable_image(self, data): img_data = np.uint8(cm.jet(data/max(data.flatten()))*255) # image = Image.fromarray(img_data, mode='RGBA').resize((75, 75), Image.Resampling.BICUBIC) @@ -874,7 +1156,8 @@ def genMedoids(self, medoidLabels, clusterPoints): for test_index, test_point in enumerate(lst): if math.isclose(test_point[0],medoid_point[0]) and math.isclose(test_point[1], medoid_point[1]): fin_ind = test_index - medoid_lst.append((k, v[fin_ind][0])) +# medoid_lst.append((k, v[fin_ind][0])) + medoid_lst.append((k, v[fin_ind+1][0])) return medoid_lst def relabel_to_closest_zero(self, labels): @@ -956,8 +1239,31 @@ def genHist(self, vals, endClass): def genLeftRight(self, endClass): return [*range(endClass+1)], [*range(1, endClass+2)] + + def float_to_int_percentile(self, float_list): + # Edge case: If the list is empty, return an empty list + if not float_list: + return [] + + # Calculate the percentiles that define the bin edges + percentiles = np.percentile(float_list, [10 * i for i in range(1, 10)]) + + # Function to find the bin for a single value + def find_bin(value): + for i, p in enumerate(percentiles): + if value < p: + return i + return 9 # For values in the highest bin + + # Convert each float to an integer based on its bin + int_list = [find_bin(value) for value in float_list] + + return int_list + + def genUMAP(self): + imgs = None projections = None trueIntensities = None @@ -971,6 +1277,7 @@ def genUMAP(self): imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) trueIntensities = np.concatenate((trueIntensities, hf["TrueIntensities"][:]), axis=0) + print(len(imgs)) for intensMe in trueIntensities: print(intensMe) @@ -998,14 +1305,18 @@ def genUMAP(self): n_neighbors=self.umap_n_neighbors, random_state=self.umap_random_state, n_components=2, -# min_dist=0.25, - min_dist=0.1, + min_dist=0, +# min_dist=0.1, ).fit_transform(self.projections) - self.labels = self.hdbscan.HDBSCAN( - min_samples = self.hdbscan_min_samples, - min_cluster_size = self.hdbscan_min_cluster_size - ).fit_predict(self.clusterable_embedding) +# self.labels = self.hdbscan.HDBSCAN( +# min_samples = self.hdbscan_min_samples, +# min_cluster_size = self.hdbscan_min_cluster_size +# ).fit_predict(self.clusterable_embedding) + + ncircs = self.float_to_int_percentile(self.retrieveCircularity(self.imgs)) + self.labels = np.array(ncircs) + exclusionList = np.array([]) self.clustered = np.isin(self.labels, exclusionList, invert=True) @@ -1023,12 +1334,15 @@ def genUMAP(self): self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) self.experData_df['imgind'] = np.arange(self.numImgsToUse)*self.skipSize -# self.experData_df['trueIntensities'] = [str(int(math.abs(x)/max(np.abs(trueIntensities))*19)) for x in trueIntensities] - self.experData_df['trueIntensities'] = [5 for x in trueIntensities] -# self.experData_df['trueIntensities_backgroundColor'] = [Plasma256[int(math.abs(x)/max(np.abs(trueIntensities))*255)] for x in trueIntensities] - self.experData_df['trueIntensities_backgroundColor'] = [5 for x in 
trueIntensities] - print("aowdijaoidjwaoij", len(self.experData_df['trueIntensities']), self.experData_df['trueIntensities'], type(self.experData_df['trueIntensities'])) - print(trueIntensities) +# self.experData_df['trueIntensities'] = [str(int(abs(x)/max(np.abs(trueIntensities))*19)) for x in trueIntensities] +# self.experData_df['trueIntensities'] = [5 for x in trueIntensities] +# self.experData_df['trueIntensities_backgroundColor'] = [Plasma256[int(abs(x)/max(np.abs(trueIntensities))*255)] for x in trueIntensities] +# self.experData_df['trueIntensities_backgroundColor'] = [5 for x in trueIntensities] +# print("aowdijaoidjwaoij", len(self.experData_df['trueIntensities']), self.experData_df['trueIntensities'], type(self.experData_df['trueIntensities'])) +# print(trueIntensities) + self.experData_df['trueIntensities'] = [1 for x in self.experData_df['imgind']] + self.experData_df['trueIntensities_backgroundColor'] = [1 for x in self.experData_df['imgind']] + def genABOD(self): if self.includeABOD: @@ -1103,7 +1417,7 @@ def genHTML(self): color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Plasma256[::16]) plot_figure = figure( title='UMAP projection with DBSCAN clustering of the LCLS dataset', - tools=('pan, wheel_zoom, reset'), + tools=('pan, wheel_zoom, reset, lasso_select'), width = 2000, height = 600 ) @@ -1142,6 +1456,8 @@ def genHTML(self): plot_figure.legend.location = "bottom_right" plot_figure.legend.title = "Clusters" + density_text = PreText(text='Density_Text') + vals = [x for x in self.newLabels] trueSource = ColumnDataSource(data=dict(vals = vals)) hist, maxCount = self.genHist(vals, max(vals)) @@ -1165,7 +1481,7 @@ def genHTML(self): end=self.numImgsToUse, value=(0, self.numImgsToUse-1), step=1, sizing_mode="stretch_width") - callback = CustomJS(args=dict(cols=cols, trueSource = trueSource, + callback = CustomJS(args=dict(cols=cols, trueSource = trueSource, density_text=density_text, histsource = histsource, datasource=datasource, indexCDS=indexCDS), code=""" function countNumbersAtIndices(numbers, startInd, endInd, smallestVal, largestVal) { let counts = new Array(largestVal-smallestVal); for (let i=0; i i + 1); + + console.log(rightVal-leftVal) + var avg = countCommonElementsInWindow(inds, arrayFrom1ToLength, rightVal-leftVal); + density_text.text = avg.toString(); + """)) + + + self.viewResults = column(row(plot_figure, density_text), p, imgsPlot, row(cols, toggl, radio_button_group), reachabilityDiag) def genCSV(self): self.experData_df.to_csv(self.outputFile[:-4]+"csv") @@ -1405,7 +1783,7 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.currRun = None self.currRun = self.comm.bcast(self.currRun, root=0) - self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar) + self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=150, roi_h = 150) self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, 
imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) # self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) @@ -1610,7 +1988,7 @@ def addThumbnailsToProjectH5(self): self.comm.barrier() class FD_ImageProcessing: - def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile, unitVar): + def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile, unitVar, centerImg, roi_w, roi_h): self.threshold = threshold self.eluThreshold = eluThreshold self.eluAlpha = eluAlpha @@ -1619,12 +1997,23 @@ def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalize self.minIntensity = minIntensity self.thresholdQuantile = thresholdQuantile self.unitVar = unitVar + self.centerImg = centerImg + self.roi_w = roi_w + self.roi_h = roi_h - def processImg(self, nimg, currIntensity): + def processImg(self, nimg, ncurrIntensity): if self.threshold: nimg = self.thresholdFunc(nimg) if self.eluThreshold: nimg = self.eluThresholdFunc(nimg) + if self.centerImg: + nimg = self.centerImgFunc(nimg) + + if nimg is not None: + currIntensity = abs(np.sum(nimg.flatten(), dtype=np.double)) + else: + currIntensity = 0 + if self.noZeroIntensity: nimg = self.removeZeroIntensityFunc(nimg, currIntensity) if self.normalizeIntensity: @@ -1664,8 +2053,10 @@ def normalizeIntensityFunc(self, img, currIntensity): if img is None: return img elif currIntensity=1: + curr_roi_w = int(self.roi_w*rampingFact) + curr_roi_h = int(self.roi_h*rampingFact) + nimg = np.pad(img, max(2*curr_roi_w, 2*curr_roi_h)+1) +# print(rampingFact) + if np.sum(img.flatten(), dtype=np.double)<10000: + cogx, cogy = (curr_roi_w, curr_roi_h) + else: + cogx, cogy = self.calcCenterGrav(nimg) + # return nimg[cogy-(roi_h):cogy+(roi_h//2), cogx-(roi_w):cogx+(roi_w//2)] + nimg = nimg[cogx-(curr_roi_w//2):cogx+(curr_roi_w//2), cogy-(curr_roi_h//2):cogy+(curr_roi_h//2)] + rampingFact -= 0.5 + return nimg def calcCenterGrav(self, grid): @@ -1693,6 +2092,7 @@ def calcCenterGrav(self, grid): row_indices, col_indices = np.indices(grid.shape) X_c = np.sum(row_indices * grid) / M_total Y_c = np.sum(col_indices * grid) / M_total +# print(M_total, X_c, Y_c, grid) return (round(X_c), round(Y_c)) @@ -1812,9 +2212,9 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): nimg_batch = [] nthumbnail_batch = [] for img, thumbnail in zip(img_batch.T, thumbnail_batch.T): - currIntensity = np.sum(img.flatten(), dtype=np.double) - nimg = self.imageProcessor.processImg(img, currIntensity) - nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) +# currIntensity = np.sum(img.flatten(), dtype=np.double) + nimg = self.imageProcessor.processImg(img) #JOHN 011/09/2023 + nthumbnail = self.imageProcessor.processImg(thumbnail) if nimg is not None: nimg_batch.append(nimg) nthumbnail_batch.append(nthumbnail) @@ -1829,9 +2229,9 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): else: nimg_batch = [] for img in img_batch.T: - currIntensity = np.sum(img.flatten(), dtype=np.double) +# currIntensity = np.sum(img.flatten(), dtype=np.double) #JOHN 011/09/2023 # print("Starting image processing of size {}".format(img_batch.T.shape)) - nimg = self.imageProcessor.processImg(img, currIntensity) + nimg = self.imageProcessor.processImg(img) if nimg is 
not None: nimg_batch.append(nimg) nimg_batch = np.array(nimg_batch).T @@ -1918,7 +2318,11 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): origTrueIntensities = [np.sum(img.flatten(), dtype=np.double) for img in imgs] newTrueIntensities = [] for j in origTrueIntensities: - if j>0: +# if j>0: +# newTrueIntensities.append(0) +# else: +# newTrueIntensities.append(np.log(j)) + if j<0: newTrueIntensities.append(0) else: newTrueIntensities.append(np.log(j)) @@ -1932,18 +2336,25 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): if getThumbnails: saveMe = [] for img in imgs: - saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) +# saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) #JOHN 011/09/2023 + saveMe.append(np.array(img)) #JOHN 011/09/2023 thumbnails = np.array(saveMe) - num_valid_imgs, x, y = imgs.shape - img_batch = np.reshape(imgs, (num_valid_imgs, x * y)).T + num_valid_imgs, x, y = imgs.shape #JOHN 11/20/2023 + +# img_batch = np.reshape(imgs, (num_valid_imgs, x * y)).T #JOHN 011/09/2023 + img_batch = imgs.T # print("Image values less than 0 setting to 0") img_batch[img_batch<0] = 0 + +# num_valid_imgs, x, y = img_batch.T.shape #JOHN 11/20/2023 +# print(num_valid_imgs, x, y) if getThumbnails: # print("FLattening thumbnails") num_valid_thumbnails, tx, ty = thumbnails.shape - thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T +# thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T #JOHN 011/09/2023 + thumbnail_batch = thumbnails.T if getThumbnails: nimg_batch = [] @@ -1951,17 +2362,36 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): ntrueIntensity_batch = [] for img, thumbnail, trueIntens in zip(img_batch.T, thumbnail_batch.T, origTrueIntensities): currIntensity = np.sum(img.flatten(), dtype=np.double) - nimg = self.imageProcessor.processImg(img, currIntensity) - nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) + if self.imageProcessor.centerImg: + nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) + else: + nimg = self.imageProcessor.processImg(img, currIntensity) +# nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) #JOHN 011/09/2023 + if nimg is None: + nthumbnail = None + else: + nthumbnail = nimg.copy() +# print(np.array(nimg).shape) +# print(nthumbnail) if nimg is not None: nimg_batch.append(nimg) nthumbnail_batch.append(nthumbnail) ntrueIntensity_batch.append(trueIntens) else: +# nimg_batch.append(np.zeros((x, y))) +# nthumbnail_batch.append(np.zeros((tx, ty))) +# ntrueIntensity_batch.append(0) num_valid_thumbnails -= 1 num_valid_imgs -= 1 - nimg_batch = np.array(nimg_batch).T - nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) + if self.imageProcessor.centerImg: #JOHN 011/09/2023 +# print("a09wupoidkw", np.array(nimg_batch).shape) + nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 + nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, self.imageProcessor.roi_h, self.imageProcessor.roi_w) #JOHN 011/09/2023 + else: #JOHN 011/09/2023 +# print("a09wupoidkw", np.array(nimg_batch).shape) +# print(num_valid_imgs, x, y) + nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T #JOHN 011/09/2023 + nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) 
#JOHN 011/09/2023 if fullimgs is None: fullimgs = nimg_batch fullthumbnails = nthumbnail_batch @@ -1974,12 +2404,23 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): for img in img_batch.T: currIntensity = np.sum(img.flatten(), dtype=np.double) # print("Starting image processing of size {}".format(img_batch.T.shape)) - nimg = self.imageProcessor.processImg(img, currIntensity) + nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) if nimg is not None: nimg_batch.append(nimg) else: +# nimg_batch.append(np.zeros((x, y))) num_valid_imgs -= 1 - nimg_batch = np.array(nimg_batch).T + +# nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 + + #JOHN 11/20/23 + if self.imageProcessor.centerImg: #JOHN 011/09/2023 + nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 + else: #JOHN 011/09/2023 + nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T #JOHN 011/09/2023 + + + # print(nimg_batch.shape) # print("hstacking") if fullimgs is None: @@ -2008,7 +2449,8 @@ def main(): userGroupings=[], includeABOD=True, skipSize=params.skip_size, - umap_n_neighbors=params.num_imgs_to_use // 4000, +# umap_n_neighbors=params.num_imgs_to_use // 4000, + umap_n_neighbors=params.num_imgs_to_use // 10000, umap_random_state=42, hdbscan_min_samples=int(params.num_imgs_to_use * 0.75 // 40), hdbscan_min_cluster_size=int(params.num_imgs_to_use // 40), From e4ac86e218334744f7a6c966060561be24745804 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 31 Dec 2023 12:19:17 -0800 Subject: [PATCH 52/57] run, script, scalingscript, scalingrun. Fixed bug where images were not being scaled correctly using the new boxing mechanism. --- btx/processing/freqdir.py | 42 +++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index bbf3a13be..8773bda21 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1291,6 +1291,7 @@ def genUMAP(self): if self.numImgsToUse==-1: self.numImgsToUse = len(imgs) + self.logging_numImgsToUse = len(imgs) self.imgs = imgs[:self.numImgsToUse:self.skipSize] self.projections = projections[:self.numImgsToUse:self.skipSize] @@ -1299,7 +1300,7 @@ def genUMAP(self): self.numImgsToUse = int(self.numImgsToUse/self.skipSize) if len(self.imgs)!= self.numImgsToUse: - raise TypeError("NUMBER OF IMAGES REQUESTED ({}) EXCEEDS NUMBER OF DATA POINTS PROVIDED ({})".format(len(self.imgs), self.numImgsToUse)) + raise TypeError("NUMBER OF IMAGES REQUESTED ({}) EXCEEDS NUMBER OF DATA POINTS PROVIDED ({}). 
TRUE LEN IS {}.".format(len(self.imgs), self.numImgsToUse, self.logging_numImgsToUse)) self.clusterable_embedding = self.umap.UMAP( n_neighbors=self.umap_n_neighbors, @@ -1783,7 +1784,9 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.currRun = None self.currRun = self.comm.bcast(self.currRun, root=0) +#JOHN CHANGE 12/30/2023 self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=150, roi_h = 150) + # self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=300, roi_h = 300) self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) # self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) @@ -1980,12 +1983,14 @@ def addThumbnailsToProjectH5(self): # print("Gathering thumbnails") startingPoint = self.start_offset + self.num_imgs*self.rank//self.size _,self.fullThumbnailData,_,self.trueIntensitiesData = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) + # print("FULL THUMBNAIL DATA: ", np.array(self.fullThumbnailData).shape) file_name = self.writeToHere+"{}_ProjectedData_{}.h5".format(self.currRun, self.rank) f1 = h5py.File(file_name, 'r+') f1.create_dataset("SmallImages", data=self.fullThumbnailData) f1.create_dataset("TrueIntensities", data=np.array(self.trueIntensitiesData)) f1.close() self.comm.barrier() + # print("FINISHED AIJOWDAWODIDWJA") class FD_ImageProcessing: def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile, unitVar, centerImg, roi_w, roi_h): @@ -2336,8 +2341,9 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): if getThumbnails: saveMe = [] for img in imgs: -# saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) #JOHN 011/09/2023 - saveMe.append(np.array(img)) #JOHN 011/09/2023 + #JOHN CHANGE 12/30/2023 + saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) #JOHN 011/09/2023 +# saveMe.append(np.array(img)) #JOHN 011/09/2023 thumbnails = np.array(saveMe) num_valid_imgs, x, y = imgs.shape #JOHN 11/20/2023 @@ -2387,18 +2393,36 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): # print("a09wupoidkw", np.array(nimg_batch).shape) nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, self.imageProcessor.roi_h, self.imageProcessor.roi_w) #JOHN 011/09/2023 + + ############################## + # JOHN 12/30/2023 + saveMe = [] + for img in nthumbnail_batch: + 
saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) + nthumbnail_batch = np.array(saveMe) + # print("a09wdjaoimd", nimg_batch.shape, nthumbnail_batch.shape) + # print(nthumbnail_batch.shape) + # JOHN 12/30/2023 + else: #JOHN 011/09/2023 -# print("a09wupoidkw", np.array(nimg_batch).shape) +# print("a09wupoidkw", np.arrayħnimg_batch).shape) # print(num_valid_imgs, x, y) nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T #JOHN 011/09/2023 nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) #JOHN 011/09/2023 - if fullimgs is None: + + + if fullimgs is None and nimg_batch.shape[1]!=0: fullimgs = nimg_batch fullthumbnails = nthumbnail_batch - elif len(nimg_batch)!=0: + # print("FULL IMGS IS NONE.", "nimgbatch shape", nimg_batch.shape, "fullthumbnailshape", fullthumbnails.shape, "nthumbnail shape", nthumbnail_batch.shape) + trueIntensities += ntrueIntensity_batch + # elif len(nimg_batch)!=0: + elif nimg_batch.shape[1]!=0: #JOHN CHANGE 12/31/2023 + # print("nimgbatch shape", nimg_batch.shape, "fullthumbnailshape", fullthumbnails.shape, "nthumbnail shape", nthumbnail_batch.shape) fullimgs = np.hstack((fullimgs, nimg_batch)) fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) - trueIntensities += ntrueIntensity_batch + # print("NEW: nimgbatch shape", nimg_batch.shape, "fullthumbnailshape", fullthumbnails.shape, "nthumbnail shape", nthumbnail_batch.shape) + trueIntensities += ntrueIntensity_batch else: nimg_batch = [] for img in img_batch.T: @@ -2425,7 +2449,8 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): # print("hstacking") if fullimgs is None: fullimgs = nimg_batch - elif len(nimg_batch)!=0: + # elif len(nimg_batch)!=0: #JOHN 12/31/2023 + elif nimg_batch.shape[1]!=0: # print(fullimgs.shape, nimg_batch.shape, nimg_batch) fullimgs = np.hstack((fullimgs, nimg_batch)) @@ -2458,6 +2483,7 @@ def main(): outlierQuantile=0.3) visMe.fullVisualize() visMe.userSave() + def parse_input(): """ Parse command line input. From 8f81c98cfdea5c4020367588a6063a7283acdc63 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 11 Jan 2024 09:57:50 -0800 Subject: [PATCH 53/57] Checkpoint. This is working nice. --- btx/processing/freqdir.py | 105 +++++++++++++++++++++++++------------- 1 file changed, 69 insertions(+), 36 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 8773bda21..54ef7e47c 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -205,17 +205,22 @@ def __init__( self.imgData = imgData self.imgsTracked = imgsTracked + self.fdTime = 0 + def run(self): """ Perform frequent directions matrix sketching on run subject to initialization parameters. 
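        Notes
        -----
        When samplingFactor < 1, the image block is first reduced with
        PrioritySampling before the Frequent Directions update, mirroring the
        body below (illustrative outline only):

            psamp = PrioritySampling(int(img_batch.shape[1]*samplingFactor), d)
            for row in img_batch.T:
                psamp.update(row)
            img_batch = np.array(psamp.sketch.get()).T

        Time spent in this sampling pass and in each row update is accumulated
        in self.fdTime via time.process_time().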
""" img_batch = self.imgData - if self.samplingFactor <1: + if self.samplingFactor<1: + st = time.process_time() psamp = PrioritySampling(int((img_batch.shape[1])*self.samplingFactor), self.d) for row in img_batch.T: psamp.update(row) img_batch = np.array(psamp.sketch.get()).T + et = time.process_time() + self.fdTime += et - st self.update_model(img_batch) # if self.mean is None: # self.mean = np.mean(img_batch, axis=1) @@ -252,36 +257,42 @@ def update_model(self, X): X: ndarray data to update matrix sketch with """ + + rankAdapt_increaseAmount = 50 + _, numIncorp = X.shape origNumIncorp = numIncorp - with TaskTimer(self.task_durations, "total update"): - if self.rank==0 and not self.merger: - print( - "Factoring {m} sample{s} into {n} sample, {q} component model...".format( - m=numIncorp, s="s" if numIncorp > 1 else "", n=self.num_incorporated_images, q=self.ell - ) + # with TaskTimer(self.task_durations, "total update"): + if self.rank==0 and not self.merger: + print( + "Factoring {m} sample{s} into {n} sample, {q} component model...".format( + m=numIncorp, s="s" if numIncorp > 1 else "", n=self.num_incorporated_images, q=self.ell ) - for row in X.T: - canRankAdapt = numIncorp > (self.ell + 15) - if self.nextZeroRow >= self.m: - if self.increaseEll and canRankAdapt and self.rankAdapt: - self.ell = self.ell + 10 - self.m = 2*self.ell - self.sketch = np.vstack((*self.sketch, np.zeros((20, self.d)))) - self.increaseEll = False - print("Increasing rank of process {} to {}".format(self.rank, self.ell)) - else: - copyBatch = self.sketch[self.ell:,:].copy() - self.rotate() - if canRankAdapt and self.rankAdapt: - reconError = np.sqrt(self.lowMemoryReconstructionErrorScaled(copyBatch)) - print("RANK ADAPT RECON ERROR: ", reconError) - if (reconError > self.rankAdaptMinError): - self.increaseEll = True - self.sketch[self.nextZeroRow,:] = row - self.nextZeroRow += 1 - self.num_incorporated_images += 1 - numIncorp -= 1 + ) + for row in X.T: + st = time.process_time() + canRankAdapt = numIncorp > (self.ell + 15) + if self.nextZeroRow >= self.m: + if self.increaseEll and canRankAdapt and self.rankAdapt: + self.ell = self.ell + rankAdapt_increaseAmount + self.m = 2*self.ell + self.sketch = np.vstack((*self.sketch, np.zeros((2*rankAdapt_increaseAmount, self.d)))) + self.increaseEll = False + print("Increasing rank of process {} to {}".format(self.rank, self.ell)) + else: + copyBatch = self.sketch[self.ell:,:].copy() + self.rotate() + if canRankAdapt and self.rankAdapt: + reconError = np.sqrt(self.lowMemoryReconstructionErrorScaled(copyBatch)) + print("RANK ADAPT RECON ERROR: ", reconError) + if (reconError > self.rankAdaptMinError): + self.increaseEll = True + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + self.num_incorporated_images += 1 + numIncorp -= 1 + et = time.process_time() + self.fdTime += et - st def rotate(self): """ @@ -550,7 +561,7 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output self.divBy = divBy - time.sleep(10) + # time.sleep(10) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] @@ -573,6 +584,8 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output self.currRun = currRun + self.mergeTime = 0 + def merge(self): """ Merge Frequent Direction Components in a tree-like fashion. 
@@ -620,6 +633,7 @@ def merge(self): # + hf["sketch"].attrs["numImgsIncorp"]) self.fullNumIncorp += hf["sketch"].attrs["numImgsIncorp"] self.fullImgsTracked = np.vstack((self.fullImgsTracked, hf["imgsTracked"][:])) + self.mergeTime = self.fd.fdTime return self.fd.get() else: return @@ -704,7 +718,7 @@ def __init__( # print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) # while(not os.path.isfile(readFile2)): # print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) - time.sleep(10) + # time.sleep(10) with h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] # self.mean = hf["mean"][:] @@ -718,6 +732,7 @@ def __init__( self.imgData = imgData + self.compTime = 0 def run(self): """ @@ -737,10 +752,13 @@ def apply_compression(self, X): X: ndarray data to project """ + st = time.process_time() if self.processedData is None: self.processedData = np.dot(X.T, self.components.T) else: self.processedData = np.vstack((self.processedData, np.dot(X.T, self.components.T))) + et = time.process_time() + self.compTime += et - st def write(self): """ @@ -1315,8 +1333,9 @@ def genUMAP(self): # min_cluster_size = self.hdbscan_min_cluster_size # ).fit_predict(self.clusterable_embedding) - ncircs = self.float_to_int_percentile(self.retrieveCircularity(self.imgs)) - self.labels = np.array(ncircs) + # ncircs = self.float_to_int_percentile(self.retrieveCircularity(self.imgs)) + # self.labels = np.array(ncircs) + self.labels = np.array(np.zeros(len(self.imgs))) exclusionList = np.array([]) self.clustered = np.isin(self.labels, exclusionList, invert=True) @@ -1784,6 +1803,9 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.currRun = None self.currRun = self.comm.bcast(self.currRun, root=0) + #JOHN CHANGE 01/08/2024 + self.newBareTime = 0 + #JOHN CHANGE 12/30/2023 self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=150, roi_h = 150) # self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=300, roi_h = 300) @@ -1900,6 +1922,7 @@ def genSynthData(self): # return (eigVecs @ (D) @ eigVecs.T) def compDecayingSVD(self, seedMe, a, b): + #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. 
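        # Descriptive note: this routine builds a synthetic data matrix
        # Q1 @ diag(S) @ Q2 whose spectrum decays geometrically.  With the
        # schedule S[j] *= 2**(-5*(j+1)/len(S)) the smallest factor is 2**-5
        # (about 0.03 of the largest); the commented-out "scaling run"
        # schedule 2**(-16*(j+1)/len(S)) decays much faster, down to roughly
        # 2**-16 (~1.5e-5).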
numFeats = a numSamps = b//self.size perturbation = np.random.rand(numSamps, numFeats)*0.1 @@ -1913,7 +1936,8 @@ def compDecayingSVD(self, seedMe, a, b): S.sort() S = S[::-1] for j in range(len(S)): #Modify - S[j] = (2**(-16*(j+1)/len(S)))*S[j] + # S[j] = (2**(-16*(j+1)/len(S)))*S[j] #SCALING RUN + S[j] = (2**(-5*(j+1)/len(S)))*S[j] #PARALLEL RUN: 01/10/2024 self.fullImgData = (Q1 @ np.diag(S) @ Q2).T self.imgsTracked = [(0, numSamps)] @@ -1940,6 +1964,7 @@ def runMe(self): print("{} STARTING SKETCHING FOR {}".format(self.rank, self.currRun)) st1 = time.perf_counter() freqDir.run() + self.newBareTime += freqDir.fdTime localSketchFilename = freqDir.write() et1 = time.perf_counter() print("Estimated time for frequent directions rank {0}/{1}: {2}".format(self.rank, self.size, et1 - st1)) @@ -1957,6 +1982,7 @@ def runMe(self): output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun, psi=self.psi, usePSI=self.usePSI) st2 = time.perf_counter() mergeTree.merge() + self.newBareTime += mergeTree.mergeTime mergedSketchFilename = mergeTree.write() et2 = time.perf_counter() print("Estimated time merge tree for rank {0}/{1}: {2}".format(self.rank, self.size, et2 - st2)) @@ -1966,11 +1992,12 @@ def runMe(self): appComp = ApplyCompression(comm=self.comm, rank = self.rank, size=self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run,det_type=self.det_type, readFile = mergedSketchFilename, output_dir = self.writeToHere, currRun = self.currRun, imgData = self.fullImgData) st3 = time.perf_counter() self.matSketch = appComp.run() + self.newBareTime += appComp.compTime appComp.write() et3 = time.perf_counter() print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et3 - st3)) print("Estimated full processing time for rank {0}/{1}: {2}, {3}".format(self.rank, self.size, (et1 + et2 + et3 - st1 - st2 - st3), et3 - stfull)) - self.addThumbnailsToProjectH5() + # self.addThumbnailsToProjectH5() #JOHN CHANGE 01/09/2024. Modifying this just because we don't need this to do our testing. 
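        # The returned value is the summed wall time of the sketching, merge
        # and projection stages (each measured around its compute plus the
        # HDF5 write); image retrieval and the thumbnail export are not included.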
return (et1 + et2 + et3 - st1 - st2 - st3) # self.comm.barrier() @@ -2267,6 +2294,8 @@ def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, t self.imageProcessor = imageProcessor + self.excludedImgs = [] + def split_range(self, start, end, num_tuples): if start==end: raise ValueError('Range processing error: start value equals end value, which leads to no images processed.') @@ -2366,7 +2395,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): nimg_batch = [] nthumbnail_batch = [] ntrueIntensity_batch = [] - for img, thumbnail, trueIntens in zip(img_batch.T, thumbnail_batch.T, origTrueIntensities): + for ind, (img, thumbnail, trueIntens) in enumerate(zip(img_batch.T, thumbnail_batch.T, origTrueIntensities)): currIntensity = np.sum(img.flatten(), dtype=np.double) if self.imageProcessor.centerImg: nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) @@ -2389,6 +2418,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): # ntrueIntensity_batch.append(0) num_valid_thumbnails -= 1 num_valid_imgs -= 1 + self.excludedImgs.append(ind) if self.imageProcessor.centerImg: #JOHN 011/09/2023 # print("a09wupoidkw", np.array(nimg_batch).shape) nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 @@ -2425,7 +2455,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): trueIntensities += ntrueIntensity_batch else: nimg_batch = [] - for img in img_batch.T: + for ind, img in enumerate(img_batch.T): currIntensity = np.sum(img.flatten(), dtype=np.double) # print("Starting image processing of size {}".format(img_batch.T.shape)) nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) @@ -2434,6 +2464,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): else: # nimg_batch.append(np.zeros((x, y))) num_valid_imgs -= 1 + self.excludedImgs.append(ind) # nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 @@ -2454,6 +2485,8 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): # print(fullimgs.shape, nimg_batch.shape, nimg_batch) fullimgs = np.hstack((fullimgs, nimg_batch)) + print("EXCLUDING IMAGES: ", self.excludedImgs) + # print("Images tracked:", imgsTracked) if getThumbnails: print(fullimgs.shape, fullthumbnails.shape, imgsTracked) From 931de83ec1fe31574e8a8f705279688c7e7daab6 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Mon, 11 Mar 2024 20:43:51 -0700 Subject: [PATCH 54/57] Pushing as a checkpoint. Not too sure what changed, but this code seems to work. 
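Among the visible changes: MergeTree gains a serialMerge() variant, WrapperFullFD gains a
modified_gram_schmidt() helper, the synthetic-data generator is reduced to plain Gaussian
data (the decaying-SVD variants are kept as comments), and the thumbnail export in runMe()
is re-enabled.

A quick orthonormality check for the new helper (illustrative sketch only; "wrapper" stands
for an already-constructed WrapperFullFD instance and is not defined by this patch):

    import numpy as np
    A = np.random.rand(500, 20)
    Q = wrapper.modified_gram_schmidt(A, 20)
    assert np.allclose(Q.T @ Q, np.eye(20), atol=1e-8)
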
--- btx/processing/freqdir.py | 189 +++++++++++++++++++++++--------------- 1 file changed, 116 insertions(+), 73 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 54ef7e47c..0dc2889ae 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -23,20 +23,21 @@ from matplotlib import pyplot as plt from matplotlib import colors -########################## -########################## -#JOHN CHANGE BACK AFTER 12/15/2023 -from btx.misc.shortcuts import TaskTimer - -from btx.interfaces.ipsana import ( - PsanaInterface, - bin_data, - bin_pixel_index_map, - retrieve_pixel_index_map, - assemble_image_stack_batch, -) -########################## -########################## +#JOHN: COMMENTED OUT AFTER 03/11/2024 +# ########################## +# ########################## +# #JOHN CHANGE BACK AFTER 12/15/2023 +# from btx.misc.shortcuts import TaskTimer + +# from btx.interfaces.ipsana import ( +# PsanaInterface, +# bin_data, +# bin_pixel_index_map, +# retrieve_pixel_index_map, +# assemble_image_stack_batch, +# ) +# ########################## +# ########################## from PIL import Image @@ -637,6 +638,39 @@ def merge(self): return self.fd.get() else: return + + def serialMerge(self): + """ + Merge Frequent Direction Components in a serial fashion. + Returns + ------- + finalSketch : ndarray + Merged matrix sketch of cumulative data + """ + + if self.rank==0: + for currWorkingCore in range(1, self.size): + bufferMe = np.empty(self.buffSizes[currWorkingCore] * self.data.shape[1], dtype=np.double) + self.comm.Recv(bufferMe, source=currWorkingCore, tag=currWorkingCore) + bufferMe = np.reshape(bufferMe, (self.buffSizes[currWorkingCore], self.data.shape[1])) + self.fd.update_model(np.hstack((bufferMe.T, np.zeros((bufferMe.shape[1],1))))) + else: + bufferMe = self.fd.get().copy().flatten() + self.comm.Send(bufferMe, dest=0, tag=self.rank) + + if self.rank==0: + for readMe in self.allWriteDirecs: + with h5py.File(readMe, 'r') as hf: + if self.fullNumIncorp==0: + self.fullNumIncorp = hf["sketch"].attrs["numImgsIncorp"] + self.fullImgsTracked = hf["imgsTracked"][:] + else: + self.fullNumIncorp += hf["sketch"].attrs["numImgsIncorp"] + self.fullImgsTracked = np.vstack((self.fullImgsTracked, hf["imgsTracked"][:])) + self.mergeTime = self.fd.fdTime + return self.fd.get() + else: + return def write(self): """ @@ -907,23 +941,13 @@ def reorientImg(nimg): def denoiseImg(image): - # Threshold the image to get a binary image _, binary_image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY) - - # Perform connected component labeling num_labels, labels_im = cv2.connectedComponents(binary_image) - - # Create a mask for components larger than the size threshold mask = np.zeros_like(image, dtype=bool) - size_threshold = 500 - - # Iterate through components and update the mask for large components for label in range(1, num_labels): if np.sum(labels_im == label) > size_threshold: mask[labels_im == label] = True - - # Apply the mask to the original grayscale image masked_image = np.zeros_like(image) masked_image[mask] = image[mask] return masked_image @@ -940,77 +964,45 @@ def center_and_crop_beam(image, threshold_value=127, blur_kernel=(5, 5), desired Returns: numpy.ndarray: The cropped image centered around the intensity pattern. 
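    Example (illustrative sketch of the intended behaviour, with made-up values):

        img = np.zeros((512, 512), dtype=float)
        img[240:260, 300:320] = 5.0          # bright blob away from the centre
        crop = center_and_crop_beam(img, desired_size=150)
        # crop.shape == (150, 150); None is returned if no contour is found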
""" - # Normalize or scale the image if image.dtype != np.uint8: - # If the range of your image is known (e.g., 0 to 5), normalize accordingly - # image = ((image - image.min()) / (image.max() - image.min())) * 255 - - # If the range is not known, scale based on the current min and max image = 255 * (image - image.min()) / (image.max() - image.min()) image = image.astype(np.uint8) - - # print(image[100]) - - # Ensure the image is in grayscale if len(image.shape) == 3: image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - # Step 1: Noise Reduction blurred = cv2.GaussianBlur(image, blur_kernel, 0) # blurred = image - # Step 2: Thresholding # _, thresh = cv2.threshold(blurred, threshold_value, 255, cv2.THRESH_BINARY) _, thresh = cv2.threshold(blurred, 100, 255, cv2.THRESH_BINARY) - # # Step 3: Locate the Beam # contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # if not contours: # return None # No contours found - # # Step 4: Determine the Bounding Box # beam = max(contours, key=cv2.contourArea) # x, y, w, h = cv2.boundingRect(beam) - # # Step 5: Centering and Cropping # cropped = image[y:y+h, x:x+w] # print(x, y, w, h) - # Locate the Beam contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # contours = [c for c in contours if cv2.contourArea(c) > min_contour_area] if not contours: return None - # Determine the Bounding Box beam = max(contours, key=cv2.contourArea) x, y, w, h = cv2.boundingRect(beam) - - # Find the center of the bounding box center_x, center_y = x + w // 2, y + h // 2 - - # Define new bounding box dimensions new_x = max(center_x - desired_size // 2, 0) new_y = max(center_y - desired_size // 2, 0) - - # print("new x: ", x, new_x) - # print("new y: ", y, new_y) - - # Adjust if the new box extends beyond the original image new_x = min(new_x, image.shape[1] - desired_size) new_y = min(new_y, image.shape[0] - desired_size) - - # print("new x: ", x, new_x) - # print("new y: ", y, new_y) - - # Crop the image to the new bounding box cropped = image[new_y:new_y + desired_size, new_x:new_x + desired_size] return cropped - # threshVal = 100 - nimgs = [] nbws = [] nbws1 = [] @@ -1807,7 +1799,7 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.newBareTime = 0 #JOHN CHANGE 12/30/2023 - self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=150, roi_h = 150) + self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=200, roi_h = 200) # self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=300, roi_h = 300) self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, 
thumbnailHeight = 64, thumbnailWidth = 64) # self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) @@ -1921,26 +1913,76 @@ def genSynthData(self): # D = np.diag(diag_entries) + np.eye(matrixSize) # return (eigVecs @ (D) @ eigVecs.T) + def modified_gram_schmidt(self, A, num_vecs): + m, n = A.shape + Q = np.zeros((m, num_vecs)) + for j in range(num_vecs): + v = A[:, j] + for i in range(j): + rij = Q[:, i].dot(A[:, j]) + v = v - rij * Q[:, i] + rjj = np.linalg.norm(v, 2) + Q[:, j] = v / rjj + print(f"COMPUTED VECTOR {j}/{num_vecs}") + return Q + def compDecayingSVD(self, seedMe, a, b): - #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. numFeats = a numSamps = b//self.size - perturbation = np.random.rand(numSamps, numFeats)*0.1 - np.random.seed(seedMe) - A1 = np.random.rand(numSamps, numFeats) - Q1, R1 = np.linalg.qr(A1) - Q1 = Q1 + perturbation - A2 = np.random.rand(numFeats, numFeats) #Modify - Q2, R2 = np.linalg.qr(A2) - S = list(np.random.rand(numFeats)) #Modify - S.sort() - S = S[::-1] - for j in range(len(S)): #Modify - # S[j] = (2**(-16*(j+1)/len(S)))*S[j] #SCALING RUN - S[j] = (2**(-5*(j+1)/len(S)))*S[j] #PARALLEL RUN: 01/10/2024 - self.fullImgData = (Q1 @ np.diag(S) @ Q2).T + self.fullImgData = np.random.randn(numFeats, numSamps) self.imgsTracked = [(0, numSamps)] + # def compDecayingSVD(self, seedMe, a, b): + # #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. + # print(1) + # np.random.seed(seedMe + self.rank) + # numFeats = a + # numSamps = b//self.size + # # perturbation = np.random.rand(numSamps, numFeats)*0.1 + # # print(2) + # A1 = np.random.rand(numSamps, numFeats) + # print(3) + # Q1 = self.modified_gram_schmidt(A1, numFeats) + # print(5) + # A2 = np.random.rand(numFeats, numFeats) #Modify + # print(6) + # Q2, R2 = np.linalg.qr(A2) + # print(7) + # S = list(np.random.rand(numFeats)) #Modify + # print(8) + # S.sort() + # print(9) + # S = S[::-1] + # print(10) + # for j in range(len(S)): #Modify + # # S[j] = (2**(-16*(j+1)/len(S)))*S[j] #SCALING RUN + # S[j] = (2**(-5*(j+1)/len(S)))*S[j] #PARALLEL RUN: 01/10/2024 + # print(11) + # self.fullImgData = (Q1 @ np.diag(S) @ Q2).T + # print(12) + # self.imgsTracked = [(0, numSamps)] + # print(13) + + # def compDecayingSVD(self, seedMe, a, b): + # #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. 
+ # numFeats = a + # numSamps = b//self.size + # perturbation = np.random.rand(numSamps, numFeats)*0.1 + # np.random.seed(seedMe) + # A1 = np.random.rand(numSamps, numFeats) + # Q1, R1 = np.linalg.qr(A1) + # Q1 = Q1 + perturbation + # A2 = np.random.rand(numFeats, numFeats) #Modify + # Q2, R2 = np.linalg.qr(A2) + # S = list(np.random.rand(numFeats)) #Modify + # S.sort() + # S = S[::-1] + # for j in range(len(S)): #Modify + # # S[j] = (2**(-16*(j+1)/len(S)))*S[j] #SCALING RUN + # S[j] = (2**(-5*(j+1)/len(S)))*S[j] #PARALLEL RUN: 01/10/2024 + # self.fullImgData = (Q1 @ np.diag(S) @ Q2).T + # self.imgsTracked = [(0, numSamps)] + def runMe(self): stfull = time.perf_counter() @@ -1982,6 +2024,7 @@ def runMe(self): output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun, psi=self.psi, usePSI=self.usePSI) st2 = time.perf_counter() mergeTree.merge() + # mergeTree.serialMerge() self.newBareTime += mergeTree.mergeTime mergedSketchFilename = mergeTree.write() et2 = time.perf_counter() @@ -1997,7 +2040,7 @@ def runMe(self): et3 = time.perf_counter() print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et3 - st3)) print("Estimated full processing time for rank {0}/{1}: {2}, {3}".format(self.rank, self.size, (et1 + et2 + et3 - st1 - st2 - st3), et3 - stfull)) - # self.addThumbnailsToProjectH5() #JOHN CHANGE 01/09/2024. Modifying this just because we don't need this to do our testing. + self.addThumbnailsToProjectH5() #JOHN CHANGE 01/09/2024. Modifying this just because we don't need this to do our testing. return (et1 + et2 + et3 - st1 - st2 - st3) # self.comm.barrier() From c4abf71a1163832c898168d518e262dc9f9e7928 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 12 Mar 2024 12:01:13 -0700 Subject: [PATCH 55/57] Modified the script to allow for elog submission. --- btx/processing/freqdir.py | 733 ++++++++++++-------------------------- 1 file changed, 226 insertions(+), 507 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 0dc2889ae..b044d61ae 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -23,31 +23,12 @@ from matplotlib import pyplot as plt from matplotlib import colors -#JOHN: COMMENTED OUT AFTER 03/11/2024 -# ########################## -# ########################## -# #JOHN CHANGE BACK AFTER 12/15/2023 -# from btx.misc.shortcuts import TaskTimer - -# from btx.interfaces.ipsana import ( -# PsanaInterface, -# bin_data, -# bin_pixel_index_map, -# retrieve_pixel_index_map, -# assemble_image_stack_batch, -# ) -# ########################## -# ########################## - - from PIL import Image from io import BytesIO import base64 from datetime import datetime -#import umap -#import hdbscan from sklearn.cluster import OPTICS, cluster_optics_dbscan from matplotlib import colors @@ -67,6 +48,12 @@ import cv2 +try: + import umap + import hdbscan +except: + print("UMAP NOT AVAILABLE") + class FreqDir(DimRed): """ @@ -370,33 +357,33 @@ def reconstructionError(self, matrixCentered): matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) -# def lowMemoryReconstructionErrorScaled(self, matrixCentered): -# """ -# Compute the low memory reconstruction error of the matrix sketch -# against given data. This is the same as reconstructionError, -# but estimates the norm computation and does not scale by the -# minimum projection matrix, but rather by the matrix norm itself. 
-# -# Parameters -# ---------- -# matrixCentered: ndarray -# Data to compare matrix sketch to -# -# Returns -# ------- -# float, -# Data subtracted by data projected onto sketched space, scaled by matrix elements -# """ -# matSketch = self.sketch[:self.ell, :] -# print("RANK ADAPTIVE SHAPE:",matrixCentered.shape, matSketch.shape) -## k = 10 -# matrixCenteredT = matrixCentered.T -# matSketchT = matSketch.T -# U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) -## G = U[:,:k] -# G = U -# return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 50)/ -# np.linalg.norm(matrixCenteredT, 'fro')**2) + def oldLowMemoryReconstructionErrorScaled(self, matrixCentered): + """ + Compute the low memory reconstruction error of the matrix sketch + against given data. This is the same as reconstructionError, + but estimates the norm computation and does not scale by the + minimum projection matrix, but rather by the matrix norm itself. + + Parameters + ---------- + matrixCentered: ndarray + Data to compare matrix sketch to + + Returns + ------- + float, + Data subtracted by data projected onto sketched space, scaled by matrix elements + """ + matSketch = self.sketch[:self.ell, :] + print("RANK ADAPTIVE SHAPE:",matrixCentered.shape, matSketch.shape) + # k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) + # G = U[:,:k] + G = U + return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 50)/ + np.linalg.norm(matrixCenteredT, 'fro')**2) def lowMemoryReconstructionErrorScaled(self, matrixCentered): matSketch = self.sketch[:self.ell, :] @@ -416,50 +403,50 @@ def estimFrobNormJ(self, addMe, arrs, k): sumMe += math.sqrt(1/k) * np.linalg.norm(randMat - minusMe, 'fro') return sumMe -# def estimFrobNormSquared(self, addMe, arrs, its): -# """ -# Estimate the Frobenius Norm of product of arrs matrices -# plus addME matrix using its iterations. -# -# Parameters -# ---------- -# arrs: list of ndarray -# Matrices to multiply together -# -# addMe: ndarray -# Matrix to add to others -# -# its: int -# Number of iterations to average over -# -# Returns -# ------- -# sumMe/its*no_rows : float -# Estimate of frobenius norm of product -# of arrs matrices plus addMe matrix -# -# Notes -# ----- -# Frobenius estimation is the expected value of matrix -# multiplied by random vector from multivariate normal distribution -# based on [1]. -# -# [1] Norm and Trace Estimation with Random Rank-one Vectors -# Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix -# Analysis and Applications 2021 42:1, 202-223 -# """ -# no_rows = arrs[-1].shape[1] -# v = np.random.normal(size=no_rows) -# v_hat = v / np.linalg.norm(v) -# sumMe = 0 -# for j in range(its): -# v = np.random.normal(size=no_rows) -# v_hat = v / np.linalg.norm(v) -# v_addMe = addMe @ v_hat -# for arr in arrs[::-1]: -# v_hat = arr @ v_hat -# sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 -# return sumMe/its*no_rows + def oldEstimFrobNormSquared(self, addMe, arrs, its): + """ + Estimate the Frobenius Norm of product of arrs matrices + plus addME matrix using its iterations. 
+ + Parameters + ---------- + arrs: list of ndarray + Matrices to multiply together + + addMe: ndarray + Matrix to add to others + + its: int + Number of iterations to average over + + Returns + ------- + sumMe/its*no_rows : float + Estimate of frobenius norm of product + of arrs matrices plus addMe matrix + + Notes + ----- + Frobenius estimation is the expected value of matrix + multiplied by random vector from multivariate normal distribution + based on [1]. + + [1] Norm and Trace Estimation with Random Rank-one Vectors + Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix + Analysis and Applications 2021 42:1, 202-223 + """ + no_rows = arrs[-1].shape[1] + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + sumMe = 0 + for j in range(its): + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + v_addMe = addMe @ v_hat + for arr in arrs[::-1]: + v_hat = arr @ v_hat + sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 + return sumMe/its*no_rows def gatherFreqDirsSerial(self): @@ -518,8 +505,7 @@ def write(self): filename : string Name of h5 file where sketch, mean of data, and indices of data processed is written """ -# self.comm.barrier() - filename = self.output_dir + '{}_sketch_{}.h5'.format(self.currRun, self.rank) + filename = self.output_dir + f'{self.currRun:04}_sketch_{self.rank}.h5' with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) # hf.create_dataset("mean", data=self.mean) @@ -562,7 +548,6 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output self.divBy = divBy - # time.sleep(10) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] @@ -570,7 +555,6 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) -# print(self.buffSizes) self.fd.update_model(self.data.T) @@ -676,7 +660,7 @@ def write(self): """ Write merged matrix sketch to h5 file """ - filename = self.output_dir + '{}_merge.h5'.format(self.currRun) + filename = self.output_dir + f'{self.currRun:04}_merge.h5' if self.rank==0: for ind in range(self.size): @@ -749,10 +733,6 @@ def __init__( readFile2 = readFile[:-3] + "_"+str(self.rank)+".h5" -# print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) -# while(not os.path.isfile(readFile2)): -# print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) - # time.sleep(10) with h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] # self.mean = hf["mean"][:] @@ -798,10 +778,9 @@ def write(self): """ Write projected data and downsampled data to h5 file """ - filename = self.output_dir + '{}_ProjectedData_{}.h5'.format(self.currRun, self.rank) + filename = self.output_dir + f'{self.currRun:04}_ProjectedData_{self.rank}.h5' with h5py.File(filename, 'w') as hf: hf.create_dataset("ProjectedData", data=self.processedData) -# print("CREATED FILE: ", filename) self.comm.barrier() return filename @@ -836,7 +815,6 @@ def get(self): ret = [] while self.queue: curr = heapq.heappop(self.queue)[-1] - #ret.append(curr[0]*max(curr[1], curr[2])/curr[2]) ret.append(curr[0]) return ret @@ -877,13 +855,10 @@ def update(self, vec): self.sketch.push(vec, pi, wi) - class visualizeFD: """ Visualize FD Dimension Reduction using UMAP and DBSCAN """ - umap = __import__('umap') - hdbscan = __import__('hdbscan') def __init__(self, inputFile, outputFile, numImgsToUse, 
nprocs, includeABOD, userGroupings, skipSize, umap_n_neighbors, umap_random_state, hdbscan_min_samples, hdbscan_min_cluster_size, optics_min_samples, optics_xi, optics_min_cluster_size, outlierQuantile): @@ -971,24 +946,9 @@ def center_and_crop_beam(image, threshold_value=127, blur_kernel=(5, 5), desired image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) blurred = cv2.GaussianBlur(image, blur_kernel, 0) - # blurred = image - - # _, thresh = cv2.threshold(blurred, threshold_value, 255, cv2.THRESH_BINARY) _, thresh = cv2.threshold(blurred, 100, 255, cv2.THRESH_BINARY) - - # contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - # if not contours: - # return None # No contours found - - # beam = max(contours, key=cv2.contourArea) - # x, y, w, h = cv2.boundingRect(beam) - - # cropped = image[y:y+h, x:x+w] - # print(x, y, w, h) - contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - # contours = [c for c in contours if cv2.contourArea(c) > min_contour_area] if not contours: return None @@ -1009,136 +969,34 @@ def center_and_crop_beam(image, threshold_value=127, blur_kernel=(5, 5), desired contours = [] contourImgs = [] for j in range(len(fullThumbnailData)): - # currImg = (fullThumbnailData[j]*(255/np.max(fullThumbnailData[j]))).astype('uint8').copy() - # nimg = currImg nimg = center_and_crop_beam(fullThumbnailData[j]) - # nimg = reorientImg(nimg) if nimg is None: continue nimg = reorientImg(nimg) nimg = denoiseImg(nimg) nimgs.append(nimg) - # nbws.append(nimg) - # (thresh, im_bw) = cv2.threshold((fullThumbnailData[j]*(255/np.max(fullThumbnailData[j]))).astype('uint8').copy(), 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU) - # print(nimg) - # print(j, np.max(nimg)) - # np.set_printoptions(threshold=np.inf, linewidth=np.inf) - # print(nimg) - (thresh, im_bw) = cv2.threshold(nimg, 0, 255, cv2.THRESH_BINARY) nbws.append(im_bw.copy()) (thresh1, im_bw1) = cv2.threshold(nimg, 0, 1, cv2.THRESH_BINARY) nbws1.append(im_bw1.copy()) - - # # Assuming 'im' is your grayscale image - # # Apply Gaussian blur to the image - # blurred = cv2.GaussianBlur(im_bw, (5, 5), 0) - # # Apply binary thresholding on the blurred image - # _, binary = cv2.threshold(blurred, 127, 255, cv2.THRESH_BINARY) - # # Find contours - # contourList, hierarchy = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - # # Find the largest contour based on area - # largest_contour = max(contourList, key=cv2.contourArea) - # contours.append(largest_contour) - # canvas = np.zeros(im_bw.shape, dtype='uint8') - # # Draw the largest contour in white - # cv2.drawContours(canvas, [largest_contour], -1, (255), 1) - # contourImgs.append(canvas) - - # # nbws.append(cv2.GaussianBlur(nimg, (5, 5), 0)) - # # nbws.append(im_bw) - # # nbws.append((fullThumbnailData[j]*(255/np.max(fullThumbnailData[j]))).astype('uint8').copy() - - # # ind = 356 - - # # plt.imshow(nimgs[ind]) - # # plt.show() - - # # print(nbws1[ind][80]) - - # # Calculate moments - # M = cv2.moments(nbws1[ind]) - # # Zeroth moment is the area - # area = M['m00'] - # epsilon = 0.01 * cv2.arcLength(contours[ind], True) - # approx = cv2.approxPolyDP(contours[ind], epsilon, True) - # # Calculate the perimeter - # perimeter = cv2.arcLength(approx, True) - # # Calculate circularity using moments - # circularity = 4 * 3.14159 * area / (perimeter * perimeter) - # print(circularity) - - # # Calculate moments - # M = cv2.moments(nbws1[ind]) - # ncirc = (M['m00']**2)/(2*math.pi*(M['mu20'] + M['mu02'])) - # print(ncirc) - circs = 
[] ncircs = [] - for ind in range(len(nbws)): - # # Calculate moments - # M = cv2.moments(nbws1[ind]) - # # Zeroth moment is the area - # area = M['m00'] - # epsilon = 0.01 * cv2.arcLength(contours[ind], True) - # approx = cv2.approxPolyDP(contours[ind], epsilon, True) - # # Calculate the perimeter - # perimeter = cv2.arcLength(approx, True) - # # Calculate circularity using moments - # circularity = 4 * 3.14159 * area / (perimeter * perimeter) - - # Calculate moments M = cv2.moments(nbws[ind]) try: ncirc = (M['m00']**2)/(2*math.pi*(M['mu20'] + M['mu02'])) except: ncirc = 1 - - # circs.append(circularity) ncircs.append(ncirc) - sorted_indices = np.argsort(ncircs) sorted_arrays = [nimgs[i] for i in sorted_indices] sorted_full = [fullThumbnailData[i] for i in sorted_indices] - - # import matplotlib.pyplot as plt - # import numpy as np - - # # Assuming 'images' is your list of 16 NumPy array images - # # For demonstration, creating 16 random 8x8 grayscale images - # images = [j for j in sorted_arrays[::len(sorted_arrays)//16]] - - # # Create a 4x4 grid of subplots - # fig, axs = plt.subplots(4, 4, figsize=(10, 10)) - - # # Flatten the array of axes for easy iteration - # axs = axs.ravel() - - # # Plot each image and add text - # for i in range(16): - # axs[i].imshow(images[i], cmap='jet', vmin=0, vmax=255) # Assuming grayscale images - # axs[i].text(50, 5, f"Image {i+1}", color='white', ha='center', va='center') - # axs[i].axis('off') # Turn off axis - - # plt.tight_layout() # Adjust subplots to fit into the figure area. - # plt.show() - - # ind=23 - # nimg = center_and_crop_beam(fullThumbnailData[40]) - # # plt.imshow(fullThumbnailData[ind]) - # plt.imshow(nimg) - # plt.show() - bigOrSmall = [1 if j>len(sorted_arrays)*10//16 else 0 for j in sorted_indices] -# np.savez(saveDir+'circularityImgs_{}.npz'.format(currRun), **{f'array_{i}': arr for i, arr in enumerate(nimgs)}, labels=bigOrSmall) - return ncircs def embeddable_image(self, data): img_data = np.uint8(cm.jet(data/max(data.flatten()))*255) -# image = Image.fromarray(img_data, mode='RGBA').resize((75, 75), Image.Resampling.BICUBIC) image = Image.fromarray(img_data, mode='RGBA') buffer = BytesIO() image.save(buffer, format='png') @@ -1166,8 +1024,8 @@ def genMedoids(self, medoidLabels, clusterPoints): for test_index, test_point in enumerate(lst): if math.isclose(test_point[0],medoid_point[0]) and math.isclose(test_point[1], medoid_point[1]): fin_ind = test_index -# medoid_lst.append((k, v[fin_ind][0])) - medoid_lst.append((k, v[fin_ind+1][0])) + # medoid_lst.append((k, v[fin_ind+1][0])) + medoid_lst.append((k, v[fin_ind][0])) return medoid_lst def relabel_to_closest_zero(self, labels): @@ -1209,10 +1067,8 @@ def fastABOD(self, pts, nsamples): ac = cpt - apt if math.isclose(np.linalg.norm(ab), 0.0) or math.isclose(np.linalg.norm(ac), 0.0): count += 1 -# print("TOO CLOSE") continue outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) -# print("CURRENT POINT: ", pts[a], test_list, outlier_factors, np.var(np.array(outlier_factors))) if(len(outlier_factors)==0): abofs.append(np.inf) else: @@ -1220,12 +1076,7 @@ def fastABOD(self, pts, nsamples): return abofs def getOutliers(self, lst): -# lstCopy = lst.copy() -# lstCopy.sort() -# quart10 = lstCopy[len(lstCopy)//divBy] - lstQuant = np.quantile(np.array(lst), self.outlierQuantile) -# print("AIDJWOIJDAOWIDJWAOIDJAWOIDWJA", lstQuant, lst) outlierInds = [] notOutlierInds = [] for j in range(len(lst)): @@ -1233,8 +1084,6 @@ def getOutliers(self, lst): 
outlierInds.append(j) else: notOutlierInds.append(j) -# print("OUTLIER INDS: ", outlierInds) -# print("NOT OUTLIER INDS: ", notOutlierInds) return np.array(outlierInds), np.array(notOutlierInds) def genHist(self, vals, endClass): @@ -1251,23 +1100,15 @@ def genLeftRight(self, endClass): def float_to_int_percentile(self, float_list): - # Edge case: If the list is empty, return an empty list if not float_list: return [] - - # Calculate the percentiles that define the bin edges percentiles = np.percentile(float_list, [10 * i for i in range(1, 10)]) - - # Function to find the bin for a single value def find_bin(value): for i, p in enumerate(percentiles): if value < p: return i - return 9 # For values in the highest bin - - # Convert each float to an integer based on its bin + return 9 int_list = [find_bin(value) for value in float_list] - return int_list @@ -1312,48 +1153,31 @@ def genUMAP(self): if len(self.imgs)!= self.numImgsToUse: raise TypeError("NUMBER OF IMAGES REQUESTED ({}) EXCEEDS NUMBER OF DATA POINTS PROVIDED ({}). TRUE LEN IS {}.".format(len(self.imgs), self.numImgsToUse, self.logging_numImgsToUse)) - self.clusterable_embedding = self.umap.UMAP( + self.clusterable_embedding = umap.UMAP( n_neighbors=self.umap_n_neighbors, random_state=self.umap_random_state, n_components=2, min_dist=0, -# min_dist=0.1, ).fit_transform(self.projections) -# self.labels = self.hdbscan.HDBSCAN( -# min_samples = self.hdbscan_min_samples, -# min_cluster_size = self.hdbscan_min_cluster_size -# ).fit_predict(self.clusterable_embedding) - - # ncircs = self.float_to_int_percentile(self.retrieveCircularity(self.imgs)) - # self.labels = np.array(ncircs) - self.labels = np.array(np.zeros(len(self.imgs))) + self.labels = hdbscan.HDBSCAN( + min_samples = self.hdbscan_min_samples, + min_cluster_size = self.hdbscan_min_cluster_size + ).fit_predict(self.clusterable_embedding) exclusionList = np.array([]) self.clustered = np.isin(self.labels, exclusionList, invert=True) self.opticsClust = OPTICS(min_samples=self.optics_min_samples, xi=self.optics_xi, min_cluster_size=self.optics_min_cluster_size) self.opticsClust.fit(self.clusterable_embedding) -# self.opticsLabels = cluster_optics_dbscan( -# reachability=self.opticsClust.reachability_, -# core_distances=self.opticsClust.core_distances_, -# ordering=self.opticsClust.ordering_, -# eps=2.5, -# ) self.opticsLabels = self.opticsClust.labels_ self.experData_df = pd.DataFrame({'x':self.clusterable_embedding[self.clustered, 0],'y':self.clusterable_embedding[self.clustered, 1]}) self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) self.experData_df['imgind'] = np.arange(self.numImgsToUse)*self.skipSize -# self.experData_df['trueIntensities'] = [str(int(abs(x)/max(np.abs(trueIntensities))*19)) for x in trueIntensities] -# self.experData_df['trueIntensities'] = [5 for x in trueIntensities] -# self.experData_df['trueIntensities_backgroundColor'] = [Plasma256[int(abs(x)/max(np.abs(trueIntensities))*255)] for x in trueIntensities] -# self.experData_df['trueIntensities_backgroundColor'] = [5 for x in trueIntensities] -# print("aowdijaoidjwaoij", len(self.experData_df['trueIntensities']), self.experData_df['trueIntensities'], type(self.experData_df['trueIntensities'])) -# print(trueIntensities) - self.experData_df['trueIntensities'] = [1 for x in self.experData_df['imgind']] - self.experData_df['trueIntensities_backgroundColor'] = [1 for x in self.experData_df['imgind']] + self.experData_df['trueIntensities'] = 
[str(int(abs(x)/max(np.abs(trueIntensities))*19)) for x in trueIntensities] + self.experData_df['trueIntensities_backgroundColor'] = [Plasma256[int(abs(x)/max(np.abs(trueIntensities))*255)] for x in trueIntensities] def genABOD(self): @@ -1372,8 +1196,6 @@ def genABOD(self): self.experData_df['anomDet'] = outlierLabels self.experData_df['anom_backgroundColor'] = [Category20[20][int(x)] for x in outlierLabels] - print("2adwjiaomd", len(self.experData_df['anomDet']), self.experData_df['anomDet'], type(self.experData_df['anomDet'])) - def setUserGroupings(self, userGroupings): """ Set User Grouping. An adjustment is made at the beginning of this function, @@ -1419,14 +1241,13 @@ def genLabels(self): opticsNewLabels.append(j) opticsNewLabels = list(np.array(opticsNewLabels) + 1) self.opticsNewLabels = np.array(self.relabel_to_closest_zero(opticsNewLabels)) -# self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels[self.opticsClust.ordering_]] self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels] def genHTML(self): datasource = ColumnDataSource(self.experData_df) #JOHN CHANGE 20231020 -# color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) - color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Plasma256[::16]) + color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) + # color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Plasma256[::16]) plot_figure = figure( title='UMAP projection with DBSCAN clustering of the LCLS dataset', tools=('pan, wheel_zoom, reset, lasso_select'), @@ -1745,6 +1566,7 @@ def userShow(self): output_notebook() show(self.viewResults) + class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. 
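    Notes
    -----
    runMe() drives the per-rank pipeline: sketch the local image block with
    FreqDir, combine the per-rank sketches with MergeTree, project the data
    onto the merged sketch with ApplyCompression, then append thumbnails to
    the projected-data HDF5 file.  A minimal driver sketch (argument values
    are placeholders and the constructor takes more options than shown):

        wrapper = WrapperFullFD(exp="myexp", run=42, det_type="mydet", ...)
        wrapper.retrieveImages()          # or genSynthData()/compDecayingSVD(...)
        elapsed = wrapper.runMe()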
@@ -1790,90 +1612,87 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.psi = None if self.rank==0: - self.currRun = run #datetime.now().strftime("%y%m%d%H%M%S") + self.currRun = run else: self.currRun = None self.currRun = self.comm.bcast(self.currRun, root=0) - #JOHN CHANGE 01/08/2024 self.newBareTime = 0 -#JOHN CHANGE 12/30/2023 self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=200, roi_h = 200) - # self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=300, roi_h = 300) + # self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = False, roi_w=500, roi_h = 500) self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) -# self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) - -# def lowMemoryReconstructionErrorScaled(self, matrixCentered, matSketch): -# """ -# Compute the low memory reconstruction error of the matrix sketch -# against given data. This is the same as reconstructionError, -# but estimates the norm computation and does not scale by the -# minimum projection matrix, but rather by the matrix norm itself. -# -# Parameters -# ---------- -# matrixCentered: ndarray -# Data to compare matrix sketch to -# -# Returns -# ------- -# float, -# Data subtracted by data projected onto sketched space, scaled by matrix elements -# """ -## k = 10 -# matrixCenteredT = matrixCentered.T -# matSketchT = matSketch.T -# U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) -## G = U[:,:k] -# G = U -# return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 50)/ -# np.linalg.norm(matrixCenteredT, 'fro')**2) -# -# def estimFrobNormSquared(self, addMe, arrs, its): -# """ -# Estimate the Frobenius Norm of product of arrs matrices -# plus addME matrix using its iterations. -# -# Parameters -# ---------- -# arrs: list of ndarray -# Matrices to multiply together -# -# addMe: ndarray -# Matrix to add to others -## -# its: int -# Number of iterations to average over -# -# Returns -# ------- -# sumMe/its*no_rows : float -# Estimate of frobenius norm of product -# of arrs matrices plus addMe matrix -# -# Notes -# ----- -# Frobenius estimation is the expected value of matrix -# multiplied by random vector from multivariate normal distribution -# based on [1]. 
-# -# [1] Norm and Trace Estimation with Random Rank-one Vectors -# Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix -# Analysis and Applications 2021 42:1, 202-223 -# """ -# no_rows = arrs[-1].shape[1] -# v = np.random.normal(size=no_rows) -# v_hat = v / np.linalg.norm(v) -# sumMe = 0 -# for j in range(its): -# v = np.random.normal(size=no_rows) -# v_hat = v / np.linalg.norm(v) -# v_addMe = addMe @ v_hat -# for arr in arrs[::-1]: -# v_hat = arr @ v_hat -# sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 -# return sumMe/its*no_rows + + def oldLowMemoryReconstructionErrorScaled1(self, matrixCentered, matSketch): + """ + Compute the low memory reconstruction error of the matrix sketch + against given data. This is the same as reconstructionError, + but estimates the norm computation and does not scale by the + minimum projection matrix, but rather by the matrix norm itself. + + Parameters + ---------- + matrixCentered: ndarray + Data to compare matrix sketch to + + Returns + ------- + float, + Data subtracted by data projected onto sketched space, scaled by matrix elements + """ + # k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) + # G = U[:,:k] + G = U + return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 50)/ + np.linalg.norm(matrixCenteredT, 'fro')**2) + + def oldEstimFrobNormSquared1(self, addMe, arrs, its): + """ + Estimate the Frobenius Norm of product of arrs matrices + plus addME matrix using its iterations. + + Parameters + ---------- + arrs: list of ndarray + Matrices to multiply together + + addMe: ndarray + Matrix to add to others + # + its: int + Number of iterations to average over + + Returns + ------- + sumMe/its*no_rows : float + Estimate of frobenius norm of product + of arrs matrices plus addMe matrix + + Notes + ----- + Frobenius estimation is the expected value of matrix + multiplied by random vector from multivariate normal distribution + based on [1]. 
+ + [1] Norm and Trace Estimation with Random Rank-one Vectors + Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix + Analysis and Applications 2021 42:1, 202-223 + """ + no_rows = arrs[-1].shape[1] + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + sumMe = 0 + for j in range(its): + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + v_addMe = addMe @ v_hat + for arr in arrs[::-1]: + v_hat = arr @ v_hat + sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 + return sumMe/its*no_rows def lowMemoryReconstructionErrorScaled(self, matrixCentered, matSketch): matrixCenteredT = matrixCentered.T @@ -1895,12 +1714,13 @@ def estimFrobNormJ(self, addMe, arrs, k): def retrieveImages(self): startingPoint = self.start_offset + self.num_imgs*self.rank//self.size self.fullImgData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=False) + np.save('/sdf/home/w/winnicki/john_20240312.npy', self.fullImgData) def genSynthData(self): self.fullImgData = np.random.rand(70000, 100000//self.size) self.imgsTracked = [(0, self.rank)] -# def genDecayingSVD(self): +# def oldGenDecayingSVD3(self): # numFeats = 70000 # numSamps = 100000//self.size # A = np.random.rand(matrixSize, matrixSize) @@ -1932,56 +1752,44 @@ def compDecayingSVD(self, seedMe, a, b): self.fullImgData = np.random.randn(numFeats, numSamps) self.imgsTracked = [(0, numSamps)] - # def compDecayingSVD(self, seedMe, a, b): - # #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. - # print(1) - # np.random.seed(seedMe + self.rank) - # numFeats = a - # numSamps = b//self.size - # # perturbation = np.random.rand(numSamps, numFeats)*0.1 - # # print(2) - # A1 = np.random.rand(numSamps, numFeats) - # print(3) - # Q1 = self.modified_gram_schmidt(A1, numFeats) - # print(5) - # A2 = np.random.rand(numFeats, numFeats) #Modify - # print(6) - # Q2, R2 = np.linalg.qr(A2) - # print(7) - # S = list(np.random.rand(numFeats)) #Modify - # print(8) - # S.sort() - # print(9) - # S = S[::-1] - # print(10) - # for j in range(len(S)): #Modify - # # S[j] = (2**(-16*(j+1)/len(S)))*S[j] #SCALING RUN - # S[j] = (2**(-5*(j+1)/len(S)))*S[j] #PARALLEL RUN: 01/10/2024 - # print(11) - # self.fullImgData = (Q1 @ np.diag(S) @ Q2).T - # print(12) - # self.imgsTracked = [(0, numSamps)] - # print(13) - - # def compDecayingSVD(self, seedMe, a, b): - # #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. - # numFeats = a - # numSamps = b//self.size - # perturbation = np.random.rand(numSamps, numFeats)*0.1 - # np.random.seed(seedMe) - # A1 = np.random.rand(numSamps, numFeats) - # Q1, R1 = np.linalg.qr(A1) - # Q1 = Q1 + perturbation - # A2 = np.random.rand(numFeats, numFeats) #Modify - # Q2, R2 = np.linalg.qr(A2) - # S = list(np.random.rand(numFeats)) #Modify - # S.sort() - # S = S[::-1] - # for j in range(len(S)): #Modify - # # S[j] = (2**(-16*(j+1)/len(S)))*S[j] #SCALING RUN - # S[j] = (2**(-5*(j+1)/len(S)))*S[j] #PARALLEL RUN: 01/10/2024 - # self.fullImgData = (Q1 @ np.diag(S) @ Q2).T - # self.imgsTracked = [(0, numSamps)] + def oldCompDecayingSVD1(self, seedMe, a, b): + #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. 
+ np.random.seed(seedMe + self.rank) + numFeats = a + numSamps = b//self.size + # perturbation = np.random.rand(numSamps, numFeats)*0.1 + A1 = np.random.rand(numSamps, numFeats) + Q1 = self.modified_gram_schmidt(A1, numFeats) + A2 = np.random.rand(numFeats, numFeats) #Modify + Q2, R2 = np.linalg.qr(A2) + S = list(np.random.rand(numFeats)) #Modify + S.sort() + S = S[::-1] + for j in range(len(S)): #Modify + # S[j] = (2**(-16*(j+1)/len(S)))*S[j] #SCALING RUN + S[j] = (2**(-5*(j+1)/len(S)))*S[j] #PARALLEL RUN: 01/10/2024 + self.fullImgData = (Q1 @ np.diag(S) @ Q2).T + self.imgsTracked = [(0, numSamps)] + + def oldCompDecayingSVD2(self, seedMe, a, b): + #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. + numFeats = a + numSamps = b//self.size + perturbation = np.random.rand(numSamps, numFeats)*0.1 + np.random.seed(seedMe) + A1 = np.random.rand(numSamps, numFeats) + Q1, R1 = np.linalg.qr(A1) + Q1 = Q1 + perturbation + A2 = np.random.rand(numFeats, numFeats) #Modify + Q2, R2 = np.linalg.qr(A2) + S = list(np.random.rand(numFeats)) #Modify + S.sort() + S = S[::-1] + for j in range(len(S)): #Modify + # S[j] = (2**(-16*(j+1)/len(S)))*S[j] #SCALING RUN + S[j] = (2**(-5*(j+1)/len(S)))*S[j] #PARALLEL RUN: 01/10/2024 + self.fullImgData = (Q1 @ np.diag(S) @ Q2).T + self.imgsTracked = [(0, numSamps)] def runMe(self): @@ -2040,27 +1848,18 @@ def runMe(self): et3 = time.perf_counter() print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et3 - st3)) print("Estimated full processing time for rank {0}/{1}: {2}, {3}".format(self.rank, self.size, (et1 + et2 + et3 - st1 - st2 - st3), et3 - stfull)) - self.addThumbnailsToProjectH5() #JOHN CHANGE 01/09/2024. Modifying this just because we don't need this to do our testing. 
+ self.addThumbnailsToProjectH5() return (et1 + et2 + et3 - st1 - st2 - st3) -# self.comm.barrier() -# self.comm.Barrier() -# filenameTest3 = random.randint(0, 10) -# filenameTest3 = self.comm.allgather(filenameTest3) -# print("TEST 3: ", self.rank, filenameTest3) - def addThumbnailsToProjectH5(self): -# print("Gathering thumbnails") startingPoint = self.start_offset + self.num_imgs*self.rank//self.size _,self.fullThumbnailData,_,self.trueIntensitiesData = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) - # print("FULL THUMBNAIL DATA: ", np.array(self.fullThumbnailData).shape) - file_name = self.writeToHere+"{}_ProjectedData_{}.h5".format(self.currRun, self.rank) + file_name = self.writeToHere+f"{self.currRun:04}_ProjectedData_{self.rank}.h5" f1 = h5py.File(file_name, 'r+') f1.create_dataset("SmallImages", data=self.fullThumbnailData) f1.create_dataset("TrueIntensities", data=np.array(self.trueIntensitiesData)) f1.close() self.comm.barrier() - # print("FINISHED AIJOWDAWODIDWJA") class FD_ImageProcessing: def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile, unitVar, centerImg, roi_w, roi_h): @@ -2128,10 +1927,8 @@ def normalizeIntensityFunc(self, img, currIntensity): if img is None: return img elif currIntensity0: -# newTrueIntensities.append(0) -# else: -# newTrueIntensities.append(np.log(j)) if j<0: newTrueIntensities.append(0) else: newTrueIntensities.append(np.log(j)) origTrueIntensities = newTrueIntensities -# jimgs = [] -# for img in imgs: -# jimgs.append(self.imageProcessor.centerImgFunc(self.imageProcessor.thresholdFunc(img),100,100)) -# imgs = np.array(jimgs) - if getThumbnails: saveMe = [] for img in imgs: - #JOHN CHANGE 12/30/2023 - saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) #JOHN 011/09/2023 -# saveMe.append(np.array(img)) #JOHN 011/09/2023 + saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) thumbnails = np.array(saveMe) - num_valid_imgs, x, y = imgs.shape #JOHN 11/20/2023 + num_valid_imgs, x, y = imgs.shape -# img_batch = np.reshape(imgs, (num_valid_imgs, x * y)).T #JOHN 011/09/2023 img_batch = imgs.T -# print("Image values less than 0 setting to 0") img_batch[img_batch<0] = 0 - -# num_valid_imgs, x, y = img_batch.T.shape #JOHN 11/20/2023 -# print(num_valid_imgs, x, y) if getThumbnails: -# print("FLattening thumbnails") num_valid_thumbnails, tx, ty = thumbnails.shape -# thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T #JOHN 011/09/2023 thumbnail_batch = thumbnails.T if getThumbnails: @@ -2444,93 +2194,63 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) else: nimg = self.imageProcessor.processImg(img, currIntensity) -# nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) #JOHN 011/09/2023 if nimg is None: nthumbnail = None else: nthumbnail = nimg.copy() -# print(np.array(nimg).shape) -# print(nthumbnail) if nimg is not None: nimg_batch.append(nimg) nthumbnail_batch.append(nthumbnail) ntrueIntensity_batch.append(trueIntens) else: -# nimg_batch.append(np.zeros((x, y))) -# nthumbnail_batch.append(np.zeros((tx, ty))) -# ntrueIntensity_batch.append(0) num_valid_thumbnails -= 1 num_valid_imgs -= 1 self.excludedImgs.append(ind) - if self.imageProcessor.centerImg: #JOHN 011/09/2023 -# 
print("a09wupoidkw", np.array(nimg_batch).shape) - nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 - nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, self.imageProcessor.roi_h, self.imageProcessor.roi_w) #JOHN 011/09/2023 + if self.imageProcessor.centerImg: + nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T + nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, self.imageProcessor.roi_h, self.imageProcessor.roi_w) - ############################## - # JOHN 12/30/2023 saveMe = [] for img in nthumbnail_batch: saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) nthumbnail_batch = np.array(saveMe) - # print("a09wdjaoimd", nimg_batch.shape, nthumbnail_batch.shape) - # print(nthumbnail_batch.shape) - # JOHN 12/30/2023 - - else: #JOHN 011/09/2023 -# print("a09wupoidkw", np.arrayħnimg_batch).shape) -# print(num_valid_imgs, x, y) - nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T #JOHN 011/09/2023 - nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) #JOHN 011/09/2023 + + else: + nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T + nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) if fullimgs is None and nimg_batch.shape[1]!=0: fullimgs = nimg_batch fullthumbnails = nthumbnail_batch - # print("FULL IMGS IS NONE.", "nimgbatch shape", nimg_batch.shape, "fullthumbnailshape", fullthumbnails.shape, "nthumbnail shape", nthumbnail_batch.shape) trueIntensities += ntrueIntensity_batch - # elif len(nimg_batch)!=0: - elif nimg_batch.shape[1]!=0: #JOHN CHANGE 12/31/2023 - # print("nimgbatch shape", nimg_batch.shape, "fullthumbnailshape", fullthumbnails.shape, "nthumbnail shape", nthumbnail_batch.shape) + elif nimg_batch.shape[1]!=0: fullimgs = np.hstack((fullimgs, nimg_batch)) fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) - # print("NEW: nimgbatch shape", nimg_batch.shape, "fullthumbnailshape", fullthumbnails.shape, "nthumbnail shape", nthumbnail_batch.shape) trueIntensities += ntrueIntensity_batch else: nimg_batch = [] for ind, img in enumerate(img_batch.T): currIntensity = np.sum(img.flatten(), dtype=np.double) -# print("Starting image processing of size {}".format(img_batch.T.shape)) nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) if nimg is not None: nimg_batch.append(nimg) else: -# nimg_batch.append(np.zeros((x, y))) num_valid_imgs -= 1 self.excludedImgs.append(ind) -# nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 - - #JOHN 11/20/23 - if self.imageProcessor.centerImg: #JOHN 011/09/2023 - nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 - else: #JOHN 011/09/2023 - nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T #JOHN 011/09/2023 - - + if self.imageProcessor.centerImg: + nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T + else: + nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T -# print(nimg_batch.shape) -# print("hstacking") if fullimgs is None: fullimgs = nimg_batch - # elif len(nimg_batch)!=0: #JOHN 12/31/2023 elif nimg_batch.shape[1]!=0: -# print(fullimgs.shape, 
nimg_batch.shape, nimg_batch) fullimgs = np.hstack((fullimgs, nimg_batch)) print("EXCLUDING IMAGES: ", self.excludedImgs) -# print("Images tracked:", imgsTracked) if getThumbnails: print(fullimgs.shape, fullthumbnails.shape, imgsTracked) return (fullimgs, fullthumbnails, imgsTracked, trueIntensities) @@ -2542,16 +2262,15 @@ def main(): Perform Frequent Direction Visualization. """ params = parse_input() - os.makedirs(os.path.join(params.outdir, "figs"), exist_ok=True) visMe = visualizeFD(inputFile=params.outdir + f"/{params.run:04}_ProjectedData", - outputFile=params.outdir + f"figs/UMAPVis_{params.run:04}.html", + outputFile=params.outdir + f"/UMAPVis_{params.run:04}.html", numImgsToUse=params.num_imgs, nprocs=params.nprocs, userGroupings=[], includeABOD=True, skipSize=params.skip_size, # umap_n_neighbors=params.num_imgs_to_use // 4000, - umap_n_neighbors=params.num_imgs_to_use // 10000, + umap_n_neighbors= 15, umap_random_state=42, hdbscan_min_samples=int(params.num_imgs_to_use * 0.75 // 40), hdbscan_min_cluster_size=int(params.num_imgs_to_use // 40), From a254014fc167eef549892b0195989123ab8ed58d Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 12 Mar 2024 12:49:54 -0700 Subject: [PATCH 56/57] I don't think I changed anything. --- btx/processing/freqdir.py | 1 - 1 file changed, 1 deletion(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index b044d61ae..523c325ad 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1714,7 +1714,6 @@ def estimFrobNormJ(self, addMe, arrs, k): def retrieveImages(self): startingPoint = self.start_offset + self.num_imgs*self.rank//self.size self.fullImgData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=False) - np.save('/sdf/home/w/winnicki/john_20240312.npy', self.fullImgData) def genSynthData(self): self.fullImgData = np.random.rand(70000, 100000//self.size) From 1449dac863c193a60b499044f19e0fd82600ddb9 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Mon, 1 Jul 2024 10:02:53 -0700 Subject: [PATCH 57/57] Not sure what changed --- btx/processing/freqdir.py | 565 ++++++++++++++++++++++++++------------ 1 file changed, 388 insertions(+), 177 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 523c325ad..1a355fd50 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -40,12 +40,14 @@ from bokeh.plotting import figure, show, output_file, save from bokeh.models import HoverTool, CategoricalColorMapper, LinearColorMapper, ColumnDataSource, CustomJS, Slider, RangeSlider, Toggle, RadioButtonGroup, Range1d, Label from bokeh.models import CustomJS, ColumnDataSource, Span, PreText -from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3, Plasma256, Plasma11 +from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3, Plasma256, Plasma11, Plasma10, Inferno256 from bokeh.layouts import column, row import cProfile import string +import pickle + import cv2 try: @@ -861,7 +863,7 @@ class visualizeFD: """ def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, userGroupings, skipSize, umap_n_neighbors, umap_random_state, hdbscan_min_samples, hdbscan_min_cluster_size, - optics_min_samples, optics_xi, optics_min_cluster_size, outlierQuantile): + optics_min_samples, optics_xi, optics_min_cluster_size, outlierQuantile, ): self.inputFile = inputFile self.outputFile = outputFile 
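The main() hunk above wires parse_input() into visualizeFD: the per-rank projected-data files are located from outdir and the zero-padded run number, the HTML report is written alongside them, umap_n_neighbors is now pinned at 15, and the HDBSCAN sizes are derived from the number of images used. A small sketch of that wiring follows, assuming params exposes outdir, run, nprocs and num_imgs_to_use as parsed by parse_input(); vis_paths and cluster_params are illustrative names only, not functions in freqdir.py.

    import os

    def vis_paths(outdir, run, nprocs):
        # One HDF5 file per MPI rank, e.g. .../0214_ProjectedData_3.h5
        rank_files = [os.path.join(outdir, f"{run:04}_ProjectedData_{rank}.h5")
                      for rank in range(nprocs)]
        html_out = os.path.join(outdir, f"UMAPVis_{run:04}.html")
        return rank_files, html_out

    def cluster_params(num_imgs_to_use):
        # Heuristics mirrored from the visualizeFD call above
        return {
            "umap_n_neighbors": 15,
            "hdbscan_min_samples": int(num_imgs_to_use * 0.75 // 40),
            "hdbscan_min_cluster_size": int(num_imgs_to_use // 40),
        }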
output_file(filename=outputFile, title="Static HTML file") @@ -1113,27 +1115,58 @@ def find_bin(value): def genUMAP(self): - - imgs = None projections = None trueIntensities = None - for currRank in range(self.nprocs): - with h5py.File(self.inputFile+"_"+str(currRank)+".h5", 'r') as hf: - if imgs is None: - imgs = hf["SmallImages"][:] - projections = hf["ProjectedData"][:] - trueIntensities = hf["TrueIntensities"][:] - else: - imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) - projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) - trueIntensities = np.concatenate((trueIntensities, hf["TrueIntensities"][:]), axis=0) - print(len(imgs)) - - for intensMe in trueIntensities: - print(intensMe) - if(np.isnan(intensMe)): - print("This is NAN") + + runlbs = [] + currlb = 0 + currLen = 0 + + skipNums = [229, 231, 232, 248, 275, 305, 306, 309] + for iirun in range(214, 215, 1): + if iirun in skipNums: + print(f"SKIPPING RUN {iirun}") + continue + else: + for currRank in range(self.nprocs): + with h5py.File(self.inputFile+f"{iirun:04}_ProjectedData" + "_"+str(currRank)+".h5", 'r') as hf: + print(f"PROCESSING: " + self.inputFile+f"{iirun:04}_ProjectedData" + "_"+str(currRank)+".h5") + if imgs is None: + imgs = hf["SmallImages"][:] + projections = hf["ProjectedData"][:] + trueIntensities = hf["TrueIntensities"][:] + else: + imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) + projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) + trueIntensities = np.concatenate((trueIntensities, hf["TrueIntensities"][:]), axis=0) + currLen += len(hf["TrueIntensities"][:]) + for currInd in range(currLen): + runlbs.append(currlb) + currlb += 1 + currLen = 0 + + self.numFiles = currlb + + # colorRunLbs = [int(x/len(set(runlbs))*255) for x in runlbs] + # print(colorRunLbs) + # trueIntensities = None + # for currRank in range(self.nprocs): + # with h5py.File(self.inputFile+"_"+str(currRank)+".h5", 'r') as hf: + # if imgs is None: + # imgs = hf["SmallImages"][:] + # projections = hf["ProjectedData"][:] + # trueIntensities = hf["TrueIntensities"][:] + # else: + # imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) + # projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) + # trueIntensities = np.concatenate((trueIntensities, hf["TrueIntensities"][:]), axis=0) + print(f"PROCESSING {len(imgs)} BEAM PROFILES") + + # for intensMe in trueIntensities: + # print(intensMe) + # if(np.isnan(intensMe)): + # print("This is NAN") intensities = [] for img in imgs: @@ -1148,6 +1181,9 @@ def genUMAP(self): self.projections = projections[:self.numImgsToUse:self.skipSize] self.intensities = intensities[:self.numImgsToUse:self.skipSize] + trueIntensities = trueIntensities[:self.numImgsToUse:self.skipSize] + runlbs = np.array(runlbs[:self.numImgsToUse:self.skipSize]) + self.numImgsToUse = int(self.numImgsToUse/self.skipSize) if len(self.imgs)!= self.numImgsToUse: @@ -1160,10 +1196,14 @@ def genUMAP(self): min_dist=0, ).fit_transform(self.projections) - self.labels = hdbscan.HDBSCAN( - min_samples = self.hdbscan_min_samples, - min_cluster_size = self.hdbscan_min_cluster_size - ).fit_predict(self.clusterable_embedding) + ##################### JOHN 05/18/2024 + np.save('clusteringSave.npy', self.clusterable_embedding) + + # self.labels = hdbscan.HDBSCAN( + # min_samples = self.hdbscan_min_samples, + # min_cluster_size = self.hdbscan_min_cluster_size + # ).fit_predict(self.clusterable_embedding) + self.labels = runlbs exclusionList = np.array([]) 
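The rewritten genUMAP above now walks a range of runs, concatenates the per-rank SmallImages, ProjectedData and TrueIntensities datasets, tags every image with a run label, thins the arrays by skipSize, and embeds the projections with UMAP; the HDBSCAN call is commented out and the run labels are used as cluster labels instead. A condensed single-run sketch of that flow is below, assuming the per-rank files written by addThumbnailsToProjectH5 and the umap-learn package; gather_rank_files and embed_projections are illustrative names only.

    import h5py
    import numpy as np
    import umap

    def gather_rank_files(input_prefix, run, nprocs):
        # Concatenate the datasets written by each MPI rank for one run
        imgs, projections, intensities = [], [], []
        for rank in range(nprocs):
            with h5py.File(f"{input_prefix}{run:04}_ProjectedData_{rank}.h5", "r") as hf:
                imgs.append(hf["SmallImages"][:])
                projections.append(hf["ProjectedData"][:])
                intensities.append(hf["TrueIntensities"][:])
        return np.concatenate(imgs), np.concatenate(projections), np.concatenate(intensities)

    def embed_projections(projections, skip_size=1, n_neighbors=15, random_state=42):
        # Thin the data, then embed the sketch-space projections into 2-D for plotting
        sub = projections[::skip_size]
        return umap.UMAP(n_neighbors=n_neighbors, random_state=random_state,
                         n_components=2, min_dist=0).fit_transform(sub)

Keeping the run index as the label (self.labels = runlbs) makes the plotted colours reflect acquisition run rather than HDBSCAN cluster, which is what the Inferno256 palette changes in genLabels and genHTML below rely on.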
self.clustered = np.isin(self.labels, exclusionList, invert=True) @@ -1218,8 +1258,13 @@ def genLabels(self): self.newLabels = np.array(self.relabel_to_closest_zero(newLabels)) self.experData_df['cluster'] = [str(x) for x in self.newLabels[self.clustered]] self.experData_df['ptColor'] = [x for x in self.experData_df['cluster']] - self.experData_df['dbscan_backgroundColor'] = [Category20[20][x] for x in self.newLabels] - self.experData_df['backgroundColor'] = [Category20[20][x] for x in self.newLabels] + + ################# JOHN CHANGE 05/14/2024 + self.experData_df['dbscan_backgroundColor'] = [Inferno256[::(int(256/self.numFiles))][x] for x in self.newLabels] + self.experData_df['backgroundColor'] = [Inferno256[::(int(256/self.numFiles))][x] for x in self.newLabels] + # self.experData_df['dbscan_backgroundColor'] = [Category20[20][x] for x in self.newLabels] + # self.experData_df['backgroundColor'] = [Category20[20][x] for x in self.newLabels] + medoid_lst = self.genMedoids(self.newLabels, self.clusterable_embedding) self.medoidInds = [x[1] for x in medoid_lst] medoidBold = [] @@ -1245,8 +1290,10 @@ def genLabels(self): def genHTML(self): datasource = ColumnDataSource(self.experData_df) + ####################################### JOHN CHANGE 05/14/2024 + color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Inferno256[::(int(256/self.numFiles))]) #JOHN CHANGE 20231020 - color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) + # color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) # color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Plasma256[::16]) plot_figure = figure( title='UMAP projection with DBSCAN clustering of the LCLS dataset', @@ -1573,7 +1620,7 @@ class WrapperFullFD: """ # from btx.interfaces.ipsana import PsanaInterface btx = __import__('btx') - def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, unitVar=False, usePSI=True): + def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, unitVar=False, usePSI=True, downsampleImg=150, roiLen=800, thumbLen=64, centerImg=True): self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1597,6 +1644,10 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.thresholdQuantile = thresholdQuantile self.unitVar = unitVar + self.downsampleImg = downsampleImg + self.roiLen = roiLen + self.thumbLen = thumbLen + self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() @@ -1604,6 +1655,8 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.imgsTracked = [] self.grabImgSteps = grabImgSteps + self.centerImg = centerImg + self.usePSI = usePSI if usePSI: self.psi = self.btx.interfaces.ipsana.PsanaInterface(exp=exp, run=run, det_type=det_type) @@ -1619,9 +1672,9 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab 
self.newBareTime = 0 - self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=200, roi_h = 200) + self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = self.centerImg, roiLen=self.roiLen, downsampleImg=self.downsampleImg) # self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = False, roi_w=500, roi_h = 500) - self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) + self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbLen=self.thumbLen) def oldLowMemoryReconstructionErrorScaled1(self, matrixCentered, matSketch): """ @@ -1713,7 +1766,8 @@ def estimFrobNormJ(self, addMe, arrs, k): def retrieveImages(self): startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - self.fullImgData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=False) + # self.fullImgData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=False) + self.fullImgData, self.imgsTracked = self.imgRetriever.get_fake_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=False) def genSynthData(self): self.fullImgData = np.random.rand(70000, 100000//self.size) @@ -1745,6 +1799,7 @@ def modified_gram_schmidt(self, A, num_vecs): print(f"COMPUTED VECTOR {j}/{num_vecs}") return Q + #JOHN COMMENT 06/27/2024: This is the code you should use for testing speed in parallelization. def compDecayingSVD(self, seedMe, a, b): numFeats = a numSamps = b//self.size @@ -1770,6 +1825,7 @@ def oldCompDecayingSVD1(self, seedMe, a, b): self.fullImgData = (Q1 @ np.diag(S) @ Q2).T self.imgsTracked = [(0, numSamps)] + #JOHN COMMENT 06/27/2024: This is the code you should use for testing performance. It generates synthetic data for error. def oldCompDecayingSVD2(self, seedMe, a, b): #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. 
numFeats = a @@ -1852,7 +1908,8 @@ def runMe(self): def addThumbnailsToProjectH5(self): startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - _,self.fullThumbnailData,_,self.trueIntensitiesData = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) + # _,self.fullThumbnailData,_,self.trueIntensitiesData = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) + _,self.fullThumbnailData,_,self.trueIntensitiesData = self.imgRetriever.get_fake_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) file_name = self.writeToHere+f"{self.currRun:04}_ProjectedData_{self.rank}.h5" f1 = h5py.File(file_name, 'r+') f1.create_dataset("SmallImages", data=self.fullThumbnailData) @@ -1861,7 +1918,7 @@ def addThumbnailsToProjectH5(self): self.comm.barrier() class FD_ImageProcessing: - def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile, unitVar, centerImg, roi_w, roi_h): + def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile, unitVar, centerImg, roiLen, downsampleImg): self.threshold = threshold self.eluThreshold = eluThreshold self.eluAlpha = eluAlpha @@ -1871,8 +1928,8 @@ def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalize self.thresholdQuantile = thresholdQuantile self.unitVar = unitVar self.centerImg = centerImg - self.roi_w = roi_w - self.roi_h = roi_h + self.roiLen = roiLen + self.downsampleImg = downsampleImg def processImg(self, nimg, ncurrIntensity): if self.threshold: @@ -1880,6 +1937,7 @@ def processImg(self, nimg, ncurrIntensity): if self.eluThreshold: nimg = self.eluThresholdFunc(nimg) if self.centerImg: + # print("CENTERING IMAGE") nimg = self.centerImgFunc(nimg) if nimg is not None: @@ -1893,6 +1951,8 @@ def processImg(self, nimg, ncurrIntensity): nimg = self.normalizeIntensityFunc(nimg, currIntensity) if self.unitVar: nimg = self.unitVarFunc(nimg, currIntensity) + if self.downsampleImg!=-1: + nimg = self.downsampleFunc(nimg) return nimg def elu(self,x): @@ -1937,30 +1997,69 @@ def unitVarFunc(self, img, currIntensity): return img/img.std(axis=0) def centerImgFunc(self, img): - if img is None: + def find_center_of_gravity(image): + moments = cv2.moments(image) + if moments["m00"] != 0: + cX = int(moments["m10"] / moments["m00"]) + cY = int(moments["m01"] / moments["m00"]) + else: + cX, cY = 0, 0 + return int(cX), int(cY) + def find_bounding_box(array, box_size): + center_y, center_x = find_center_of_gravity(array) + return (center_y-box_size//2, center_x-box_size//2, box_size, box_size) + if img is None: return img else: - nimg = img - rampingFact = 1 - while rampingFact>=1: - curr_roi_w = int(self.roi_w*rampingFact) - curr_roi_h = int(self.roi_h*rampingFact) - nimg = np.pad(img, max(2*curr_roi_w, 2*curr_roi_h)+1) - if np.sum(img.flatten(), dtype=np.double)<10000: - cogx, cogy = (curr_roi_w, curr_roi_h) - else: - cogx, cogy = self.calcCenterGrav(nimg) - nimg = nimg[cogx-(curr_roi_w//2):cogx+(curr_roi_w//2), cogy-(curr_roi_h//2):cogy+(curr_roi_h//2)] - rampingFact -= 0.5 + nimg = np.pad(img, self.roiLen//2, mode='constant', constant_values=0) + bounding_box = find_bounding_box(nimg, self.roiLen) + nimg = nimg[bounding_box[1]:bounding_box[1]+bounding_box[3], 
bounding_box[0]:bounding_box[0]+bounding_box[2]] return nimg + + def downsampleFunc(self, img): + if img is None: + return img + else: + normalized_array = (255 * (img - np.min(img)) / np.ptp(img)).astype(np.uint8) + image = Image.fromarray(normalized_array) + nimg = image.resize((self.downsampleImg, self.downsampleImg), Image.Resampling.LANCZOS) + return np.array(nimg) + - def calcCenterGrav(self, grid): - M_total = np.sum(grid) - row_indices, col_indices = np.indices(grid.shape) - X_c = np.sum(row_indices * grid) / M_total - Y_c = np.sum(col_indices * grid) / M_total -# print(M_total, X_c, Y_c, grid) - return (round(X_c), round(Y_c)) + # def centerImgFunc(self, img): + # if img is None: + # return img + # else: + # nimg = img + # rampingFact = 1 + # while rampingFact>=1: + # curr_roi_w = int(self.roi_w*rampingFact) + # curr_roi_h = int(self.roi_h*rampingFact) + # nimg = np.pad(img, max(2*curr_roi_w, 2*curr_roi_h)+1) + # if np.sum(img.flatten(), dtype=np.double)<10000: + # cogx, cogy = (curr_roi_w, curr_roi_h) + # else: + # cogx, cogy = self.calcCenterGrav(nimg) + # nimg = nimg[cogx-(curr_roi_w//2):cogx+(curr_roi_w//2), cogy-(curr_roi_h//2):cogy+(curr_roi_h//2)] + # rampingFact -= 0.5 + # return nimg + +# def calcCenterGrav(self, grid): +# M_total = np.sum(grid) +# row_indices, col_indices = np.indices(grid.shape) +# X_c = np.sum(row_indices * grid) / M_total +# Y_c = np.sum(col_indices * grid) / M_total +# # print(M_total, X_c, Y_c, grid) +# return (round(X_c), round(Y_c)) + # def find_center_of_gravity(image): + # moments = cv2.moments(image) + # if moments["m00"] != 0: + # cX = int(moments["m10"] / moments["m00"]) + # cY = int(moments["m01"] / moments["m00"]) + # else: + # cX, cY = 0, 0 + + # return cX, cY @@ -2096,12 +2195,11 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): class SinglePanelDataRetriever: btx = __import__('btx') - def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, thumbnailHeight, thumbnailWidth): + def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, thumbLen): self.exp = exp self.det_type = det_type self.run = run - self.thumbnailHeight = thumbnailHeight - self.thumbnailWidth = thumbnailWidth + self.thumbLen = thumbLen self.psi = self.btx.interfaces.ipsana.PsanaInterface(exp=exp, run=run, det_type=det_type) @@ -2125,136 +2223,249 @@ def split_range(self, start, end, num_tuples): tuples.append((last_batch_start, last_batch_end)) return tuples - def get_formatted_images(self, startInd, n, num_steps, getThumbnails): - """ - Fetch n - x image segments from run, where x is the number of 'dead' images. - - Parameters - ---------- - n : int - number of images to retrieve - start_index : int - start index of subsection of data to retrieve - end_index : int - end index of subsection of data to retrieve - - Returns - ------- - ndarray, shape (end_index-start_index, n-x) - n-x retrieved image segments of dimension end_index-start_index - """ - fullimgs = None - fullthumbnails = None - imgsTracked = [] - runs = self.split_range(startInd, startInd+n, num_steps) - print(runs) - trueIntensities = [] - for runStart, runEnd in runs: - self.psi.counter = runStart - imgsTracked.append((runStart, runEnd)) - - imgs = self.psi.get_images(runEnd-runStart, assemble=False) + # def get_formatted_images(self, startInd, n, num_steps, getThumbnails): + # """ + # Fetch n - x image segments from run, where x is the number of 'dead' images. 
+ + # Parameters + # ---------- + # n : int + # number of images to retrieve + # start_index : int + # start index of subsection of data to retrieve + # end_index : int + # end index of subsection of data to retrieve + + # Returns + # ------- + # ndarray, shape (end_index-start_index, n-x) + # n-x retrieved image segments of dimension end_index-start_index + # """ + # fullimgs = None + # imgsTracked = [] + # runs = self.split_range(startInd, startInd+n, num_steps) + # print(runs) + # trueIntensities = [] + + # for runStart, runEnd in runs: + # self.psi.counter = runStart + # imgsTracked.append((runStart, runEnd)) + + # imgs = self.psi.get_images(runEnd-runStart, assemble=False) + + # imgs = imgs[ + # [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + # ] + + # origTrueIntensities = [np.sum(img.flatten(), dtype=np.double) for img in imgs] + # newTrueIntensities = [] + # for j in origTrueIntensities: + # if j<0: + # newTrueIntensities.append(0) + # else: + # newTrueIntensities.append(np.log(j)) + # origTrueIntensities = newTrueIntensities + + # num_valid_imgs, x, y = imgs.shape + + # imshape = (x, y) + # if self.imageProcessor.centerImg: + # imshape = (self.imageProcessor.roiLen, self.imageProcessor.roiLen) + # if self.imageProcessor.downsampleImg!=-1: + # imshape = (self.imageProcessor.downsampleImg, self.imageProcessor.downsampleImg) + + # img_batch = imgs.T + # img_batch[img_batch<0] = 0 + + # nimg_batch = [] + # ntrueIntensity_batch = [] + # for ind, (img, trueIntens) in enumerate(zip(img_batch.T, origTrueIntensities)): + # currIntensity = np.sum(img.flatten(), dtype=np.double) + # nimg = self.imageProcessor.processImg(img, currIntensity) + # if nimg is not None: + # nimg_batch.append(nimg) + # ntrueIntensity_batch.append(trueIntens) + # else: + # num_valid_imgs -= 1 + # self.excludedImgs.append(ind) + # if self.imageProcessor.centerImg or self.imageProcessor.downsampleImg!=-1: + # nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, imshape[0]*imshape[1]).T + # else: + # nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T + + # if fullimgs is None and nimg_batch.shape[1]!=0: + # fullimgs = nimg_batch + # trueIntensities += ntrueIntensity_batch + # elif nimg_batch.shape[1]!=0: + # fullimgs = np.hstack((fullimgs, nimg_batch)) + # trueIntensities += ntrueIntensity_batch + + # print("EXCLUDING IMAGES: ", self.excludedImgs) + + # if getThumbnails: + # fullThumbs = [] + # for img in fullimgs.T: + # nimg = img.reshape(imshape) + # if self.imageProcessor.downsampleImg==-1: + # normalized_array = (255 * (nimg - np.min(nimg)) / np.ptp(nimg)).astype(np.uint8) + # else: + # normalized_array = nimg + # image = Image.fromarray(normalized_array) + # thumbnail = image.resize((self.thumbLen, self.thumbLen), Image.Resampling.LANCZOS) + # thumbnail_array = np.array(thumbnail) + # fullThumbs.append(thumbnail_array) + # return (fullimgs, fullThumbs, imgsTracked, trueIntensities) + # else: + # return (fullimgs, imgsTracked) + - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] + def get_fake_images(self, startInd, n, num_steps, getThumbnails): + fullimgs = np.load('/sdf/data/lcls/ds/xpp/xppx22715/scratch/winnicki/fakeImgData.npy') + with open('/sdf/data/lcls/ds/xpp/xppx22715/scratch/winnicki/fakeThumbData.pkl', 'rb') as f: + fullThumbs = pickle.load(f) + if getThumbnails: + return (fullimgs, fullThumbs, [[0, 10]], [1 for x in range(len(fullimgs))]) + else: + return (fullimgs, [[0, 10]]) + + +################################# 
JOHN OLD VERSION 05/13/2024 ############################################ +# def get_formatted_images(self, startInd, n, num_steps, getThumbnails): +# """ +# Fetch n - x image segments from run, where x is the number of 'dead' images. + +# Parameters +# ---------- +# n : int +# number of images to retrieve +# start_index : int +# start index of subsection of data to retrieve +# end_index : int +# end index of subsection of data to retrieve + +# Returns +# ------- +# ndarray, shape (end_index-start_index, n-x) +# n-x retrieved image segments of dimension end_index-start_index +# """ +# fullimgs = None +# fullthumbnails = None +# imgsTracked = [] +# runs = self.split_range(startInd, startInd+n, num_steps) +# print(runs) +# trueIntensities = [] +# for runStart, runEnd in runs: +# self.psi.counter = runStart +# imgsTracked.append((runStart, runEnd)) + +# imgs = self.psi.get_images(runEnd-runStart, assemble=False) + +# imgs = imgs[ +# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] +# ] + +# origTrueIntensities = [np.sum(img.flatten(), dtype=np.double) for img in imgs] +# newTrueIntensities = [] +# for j in origTrueIntensities: +# if j<0: +# newTrueIntensities.append(0) +# else: +# newTrueIntensities.append(np.log(j)) +# origTrueIntensities = newTrueIntensities + +# if getThumbnails: +# saveMe = [] +# for img in imgs: +# saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) +# thumbnails = np.array(saveMe) + +# num_valid_imgs, x, y = imgs.shape + +# img_batch = imgs.T +# img_batch[img_batch<0] = 0 + +# if getThumbnails: +# num_valid_thumbnails, tx, ty = thumbnails.shape +# thumbnail_batch = thumbnails.T + +# if getThumbnails: +# nimg_batch = [] +# nthumbnail_batch = [] +# ntrueIntensity_batch = [] +# for ind, (img, thumbnail, trueIntens) in enumerate(zip(img_batch.T, thumbnail_batch.T, origTrueIntensities)): +# currIntensity = np.sum(img.flatten(), dtype=np.double) +# if self.imageProcessor.centerImg: +# nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) +# else: +# nimg = self.imageProcessor.processImg(img, currIntensity) +# if nimg is None: +# nthumbnail = None +# else: +# nthumbnail = nimg.copy() +# if nimg is not None: +# nimg_batch.append(nimg) +# nthumbnail_batch.append(nthumbnail) +# ntrueIntensity_batch.append(trueIntens) +# else: +# num_valid_thumbnails -= 1 +# num_valid_imgs -= 1 +# self.excludedImgs.append(ind) +# if self.imageProcessor.centerImg: +# nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T +# nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, self.imageProcessor.roi_h, self.imageProcessor.roi_w) + +# saveMe = [] +# for img in nthumbnail_batch: +# saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) +# nthumbnail_batch = np.array(saveMe) + +# else: +# nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T +# nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) + + +# if fullimgs is None and nimg_batch.shape[1]!=0: +# fullimgs = nimg_batch +# fullthumbnails = nthumbnail_batch +# trueIntensities += ntrueIntensity_batch +# elif nimg_batch.shape[1]!=0: +# fullimgs = np.hstack((fullimgs, nimg_batch)) +# fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) +# trueIntensities += ntrueIntensity_batch +# else: +# nimg_batch = [] +# for ind, img in enumerate(img_batch.T): +# currIntensity = np.sum(img.flatten(), 
dtype=np.double) +# nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) +# if nimg is not None: +# nimg_batch.append(nimg) +# else: +# num_valid_imgs -= 1 +# self.excludedImgs.append(ind) - origTrueIntensities = [np.sum(img.flatten(), dtype=np.double) for img in imgs] - newTrueIntensities = [] - for j in origTrueIntensities: - if j<0: - newTrueIntensities.append(0) - else: - newTrueIntensities.append(np.log(j)) - origTrueIntensities = newTrueIntensities +# if self.imageProcessor.centerImg: +# nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T +# else: +# nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T - if getThumbnails: - saveMe = [] - for img in imgs: - saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) - thumbnails = np.array(saveMe) +# if fullimgs is None: +# fullimgs = nimg_batch +# elif nimg_batch.shape[1]!=0: +# fullimgs = np.hstack((fullimgs, nimg_batch)) - num_valid_imgs, x, y = imgs.shape +# print("EXCLUDING IMAGES: ", self.excludedImgs) - img_batch = imgs.T - img_batch[img_batch<0] = 0 - - if getThumbnails: - num_valid_thumbnails, tx, ty = thumbnails.shape - thumbnail_batch = thumbnails.T +# if getThumbnails: +# print(fullimgs.shape, fullthumbnails.shape, imgsTracked) +# return (fullimgs, fullthumbnails, imgsTracked, trueIntensities) +# else: +# return (fullimgs, imgsTracked) - if getThumbnails: - nimg_batch = [] - nthumbnail_batch = [] - ntrueIntensity_batch = [] - for ind, (img, thumbnail, trueIntens) in enumerate(zip(img_batch.T, thumbnail_batch.T, origTrueIntensities)): - currIntensity = np.sum(img.flatten(), dtype=np.double) - if self.imageProcessor.centerImg: - nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) - else: - nimg = self.imageProcessor.processImg(img, currIntensity) - if nimg is None: - nthumbnail = None - else: - nthumbnail = nimg.copy() - if nimg is not None: - nimg_batch.append(nimg) - nthumbnail_batch.append(nthumbnail) - ntrueIntensity_batch.append(trueIntens) - else: - num_valid_thumbnails -= 1 - num_valid_imgs -= 1 - self.excludedImgs.append(ind) - if self.imageProcessor.centerImg: - nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T - nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, self.imageProcessor.roi_h, self.imageProcessor.roi_w) - - saveMe = [] - for img in nthumbnail_batch: - saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) - nthumbnail_batch = np.array(saveMe) - else: - nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T - nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) - - - if fullimgs is None and nimg_batch.shape[1]!=0: - fullimgs = nimg_batch - fullthumbnails = nthumbnail_batch - trueIntensities += ntrueIntensity_batch - elif nimg_batch.shape[1]!=0: - fullimgs = np.hstack((fullimgs, nimg_batch)) - fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) - trueIntensities += ntrueIntensity_batch - else: - nimg_batch = [] - for ind, img in enumerate(img_batch.T): - currIntensity = np.sum(img.flatten(), dtype=np.double) - nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) - if nimg is not None: - nimg_batch.append(nimg) - else: - num_valid_imgs -= 1 - self.excludedImgs.append(ind) - if self.imageProcessor.centerImg: - nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, 
self.imageProcessor.roi_h*self.imageProcessor.roi_w).T - else: - nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T - if fullimgs is None: - fullimgs = nimg_batch - elif nimg_batch.shape[1]!=0: - fullimgs = np.hstack((fullimgs, nimg_batch)) - print("EXCLUDING IMAGES: ", self.excludedImgs) - if getThumbnails: - print(fullimgs.shape, fullthumbnails.shape, imgsTracked) - return (fullimgs, fullthumbnails, imgsTracked, trueIntensities) - else: - return (fullimgs, imgsTracked) def main(): """