From f2f968a30ff17c98892ef4e61c9b6e069cf27424 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 16 Jul 2023 20:53:30 -0700 Subject: [PATCH 01/57] Added frequent directions module --- btx/processing/freqdir.py | 434 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 434 insertions(+) create mode 100644 btx/processing/freqdir.py diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py new file mode 100644 index 000000000..10e162a67 --- /dev/null +++ b/btx/processing/freqdir.py @@ -0,0 +1,434 @@ +import os, csv, argparse + +import numpy as np +from mpi4py import MPI + +from matplotlib import pyplot as plt +from matplotlib import colors + +from btx.misc.shortcuts import TaskTimer + +from btx.interfaces.ipsana import ( + PsanaInterface, + bin_data, + bin_pixel_index_map, + retrieve_pixel_index_map, + assemble_image_stack_batch, +) + +########################################### +#John Imports +from numpy import zeros, sqrt, dot, diag +from numpy.linalg import svd, LinAlgError +from scipy.linalg import svd as scipy_svd +import numpy as np + +import time + +from datetime import datetime +currRun = datetime.now().strftime("%y%m%d%H%M%S") + +import cProfile, sys + +############################################# + +class FreqDir: + + """Frequent Directions.""" + + def __init__( + self, + john_start, + tot_imgs, + ell, + alpha, + exp, + run, + det_type, + batch_size=10, + downsample=False, + bin_factor=2, + output_dir="", + ): + + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = john_start + tot_imgs*self.rank//self.size + + self.downsample = downsample + self.bin_factor = bin_factor + self.output_dir = output_dir + + ( + self.num_images, + _, + self.batch_size, + self.num_features, + ) = self.set_params(tot_imgs, ell, batch_size, bin_factor) + + self.task_durations = dict({}) + + self.num_incorporated_images = 0 + self.outliers, self.pc_data = [], [] + + self.d = self.num_features + self.ell = ell + self.m = 2*self.ell + self.sketch = zeros( (self.m, self.d) ) + self.nextZeroRow = 0 + self.alpha = alpha + + self.dataseen = [] + + self.noImgsToProcess = tot_imgs//self.size + + print("MY RANK IS: {}".format(self.rank)) + + if self.rank==0: + self.totPSI = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.totPSI.counter = john_start + + def get_params(self): + """ + Method to retrieve iPCA params. + + Returns + ------- + num_incorporated_images : int + number of images used to build model + num_components : int + number of components maintained in model + batch_size : int + batch size used in model updates + num_features : int + dimensionality of incorporated images + """ + return ( + self.num_incorporated_images, + self.ell, + self.batch_size, + self.num_features, + ) + + def set_params(self, num_images, num_components, batch_size, bin_factor): + """ + Method to initialize iPCA parameters. + + Parameters + ---------- + num_images : int + Desired number of images to incorporate into model. + num_components : int + Desired number of components for model to maintain. + batch_size : int + Desired size of image block to be incorporated into model at each update. + bin_factor : int + Factor to bin data by. + + Returns + ------- + num_images : int + Number of images to incorporate into model. + num_components : int + Number of components for model to maintain. + batch_size : int + Size of image block to be incorporated into model at each update. 
+ num_features : int + Number of features (dimension) in each image. + """ + max_events = self.psi.max_events + downsample = self.downsample + + num_images = min(num_images, max_events) if num_images != -1 else max_events + num_components = min(num_components, num_images) + batch_size = min(batch_size, num_images) + + # set d + det_shape = self.psi.det.shape() + num_features = np.prod(det_shape).astype(int) + + if downsample: + if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: + print("Invalid bin factor, toggled off downsampling.") + self.downsample = False + else: + num_features = int(num_features / bin_factor**2) + + return num_images, num_components, batch_size, num_features + + def run(self): + """ + Perform iPCA on run subject to initialization parameters. + """ + # update model with remaining batches + + for batch in range(0,self.noImgsToProcess,self.batch_size): + self.fetch_and_update_model(self.batch_size) + + + def get_formatted_images(self, n): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. + + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + + bin_factor = self.bin_factor + downsample = self.downsample + + # may have to rewrite eventually when number of images becomes large, + # i.e. streamed setting, either that or downsample aggressively + imgs = self.psi.get_images(n, assemble=False) + + if downsample: + imgs = bin_data(imgs, bin_factor) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + + num_valid_imgs, p, x, y = imgs.shape + formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + + return formatted_imgs + + def fetch_and_update_model(self, n): + """ + Fetch images and update model. 
+ + Parameters + ---------- + n : int + number of images to incorporate + """ + + img_batch = self.get_formatted_images(n) + + + self.john_update_model(img_batch) + + + def john_update_model(self, X): + """ + Update matrix sketch with new batch of observations + """ + +# pr = cProfile.Profile() +# pr.enable() + + _, numIncorp = X.shape + n = self.num_incorporated_images + q = self.ell + + with TaskTimer(self.task_durations, "total update"): + + if self.rank == 0: + print( + "Factoring {m} sample{s} into {n} sample, {q} component model...".format( + m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q + ) + ) + for row in X.T: + if self.nextZeroRow >= self.m: + self.john_rotate() + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + self.num_incorporated_images += 1 + if self.rank==0: + self.dataseen.append(row) + +# if self.rank==0: +# print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') + +# pr.disable() +# # Dump results: +# # - for binary dump +# pr.dump_stats('logging/{0}_rank_{1}.prof'.format(currRun, self.rank)) +# # - for text dump +# with open( 'logging/{0}_rank_{1}.txt'.format(currRun, self.rank), 'w') as output_file: +# sys.stdout = output_file +# pr.print_stats( sort='time' ) +# sys.stdout = sys.__stdout__ + + + def john_rotate(self): + try: + [_,s,Vt] = svd(self.sketch , full_matrices=False) + except LinAlgError as err: + [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) + + if len(s) >= self.ell: + sCopy = s.copy() + + sShrunk = s[:self.ell]**2 - s[self.ell-1]**2 + #John: Explicitly set this value to be 0, since sometimes it is negative + # or even turns to NaN due to roundoff error + sShrunk[-1] = 0 + sShrunk = sqrt(sShrunk) + + sShrunk[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] + + self.sketch[:self.ell:,:] = dot(diag(sShrunk), Vt[:self.ell,:]) + self.sketch[self.ell:,:] = 0 + self.nextZeroRow = self.ell + else: + self.sketch[:len(s),:] = dot(diag(s), Vt[:len(s),:]) + self.sketch[len(s):,:] = 0 + self.nextZeroRow = len(s) + + def john_reconstructionError(self): + matrixCentered = np.array(self.dataseen) + matSketch = self.sketch + k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT) + G = U[:,:k] + UA, SA, VtA = np.linalg.svd(matrixCenteredT) + UAk = UA[:,:k] + SAk = np.diag(SA[:k]) + VtAk = VtA[:k] + Ak = UAk @ SAk @ VtAk + return (np.linalg.norm( + matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( + (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) + + def lowMemoryReconstructionErrorUnscaled(self): + matrixCentered = np.array(self.dataseen) + matSketch = self.sketch + k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT) + G = U[:,:k] + return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) + + def estimFrobNormSquared(addMe, arrs, its): + no_rows = arrs[-1].shape[1] + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + sumMe = 0 + for j in range(its): + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + v_addMe = addMe @ v_hat + for arr in arrs[::-1]: + v_hat = arr @ v_hat + sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 + return sumMe/its*no_rows + + + def gatherFreqDirs(self): + sendbuf = self.sketch[:self.ell,:] + recvbuf = None + if self.rank == 0: + recvbuf = np.empty( + [self.size, self.ell, self.d], dtype=np.float64) + self.comm.Gather(sendbuf, recvbuf, root=0) + if self.rank==0: + origMatSketch = self.sketch.copy() + for j in 
range(1, self.size): + for row in recvbuf[j]: + if(np.any(row)): + if self.nextZeroRow >= self.m: + self.john_rotate() + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + toReturn = self.sketch.copy() + self.sketch = origMatSketch + print(toReturn) + return toReturn + else: + return + +def parse_input(): + """ + Parse command line input. + """ + + parser = argparse.ArgumentParser() + parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) + parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) + parser.add_argument( + "-d", + "--det_type", + help="Detector name, e.g epix10k2M or jungfrau4M.", + required=True, + type=str, + ) + parser.add_argument( + "--start_offset", + help="Run index of first image to be incorporated into iPCA model.", + required=False, + type=int, + ) + parser.add_argument( + "--num_components", + help="Number of principal components to compute and maintain.", + required=False, + type=int, + ) + parser.add_argument( + "--batch_size", + help="Size of image batch incorporated in each model update.", + required=False, + type=int, + ) + parser.add_argument( + "--num_images", + help="Total number of images to be incorporated into model.", + required=False, + type=int, + ) + parser.add_argument( + "--output_dir", + help="Path to output directory for recording task duration data.", + required=False, + type=str, + ) + parser.add_argument( + "--priming", + help="Initialize model with PCA.", + required=False, + action="store_true", + ) + parser.add_argument( + "--downsample", + help="Enable downsampling of images.", + required=False, + action="store_true", + ) + parser.add_argument( + "--bin_factor", + help="Bin factor if using downsizing.", + required=False, + type=int, + ) + + return parser.parse_args() + + +if __name__ == "__main__": + + params = parse_input() + kwargs = {k: v for k, v in vars(params).items() if v is not None} + + pipca = PiPCA(**kwargs) + pipca.run() + pipca.get_outliers() + From 7a589c3760b7ba8d28df557730c5f26701ea210f Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 18 Jul 2023 10:31:31 -0700 Subject: [PATCH 02/57] Checkpoint. Not sure what has been changed, but parallel FD should be working other than segfault high core issue. 
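
Editor's note: patch 01 above and this patch introduce the same FreqDir class (in freqdir.py and rankAd_freqdir.py). Its core is the Frequent Directions update in john_update_model() and john_rotate(): each image row is appended to a 2*ell x d buffer, and whenever the buffer fills, an SVD-based shrinkage compresses it back to ell non-zero rows. The following is a minimal, self-contained sketch of that loop for reference; the function names, the handling of alpha, and the random test data are illustrative assumptions, not part of the btx module.

    import numpy as np
    from scipy.linalg import svd as scipy_svd

    def fd_rotate(sketch, ell, alpha):
        """Compress a full (2*ell, d) sketch back down to ell non-zero rows."""
        _, s, Vt = scipy_svd(sketch, full_matrices=False)
        if len(s) < ell:
            sketch[:len(s), :] = np.diag(s) @ Vt[:len(s), :]
            sketch[len(s):, :] = 0.0
            return len(s)
        shrunk = s[:ell] ** 2 - s[ell - 1] ** 2
        shrunk[-1] = 0.0                       # guard against round-off going negative
        shrunk = np.sqrt(shrunk)
        keep = int(ell * (1 - alpha))          # leave the top directions unshrunk
        shrunk[:keep] = s[:keep]
        sketch[:ell, :] = np.diag(shrunk) @ Vt[:ell, :]
        sketch[ell:, :] = 0.0
        return ell                             # index of the next writable row

    def fd_append(sketch, next_zero_row, row, ell, alpha=0.0):
        """Insert one observation, rotating first if the buffer is full."""
        if next_zero_row >= sketch.shape[0]:
            next_zero_row = fd_rotate(sketch, ell, alpha)
        sketch[next_zero_row, :] = row
        return next_zero_row + 1

    # usage: stream 1000 random 64-dimensional observations through an ell=16 sketch
    ell, d = 16, 64
    sketch, nzr = np.zeros((2 * ell, d)), 0
    for row in np.random.default_rng(0).normal(size=(1000, d)):
        nzr = fd_append(sketch, nzr, row, ell, alpha=0.2)

With alpha = 1 every retained direction is shrunk, which is the original Frequent Directions rule; smaller alpha leaves the top (1 - alpha)*ell singular values untouched, matching the parameterized variant from Ghashami, Desai, and Phillips (ESA 2014) that is cited later in this series.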
--- btx/processing/rankAd_freqdir.py | 434 +++++++++++++++++++++++++++++++ 1 file changed, 434 insertions(+) create mode 100644 btx/processing/rankAd_freqdir.py diff --git a/btx/processing/rankAd_freqdir.py b/btx/processing/rankAd_freqdir.py new file mode 100644 index 000000000..10e162a67 --- /dev/null +++ b/btx/processing/rankAd_freqdir.py @@ -0,0 +1,434 @@ +import os, csv, argparse + +import numpy as np +from mpi4py import MPI + +from matplotlib import pyplot as plt +from matplotlib import colors + +from btx.misc.shortcuts import TaskTimer + +from btx.interfaces.ipsana import ( + PsanaInterface, + bin_data, + bin_pixel_index_map, + retrieve_pixel_index_map, + assemble_image_stack_batch, +) + +########################################### +#John Imports +from numpy import zeros, sqrt, dot, diag +from numpy.linalg import svd, LinAlgError +from scipy.linalg import svd as scipy_svd +import numpy as np + +import time + +from datetime import datetime +currRun = datetime.now().strftime("%y%m%d%H%M%S") + +import cProfile, sys + +############################################# + +class FreqDir: + + """Frequent Directions.""" + + def __init__( + self, + john_start, + tot_imgs, + ell, + alpha, + exp, + run, + det_type, + batch_size=10, + downsample=False, + bin_factor=2, + output_dir="", + ): + + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = john_start + tot_imgs*self.rank//self.size + + self.downsample = downsample + self.bin_factor = bin_factor + self.output_dir = output_dir + + ( + self.num_images, + _, + self.batch_size, + self.num_features, + ) = self.set_params(tot_imgs, ell, batch_size, bin_factor) + + self.task_durations = dict({}) + + self.num_incorporated_images = 0 + self.outliers, self.pc_data = [], [] + + self.d = self.num_features + self.ell = ell + self.m = 2*self.ell + self.sketch = zeros( (self.m, self.d) ) + self.nextZeroRow = 0 + self.alpha = alpha + + self.dataseen = [] + + self.noImgsToProcess = tot_imgs//self.size + + print("MY RANK IS: {}".format(self.rank)) + + if self.rank==0: + self.totPSI = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.totPSI.counter = john_start + + def get_params(self): + """ + Method to retrieve iPCA params. + + Returns + ------- + num_incorporated_images : int + number of images used to build model + num_components : int + number of components maintained in model + batch_size : int + batch size used in model updates + num_features : int + dimensionality of incorporated images + """ + return ( + self.num_incorporated_images, + self.ell, + self.batch_size, + self.num_features, + ) + + def set_params(self, num_images, num_components, batch_size, bin_factor): + """ + Method to initialize iPCA parameters. + + Parameters + ---------- + num_images : int + Desired number of images to incorporate into model. + num_components : int + Desired number of components for model to maintain. + batch_size : int + Desired size of image block to be incorporated into model at each update. + bin_factor : int + Factor to bin data by. + + Returns + ------- + num_images : int + Number of images to incorporate into model. + num_components : int + Number of components for model to maintain. + batch_size : int + Size of image block to be incorporated into model at each update. + num_features : int + Number of features (dimension) in each image. 
+ """ + max_events = self.psi.max_events + downsample = self.downsample + + num_images = min(num_images, max_events) if num_images != -1 else max_events + num_components = min(num_components, num_images) + batch_size = min(batch_size, num_images) + + # set d + det_shape = self.psi.det.shape() + num_features = np.prod(det_shape).astype(int) + + if downsample: + if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: + print("Invalid bin factor, toggled off downsampling.") + self.downsample = False + else: + num_features = int(num_features / bin_factor**2) + + return num_images, num_components, batch_size, num_features + + def run(self): + """ + Perform iPCA on run subject to initialization parameters. + """ + # update model with remaining batches + + for batch in range(0,self.noImgsToProcess,self.batch_size): + self.fetch_and_update_model(self.batch_size) + + + def get_formatted_images(self, n): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. + + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + + bin_factor = self.bin_factor + downsample = self.downsample + + # may have to rewrite eventually when number of images becomes large, + # i.e. streamed setting, either that or downsample aggressively + imgs = self.psi.get_images(n, assemble=False) + + if downsample: + imgs = bin_data(imgs, bin_factor) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + + num_valid_imgs, p, x, y = imgs.shape + formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + + return formatted_imgs + + def fetch_and_update_model(self, n): + """ + Fetch images and update model. 
+ + Parameters + ---------- + n : int + number of images to incorporate + """ + + img_batch = self.get_formatted_images(n) + + + self.john_update_model(img_batch) + + + def john_update_model(self, X): + """ + Update matrix sketch with new batch of observations + """ + +# pr = cProfile.Profile() +# pr.enable() + + _, numIncorp = X.shape + n = self.num_incorporated_images + q = self.ell + + with TaskTimer(self.task_durations, "total update"): + + if self.rank == 0: + print( + "Factoring {m} sample{s} into {n} sample, {q} component model...".format( + m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q + ) + ) + for row in X.T: + if self.nextZeroRow >= self.m: + self.john_rotate() + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + self.num_incorporated_images += 1 + if self.rank==0: + self.dataseen.append(row) + +# if self.rank==0: +# print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') + +# pr.disable() +# # Dump results: +# # - for binary dump +# pr.dump_stats('logging/{0}_rank_{1}.prof'.format(currRun, self.rank)) +# # - for text dump +# with open( 'logging/{0}_rank_{1}.txt'.format(currRun, self.rank), 'w') as output_file: +# sys.stdout = output_file +# pr.print_stats( sort='time' ) +# sys.stdout = sys.__stdout__ + + + def john_rotate(self): + try: + [_,s,Vt] = svd(self.sketch , full_matrices=False) + except LinAlgError as err: + [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) + + if len(s) >= self.ell: + sCopy = s.copy() + + sShrunk = s[:self.ell]**2 - s[self.ell-1]**2 + #John: Explicitly set this value to be 0, since sometimes it is negative + # or even turns to NaN due to roundoff error + sShrunk[-1] = 0 + sShrunk = sqrt(sShrunk) + + sShrunk[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] + + self.sketch[:self.ell:,:] = dot(diag(sShrunk), Vt[:self.ell,:]) + self.sketch[self.ell:,:] = 0 + self.nextZeroRow = self.ell + else: + self.sketch[:len(s),:] = dot(diag(s), Vt[:len(s),:]) + self.sketch[len(s):,:] = 0 + self.nextZeroRow = len(s) + + def john_reconstructionError(self): + matrixCentered = np.array(self.dataseen) + matSketch = self.sketch + k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT) + G = U[:,:k] + UA, SA, VtA = np.linalg.svd(matrixCenteredT) + UAk = UA[:,:k] + SAk = np.diag(SA[:k]) + VtAk = VtA[:k] + Ak = UAk @ SAk @ VtAk + return (np.linalg.norm( + matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( + (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) + + def lowMemoryReconstructionErrorUnscaled(self): + matrixCentered = np.array(self.dataseen) + matSketch = self.sketch + k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT) + G = U[:,:k] + return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) + + def estimFrobNormSquared(addMe, arrs, its): + no_rows = arrs[-1].shape[1] + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + sumMe = 0 + for j in range(its): + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + v_addMe = addMe @ v_hat + for arr in arrs[::-1]: + v_hat = arr @ v_hat + sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 + return sumMe/its*no_rows + + + def gatherFreqDirs(self): + sendbuf = self.sketch[:self.ell,:] + recvbuf = None + if self.rank == 0: + recvbuf = np.empty( + [self.size, self.ell, self.d], dtype=np.float64) + self.comm.Gather(sendbuf, recvbuf, root=0) + if self.rank==0: + origMatSketch = self.sketch.copy() + for j in 
range(1, self.size): + for row in recvbuf[j]: + if(np.any(row)): + if self.nextZeroRow >= self.m: + self.john_rotate() + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + toReturn = self.sketch.copy() + self.sketch = origMatSketch + print(toReturn) + return toReturn + else: + return + +def parse_input(): + """ + Parse command line input. + """ + + parser = argparse.ArgumentParser() + parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) + parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) + parser.add_argument( + "-d", + "--det_type", + help="Detector name, e.g epix10k2M or jungfrau4M.", + required=True, + type=str, + ) + parser.add_argument( + "--start_offset", + help="Run index of first image to be incorporated into iPCA model.", + required=False, + type=int, + ) + parser.add_argument( + "--num_components", + help="Number of principal components to compute and maintain.", + required=False, + type=int, + ) + parser.add_argument( + "--batch_size", + help="Size of image batch incorporated in each model update.", + required=False, + type=int, + ) + parser.add_argument( + "--num_images", + help="Total number of images to be incorporated into model.", + required=False, + type=int, + ) + parser.add_argument( + "--output_dir", + help="Path to output directory for recording task duration data.", + required=False, + type=str, + ) + parser.add_argument( + "--priming", + help="Initialize model with PCA.", + required=False, + action="store_true", + ) + parser.add_argument( + "--downsample", + help="Enable downsampling of images.", + required=False, + action="store_true", + ) + parser.add_argument( + "--bin_factor", + help="Bin factor if using downsizing.", + required=False, + type=int, + ) + + return parser.parse_args() + + +if __name__ == "__main__": + + params = parse_input() + kwargs = {k: v for k, v in vars(params).items() if v is not None} + + pipca = PiPCA(**kwargs) + pipca.run() + pipca.get_outliers() + From 0a1df812cb8efdf29ab2308fff8d0e05a17d85e2 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Wed, 19 Jul 2023 13:44:05 -0700 Subject: [PATCH 03/57] Updated freqdir --- btx/processing/freqdir.py | 183 +++++++++++++++++++++----------------- 1 file changed, 101 insertions(+), 82 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 10e162a67..7561daf0f 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -28,13 +28,11 @@ from datetime import datetime currRun = datetime.now().strftime("%y%m%d%H%M%S") -import cProfile, sys - ############################################# class FreqDir: - """Frequent Directions.""" + """Parallel Frequent Directions.""" def __init__( self, @@ -45,7 +43,6 @@ def __init__( exp, run, det_type, - batch_size=10, downsample=False, bin_factor=2, output_dir="", @@ -65,14 +62,12 @@ def __init__( ( self.num_images, _, - self.batch_size, self.num_features, - ) = self.set_params(tot_imgs, ell, batch_size, bin_factor) + ) = self.set_params(tot_imgs, ell, bin_factor) self.task_durations = dict({}) self.num_incorporated_images = 0 - self.outliers, self.pc_data = [], [] self.d = self.num_features self.ell = ell @@ -81,41 +76,11 @@ def __init__( self.nextZeroRow = 0 self.alpha = alpha - self.dataseen = [] - self.noImgsToProcess = tot_imgs//self.size - print("MY RANK IS: {}".format(self.rank)) - - if self.rank==0: - self.totPSI = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.totPSI.counter = john_start - - def get_params(self): - """ - Method to 
retrieve iPCA params. - - Returns - ------- - num_incorporated_images : int - number of images used to build model - num_components : int - number of components maintained in model - batch_size : int - batch size used in model updates - num_features : int - dimensionality of incorporated images - """ - return ( - self.num_incorporated_images, - self.ell, - self.batch_size, - self.num_features, - ) - - def set_params(self, num_images, num_components, batch_size, bin_factor): + def set_params(self, num_images, num_components, bin_factor): """ - Method to initialize iPCA parameters. + Method to initialize FreqDir parameters. Parameters ---------- @@ -123,8 +88,6 @@ def set_params(self, num_images, num_components, batch_size, bin_factor): Desired number of images to incorporate into model. num_components : int Desired number of components for model to maintain. - batch_size : int - Desired size of image block to be incorporated into model at each update. bin_factor : int Factor to bin data by. @@ -134,17 +97,15 @@ def set_params(self, num_images, num_components, batch_size, bin_factor): Number of images to incorporate into model. num_components : int Number of components for model to maintain. - batch_size : int - Size of image block to be incorporated into model at each update. num_features : int Number of features (dimension) in each image. """ + max_events = self.psi.max_events downsample = self.downsample num_images = min(num_images, max_events) if num_images != -1 else max_events num_components = min(num_components, num_images) - batch_size = min(batch_size, num_images) # set d det_shape = self.psi.det.shape() @@ -157,18 +118,17 @@ def set_params(self, num_images, num_components, batch_size, bin_factor): else: num_features = int(num_features / bin_factor**2) - return num_images, num_components, batch_size, num_features + return num_images, num_components, num_features def run(self): """ - Perform iPCA on run subject to initialization parameters. + Perform frequent directions matrix sketching + on run subject to initialization parameters. """ - # update model with remaining batches - + for batch in range(0,self.noImgsToProcess,self.batch_size): self.fetch_and_update_model(self.batch_size) - def get_formatted_images(self, n): """ Fetch n - x image segments from run, where x is the number of 'dead' images. 
@@ -226,11 +186,13 @@ def fetch_and_update_model(self, n): def john_update_model(self, X): """ Update matrix sketch with new batch of observations + + Parameters + ---------- + X: ndarray + data to update matrix sketch with """ -# pr = cProfile.Profile() -# pr.enable() - _, numIncorp = X.shape n = self.num_incorporated_images q = self.ell @@ -249,41 +211,44 @@ def john_update_model(self, X): self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 self.num_incorporated_images += 1 - if self.rank==0: - self.dataseen.append(row) # if self.rank==0: # print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') - -# pr.disable() -# # Dump results: -# # - for binary dump -# pr.dump_stats('logging/{0}_rank_{1}.prof'.format(currRun, self.rank)) -# # - for text dump -# with open( 'logging/{0}_rank_{1}.txt'.format(currRun, self.rank), 'w') as output_file: -# sys.stdout = output_file -# pr.print_stats( sort='time' ) -# sys.stdout = sys.__stdout__ - def john_rotate(self): - try: - [_,s,Vt] = svd(self.sketch , full_matrices=False) - except LinAlgError as err: - [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) + """ + Apply Frequent Directions Algorithm to + current matrix sketch and adjoined buffer + + Notes + ----- + Based on [1] and [2]. + + [1] Frequent Directions: Simple and Deterministic Matrix + Sketching Mina Ghashami, Edo Liberty, Jeff M. Phillips, and + David P. Woodruff SIAM Journal on Computing 2016 45:5, 1762-1792 + + [2] Ghashami, M., Desai, A., Phillips, J.M. (2014). Improved + Practical Matrix Sketching with Guarantees. In: Schulz, A.S., + Wagner, D. (eds) Algorithms - ESA 2014. ESA 2014. Lecture Notes + in Computer Science, vol 8737. Springer, Berlin, Heidelberg. + https://doi.org/10.1007/978-3-662-44777-2_39 + """ + + [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) if len(s) >= self.ell: sCopy = s.copy() - sShrunk = s[:self.ell]**2 - s[self.ell-1]**2 + toShrink = s[:self.ell]**2 - s[self.ell-1]**2 #John: Explicitly set this value to be 0, since sometimes it is negative # or even turns to NaN due to roundoff error - sShrunk[-1] = 0 - sShrunk = sqrt(sShrunk) + toShrink[-1] = 0 + toShrink = sqrt(toShrink) - sShrunk[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] + toShrink[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] - self.sketch[:self.ell:,:] = dot(diag(sShrunk), Vt[:self.ell,:]) + self.sketch[:self.ell:,:] = dot(diag(toShrink), Vt[:self.ell,:]) self.sketch[self.ell:,:] = 0 self.nextZeroRow = self.ell else: @@ -291,8 +256,17 @@ def john_rotate(self): self.sketch[len(s):,:] = 0 self.nextZeroRow = len(s) - def john_reconstructionError(self): - matrixCentered = np.array(self.dataseen) + def john_reconstructionError(self, matrixCentered): + """ + Compute the reconstruction error of the matrix sketch + against given data + + Parameters + ---------- + matrixCentered: ndarray + Data to compare matrix sketch to + """ + matSketch = self.sketch k = 10 matrixCenteredT = matrixCentered.T @@ -308,8 +282,18 @@ def john_reconstructionError(self): matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) - def lowMemoryReconstructionErrorUnscaled(self): - matrixCentered = np.array(self.dataseen) + def lowMemoryReconstructionErrorUnscaled(self, matrixCentered): + """ + Compute the low memory reconstruction error of the matrix sketch + against given data. This si the same as john_reconstructionError, + but estimates the norm computation and does not scale by the matrix. 
+ + Parameters + ---------- + matrixCentered: ndarray + Data to compare matrix sketch to + """ + matSketch = self.sketch k = 10 matrixCenteredT = matrixCentered.T @@ -318,7 +302,39 @@ def lowMemoryReconstructionErrorUnscaled(self): G = U[:,:k] return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) - def estimFrobNormSquared(addMe, arrs, its): + def estimFrobNormSquared(self, addMe, arrs, its): + """ + Estimate the Frobenius Norm of product of arrs matrices + plus addME matrix using its iterations. + + Parameters + ---------- + arrs: list of ndarray + Matrices to multiply together + + addMe: ndarray + Matrix to add to others + + its: int + Number of iterations to average over + + Returns + ------- + sumMe/its*no_rows : float + Estimate of frobenius norm of produce + of arrs matrices plus addMe matrix + + Notes + ----- + Frobenius estimation is the expected value of matrix + multiplied by random vector from multivariate normal distribution + based on [1]. + + [1] Norm and Trace Estimation with Random Rank-one Vectors + Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix + Analysis and Applications 2021 42:1, 202-223 + """ + no_rows = arrs[-1].shape[1] v = np.random.normal(size=no_rows) v_hat = v / np.linalg.norm(v) @@ -334,6 +350,12 @@ def estimFrobNormSquared(addMe, arrs, its): def gatherFreqDirs(self): + """ + Gather local matrix sketches to root node and + merge local sketches together. + """ + + self.comm.Barrier() sendbuf = self.sketch[:self.ell,:] recvbuf = None if self.rank == 0: @@ -351,7 +373,6 @@ def gatherFreqDirs(self): self.nextZeroRow += 1 toReturn = self.sketch.copy() self.sketch = origMatSketch - print(toReturn) return toReturn else: return @@ -430,5 +451,3 @@ def parse_input(): pipca = PiPCA(**kwargs) pipca.run() - pipca.get_outliers() - From 851487786911e2ec62ba50de324f71ce3ae9360e Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Wed, 19 Jul 2023 13:46:12 -0700 Subject: [PATCH 04/57] Checkpoint --- btx/processing/rankAd_freqdir.py | 434 ------------------------------- 1 file changed, 434 deletions(-) delete mode 100644 btx/processing/rankAd_freqdir.py diff --git a/btx/processing/rankAd_freqdir.py b/btx/processing/rankAd_freqdir.py deleted file mode 100644 index 10e162a67..000000000 --- a/btx/processing/rankAd_freqdir.py +++ /dev/null @@ -1,434 +0,0 @@ -import os, csv, argparse - -import numpy as np -from mpi4py import MPI - -from matplotlib import pyplot as plt -from matplotlib import colors - -from btx.misc.shortcuts import TaskTimer - -from btx.interfaces.ipsana import ( - PsanaInterface, - bin_data, - bin_pixel_index_map, - retrieve_pixel_index_map, - assemble_image_stack_batch, -) - -########################################### -#John Imports -from numpy import zeros, sqrt, dot, diag -from numpy.linalg import svd, LinAlgError -from scipy.linalg import svd as scipy_svd -import numpy as np - -import time - -from datetime import datetime -currRun = datetime.now().strftime("%y%m%d%H%M%S") - -import cProfile, sys - -############################################# - -class FreqDir: - - """Frequent Directions.""" - - def __init__( - self, - john_start, - tot_imgs, - ell, - alpha, - exp, - run, - det_type, - batch_size=10, - downsample=False, - bin_factor=2, - output_dir="", - ): - - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() - - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = john_start + tot_imgs*self.rank//self.size - - self.downsample = downsample - 
self.bin_factor = bin_factor - self.output_dir = output_dir - - ( - self.num_images, - _, - self.batch_size, - self.num_features, - ) = self.set_params(tot_imgs, ell, batch_size, bin_factor) - - self.task_durations = dict({}) - - self.num_incorporated_images = 0 - self.outliers, self.pc_data = [], [] - - self.d = self.num_features - self.ell = ell - self.m = 2*self.ell - self.sketch = zeros( (self.m, self.d) ) - self.nextZeroRow = 0 - self.alpha = alpha - - self.dataseen = [] - - self.noImgsToProcess = tot_imgs//self.size - - print("MY RANK IS: {}".format(self.rank)) - - if self.rank==0: - self.totPSI = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.totPSI.counter = john_start - - def get_params(self): - """ - Method to retrieve iPCA params. - - Returns - ------- - num_incorporated_images : int - number of images used to build model - num_components : int - number of components maintained in model - batch_size : int - batch size used in model updates - num_features : int - dimensionality of incorporated images - """ - return ( - self.num_incorporated_images, - self.ell, - self.batch_size, - self.num_features, - ) - - def set_params(self, num_images, num_components, batch_size, bin_factor): - """ - Method to initialize iPCA parameters. - - Parameters - ---------- - num_images : int - Desired number of images to incorporate into model. - num_components : int - Desired number of components for model to maintain. - batch_size : int - Desired size of image block to be incorporated into model at each update. - bin_factor : int - Factor to bin data by. - - Returns - ------- - num_images : int - Number of images to incorporate into model. - num_components : int - Number of components for model to maintain. - batch_size : int - Size of image block to be incorporated into model at each update. - num_features : int - Number of features (dimension) in each image. - """ - max_events = self.psi.max_events - downsample = self.downsample - - num_images = min(num_images, max_events) if num_images != -1 else max_events - num_components = min(num_components, num_images) - batch_size = min(batch_size, num_images) - - # set d - det_shape = self.psi.det.shape() - num_features = np.prod(det_shape).astype(int) - - if downsample: - if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: - print("Invalid bin factor, toggled off downsampling.") - self.downsample = False - else: - num_features = int(num_features / bin_factor**2) - - return num_images, num_components, batch_size, num_features - - def run(self): - """ - Perform iPCA on run subject to initialization parameters. - """ - # update model with remaining batches - - for batch in range(0,self.noImgsToProcess,self.batch_size): - self.fetch_and_update_model(self.batch_size) - - - def get_formatted_images(self, n): - """ - Fetch n - x image segments from run, where x is the number of 'dead' images. - - Parameters - ---------- - n : int - number of images to retrieve - start_index : int - start index of subsection of data to retrieve - end_index : int - end index of subsection of data to retrieve - - Returns - ------- - ndarray, shape (end_index-start_index, n-x) - n-x retrieved image segments of dimension end_index-start_index - """ - - bin_factor = self.bin_factor - downsample = self.downsample - - # may have to rewrite eventually when number of images becomes large, - # i.e. 
streamed setting, either that or downsample aggressively - imgs = self.psi.get_images(n, assemble=False) - - if downsample: - imgs = bin_data(imgs, bin_factor) - - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] - - num_valid_imgs, p, x, y = imgs.shape - formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - - return formatted_imgs - - def fetch_and_update_model(self, n): - """ - Fetch images and update model. - - Parameters - ---------- - n : int - number of images to incorporate - """ - - img_batch = self.get_formatted_images(n) - - - self.john_update_model(img_batch) - - - def john_update_model(self, X): - """ - Update matrix sketch with new batch of observations - """ - -# pr = cProfile.Profile() -# pr.enable() - - _, numIncorp = X.shape - n = self.num_incorporated_images - q = self.ell - - with TaskTimer(self.task_durations, "total update"): - - if self.rank == 0: - print( - "Factoring {m} sample{s} into {n} sample, {q} component model...".format( - m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q - ) - ) - for row in X.T: - if self.nextZeroRow >= self.m: - self.john_rotate() - self.sketch[self.nextZeroRow,:] = row - self.nextZeroRow += 1 - self.num_incorporated_images += 1 - if self.rank==0: - self.dataseen.append(row) - -# if self.rank==0: -# print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') - -# pr.disable() -# # Dump results: -# # - for binary dump -# pr.dump_stats('logging/{0}_rank_{1}.prof'.format(currRun, self.rank)) -# # - for text dump -# with open( 'logging/{0}_rank_{1}.txt'.format(currRun, self.rank), 'w') as output_file: -# sys.stdout = output_file -# pr.print_stats( sort='time' ) -# sys.stdout = sys.__stdout__ - - - def john_rotate(self): - try: - [_,s,Vt] = svd(self.sketch , full_matrices=False) - except LinAlgError as err: - [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) - - if len(s) >= self.ell: - sCopy = s.copy() - - sShrunk = s[:self.ell]**2 - s[self.ell-1]**2 - #John: Explicitly set this value to be 0, since sometimes it is negative - # or even turns to NaN due to roundoff error - sShrunk[-1] = 0 - sShrunk = sqrt(sShrunk) - - sShrunk[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] - - self.sketch[:self.ell:,:] = dot(diag(sShrunk), Vt[:self.ell,:]) - self.sketch[self.ell:,:] = 0 - self.nextZeroRow = self.ell - else: - self.sketch[:len(s),:] = dot(diag(s), Vt[:len(s),:]) - self.sketch[len(s):,:] = 0 - self.nextZeroRow = len(s) - - def john_reconstructionError(self): - matrixCentered = np.array(self.dataseen) - matSketch = self.sketch - k = 10 - matrixCenteredT = matrixCentered.T - matSketchT = matSketch.T - U, S, Vt = np.linalg.svd(matSketchT) - G = U[:,:k] - UA, SA, VtA = np.linalg.svd(matrixCenteredT) - UAk = UA[:,:k] - SAk = np.diag(SA[:k]) - VtAk = VtA[:k] - Ak = UAk @ SAk @ VtAk - return (np.linalg.norm( - matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( - (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) - - def lowMemoryReconstructionErrorUnscaled(self): - matrixCentered = np.array(self.dataseen) - matSketch = self.sketch - k = 10 - matrixCenteredT = matrixCentered.T - matSketchT = matSketch.T - U, S, Vt = np.linalg.svd(matSketchT) - G = U[:,:k] - return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) - - def estimFrobNormSquared(addMe, arrs, its): - no_rows = arrs[-1].shape[1] - v = np.random.normal(size=no_rows) - v_hat = v / np.linalg.norm(v) - sumMe = 0 - for j in range(its): - v = np.random.normal(size=no_rows) - v_hat = v / 
np.linalg.norm(v) - v_addMe = addMe @ v_hat - for arr in arrs[::-1]: - v_hat = arr @ v_hat - sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 - return sumMe/its*no_rows - - - def gatherFreqDirs(self): - sendbuf = self.sketch[:self.ell,:] - recvbuf = None - if self.rank == 0: - recvbuf = np.empty( - [self.size, self.ell, self.d], dtype=np.float64) - self.comm.Gather(sendbuf, recvbuf, root=0) - if self.rank==0: - origMatSketch = self.sketch.copy() - for j in range(1, self.size): - for row in recvbuf[j]: - if(np.any(row)): - if self.nextZeroRow >= self.m: - self.john_rotate() - self.sketch[self.nextZeroRow,:] = row - self.nextZeroRow += 1 - toReturn = self.sketch.copy() - self.sketch = origMatSketch - print(toReturn) - return toReturn - else: - return - -def parse_input(): - """ - Parse command line input. - """ - - parser = argparse.ArgumentParser() - parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) - parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) - parser.add_argument( - "-d", - "--det_type", - help="Detector name, e.g epix10k2M or jungfrau4M.", - required=True, - type=str, - ) - parser.add_argument( - "--start_offset", - help="Run index of first image to be incorporated into iPCA model.", - required=False, - type=int, - ) - parser.add_argument( - "--num_components", - help="Number of principal components to compute and maintain.", - required=False, - type=int, - ) - parser.add_argument( - "--batch_size", - help="Size of image batch incorporated in each model update.", - required=False, - type=int, - ) - parser.add_argument( - "--num_images", - help="Total number of images to be incorporated into model.", - required=False, - type=int, - ) - parser.add_argument( - "--output_dir", - help="Path to output directory for recording task duration data.", - required=False, - type=str, - ) - parser.add_argument( - "--priming", - help="Initialize model with PCA.", - required=False, - action="store_true", - ) - parser.add_argument( - "--downsample", - help="Enable downsampling of images.", - required=False, - action="store_true", - ) - parser.add_argument( - "--bin_factor", - help="Bin factor if using downsizing.", - required=False, - type=int, - ) - - return parser.parse_args() - - -if __name__ == "__main__": - - params = parse_input() - kwargs = {k: v for k, v in vars(params).items() if v is not None} - - pipca = PiPCA(**kwargs) - pipca.run() - pipca.get_outliers() - From f11a31f8287e5c6e2cf1126e6d326e63cba65274 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 20 Jul 2023 13:17:18 -0700 Subject: [PATCH 05/57] Fixed gather command. 
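
Editor's note: the patch below reworks gatherFreqDirs() so that each non-root rank sends its ell sketch rows to rank 0 with comm.Send, and rank 0 folds the received rows into its own sketch using the same rotation applied during streaming. Frequent Directions sketches are mergeable, so the result approximates a sketch of the full distributed data set. The snippet below illustrates the merge without MPI; it concatenates the local sketches and applies a single shrinkage step rather than the row-by-row insertion used in the patch, the names are illustrative, and it assumes d >= ell.

    import numpy as np
    from scipy.linalg import svd as scipy_svd

    def merge_local_sketches(local_sketches, ell):
        """Merge per-rank (ell, d) FD sketches into a single (ell, d) sketch."""
        stacked = np.vstack(local_sketches)                # (n_ranks * ell, d)
        _, s, Vt = scipy_svd(stacked, full_matrices=False)
        shrunk = np.sqrt(np.maximum(s[:ell] ** 2 - s[ell - 1] ** 2, 0.0))
        return np.diag(shrunk) @ Vt[:ell, :]

    # usage: two fake "rank-local" sketches, ell = 8 rows each over d = 32 features
    rng = np.random.default_rng(0)
    local_sketches = [rng.normal(size=(8, 32)) for _ in range(2)]
    merged = merge_local_sketches(local_sketches, ell=8)   # shape (8, 32)

Receiving and folding one ell x d buffer at a time, as the patch does, keeps rank 0's peak memory near its fixed 2*ell x d working buffer instead of holding all local sketches at once.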
--- btx/processing/OLDfreqdir.py | 464 ++++++++++++++++++++++++++++++++++ btx/processing/freqdir.py | 100 ++++++-- btx/processing/rankAdaptFD.py | 464 ++++++++++++++++++++++++++++++++++ 3 files changed, 1008 insertions(+), 20 deletions(-) create mode 100644 btx/processing/OLDfreqdir.py create mode 100644 btx/processing/rankAdaptFD.py diff --git a/btx/processing/OLDfreqdir.py b/btx/processing/OLDfreqdir.py new file mode 100644 index 000000000..430ea4d22 --- /dev/null +++ b/btx/processing/OLDfreqdir.py @@ -0,0 +1,464 @@ +import os, csv, argparse + +import numpy as np +from mpi4py import MPI + +from matplotlib import pyplot as plt +from matplotlib import colors + +from btx.misc.shortcuts import TaskTimer + +from btx.interfaces.ipsana import ( + PsanaInterface, + bin_data, + bin_pixel_index_map, + retrieve_pixel_index_map, + assemble_image_stack_batch, +) + +########################################### +#John Imports +from numpy import zeros, sqrt, dot, diag +from numpy.linalg import svd, LinAlgError +from scipy.linalg import svd as scipy_svd +import numpy as np + +import time + +from datetime import datetime +currRun = datetime.now().strftime("%y%m%d%H%M%S") + +############################################# + +class FreqDir: + + """Parallel Frequent Directions.""" + + def __init__( + self, + john_start, + tot_imgs, + ell, + alpha, + exp, + run, + det_type, + downsample=False, + bin_factor=2, + output_dir="", + ): + + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = john_start + tot_imgs*self.rank//self.size + + self.downsample = downsample + self.bin_factor = bin_factor + self.output_dir = output_dir + + ( + self.num_images, + _, + self.num_features, + ) = self.set_params(tot_imgs, ell, bin_factor) + + self.task_durations = dict({}) + + self.num_incorporated_images = 0 + + self.d = self.num_features + self.ell = ell + self.m = 2*self.ell + self.sketch = zeros( (self.m, self.d) ) + self.nextZeroRow = 0 + self.alpha = alpha + + self.noImgsToProcess = tot_imgs//self.size + + def set_params(self, num_images, num_components, bin_factor): + """ + Method to initialize FreqDir parameters. + + Parameters + ---------- + num_images : int + Desired number of images to incorporate into model. + num_components : int + Desired number of components for model to maintain. + bin_factor : int + Factor to bin data by. + + Returns + ------- + num_images : int + Number of images to incorporate into model. + num_components : int + Number of components for model to maintain. + num_features : int + Number of features (dimension) in each image. + """ + + max_events = self.psi.max_events + downsample = self.downsample + + num_images = min(num_images, max_events) if num_images != -1 else max_events + num_components = min(num_components, num_images) + + # set d + det_shape = self.psi.det.shape() + num_features = np.prod(det_shape).astype(int) + + if downsample: + if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: + print("Invalid bin factor, toggled off downsampling.") + self.downsample = False + else: + num_features = int(num_features / bin_factor**2) + + return num_images, num_components, num_features + + def run(self): + """ + Perform frequent directions matrix sketching + on run subject to initialization parameters. 
+ """ + + for batch in range(0,self.noImgsToProcess,self.ell): + self.fetch_and_update_model(self.ell) + + self.comm.Barrier() + + def get_formatted_images(self, n): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. + + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + + bin_factor = self.bin_factor + downsample = self.downsample + + # may have to rewrite eventually when number of images becomes large, + # i.e. streamed setting, either that or downsample aggressively + imgs = self.psi.get_images(n, assemble=False) + + if downsample: + imgs = bin_data(imgs, bin_factor) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + + num_valid_imgs, p, x, y = imgs.shape + formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + + return formatted_imgs + + def fetch_and_update_model(self, n): + """ + Fetch images and update model. + + Parameters + ---------- + n : int + number of images to incorporate + """ + + img_batch = self.get_formatted_images(n) + + + self.john_update_model(img_batch) + + + def john_update_model(self, X): + """ + Update matrix sketch with new batch of observations + + Parameters + ---------- + X: ndarray + data to update matrix sketch with + """ + + _, numIncorp = X.shape + n = self.num_incorporated_images + q = self.ell + + with TaskTimer(self.task_durations, "total update"): + + if self.rank == 0: + print( + "Factoring {m} sample{s} into {n} sample, {q} component model...".format( + m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q + ) + ) + for row in X.T: + if self.nextZeroRow >= self.m: + self.john_rotate() + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + self.num_incorporated_images += 1 +# if self.rank==0: +# print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') + + def john_rotate(self): + """ + Apply Frequent Directions Algorithm to + current matrix sketch and adjoined buffer + + Notes + ----- + Based on [1] and [2]. + + [1] Frequent Directions: Simple and Deterministic Matrix + Sketching Mina Ghashami, Edo Liberty, Jeff M. Phillips, and + David P. Woodruff SIAM Journal on Computing 2016 45:5, 1762-1792 + + [2] Ghashami, M., Desai, A., Phillips, J.M. (2014). Improved + Practical Matrix Sketching with Guarantees. In: Schulz, A.S., + Wagner, D. (eds) Algorithms - ESA 2014. ESA 2014. Lecture Notes + in Computer Science, vol 8737. Springer, Berlin, Heidelberg. 
+ https://doi.org/10.1007/978-3-662-44777-2_39 + """ + + try: + [_,s,Vt] = svd(self.sketch , full_matrices=False) + except LinAlgError as err: + [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) + + if len(s) >= self.ell: + sCopy = s.copy() + + toShrink = s[:self.ell]**2 - s[self.ell-1]**2 + #John: Explicitly set this value to be 0, since sometimes it is negative + # or even turns to NaN due to roundoff error + toShrink[-1] = 0 + toShrink = sqrt(toShrink) + + toShrink[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] + + self.sketch[:self.ell:,:] = dot(diag(toShrink), Vt[:self.ell,:]) + self.sketch[self.ell:,:] = 0 + self.nextZeroRow = self.ell + else: + self.sketch[:len(s),:] = dot(diag(s), Vt[:len(s),:]) + self.sketch[len(s):,:] = 0 + self.nextZeroRow = len(s) + + def john_reconstructionError(self, matrixCentered): + """ + Compute the reconstruction error of the matrix sketch + against given data + + Parameters + ---------- + matrixCentered: ndarray + Data to compare matrix sketch to + """ + + matSketch = self.sketch + k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT) + G = U[:,:k] + UA, SA, VtA = np.linalg.svd(matrixCenteredT) + UAk = UA[:,:k] + SAk = np.diag(SA[:k]) + VtAk = VtA[:k] + Ak = UAk @ SAk @ VtAk + return (np.linalg.norm( + matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( + (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) + + def lowMemoryReconstructionErrorUnscaled(self, matrixCentered): + """ + Compute the low memory reconstruction error of the matrix sketch + against given data. This si the same as john_reconstructionError, + but estimates the norm computation and does not scale by the matrix. + + Parameters + ---------- + matrixCentered: ndarray + Data to compare matrix sketch to + """ + + matSketch = self.sketch + k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT) + G = U[:,:k] + return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) + + def estimFrobNormSquared(self, addMe, arrs, its): + """ + Estimate the Frobenius Norm of product of arrs matrices + plus addME matrix using its iterations. + + Parameters + ---------- + arrs: list of ndarray + Matrices to multiply together + + addMe: ndarray + Matrix to add to others + + its: int + Number of iterations to average over + + Returns + ------- + sumMe/its*no_rows : float + Estimate of frobenius norm of produce + of arrs matrices plus addMe matrix + + Notes + ----- + Frobenius estimation is the expected value of matrix + multiplied by random vector from multivariate normal distribution + based on [1]. + + [1] Norm and Trace Estimation with Random Rank-one Vectors + Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix + Analysis and Applications 2021 42:1, 202-223 + """ + + no_rows = arrs[-1].shape[1] + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + sumMe = 0 + for j in range(its): + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + v_addMe = addMe @ v_hat + for arr in arrs[::-1]: + v_hat = arr @ v_hat + sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 + return sumMe/its*no_rows + + + def gatherFreqDirs(self): + print("STARTING GATHER") + """ + Gather local matrix sketches to root node and + merge local sketches together. 
+ """ + + self.comm.Barrier() + sendbuf = self.sketch[:self.ell,:] + recvbuf = None + if self.rank == 0: + recvbuf = np.empty( + [self.size, self.ell, self.d], dtype=np.double) + self.comm.Gather(sendbuf, recvbuf, root=0) + print("{} FINISHED GATHER".format(self.rank)) + if self.rank==0: + origMatSketch = self.sketch.copy() + origNextZeroRow = self.nextZeroRow + self.nextZeroRow = self.ell + print("BUFFER SHAPE: ", recvbuf.shape) + for j in range(1, self.size): + print("CURRENT BUFFER: ", j) + print(recvbuf[j]) + for row in recvbuf[j]: + if(np.any(row)): + if self.nextZeroRow >= self.m: + self.john_rotate() + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + toReturn = self.sketch.copy() + self.sketch = origMatSketch + return toReturn + else: + return + +def parse_input(): + """ + Parse command line input. + """ + + parser = argparse.ArgumentParser() + parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) + parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) + parser.add_argument( + "-d", + "--det_type", + help="Detector name, e.g epix10k2M or jungfrau4M.", + required=True, + type=str, + ) + parser.add_argument( + "--start_offset", + help="Run index of first image to be incorporated into iPCA model.", + required=False, + type=int, + ) + parser.add_argument( + "--num_components", + help="Number of principal components to compute and maintain.", + required=False, + type=int, + ) + parser.add_argument( + "--batch_size", + help="Size of image batch incorporated in each model update.", + required=False, + type=int, + ) + parser.add_argument( + "--num_images", + help="Total number of images to be incorporated into model.", + required=False, + type=int, + ) + parser.add_argument( + "--output_dir", + help="Path to output directory for recording task duration data.", + required=False, + type=str, + ) + parser.add_argument( + "--priming", + help="Initialize model with PCA.", + required=False, + action="store_true", + ) + parser.add_argument( + "--downsample", + help="Enable downsampling of images.", + required=False, + action="store_true", + ) + parser.add_argument( + "--bin_factor", + help="Bin factor if using downsizing.", + required=False, + type=int, + ) + + return parser.parse_args() + + +if __name__ == "__main__": + + params = parse_input() + kwargs = {k: v for k, v in vars(params).items() if v is not None} + + pipca = PiPCA(**kwargs) + pipca.run() diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 7561daf0f..8d0f93ef9 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -126,8 +126,10 @@ def run(self): on run subject to initialization parameters. """ - for batch in range(0,self.noImgsToProcess,self.batch_size): - self.fetch_and_update_model(self.batch_size) + for batch in range(0,self.noImgsToProcess,self.ell): + self.fetch_and_update_model(self.ell) + + self.comm.Barrier() def get_formatted_images(self, n): """ @@ -211,14 +213,13 @@ def john_update_model(self, X): self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 self.num_incorporated_images += 1 - # if self.rank==0: # print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') def john_rotate(self): - """ - Apply Frequent Directions Algorithm to - current matrix sketch and adjoined buffer + """ + Apply Frequent Directions Algorithm to + current matrix sketch and adjoined buffer Notes ----- @@ -233,9 +234,12 @@ def john_rotate(self): Wagner, D. (eds) Algorithms - ESA 2014. ESA 2014. 
Lecture Notes in Computer Science, vol 8737. Springer, Berlin, Heidelberg. https://doi.org/10.1007/978-3-662-44777-2_39 - """ + """ - [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) + try: + [_,s,Vt] = svd(self.sketch , full_matrices=False) + except LinAlgError as err: + [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) if len(s) >= self.ell: sCopy = s.copy() @@ -350,32 +354,88 @@ def estimFrobNormSquared(self, addMe, arrs, its): def gatherFreqDirs(self): + print("STARTING GATHER") """ Gather local matrix sketches to root node and merge local sketches together. """ - - self.comm.Barrier() - sendbuf = self.sketch[:self.ell,:] - recvbuf = None +# self.sketch = np.random.rand(self.sketch.shape[0], self.sketch.shape[1]) + sendbuf = self.ell + buffSizes = np.array(self.comm.gather(sendbuf, root=0)) if self.rank == 0: - recvbuf = np.empty( - [self.size, self.ell, self.d], dtype=np.float64) - self.comm.Gather(sendbuf, recvbuf, root=0) + print("BUFF SIZES: ", buffSizes) +# data = [np.array((), dtype=np.double) for _ in range(self.size)] +# data[self.rank] = self.sketch[:self.ell, :].copy() +# if self.rank == 0: +# sizes_memory = (self.d)*buffSizes +# offsets = np.zeros(self.size) +# offsets[1:] = np.cumsum(sizes_memory)[:-1] +# +# data_out = None +# recvbuf = None +# if self.rank == 0: +# # data_out = np.empty((np.sum(buffSizes), fd.d), dtype=np.float32) +# data_out = np.empty((np.sum(buffSizes), self.d)) +# recvbuf=[data_out, sizes_memory.tolist(), offsets.tolist(), MPI.DOUBLE] +# +# self.comm.Barrier() +# self.comm.Gatherv(data[self.rank],recvbuf = recvbuf, root=0) +# self.comm.Barrier() +# print("{} FINISHED GATHERV".format(self.rank)) + if self.rank==0: origMatSketch = self.sketch.copy() - for j in range(1, self.size): - for row in recvbuf[j]: + origNextZeroRow = self.nextZeroRow + self.nextZeroRow = self.ell + counter = 0 + for proc in range(1, self.size): + bufferMe = np.empty(self.ell*self.d, dtype=np.double) + self.comm.Recv(bufferMe, source=proc, tag=13) + bufferMe = np.reshape(bufferMe, (self.ell, self.d)) + for row in bufferMe: if(np.any(row)): if self.nextZeroRow >= self.m: self.john_rotate() - self.sketch[self.nextZeroRow,:] = row - self.nextZeroRow += 1 + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + counter += 1 + print("DATA PROCESSED: {}".format(counter)) toReturn = self.sketch.copy() + print("COMPLETED MERGE PROCESS: ", toReturn) self.sketch = origMatSketch return toReturn else: - return + bufferMe = self.sketch[:self.ell, :].copy().flatten() + self.comm.Send(bufferMe, dest=0, tag=13) + return + +# self.comm.Barrier() +# sendbuf = self.sketch[:self.ell,:] +# recvbuf = None +# if self.rank == 0: +# recvbuf = np.empty( +# [self.size, self.ell, self.d], dtype=np.float32) +# self.comm.Gather(sendbuf, recvbuf, root=0) +# print("{} FINISHED GATHER".format(self.rank)) +# if self.rank==0: +# origMatSketch = self.sketch.copy() +# origNextZeroRow = self.nextZeroRow +# self.nextZeroRow = self.ell +# print("BUFFER SHAPE: ", recvbuf.shape) +# for j in range(1, self.size): +# print("CURRENT BUFFER: ", j) +# print(recvbuf[j]) +# for row in recvbuf[j]: +# if(np.any(row)): +# if self.nextZeroRow >= self.m: +# self.john_rotate() +# self.sketch[self.nextZeroRow,:] = row +# self.nextZeroRow += 1 +# toReturn = self.sketch.copy() +# self.sketch = origMatSketch +# return toReturn +# else: +# return def parse_input(): """ diff --git a/btx/processing/rankAdaptFD.py b/btx/processing/rankAdaptFD.py new file mode 100644 index 000000000..16575469d --- /dev/null +++ 
b/btx/processing/rankAdaptFD.py @@ -0,0 +1,464 @@ +import os, csv, argparse + +import numpy as np +from mpi4py import MPI + +from matplotlib import pyplot as plt +from matplotlib import colors + +from btx.misc.shortcuts import TaskTimer + +from btx.interfaces.ipsana import ( + PsanaInterface, + bin_data, + bin_pixel_index_map, + retrieve_pixel_index_map, + assemble_image_stack_batch, +) + +########################################### +#John Imports +from numpy import zeros, sqrt, dot, diag +from numpy.linalg import svd, LinAlgError +from scipy.linalg import svd as scipy_svd +import numpy as np + +import time + +from datetime import datetime +currRun = datetime.now().strftime("%y%m%d%H%M%S") + +############################################# + +class FreqDir: + + """Parallel Frequent Directions.""" + + def __init__( + self, + john_start, + tot_imgs, + ell, + alpha, + exp, + run, + det_type, + downsample=False, + bin_factor=2, + output_dir="", + ): + + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = john_start + tot_imgs*self.rank//self.size + + self.downsample = downsample + self.bin_factor = bin_factor + self.output_dir = output_dir + + ( + self.num_images, + _, + self.num_features, + ) = self.set_params(tot_imgs, ell, bin_factor) + + self.task_durations = dict({}) + + self.num_incorporated_images = 0 + + self.d = self.num_features + self.ell = ell + self.m = 2*self.ell + self.sketch = zeros( (self.m, self.d) ) + self.nextZeroRow = 0 + self.alpha = alpha + + self.noImgsToProcess = tot_imgs//self.size + + def set_params(self, num_images, num_components, bin_factor): + """ + Method to initialize FreqDir parameters. + + Parameters + ---------- + num_images : int + Desired number of images to incorporate into model. + num_components : int + Desired number of components for model to maintain. + bin_factor : int + Factor to bin data by. + + Returns + ------- + num_images : int + Number of images to incorporate into model. + num_components : int + Number of components for model to maintain. + num_features : int + Number of features (dimension) in each image. + """ + + max_events = self.psi.max_events + downsample = self.downsample + + num_images = min(num_images, max_events) if num_images != -1 else max_events + num_components = min(num_components, num_images) + + # set d + det_shape = self.psi.det.shape() + num_features = np.prod(det_shape).astype(int) + + if downsample: + if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: + print("Invalid bin factor, toggled off downsampling.") + self.downsample = False + else: + num_features = int(num_features / bin_factor**2) + + return num_images, num_components, num_features + + def run(self): + """ + Perform frequent directions matrix sketching + on run subject to initialization parameters. + """ + + for batch in range(0,self.noImgsToProcess,self.ell): + self.fetch_and_update_model(self.ell) + + self.comm.Barrier() + + def get_formatted_images(self, n): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. 
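# A quick standalone illustration (not part of the patch) of the per-rank work
# split set up in __init__ above: each of `size` ranks starts its PsanaInterface
# counter at john_start + tot_imgs*rank//size and processes tot_imgs//size
# images, so consecutive ranks cover disjoint, nearly equal slices of the run.
# The names john_start/tot_imgs mirror the constructor arguments; everything
# else below is illustrative only.

def partition_run(john_start, tot_imgs, size):
    """Return (start, count) per rank for a run of tot_imgs images."""
    slices = []
    for rank in range(size):
        start = john_start + tot_imgs * rank // size
        count = tot_imgs // size
        slices.append((start, count))
    return slices

# Example: 1000 images split over 4 ranks, starting at image 50
# -> [(50, 250), (300, 250), (550, 250), (800, 250)]
print(partition_run(50, 1000, 4))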
+ + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + + bin_factor = self.bin_factor + downsample = self.downsample + + # may have to rewrite eventually when number of images becomes large, + # i.e. streamed setting, either that or downsample aggressively + imgs = self.psi.get_images(n, assemble=False) + + if downsample: + imgs = bin_data(imgs, bin_factor) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + + num_valid_imgs, p, x, y = imgs.shape + formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + + return formatted_imgs + + def fetch_and_update_model(self, n): + """ + Fetch images and update model. + + Parameters + ---------- + n : int + number of images to incorporate + """ + + img_batch = self.get_formatted_images(n) + + + self.john_update_model(img_batch) + + + def john_update_model(self, X): + """ + Update matrix sketch with new batch of observations + + Parameters + ---------- + X: ndarray + data to update matrix sketch with + """ + + _, numIncorp = X.shape + n = self.num_incorporated_images + q = self.ell + + with TaskTimer(self.task_durations, "total update"): + + if self.rank == 0: + print( + "Factoring {m} sample{s} into {n} sample, {q} component model...".format( + m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q + ) + ) + for row in X.T: + if self.nextZeroRow >= self.m: + self.john_rotate() + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + self.num_incorporated_images += 1 +# if self.rank==0: +# print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') + + def john_rotate(self): + """ + Apply Frequent Directions Algorithm to + current matrix sketch and adjoined buffer + + Notes + ----- + Based on [1] and [2]. + + [1] Frequent Directions: Simple and Deterministic Matrix + Sketching Mina Ghashami, Edo Liberty, Jeff M. Phillips, and + David P. Woodruff SIAM Journal on Computing 2016 45:5, 1762-1792 + + [2] Ghashami, M., Desai, A., Phillips, J.M. (2014). Improved + Practical Matrix Sketching with Guarantees. In: Schulz, A.S., + Wagner, D. (eds) Algorithms - ESA 2014. ESA 2014. Lecture Notes + in Computer Science, vol 8737. Springer, Berlin, Heidelberg. 
+ https://doi.org/10.1007/978-3-662-44777-2_39 + """ + + try: + [_,s,Vt] = svd(self.sketch , full_matrices=False) + except LinAlgError as err: + [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) + + if len(s) >= self.ell: + sCopy = s.copy() + + toShrink = s[:self.ell]**2 - s[self.ell-1]**2 + #John: Explicitly set this value to be 0, since sometimes it is negative + # or even turns to NaN due to roundoff error + toShrink[-1] = 0 + toShrink = sqrt(toShrink) + + toShrink[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] + + self.sketch[:self.ell:,:] = dot(diag(toShrink), Vt[:self.ell,:]) + self.sketch[self.ell:,:] = 0 + self.nextZeroRow = self.ell + else: + self.sketch[:len(s),:] = dot(diag(s), Vt[:len(s),:]) + self.sketch[len(s):,:] = 0 + self.nextZeroRow = len(s) + + def john_reconstructionError(self, matrixCentered): + """ + Compute the reconstruction error of the matrix sketch + against given data + + Parameters + ---------- + matrixCentered: ndarray + Data to compare matrix sketch to + """ + + matSketch = self.sketch + k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT) + G = U[:,:k] + UA, SA, VtA = np.linalg.svd(matrixCenteredT) + UAk = UA[:,:k] + SAk = np.diag(SA[:k]) + VtAk = VtA[:k] + Ak = UAk @ SAk @ VtAk + return (np.linalg.norm( + matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( + (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) + + def lowMemoryReconstructionErrorUnscaled(self, matrixCentered): + """ + Compute the low memory reconstruction error of the matrix sketch + against given data. This si the same as john_reconstructionError, + but estimates the norm computation and does not scale by the matrix. + + Parameters + ---------- + matrixCentered: ndarray + Data to compare matrix sketch to + """ + + matSketch = self.sketch + k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT) + G = U[:,:k] + return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) + + def estimFrobNormSquared(self, addMe, arrs, its): + """ + Estimate the Frobenius Norm of product of arrs matrices + plus addME matrix using its iterations. + + Parameters + ---------- + arrs: list of ndarray + Matrices to multiply together + + addMe: ndarray + Matrix to add to others + + its: int + Number of iterations to average over + + Returns + ------- + sumMe/its*no_rows : float + Estimate of frobenius norm of produce + of arrs matrices plus addMe matrix + + Notes + ----- + Frobenius estimation is the expected value of matrix + multiplied by random vector from multivariate normal distribution + based on [1]. + + [1] Norm and Trace Estimation with Random Rank-one Vectors + Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix + Analysis and Applications 2021 42:1, 202-223 + """ + + no_rows = arrs[-1].shape[1] + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + sumMe = 0 + for j in range(its): + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + v_addMe = addMe @ v_hat + for arr in arrs[::-1]: + v_hat = arr @ v_hat + sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 + return sumMe/its*no_rows + + + def gatherFreqDirs(self): + print("STARTING GATHER") + """ + Gather local matrix sketches to root node and + merge local sketches together. 
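# A note on estimFrobNormSquared above: it relies on the fact that for a unit
# vector v_hat drawn uniformly from the sphere in R^n, the expectation of
# n * ||M @ v_hat||**2 equals ||M||_F**2, so a handful of random probes gives a
# cheap norm estimate without ever forming M.  The standalone NumPy sketch
# below (illustrative only; none of these names come from the patch) applies
# this to the projection residual that the reconstruction-error methods measure.

import numpy as np

def estim_frob_norm_squared(apply_M, n_cols, its=10, rng=None):
    """Estimate ||M||_F^2 given only a routine apply_M(v) returning M @ v."""
    rng = np.random.default_rng() if rng is None else rng
    total = 0.0
    for _ in range(its):
        v = rng.normal(size=n_cols)
        v_hat = v / np.linalg.norm(v)      # uniform direction on the unit sphere
        total += np.linalg.norm(apply_M(v_hat)) ** 2
    return total / its * n_cols

# Example: estimate ||A - G G^T A||_F^2, applying the product one factor at a
# time (as the class method does) instead of materializing the residual matrix.
rng = np.random.default_rng(0)
A = rng.normal(size=(500, 200))
U, _, _ = np.linalg.svd(A + 0.1 * rng.normal(size=A.shape), full_matrices=False)
G = U[:, :10]                              # top-10 directions of a noisy "sketch"

def apply_residual(v):
    """Apply (A - G G^T A) to a vector v, one factor at a time."""
    av = A @ v
    return av - G @ (G.T @ av)

approx = estim_frob_norm_squared(apply_residual, A.shape[1], its=50, rng=rng)
exact = np.linalg.norm(A - G @ (G.T @ A), 'fro') ** 2
print(f"estimate {approx:.1f} vs exact {exact:.1f}")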
+ """ + + self.comm.Barrier() + sendbuf = self.sketch[:self.ell,:] + recvbuf = None + if self.rank == 0: + recvbuf = np.empty( + [self.size, self.ell, self.d], dtype=np.float32) + self.comm.Gather(sendbuf, recvbuf, root=0) + print("{} FINISHED GATHER".format(self.rank)) + if self.rank==0: + origMatSketch = self.sketch.copy() + origNextZeroRow = self.nextZeroRow + self.nextZeroRow = self.ell + print("BUFFER SHAPE: ", recvbuf.shape) + for j in range(1, self.size): + print("CURRENT BUFFER: ", j) + print(recvbuf[j]) + for row in recvbuf[j]: + if(np.any(row)): + if self.nextZeroRow >= self.m: + self.john_rotate() + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + toReturn = self.sketch.copy() + self.sketch = origMatSketch + return toReturn + else: + return + +def parse_input(): + """ + Parse command line input. + """ + + parser = argparse.ArgumentParser() + parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) + parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) + parser.add_argument( + "-d", + "--det_type", + help="Detector name, e.g epix10k2M or jungfrau4M.", + required=True, + type=str, + ) + parser.add_argument( + "--start_offset", + help="Run index of first image to be incorporated into iPCA model.", + required=False, + type=int, + ) + parser.add_argument( + "--num_components", + help="Number of principal components to compute and maintain.", + required=False, + type=int, + ) + parser.add_argument( + "--batch_size", + help="Size of image batch incorporated in each model update.", + required=False, + type=int, + ) + parser.add_argument( + "--num_images", + help="Total number of images to be incorporated into model.", + required=False, + type=int, + ) + parser.add_argument( + "--output_dir", + help="Path to output directory for recording task duration data.", + required=False, + type=str, + ) + parser.add_argument( + "--priming", + help="Initialize model with PCA.", + required=False, + action="store_true", + ) + parser.add_argument( + "--downsample", + help="Enable downsampling of images.", + required=False, + action="store_true", + ) + parser.add_argument( + "--bin_factor", + help="Bin factor if using downsizing.", + required=False, + type=int, + ) + + return parser.parse_args() + + +if __name__ == "__main__": + + params = parse_input() + kwargs = {k: v for k, v in vars(params).items() if v is not None} + + pipca = PiPCA(**kwargs) + pipca.run() From be8d31264df9624029824423548010ab6fa0e896 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 20 Jul 2023 19:48:30 -0700 Subject: [PATCH 06/57] checkpoint --- btx/processing/freqdir.py | 8 +- btx/processing/rankAdaptFD.py | 464 ---------------------------------- 2 files changed, 4 insertions(+), 468 deletions(-) delete mode 100644 btx/processing/rankAdaptFD.py diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 8d0f93ef9..5bf1e8e3d 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -354,7 +354,7 @@ def estimFrobNormSquared(self, addMe, arrs, its): def gatherFreqDirs(self): - print("STARTING GATHER") +# print("STARTING GATHER") """ Gather local matrix sketches to root node and merge local sketches together. 
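# The hunks around this point replace the collective Gather of sketches with a
# gather of sketch sizes plus explicit Send/Recv, feeding each received row back
# through the shrink step.  Conceptually, merging Frequent Directions sketches
# is just sketching their concatenation; the standalone example below (plain FD
# shrink, i.e. the alpha = 1 case of john_rotate, written independently of the
# class) shows that idea on random data split between two "ranks".

import numpy as np

def fd_shrink(stacked, ell):
    """Reduce a (>= ell)-row matrix to an ell-row Frequent Directions sketch."""
    _, s, Vt = np.linalg.svd(stacked, full_matrices=False)
    delta = s[ell - 1] ** 2 if len(s) >= ell else 0.0
    s_shrunk = np.sqrt(np.maximum(s[:ell] ** 2 - delta, 0.0))
    return np.diag(s_shrunk) @ Vt[:ell, :]

def merge_sketches(sketch_a, sketch_b, ell):
    """Merge two local ell x d sketches into one ell x d sketch."""
    return fd_shrink(np.vstack([sketch_a, sketch_b]), ell)

rng = np.random.default_rng(1)
X1, X2 = rng.normal(size=(60, 40)), rng.normal(size=(60, 40))
ell = 8
merged = merge_sketches(fd_shrink(X1, ell), fd_shrink(X2, ell), ell)
print(merged.shape)   # (8, 40)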
@@ -362,8 +362,8 @@ def gatherFreqDirs(self): # self.sketch = np.random.rand(self.sketch.shape[0], self.sketch.shape[1]) sendbuf = self.ell buffSizes = np.array(self.comm.gather(sendbuf, root=0)) - if self.rank == 0: - print("BUFF SIZES: ", buffSizes) +# if self.rank == 0: +# print("BUFF SIZES: ", buffSizes) # data = [np.array((), dtype=np.double) for _ in range(self.size)] # data[self.rank] = self.sketch[:self.ell, :].copy() # if self.rank == 0: @@ -399,7 +399,7 @@ def gatherFreqDirs(self): self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 counter += 1 - print("DATA PROCESSED: {}".format(counter)) +# print("DATA PROCESSED: {}".format(counter)) toReturn = self.sketch.copy() print("COMPLETED MERGE PROCESS: ", toReturn) self.sketch = origMatSketch diff --git a/btx/processing/rankAdaptFD.py b/btx/processing/rankAdaptFD.py deleted file mode 100644 index 16575469d..000000000 --- a/btx/processing/rankAdaptFD.py +++ /dev/null @@ -1,464 +0,0 @@ -import os, csv, argparse - -import numpy as np -from mpi4py import MPI - -from matplotlib import pyplot as plt -from matplotlib import colors - -from btx.misc.shortcuts import TaskTimer - -from btx.interfaces.ipsana import ( - PsanaInterface, - bin_data, - bin_pixel_index_map, - retrieve_pixel_index_map, - assemble_image_stack_batch, -) - -########################################### -#John Imports -from numpy import zeros, sqrt, dot, diag -from numpy.linalg import svd, LinAlgError -from scipy.linalg import svd as scipy_svd -import numpy as np - -import time - -from datetime import datetime -currRun = datetime.now().strftime("%y%m%d%H%M%S") - -############################################# - -class FreqDir: - - """Parallel Frequent Directions.""" - - def __init__( - self, - john_start, - tot_imgs, - ell, - alpha, - exp, - run, - det_type, - downsample=False, - bin_factor=2, - output_dir="", - ): - - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() - - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = john_start + tot_imgs*self.rank//self.size - - self.downsample = downsample - self.bin_factor = bin_factor - self.output_dir = output_dir - - ( - self.num_images, - _, - self.num_features, - ) = self.set_params(tot_imgs, ell, bin_factor) - - self.task_durations = dict({}) - - self.num_incorporated_images = 0 - - self.d = self.num_features - self.ell = ell - self.m = 2*self.ell - self.sketch = zeros( (self.m, self.d) ) - self.nextZeroRow = 0 - self.alpha = alpha - - self.noImgsToProcess = tot_imgs//self.size - - def set_params(self, num_images, num_components, bin_factor): - """ - Method to initialize FreqDir parameters. - - Parameters - ---------- - num_images : int - Desired number of images to incorporate into model. - num_components : int - Desired number of components for model to maintain. - bin_factor : int - Factor to bin data by. - - Returns - ------- - num_images : int - Number of images to incorporate into model. - num_components : int - Number of components for model to maintain. - num_features : int - Number of features (dimension) in each image. 
- """ - - max_events = self.psi.max_events - downsample = self.downsample - - num_images = min(num_images, max_events) if num_images != -1 else max_events - num_components = min(num_components, num_images) - - # set d - det_shape = self.psi.det.shape() - num_features = np.prod(det_shape).astype(int) - - if downsample: - if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: - print("Invalid bin factor, toggled off downsampling.") - self.downsample = False - else: - num_features = int(num_features / bin_factor**2) - - return num_images, num_components, num_features - - def run(self): - """ - Perform frequent directions matrix sketching - on run subject to initialization parameters. - """ - - for batch in range(0,self.noImgsToProcess,self.ell): - self.fetch_and_update_model(self.ell) - - self.comm.Barrier() - - def get_formatted_images(self, n): - """ - Fetch n - x image segments from run, where x is the number of 'dead' images. - - Parameters - ---------- - n : int - number of images to retrieve - start_index : int - start index of subsection of data to retrieve - end_index : int - end index of subsection of data to retrieve - - Returns - ------- - ndarray, shape (end_index-start_index, n-x) - n-x retrieved image segments of dimension end_index-start_index - """ - - bin_factor = self.bin_factor - downsample = self.downsample - - # may have to rewrite eventually when number of images becomes large, - # i.e. streamed setting, either that or downsample aggressively - imgs = self.psi.get_images(n, assemble=False) - - if downsample: - imgs = bin_data(imgs, bin_factor) - - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] - - num_valid_imgs, p, x, y = imgs.shape - formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - - return formatted_imgs - - def fetch_and_update_model(self, n): - """ - Fetch images and update model. - - Parameters - ---------- - n : int - number of images to incorporate - """ - - img_batch = self.get_formatted_images(n) - - - self.john_update_model(img_batch) - - - def john_update_model(self, X): - """ - Update matrix sketch with new batch of observations - - Parameters - ---------- - X: ndarray - data to update matrix sketch with - """ - - _, numIncorp = X.shape - n = self.num_incorporated_images - q = self.ell - - with TaskTimer(self.task_durations, "total update"): - - if self.rank == 0: - print( - "Factoring {m} sample{s} into {n} sample, {q} component model...".format( - m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q - ) - ) - for row in X.T: - if self.nextZeroRow >= self.m: - self.john_rotate() - self.sketch[self.nextZeroRow,:] = row - self.nextZeroRow += 1 - self.num_incorporated_images += 1 -# if self.rank==0: -# print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') - - def john_rotate(self): - """ - Apply Frequent Directions Algorithm to - current matrix sketch and adjoined buffer - - Notes - ----- - Based on [1] and [2]. - - [1] Frequent Directions: Simple and Deterministic Matrix - Sketching Mina Ghashami, Edo Liberty, Jeff M. Phillips, and - David P. Woodruff SIAM Journal on Computing 2016 45:5, 1762-1792 - - [2] Ghashami, M., Desai, A., Phillips, J.M. (2014). Improved - Practical Matrix Sketching with Guarantees. In: Schulz, A.S., - Wagner, D. (eds) Algorithms - ESA 2014. ESA 2014. Lecture Notes - in Computer Science, vol 8737. Springer, Berlin, Heidelberg. 
- https://doi.org/10.1007/978-3-662-44777-2_39 - """ - - try: - [_,s,Vt] = svd(self.sketch , full_matrices=False) - except LinAlgError as err: - [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) - - if len(s) >= self.ell: - sCopy = s.copy() - - toShrink = s[:self.ell]**2 - s[self.ell-1]**2 - #John: Explicitly set this value to be 0, since sometimes it is negative - # or even turns to NaN due to roundoff error - toShrink[-1] = 0 - toShrink = sqrt(toShrink) - - toShrink[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] - - self.sketch[:self.ell:,:] = dot(diag(toShrink), Vt[:self.ell,:]) - self.sketch[self.ell:,:] = 0 - self.nextZeroRow = self.ell - else: - self.sketch[:len(s),:] = dot(diag(s), Vt[:len(s),:]) - self.sketch[len(s):,:] = 0 - self.nextZeroRow = len(s) - - def john_reconstructionError(self, matrixCentered): - """ - Compute the reconstruction error of the matrix sketch - against given data - - Parameters - ---------- - matrixCentered: ndarray - Data to compare matrix sketch to - """ - - matSketch = self.sketch - k = 10 - matrixCenteredT = matrixCentered.T - matSketchT = matSketch.T - U, S, Vt = np.linalg.svd(matSketchT) - G = U[:,:k] - UA, SA, VtA = np.linalg.svd(matrixCenteredT) - UAk = UA[:,:k] - SAk = np.diag(SA[:k]) - VtAk = VtA[:k] - Ak = UAk @ SAk @ VtAk - return (np.linalg.norm( - matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( - (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) - - def lowMemoryReconstructionErrorUnscaled(self, matrixCentered): - """ - Compute the low memory reconstruction error of the matrix sketch - against given data. This si the same as john_reconstructionError, - but estimates the norm computation and does not scale by the matrix. - - Parameters - ---------- - matrixCentered: ndarray - Data to compare matrix sketch to - """ - - matSketch = self.sketch - k = 10 - matrixCenteredT = matrixCentered.T - matSketchT = matSketch.T - U, S, Vt = np.linalg.svd(matSketchT) - G = U[:,:k] - return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) - - def estimFrobNormSquared(self, addMe, arrs, its): - """ - Estimate the Frobenius Norm of product of arrs matrices - plus addME matrix using its iterations. - - Parameters - ---------- - arrs: list of ndarray - Matrices to multiply together - - addMe: ndarray - Matrix to add to others - - its: int - Number of iterations to average over - - Returns - ------- - sumMe/its*no_rows : float - Estimate of frobenius norm of produce - of arrs matrices plus addMe matrix - - Notes - ----- - Frobenius estimation is the expected value of matrix - multiplied by random vector from multivariate normal distribution - based on [1]. - - [1] Norm and Trace Estimation with Random Rank-one Vectors - Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix - Analysis and Applications 2021 42:1, 202-223 - """ - - no_rows = arrs[-1].shape[1] - v = np.random.normal(size=no_rows) - v_hat = v / np.linalg.norm(v) - sumMe = 0 - for j in range(its): - v = np.random.normal(size=no_rows) - v_hat = v / np.linalg.norm(v) - v_addMe = addMe @ v_hat - for arr in arrs[::-1]: - v_hat = arr @ v_hat - sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 - return sumMe/its*no_rows - - - def gatherFreqDirs(self): - print("STARTING GATHER") - """ - Gather local matrix sketches to root node and - merge local sketches together. 
- """ - - self.comm.Barrier() - sendbuf = self.sketch[:self.ell,:] - recvbuf = None - if self.rank == 0: - recvbuf = np.empty( - [self.size, self.ell, self.d], dtype=np.float32) - self.comm.Gather(sendbuf, recvbuf, root=0) - print("{} FINISHED GATHER".format(self.rank)) - if self.rank==0: - origMatSketch = self.sketch.copy() - origNextZeroRow = self.nextZeroRow - self.nextZeroRow = self.ell - print("BUFFER SHAPE: ", recvbuf.shape) - for j in range(1, self.size): - print("CURRENT BUFFER: ", j) - print(recvbuf[j]) - for row in recvbuf[j]: - if(np.any(row)): - if self.nextZeroRow >= self.m: - self.john_rotate() - self.sketch[self.nextZeroRow,:] = row - self.nextZeroRow += 1 - toReturn = self.sketch.copy() - self.sketch = origMatSketch - return toReturn - else: - return - -def parse_input(): - """ - Parse command line input. - """ - - parser = argparse.ArgumentParser() - parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) - parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) - parser.add_argument( - "-d", - "--det_type", - help="Detector name, e.g epix10k2M or jungfrau4M.", - required=True, - type=str, - ) - parser.add_argument( - "--start_offset", - help="Run index of first image to be incorporated into iPCA model.", - required=False, - type=int, - ) - parser.add_argument( - "--num_components", - help="Number of principal components to compute and maintain.", - required=False, - type=int, - ) - parser.add_argument( - "--batch_size", - help="Size of image batch incorporated in each model update.", - required=False, - type=int, - ) - parser.add_argument( - "--num_images", - help="Total number of images to be incorporated into model.", - required=False, - type=int, - ) - parser.add_argument( - "--output_dir", - help="Path to output directory for recording task duration data.", - required=False, - type=str, - ) - parser.add_argument( - "--priming", - help="Initialize model with PCA.", - required=False, - action="store_true", - ) - parser.add_argument( - "--downsample", - help="Enable downsampling of images.", - required=False, - action="store_true", - ) - parser.add_argument( - "--bin_factor", - help="Bin factor if using downsizing.", - required=False, - type=int, - ) - - return parser.parse_args() - - -if __name__ == "__main__": - - params = parse_input() - kwargs = {k: v for k, v in vars(params).items() if v is not None} - - pipca = PiPCA(**kwargs) - pipca.run() From 7b26c0eadc2bc87edec1fd8a4086a97f06fe2088 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Fri, 21 Jul 2023 06:12:32 -0700 Subject: [PATCH 07/57] Added rank adaptive and tree merge. 
Both have not been tested yet --- .gitignore | 4 +- btx/processing/freqdir.py | 203 ++++++++++++++++++++++++-------------- 2 files changed, 130 insertions(+), 77 deletions(-) diff --git a/.gitignore b/.gitignore index bfec190cb..8d2dadd1c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,6 @@ adhoc/ __pycache__/ # cli -tmp yaml -tutorial/*-tmp.yaml \ No newline at end of file +tutorial/*-tmp.yaml + +*.h5 diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 5bf1e8e3d..2720a0717 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -28,6 +28,7 @@ from datetime import datetime currRun = datetime.now().strftime("%y%m%d%H%M%S") +import h5py ############################################# class FreqDir: @@ -43,31 +44,41 @@ def __init__( exp, run, det_type, + rankAdapt, + merger=False, + mergerFeatures=0, downsample=False, bin_factor=2, output_dir="", ): + self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = john_start + tot_imgs*self.rank//self.size + if not merger: + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = john_start + tot_imgs*self.rank//self.size - self.downsample = downsample - self.bin_factor = bin_factor - self.output_dir = output_dir + self.downsample = downsample + self.bin_factor = bin_factor + self.output_dir = output_dir - ( - self.num_images, - _, - self.num_features, - ) = self.set_params(tot_imgs, ell, bin_factor) + ( + self.num_images, + _, + self.num_features, + ) = self.set_params(tot_imgs, ell, bin_factor) - self.task_durations = dict({}) + self.task_durations = dict({}) - self.num_incorporated_images = 0 + self.num_incorporated_images = 0 + else: + #JOHN: NEED TO IMPROVE. CURRENTLY, NEED TO MANUALLY SET d, WHICH IS UNACCEPTABLE. + self.num_features = mergerFeatures + self.task_durations = dict({}) + self.num_incorporated_images = 0 self.d = self.num_features self.ell = ell @@ -78,6 +89,9 @@ def __init__( self.noImgsToProcess = tot_imgs//self.size + self.rankAdapt = rankAdapt + self.increaseEll = False + def set_params(self, num_images, num_components, bin_factor): """ Method to initialize FreqDir parameters. 
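# Simplified, standalone illustration of the rank-adaptive control flow that the
# next hunk adds to john_update_model (not the patch's exact code): rows are
# appended to a 2*ell buffer; whenever it fills, the buffer is shrunk back to
# ell rows, and if too much of the discarded half's energy falls outside the
# retained subspace, ell is enlarged before later shrinks.  The 0.08 error
# threshold and the +10 rank increment mirror the values in the patch; the
# `relative_error` helper is a deliberately plain stand-in for the class's
# low-memory reconstruction-error estimate.

import numpy as np

def relative_error(sketch_rows, data):
    """Fraction of `data`'s Frobenius norm left outside the sketch's row span."""
    _, _, Vt = np.linalg.svd(sketch_rows, full_matrices=False)
    resid = data - data @ Vt.T @ Vt
    return np.linalg.norm(resid) / max(np.linalg.norm(data), 1e-12)

def rank_adaptive_sketch(stream, ell=10, grow=10, err_tol=0.08):
    d = stream.shape[1]
    sketch = np.zeros((2 * ell, d))
    next_zero = 0
    for row in stream:
        if next_zero >= sketch.shape[0]:
            discarded = sketch[ell:, :].copy()        # rows consumed by the shrink
            _, s, Vt = np.linalg.svd(sketch, full_matrices=False)
            shrunk = np.sqrt(np.maximum(s[:ell] ** 2 - s[ell - 1] ** 2, 0.0))
            sketch[:ell] = np.diag(shrunk) @ Vt[:ell]
            sketch[ell:] = 0.0
            next_zero = ell
            if relative_error(sketch[:ell], discarded) > err_tol:
                ell += grow                            # grow the sketch rank
                sketch = np.vstack([sketch, np.zeros((2 * grow, d))])
        sketch[next_zero] = row
        next_zero += 1
    return sketch[:ell]

print(rank_adaptive_sketch(np.random.default_rng(2).normal(size=(300, 50))).shape)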
@@ -196,23 +210,36 @@ def john_update_model(self, X): """ _, numIncorp = X.shape - n = self.num_incorporated_images - q = self.ell - +# n = self.num_incorporated_images +# q = self.ell +# with TaskTimer(self.task_durations, "total update"): - if self.rank == 0: - print( - "Factoring {m} sample{s} into {n} sample, {q} component model...".format( - m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q - ) - ) +# if self.rank == 0: +# print( +# "Factoring {m} sample{s} into {n} sample, {q} component model...".format( +# m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q +# ) +# ) for row in X.T: + canRankAdapt = numIncorp > (self.ell + 15) if self.nextZeroRow >= self.m: - self.john_rotate() + if self.increaseEll and canRankAdapt and self.rankAdapt: + self.ell = self.ell + 10 + self.m = 2*self.ell + self.sketch = np.vstack((*self.sketch, np.zeros((20, self.d)))) + self.increaseEll = False + else: + copyBatch = self.sketch[self.ell:,:].copy() + self.john_rotate() + if canRankAdapt and self.rankAdapt: + reconError = self.lowMemoryReconstructionErrorUnscaled(copyBatch) + if (np.sqrt(reconError) > 0.08): + self.increaseEll = True self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 self.num_incorporated_images += 1 + numIncorp -= 1 # if self.rank==0: # print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') @@ -354,34 +381,12 @@ def estimFrobNormSquared(self, addMe, arrs, its): def gatherFreqDirs(self): -# print("STARTING GATHER") """ Gather local matrix sketches to root node and merge local sketches together. """ -# self.sketch = np.random.rand(self.sketch.shape[0], self.sketch.shape[1]) sendbuf = self.ell - buffSizes = np.array(self.comm.gather(sendbuf, root=0)) -# if self.rank == 0: -# print("BUFF SIZES: ", buffSizes) -# data = [np.array((), dtype=np.double) for _ in range(self.size)] -# data[self.rank] = self.sketch[:self.ell, :].copy() -# if self.rank == 0: -# sizes_memory = (self.d)*buffSizes -# offsets = np.zeros(self.size) -# offsets[1:] = np.cumsum(sizes_memory)[:-1] -# -# data_out = None -# recvbuf = None -# if self.rank == 0: -# # data_out = np.empty((np.sum(buffSizes), fd.d), dtype=np.float32) -# data_out = np.empty((np.sum(buffSizes), self.d)) -# recvbuf=[data_out, sizes_memory.tolist(), offsets.tolist(), MPI.DOUBLE] -# -# self.comm.Barrier() -# self.comm.Gatherv(data[self.rank],recvbuf = recvbuf, root=0) -# self.comm.Barrier() -# print("{} FINISHED GATHERV".format(self.rank)) + buffSizes = np.array(self.comm.allgather(sendbuf)) if self.rank==0: origMatSketch = self.sketch.copy() @@ -389,9 +394,9 @@ def gatherFreqDirs(self): self.nextZeroRow = self.ell counter = 0 for proc in range(1, self.size): - bufferMe = np.empty(self.ell*self.d, dtype=np.double) + bufferMe = np.empty(buffSizes[self.rank]*self.d, dtype=np.double) self.comm.Recv(bufferMe, source=proc, tag=13) - bufferMe = np.reshape(bufferMe, (self.ell, self.d)) + bufferMe = np.reshape(bufferMe, (buffSizes[self.rank], self.d)) for row in bufferMe: if(np.any(row)): if self.nextZeroRow >= self.m: @@ -407,35 +412,81 @@ def gatherFreqDirs(self): else: bufferMe = self.sketch[:self.ell, :].copy().flatten() self.comm.Send(bufferMe, dest=0, tag=13) - return - -# self.comm.Barrier() -# sendbuf = self.sketch[:self.ell,:] -# recvbuf = None -# if self.rank == 0: -# recvbuf = np.empty( -# [self.size, self.ell, self.d], dtype=np.float32) -# self.comm.Gather(sendbuf, recvbuf, root=0) -# print("{} FINISHED GATHER".format(self.rank)) -# if self.rank==0: -# origMatSketch = self.sketch.copy() -# origNextZeroRow = self.nextZeroRow -# 
self.nextZeroRow = self.ell -# print("BUFFER SHAPE: ", recvbuf.shape) -# for j in range(1, self.size): -# print("CURRENT BUFFER: ", j) -# print(recvbuf[j]) -# for row in recvbuf[j]: -# if(np.any(row)): -# if self.nextZeroRow >= self.m: -# self.john_rotate() -# self.sketch[self.nextZeroRow,:] = row -# self.nextZeroRow += 1 -# toReturn = self.sketch.copy() -# self.sketch = origMatSketch -# return toReturn -# else: -# return + return + + def get(self): + return self.sketch[:self.ell, :] + +class MergeTree: + + """Frequent Directions Merging Object.""" + + def __init__(self, divBy, readFile, dataSetName): + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + + self.divBy = divBy + + with h5py.File(readFile, 'r') as hf: + self.data = hf[dataSetName][:] + + print("AOIDJWOIJA", self.rank, self.data.shape) + + self.fd = FreqDir(0, 0, rankAdapt=False, exp='0', run='0', det_type='0', ell=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1]) + + self.fd.d = self.data.shape[1] + + sendbuf = self.data.shape[0] + self.buffSizes = np.array(self.comm.allgather(sendbuf)) + print(self.buffSizes) + + #JOHN: MUST CHECK THAT THIS ACTION ONLY FILLS UP THE SKETCH WITH THE CURRENT SKETCH FROM THE DATA + self.fd.john_update_model(self.data) + + + def merge(self): + + """ + Merge Frequent Direction Components in a tree-like fashion. + Returns + ------- + finalSketch : ndarray + Merged matrix sketch of cumulative data + + + """ + powerNum = 1 + while(powerNum < self.size): + powerNum = powerNum * self.divBy + if powerNum != size: + raise ValueError('NUMBER OF CORES WOULD LEAD TO INBALANCED MERGE TREE. ENDING PROGRAM.') + return + + level = 0 + while((self.divBy ** level) < self.size): + jump = self.divBy ** level + if(self.rank%jump ==0): + root = self.rank - (self.rank%(jump*self.divBy)) + grouping = [j for j in range(root, root + jump*self.divBy, jump)] + print(grouping) +# if self.rank==root: +# for proc in grouping[1:]: +# bufferMe = np.empty(self.data.shape[0] * self.data.shape[1], dtype=np.double) +# comm.Recv(bufferMe, source=proc, tag=17) +# bufferMe = np.reshape(bufferMe, (self.data.shape[0], self.data.shape[1])) +# self.fd.john_update_model(bufferMe.T) +# print(level, data) +# else: +# bufferMe = self.fd.get().copy().flatten() +# comm.Send(bufferMe, dest=root, tag=17) + level += 1 + if self.rank==0: + finalSketch = self.fd.get() + return finalSketch + else: + return + def parse_input(): """ From b8270aa7ac51582dc97463487b8bdd9f0f79bd95 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Fri, 21 Jul 2023 07:48:53 -0700 Subject: [PATCH 08/57] Parallel Rank Adaptive and Merge Tree appear to run. Can't say for sure the sketches are useful until I implement something substantial --- btx/processing/freqdir.py | 95 +++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 39 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 2720a0717..6ff071b34 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -56,6 +56,7 @@ def __init__( self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() + self.merger = merger if not merger: self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) @@ -75,7 +76,7 @@ def __init__( self.num_incorporated_images = 0 else: - #JOHN: NEED TO IMPROVE. CURRENTLY, NEED TO MANUALLY SET d, WHICH IS UNACCEPTABLE. + #JOHN: NEED TO IMPROVE. THIS IS WACK. 
self.num_features = mergerFeatures self.task_durations = dict({}) self.num_incorporated_images = 0 @@ -140,10 +141,9 @@ def run(self): on run subject to initialization parameters. """ - for batch in range(0,self.noImgsToProcess,self.ell): - self.fetch_and_update_model(self.ell) - - self.comm.Barrier() + for batch in range(0,self.noImgsToProcess,self.ell*10): +# print("aodijwaoij 1") + self.fetch_and_update_model(self.ell*10) def get_formatted_images(self, n): """ @@ -193,9 +193,11 @@ def fetch_and_update_model(self, n): number of images to incorporate """ +# print("aodijwaoij 2") img_batch = self.get_formatted_images(n) +# print("aodijwaoij 3") self.john_update_model(img_batch) @@ -209,39 +211,51 @@ def john_update_model(self, X): data to update matrix sketch with """ +# print("aodijwaoij 4") _, numIncorp = X.shape -# n = self.num_incorporated_images -# q = self.ell -# - with TaskTimer(self.task_durations, "total update"): + origNumIncorp = numIncorp + n = self.num_incorporated_images + q = self.ell -# if self.rank == 0: -# print( -# "Factoring {m} sample{s} into {n} sample, {q} component model...".format( -# m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q -# ) -# ) + with TaskTimer(self.task_durations, "total update"): +# print("aodijwaoij 5") + + if self.rank==0 and not self.merger: + print( + "Factoring {m} sample{s} into {n} sample, {q} component model...".format( + m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q + ) + ) +# print("aodijwaoij 5") for row in X.T: +# print(self.rank, " aodijwaoij 6") canRankAdapt = numIncorp > (self.ell + 15) +# print(self.rank,"CAN RANK ADAPT", canRankAdapt, numIncorp, self.ell+15) if self.nextZeroRow >= self.m: +# print(self.rank, " aodijwaoij 7") if self.increaseEll and canRankAdapt and self.rankAdapt: +# print(self.rank, " aodijwaoij 8") self.ell = self.ell + 10 self.m = 2*self.ell self.sketch = np.vstack((*self.sketch, np.zeros((20, self.d)))) self.increaseEll = False + print("INCREASING RANK OF PROCESS {} TO {}".format(self.rank, self.ell)) else: +# print(self.rank, " aodijwaoij 9") copyBatch = self.sketch[self.ell:,:].copy() self.john_rotate() +# print(self.rank, " aodijwaoij 9.25") if canRankAdapt and self.rankAdapt: - reconError = self.lowMemoryReconstructionErrorUnscaled(copyBatch) - if (np.sqrt(reconError) > 0.08): +# print(self.rank, " aodijwaoij 9.5") + reconError = np.sqrt(self.lowMemoryReconstructionErrorUnscaled(copyBatch)) +# print("ITERATION {} - RECON ERROR OF RANK {}: {}".format(origNumIncorp - numIncorp, self.rank, reconError)) + if (reconError > 0.08): self.increaseEll = True +# print(self.rank, " aodijwaoij 10") self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 self.num_incorporated_images += 1 numIncorp -= 1 -# if self.rank==0: -# print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') def john_rotate(self): """ @@ -270,7 +284,8 @@ def john_rotate(self): if len(s) >= self.ell: sCopy = s.copy() - + + #JOHN: I think actually this should be ell+1 and ell. We lose a component otherwise. 
toShrink = s[:self.ell]**2 - s[self.ell-1]**2 #John: Explicitly set this value to be 0, since sometimes it is negative # or even turns to NaN due to roundoff error @@ -325,13 +340,16 @@ def lowMemoryReconstructionErrorUnscaled(self, matrixCentered): Data to compare matrix sketch to """ +# print("{} COMPUTING ERROR".format(self.rank)) matSketch = self.sketch k = 10 matrixCenteredT = matrixCentered.T matSketchT = matSketch.T - U, S, Vt = np.linalg.svd(matSketchT) + U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) G = U[:,:k] - return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) +# print("{} FINISHED COMPUTING ERROR".format(self.rank)) + return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 10)/ + np.linalg.norm(matrixCenteredT, 'fro')**2) def estimFrobNormSquared(self, addMe, arrs, its): """ @@ -417,6 +435,11 @@ def gatherFreqDirs(self): def get(self): return self.sketch[:self.ell, :] + def write(self): + with h5py.File('h5writes/{}_{}.h5'.format(currRun, self.rank), 'w') as hf: + hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) + + class MergeTree: """Frequent Directions Merging Object.""" @@ -431,18 +454,14 @@ def __init__(self, divBy, readFile, dataSetName): with h5py.File(readFile, 'r') as hf: self.data = hf[dataSetName][:] - print("AOIDJWOIJA", self.rank, self.data.shape) - self.fd = FreqDir(0, 0, rankAdapt=False, exp='0', run='0', det_type='0', ell=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1]) - self.fd.d = self.data.shape[1] - sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) print(self.buffSizes) #JOHN: MUST CHECK THAT THIS ACTION ONLY FILLS UP THE SKETCH WITH THE CURRENT SKETCH FROM THE DATA - self.fd.john_update_model(self.data) + self.fd.john_update_model(self.data.T) def merge(self): @@ -459,7 +478,7 @@ def merge(self): powerNum = 1 while(powerNum < self.size): powerNum = powerNum * self.divBy - if powerNum != size: + if powerNum != self.size: raise ValueError('NUMBER OF CORES WOULD LEAD TO INBALANCED MERGE TREE. ENDING PROGRAM.') return @@ -469,17 +488,15 @@ def merge(self): if(self.rank%jump ==0): root = self.rank - (self.rank%(jump*self.divBy)) grouping = [j for j in range(root, root + jump*self.divBy, jump)] - print(grouping) -# if self.rank==root: -# for proc in grouping[1:]: -# bufferMe = np.empty(self.data.shape[0] * self.data.shape[1], dtype=np.double) -# comm.Recv(bufferMe, source=proc, tag=17) -# bufferMe = np.reshape(bufferMe, (self.data.shape[0], self.data.shape[1])) -# self.fd.john_update_model(bufferMe.T) -# print(level, data) -# else: -# bufferMe = self.fd.get().copy().flatten() -# comm.Send(bufferMe, dest=root, tag=17) + if self.rank==root: + for proc in grouping[1:]: + bufferMe = np.empty(self.buffSizes[proc] * self.data.shape[1], dtype=np.double) + self.comm.Recv(bufferMe, source=proc, tag=17) + bufferMe = np.reshape(bufferMe, (self.buffSizes[proc], self.data.shape[1])) + self.fd.john_update_model(bufferMe.T) + else: + bufferMe = self.fd.get().copy().flatten() + self.comm.Send(bufferMe, dest=root, tag=17) level += 1 if self.rank==0: finalSketch = self.fd.get() From d5b8abde6d1aae43bbcdbe44682b1ada9717e29c Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Fri, 21 Jul 2023 10:47:42 -0700 Subject: [PATCH 09/57] Fixed treemerge and parallel rank adaptive FD. Things seem to work. Need to verify run with application. 
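For reference, the communication pattern that MergeTree.merge() sets up (introduced in the previous patch and exercised here) can be pictured without running MPI: the number of ranks must be a power of divBy, and at each level the surviving group roots receive and absorb the sketches of the other members of their group. The standalone snippet below is illustrative only and needs no mpi4py; it just prints that schedule.

# Standalone illustration (no MPI required) of the merge-tree schedule used by
# MergeTree.merge(): `size` must be a power of `div_by`, and at each level the
# group roots absorb the sketches of the other group members.
def merge_schedule(size, div_by):
    power = 1
    while power < size:
        power *= div_by
    if power != size:
        raise ValueError("size must be a power of div_by for a balanced tree")
    level, jump = 0, 1
    while jump < size:
        for rank in range(0, size, jump):
            root = rank - rank % (jump * div_by)
            if rank == root:
                senders = list(range(root + jump, root + jump * div_by, jump))
                print(f"level {level}: rank {root} receives from {senders}")
        level += 1
        jump *= div_by
    print(f"rank 0 holds the fully merged sketch after {level} levels")

merge_schedule(size=8, div_by=2)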
--- btx/processing/freqdir.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 6ff071b34..0b42defa8 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -141,9 +141,9 @@ def run(self): on run subject to initialization parameters. """ - for batch in range(0,self.noImgsToProcess,self.ell*10): + for batch in range(0,self.noImgsToProcess,self.ell*6): # print("aodijwaoij 1") - self.fetch_and_update_model(self.ell*10) + self.fetch_and_update_model(self.ell*6) def get_formatted_images(self, n): """ @@ -458,7 +458,8 @@ def __init__(self, divBy, readFile, dataSetName): sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) - print(self.buffSizes) + if self.rank==0): + print(self.buffSizes) #JOHN: MUST CHECK THAT THIS ACTION ONLY FILLS UP THE SKETCH WITH THE CURRENT SKETCH FROM THE DATA self.fd.john_update_model(self.data.T) @@ -504,6 +505,9 @@ def merge(self): else: return + def write(self): + self.fd.write() + def parse_input(): """ From c204c3f4f5cee6725c6aafb5ced7c0fbad71975c Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 25 Jul 2023 12:54:54 -0700 Subject: [PATCH 10/57] Separated FreqDir to create Merge Tree and also projection module. Save intermediate results h5 files. --- btx/processing/freqdir.py | 285 +++++++++++++++++++++++++++++++++++++- 1 file changed, 279 insertions(+), 6 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 0b42defa8..b948f26e6 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -29,6 +29,11 @@ currRun = datetime.now().strftime("%y%m%d%H%M%S") import h5py + +from PIL import Image + +#writeDirec = "/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/" +writeDirec = "h5writes/" ############################################# class FreqDir: @@ -192,11 +197,8 @@ def fetch_and_update_model(self, n): n : int number of images to incorporate """ - # print("aodijwaoij 2") img_batch = self.get_formatted_images(n) - - # print("aodijwaoij 3") self.john_update_model(img_batch) @@ -436,8 +438,11 @@ def get(self): return self.sketch[:self.ell, :] def write(self): - with h5py.File('h5writes/{}_{}.h5'.format(currRun, self.rank), 'w') as hf: + filename = writeDirec + '{}_sketch_{}.h5'.format(currRun, self.rank) + with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) + self.comm.Barrier() + return filename class MergeTree: @@ -458,7 +463,7 @@ def __init__(self, divBy, readFile, dataSetName): sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) - if self.rank==0): + if self.rank==0: print(self.buffSizes) #JOHN: MUST CHECK THAT THIS ACTION ONLY FILLS UP THE SKETCH WITH THE CURRENT SKETCH FROM THE DATA @@ -506,8 +511,276 @@ def merge(self): return def write(self): - self.fd.write() + filename = writeDirec + '{}_merge.h5'.format(currRun) + if self.rank==0: + with h5py.File(filename, 'w') as hf: + hf.create_dataset("sketch", data=self.fd.sketch[:self.fd.ell, :]) + self.comm.Barrier() + return filename + +class ApplyCompression: + """Compute principal components of matrix sketch and apply to sketched data""" + + def __init__( + self, + john_start, + tot_imgs, + ell, + alpha, + exp, + run, + det_type, + rankAdapt, + readFile, dataSetName, + merger=False, + mergerFeatures=0, + downsample=False, + bin_factor=2, + output_dir="" + ): + + + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = 
self.comm.Get_size() + self.merger = merger + + if not merger: + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = john_start + tot_imgs*self.rank//self.size + + self.downsample = downsample + self.bin_factor = bin_factor + self.output_dir = output_dir + + ( + self.num_images, + _, + self.num_features, + ) = self.set_params(tot_imgs, ell, bin_factor) + + self.task_durations = dict({}) + + self.num_incorporated_images = 0 + else: + #JOHN: NEED TO IMPROVE. THIS IS WACK. + self.num_features = mergerFeatures + self.task_durations = dict({}) + self.num_incorporated_images = 0 + + self.d = self.num_features + self.ell = ell + self.m = 2*self.ell + self.sketch = zeros( (self.m, self.d) ) + self.nextZeroRow = 0 + self.alpha = alpha + + self.noImgsToProcess = tot_imgs//self.size + + self.rankAdapt = rankAdapt + self.increaseEll = False + + + with h5py.File(readFile, 'r') as hf: + self.data = hf[dataSetName][:] + + U, S, Vt = np.linalg.svd(self.data, full_matrices=False) + self.components = Vt + + self.processedData = None + self.smallImgs = None + + self.imageIndicesProcessed = [] + + + def set_params(self, num_images, num_components, bin_factor): + """ + Method to initialize FreqDir parameters. + + Parameters + ---------- + num_images : int + Desired number of images to incorporate into model. + num_components : int + Desired number of components for model to maintain. + bin_factor : int + Factor to bin data by. + + Returns + ------- + num_images : int + Number of images to incorporate into model. + num_components : int + Number of components for model to maintain. + num_features : int + Number of features (dimension) in each image. + """ + + max_events = self.psi.max_events + downsample = self.downsample + + num_images = min(num_images, max_events) if num_images != -1 else max_events + num_components = min(num_components, num_images) + + # set d + det_shape = self.psi.det.shape() + num_features = np.prod(det_shape).astype(int) + + if downsample: + if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: + print("Invalid bin factor, toggled off downsampling.") + self.downsample = False + else: + num_features = int(num_features / bin_factor**2) + + return num_images, num_components, num_features + + def run(self): + """ + Retrieve sketch, project images onto new coordinates. Save new coordinates to h5 file. + """ + for batch in range(0,self.noImgsToProcess,self.ell*6): + startCounter = self.psi.counter + self.fetch_and_update_model(self.ell*6) + self.imageIndicesProcessed.append((startCounter, self.psi.counter)) + + +# def get_formatted_images(self, n): +# """ +# Fetch n - x image segments from run, where x is the number of 'dead' images. +# +# Parameters +# ---------- +# n : int +# number of images to retrieve +# start_index : int +# start index of subsection of data to retrieve +# end_index : int +# end index of subsection of data to retrieve +# +# Returns +# ------- +# ndarray, shape (end_index-start_index, n-x) +# n-x retrieved image segments of dimension end_index-start_index +# """ +# +# bin_factor = self.bin_factor +# downsample = self.downsample +# +# # may have to rewrite eventually when number of images becomes large, +# # i.e. 
streamed setting, either that or downsample aggressively +# imgs = self.psi.get_images(n, assemble=False) +# print(imgs.shape) +# +# toSaveImgs = bin_data(imgs, bin_factor) +# if downsample: +# imgs = bin_data(imgs, bin_factor) +# +# toSaveImgs = toSaveImgs[ +# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] +# ] +# imgs = imgs[ +# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] +# ] +# +# num_valid_imgs, p, x, y = imgs.shape +# toSave_num_valid_imgs, toSave_p, toSave_x, toSave_y = toSaveImgs.shape +# +# formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T +# toSave_formatted_imgs = np.reshape(toSaveImgs, (toSave_num_valid_imgs, toSave_p * toSave_x * toSave_y)).T +# print(toSave_formatted_imgs.shape) +# +# return (formatted_imgs,toSave_formatted_imgs) + + def get_formatted_images(self, n): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. + + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + + bin_factor = self.bin_factor + downsample = self.downsample + + # may have to rewrite eventually when number of images becomes large, + # i.e. streamed setting, either that or downsample aggressively + imgs = self.psi.get_images(n, assemble=False) + + if downsample: + imgs = bin_data(imgs, bin_factor) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + num_valid_imgs, p, x, y = imgs.shape + formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + + return formatted_imgs + + def assembleImgsToSave(self, imgs): + pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) + + saveMe = [] + for img in imgs.T: + imgRe = np.reshape(img, self.psi.det.shape()) + imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) + #saveMe.append(np.array(Image.fromarray(imgRe, mode='L').resize((150, 150), Image.Resampling.BICUBIC))) + saveMe.append(np.array(Image.fromarray(imgRe).resize((150, 150)))) + saveMe = np.array(saveMe) + return saveMe + +# print("IMGS TO SAVE SHAPE: ", imgs.shape) +# saveMe = [] +# for img in imgs: +# saveMe.append(np.array(Image.fromarray(img, mode='L').resize((150, 150), Image.Resampling.BICUBIC))) +# saveMe = np.array(saveMe) +# print("RESIZED IMGS TO SAVE SHAPE: ", saveMe.shape) +# return saveMe + + + def fetch_and_update_model(self, n): + """ + Fetch images and update model. 
+ + Parameters + ---------- + n : int + number of images to incorporate + """ + img_batch = self.get_formatted_images(n) + toSave_img_batch = self.assembleImgsToSave(img_batch) + if self.smallImgs is None: + self.smallImgs = toSave_img_batch + else: + self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) + self.john_apply_compression(img_batch) + + def john_apply_compression(self, X): + if self.processedData is None: + self.processedData = np.dot(X.T, self.components.T) + else: + self.processedData = np.vstack((self.processedData, np.dot(X.T, self.components.T))) + + def write(self): + filename = writeDirec + '{}_ProjectedData_{}.h5'.format(currRun, self.rank) + with h5py.File(filename, 'w') as hf: + hf.create_dataset("ProjectedData", data=self.processedData) + hf.create_dataset("SmallImages", data=self.smallImgs) + self.comm.Barrier() + return filename def parse_input(): """ From 07d624e52b844210d9e13ba854e757399f057bf1 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 27 Jul 2023 18:31:48 -0700 Subject: [PATCH 11/57] Checkpoint --- btx/processing/freqdir.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index b948f26e6..d1e49f54f 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -32,8 +32,8 @@ from PIL import Image -#writeDirec = "/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/" -writeDirec = "h5writes/" +writeDirec = "/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/" +#writeDirec = "h5writes/" ############################################# class FreqDir: From 1f801ef9d5942c4643c1feed8b844eaeb1a91a59 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Fri, 28 Jul 2023 10:23:14 -0700 Subject: [PATCH 12/57] Cleaned up code --- btx/processing/freqdir.py | 207 ++++++++------------------------------ 1 file changed, 42 insertions(+), 165 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index d1e49f54f..77d7f3367 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -98,6 +98,8 @@ def __init__( self.rankAdapt = rankAdapt self.increaseEll = False + self.imgsTracked = [] + def set_params(self, num_images, num_components, bin_factor): """ Method to initialize FreqDir parameters. 
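# For reference, the projection that ApplyCompression.john_apply_compression
# (introduced above in PATCH 10) performs reduces to a single matrix product:
# the right singular vectors of the merged ell x d sketch serve as principal
# directions, and each flattened image becomes its ell coordinates in that
# basis.  Standalone NumPy sketch below; all names and sizes are illustrative.

import numpy as np

rng = np.random.default_rng(3)
d, ell, n_imgs = 400, 6, 25
sketch = rng.normal(size=(ell, d))            # stands in for the merged sketch
_, _, components = np.linalg.svd(sketch, full_matrices=False)   # ell x d

X = rng.normal(size=(d, n_imgs))              # images stored column-wise, as above
projected = X.T @ components.T                # n_imgs x ell coordinates
print(projected.shape)                        # (25, 6)

# Reconstruction back into pixel space, e.g. to inspect compression quality:
reconstructed = projected @ components        # n_imgs x d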
@@ -147,7 +149,6 @@ def run(self): """ for batch in range(0,self.noImgsToProcess,self.ell*6): -# print("aodijwaoij 1") self.fetch_and_update_model(self.ell*6) def get_formatted_images(self, n): @@ -168,6 +169,7 @@ def get_formatted_images(self, n): ndarray, shape (end_index-start_index, n-x) n-x retrieved image segments of dimension end_index-start_index """ + self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) bin_factor = self.bin_factor downsample = self.downsample @@ -197,9 +199,7 @@ def fetch_and_update_model(self, n): n : int number of images to incorporate """ -# print("aodijwaoij 2") img_batch = self.get_formatted_images(n) -# print("aodijwaoij 3") self.john_update_model(img_batch) @@ -212,48 +212,33 @@ def john_update_model(self, X): X: ndarray data to update matrix sketch with """ - -# print("aodijwaoij 4") _, numIncorp = X.shape origNumIncorp = numIncorp n = self.num_incorporated_images q = self.ell - with TaskTimer(self.task_durations, "total update"): -# print("aodijwaoij 5") - if self.rank==0 and not self.merger: print( "Factoring {m} sample{s} into {n} sample, {q} component model...".format( m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q ) ) -# print("aodijwaoij 5") for row in X.T: -# print(self.rank, " aodijwaoij 6") canRankAdapt = numIncorp > (self.ell + 15) -# print(self.rank,"CAN RANK ADAPT", canRankAdapt, numIncorp, self.ell+15) if self.nextZeroRow >= self.m: -# print(self.rank, " aodijwaoij 7") if self.increaseEll and canRankAdapt and self.rankAdapt: -# print(self.rank, " aodijwaoij 8") self.ell = self.ell + 10 self.m = 2*self.ell self.sketch = np.vstack((*self.sketch, np.zeros((20, self.d)))) self.increaseEll = False print("INCREASING RANK OF PROCESS {} TO {}".format(self.rank, self.ell)) else: -# print(self.rank, " aodijwaoij 9") copyBatch = self.sketch[self.ell:,:].copy() self.john_rotate() -# print(self.rank, " aodijwaoij 9.25") if canRankAdapt and self.rankAdapt: -# print(self.rank, " aodijwaoij 9.5") reconError = np.sqrt(self.lowMemoryReconstructionErrorUnscaled(copyBatch)) -# print("ITERATION {} - RECON ERROR OF RANK {}: {}".format(origNumIncorp - numIncorp, self.rank, reconError)) if (reconError > 0.08): self.increaseEll = True -# print(self.rank, " aodijwaoij 10") self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 self.num_incorporated_images += 1 @@ -278,15 +263,12 @@ def john_rotate(self): in Computer Science, vol 8737. Springer, Berlin, Heidelberg. https://doi.org/10.1007/978-3-662-44777-2_39 """ - try: [_,s,Vt] = svd(self.sketch , full_matrices=False) except LinAlgError as err: [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) - if len(s) >= self.ell: sCopy = s.copy() - #JOHN: I think actually this should be ell+1 and ell. We lose a component otherwise. toShrink = s[:self.ell]**2 - s[self.ell-1]**2 #John: Explicitly set this value to be 0, since sometimes it is negative @@ -314,7 +296,6 @@ def john_reconstructionError(self, matrixCentered): matrixCentered: ndarray Data to compare matrix sketch to """ - matSketch = self.sketch k = 10 matrixCenteredT = matrixCentered.T @@ -330,7 +311,7 @@ def john_reconstructionError(self, matrixCentered): matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) - def lowMemoryReconstructionErrorUnscaled(self, matrixCentered): + def lowMemoryReconstructionError(self, matrixCentered): """ Compute the low memory reconstruction error of the matrix sketch against given data. 
This si the same as john_reconstructionError, @@ -341,15 +322,12 @@ def lowMemoryReconstructionErrorUnscaled(self, matrixCentered): matrixCentered: ndarray Data to compare matrix sketch to """ - -# print("{} COMPUTING ERROR".format(self.rank)) matSketch = self.sketch k = 10 matrixCenteredT = matrixCentered.T matSketchT = matSketch.T U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) G = U[:,:k] -# print("{} FINISHED COMPUTING ERROR".format(self.rank)) return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 10)/ np.linalg.norm(matrixCenteredT, 'fro')**2) @@ -385,7 +363,6 @@ def estimFrobNormSquared(self, addMe, arrs, its): Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix Analysis and Applications 2021 42:1, 202-223 """ - no_rows = arrs[-1].shape[1] v = np.random.normal(size=no_rows) v_hat = v / np.linalg.norm(v) @@ -400,14 +377,13 @@ def estimFrobNormSquared(self, addMe, arrs, its): return sumMe/its*no_rows - def gatherFreqDirs(self): + def gatherFreqDirsSerial(self): """ Gather local matrix sketches to root node and merge local sketches together. """ sendbuf = self.ell buffSizes = np.array(self.comm.allgather(sendbuf)) - if self.rank==0: origMatSketch = self.sketch.copy() origNextZeroRow = self.nextZeroRow @@ -424,7 +400,6 @@ def gatherFreqDirs(self): self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 counter += 1 -# print("DATA PROCESSED: {}".format(counter)) toReturn = self.sketch.copy() print("COMPLETED MERGE PROCESS: ", toReturn) self.sketch = origMatSketch @@ -435,12 +410,19 @@ def gatherFreqDirs(self): return def get(self): + """ + Fetch matrix sketch + """ return self.sketch[:self.ell, :] def write(self): + """ + Write matrix sketch to h5 file. + """ filename = writeDirec + '{}_sketch_{}.h5'.format(currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) + hf.create_dataset("imgsTracked", data=self.imgsTracked) self.comm.Barrier() return filename @@ -511,6 +493,9 @@ def merge(self): return def write(self): + """ + Write merged matrix sketch to h5 file + """ filename = writeDirec + '{}_merge.h5'.format(currRun) if self.rank==0: with h5py.File(filename, 'w') as hf: @@ -519,7 +504,7 @@ def write(self): return filename class ApplyCompression: - """Compute principal components of matrix sketch and apply to sketched data""" + """Compute principal components of matrix sketch and apply to data""" def __init__( self, @@ -644,54 +629,6 @@ def run(self): self.fetch_and_update_model(self.ell*6) self.imageIndicesProcessed.append((startCounter, self.psi.counter)) - -# def get_formatted_images(self, n): -# """ -# Fetch n - x image segments from run, where x is the number of 'dead' images. -# -# Parameters -# ---------- -# n : int -# number of images to retrieve -# start_index : int -# start index of subsection of data to retrieve -# end_index : int -# end index of subsection of data to retrieve -# -# Returns -# ------- -# ndarray, shape (end_index-start_index, n-x) -# n-x retrieved image segments of dimension end_index-start_index -# """ -# -# bin_factor = self.bin_factor -# downsample = self.downsample -# -# # may have to rewrite eventually when number of images becomes large, -# # i.e. 
streamed setting, either that or downsample aggressively -# imgs = self.psi.get_images(n, assemble=False) -# print(imgs.shape) -# -# toSaveImgs = bin_data(imgs, bin_factor) -# if downsample: -# imgs = bin_data(imgs, bin_factor) -# -# toSaveImgs = toSaveImgs[ -# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] -# ] -# imgs = imgs[ -# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] -# ] -# -# num_valid_imgs, p, x, y = imgs.shape -# toSave_num_valid_imgs, toSave_p, toSave_x, toSave_y = toSaveImgs.shape -# -# formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T -# toSave_formatted_imgs = np.reshape(toSaveImgs, (toSave_num_valid_imgs, toSave_p * toSave_x * toSave_y)).T -# print(toSave_formatted_imgs.shape) -# -# return (formatted_imgs,toSave_formatted_imgs) - def get_formatted_images(self, n): """ Fetch n - x image segments from run, where x is the number of 'dead' images. @@ -731,6 +668,19 @@ def get_formatted_images(self, n): return formatted_imgs def assembleImgsToSave(self, imgs): + """ + Form the images from psana pixel index map and downsample images. + + Parameters + ---------- + imgs: ndarray + images to downsample + + Notes + ----- + There is no need to use a for loop here, since assemble_image_stack_batch + works on batches of images, and reshape can as well. + """ pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) saveMe = [] @@ -742,23 +692,14 @@ def assembleImgsToSave(self, imgs): saveMe = np.array(saveMe) return saveMe -# print("IMGS TO SAVE SHAPE: ", imgs.shape) -# saveMe = [] -# for img in imgs: -# saveMe.append(np.array(Image.fromarray(img, mode='L').resize((150, 150), Image.Resampling.BICUBIC))) -# saveMe = np.array(saveMe) -# print("RESIZED IMGS TO SAVE SHAPE: ", saveMe.shape) -# return saveMe - - def fetch_and_update_model(self, n): """ - Fetch images and update model. + Fetch and downsample data, apply projection algorithm Parameters ---------- n : int - number of images to incorporate + number of images to process """ img_batch = self.get_formatted_images(n) toSave_img_batch = self.assembleImgsToSave(img_batch) @@ -769,90 +710,26 @@ def fetch_and_update_model(self, n): self.john_apply_compression(img_batch) def john_apply_compression(self, X): + """ + Project data X onto matrix sketch space. + + Parameters + ---------- + X: ndarray + data to project + """ if self.processedData is None: self.processedData = np.dot(X.T, self.components.T) else: self.processedData = np.vstack((self.processedData, np.dot(X.T, self.components.T))) def write(self): + """ + Write projected data and downsampled data to h5 file + """ filename = writeDirec + '{}_ProjectedData_{}.h5'.format(currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("ProjectedData", data=self.processedData) hf.create_dataset("SmallImages", data=self.smallImgs) self.comm.Barrier() return filename - -def parse_input(): - """ - Parse command line input. 
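Note on the projection step: john_apply_compression above reduces each image to its coordinates in the sketch's row space; the components come from an SVD of the sketch and the projection is a single matrix product. A minimal standalone illustration of that step (the array names and sizes below are made up for the example, not taken from the patch):

    import numpy as np

    rng = np.random.default_rng(0)
    sketch = rng.normal(size=(25, 400))      # stand-in for the ell x d matrix sketch
    X = rng.normal(size=(400, 8))            # a batch of 8 flattened images, one per column

    # The right singular vectors of the sketch play the role of principal components.
    _, _, Vt = np.linalg.svd(sketch, full_matrices=False)
    projected = X.T @ Vt.T                   # shape (8, 25): one row of coordinates per image
    print(projected.shape)

The module applies the same product batch by batch and stacks the results into processedData before writing them to the h5 file.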
- """ - - parser = argparse.ArgumentParser() - parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) - parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) - parser.add_argument( - "-d", - "--det_type", - help="Detector name, e.g epix10k2M or jungfrau4M.", - required=True, - type=str, - ) - parser.add_argument( - "--start_offset", - help="Run index of first image to be incorporated into iPCA model.", - required=False, - type=int, - ) - parser.add_argument( - "--num_components", - help="Number of principal components to compute and maintain.", - required=False, - type=int, - ) - parser.add_argument( - "--batch_size", - help="Size of image batch incorporated in each model update.", - required=False, - type=int, - ) - parser.add_argument( - "--num_images", - help="Total number of images to be incorporated into model.", - required=False, - type=int, - ) - parser.add_argument( - "--output_dir", - help="Path to output directory for recording task duration data.", - required=False, - type=str, - ) - parser.add_argument( - "--priming", - help="Initialize model with PCA.", - required=False, - action="store_true", - ) - parser.add_argument( - "--downsample", - help="Enable downsampling of images.", - required=False, - action="store_true", - ) - parser.add_argument( - "--bin_factor", - help="Bin factor if using downsizing.", - required=False, - type=int, - ) - - return parser.parse_args() - - -if __name__ == "__main__": - - params = parse_input() - kwargs = {k: v for k, v in vars(params).items() if v is not None} - - pipca = PiPCA(**kwargs) - pipca.run() From 222ba0b437eedc4849ae08f67e8a483c760a8a9d Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 30 Jul 2023 15:21:19 -0700 Subject: [PATCH 13/57] Checkpoint. 
I don't think any significant changes have been made --- btx/processing/OLDfreqdir.py | 464 ----------------------------------- btx/processing/freqdir.py | 19 +- 2 files changed, 12 insertions(+), 471 deletions(-) delete mode 100644 btx/processing/OLDfreqdir.py diff --git a/btx/processing/OLDfreqdir.py b/btx/processing/OLDfreqdir.py deleted file mode 100644 index 430ea4d22..000000000 --- a/btx/processing/OLDfreqdir.py +++ /dev/null @@ -1,464 +0,0 @@ -import os, csv, argparse - -import numpy as np -from mpi4py import MPI - -from matplotlib import pyplot as plt -from matplotlib import colors - -from btx.misc.shortcuts import TaskTimer - -from btx.interfaces.ipsana import ( - PsanaInterface, - bin_data, - bin_pixel_index_map, - retrieve_pixel_index_map, - assemble_image_stack_batch, -) - -########################################### -#John Imports -from numpy import zeros, sqrt, dot, diag -from numpy.linalg import svd, LinAlgError -from scipy.linalg import svd as scipy_svd -import numpy as np - -import time - -from datetime import datetime -currRun = datetime.now().strftime("%y%m%d%H%M%S") - -############################################# - -class FreqDir: - - """Parallel Frequent Directions.""" - - def __init__( - self, - john_start, - tot_imgs, - ell, - alpha, - exp, - run, - det_type, - downsample=False, - bin_factor=2, - output_dir="", - ): - - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() - - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = john_start + tot_imgs*self.rank//self.size - - self.downsample = downsample - self.bin_factor = bin_factor - self.output_dir = output_dir - - ( - self.num_images, - _, - self.num_features, - ) = self.set_params(tot_imgs, ell, bin_factor) - - self.task_durations = dict({}) - - self.num_incorporated_images = 0 - - self.d = self.num_features - self.ell = ell - self.m = 2*self.ell - self.sketch = zeros( (self.m, self.d) ) - self.nextZeroRow = 0 - self.alpha = alpha - - self.noImgsToProcess = tot_imgs//self.size - - def set_params(self, num_images, num_components, bin_factor): - """ - Method to initialize FreqDir parameters. - - Parameters - ---------- - num_images : int - Desired number of images to incorporate into model. - num_components : int - Desired number of components for model to maintain. - bin_factor : int - Factor to bin data by. - - Returns - ------- - num_images : int - Number of images to incorporate into model. - num_components : int - Number of components for model to maintain. - num_features : int - Number of features (dimension) in each image. - """ - - max_events = self.psi.max_events - downsample = self.downsample - - num_images = min(num_images, max_events) if num_images != -1 else max_events - num_components = min(num_components, num_images) - - # set d - det_shape = self.psi.det.shape() - num_features = np.prod(det_shape).astype(int) - - if downsample: - if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: - print("Invalid bin factor, toggled off downsampling.") - self.downsample = False - else: - num_features = int(num_features / bin_factor**2) - - return num_images, num_components, num_features - - def run(self): - """ - Perform frequent directions matrix sketching - on run subject to initialization parameters. 
- """ - - for batch in range(0,self.noImgsToProcess,self.ell): - self.fetch_and_update_model(self.ell) - - self.comm.Barrier() - - def get_formatted_images(self, n): - """ - Fetch n - x image segments from run, where x is the number of 'dead' images. - - Parameters - ---------- - n : int - number of images to retrieve - start_index : int - start index of subsection of data to retrieve - end_index : int - end index of subsection of data to retrieve - - Returns - ------- - ndarray, shape (end_index-start_index, n-x) - n-x retrieved image segments of dimension end_index-start_index - """ - - bin_factor = self.bin_factor - downsample = self.downsample - - # may have to rewrite eventually when number of images becomes large, - # i.e. streamed setting, either that or downsample aggressively - imgs = self.psi.get_images(n, assemble=False) - - if downsample: - imgs = bin_data(imgs, bin_factor) - - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] - - num_valid_imgs, p, x, y = imgs.shape - formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - - return formatted_imgs - - def fetch_and_update_model(self, n): - """ - Fetch images and update model. - - Parameters - ---------- - n : int - number of images to incorporate - """ - - img_batch = self.get_formatted_images(n) - - - self.john_update_model(img_batch) - - - def john_update_model(self, X): - """ - Update matrix sketch with new batch of observations - - Parameters - ---------- - X: ndarray - data to update matrix sketch with - """ - - _, numIncorp = X.shape - n = self.num_incorporated_images - q = self.ell - - with TaskTimer(self.task_durations, "total update"): - - if self.rank == 0: - print( - "Factoring {m} sample{s} into {n} sample, {q} component model...".format( - m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q - ) - ) - for row in X.T: - if self.nextZeroRow >= self.m: - self.john_rotate() - self.sketch[self.nextZeroRow,:] = row - self.nextZeroRow += 1 - self.num_incorporated_images += 1 -# if self.rank==0: -# print(f'{self.lowMemoryReconstructionErrorUnscaled():.6f}') - - def john_rotate(self): - """ - Apply Frequent Directions Algorithm to - current matrix sketch and adjoined buffer - - Notes - ----- - Based on [1] and [2]. - - [1] Frequent Directions: Simple and Deterministic Matrix - Sketching Mina Ghashami, Edo Liberty, Jeff M. Phillips, and - David P. Woodruff SIAM Journal on Computing 2016 45:5, 1762-1792 - - [2] Ghashami, M., Desai, A., Phillips, J.M. (2014). Improved - Practical Matrix Sketching with Guarantees. In: Schulz, A.S., - Wagner, D. (eds) Algorithms - ESA 2014. ESA 2014. Lecture Notes - in Computer Science, vol 8737. Springer, Berlin, Heidelberg. 
- https://doi.org/10.1007/978-3-662-44777-2_39 - """ - - try: - [_,s,Vt] = svd(self.sketch , full_matrices=False) - except LinAlgError as err: - [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) - - if len(s) >= self.ell: - sCopy = s.copy() - - toShrink = s[:self.ell]**2 - s[self.ell-1]**2 - #John: Explicitly set this value to be 0, since sometimes it is negative - # or even turns to NaN due to roundoff error - toShrink[-1] = 0 - toShrink = sqrt(toShrink) - - toShrink[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] - - self.sketch[:self.ell:,:] = dot(diag(toShrink), Vt[:self.ell,:]) - self.sketch[self.ell:,:] = 0 - self.nextZeroRow = self.ell - else: - self.sketch[:len(s),:] = dot(diag(s), Vt[:len(s),:]) - self.sketch[len(s):,:] = 0 - self.nextZeroRow = len(s) - - def john_reconstructionError(self, matrixCentered): - """ - Compute the reconstruction error of the matrix sketch - against given data - - Parameters - ---------- - matrixCentered: ndarray - Data to compare matrix sketch to - """ - - matSketch = self.sketch - k = 10 - matrixCenteredT = matrixCentered.T - matSketchT = matSketch.T - U, S, Vt = np.linalg.svd(matSketchT) - G = U[:,:k] - UA, SA, VtA = np.linalg.svd(matrixCenteredT) - UAk = UA[:,:k] - SAk = np.diag(SA[:k]) - VtAk = VtA[:k] - Ak = UAk @ SAk @ VtAk - return (np.linalg.norm( - matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( - (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) - - def lowMemoryReconstructionErrorUnscaled(self, matrixCentered): - """ - Compute the low memory reconstruction error of the matrix sketch - against given data. This si the same as john_reconstructionError, - but estimates the norm computation and does not scale by the matrix. - - Parameters - ---------- - matrixCentered: ndarray - Data to compare matrix sketch to - """ - - matSketch = self.sketch - k = 10 - matrixCenteredT = matrixCentered.T - matSketchT = matSketch.T - U, S, Vt = np.linalg.svd(matSketchT) - G = U[:,:k] - return estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 100) - - def estimFrobNormSquared(self, addMe, arrs, its): - """ - Estimate the Frobenius Norm of product of arrs matrices - plus addME matrix using its iterations. - - Parameters - ---------- - arrs: list of ndarray - Matrices to multiply together - - addMe: ndarray - Matrix to add to others - - its: int - Number of iterations to average over - - Returns - ------- - sumMe/its*no_rows : float - Estimate of frobenius norm of produce - of arrs matrices plus addMe matrix - - Notes - ----- - Frobenius estimation is the expected value of matrix - multiplied by random vector from multivariate normal distribution - based on [1]. - - [1] Norm and Trace Estimation with Random Rank-one Vectors - Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix - Analysis and Applications 2021 42:1, 202-223 - """ - - no_rows = arrs[-1].shape[1] - v = np.random.normal(size=no_rows) - v_hat = v / np.linalg.norm(v) - sumMe = 0 - for j in range(its): - v = np.random.normal(size=no_rows) - v_hat = v / np.linalg.norm(v) - v_addMe = addMe @ v_hat - for arr in arrs[::-1]: - v_hat = arr @ v_hat - sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 - return sumMe/its*no_rows - - - def gatherFreqDirs(self): - print("STARTING GATHER") - """ - Gather local matrix sketches to root node and - merge local sketches together. 
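Note on the norm estimator: estimFrobNormSquared above (kept essentially unchanged in the new freqdir.py) estimates a Frobenius norm without forming the matrix explicitly, using the fact that for a random unit vector v the expectation of ||M v||^2 times the dimension equals ||M||_F^2. A small self-contained check of that estimator (matrix size and iteration count are arbitrary):

    import numpy as np

    def estimate_frob_norm_squared(matvec, dim, its, rng):
        # Average ||M v||^2 over random unit vectors v and rescale by dim;
        # the expectation of this quantity is ||M||_F^2.
        total = 0.0
        for _ in range(its):
            v = rng.normal(size=dim)
            v /= np.linalg.norm(v)
            total += np.linalg.norm(matvec(v)) ** 2
        return total / its * dim

    rng = np.random.default_rng(0)
    M = rng.normal(size=(50, 200))
    approx = estimate_frob_norm_squared(lambda v: M @ v, dim=200, its=1000, rng=rng)
    exact = np.linalg.norm(M, 'fro') ** 2
    print(approx, exact)   # the estimate converges to the exact value as its grows

In the module, the matrix being estimated is the residual A^T - G G^T A^T, supplied implicitly through the addMe argument and the list of factors in arrs.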
- """ - - self.comm.Barrier() - sendbuf = self.sketch[:self.ell,:] - recvbuf = None - if self.rank == 0: - recvbuf = np.empty( - [self.size, self.ell, self.d], dtype=np.double) - self.comm.Gather(sendbuf, recvbuf, root=0) - print("{} FINISHED GATHER".format(self.rank)) - if self.rank==0: - origMatSketch = self.sketch.copy() - origNextZeroRow = self.nextZeroRow - self.nextZeroRow = self.ell - print("BUFFER SHAPE: ", recvbuf.shape) - for j in range(1, self.size): - print("CURRENT BUFFER: ", j) - print(recvbuf[j]) - for row in recvbuf[j]: - if(np.any(row)): - if self.nextZeroRow >= self.m: - self.john_rotate() - self.sketch[self.nextZeroRow,:] = row - self.nextZeroRow += 1 - toReturn = self.sketch.copy() - self.sketch = origMatSketch - return toReturn - else: - return - -def parse_input(): - """ - Parse command line input. - """ - - parser = argparse.ArgumentParser() - parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) - parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) - parser.add_argument( - "-d", - "--det_type", - help="Detector name, e.g epix10k2M or jungfrau4M.", - required=True, - type=str, - ) - parser.add_argument( - "--start_offset", - help="Run index of first image to be incorporated into iPCA model.", - required=False, - type=int, - ) - parser.add_argument( - "--num_components", - help="Number of principal components to compute and maintain.", - required=False, - type=int, - ) - parser.add_argument( - "--batch_size", - help="Size of image batch incorporated in each model update.", - required=False, - type=int, - ) - parser.add_argument( - "--num_images", - help="Total number of images to be incorporated into model.", - required=False, - type=int, - ) - parser.add_argument( - "--output_dir", - help="Path to output directory for recording task duration data.", - required=False, - type=str, - ) - parser.add_argument( - "--priming", - help="Initialize model with PCA.", - required=False, - action="store_true", - ) - parser.add_argument( - "--downsample", - help="Enable downsampling of images.", - required=False, - action="store_true", - ) - parser.add_argument( - "--bin_factor", - help="Bin factor if using downsizing.", - required=False, - type=int, - ) - - return parser.parse_args() - - -if __name__ == "__main__": - - params = parse_input() - kwargs = {k: v for k, v in vars(params).items() if v is not None} - - pipca = PiPCA(**kwargs) - pipca.run() diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 77d7f3367..1683d73a6 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -32,7 +32,6 @@ from PIL import Image -writeDirec = "/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/" #writeDirec = "h5writes/" ############################################# @@ -50,13 +49,15 @@ def __init__( run, det_type, rankAdapt, + writeDirec, merger=False, mergerFeatures=0, downsample=False, bin_factor=2, - output_dir="", + output_dir="" ): + self.writeDirec = writeDirec self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() @@ -419,7 +420,7 @@ def write(self): """ Write matrix sketch to h5 file. 
""" - filename = writeDirec + '{}_sketch_{}.h5'.format(currRun, self.rank) + filename = self.writeDirec + '{}_sketch_{}.h5'.format(currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) hf.create_dataset("imgsTracked", data=self.imgsTracked) @@ -431,7 +432,7 @@ class MergeTree: """Frequent Directions Merging Object.""" - def __init__(self, divBy, readFile, dataSetName): + def __init__(self, divBy, readFile, dataSetName, writeDirec): self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() @@ -441,7 +442,7 @@ def __init__(self, divBy, readFile, dataSetName): with h5py.File(readFile, 'r') as hf: self.data = hf[dataSetName][:] - self.fd = FreqDir(0, 0, rankAdapt=False, exp='0', run='0', det_type='0', ell=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1]) + self.fd = FreqDir(0, 0, rankAdapt=False, exp='0', run='0', det_type='0', ell=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], writeDirec=writeDirec) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) @@ -451,6 +452,8 @@ def __init__(self, divBy, readFile, dataSetName): #JOHN: MUST CHECK THAT THIS ACTION ONLY FILLS UP THE SKETCH WITH THE CURRENT SKETCH FROM THE DATA self.fd.john_update_model(self.data.T) + self.writeDirec = writeDirec + def merge(self): @@ -496,7 +499,7 @@ def write(self): """ Write merged matrix sketch to h5 file """ - filename = writeDirec + '{}_merge.h5'.format(currRun) + filename = self.writeDirec + '{}_merge.h5'.format(currRun) if self.rank==0: with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.fd.sketch[:self.fd.ell, :]) @@ -517,6 +520,7 @@ def __init__( det_type, rankAdapt, readFile, dataSetName, + writeDirec, merger=False, mergerFeatures=0, downsample=False, @@ -524,6 +528,7 @@ def __init__( output_dir="" ): + self.writeDirec = writeDirec self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() @@ -727,7 +732,7 @@ def write(self): """ Write projected data and downsampled data to h5 file """ - filename = writeDirec + '{}_ProjectedData_{}.h5'.format(currRun, self.rank) + filename = self.writeDirec + '{}_ProjectedData_{}.h5'.format(currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("ProjectedData", data=self.processedData) hf.create_dataset("SmallImages", data=self.smallImgs) From fb68d4b76c944c4209d77330acdbf43d67c2d055 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 3 Aug 2023 04:16:24 -0700 Subject: [PATCH 14/57] Refactored code and addressed many of the code review comments. 
--- btx/processing/freqdir.py | 440 ++++++++++++++++++-------------------- 1 file changed, 209 insertions(+), 231 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 1683d73a6..4a4ccdebc 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1,6 +1,10 @@ import os, csv, argparse import numpy as np +from numpy import zeros, sqrt, dot, diag +from numpy.linalg import svd, LinAlgError +from scipy.linalg import svd as scipy_svd + from mpi4py import MPI from matplotlib import pyplot as plt @@ -16,92 +20,130 @@ assemble_image_stack_batch, ) -########################################### -#John Imports -from numpy import zeros, sqrt, dot, diag -from numpy.linalg import svd, LinAlgError -from scipy.linalg import svd as scipy_svd -import numpy as np - import time -from datetime import datetime -currRun = datetime.now().strftime("%y%m%d%H%M%S") - import h5py - from PIL import Image -#writeDirec = "h5writes/" -############################################# class FreqDir: - """Parallel Frequent Directions.""" + """ + Parallel Frequent Directions. + + Based on [1] and [2]. Frequent Directions is a matrix sketching algorithm used to + approximate large data sets. The basic goal of matrix sketching is to process an + n x d matrix A to somehow represent a matrix B so that ||A-B|| or covariance error + is small. Frequent Directions provably acheives a spectral bound on covariance + error and greatly outperforms comparable existing sketching techniques. It acheives + similar runtime and performance to incremental SVD as well. + + In this module we implement the frequent directions algorithm. This is the first of + three modules in this data processing pipeline, and it produces a sketch of a subset + of the data into an h5 file. The "Merge Tree" module will be responsible for merging + each of the sketches together, parallelizing the process, and the apply compression + algorithm will be responsible for using the full matrix sketch projecting the + original data to low dimensional space for data exploration. + + One novel feature of this implementation is the rank adaption feature: users have the + ability to select the approximate reconstruction error they want the sketch to operate + over, and the algorithm will adjust the rank of the sketch to meet this error bound + as data streams in. The module also gives users the ability to perform the sketching + process over thresholded and non-zero image data. + + [1] Frequent Directions: Simple and Deterministic Matrix + Sketching Mina Ghashami, Edo Liberty, Jeff M. Phillips, and + David P. Woodruff SIAM Journal on Computing 2016 45:5, 1762-1792 + + [2] Ghashami, M., Desai, A., Phillips, J.M. (2014). Improved + Practical Matrix Sketching with Guarantees. In: Schulz, A.S., + Wagner, D. (eds) Algorithms - ESA 2014. ESA 2014. Lecture Notes + in Computer Science, vol 8737. Springer, Berlin, Heidelberg. 
+ https://doi.org/10.1007/978-3-662-44777-2_39 + + Attributes: + start_offset: starting index of images to process + total_imgs: total number of images to process + ell: number of components of matrix sketch + alpha: proportion of components to not rotate in frequent directions algorithm + exp, run, det_type: experiment properties + rankAdapt: indicates whether to perform rank adaptive FD + increaseEll: internal variable indicating whether ell should be increased for rank adaption + dir: directory to write output + merger: indicates whether object will be used to merge other FD objects + mergerFeatures: used if merger is true and indicates number of features of local matrix sketches + downsample, bin: whether data should be downsampled and by how much + threshold: whether data should be thresholded (zero if less than threshold amount) + normalizeIntensity: whether data should be normalized to have total intensity of one + noZeroIntensity: whether data with low total intensity should be discarded + d: number of features (pixels) in data + m: internal frequent directions variable recording total number of components used in algorithm + sketch: numpy array housing current matrix sketch + mean: geometric mean of data processed + num_incorporated_images: number of images processed so far + imgsTracked: indices of images processed so far + currRun: Current datetime used to identify run + """ def __init__( self, - john_start, - tot_imgs, - ell, - alpha, + start_offset, + total_imgs, exp, run, det_type, - rankAdapt, - writeDirec, + dir, + currRun, + ell=0, + alpha=0, + rankAdapt=False, merger=False, mergerFeatures=0, downsample=False, bin_factor=2, - output_dir="" + threshold=False, + normalizeIntensity=False, + noZeroIntensity=False, ): - self.writeDirec = writeDirec - self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() - self.merger = merger + self.currRun = currRun + + self.merger = merger if not merger: self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = john_start + tot_imgs*self.rank//self.size - + self.psi.counter = start_offset + total_imgs*self.rank//self.size self.downsample = downsample self.bin_factor = bin_factor - self.output_dir = output_dir - ( self.num_images, - _, self.num_features, - ) = self.set_params(tot_imgs, ell, bin_factor) - - self.task_durations = dict({}) - - self.num_incorporated_images = 0 + ) = self.set_params(total_imgs, bin_factor) else: - #JOHN: NEED TO IMPROVE. THIS IS WACK. self.num_features = mergerFeatures - self.task_durations = dict({}) - self.num_incorporated_images = 0 + self.task_durations = dict({}) + self.num_incorporated_images = 0 + self.dir = dir self.d = self.num_features self.ell = ell self.m = 2*self.ell self.sketch = zeros( (self.m, self.d) ) self.nextZeroRow = 0 self.alpha = alpha - - self.noImgsToProcess = tot_imgs//self.size + self.mean = None + self.imgsTracked = [] self.rankAdapt = rankAdapt self.increaseEll = False + self.threshold = threshold + self.noZeroIntensity = noZeroIntensity + self.normalizeIntensity=normalizeIntensity - self.imgsTracked = [] - - def set_params(self, num_images, num_components, bin_factor): + def set_params(self, num_images, bin_factor): """ Method to initialize FreqDir parameters. @@ -109,8 +151,6 @@ def set_params(self, num_images, num_components, bin_factor): ---------- num_images : int Desired number of images to incorporate into model. - num_components : int - Desired number of components for model to maintain. 
bin_factor : int Factor to bin data by. @@ -118,8 +158,6 @@ def set_params(self, num_images, num_components, bin_factor): ------- num_images : int Number of images to incorporate into model. - num_components : int - Number of components for model to maintain. num_features : int Number of features (dimension) in each image. """ @@ -128,7 +166,6 @@ def set_params(self, num_images, num_components, bin_factor): downsample = self.downsample num_images = min(num_images, max_events) if num_images != -1 else max_events - num_components = min(num_components, num_images) # set d det_shape = self.psi.det.shape() @@ -141,7 +178,7 @@ def set_params(self, num_images, num_components, bin_factor): else: num_features = int(num_features / bin_factor**2) - return num_images, num_components, num_features + return num_images, num_features def run(self): """ @@ -149,8 +186,9 @@ def run(self): on run subject to initialization parameters. """ - for batch in range(0,self.noImgsToProcess,self.ell*6): - self.fetch_and_update_model(self.ell*6) + noImgsToProcess = self.num_images//self.size + for batch in range(0,noImgsToProcess,self.ell*3): + self.fetch_and_update_model(self.ell*3) def get_formatted_images(self, n): """ @@ -187,9 +225,24 @@ def get_formatted_images(self, n): ] num_valid_imgs, p, x, y = imgs.shape - formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - return formatted_imgs + img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + nimg_batch = [] + for img in img_batch: + if self.threshold: + secondQuartile = np.sort(img)[-1]//4 + nimg = (img>secondQuartile)*img + else: + nimg = img + currIntensity = np.sum(nimg.flatten()) + if self.noZeroIntensity and currIntensity<1000: + continue + else: + if currIntensity>10000 and self.normalizeIntensity: + nimg_batch.append(nimg/currIntensity) + else: + nimg_batch.append(nimg) + return np.array(nimg_batch) def fetch_and_update_model(self, n): """ @@ -201,10 +254,15 @@ def fetch_and_update_model(self, n): number of images to incorporate """ img_batch = self.get_formatted_images(n) - self.john_update_model(img_batch) + if self.mean is None: + self.mean = np.sum(img_batch, axis=0)/(img_batch.shape[0]) + else: + self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch, axis=0))/( + self.num_incorporated_images + (img_batch.shape[0])) + self.update_model(img_batch - self.mean) - def john_update_model(self, X): + def update_model(self, X): """ Update matrix sketch with new batch of observations @@ -215,16 +273,15 @@ def john_update_model(self, X): """ _, numIncorp = X.shape origNumIncorp = numIncorp - n = self.num_incorporated_images - q = self.ell with TaskTimer(self.task_durations, "total update"): if self.rank==0 and not self.merger: print( "Factoring {m} sample{s} into {n} sample, {q} component model...".format( - m=numIncorp, s="s" if numIncorp > 1 else "", n=n, q=q + m=numIncorp, s="s" if numIncorp > 1 else "", n=self.num_incorporated_images, q=self.ell ) ) for row in X.T: + canRankAdapt = numIncorp > (self.ell + 15) if self.nextZeroRow >= self.m: if self.increaseEll and canRankAdapt and self.rankAdapt: @@ -235,7 +292,7 @@ def john_update_model(self, X): print("INCREASING RANK OF PROCESS {} TO {}".format(self.rank, self.ell)) else: copyBatch = self.sketch[self.ell:,:].copy() - self.john_rotate() + self.rotate() if canRankAdapt and self.rankAdapt: reconError = np.sqrt(self.lowMemoryReconstructionErrorUnscaled(copyBatch)) if (reconError > 0.08): @@ -245,7 +302,7 @@ def john_update_model(self, X): self.num_incorporated_images += 1 
numIncorp -= 1 - def john_rotate(self): + def rotate(self): """ Apply Frequent Directions Algorithm to current matrix sketch and adjoined buffer @@ -287,7 +344,7 @@ def john_rotate(self): self.sketch[len(s):,:] = 0 self.nextZeroRow = len(s) - def john_reconstructionError(self, matrixCentered): + def reconstructionError(self, matrixCentered): """ Compute the reconstruction error of the matrix sketch against given data @@ -296,6 +353,11 @@ def john_reconstructionError(self, matrixCentered): ---------- matrixCentered: ndarray Data to compare matrix sketch to + + Returns + ------- + float, + Data subtracted by data projected onto sketched space, scaled by minimum theoretical sketch """ matSketch = self.sketch k = 10 @@ -315,13 +377,18 @@ def john_reconstructionError(self, matrixCentered): def lowMemoryReconstructionError(self, matrixCentered): """ Compute the low memory reconstruction error of the matrix sketch - against given data. This si the same as john_reconstructionError, + against given data. This si the same as reconstructionError, but estimates the norm computation and does not scale by the matrix. Parameters ---------- matrixCentered: ndarray Data to compare matrix sketch to + + Returns + ------- + float, + Data subtracted by data projected onto sketched space, scaled by matrix elements """ matSketch = self.sketch k = 10 @@ -351,7 +418,7 @@ def estimFrobNormSquared(self, addMe, arrs, its): Returns ------- sumMe/its*no_rows : float - Estimate of frobenius norm of produce + Estimate of frobenius norm of product of arrs matrices plus addMe matrix Notes @@ -397,12 +464,11 @@ def gatherFreqDirsSerial(self): for row in bufferMe: if(np.any(row)): if self.nextZeroRow >= self.m: - self.john_rotate() + self.rotate() self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 counter += 1 toReturn = self.sketch.copy() - print("COMPLETED MERGE PROCESS: ", toReturn) self.sketch = origMatSketch return toReturn else: @@ -420,10 +486,12 @@ def write(self): """ Write matrix sketch to h5 file. 
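Note on the core loop: putting the pieces above together, the heart of the sketcher is the buffer-then-rotate cycle of update_model and rotate: rows are appended into the lower half of the 2*ell-row array, and whenever it fills up an SVD orthogonalizes the rows and shrinks their norms. A stripped-down, single-process version of that loop (no psana, MPI, alpha parameter, or rank adaption; ell and the toy data are illustrative):

    import numpy as np

    def frequent_directions(A, ell):
        # Maintain an ell-row sketch B whose Gram matrix B.T @ B approximates A.T @ A.
        n, d = A.shape
        B = np.zeros((2 * ell, d))
        next_zero = 0
        for row in A:
            if next_zero >= 2 * ell:
                _, s, Vt = np.linalg.svd(B, full_matrices=False)
                # Shrink every direction by the ell-th squared singular value,
                # clamping at zero to guard against round-off, as rotate() does.
                shrunk = np.sqrt(np.maximum(s[:ell] ** 2 - s[ell - 1] ** 2, 0.0))
                B[:ell] = np.diag(shrunk) @ Vt[:ell]
                B[ell:] = 0.0
                next_zero = ell
            B[next_zero] = row
            next_zero += 1
        return B[:ell]

    A = np.random.default_rng(0).normal(size=(1000, 50))
    B = frequent_directions(A, ell=10)       # 10 x 50 sketch of a 1000 x 50 stream

The shrinkage index matches the module's rotate(), which subtracts s[ell-1]**2 from the leading squared singular values.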
""" - filename = self.writeDirec + '{}_sketch_{}.h5'.format(currRun, self.rank) + filename = self.dir + '{}_sketch_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) - hf.create_dataset("imgsTracked", data=self.imgsTracked) + hf.create_dataset("mean", data=self.mean) + hf.create_dataset("imgsTracked", data=np.array(self.imgsTracked)) + hf["sketch"].attrs["numImgsIncorp"] = self.num_incorporated_images self.comm.Barrier() return filename @@ -432,7 +500,7 @@ class MergeTree: """Frequent Directions Merging Object.""" - def __init__(self, divBy, readFile, dataSetName, writeDirec): + def __init__(self, divBy, readFile, dir, allWriteDirecs, currRun): self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() @@ -440,32 +508,37 @@ def __init__(self, divBy, readFile, dataSetName, writeDirec): self.divBy = divBy with h5py.File(readFile, 'r') as hf: - self.data = hf[dataSetName][:] + self.data = hf["sketch"][:] - self.fd = FreqDir(0, 0, rankAdapt=False, exp='0', run='0', det_type='0', ell=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], writeDirec=writeDirec) + self.fd = FreqDir(0, 0, currRun = currRun, rankAdapt=False, exp='0', run='0', det_type='0', ell=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], dir=dir) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) if self.rank==0: print(self.buffSizes) - #JOHN: MUST CHECK THAT THIS ACTION ONLY FILLS UP THE SKETCH WITH THE CURRENT SKETCH FROM THE DATA - self.fd.john_update_model(self.data.T) + self.fd.update_model(self.data.T) - self.writeDirec = writeDirec + self.dir = dir + self.allWriteDirecs = allWriteDirecs - def merge(self): + self.fullMean = None + self.fullNumIncorp = 0 + self.fullImgsTracked = [] + + self.currRun = currRun + + def merge(self): """ Merge Frequent Direction Components in a tree-like fashion. 
Returns ------- finalSketch : ndarray Merged matrix sketch of cumulative data - - """ + powerNum = 1 while(powerNum < self.size): powerNum = powerNum * self.divBy @@ -484,14 +557,25 @@ def merge(self): bufferMe = np.empty(self.buffSizes[proc] * self.data.shape[1], dtype=np.double) self.comm.Recv(bufferMe, source=proc, tag=17) bufferMe = np.reshape(bufferMe, (self.buffSizes[proc], self.data.shape[1])) - self.fd.john_update_model(bufferMe.T) + self.fd.update_model(bufferMe.T) else: bufferMe = self.fd.get().copy().flatten() self.comm.Send(bufferMe, dest=root, tag=17) level += 1 if self.rank==0: - finalSketch = self.fd.get() - return finalSketch + fullLen = len(self.allWriteDirecs) + for readMe in self.allWriteDirecs: + with h5py.File(readMe, 'r') as hf: + if self.fullMean is None: + self.fullMean = hf["mean"][:] + self.fullNumIncorp = hf["sketch"].attrs["numImgsIncorp"] + self.fullImgsTracked = hf["imgsTracked"][:] + else: + self.fullMean = (self.fullMean*self.fullNumIncorp + hf["mean"][:])/(self.fullNumIncorp + + hf["sketch"].attrs["numImgsIncorp"]) + self.fullNumIncorp += hf["sketch"].attrs["numImgsIncorp"] + self.fullImgsTracked = np.vstack((self.fullImgsTracked, hf["imgsTracked"][:])) + return self.fd.get() else: return @@ -499,10 +583,13 @@ def write(self): """ Write merged matrix sketch to h5 file """ - filename = self.writeDirec + '{}_merge.h5'.format(currRun) + filename = self.dir + '{}_merge.h5'.format(self.currRun) if self.rank==0: with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.fd.sketch[:self.fd.ell, :]) + hf.create_dataset("mean", data=self.fullMean) + hf["sketch"].attrs["numImgsIncorp"] = self.fullNumIncorp + hf.create_dataset("imgsTracked", data=self.fullImgsTracked) self.comm.Barrier() return filename @@ -511,68 +598,46 @@ class ApplyCompression: def __init__( self, - john_start, - tot_imgs, - ell, - alpha, + start_offset, + total_imgs, exp, run, det_type, - rankAdapt, - readFile, dataSetName, - writeDirec, - merger=False, - mergerFeatures=0, + readFile, + dir, + batchSize, + threshold, + noZeroIntensity, + normalizeIntensity, + currRun, downsample=False, - bin_factor=2, - output_dir="" + bin_factor=2 ): - self.writeDirec = writeDirec + self.dir = dir self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() - self.merger = merger - - if not merger: - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = john_start + tot_imgs*self.rank//self.size - - self.downsample = downsample - self.bin_factor = bin_factor - self.output_dir = output_dir - - ( - self.num_images, - _, - self.num_features, - ) = self.set_params(tot_imgs, ell, bin_factor) - - self.task_durations = dict({}) - - self.num_incorporated_images = 0 - else: - #JOHN: NEED TO IMPROVE. THIS IS WACK. 
- self.num_features = mergerFeatures - self.task_durations = dict({}) - self.num_incorporated_images = 0 - self.d = self.num_features - self.ell = ell - self.m = 2*self.ell - self.sketch = zeros( (self.m, self.d) ) - self.nextZeroRow = 0 - self.alpha = alpha + self.total_imgs = total_imgs - self.noImgsToProcess = tot_imgs//self.size + self.currRun = currRun - self.rankAdapt = rankAdapt - self.increaseEll = False + self.imgGrabber = FreqDir(start_offset=start_offset,total_imgs=total_imgs, currRun = currRun, + exp=exp,run=run,det_type=det_type,dir="", downsample=downsample, bin_factor=bin_factor, + threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity) + self.batchSize = batchSize + ( + self.num_images, + self.num_features + ) = self.imgGrabber.set_params(total_imgs, bin_factor) + self.num_incorporated_images = 0 with h5py.File(readFile, 'r') as hf: - self.data = hf[dataSetName][:] + self.data = hf["sketch"][:] + self.mean = hf["mean"][:] U, S, Vt = np.linalg.svd(self.data, full_matrices=False) self.components = Vt @@ -583,94 +648,30 @@ def __init__( self.imageIndicesProcessed = [] - def set_params(self, num_images, num_components, bin_factor): - """ - Method to initialize FreqDir parameters. - - Parameters - ---------- - num_images : int - Desired number of images to incorporate into model. - num_components : int - Desired number of components for model to maintain. - bin_factor : int - Factor to bin data by. - - Returns - ------- - num_images : int - Number of images to incorporate into model. - num_components : int - Number of components for model to maintain. - num_features : int - Number of features (dimension) in each image. - """ - - max_events = self.psi.max_events - downsample = self.downsample - - num_images = min(num_images, max_events) if num_images != -1 else max_events - num_components = min(num_components, num_images) - - # set d - det_shape = self.psi.det.shape() - num_features = np.prod(det_shape).astype(int) - - if downsample: - if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: - print("Invalid bin factor, toggled off downsampling.") - self.downsample = False - else: - num_features = int(num_features / bin_factor**2) - - return num_images, num_components, num_features - def run(self): """ Retrieve sketch, project images onto new coordinates. Save new coordinates to h5 file. """ - for batch in range(0,self.noImgsToProcess,self.ell*6): - startCounter = self.psi.counter - self.fetch_and_update_model(self.ell*6) - self.imageIndicesProcessed.append((startCounter, self.psi.counter)) - - def get_formatted_images(self, n): - """ - Fetch n - x image segments from run, where x is the number of 'dead' images. 
+ noImgsToProcess = self.num_images//self.size + for batch in range(0,noImgsToProcess,self.batchSize): + self.fetch_and_process_data() - Parameters - ---------- - n : int - number of images to retrieve - start_index : int - start index of subsection of data to retrieve - end_index : int - end index of subsection of data to retrieve - Returns - ------- - ndarray, shape (end_index-start_index, n-x) - n-x retrieved image segments of dimension end_index-start_index + def fetch_and_process_data(self): """ + Fetch and downsample data, apply projection algorithm + """ + startCounter = self.imgGrabber.psi.counter + img_batch = self.imgGrabber.get_formatted_images(self.batchSize) + self.imageIndicesProcessed.append((startCounter, self.imgGrabber.psi.counter)) - bin_factor = self.bin_factor - downsample = self.downsample - - # may have to rewrite eventually when number of images becomes large, - # i.e. streamed setting, either that or downsample aggressively - imgs = self.psi.get_images(n, assemble=False) - - if downsample: - imgs = bin_data(imgs, bin_factor) - - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] - - num_valid_imgs, p, x, y = imgs.shape - formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + toSave_img_batch = self.assembleImgsToSave(img_batch) - return formatted_imgs + if self.smallImgs is None: + self.smallImgs = toSave_img_batch + else: + self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) + self.apply_compression(img_batch - self.mean) def assembleImgsToSave(self, imgs): """ @@ -680,41 +681,18 @@ def assembleImgsToSave(self, imgs): ---------- imgs: ndarray images to downsample - - Notes - ----- - There is no need to use a for loop here, since assemble_image_stack_batch - works on batches of images, and reshape can as well. """ - pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) + pixel_index_map = retrieve_pixel_index_map(self.imgGrabber.psi.det.geometry(self.imgGrabber.psi.run)) saveMe = [] for img in imgs.T: - imgRe = np.reshape(img, self.psi.det.shape()) + imgRe = np.reshape(img, self.imgGrabber.psi.det.shape()) imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) - #saveMe.append(np.array(Image.fromarray(imgRe, mode='L').resize((150, 150), Image.Resampling.BICUBIC))) saveMe.append(np.array(Image.fromarray(imgRe).resize((150, 150)))) saveMe = np.array(saveMe) return saveMe - def fetch_and_update_model(self, n): - """ - Fetch and downsample data, apply projection algorithm - - Parameters - ---------- - n : int - number of images to process - """ - img_batch = self.get_formatted_images(n) - toSave_img_batch = self.assembleImgsToSave(img_batch) - if self.smallImgs is None: - self.smallImgs = toSave_img_batch - else: - self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) - self.john_apply_compression(img_batch) - - def john_apply_compression(self, X): + def apply_compression(self, X): """ Project data X onto matrix sketch space. 
@@ -732,7 +710,7 @@ def write(self): """ Write projected data and downsampled data to h5 file """ - filename = self.writeDirec + '{}_ProjectedData_{}.h5'.format(currRun, self.rank) + filename = self.dir + '{}_ProjectedData_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("ProjectedData", data=self.processedData) hf.create_dataset("SmallImages", data=self.smallImgs) From c0f4a4290d506a9556d486acb6dce6081b8528d2 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 3 Aug 2023 14:23:23 -0700 Subject: [PATCH 15/57] Addressed more pull comments, fixed mean bug, and added more documentation. --- btx/processing/freqdir.py | 45 ++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 4a4ccdebc..81e0f208c 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -255,16 +255,36 @@ def fetch_and_update_model(self, n): """ img_batch = self.get_formatted_images(n) if self.mean is None: - self.mean = np.sum(img_batch, axis=0)/(img_batch.shape[0]) + self.mean = np.sum(img_batch.T, axis=0)/(img_batch.shape[1]) else: - self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch, axis=0))/( - self.num_incorporated_images + (img_batch.shape[0])) - self.update_model(img_batch - self.mean) + self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch.T, axis=0))/( + self.num_incorporated_images + (img_batch.shape[1])) + print("DATA IS NOW SHAPE: ", img_batch.shape) + print("SKETCH IS SHAPE: ", self.sketch.shape) + print("MEAN IS SHAPE: ", self.mean.shape) + self.update_model((img_batch.T - self.mean).T) def update_model(self, X): """ - Update matrix sketch with new batch of observations + Update matrix sketch with new batch of observations. + + The matrix sketch array is of size 2*ell. The first ell rows maintained + represent the current matrix sketch. The next ell rows form a buffer. + Each row of the data is added to the buffer until ell rows have been + accumulated. Then, we apply the rotate function to the buffer, which + incorporates the buffer data into the matrix sketch. + + Following the rotation step, it is checked if rank adaption is enabled. Then, + is checked if there is enough data to perform one full rotation/shrinkage + step. Without this check, one runs the risk of having zero rows in the + sketch, which is innaccurate in representing the data one has seen. + If one can increase the rank, the increaseEll flag is raised, and once sufficient + data has been accumulated in the buffer, the sketch and buffer size is increased. + This happens when we check if increaseEll, canRankAdapt, and rankAdapt are all true, + whereby we check if we should be increasing the rank due to high error, we + have sufficient incoming data to do so (to avoid zero rows in the matrix sketch), + and the user would like for the rank to be adaptive, respectively. Parameters ---------- @@ -304,8 +324,14 @@ def update_model(self, X): def rotate(self): """ - Apply Frequent Directions Algorithm to - current matrix sketch and adjoined buffer + Apply Frequent Directions rotation/shrinkage step to current matrix sketch and adjoined buffer. + + The Frequent Directions algorithm is inspired by the well known Misra Gries Frequent Items + algorithm. The Frequent Items problem is informally as follows: given a sequence of items, find the items which occur most frequently. 
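For concreteness, a toy version of the frequent-items pass mentioned here, ahead of the docstring's own description below (the stream contents and the value of k are made up for the example):

    def misra_gries(stream, k):
        # Keep at most k counters; when a new item arrives and no slot is free,
        # decrement all counters and drop any that reach zero.
        counters = {}
        for item in stream:
            if item in counters:
                counters[item] += 1
            elif len(counters) < k:
                counters[item] = 1
            else:
                for key in list(counters):
                    counters[key] -= 1
                    if counters[key] == 0:
                        del counters[key]
        return counters

    print(misra_gries("abacabadacab", k=2))   # only 'a' survives; rare items are pruned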
The Misra Gries Frequent Items algorithm maintains a dictionary of <= k items and counts. For each item in a sequence, if the item is in the dictionary, increase its count. if the item is not in the dictionary and the size of the dictionary is <= k, then add the item with a count of 1 to the dictionary. Otherwise, decrease all counts in the dictionary by 1 and remove any items with 0 count. Every item which occurs more than n/k times is guaranteed to appear in the output array. + + The Frequent Directions Algorithm works in an analogous way for vectors: in the same way that Frequent Items periodically deletes ell different elements, Frequent Directions periodically "shrinks? ell orthogonal vectors by roughly the same amount. To do so, at each step: 1) Data is appended to the matrix sketch (whereby the last ell rows form a buffer and are zeroed at the start of the algorithm and after each rotation). 2) Matrix Sketch is rotated from left via SVD so that its rows are orthogonal and in descending magnitude order. 3) Norm of sketch rows are shrunk so that the smallest direction is set to 0. + + This function performs the rotation and shrinkage step by performing SVD and left multiplying by the unitary U matrix, followed by a subtraction. This particular implementation follows the alpha FD algorithm, which only performs the shrinkage step on the first alpha rows of the sketch, which has been shown to perform better than vanilla FD in [2]. Notes ----- @@ -638,6 +664,9 @@ def __init__( with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] self.mean = hf["mean"][:] + + print("NEW DATA IS SHAPE: ", self.data.shape) + print("NEW MEAN IS SHAPE: ", self.mean.shape) U, S, Vt = np.linalg.svd(self.data, full_matrices=False) self.components = Vt @@ -671,7 +700,7 @@ def fetch_and_process_data(self): self.smallImgs = toSave_img_batch else: self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) - self.apply_compression(img_batch - self.mean) + self.apply_compression((img_batch.T - self.mean).T) def assembleImgsToSave(self, imgs): """ From a05ca8b8095d7c74f17006bb6801a3f71230f954 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 3 Aug 2023 20:29:30 -0700 Subject: [PATCH 16/57] Added additional documentation for MergeTree and ApplyCompression modules --- btx/processing/freqdir.py | 77 +++++++++++++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 11 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 81e0f208c..da970a751 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -61,7 +61,8 @@ class FreqDir: in Computer Science, vol 8737. Springer, Berlin, Heidelberg. 
https://doi.org/10.1007/978-3-662-44777-2_39 - Attributes: + Attributes + ---------- start_offset: starting index of images to process total_imgs: total number of images to process ell: number of components of matrix sketch @@ -72,7 +73,7 @@ class FreqDir: dir: directory to write output merger: indicates whether object will be used to merge other FD objects mergerFeatures: used if merger is true and indicates number of features of local matrix sketches - downsample, bin: whether data should be downsampled and by how much + downsample, bin_factor: whether data should be downsampled and by how much threshold: whether data should be thresholded (zero if less than threshold amount) normalizeIntensity: whether data should be normalized to have total intensity of one noZeroIntensity: whether data with low total intensity should be discarded @@ -259,9 +260,6 @@ def fetch_and_update_model(self, n): else: self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch.T, axis=0))/( self.num_incorporated_images + (img_batch.shape[1])) - print("DATA IS NOW SHAPE: ", img_batch.shape) - print("SKETCH IS SHAPE: ", self.sketch.shape) - print("MEAN IS SHAPE: ", self.mean.shape) self.update_model((img_batch.T - self.mean).T) @@ -474,7 +472,12 @@ def estimFrobNormSquared(self, addMe, arrs, its): def gatherFreqDirsSerial(self): """ Gather local matrix sketches to root node and - merge local sketches together. + merge local sketches together in a serial fashion. + + Returns + ------- + toReturn : ndarray + Sketch of all data processed by all cores """ sendbuf = self.ell buffSizes = np.array(self.comm.allgather(sendbuf)) @@ -505,12 +508,22 @@ def gatherFreqDirsSerial(self): def get(self): """ Fetch matrix sketch + + Returns + ------- + self.sketch[:self.ell,:] : ndarray + Sketch of data locally processed """ return self.sketch[:self.ell, :] def write(self): """ Write matrix sketch to h5 file. + + Returns + ------- + filename : string + Name of h5 file where sketch, mean of data, and indices of data processed is written """ filename = self.dir + '{}_sketch_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: @@ -524,7 +537,28 @@ def write(self): class MergeTree: - """Frequent Directions Merging Object.""" + """ + Class used to efficiently merge Frequent Directions Matrix Sketches + + The Frequent Directions matrix sketch has the special property that it is a mergeable + summary. This means it can be merged easily and retain the same theoretical guarantees + by stacking two sketches ontop of one another and applying the algorithm again. + + We can perform this merging process in a tree-like fashion in order to merge any + number of sketches in log number of applications of the frequent directions algorithm. + + The class is designed to take in local sketches of data from h5 files produced by + the FreqDir class (where local refers to the fact that a subset of the total number + of images has been processed by the algorithm in a single core and saved to its own h5 file). 
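The mergeable-summary property described above boils down to stacking two sketches and re-running the shrinkage step. A compact illustration of a single pairwise merge (array names and sizes are invented; the real class routes the stacking through update_model and MPI instead):

    import numpy as np

    def merge_sketches(B1, B2, ell):
        # Stack two ell x d sketches and shrink back down to ell rows.
        stacked = np.vstack((B1, B2))
        _, s, Vt = np.linalg.svd(stacked, full_matrices=False)
        shrunk = np.sqrt(np.maximum(s[:ell] ** 2 - s[ell - 1] ** 2, 0.0))
        return np.diag(shrunk) @ Vt[:ell]

    rng = np.random.default_rng(1)
    B1 = rng.normal(size=(10, 100))           # local sketch from one rank
    B2 = rng.normal(size=(10, 100))           # local sketch from another rank
    merged = merge_sketches(B1, B2, ell=10)   # still 10 x 100, usable for further merges

Because the result has the same shape and guarantees as its inputs, the same operation can be applied level by level up the tree, which is what lets the merge finish in a logarithmic number of rounds.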
+ + Attributes + ---------- + divBy: Factor to merge by at each step: number of sketches must be a power of divBy + readFile: File name of local sketch for this particular core to process + dir: directory to write output + allWriteDirecs: all file names of local sketches + currRun: Current datetime used to identify run + """ def __init__(self, divBy, readFile, dir, allWriteDirecs, currRun): self.comm = MPI.COMM_WORLD @@ -620,7 +654,31 @@ def write(self): return filename class ApplyCompression: - """Compute principal components of matrix sketch and apply to data""" + """ + Compute principal components of matrix sketch and apply to data + + Attributes + ---------- + start_offset: starting index of images to process + total_imgs: total number of images to process + exp, run, det_type: experiment properties + dir: directory to write output + downsample, bin_factor: whether data should be downsampled and by how much + threshold: whether data should be thresholded (zero if less than threshold amount) + normalizeIntensity: whether data should be normalized to have total intensity of one + noZeroIntensity: whether data with low total intensity should be discarded + readFile: H5 file with matrix sketch + batchSize: Number of images to process at each iteration + data: numpy array housing current matrix sketch + mean: geometric mean of data processed + num_incorporated_images: number of images processed so far + imgageIndicesProcessed: indices of images processed so far + currRun: Current datetime used to identify run + imgGrabber: FD object used solely to retrieve data from psana + components: Principal Components of matrix sketch + processedData: Data projected onto matrix sketch range + smallImages: Downsampled images for visualization purposes + """ def __init__( self, @@ -665,9 +723,6 @@ def __init__( self.data = hf["sketch"][:] self.mean = hf["mean"][:] - print("NEW DATA IS SHAPE: ", self.data.shape) - print("NEW MEAN IS SHAPE: ", self.mean.shape) - U, S, Vt = np.linalg.svd(self.data, full_matrices=False) self.components = Vt From 832226d7645dcb03b0114686205bee06c2a9c508 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 8 Aug 2023 03:14:06 -0700 Subject: [PATCH 17/57] Checkpoint. 
Not sure what has been changed --- btx/processing/freqdir.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index da970a751..b4b2e1efb 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -229,7 +229,7 @@ def get_formatted_images(self, n): img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T nimg_batch = [] - for img in img_batch: + for img in img_batch.T: if self.threshold: secondQuartile = np.sort(img)[-1]//4 nimg = (img>secondQuartile)*img @@ -243,7 +243,7 @@ def get_formatted_images(self, n): nimg_batch.append(nimg/currIntensity) else: nimg_batch.append(nimg) - return np.array(nimg_batch) + return np.array(nimg_batch).T def fetch_and_update_model(self, n): """ @@ -575,7 +575,7 @@ def __init__(self, divBy, readFile, dir, allWriteDirecs, currRun): sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) if self.rank==0: - print(self.buffSizes) + print("BUFFER SIZES: ", self.buffSizes) self.fd.update_model(self.data.T) From 2a298ba781aff5a0ae4a796ebffe05dbfaac27c8 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Wed, 9 Aug 2023 08:49:59 -0700 Subject: [PATCH 18/57] Checkpoint --- btx/processing/freqdir.py | 62 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index b4b2e1efb..c3c18fa75 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -24,7 +24,8 @@ import h5py from PIL import Image - +import random +import heapq class FreqDir: @@ -84,6 +85,7 @@ class FreqDir: num_incorporated_images: number of images processed so far imgsTracked: indices of images processed so far currRun: Current datetime used to identify run + samplingFactor: Proportion of batch data to process based on Priority Sampling Algorithm """ def __init__( @@ -105,6 +107,7 @@ def __init__( threshold=False, normalizeIntensity=False, noZeroIntensity=False, + samplingFactor=1.0 ): self.comm = MPI.COMM_WORLD @@ -144,6 +147,8 @@ def __init__( self.noZeroIntensity = noZeroIntensity self.normalizeIntensity=normalizeIntensity + self.samplingFactor = samplingFactor + def set_params(self, num_images, bin_factor): """ Method to initialize FreqDir parameters. 
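Note on samplingFactor: it hooks into the priority-sampling reduction whose helper classes (PrioritySampling and CustomPriorityQueue) appear further down in this patch. Each incoming row gets priority ||row||^2 / u with u drawn uniformly from (0, 1), and only the highest-priority rows of a batch are kept. A minimal version of that selection without the heap bookkeeping (names and sizes are illustrative):

    import numpy as np

    def priority_sample(rows, keep, rng):
        # Priority of row i is w_i / u_i with w_i = ||row_i||^2 and u_i ~ Uniform(0, 1);
        # rows with large norm almost always survive, small rows survive occasionally.
        priorities = np.array([np.linalg.norm(r) ** 2 / rng.random() for r in rows])
        chosen = np.sort(np.argsort(priorities)[-keep:])
        return rows[chosen]

    rng = np.random.default_rng(2)
    batch = rng.normal(size=(300, 64))                    # 300 candidate rows, 64 features each
    reduced = priority_sample(batch, keep=100, rng=rng)   # 100 x 64 subset fed to the sketch

As in the module, the kept rows are passed on unweighted; the commented-out reweighting line in CustomPriorityQueue.get hints at the variant that rescales each survivor by its sampling probability.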
@@ -188,8 +193,8 @@ def run(self): """ noImgsToProcess = self.num_images//self.size - for batch in range(0,noImgsToProcess,self.ell*3): - self.fetch_and_update_model(self.ell*3) + for batch in range(0,noImgsToProcess,int(self.ell*3//self.samplingFactor)): + self.fetch_and_update_model(int(self.ell*3//self.samplingFactor)) def get_formatted_images(self, n): """ @@ -255,6 +260,15 @@ def fetch_and_update_model(self, n): number of images to incorporate """ img_batch = self.get_formatted_images(n) + + if self.samplingFactor <1: + print("PRE PSAMP REDUCTION SHAPE: ", img_batch.shape) + psamp = PrioritySampling(int(n*self.samplingFactor), self.d) + for row in img_batch.T: + psamp.update(row) + img_batch = np.array(psamp.sketch.get()).T + print("PSAMP REDUCTION SHAPE: ", img_batch.shape) + if self.mean is None: self.mean = np.sum(img_batch.T, axis=0)/(img_batch.shape[1]) else: @@ -800,3 +814,45 @@ def write(self): hf.create_dataset("SmallImages", data=self.smallImgs) self.comm.Barrier() return filename + + +class CustomPriorityQueue: + def __init__(self, max_size): + self.queue = [] + self.index = 0 # To handle items with the same priority + self.max_size = max_size + + def push(self, item, priority, origWeight): + if len(self.queue) >= self.max_size: + self.pop() # Remove the lowest-priority item if queue is full + heapq.heappush(self.queue, (priority, self.index, (item, priority, origWeight))) + self.index += 1 + + def pop(self): + return heapq.heappop(self.queue)[-1] + + def is_empty(self): + return len(self.queue) == 0 + + def size(self): + return len(self.queue) + + def get(self): + ret = [] + while self.queue: + curr = heapq.heappop(self.queue)[-1] + #ret.append(curr[0]*max(curr[1], curr[2])/curr[2]) + ret.append(curr[0]) + return ret + +class PrioritySampling: + def __init__(self, ell, d): + self.ell = ell + self.d = d + self.sketch = CustomPriorityQueue(self.ell) + + def update(self, vec): + ui = random.random() + wi = np.linalg.norm(vec)**2 + pi = wi/ui + self.sketch.push(vec, pi, wi) From d321fab0f08240e4a23ef671bda8027c5e7c499e Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 10 Aug 2023 16:06:36 -0700 Subject: [PATCH 19/57] Added priority sampling in previous commit. Addressed minor comments in code review in this commit. --- btx/processing/freqdir.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index c3c18fa75..4780f85dd 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -116,8 +116,7 @@ def __init__( self.currRun = currRun - self.merger = merger - if not merger: + if not self.merger: self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) self.psi.counter = start_offset + total_imgs*self.rank//self.size self.downsample = downsample @@ -270,7 +269,7 @@ def fetch_and_update_model(self, n): print("PSAMP REDUCTION SHAPE: ", img_batch.shape) if self.mean is None: - self.mean = np.sum(img_batch.T, axis=0)/(img_batch.shape[1]) + self.mean = np.mean(img_batch, axis=1) else: self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch.T, axis=0))/( self.num_incorporated_images + (img_batch.shape[1])) From ed9e9eaff2ad38acef8ca371f3f0f5c0da6a69b9 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Wed, 16 Aug 2023 13:12:04 -0700 Subject: [PATCH 20/57] Checkpoint. Didn't really change anythnig other than initial steps towards parent class (reverting old functions to pipca versions). 
--- btx/processing/dimRed.py | 228 ++++++++++++++++++++++++++++++++++++++ btx/processing/freqdir.py | 65 ++++++++++- 2 files changed, 288 insertions(+), 5 deletions(-) create mode 100644 btx/processing/dimRed.py diff --git a/btx/processing/dimRed.py b/btx/processing/dimRed.py new file mode 100644 index 000000000..37d410b96 --- /dev/null +++ b/btx/processing/dimRed.py @@ -0,0 +1,228 @@ +import os, csv, argparse + +import numpy as np +from mpi4py import MPI + +from matplotlib import pyplot as plt +from matplotlib import colors + +import holoviews as hv +hv.extension('bokeh') +from holoviews.streams import Params + +import panel as pn +import panel.widgets as pnw + +from btx.misc.shortcuts import TaskTimer + +from btx.interfaces.ipsana import ( + PsanaInterface, + bin_data, + bin_pixel_index_map, + retrieve_pixel_index_map, + assemble_image_stack_batch, +) + +class DimRed: + + """Parallelized Incremental Principal Component Analysis.""" + + def __init__( + self, + exp, + run, + det_type, + start_offset=0, + num_images=10, + num_components=10, + batch_size=10, + priming=False, + downsample=False, + bin_factor=2, + output_dir="", + ): + + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = start_offset + self.start_offset = start_offset + + self.priming = priming + self.downsample = downsample + self.bin_factor = bin_factor + self.output_dir = output_dir + + ( + self.num_images, + self.num_components, + self.batch_size, + self.num_features, + ) = self.set_params(num_images, num_components, batch_size, bin_factor) + + self.split_indices, self.split_counts = distribute_indices_over_ranks( + self.num_features, self.size + ) + + self.task_durations = dict({}) + + self.num_incorporated_images = 0 + self.outliers, self.pc_data = [], [] + + def get_params(self): + """ + Method to retrieve iPCA params. + + Returns + ------- + num_incorporated_images : int + number of images used to build model + num_components : int + number of components maintained in model + batch_size : int + batch size used in model updates + num_features : int + dimensionality of incorporated images + """ + return ( + self.num_incorporated_images, + self.num_components, + self.batch_size, + self.num_features, + ) + + def set_params(self, num_images, num_components, batch_size, bin_factor): + """ + Method to initialize iPCA parameters. + + Parameters + ---------- + num_images : int + Desired number of images to incorporate into model. + num_components : int + Desired number of components for model to maintain. + batch_size : int + Desired size of image block to be incorporated into model at each update. + bin_factor : int + Factor to bin data by. + + Returns + ------- + num_images : int + Number of images to incorporate into model. + num_components : int + Number of components for model to maintain. + batch_size : int + Size of image block to be incorporated into model at each update. + num_features : int + Number of features (dimension) in each image. 
+ """ + max_events = self.psi.max_events + downsample = self.downsample + + num_images = min(num_images, max_events) if num_images != -1 else max_events + num_components = min(num_components, num_images) + batch_size = min(batch_size, num_images) + + # set d + det_shape = self.psi.det.shape() + num_features = np.prod(det_shape).astype(int) + + if downsample: + if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: + print("Invalid bin factor, toggled off downsampling.") + self.downsample = False + else: + num_features = int(num_features / bin_factor**2) + + return num_images, num_components, batch_size, num_features + + + def display_dashboard(self): + """ + Displays a pipca dashboard with a PC plot and intensity heatmap. + """ + + start_img = self.start_offset + + # Create PC dictionary and widgets + PCs = {f'PC{i}' : v for i, v in enumerate(self.pc_data, start=1)} + PC_options = list(PCs) + + PCx = pnw.Select(name='X-Axis', value='PC1', options=PC_options) + PCy = pnw.Select(name='Y-Axis', value='PC2', options=PC_options) + widgets_scatter = pn.WidgetBox(PCx, PCy, width=100) + + tap_source = None + posxy = hv.streams.Tap(source=tap_source, x=0, y=0) + + # Create PC scatter plot + @pn.depends(PCx.param.value, PCy.param.value) + def create_scatter(PCx, PCy): + img_index_arr = np.arange(start_img, start_img + len(PCs[PCx])) + scatter_data = {**PCs, 'Image': img_index_arr} + + opts = dict(width=400, height=300, color='Image', cmap='rainbow', + colorbar=True, show_grid=True, toolbar='above', tools=['hover']) + scatter = hv.Points(scatter_data, kdims=[PCx, PCy], vdims=['Image'], + label="%s vs %s" % (PCx.title(), PCy.title())).opts(**opts) + + posxy.source = scatter + return scatter + + # Define function to compute heatmap based on tap location + def tap_heatmap(x, y, pcx, pcy): + # Finds the index of image closest to the tap location + img_source = None + min_diff = None + square_diff = None + + for i, (xv, yv) in enumerate(zip(PCs[pcx], PCs[pcy])): + square_diff = (x - xv) ** 2 + (y - yv) ** 2 + if (min_diff is None or square_diff < min_diff): + min_diff = square_diff + img_source = i + + # Downsample so heatmap is at most 100 x 100 + counter = self.psi.counter + self.psi.counter = start_img + img_source + img = self.psi.get_images(1) + _, x_pixels, y_pixels = img.shape + self.psi.counter = counter + + max_pixels = 100 + bin_factor_x = int(x_pixels / max_pixels) + bin_factor_y = int(y_pixels / max_pixels) + + while x_pixels % bin_factor_x != 0: + bin_factor_x += 1 + while y_pixels % bin_factor_y != 0: + bin_factor_y += 1 + + img = img.reshape((x_pixels, y_pixels)) + binned_img = img.reshape(int(x_pixels / bin_factor_x), + bin_factor_x, + int(y_pixels / bin_factor_y), + bin_factor_y).mean(-1).mean(1) + + # Creates hm_data array for heatmap + bin_x_pixels, bin_y_pixels = binned_img.shape + rows = np.tile(np.arange(bin_x_pixels).reshape((bin_x_pixels, 1)), bin_y_pixels).flatten() + cols = np.tile(np.arange(bin_y_pixels), bin_x_pixels) + + hm_data = np.stack((rows, cols, binned_img.flatten())) + hm_data = hm_data.T.reshape((bin_x_pixels * bin_y_pixels, 3)) + + opts = dict(width=400, height=300, cmap='plasma', colorbar=True, toolbar='above') + heatmap = hv.HeatMap(hm_data, label="Image %s" % (start_img+img_source)).aggregate(function=np.mean).opts(**opts) + + return heatmap + + # Connect the Tap stream to the tap_heatmap callback + stream1 = [posxy] + stream2 = Params.from_params({'pcx': PCx.param.value, 'pcy': PCy.param.value}) + tap_dmap = hv.DynamicMap(tap_heatmap, 
streams=stream1+stream2) + + return pn.Row(widgets_scatter, create_scatter, tap_dmap).servable('Cross-selector') diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 4780f85dd..9efe12c1e 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -107,7 +107,9 @@ def __init__( threshold=False, normalizeIntensity=False, noZeroIntensity=False, - samplingFactor=1.0 + samplingFactor=1.0, + num_components=10, + batch_size = 10 ): self.comm = MPI.COMM_WORLD @@ -116,15 +118,22 @@ def __init__( self.currRun = currRun + self.merger = merger if not self.merger: self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) self.psi.counter = start_offset + total_imgs*self.rank//self.size self.downsample = downsample self.bin_factor = bin_factor +# ( +# self.num_images, +# self.num_features, +# ) = self.set_params(total_imgs, bin_factor) ( - self.num_images, - self.num_features, - ) = self.set_params(total_imgs, bin_factor) + self.num_images, + self.num_components, + self.batch_size, + self.num_features, + ) = self.set_params(total_imgs, num_components, batch_size, bin_factor) else: self.num_features = mergerFeatures self.task_durations = dict({}) @@ -148,7 +157,53 @@ def __init__( self.samplingFactor = samplingFactor - def set_params(self, num_images, bin_factor): + def set_params(self, num_images, num_components, batch_size, bin_factor): + """ + Method to initialize iPCA parameters. + + Parameters + ---------- + num_images : int + Desired number of images to incorporate into model. + num_components : int + Desired number of components for model to maintain. + batch_size : int + Desired size of image block to be incorporated into model at each update. + bin_factor : int + Factor to bin data by. + + Returns + ------- + num_images : int + Number of images to incorporate into model. + num_components : int + Number of components for model to maintain. + batch_size : int + Size of image block to be incorporated into model at each update. + num_features : int + Number of features (dimension) in each image. + """ + max_events = self.psi.max_events + downsample = self.downsample + + num_images = min(num_images, max_events) if num_images != -1 else max_events + num_components = min(num_components, num_images) + batch_size = min(batch_size, num_images) + + # set d + det_shape = self.psi.det.shape() + num_features = np.prod(det_shape).astype(int) + + if downsample: + if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: + print("Invalid bin factor, toggled off downsampling.") + self.downsample = False + else: + num_features = int(num_features / bin_factor**2) + + return num_images, num_components, batch_size, num_features + + def OLD_set_params(self, num_images, bin_factor): """ Method to initialize FreqDir parameters. From 79894e5251049320ebe3f544500442224954e1d8 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 20 Aug 2023 12:12:44 -0700 Subject: [PATCH 21/57] Set up parent class for dimension reduction called DimRed. Shared functions are housed here for Frequent Directions and PIPCA module. ALso appropriately modified FD and PIPCA code. Fixed indexing issue, removed means, zeroed negative values and fixed overflowing issues. Other nice changes. 
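This commit moves the shared psana/MPI setup and parameter handling into a DimRed parent class, with FreqDir and PiPCA as subclasses that call super().__init__ and then keep only their own state (the FD sketch buffer, the iPCA factors). The following toy, self-contained sketch only illustrates that layout; the class names are stand-ins and the feature dimension is fixed rather than taken from the detector shape, unlike the real classes in the diffs below, which also wire up PsanaInterface, MPI rank/size, and downsampling.

# Toy illustration of the parent/child split introduced by this commit; names are hypothetical.
import numpy as np

class DimRedSketch:                                   # stands in for btx.processing.dimRed.DimRed
    def __init__(self, num_images, num_components, batch_size):
        self.num_images = num_images
        self.num_components = min(num_components, num_images)
        self.batch_size = min(batch_size, num_images)
        self.num_incorporated_images = 0

class FreqDirSketch(DimRedSketch):                    # stands in for FreqDir
    def __init__(self, num_images, num_components, batch_size, num_features=128):
        super().__init__(num_images, num_components, batch_size)
        self.ell = self.num_components                # sketch rank
        self.sketch = np.zeros((2 * self.ell, num_features))   # 2l x d buffer

fd = FreqDirSketch(num_images=100, num_components=10, batch_size=10)
print(fd.num_components, fd.sketch.shape)             # 10 (20, 128)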
--- btx/processing/dimRed.py | 85 ++-- btx/processing/freqdir.py | 225 ++++------- btx/processing/pipca.py | 143 +++++-- btx/processing/pipcaOLD.py | 790 +++++++++++++++++++++++++++++++++++++ 4 files changed, 1051 insertions(+), 192 deletions(-) create mode 100644 btx/processing/pipcaOLD.py diff --git a/btx/processing/dimRed.py b/btx/processing/dimRed.py index 37d410b96..06ff78c32 100644 --- a/btx/processing/dimRed.py +++ b/btx/processing/dimRed.py @@ -39,7 +39,7 @@ def __init__( priming=False, downsample=False, bin_factor=2, - output_dir="", + output_dir="" ): self.comm = MPI.COMM_WORLD @@ -139,90 +139,127 @@ def set_params(self, num_images, num_components, batch_size, bin_factor): return num_images, num_components, batch_size, num_features - def display_dashboard(self): """ Displays a pipca dashboard with a PC plot and intensity heatmap. """ - + start_img = self.start_offset - + # Create PC dictionary and widgets PCs = {f'PC{i}' : v for i, v in enumerate(self.pc_data, start=1)} PC_options = list(PCs) - + PCx = pnw.Select(name='X-Axis', value='PC1', options=PC_options) PCy = pnw.Select(name='Y-Axis', value='PC2', options=PC_options) widgets_scatter = pn.WidgetBox(PCx, PCy, width=100) - + tap_source = None posxy = hv.streams.Tap(source=tap_source, x=0, y=0) - + # Create PC scatter plot @pn.depends(PCx.param.value, PCy.param.value) def create_scatter(PCx, PCy): img_index_arr = np.arange(start_img, start_img + len(PCs[PCx])) scatter_data = {**PCs, 'Image': img_index_arr} - - opts = dict(width=400, height=300, color='Image', cmap='rainbow', + + opts = dict(width=400, height=300, color='Image', cmap='rainbow', colorbar=True, show_grid=True, toolbar='above', tools=['hover']) - scatter = hv.Points(scatter_data, kdims=[PCx, PCy], vdims=['Image'], + scatter = hv.Points(scatter_data, kdims=[PCx, PCy], vdims=['Image'], label="%s vs %s" % (PCx.title(), PCy.title())).opts(**opts) - + posxy.source = scatter return scatter - + # Define function to compute heatmap based on tap location def tap_heatmap(x, y, pcx, pcy): # Finds the index of image closest to the tap location img_source = None min_diff = None square_diff = None - - for i, (xv, yv) in enumerate(zip(PCs[pcx], PCs[pcy])): + + for i, (xv, yv) in enumerate(zip(PCs[pcx], PCs[pcy])): square_diff = (x - xv) ** 2 + (y - yv) ** 2 if (min_diff is None or square_diff < min_diff): min_diff = square_diff img_source = i - + # Downsample so heatmap is at most 100 x 100 counter = self.psi.counter self.psi.counter = start_img + img_source img = self.psi.get_images(1) _, x_pixels, y_pixels = img.shape self.psi.counter = counter - + max_pixels = 100 bin_factor_x = int(x_pixels / max_pixels) bin_factor_y = int(y_pixels / max_pixels) - + while x_pixels % bin_factor_x != 0: bin_factor_x += 1 while y_pixels % bin_factor_y != 0: bin_factor_y += 1 - + img = img.reshape((x_pixels, y_pixels)) binned_img = img.reshape(int(x_pixels / bin_factor_x), bin_factor_x, int(y_pixels / bin_factor_y), bin_factor_y).mean(-1).mean(1) - + # Creates hm_data array for heatmap bin_x_pixels, bin_y_pixels = binned_img.shape rows = np.tile(np.arange(bin_x_pixels).reshape((bin_x_pixels, 1)), bin_y_pixels).flatten() cols = np.tile(np.arange(bin_y_pixels), bin_x_pixels) - + hm_data = np.stack((rows, cols, binned_img.flatten())) hm_data = hm_data.T.reshape((bin_x_pixels * bin_y_pixels, 3)) - + opts = dict(width=400, height=300, cmap='plasma', colorbar=True, toolbar='above') heatmap = hv.HeatMap(hm_data, label="Image %s" % (start_img+img_source)).aggregate(function=np.mean).opts(**opts) - + 
return heatmap - + # Connect the Tap stream to the tap_heatmap callback stream1 = [posxy] stream2 = Params.from_params({'pcx': PCx.param.value, 'pcy': PCy.param.value}) tap_dmap = hv.DynamicMap(tap_heatmap, streams=stream1+stream2) - + return pn.Row(widgets_scatter, create_scatter, tap_dmap).servable('Cross-selector') + + +def distribute_indices_over_ranks(d, size): + """ + + Parameters + ---------- + d : int + total number of dimensions + size : int + number of ranks in world + + Returns + ------- + split_indices : ndarray, shape (size+1 x 1) + division indices between ranks + split_counts : ndarray, shape (size x 1) + number of dimensions allocated per rank + """ + + total_indices = 0 + split_indices, split_counts = [0], [] + + for r in range(size): + num_per_rank = d // size + if r < (d % size): + num_per_rank += 1 + + split_counts.append(num_per_rank) + + total_indices += num_per_rank + split_indices.append(total_indices) + + split_indices = np.array(split_indices) + split_counts = np.array(split_counts) + + return split_indices, split_counts + diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 9efe12c1e..f81df0065 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1,3 +1,8 @@ +import sys +sys.path.append("/sdf/home/w/winnicki/btx/") +from btx.processing.dimRed import * + + import os, csv, argparse import numpy as np @@ -27,7 +32,7 @@ import random import heapq -class FreqDir: +class FreqDir(DimRed): """ Parallel Frequent Directions. @@ -65,13 +70,13 @@ class FreqDir: Attributes ---------- start_offset: starting index of images to process - total_imgs: total number of images to process + num_imgs: total number of images to process ell: number of components of matrix sketch alpha: proportion of components to not rotate in frequent directions algorithm exp, run, det_type: experiment properties rankAdapt: indicates whether to perform rank adaptive FD increaseEll: internal variable indicating whether ell should be increased for rank adaption - dir: directory to write output + output_dir: directory to write output merger: indicates whether object will be used to merge other FD objects mergerFeatures: used if merger is true and indicates number of features of local matrix sketches downsample, bin_factor: whether data should be downsampled and by how much @@ -91,13 +96,12 @@ class FreqDir: def __init__( self, start_offset, - total_imgs, + num_imgs, exp, run, det_type, - dir, + output_dir, currRun, - ell=0, alpha=0, rankAdapt=False, merger=False, @@ -109,39 +113,29 @@ def __init__( noZeroIntensity=False, samplingFactor=1.0, num_components=10, - batch_size = 10 + batch_size = 10, + priming=False ): - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() + super().__init__(exp=exp, run=run, det_type=det_type, start_offset=start_offset, + num_images=num_imgs, num_components=num_components, batch_size=batch_size, priming=priming, + downsample=downsample, bin_factor=bin_factor, output_dir=output_dir) + + self.psi.counter = start_offset + self.num_images*self.rank//self.size self.currRun = currRun + self.output_dir = output_dir + self.merger = merger - if not self.merger: - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = start_offset + total_imgs*self.rank//self.size - self.downsample = downsample - self.bin_factor = bin_factor -# ( -# self.num_images, -# self.num_features, -# ) = self.set_params(total_imgs, bin_factor) - ( - self.num_images, - self.num_components, - 
self.batch_size, - self.num_features, - ) = self.set_params(total_imgs, num_components, batch_size, bin_factor) - else: + + if self.merger: self.num_features = mergerFeatures - self.task_durations = dict({}) + self.num_incorporated_images = 0 - self.dir = dir self.d = self.num_features - self.ell = ell + self.ell = num_components self.m = 2*self.ell self.sketch = zeros( (self.m, self.d) ) self.nextZeroRow = 0 @@ -157,89 +151,6 @@ def __init__( self.samplingFactor = samplingFactor - def set_params(self, num_images, num_components, batch_size, bin_factor): - """ - Method to initialize iPCA parameters. - - Parameters - ---------- - num_images : int - Desired number of images to incorporate into model. - num_components : int - Desired number of components for model to maintain. - batch_size : int - Desired size of image block to be incorporated into model at each update. - bin_factor : int - Factor to bin data by. - - Returns - ------- - num_images : int - Number of images to incorporate into model. - num_components : int - Number of components for model to maintain. - batch_size : int - Size of image block to be incorporated into model at each update. - num_features : int - Number of features (dimension) in each image. - """ - max_events = self.psi.max_events - downsample = self.downsample - - num_images = min(num_images, max_events) if num_images != -1 else max_events - num_components = min(num_components, num_images) - batch_size = min(batch_size, num_images) - - # set d - det_shape = self.psi.det.shape() - num_features = np.prod(det_shape).astype(int) - - if downsample: - if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: - print("Invalid bin factor, toggled off downsampling.") - self.downsample = False - else: - num_features = int(num_features / bin_factor**2) - - return num_images, num_components, batch_size, num_features - - def OLD_set_params(self, num_images, bin_factor): - """ - Method to initialize FreqDir parameters. - - Parameters - ---------- - num_images : int - Desired number of images to incorporate into model. - bin_factor : int - Factor to bin data by. - - Returns - ------- - num_images : int - Number of images to incorporate into model. - num_features : int - Number of features (dimension) in each image. 
- """ - - max_events = self.psi.max_events - downsample = self.downsample - - num_images = min(num_images, max_events) if num_images != -1 else max_events - - # set d - det_shape = self.psi.det.shape() - num_features = np.prod(det_shape).astype(int) - - if downsample: - if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: - print("Invalid bin factor, toggled off downsampling.") - self.downsample = False - else: - num_features = int(num_features / bin_factor**2) - - return num_images, num_features - def run(self): """ Perform frequent directions matrix sketching @@ -247,8 +158,8 @@ def run(self): """ noImgsToProcess = self.num_images//self.size - for batch in range(0,noImgsToProcess,int(self.ell*3//self.samplingFactor)): - self.fetch_and_update_model(int(self.ell*3//self.samplingFactor)) + for batch in range(0,noImgsToProcess,int(self.ell*2//self.samplingFactor)): + self.fetch_and_update_model(int(self.ell*2//self.samplingFactor)) def get_formatted_images(self, n): """ @@ -287,6 +198,10 @@ def get_formatted_images(self, n): num_valid_imgs, p, x, y = imgs.shape img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + + #JOHN NEW ADDITION 08 20 2023 08 55 + img_batch[img_batch<0] = 0 + nimg_batch = [] for img in img_batch.T: if self.threshold: @@ -294,16 +209,43 @@ def get_formatted_images(self, n): nimg = (img>secondQuartile)*img else: nimg = img - currIntensity = np.sum(nimg.flatten()) - if self.noZeroIntensity and currIntensity<1000: + + currIntensity = np.sum(nimg.flatten(), dtype=np.double) + if self.noZeroIntensity and currIntensity<50000: continue else: - if currIntensity>10000 and self.normalizeIntensity: + if currIntensity>=50000 and self.normalizeIntensity: nimg_batch.append(nimg/currIntensity) else: nimg_batch.append(nimg) return np.array(nimg_batch).T + ########################################################################### + + #How to use these functions: call each of them on the image. Append the result if it is not "None" to nimg_batch. + def intensityFunc_threshold(img): + if img is None: + return img + else: + secondQuartile = np.sort(img)[-1]//4 + return (img>secondQuartile)*img + + def intensityFunc_removeZeroIntensity(img, currIntensity): + if currIntensity<50000: + return None + else: + return img + + def intensityFunc_normalizeIntensity(img, currIntensity): + if img is None: + return img + + if currIntensity<50000: + return img + else: + return img/currIntensity + ########################################################################### + def fetch_and_update_model(self, n): """ Fetch images and update model. 
@@ -324,11 +266,15 @@ def fetch_and_update_model(self, n): print("PSAMP REDUCTION SHAPE: ", img_batch.shape) if self.mean is None: - self.mean = np.mean(img_batch, axis=1) +# self.mean = np.mean(img_batch, axis=1) + self.mean = np.sum(img_batch, axis=1, dtype=np.double)/(img_batch.shape[1]) else: - self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch.T, axis=0))/( +# self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch.T, axis=0))/( +# self.num_incorporated_images + (img_batch.shape[1])) + self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch, axis=1, dtype=np.double))/( self.num_incorporated_images + (img_batch.shape[1])) - self.update_model((img_batch.T - self.mean).T) +# self.update_model((img_batch.T - self.mean).T) + self.update_model(img_batch) def update_model(self, X): @@ -593,7 +539,7 @@ def write(self): filename : string Name of h5 file where sketch, mean of data, and indices of data processed is written """ - filename = self.dir + '{}_sketch_{}.h5'.format(self.currRun, self.rank) + filename = self.output_dir + '{}_sketch_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) hf.create_dataset("mean", data=self.mean) @@ -628,7 +574,7 @@ class MergeTree: currRun: Current datetime used to identify run """ - def __init__(self, divBy, readFile, dir, allWriteDirecs, currRun): + def __init__(self, exp, run, det_type, divBy, readFile, output_dir, allWriteDirecs, currRun): self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() @@ -638,16 +584,17 @@ def __init__(self, divBy, readFile, dir, allWriteDirecs, currRun): with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] - self.fd = FreqDir(0, 0, currRun = currRun, rankAdapt=False, exp='0', run='0', det_type='0', ell=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], dir=dir) + self.fd = FreqDir(0, 0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, priming=False) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) if self.rank==0: print("BUFFER SIZES: ", self.buffSizes) + print(self.data.T.shape) self.fd.update_model(self.data.T) - self.dir = dir + self.output_dir = output_dir self.allWriteDirecs = allWriteDirecs @@ -711,7 +658,7 @@ def write(self): """ Write merged matrix sketch to h5 file """ - filename = self.dir + '{}_merge.h5'.format(self.currRun) + filename = self.output_dir + '{}_merge.h5'.format(self.currRun) if self.rank==0: with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.fd.sketch[:self.fd.ell, :]) @@ -728,7 +675,7 @@ class ApplyCompression: Attributes ---------- start_offset: starting index of images to process - total_imgs: total number of images to process + num_imgs: total number of images to process exp, run, det_type: experiment properties dir: directory to write output downsample, bin_factor: whether data should be downsampled and by how much @@ -751,12 +698,12 @@ class ApplyCompression: def __init__( self, start_offset, - total_imgs, + num_imgs, exp, run, det_type, readFile, - dir, + output_dir, batchSize, threshold, noZeroIntensity, @@ -766,25 +713,24 @@ def __init__( bin_factor=2 ): - self.dir = dir + self.output_dir = output_dir self.comm = 
MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() - self.total_imgs = total_imgs + self.num_imgs = num_imgs self.currRun = currRun - self.imgGrabber = FreqDir(start_offset=start_offset,total_imgs=total_imgs, currRun = currRun, - exp=exp,run=run,det_type=det_type,dir="", downsample=downsample, bin_factor=bin_factor, - threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity) + self.imgGrabber = FreqDir(start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, + exp=exp,run=run,det_type=det_type,output_dir="", downsample=downsample, bin_factor=bin_factor, + threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False) self.batchSize = batchSize - ( - self.num_images, - self.num_features - ) = self.imgGrabber.set_params(total_imgs, bin_factor) + self.num_images = self.imgGrabber.num_images + self.num_features = self.imgGrabber.num_features + self.num_incorporated_images = 0 with h5py.File(readFile, 'r') as hf: @@ -823,7 +769,8 @@ def fetch_and_process_data(self): self.smallImgs = toSave_img_batch else: self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) - self.apply_compression((img_batch.T - self.mean).T) +# self.apply_compression((img_batch.T - self.mean).T) + self.apply_compression(img_batch) def assembleImgsToSave(self, imgs): """ @@ -862,7 +809,7 @@ def write(self): """ Write projected data and downsampled data to h5 file """ - filename = self.dir + '{}_ProjectedData_{}.h5'.format(self.currRun, self.rank) + filename = self.output_dir + '{}_ProjectedData_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("ProjectedData", data=self.processedData) hf.create_dataset("SmallImages", data=self.smallImgs) diff --git a/btx/processing/pipca.py b/btx/processing/pipca.py index 5ce47ea8f..6c401be6f 100644 --- a/btx/processing/pipca.py +++ b/btx/processing/pipca.py @@ -1,3 +1,7 @@ +import sys +sys.path.append("/sdf/home/w/winnicki/btx/") +from btx.processing.dimRed import * + import os, csv, argparse import numpy as np @@ -6,6 +10,13 @@ from matplotlib import pyplot as plt from matplotlib import colors +import holoviews as hv +hv.extension('bokeh') +from holoviews.streams import Params + +import panel as pn +import panel.widgets as pnw + from btx.misc.shortcuts import TaskTimer from btx.interfaces.ipsana import ( @@ -16,7 +27,7 @@ assemble_image_stack_batch, ) -class PiPCA: +class PiPCA(DimRed): """Parallelized Incremental Principal Component Analysis.""" @@ -34,35 +45,20 @@ def __init__( bin_factor=2, output_dir="", ): - - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() - - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = start_offset - - self.priming = priming - self.downsample = downsample - self.bin_factor = bin_factor - self.output_dir = output_dir - - ( - self.num_images, - self.num_components, - self.batch_size, - self.num_features, - ) = self.set_params(num_images, num_components, batch_size, bin_factor) - - self.split_indices, self.split_counts = distribute_indices_over_ranks( - self.num_features, self.size + super().__init__( + exp=exp, + run=run, + det_type=det_type, + start_offset=start_offset, + num_images=num_images, + num_components=num_components, + batch_size=batch_size, + priming=priming, + downsample=downsample, + bin_factor=bin_factor, + output_dir=output_dir, ) - self.task_durations = dict({}) - - 
self.num_incorporated_images = 0 - self.outliers, self.pc_data = [], [] - def get_params(self): """ Method to retrieve iPCA params. @@ -161,6 +157,8 @@ def run(self): # update model with remaining batches for batch_size in batch_sizes: self.fetch_and_update_model(batch_size) + + print("Model complete") def get_formatted_images(self, n, start_index, end_index): """ @@ -278,7 +276,7 @@ def update_model(self, X): with TaskTimer(self.task_durations, "record pc data"): if n > 0: - self.record_loadings(X, 5) + self.record_loadings(X, q) with TaskTimer(self.task_durations, "update mean and variance"): mu_n = self.mu @@ -672,6 +670,93 @@ def display_image(self, idx, output_dir="", save_image=False): plt.show() + + def display_dashboard(self): + """ + Displays a pipca dashboard with a PC plot and intensity heatmap. + """ + + start_img = self.start_offset + + # Create PC dictionary and widgets + PCs = {f'PC{i}' : v for i, v in enumerate(self.pc_data, start=1)} + PC_options = list(PCs) + + PCx = pnw.Select(name='X-Axis', value='PC1', options=PC_options) + PCy = pnw.Select(name='Y-Axis', value='PC2', options=PC_options) + widgets_scatter = pn.WidgetBox(PCx, PCy, width=100) + + tap_source = None + posxy = hv.streams.Tap(source=tap_source, x=0, y=0) + + # Create PC scatter plot + @pn.depends(PCx.param.value, PCy.param.value) + def create_scatter(PCx, PCy): + img_index_arr = np.arange(start_img, start_img + len(PCs[PCx])) + scatter_data = {**PCs, 'Image': img_index_arr} + + opts = dict(width=400, height=300, color='Image', cmap='rainbow', + colorbar=True, show_grid=True, toolbar='above', tools=['hover']) + scatter = hv.Points(scatter_data, kdims=[PCx, PCy], vdims=['Image'], + label="%s vs %s" % (PCx.title(), PCy.title())).opts(**opts) + + posxy.source = scatter + return scatter + + # Define function to compute heatmap based on tap location + def tap_heatmap(x, y, pcx, pcy): + # Finds the index of image closest to the tap location + img_source = None + min_diff = None + square_diff = None + + for i, (xv, yv) in enumerate(zip(PCs[pcx], PCs[pcy])): + square_diff = (x - xv) ** 2 + (y - yv) ** 2 + if (min_diff is None or square_diff < min_diff): + min_diff = square_diff + img_source = i + + # Downsample so heatmap is at most 100 x 100 + counter = self.psi.counter + self.psi.counter = start_img + img_source + img = self.psi.get_images(1) + _, x_pixels, y_pixels = img.shape + self.psi.counter = counter + + max_pixels = 100 + bin_factor_x = int(x_pixels / max_pixels) + bin_factor_y = int(y_pixels / max_pixels) + + while x_pixels % bin_factor_x != 0: + bin_factor_x += 1 + while y_pixels % bin_factor_y != 0: + bin_factor_y += 1 + + img = img.reshape((x_pixels, y_pixels)) + binned_img = img.reshape(int(x_pixels / bin_factor_x), + bin_factor_x, + int(y_pixels / bin_factor_y), + bin_factor_y).mean(-1).mean(1) + + # Creates hm_data array for heatmap + bin_x_pixels, bin_y_pixels = binned_img.shape + rows = np.tile(np.arange(bin_x_pixels).reshape((bin_x_pixels, 1)), bin_y_pixels).flatten() + cols = np.tile(np.arange(bin_y_pixels), bin_x_pixels) + + hm_data = np.stack((rows, cols, binned_img.flatten())) + hm_data = hm_data.T.reshape((bin_x_pixels * bin_y_pixels, 3)) + + opts = dict(width=400, height=300, cmap='plasma', colorbar=True, toolbar='above') + heatmap = hv.HeatMap(hm_data, label="Image %s" % (start_img+img_source)).aggregate(function=np.mean).opts(**opts) + + return heatmap + + # Connect the Tap stream to the tap_heatmap callback + stream1 = [posxy] + stream2 = Params.from_params({'pcx': PCx.param.value, 'pcy': 
PCy.param.value}) + tap_dmap = hv.DynamicMap(tap_heatmap, streams=stream1+stream2) + + return pn.Row(widgets_scatter, create_scatter, tap_dmap).servable('Cross-selector') def distribute_indices_over_ranks(d, size): """ diff --git a/btx/processing/pipcaOLD.py b/btx/processing/pipcaOLD.py new file mode 100644 index 000000000..5ce47ea8f --- /dev/null +++ b/btx/processing/pipcaOLD.py @@ -0,0 +1,790 @@ +import os, csv, argparse + +import numpy as np +from mpi4py import MPI + +from matplotlib import pyplot as plt +from matplotlib import colors + +from btx.misc.shortcuts import TaskTimer + +from btx.interfaces.ipsana import ( + PsanaInterface, + bin_data, + bin_pixel_index_map, + retrieve_pixel_index_map, + assemble_image_stack_batch, +) + +class PiPCA: + + """Parallelized Incremental Principal Component Analysis.""" + + def __init__( + self, + exp, + run, + det_type, + start_offset=0, + num_images=10, + num_components=10, + batch_size=10, + priming=False, + downsample=False, + bin_factor=2, + output_dir="", + ): + + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = start_offset + + self.priming = priming + self.downsample = downsample + self.bin_factor = bin_factor + self.output_dir = output_dir + + ( + self.num_images, + self.num_components, + self.batch_size, + self.num_features, + ) = self.set_params(num_images, num_components, batch_size, bin_factor) + + self.split_indices, self.split_counts = distribute_indices_over_ranks( + self.num_features, self.size + ) + + self.task_durations = dict({}) + + self.num_incorporated_images = 0 + self.outliers, self.pc_data = [], [] + + def get_params(self): + """ + Method to retrieve iPCA params. + + Returns + ------- + num_incorporated_images : int + number of images used to build model + num_components : int + number of components maintained in model + batch_size : int + batch size used in model updates + num_features : int + dimensionality of incorporated images + """ + return ( + self.num_incorporated_images, + self.num_components, + self.batch_size, + self.num_features, + ) + + def set_params(self, num_images, num_components, batch_size, bin_factor): + """ + Method to initialize iPCA parameters. + + Parameters + ---------- + num_images : int + Desired number of images to incorporate into model. + num_components : int + Desired number of components for model to maintain. + batch_size : int + Desired size of image block to be incorporated into model at each update. + bin_factor : int + Factor to bin data by. + + Returns + ------- + num_images : int + Number of images to incorporate into model. + num_components : int + Number of components for model to maintain. + batch_size : int + Size of image block to be incorporated into model at each update. + num_features : int + Number of features (dimension) in each image. 
+ """ + max_events = self.psi.max_events + downsample = self.downsample + + num_images = min(num_images, max_events) if num_images != -1 else max_events + num_components = min(num_components, num_images) + batch_size = min(batch_size, num_images) + + # set d + det_shape = self.psi.det.shape() + num_features = np.prod(det_shape).astype(int) + + if downsample: + if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: + print("Invalid bin factor, toggled off downsampling.") + self.downsample = False + else: + num_features = int(num_features / bin_factor**2) + + return num_images, num_components, batch_size, num_features + + def run(self): + """ + Perform iPCA on run subject to initialization parameters. + """ + m = self.batch_size + num_images = self.num_images + + # initialize and prime model, if specified + if self.priming: + img_batch = self.get_formatted_images( + self.num_components, 0, self.num_features + ) + self.prime_model(img_batch) + else: + self.U = np.zeros((self.split_counts[self.rank], self.num_components)) + self.S = np.ones(self.num_components) + self.mu = np.zeros((self.split_counts[self.rank], 1)) + self.total_variance = np.zeros((self.split_counts[self.rank], 1)) + + # divide remaining number of images into batches + # will become redundant in a streaming setting, need to change + rem_imgs = num_images - self.num_incorporated_images + batch_sizes = np.array( + [m] * np.floor(rem_imgs / m).astype(int) + + ([rem_imgs % m] if rem_imgs % m else []) + ) + + # update model with remaining batches + for batch_size in batch_sizes: + self.fetch_and_update_model(batch_size) + + def get_formatted_images(self, n, start_index, end_index): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. + + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + + bin_factor = self.bin_factor + downsample = self.downsample + + # may have to rewrite eventually when number of images becomes large, + # i.e. streamed setting, either that or downsample aggressively + imgs = self.psi.get_images(n, assemble=False) + + if downsample: + imgs = bin_data(imgs, bin_factor) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + + num_valid_imgs, p, x, y = imgs.shape + formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + + return formatted_imgs[start_index:end_index, :] + + def prime_model(self, X): + """ + Initialize model on sample of data using batch PCA. + + Parameters + ---------- + X : ndarray, shape (d x n) + set of n (d x 1) observations + """ + + d, n = X.shape + + if self.rank == 0: + print(f"Priming model with {n} samples...") + + + mu_full, total_variance_full = self.calculate_sample_mean_and_variance(X) + + self.mu = mu_full[self.split_indices[self.rank]:self.split_indices[self.rank+1]] + self.total_variance = total_variance_full[self.split_indices[self.rank]:self.split_indices[self.rank+1]] + + centered_data = X - np.tile(mu_full, n) + + U, self.S, _ = np.linalg.svd(centered_data, full_matrices=False) + self.U = U[self.split_indices[self.rank]:self.split_indices[self.rank+1], :] + + self.num_incorporated_images += n + + def fetch_and_update_model(self, n): + """ + Fetch images and update model. 
+ + Parameters + ---------- + n : int + number of images to incorporate + """ + + rank = self.rank + start_index, end_index = self.split_indices[rank], self.split_indices[rank + 1] + + img_batch = self.get_formatted_images(n, start_index, end_index) + + self.update_model(img_batch) + + def update_model(self, X): + """ + Update model with new batch of observations using iPCA. + + Parameters + ---------- + X : ndarray, shape (d x m) + batch of m (d x 1) observations + + Notes + ----- + Implementation of iPCA algorithm from [1]. + + References + ---------- + [1] Ross DA, Lim J, Lin RS, Yang MH. Incremental learning for robust visual tracking. + International journal of computer vision. 2008 May;77(1):125-41. + """ + _, m = X.shape + n = self.num_incorporated_images + q = self.num_components + + with TaskTimer(self.task_durations, "total update"): + + if self.rank == 0: + print( + "Factoring {m} sample{s} into {n} sample, {q} component model...".format( + m=m, s="s" if m > 1 else "", n=n, q=q + ) + ) + + with TaskTimer(self.task_durations, "record pc data"): + if n > 0: + self.record_loadings(X, 5) + + with TaskTimer(self.task_durations, "update mean and variance"): + mu_n = self.mu + mu_m, s_m = self.calculate_sample_mean_and_variance(X) + + self.total_variance = self.update_sample_variance( + self.total_variance, s_m, mu_n, mu_m, n, m + ) + self.mu = self.update_sample_mean(mu_n, mu_m, n, m) + + with TaskTimer( + self.task_durations, "center data and compute augment vector" + ): + X_centered = X - np.tile(mu_m, m) + mean_augment_vector = np.sqrt(n * m / (n + m)) * (mu_m - mu_n) + + X_augmented = np.hstack((X_centered, mean_augment_vector)) + + with TaskTimer(self.task_durations, "first matrix product U@S"): + US = self.U @ np.diag(self.S) + + with TaskTimer(self.task_durations, "QR concatenate"): + A = np.hstack((US, X_augmented)) + + with TaskTimer(self.task_durations, "parallel QR"): + Q_r, U_tilde, S_tilde = self.parallel_qr(A) + + with TaskTimer(self.task_durations, "compute local U_prime"): + self.U = Q_r @ U_tilde[:, :q] + self.S = S_tilde[:q] + + self.num_incorporated_images += m + + + def calculate_sample_mean_and_variance(self, imgs): + """ + Compute the sample mean and variance of a flattened stack of n images. + + Parameters + ---------- + imgs : ndarray, shape (d x n) + horizonally stacked batch of flattened images + + Returns + ------- + mu_m : ndarray, shape (d x 1) + mean of imgs + su_m : ndarray, shape (d x 1) + sample variance of imgs (1 dof) + """ + d, m = imgs.shape + + mu_m = np.reshape(np.mean(imgs, axis=1), (d, 1)) + s_m = np.zeros((d, 1)) + + if m > 1: + s_m = np.reshape(np.var(imgs, axis=1, ddof=1), (d, 1)) + + return mu_m, s_m + + def parallel_qr(self, A): + """ + Perform parallelized qr factorization on input matrix A. + + Parameters + ---------- + A : ndarray, shape (_ x q+m+1) + Input data to be factorized. + + Returns + ------- + q_fin : ndarray, shape (_, q+m+1) + Q_{r,1} from TSQR algorithm, where r = self.rank + 1 + U_tilde : ndarray, shape (q+m+1, q+m+1) + Q_{r,2} from TSQR algorithm, where r = self.rank + 1 + S_tilde : ndarray, shape (q+m+1) + R_tilde from TSQR algorithm, where r = self.rank + 1 + + Notes + ----- + Parallel QR algorithm implemented from [1], with additional elements from [2] + sprinkled in to record elements for iPCA using SVD, etc. + + References + ---------- + [1] Benson AR, Gleich DF, Demmel J. Direct QR factorizations for tall-and-skinny + matrices in MapReduce architectures. 
In2013 IEEE international conference on + big data 2013 Oct 6 (pp. 264-272). IEEE. + + [2] Ross DA, Lim J, Lin RS, Yang MH. Incremental learning for robust visual tracking. + International journal of computer vision. 2008 May;77(1):125-41. + + [3] Maulik, R., & Mengaldo, G. (2021, November). PyParSVD: A streaming, distributed and + randomized singular-value-decomposition library. In 2021 7th International Workshop on + Data Analysis and Reduction for Big Scientific Data (DRBSD-7) (pp. 19-25). IEEE. + """ + _, x = A.shape + q = self.num_components + m = x - q - 1 + + with TaskTimer(self.task_durations, "qr - local qr"): + Q_r1, R_r = np.linalg.qr(A, mode="reduced") + + self.comm.Barrier() + + with TaskTimer(self.task_durations, "qr - r_tot gather"): + if self.rank == 0: + R = np.empty((self.size * (q + m + 1), q + m + 1)) + else: + R = None + + self.comm.Gather(R_r, R, root=0) + + if self.rank == 0: + with TaskTimer(self.task_durations, "qr - global qr"): + Q_2, R_tilde = np.linalg.qr(R, mode="reduced") + + with TaskTimer(self.task_durations, "qr - global svd"): + U_tilde, S_tilde, _ = np.linalg.svd(R_tilde) + else: + U_tilde = np.empty((q + m + 1, q + m + 1)) + S_tilde = np.empty(q + m + 1) + Q_2 = None + + self.comm.Barrier() + + with TaskTimer(self.task_durations, "qr - scatter q_tot"): + Q_r2 = np.empty((q + m + 1, q + m + 1)) + self.comm.Scatter(Q_2, Q_r2, root=0) + + with TaskTimer(self.task_durations, "qr - local matrix build"): + Q_r = Q_r1 @ Q_r2 + + self.comm.Barrier() + + with TaskTimer(self.task_durations, "qr - bcast S_tilde"): + self.comm.Bcast(S_tilde, root=0) + + self.comm.Barrier() + + with TaskTimer(self.task_durations, "qr - bcast U_tilde"): + self.comm.Bcast(U_tilde, root=0) + + return Q_r, U_tilde, S_tilde + + def update_sample_mean(self, mu_n, mu_m, n, m): + """ + Compute combined mean of two blocks of data. + + Parameters + ---------- + mu_n : ndarray, shape (d x 1) + mean of first block of data + mu_m : ndarray, shape (d x 1) + mean of second block of data + n : int + number of observations in first block of data + m : int + number of observations in second block of data + + Returns + ------- + mu_nm : ndarray, shape (d x 1) + combined mean of both blocks of input data + """ + mu_nm = mu_m + + if n != 0: + mu_nm = (1 / (n + m)) * (n * mu_n + m * mu_m) + + return mu_nm + + def update_sample_variance(self, s_n, s_m, mu_n, mu_m, n, m): + """ + Compute combined sample variance of two blocks + of data described by input parameters. + + Parameters + ---------- + s_n : ndarray, shape (d x 1) + sample variance of first block of data + s_m : ndarray, shape (d x 1) + sample variance of second block of data + mu_n : ndarray, shape (d x 1) + mean of first block of data + mu_m : ndarray, shape (d x 1) + mean of second block of data + n : int + number of observations in first block of data + m : int + number of observations in second block of data + + Returns + ------- + s_nm : ndarray, shape (d x 1) + combined sample variance of both blocks of data described by input + parameters + """ + s_nm = s_m + + if n != 0: + s_nm = (((n - 1) * s_n + (m - 1) * s_m) + + (n * m * (mu_n - mu_m) ** 2) / (n + m)) / (n + m - 1) + + return s_nm + + def get_model(self): + """ + Method to retrieve model parameters. + + Returns + ------- + U_tot : ndarray, shape (d x q) + iPCA principal axes from model. + S_tot : ndarray, shape (1 x q) + iPCA singular values from model. + mu_tot : ndarray, shape (1 x d) + Data mean computed from all input images. 
+ var_tot : ndarray, shape (1 x d) + Sample data variance computed from all input images. + """ + if self.rank == 0: + U_tot = np.empty(self.num_features * self.num_components) + mu_tot = np.empty((self.num_features, 1)) + var_tot = np.empty((self.num_features, 1)) + else: + U_tot, mu_tot, var_tot = None, None, None + + start_indices = self.split_indices[:-1] + + self.comm.Gatherv( + self.U.flatten(), + [ + U_tot, + self.split_counts * self.num_components, + start_indices * self.num_components, + MPI.DOUBLE, + ], + root=0, + ) + + if self.rank == 0: + U_tot = np.reshape(U_tot, (self.num_features, self.num_components)) + + self.comm.Gatherv( + self.mu, + [ + mu_tot, + self.split_counts * self.num_components, + start_indices, + MPI.DOUBLE, + ], + root=0, + ) + self.comm.Gatherv( + self.total_variance, + [ + var_tot, + self.split_counts * self.num_components, + start_indices, + MPI.DOUBLE, + ], + root=0, + ) + + S_tot = self.S + + return U_tot, S_tot, mu_tot, var_tot + + def get_outliers(self): + """ + Method to retrieve and print outliers on root process. + """ + + if self.rank == 0: + print(self.outliers) + + def record_loadings(self, X, q_sig): + """ + Method to store all loadings, ΣV^T, from present batch using past + model iteration. + + Parameters + ---------- + X : ndarray, shape (_ x m) + Local subdivision of current image data batch. + + q_sig : int + The q_sig components used in generating the loadings for + """ + _, m = X.shape + n, d = self.num_incorporated_images, self.num_features + + start_indices = self.split_indices[:-1] + + U, _, mu, _ = self.get_model() + + if self.rank == 0: + X_tot = np.empty((d, m)) + else: + X_tot = None + + self.comm.Gatherv( + X.flatten(), + [ + X_tot, + self.split_counts * m, + start_indices * m, + MPI.DOUBLE, + ], + root=0, + ) + + if self.rank == 0: + + X_tot = np.reshape(X_tot, (d, m)) + cb = X_tot - np.tile(mu, (1, m)) + + pcs = U.T @ cb + self.pc_data = ( + np.concatenate((self.pc_data, pcs), axis=1) + if len(self.pc_data) + else pcs + ) + + pc_dist = np.linalg.norm(pcs[:q_sig], axis=0) + std = np.std(pc_dist) + mu = np.mean(pc_dist) + + batch_outliers = np.where(np.abs(pc_dist - mu) > std)[0] + n - m + + self.outliers = ( + np.concatenate((self.outliers, batch_outliers), axis=0) + if len(self.outliers) + else batch_outliers + ) + + def display_image(self, idx, output_dir="", save_image=False): + """ + Method to retrieve single image from run subject to model binning constraints. + + Parameters + ---------- + idx : int + Run index of image to be retrieved. 
+ output_dir : str, optional + File path to output directory, by default "" + save_image : bool, optional + Whether to save image to file, by default False + """ + + U, S, mu, var = self.get_model() + + if self.rank != 0: + return + + bin_factor = 1 + if self.downsample: + bin_factor = self.bin_factor + + n, q, m, d = self.get_params() + + a, b, c = self.psi.det.shape() + b = int(b / bin_factor) + c = int(c / bin_factor) + + fig, ax = plt.subplots(1) + + counter = self.psi.counter + self.psi.counter = idx + img = self.get_formatted_images(1, 0, d) + self.psi.counter = counter + + img = img - mu + img = np.reshape(img, (a, b, c)) + + pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) + binned_pim = bin_pixel_index_map(pixel_index_map, bin_factor) + + img = assemble_image_stack_batch(img, binned_pim) + + vmax = np.max(img.flatten()) + ax.imshow( + img, + norm=colors.SymLogNorm(linthresh=1.0, linscale=1.0, vmin=0, vmax=vmax), + interpolation=None + ) + + if save_image: + plt.savefig(output_dir) + + plt.show() + + +def distribute_indices_over_ranks(d, size): + """ + + Parameters + ---------- + d : int + total number of dimensions + size : int + number of ranks in world + + Returns + ------- + split_indices : ndarray, shape (size+1 x 1) + division indices between ranks + split_counts : ndarray, shape (size x 1) + number of dimensions allocated per rank + """ + + total_indices = 0 + split_indices, split_counts = [0], [] + + for r in range(size): + num_per_rank = d // size + if r < (d % size): + num_per_rank += 1 + + split_counts.append(num_per_rank) + + total_indices += num_per_rank + split_indices.append(total_indices) + + split_indices = np.array(split_indices) + split_counts = np.array(split_counts) + + return split_indices, split_counts + + +#### for command line use ### + + +def parse_input(): + """ + Parse command line input. 
+ """ + + parser = argparse.ArgumentParser() + parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) + parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) + parser.add_argument( + "-d", + "--det_type", + help="Detector name, e.g epix10k2M or jungfrau4M.", + required=True, + type=str, + ) + parser.add_argument( + "--start_offset", + help="Run index of first image to be incorporated into iPCA model.", + required=False, + type=int, + ) + parser.add_argument( + "--num_components", + help="Number of principal components to compute and maintain.", + required=False, + type=int, + ) + parser.add_argument( + "--batch_size", + help="Size of image batch incorporated in each model update.", + required=False, + type=int, + ) + parser.add_argument( + "--num_images", + help="Total number of images to be incorporated into model.", + required=False, + type=int, + ) + parser.add_argument( + "--output_dir", + help="Path to output directory for recording task duration data.", + required=False, + type=str, + ) + parser.add_argument( + "--priming", + help="Initialize model with PCA.", + required=False, + action="store_true", + ) + parser.add_argument( + "--downsample", + help="Enable downsampling of images.", + required=False, + action="store_true", + ) + parser.add_argument( + "--bin_factor", + help="Bin factor if using downsizing.", + required=False, + type=int, + ) + + return parser.parse_args() + + +if __name__ == "__main__": + + params = parse_input() + kwargs = {k: v for k, v in vars(params).items() if v is not None} + + pipca = PiPCA(**kwargs) + pipca.run() + pipca.get_outliers() From 2c57322fc1dc7cd21c0100db0a0ce4fef14ea84a Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Wed, 23 Aug 2023 15:08:38 -0700 Subject: [PATCH 22/57] Cleaned up code and added documentation where appropriate. --- btx/processing/dimRed.py | 6 ++--- btx/processing/freqdir.py | 55 ++++++++++++++++++++++++++++++++------- 2 files changed, 48 insertions(+), 13 deletions(-) diff --git a/btx/processing/dimRed.py b/btx/processing/dimRed.py index 06ff78c32..0bd1db85d 100644 --- a/btx/processing/dimRed.py +++ b/btx/processing/dimRed.py @@ -25,7 +25,7 @@ class DimRed: - """Parallelized Incremental Principal Component Analysis.""" + """Dimension Reduction Parent Class.""" def __init__( self, @@ -73,7 +73,7 @@ def __init__( def get_params(self): """ - Method to retrieve iPCA params. + Method to retrieve dimension reduction parameters. Returns ------- @@ -95,7 +95,7 @@ def get_params(self): def set_params(self, num_images, num_components, batch_size, bin_factor): """ - Method to initialize iPCA parameters. + Method to initialize dimension reduction parameters. Parameters ---------- diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index f81df0065..2d2125620 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -35,7 +35,7 @@ class FreqDir(DimRed): """ - Parallel Frequent Directions. + Parallel Rank Adaptive Frequent Directions. Based on [1] and [2]. Frequent Directions is a matrix sketching algorithm used to approximate large data sets. 
The basic goal of matrix sketching is to process an @@ -199,13 +199,16 @@ def get_formatted_images(self, n): img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - #JOHN NEW ADDITION 08 20 2023 08 55 img_batch[img_batch<0] = 0 nimg_batch = [] for img in img_batch.T: if self.threshold: - secondQuartile = np.sort(img)[-1]//4 +# secondQuartile = np.sort(img)[-1]//4 +# secondQuartile = np.mean(img) +# secondQuartile = np.median(img) +# secondQuartile = np.partition(img, -len(img)//4)[-len(img)//4] + secondQuartile = np.quantile(img, 0.85) nimg = (img>secondQuartile)*img else: nimg = img @@ -258,16 +261,13 @@ def fetch_and_update_model(self, n): img_batch = self.get_formatted_images(n) if self.samplingFactor <1: - print("PRE PSAMP REDUCTION SHAPE: ", img_batch.shape) psamp = PrioritySampling(int(n*self.samplingFactor), self.d) for row in img_batch.T: psamp.update(row) img_batch = np.array(psamp.sketch.get()).T - print("PSAMP REDUCTION SHAPE: ", img_batch.shape) if self.mean is None: -# self.mean = np.mean(img_batch, axis=1) - self.mean = np.sum(img_batch, axis=1, dtype=np.double)/(img_batch.shape[1]) + self.mean = np.mean(img_batch, axis=1) else: # self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch.T, axis=0))/( # self.num_incorporated_images + (img_batch.shape[1])) @@ -321,7 +321,7 @@ def update_model(self, X): self.m = 2*self.ell self.sketch = np.vstack((*self.sketch, np.zeros((20, self.d)))) self.increaseEll = False - print("INCREASING RANK OF PROCESS {} TO {}".format(self.rank, self.ell)) + print("Increasing rank of process {} to {}".format(self.rank, self.ell)) else: copyBatch = self.sketch[self.ell:,:].copy() self.rotate() @@ -539,13 +539,14 @@ def write(self): filename : string Name of h5 file where sketch, mean of data, and indices of data processed is written """ + self.comm.barrier() filename = self.output_dir + '{}_sketch_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) hf.create_dataset("mean", data=self.mean) hf.create_dataset("imgsTracked", data=np.array(self.imgsTracked)) hf["sketch"].attrs["numImgsIncorp"] = self.num_incorporated_images - self.comm.Barrier() + self.comm.barrier() return filename @@ -690,6 +691,8 @@ class ApplyCompression: imgageIndicesProcessed: indices of images processed so far currRun: Current datetime used to identify run imgGrabber: FD object used solely to retrieve data from psana + grabberToSaveImages: FD object used solely to retrieve + non-downsampled data for thumbnail generation components: Principal Components of matrix sketch processedData: Data projected onto matrix sketch range smallImages: Downsampled images for visualization purposes @@ -726,6 +729,9 @@ def __init__( self.imgGrabber = FreqDir(start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, exp=exp,run=run,det_type=det_type,output_dir="", downsample=downsample, bin_factor=bin_factor, threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False) + self.grabberToSaveImages = FreqDir(start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, + exp=exp,run=run,det_type=det_type,output_dir="", downsample=False, bin_factor=0, + threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False) self.batchSize = batchSize self.num_images = self.imgGrabber.num_images @@ -763,7 +769,7 @@ def fetch_and_process_data(self): img_batch = self.imgGrabber.get_formatted_images(self.batchSize) 
self.imageIndicesProcessed.append((startCounter, self.imgGrabber.psi.counter)) - toSave_img_batch = self.assembleImgsToSave(img_batch) + toSave_img_batch = self.assembleImgsToSave(self.grabberToSaveImages.get_formatted_images(self.batchSize)) if self.smallImgs is None: self.smallImgs = toSave_img_batch @@ -818,6 +824,11 @@ def write(self): class CustomPriorityQueue: + """ + Custom Priority Queue. + + Maintains a priority queue of items based on user-inputted priority for said items. + """ def __init__(self, max_size): self.queue = [] self.index = 0 # To handle items with the same priority @@ -847,6 +858,30 @@ def get(self): return ret class PrioritySampling: + """ + Priority Sampling. + + Based on [1] and [2]. Frequent Directions is a sampling algorithm that, + given a high-volume stream of weighted items, creates a generic sample + of a certain limited size that can later be used to estimate the total + weight of arbitrary subsets. In our case, we use Priority Sampling to + generate a matrix sketch based, sampling rows of our data using the + 2-norm as weights. Priority Sampling "first assigns each element i a random + number u_i ∈ Unif(0, 1). This implies a priority p_i = w_i/u_i , based + on its weight w_i (which for matrix rows w_i = ||a||_i^2). We then simply + retain the l rows with largest priorities, using a priority queue of size l." + + [1] Nick Duffield, Carsten Lund, and Mikkel Thorup. 2007. Priority sampling for + estimation of arbitrary subset sums. J. ACM 54, 6 (December 2007), 32–es. + https://doi.org/10.1145/1314690.1314696 + + Attributes + ---------- + ell: Number of components to keep + d: Number of features of each datapoint + sketch: Matrix Sketch maintained by Priority Queue + + """ def __init__(self, ell, d): self.ell = ell self.d = d From a3a25ed68573023809b4b52b67c617fab1e63f01 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Mon, 28 Aug 2023 08:59:43 -0700 Subject: [PATCH 23/57] Added UMAP visualization and wrapper class to FD module. Set up parent class for FD and PIPCA called dimension reduction (dimRed). Made various bug fixes. 
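A minimal standalone sketch of the priority-sampling rule described in the PrioritySampling docstring added in the patch above, assuming a fixed-size min-heap keyed on the priority p_i = w_i/u_i with w_i = ||a_i||^2; the function name and interface below are illustrative only, not the module's API:

import heapq
import numpy as np

def priority_sample(rows, ell, seed=0):
    """Keep the ell rows with the largest priorities p_i = ||a_i||^2 / u_i."""
    rng = np.random.default_rng(seed)
    heap = []  # min-heap of (priority, index, row); the lowest-priority row is evicted first
    for i, row in enumerate(rows):
        u = rng.uniform()                     # u_i ~ Unif(0, 1)
        p = np.dot(row, row) / u              # p_i = w_i / u_i with w_i = ||a_i||^2
        item = (p, i, row)
        if len(heap) < ell:
            heapq.heappush(heap, item)
        else:
            heapq.heappushpop(heap, item)     # drop the current lowest-priority row
    return np.array([row for _, _, row in heap])

# Illustrative use: keep 16 of 1000 random rows.
sampled = priority_sample(np.random.default_rng(1).standard_normal((1000, 8)), ell=16)

Keeping the ell largest priorities in a min-heap makes each update O(log ell), which is the role the CustomPriorityQueue class plays in the module.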
--- btx/processing/freqdir.py | 541 ++++++++++++++++++++++++- btx/processing/pipcaOLD.py | 790 ------------------------------------- 2 files changed, 533 insertions(+), 798 deletions(-) delete mode 100644 btx/processing/pipcaOLD.py diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 2d2125620..90914f42e 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1,14 +1,21 @@ import sys sys.path.append("/sdf/home/w/winnicki/btx/") from btx.processing.dimRed import * - import os, csv, argparse +import math +import time +import random +from collections import Counter +import h5py import numpy as np from numpy import zeros, sqrt, dot, diag from numpy.linalg import svd, LinAlgError from scipy.linalg import svd as scipy_svd +import pandas as pd +from sklearn.neighbors import NearestNeighbors +import heapq from mpi4py import MPI @@ -25,12 +32,23 @@ assemble_image_stack_batch, ) -import time - -import h5py from PIL import Image -import random -import heapq +from io import BytesIO +import base64 + +from datetime import datetime + +import umap +import hdbscan + +from matplotlib import colors +import matplotlib as mpl +from matplotlib import cm + +from bokeh.plotting import figure, show, output_file, save +from bokeh.models import HoverTool, CategoricalColorMapper, LinearColorMapper, ColumnDataSource, CustomJS, Slider, RangeSlider, Toggle, RadioButtonGroup, Range1d, Label +from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3 +from bokeh.layouts import column, row class FreqDir(DimRed): @@ -208,7 +226,7 @@ def get_formatted_images(self, n): # secondQuartile = np.mean(img) # secondQuartile = np.median(img) # secondQuartile = np.partition(img, -len(img)//4)[-len(img)//4] - secondQuartile = np.quantile(img, 0.85) + secondQuartile = np.quantile(img, 0.93) nimg = (img>secondQuartile)*img else: nimg = img @@ -793,7 +811,7 @@ def assembleImgsToSave(self, imgs): for img in imgs.T: imgRe = np.reshape(img, self.imgGrabber.psi.det.shape()) imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) - saveMe.append(np.array(Image.fromarray(imgRe).resize((150, 150)))) + saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) saveMe = np.array(saveMe) return saveMe @@ -892,3 +910,510 @@ def update(self, vec): wi = np.linalg.norm(vec)**2 pi = wi/ui self.sketch.push(vec, pi, wi) + + + + +class visualizeFD: + """ + Visualize FD Dimension Reduction using UMAP and DBSCAN + """ + def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, userGroupings): + self.inputFile = inputFile + self.outputFile = outputFile + output_file(filename=outputFile, title="Static HTML file") + self.viewResults = None + self.numImgsToUse = numImgsToUse + self.nprocs = nprocs + self.includeABOD = includeABOD + self.userGroupings = userGroupings + + def embeddable_image(self, data): + img_data = np.uint8(cm.jet(data/max(data.flatten()))*255) +# image = Image.fromarray(img_data, mode='RGBA').resize((75, 75), Image.Resampling.BICUBIC) + image = Image.fromarray(img_data, mode='RGBA') + buffer = BytesIO() + image.save(buffer, format='png') + for_encoding = buffer.getvalue() + return 'data:image/png;base64,' + base64.b64encode(for_encoding).decode('utf-8') + + def random_unique_numbers_from_range(self, start, end, count): + all_numbers = list(range(start, end + 1)) + random.shuffle(all_numbers) + return all_numbers[:count] + + def euclidean_distance(self, p1, p2): + return np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2) + + def compute_medoid(self, points): 
+ min_total_distance = float('inf') + medoid = None + for i, point in enumerate(points): + total_distance = 0 + for other_point in points: + total_distance += self.euclidean_distance(point, other_point) + if total_distance < min_total_distance: + min_total_distance = total_distance + medoid = point + return medoid + + def genMedoids(self, medoidLabels, clusterPoints): + dictMe = {} + for j in set(medoidLabels): + dictMe[j] = [] + for index, class_name in enumerate(medoidLabels): + dictMe[class_name].append((index, clusterPoints[index, 0], clusterPoints[index, 1])) + medoid_lst = [] + for k, v in dictMe.items(): + lst = [(x[1], x[2]) for x in v] + medoid_point = self.compute_medoid(lst) + for test_index, test_point in enumerate(lst): + if math.isclose(test_point[0],medoid_point[0]) and math.isclose(test_point[1], medoid_point[1]): + fin_ind = test_index + medoid_lst.append((k, v[fin_ind][0])) + return medoid_lst + + def relabel_to_closest_zero(self, labels): + unique_labels = sorted(set(labels)) + relabel_dict = {label: new_label for new_label, label in enumerate(unique_labels)} + relabeled = [relabel_dict[label] for label in labels] + return relabeled + + def regABOD(self, pts): + abofs = [] + for a in range(len(pts)): + test_list = [x for x in range(len(pts)) if x != a] + otherPts = [(d, e) for idx, d in enumerate(test_list) for e in test_list[idx + 1:]] + outlier_factors = [] + for b, c in otherPts: + apt = pts[a] + bpt = pts[b] + cpt = pts[c] + ab = bpt - apt + ac = cpt - apt + outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) + abofs.append(np.var(np.array(outlier_factors))) + return abofs + + def fastABOD(self, pts, nsamples): + nbrs = NearestNeighbors(n_neighbors=nsamples, algorithm='ball_tree').fit(pts) + k_inds = nbrs.kneighbors(pts)[1] + abofs = [] + count = 0 + for a in range(len(pts)): + test_list = k_inds[a][1:] + otherPts = [(d, e) for idx, d in enumerate(test_list) for e in test_list[idx + 1:]] + outlier_factors = [] + for (b, c) in otherPts: + apt = pts[a] + bpt = pts[b] + cpt = pts[c] + ab = bpt - apt + ac = cpt - apt + if math.isclose(np.linalg.norm(ab), 0.0) or math.isclose(np.linalg.norm(ac), 0.0): + count += 1 + continue + outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) + abofs.append(np.var(np.array(outlier_factors))) + return abofs + + def getOutliers(self, lst, divBy): + lstCopy = lst.copy() + lstCopy.sort() + quart10 = lstCopy[len(lstCopy)//divBy] + outlierInds = [] + notOutlierInds = [] + for j in range(len(lst)): + if lst[j] +
+            [... lines garbled in extraction: the end of getOutliers through the start of genHTML; the surviving HoverTool tooltip fragments read "Cluster #" and "@cluster" ...]
+ + """)) + plot_figure.circle( + 'x', + 'y', + source=datasource, + color=dict(field='ptColor', transform=color_mapping), + line_alpha=0.6, + fill_alpha=0.6, + size='medoidBold', + legend_field='cluster' + ) + plot_figure.sizing_mode = 'scale_both' + plot_figure.legend.location = "bottom_right" + plot_figure.legend.title = "Clusters" + + vals = [x for x in self.newLabels] + trueSource = ColumnDataSource(data=dict(vals = vals)) + hist, maxCount = self.genHist(vals, max(vals)) + left, right = self.genLeftRight(max(vals)) + histsource = ColumnDataSource(data=dict(hist=hist, left=left, right=right)) + p = figure(width=2000, height=450, toolbar_location=None, + title="Histogram Testing") + p.quad(source=histsource, top='hist', bottom=0, left='left', right='right', + fill_color='skyblue', line_color="white") + p.y_range = Range1d(0, maxCount) + p.x_range = Range1d(0, max(vals)+1) + p.xaxis.axis_label = "Cluster Label" + p.yaxis.axis_label = "Count" + + indexCDS = ColumnDataSource(dict( + index=[*range(0, self.numImgsToUse, 10)] + ) + ) + cols = RangeSlider(title="ET", + start=0, + end=self.numImgsToUse, + value=(0, self.numImgsToUse-1), + step=1, sizing_mode="stretch_width") + callback = CustomJS(args=dict(cols=cols, trueSource = trueSource, + histsource = histsource, datasource=datasource, indexCDS=indexCDS), code=""" + function countNumbersAtIndices(numbers, startInd, endInd, smallestVal, largestVal) { + let counts = new Array(largestVal-smallestVal); for (let i=0; i= index[index.length - 1]) { + cb_obj.label = '► Play'; + slider.value = [0, slider_val1-slider_val0]; + cb_obj.active = false; + clearInterval(looop); + } + else if(slider_val1 !== index[index.length - 1]){ + slider.value = [index.filter((item) => item > slider_val0)[0], index.filter((item) => item > slider_val1)[0]]; + } + else { + clearInterval(looop); + } + } + if(cb_obj.active == false){ + cb_obj.label = '► Play'; + clearInterval(looop); + } + else { + cb_obj.label = '❚❚ Pause'; + var looop = setInterval(check_and_iterate, 0.1, indexCDS.data['index']); + }; + """) + toggl.js_on_change('active',toggl_js) + + LABELS = ["DBSCAN Clustering", "Anomaly Detection"] + radio_button_group = RadioButtonGroup(labels=LABELS, active=0) + radioGroup_js = CustomJS(args=dict(datasource=datasource), code=""" + console.log(datasource.data.ptColor) + const x = datasource.data.x + const y = datasource.data.y + const image = datasource.data.image + const medoidBold = datasource.data.medoidBold + const cluster = datasource.data.cluster + const anomDet = datasource.data.anomDet + + let ptColor = null + + if (cb_obj.active==0){ + ptColor = cluster + } + else{ + ptColor = anomDet + } + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet} + """) + radio_button_group.js_on_change("active", radioGroup_js) + + self.viewResults = column(plot_figure, p, imgsPlot, row(cols, toggl, radio_button_group)) + + def fullVisualize(self): + self.genUMAP() + self.genABOD() + self.genLabels() + self.genHTML() + + def updateLabels(self): + self.genLabels() + self.genHTML() + + def userSave(self): + save(self.viewResults) + + def userShow(self): + from IPython.display import display, HTML + display(HTML("")) + display(HTML("")) + display(HTML("")) + display(HTML("")) + from bokeh.io import output_notebook + output_notebook() + show(self.viewResults) + + +class WrapperFullFD: + """ + Frequent Directions Data Processing Wrapper Class. 
+ """ + def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_components, alpha, rankAdapt, downsample, bin_factor, threshold, normalizeIntensity, noZeroIntensity, samplingFactor, priming, divBy, batchSize): + self.currRun = datetime.now().strftime("%y%m%d%H%M") + self.start_offset = start_offset + self.num_imgs = num_imgs + self.exp = exp + self.run = run + self.det_type = det_type + self.writeToHere = writeToHere + self.num_components=num_components + self.alpha = alpha + self.rankAdapt = rankAdapt + self.downsample=downsample + self.bin_factor= bin_factor + self.threshold= threshold + self.normalizeIntensity=normalizeIntensity + self.noZeroIntensity=noZeroIntensity + self.samplingFactor=samplingFactor + self.priming=priming + self.divBy = divBy + self.batchSize = batchSize + + def runMe(self): + stfull = time.perf_counter() + + #SKETCHING STEP + ########################################################################################## + freqDir = FreqDir(start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, + det_type=self.det_type, output_dir=self.writeToHere, num_components=self.num_components, alpha=self.alpha, rankAdapt=self.rankAdapt, + merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, + threshold=self.threshold, normalizeIntensity=self.normalizeIntensity, noZeroIntensity=self.noZeroIntensity, + currRun = self.currRun, samplingFactor=self.samplingFactor, priming=self.priming) + print("STARTING SKETCHING") + st = time.perf_counter() + freqDir.run() + localSketchFilename = freqDir.write() + et = time.perf_counter() + print("Estimated time for frequent directions rank {0}/{1}: {2}".format(freqDir.rank, freqDir.size, et - st)) + + #MERGING STEP + ########################################################################################## + if freqDir.rank<10: + fullSketchFilename = localSketchFilename[:-4] + else: + fullSketchFilename = localSketchFilename[:-5] + allNames = [] + for j in range(freqDir.size): + allNames.append(fullSketchFilename + str(j) + ".h5") + mergeTree = MergeTree(exp=self.exp, run=self.run, det_type=self.det_type, divBy=self.divBy, readFile = localSketchFilename, + output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun) + #mergeTree = MergeTree(divBy=2, readFile = localSketchFilename, + # dir=writeToHere, allWriteDirecs=allNames, currRun = currRun) + + st = time.perf_counter() + mergeTree.merge() + mergedSketchFilename = mergeTree.write() + et = time.perf_counter() + print("Estimated time merge tree for rank {0}/{1}: {2}".format(freqDir.rank, freqDir.size, et - st)) + + + + #PROJECTION STEP + ########################################################################################## + appComp = ApplyCompression(start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, + det_type=self.det_type, readFile = mergedSketchFilename, output_dir = self.writeToHere, + batchSize=self.batchSize, threshold=self.threshold, normalizeIntensity=self.normalizeIntensity, noZeroIntensity=self.noZeroIntensity, + downsample=self.downsample, bin_factor=self.bin_factor, currRun = self.currRun) + st = time.perf_counter() + appComp.run() + appComp.write() + et = time.perf_counter() + print("Estimated time projection for rank {0}/{1}: {2}".format(appComp.rank, appComp.size, et - st)) + + + etfull = time.perf_counter() + print("Estimated full processing time for rank {0}/{1}: {2}".format(appComp.rank, appComp.size, etfull - stfull)) + 
########################################################################################## + + if freqDir.rank==0: + st = time.perf_counter() + visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), + outputFile="./UMAPVis_{}.html".format(self.currRun), + numImgsToUse=self.num_imgs, + nprocs=freqDir.size, + userGroupings=[], + includeABOD=True) + visMe.fullVisualize() + visMe.userSave() + et = time.perf_counter() + print("UMAP HTML Generation Processing time: {}".format(et - st)) + print("TOTAL PROCESING TIME: {}".format(et - stfull)) + + diff --git a/btx/processing/pipcaOLD.py b/btx/processing/pipcaOLD.py deleted file mode 100644 index 5ce47ea8f..000000000 --- a/btx/processing/pipcaOLD.py +++ /dev/null @@ -1,790 +0,0 @@ -import os, csv, argparse - -import numpy as np -from mpi4py import MPI - -from matplotlib import pyplot as plt -from matplotlib import colors - -from btx.misc.shortcuts import TaskTimer - -from btx.interfaces.ipsana import ( - PsanaInterface, - bin_data, - bin_pixel_index_map, - retrieve_pixel_index_map, - assemble_image_stack_batch, -) - -class PiPCA: - - """Parallelized Incremental Principal Component Analysis.""" - - def __init__( - self, - exp, - run, - det_type, - start_offset=0, - num_images=10, - num_components=10, - batch_size=10, - priming=False, - downsample=False, - bin_factor=2, - output_dir="", - ): - - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() - - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = start_offset - - self.priming = priming - self.downsample = downsample - self.bin_factor = bin_factor - self.output_dir = output_dir - - ( - self.num_images, - self.num_components, - self.batch_size, - self.num_features, - ) = self.set_params(num_images, num_components, batch_size, bin_factor) - - self.split_indices, self.split_counts = distribute_indices_over_ranks( - self.num_features, self.size - ) - - self.task_durations = dict({}) - - self.num_incorporated_images = 0 - self.outliers, self.pc_data = [], [] - - def get_params(self): - """ - Method to retrieve iPCA params. - - Returns - ------- - num_incorporated_images : int - number of images used to build model - num_components : int - number of components maintained in model - batch_size : int - batch size used in model updates - num_features : int - dimensionality of incorporated images - """ - return ( - self.num_incorporated_images, - self.num_components, - self.batch_size, - self.num_features, - ) - - def set_params(self, num_images, num_components, batch_size, bin_factor): - """ - Method to initialize iPCA parameters. - - Parameters - ---------- - num_images : int - Desired number of images to incorporate into model. - num_components : int - Desired number of components for model to maintain. - batch_size : int - Desired size of image block to be incorporated into model at each update. - bin_factor : int - Factor to bin data by. - - Returns - ------- - num_images : int - Number of images to incorporate into model. - num_components : int - Number of components for model to maintain. - batch_size : int - Size of image block to be incorporated into model at each update. - num_features : int - Number of features (dimension) in each image. 
- """ - max_events = self.psi.max_events - downsample = self.downsample - - num_images = min(num_images, max_events) if num_images != -1 else max_events - num_components = min(num_components, num_images) - batch_size = min(batch_size, num_images) - - # set d - det_shape = self.psi.det.shape() - num_features = np.prod(det_shape).astype(int) - - if downsample: - if det_shape[-1] % bin_factor or det_shape[-2] % bin_factor: - print("Invalid bin factor, toggled off downsampling.") - self.downsample = False - else: - num_features = int(num_features / bin_factor**2) - - return num_images, num_components, batch_size, num_features - - def run(self): - """ - Perform iPCA on run subject to initialization parameters. - """ - m = self.batch_size - num_images = self.num_images - - # initialize and prime model, if specified - if self.priming: - img_batch = self.get_formatted_images( - self.num_components, 0, self.num_features - ) - self.prime_model(img_batch) - else: - self.U = np.zeros((self.split_counts[self.rank], self.num_components)) - self.S = np.ones(self.num_components) - self.mu = np.zeros((self.split_counts[self.rank], 1)) - self.total_variance = np.zeros((self.split_counts[self.rank], 1)) - - # divide remaining number of images into batches - # will become redundant in a streaming setting, need to change - rem_imgs = num_images - self.num_incorporated_images - batch_sizes = np.array( - [m] * np.floor(rem_imgs / m).astype(int) - + ([rem_imgs % m] if rem_imgs % m else []) - ) - - # update model with remaining batches - for batch_size in batch_sizes: - self.fetch_and_update_model(batch_size) - - def get_formatted_images(self, n, start_index, end_index): - """ - Fetch n - x image segments from run, where x is the number of 'dead' images. - - Parameters - ---------- - n : int - number of images to retrieve - start_index : int - start index of subsection of data to retrieve - end_index : int - end index of subsection of data to retrieve - - Returns - ------- - ndarray, shape (end_index-start_index, n-x) - n-x retrieved image segments of dimension end_index-start_index - """ - - bin_factor = self.bin_factor - downsample = self.downsample - - # may have to rewrite eventually when number of images becomes large, - # i.e. streamed setting, either that or downsample aggressively - imgs = self.psi.get_images(n, assemble=False) - - if downsample: - imgs = bin_data(imgs, bin_factor) - - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] - - num_valid_imgs, p, x, y = imgs.shape - formatted_imgs = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - - return formatted_imgs[start_index:end_index, :] - - def prime_model(self, X): - """ - Initialize model on sample of data using batch PCA. - - Parameters - ---------- - X : ndarray, shape (d x n) - set of n (d x 1) observations - """ - - d, n = X.shape - - if self.rank == 0: - print(f"Priming model with {n} samples...") - - - mu_full, total_variance_full = self.calculate_sample_mean_and_variance(X) - - self.mu = mu_full[self.split_indices[self.rank]:self.split_indices[self.rank+1]] - self.total_variance = total_variance_full[self.split_indices[self.rank]:self.split_indices[self.rank+1]] - - centered_data = X - np.tile(mu_full, n) - - U, self.S, _ = np.linalg.svd(centered_data, full_matrices=False) - self.U = U[self.split_indices[self.rank]:self.split_indices[self.rank+1], :] - - self.num_incorporated_images += n - - def fetch_and_update_model(self, n): - """ - Fetch images and update model. 
- - Parameters - ---------- - n : int - number of images to incorporate - """ - - rank = self.rank - start_index, end_index = self.split_indices[rank], self.split_indices[rank + 1] - - img_batch = self.get_formatted_images(n, start_index, end_index) - - self.update_model(img_batch) - - def update_model(self, X): - """ - Update model with new batch of observations using iPCA. - - Parameters - ---------- - X : ndarray, shape (d x m) - batch of m (d x 1) observations - - Notes - ----- - Implementation of iPCA algorithm from [1]. - - References - ---------- - [1] Ross DA, Lim J, Lin RS, Yang MH. Incremental learning for robust visual tracking. - International journal of computer vision. 2008 May;77(1):125-41. - """ - _, m = X.shape - n = self.num_incorporated_images - q = self.num_components - - with TaskTimer(self.task_durations, "total update"): - - if self.rank == 0: - print( - "Factoring {m} sample{s} into {n} sample, {q} component model...".format( - m=m, s="s" if m > 1 else "", n=n, q=q - ) - ) - - with TaskTimer(self.task_durations, "record pc data"): - if n > 0: - self.record_loadings(X, 5) - - with TaskTimer(self.task_durations, "update mean and variance"): - mu_n = self.mu - mu_m, s_m = self.calculate_sample_mean_and_variance(X) - - self.total_variance = self.update_sample_variance( - self.total_variance, s_m, mu_n, mu_m, n, m - ) - self.mu = self.update_sample_mean(mu_n, mu_m, n, m) - - with TaskTimer( - self.task_durations, "center data and compute augment vector" - ): - X_centered = X - np.tile(mu_m, m) - mean_augment_vector = np.sqrt(n * m / (n + m)) * (mu_m - mu_n) - - X_augmented = np.hstack((X_centered, mean_augment_vector)) - - with TaskTimer(self.task_durations, "first matrix product U@S"): - US = self.U @ np.diag(self.S) - - with TaskTimer(self.task_durations, "QR concatenate"): - A = np.hstack((US, X_augmented)) - - with TaskTimer(self.task_durations, "parallel QR"): - Q_r, U_tilde, S_tilde = self.parallel_qr(A) - - with TaskTimer(self.task_durations, "compute local U_prime"): - self.U = Q_r @ U_tilde[:, :q] - self.S = S_tilde[:q] - - self.num_incorporated_images += m - - - def calculate_sample_mean_and_variance(self, imgs): - """ - Compute the sample mean and variance of a flattened stack of n images. - - Parameters - ---------- - imgs : ndarray, shape (d x n) - horizonally stacked batch of flattened images - - Returns - ------- - mu_m : ndarray, shape (d x 1) - mean of imgs - su_m : ndarray, shape (d x 1) - sample variance of imgs (1 dof) - """ - d, m = imgs.shape - - mu_m = np.reshape(np.mean(imgs, axis=1), (d, 1)) - s_m = np.zeros((d, 1)) - - if m > 1: - s_m = np.reshape(np.var(imgs, axis=1, ddof=1), (d, 1)) - - return mu_m, s_m - - def parallel_qr(self, A): - """ - Perform parallelized qr factorization on input matrix A. - - Parameters - ---------- - A : ndarray, shape (_ x q+m+1) - Input data to be factorized. - - Returns - ------- - q_fin : ndarray, shape (_, q+m+1) - Q_{r,1} from TSQR algorithm, where r = self.rank + 1 - U_tilde : ndarray, shape (q+m+1, q+m+1) - Q_{r,2} from TSQR algorithm, where r = self.rank + 1 - S_tilde : ndarray, shape (q+m+1) - R_tilde from TSQR algorithm, where r = self.rank + 1 - - Notes - ----- - Parallel QR algorithm implemented from [1], with additional elements from [2] - sprinkled in to record elements for iPCA using SVD, etc. - - References - ---------- - [1] Benson AR, Gleich DF, Demmel J. Direct QR factorizations for tall-and-skinny - matrices in MapReduce architectures. 
In2013 IEEE international conference on - big data 2013 Oct 6 (pp. 264-272). IEEE. - - [2] Ross DA, Lim J, Lin RS, Yang MH. Incremental learning for robust visual tracking. - International journal of computer vision. 2008 May;77(1):125-41. - - [3] Maulik, R., & Mengaldo, G. (2021, November). PyParSVD: A streaming, distributed and - randomized singular-value-decomposition library. In 2021 7th International Workshop on - Data Analysis and Reduction for Big Scientific Data (DRBSD-7) (pp. 19-25). IEEE. - """ - _, x = A.shape - q = self.num_components - m = x - q - 1 - - with TaskTimer(self.task_durations, "qr - local qr"): - Q_r1, R_r = np.linalg.qr(A, mode="reduced") - - self.comm.Barrier() - - with TaskTimer(self.task_durations, "qr - r_tot gather"): - if self.rank == 0: - R = np.empty((self.size * (q + m + 1), q + m + 1)) - else: - R = None - - self.comm.Gather(R_r, R, root=0) - - if self.rank == 0: - with TaskTimer(self.task_durations, "qr - global qr"): - Q_2, R_tilde = np.linalg.qr(R, mode="reduced") - - with TaskTimer(self.task_durations, "qr - global svd"): - U_tilde, S_tilde, _ = np.linalg.svd(R_tilde) - else: - U_tilde = np.empty((q + m + 1, q + m + 1)) - S_tilde = np.empty(q + m + 1) - Q_2 = None - - self.comm.Barrier() - - with TaskTimer(self.task_durations, "qr - scatter q_tot"): - Q_r2 = np.empty((q + m + 1, q + m + 1)) - self.comm.Scatter(Q_2, Q_r2, root=0) - - with TaskTimer(self.task_durations, "qr - local matrix build"): - Q_r = Q_r1 @ Q_r2 - - self.comm.Barrier() - - with TaskTimer(self.task_durations, "qr - bcast S_tilde"): - self.comm.Bcast(S_tilde, root=0) - - self.comm.Barrier() - - with TaskTimer(self.task_durations, "qr - bcast U_tilde"): - self.comm.Bcast(U_tilde, root=0) - - return Q_r, U_tilde, S_tilde - - def update_sample_mean(self, mu_n, mu_m, n, m): - """ - Compute combined mean of two blocks of data. - - Parameters - ---------- - mu_n : ndarray, shape (d x 1) - mean of first block of data - mu_m : ndarray, shape (d x 1) - mean of second block of data - n : int - number of observations in first block of data - m : int - number of observations in second block of data - - Returns - ------- - mu_nm : ndarray, shape (d x 1) - combined mean of both blocks of input data - """ - mu_nm = mu_m - - if n != 0: - mu_nm = (1 / (n + m)) * (n * mu_n + m * mu_m) - - return mu_nm - - def update_sample_variance(self, s_n, s_m, mu_n, mu_m, n, m): - """ - Compute combined sample variance of two blocks - of data described by input parameters. - - Parameters - ---------- - s_n : ndarray, shape (d x 1) - sample variance of first block of data - s_m : ndarray, shape (d x 1) - sample variance of second block of data - mu_n : ndarray, shape (d x 1) - mean of first block of data - mu_m : ndarray, shape (d x 1) - mean of second block of data - n : int - number of observations in first block of data - m : int - number of observations in second block of data - - Returns - ------- - s_nm : ndarray, shape (d x 1) - combined sample variance of both blocks of data described by input - parameters - """ - s_nm = s_m - - if n != 0: - s_nm = (((n - 1) * s_n + (m - 1) * s_m) - + (n * m * (mu_n - mu_m) ** 2) / (n + m)) / (n + m - 1) - - return s_nm - - def get_model(self): - """ - Method to retrieve model parameters. - - Returns - ------- - U_tot : ndarray, shape (d x q) - iPCA principal axes from model. - S_tot : ndarray, shape (1 x q) - iPCA singular values from model. - mu_tot : ndarray, shape (1 x d) - Data mean computed from all input images. 
- var_tot : ndarray, shape (1 x d) - Sample data variance computed from all input images. - """ - if self.rank == 0: - U_tot = np.empty(self.num_features * self.num_components) - mu_tot = np.empty((self.num_features, 1)) - var_tot = np.empty((self.num_features, 1)) - else: - U_tot, mu_tot, var_tot = None, None, None - - start_indices = self.split_indices[:-1] - - self.comm.Gatherv( - self.U.flatten(), - [ - U_tot, - self.split_counts * self.num_components, - start_indices * self.num_components, - MPI.DOUBLE, - ], - root=0, - ) - - if self.rank == 0: - U_tot = np.reshape(U_tot, (self.num_features, self.num_components)) - - self.comm.Gatherv( - self.mu, - [ - mu_tot, - self.split_counts * self.num_components, - start_indices, - MPI.DOUBLE, - ], - root=0, - ) - self.comm.Gatherv( - self.total_variance, - [ - var_tot, - self.split_counts * self.num_components, - start_indices, - MPI.DOUBLE, - ], - root=0, - ) - - S_tot = self.S - - return U_tot, S_tot, mu_tot, var_tot - - def get_outliers(self): - """ - Method to retrieve and print outliers on root process. - """ - - if self.rank == 0: - print(self.outliers) - - def record_loadings(self, X, q_sig): - """ - Method to store all loadings, ΣV^T, from present batch using past - model iteration. - - Parameters - ---------- - X : ndarray, shape (_ x m) - Local subdivision of current image data batch. - - q_sig : int - The q_sig components used in generating the loadings for - """ - _, m = X.shape - n, d = self.num_incorporated_images, self.num_features - - start_indices = self.split_indices[:-1] - - U, _, mu, _ = self.get_model() - - if self.rank == 0: - X_tot = np.empty((d, m)) - else: - X_tot = None - - self.comm.Gatherv( - X.flatten(), - [ - X_tot, - self.split_counts * m, - start_indices * m, - MPI.DOUBLE, - ], - root=0, - ) - - if self.rank == 0: - - X_tot = np.reshape(X_tot, (d, m)) - cb = X_tot - np.tile(mu, (1, m)) - - pcs = U.T @ cb - self.pc_data = ( - np.concatenate((self.pc_data, pcs), axis=1) - if len(self.pc_data) - else pcs - ) - - pc_dist = np.linalg.norm(pcs[:q_sig], axis=0) - std = np.std(pc_dist) - mu = np.mean(pc_dist) - - batch_outliers = np.where(np.abs(pc_dist - mu) > std)[0] + n - m - - self.outliers = ( - np.concatenate((self.outliers, batch_outliers), axis=0) - if len(self.outliers) - else batch_outliers - ) - - def display_image(self, idx, output_dir="", save_image=False): - """ - Method to retrieve single image from run subject to model binning constraints. - - Parameters - ---------- - idx : int - Run index of image to be retrieved. 
- output_dir : str, optional - File path to output directory, by default "" - save_image : bool, optional - Whether to save image to file, by default False - """ - - U, S, mu, var = self.get_model() - - if self.rank != 0: - return - - bin_factor = 1 - if self.downsample: - bin_factor = self.bin_factor - - n, q, m, d = self.get_params() - - a, b, c = self.psi.det.shape() - b = int(b / bin_factor) - c = int(c / bin_factor) - - fig, ax = plt.subplots(1) - - counter = self.psi.counter - self.psi.counter = idx - img = self.get_formatted_images(1, 0, d) - self.psi.counter = counter - - img = img - mu - img = np.reshape(img, (a, b, c)) - - pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) - binned_pim = bin_pixel_index_map(pixel_index_map, bin_factor) - - img = assemble_image_stack_batch(img, binned_pim) - - vmax = np.max(img.flatten()) - ax.imshow( - img, - norm=colors.SymLogNorm(linthresh=1.0, linscale=1.0, vmin=0, vmax=vmax), - interpolation=None - ) - - if save_image: - plt.savefig(output_dir) - - plt.show() - - -def distribute_indices_over_ranks(d, size): - """ - - Parameters - ---------- - d : int - total number of dimensions - size : int - number of ranks in world - - Returns - ------- - split_indices : ndarray, shape (size+1 x 1) - division indices between ranks - split_counts : ndarray, shape (size x 1) - number of dimensions allocated per rank - """ - - total_indices = 0 - split_indices, split_counts = [0], [] - - for r in range(size): - num_per_rank = d // size - if r < (d % size): - num_per_rank += 1 - - split_counts.append(num_per_rank) - - total_indices += num_per_rank - split_indices.append(total_indices) - - split_indices = np.array(split_indices) - split_counts = np.array(split_counts) - - return split_indices, split_counts - - -#### for command line use ### - - -def parse_input(): - """ - Parse command line input. 
- """ - - parser = argparse.ArgumentParser() - parser.add_argument("-e", "--exp", help="Experiment name.", required=True, type=str) - parser.add_argument("-r", "--run", help="Run number.", required=True, type=int) - parser.add_argument( - "-d", - "--det_type", - help="Detector name, e.g epix10k2M or jungfrau4M.", - required=True, - type=str, - ) - parser.add_argument( - "--start_offset", - help="Run index of first image to be incorporated into iPCA model.", - required=False, - type=int, - ) - parser.add_argument( - "--num_components", - help="Number of principal components to compute and maintain.", - required=False, - type=int, - ) - parser.add_argument( - "--batch_size", - help="Size of image batch incorporated in each model update.", - required=False, - type=int, - ) - parser.add_argument( - "--num_images", - help="Total number of images to be incorporated into model.", - required=False, - type=int, - ) - parser.add_argument( - "--output_dir", - help="Path to output directory for recording task duration data.", - required=False, - type=str, - ) - parser.add_argument( - "--priming", - help="Initialize model with PCA.", - required=False, - action="store_true", - ) - parser.add_argument( - "--downsample", - help="Enable downsampling of images.", - required=False, - action="store_true", - ) - parser.add_argument( - "--bin_factor", - help="Bin factor if using downsizing.", - required=False, - type=int, - ) - - return parser.parse_args() - - -if __name__ == "__main__": - - params = parse_input() - kwargs = {k: v for k, v in vars(params).items() if v is not None} - - pipca = PiPCA(**kwargs) - pipca.run() - pipca.get_outliers() From b3b57b6c67895cc3ebd6afadad7da14da4b23f7c Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 31 Aug 2023 15:14:38 -0700 Subject: [PATCH 24/57] Changed data grabbing to only retrieve data once at the beginning. Also made a number of bug fixes. There still is expected to be bug in this version where too much accessing h5 causes h5 file not found error due to h5py not having parallel h5py configured. UMAP is back to working correctly though. 
--- btx/processing/freqdir.py | 664 ++++++++++++++++++++++++++++---------- 1 file changed, 498 insertions(+), 166 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 90914f42e..00d520bac 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -15,6 +15,7 @@ from scipy.linalg import svd as scipy_svd import pandas as pd from sklearn.neighbors import NearestNeighbors +from sklearn.metrics.pairwise import euclidean_distances import heapq from mpi4py import MPI @@ -35,11 +36,13 @@ from PIL import Image from io import BytesIO import base64 +import tables from datetime import datetime import umap import hdbscan +from sklearn.cluster import OPTICS, cluster_optics_dbscan from matplotlib import colors import matplotlib as mpl @@ -50,6 +53,9 @@ from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3 from bokeh.layouts import column, row +import cProfile +import string + class FreqDir(DimRed): """ @@ -113,6 +119,9 @@ class FreqDir(DimRed): def __init__( self, + comm, + rank, + size, start_offset, num_imgs, exp, @@ -120,6 +129,7 @@ def __init__( det_type, output_dir, currRun, + imgData, alpha=0, rankAdapt=False, merger=False, @@ -139,6 +149,10 @@ def __init__( num_images=num_imgs, num_components=num_components, batch_size=batch_size, priming=priming, downsample=downsample, bin_factor=bin_factor, output_dir=output_dir) + self.comm = comm + self.rank= rank + self.size = size + self.psi.counter = start_offset + self.num_images*self.rank//self.size self.currRun = currRun @@ -169,6 +183,8 @@ def __init__( self.samplingFactor = samplingFactor + self.imgData = imgData + def run(self): """ Perform frequent directions matrix sketching @@ -176,70 +192,102 @@ def run(self): """ noImgsToProcess = self.num_images//self.size - for batch in range(0,noImgsToProcess,int(self.ell*2//self.samplingFactor)): - self.fetch_and_update_model(int(self.ell*2//self.samplingFactor)) - - def get_formatted_images(self, n): - """ - Fetch n - x image segments from run, where x is the number of 'dead' images. - - Parameters - ---------- - n : int - number of images to retrieve - start_index : int - start index of subsection of data to retrieve - end_index : int - end index of subsection of data to retrieve - - Returns - ------- - ndarray, shape (end_index-start_index, n-x) - n-x retrieved image segments of dimension end_index-start_index - """ - self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) - - bin_factor = self.bin_factor - downsample = self.downsample + for currInd, batch in enumerate(range(0,noImgsToProcess,int(self.ell*2//self.samplingFactor))): + self.fetch_and_update_model(int(self.ell*2//self.samplingFactor), currInd) - # may have to rewrite eventually when number of images becomes large, - # i.e. 
streamed setting, either that or downsample aggressively - imgs = self.psi.get_images(n, assemble=False) - - if downsample: - imgs = bin_data(imgs, bin_factor) - - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] - - num_valid_imgs, p, x, y = imgs.shape - - img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - - img_batch[img_batch<0] = 0 - - nimg_batch = [] - for img in img_batch.T: - if self.threshold: -# secondQuartile = np.sort(img)[-1]//4 -# secondQuartile = np.mean(img) -# secondQuartile = np.median(img) -# secondQuartile = np.partition(img, -len(img)//4)[-len(img)//4] - secondQuartile = np.quantile(img, 0.93) - nimg = (img>secondQuartile)*img - else: - nimg = img - - currIntensity = np.sum(nimg.flatten(), dtype=np.double) - if self.noZeroIntensity and currIntensity<50000: - continue - else: - if currIntensity>=50000 and self.normalizeIntensity: - nimg_batch.append(nimg/currIntensity) - else: - nimg_batch.append(nimg) - return np.array(nimg_batch).T + def elu(self,x): + if x > 0: + return x + else: + return 0.01*(math.exp(x)-1) + +# def get_formatted_images(self, n, includeUnformatted=False): +# """ +# Fetch n - x image segments from run, where x is the number of 'dead' images. +# +# Parameters +# ---------- +# n : int +# number of images to retrieve +# start_index : int +# start index of subsection of data to retrieve +# end_index : int +# end index of subsection of data to retrieve +# +# Returns +# ------- +# ndarray, shape (end_index-start_index, n-x) +# n-x retrieved image segments of dimension end_index-start_index +# """ +# self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) +# # may have to rewrite eventually when number of images becomes large, +# # i.e. streamed setting, either that or downsample aggressively +# imgs = self.psi.get_images(n, assemble=False) +# +# if includeUnformatted: +# imgsCopy = imgs.copy() +# imgsCopy = imgsCopy[ +# [i for i in range(imgsCopy.shape[0]) if not np.isnan(imgsCopy[i : i + 1]).any()] +# ] +# num_valid_imgsCopy, p, x, y = imgsCopy.shape +# img_batchCopy = np.reshape(imgsCopy, (num_valid_imgsCopy, p * x * y)).T +# img_batchCopy[img_batchCopy<0] = 0 +# nimg_batchCopy = [] +# for img in img_batchCopy.T: +# if self.threshold: +# # secondQuartile = np.sort(img)[-1]//4 +# # secondQuartile = np.mean(img) +# # secondQuartile = np.median(img) +# # secondQuartile = np.partition(img, -len(img)//4)[-len(img)//4] +# secondQuartile = np.quantile(img, 0.93) +# nimg = (img>secondQuartile)*img +# # elu_v = np.vectorize(self.elu) +# # nimg = elu_v(img-secondQuartile)+secondQuartile +# else: +# nimg = img +# currIntensity = np.sum(nimg.flatten(), dtype=np.double) +# if self.noZeroIntensity and currIntensity<50000: +# continue +# else: +# if currIntensity>=50000 and self.normalizeIntensity: +# nimg_batchCopy.append(nimg/currIntensity) +# else: +# nimg_batchCopy.append(nimg) +# +# if self.downsample: +# imgs = bin_data(imgs, self.bin_factor) +# imgs = imgs[ +# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] +# ] +# num_valid_imgs, p, x, y = imgs.shape +# img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T +# img_batch[img_batch<0] = 0 +# nimg_batch = [] +# for img in img_batch.T: +# if self.threshold: +## secondQuartile = np.sort(img)[-1]//4 +## secondQuartile = np.mean(img) +## secondQuartile = np.median(img) +## secondQuartile = np.partition(img, -len(img)//4)[-len(img)//4] +# secondQuartile = np.quantile(img, 0.93) +# nimg = (img>secondQuartile)*img +## elu_v = 
np.vectorize(self.elu) +## nimg = elu_v(img-secondQuartile)+secondQuartile +# else: +# nimg = img +# +# currIntensity = np.sum(nimg.flatten(), dtype=np.double) +# if self.noZeroIntensity and currIntensity<50000: +# continue +# else: +# if currIntensity>=50000 and self.normalizeIntensity: +# nimg_batch.append(nimg/currIntensity) +# else: +# nimg_batch.append(nimg) +# if includeUnformatted: +# return (np.array(nimg_batch).T, np.array(nimg_batchCopy).T) +# else: +# return np.array(nimg_batch).T ########################################################################### @@ -267,7 +315,7 @@ def intensityFunc_normalizeIntensity(img, currIntensity): return img/currIntensity ########################################################################### - def fetch_and_update_model(self, n): + def fetch_and_update_model(self, n, currInd): """ Fetch images and update model. @@ -276,7 +324,9 @@ def fetch_and_update_model(self, n): n : int number of images to incorporate """ - img_batch = self.get_formatted_images(n) +# img_batch = self.get_formatted_images(n) + img_batch = self.imgData[currInd] +# print("1414oiioqdca", img_batch.shape) if self.samplingFactor <1: psamp = PrioritySampling(int(n*self.samplingFactor), self.d) @@ -321,7 +371,7 @@ def update_model(self, X): X: ndarray data to update matrix sketch with """ - _, numIncorp = X.shape + _, numIncorp = X.shape origNumIncorp = numIncorp with TaskTimer(self.task_durations, "total update"): if self.rank==0 and not self.merger: @@ -331,7 +381,6 @@ def update_model(self, X): ) ) for row in X.T: - canRankAdapt = numIncorp > (self.ell + 15) if self.nextZeroRow >= self.m: if self.increaseEll and canRankAdapt and self.rankAdapt: @@ -377,28 +426,24 @@ def rotate(self): in Computer Science, vol 8737. Springer, Berlin, Heidelberg. https://doi.org/10.1007/978-3-662-44777-2_39 """ - try: - [_,s,Vt] = svd(self.sketch , full_matrices=False) - except LinAlgError as err: - [_,s,Vt] = scipy_svd(self.sketch, full_matrices = False) - if len(s) >= self.ell: - sCopy = s.copy() + [_,S,Vt] = np.linalg.svd(self.sketch , full_matrices=False) + ssize = S.shape[0] + if ssize >= self.ell: + sCopy = S.copy() #JOHN: I think actually this should be ell+1 and ell. We lose a component otherwise. 
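#            (Illustration of the note above, not part of the patch: with 0-based indexing,
#             shrinking by the squared (ell+1)-th singular value keeps ell nonzero directions,
#
#                 delta = S[self.ell] ** 2
#                 toShrink = np.sqrt(np.maximum(S[:self.ell] ** 2 - delta, 0.0))
#
#             whereas subtracting S[self.ell-1]**2 also zeroes the ell-th direction.)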
- toShrink = s[:self.ell]**2 - s[self.ell-1]**2 + toShrink = S[:self.ell]**2 - S[self.ell-1]**2 #John: Explicitly set this value to be 0, since sometimes it is negative # or even turns to NaN due to roundoff error toShrink[-1] = 0 toShrink = sqrt(toShrink) - toShrink[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] - self.sketch[:self.ell:,:] = dot(diag(toShrink), Vt[:self.ell,:]) self.sketch[self.ell:,:] = 0 self.nextZeroRow = self.ell else: - self.sketch[:len(s),:] = dot(diag(s), Vt[:len(s),:]) - self.sketch[len(s):,:] = 0 - self.nextZeroRow = len(s) + self.sketch[:ssize,:] = diag(s) @ Vt[:ssize,:] + self.sketch[ssize:,:] = 0 + self.nextZeroRow = ssize def reconstructionError(self, matrixCentered): """ @@ -564,6 +609,8 @@ def write(self): hf.create_dataset("mean", data=self.mean) hf.create_dataset("imgsTracked", data=np.array(self.imgsTracked)) hf["sketch"].attrs["numImgsIncorp"] = self.num_incorporated_images + tables.file._open_files.close_all() + print("CREATED FILE: ", filename) self.comm.barrier() return filename @@ -593,24 +640,25 @@ class MergeTree: currRun: Current datetime used to identify run """ - def __init__(self, exp, run, det_type, divBy, readFile, output_dir, allWriteDirecs, currRun): - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() + def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output_dir, allWriteDirecs, currRun): + self.comm = comm + self.rank = rank + self.size = size self.divBy = divBy with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] + tables.file._open_files.close_all() - self.fd = FreqDir(0, 0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, priming=False) + self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, priming=False, imgData = None) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) if self.rank==0: print("BUFFER SIZES: ", self.buffSizes) - print(self.data.T.shape) +# print(self.data.shape) self.fd.update_model(self.data.T) self.output_dir = output_dir @@ -651,6 +699,8 @@ def merge(self): bufferMe = np.empty(self.buffSizes[proc] * self.data.shape[1], dtype=np.double) self.comm.Recv(bufferMe, source=proc, tag=17) bufferMe = np.reshape(bufferMe, (self.buffSizes[proc], self.data.shape[1])) +# print("BUFFERME SHAPE", bufferMe.shape) +# self.fd.update_model(np.hstack((bufferMe.T, np.zeros((bufferMe.shape[1]))))) self.fd.update_model(bufferMe.T) else: bufferMe = self.fd.get().copy().flatten() @@ -669,6 +719,7 @@ def merge(self): + hf["sketch"].attrs["numImgsIncorp"]) self.fullNumIncorp += hf["sketch"].attrs["numImgsIncorp"] self.fullImgsTracked = np.vstack((self.fullImgsTracked, hf["imgsTracked"][:])) + tables.file._open_files.close_all() return self.fd.get() else: return @@ -677,6 +728,7 @@ def write(self): """ Write merged matrix sketch to h5 file """ +# print("IMAGES TRACKED: ", self.fullNumIncorp, " ******* ", self.fullImgsTracked) filename = self.output_dir + '{}_merge.h5'.format(self.currRun) if self.rank==0: with h5py.File(filename, 'w') as hf: @@ -684,7 +736,9 @@ def write(self): 
hf.create_dataset("mean", data=self.fullMean) hf["sketch"].attrs["numImgsIncorp"] = self.fullNumIncorp hf.create_dataset("imgsTracked", data=self.fullImgsTracked) - self.comm.Barrier() + print("CREATED FILE: ", filename) + tables.file._open_files.close_all() + self.comm.barrier() return filename class ApplyCompression: @@ -718,6 +772,9 @@ class ApplyCompression: def __init__( self, + comm, + rank, + size, start_offset, num_imgs, exp, @@ -730,36 +787,39 @@ def __init__( noZeroIntensity, normalizeIntensity, currRun, + imgData, + thumbnailData, downsample=False, bin_factor=2 ): - self.output_dir = output_dir + self.comm = comm + self.rank = rank + self.size= size - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() + self.output_dir = output_dir self.num_imgs = num_imgs self.currRun = currRun - self.imgGrabber = FreqDir(start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, - exp=exp,run=run,det_type=det_type,output_dir="", downsample=downsample, bin_factor=bin_factor, - threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False) - self.grabberToSaveImages = FreqDir(start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, - exp=exp,run=run,det_type=det_type,output_dir="", downsample=False, bin_factor=0, - threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False) - self.batchSize = batchSize - - self.num_images = self.imgGrabber.num_images - self.num_features = self.imgGrabber.num_features +# self.imgGrabber = FreqDir(comm=comm, rank=rank, size=size, start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, +# exp=exp,run=run,det_type=det_type,output_dir="", downsample=downsample, bin_factor=bin_factor, +# threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False, imgData = None) +# self.grabberToSaveImages = FreqDir(comm=comm, rank=rank, size=size, start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, +# exp=exp,run=run,det_type=det_type,output_dir="", downsample=False, bin_factor=0, +# threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False, imgData = None) +# self.batchSize = batchSize self.num_incorporated_images = 0 + print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile, os.path.isfile(readFile))) + while(not os.path.isfile(readFile)): + print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile, self.rank)) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] self.mean = hf["mean"][:] + tables.file._open_files.close_all() U, S, Vt = np.linalg.svd(self.data, full_matrices=False) self.components = Vt @@ -769,25 +829,45 @@ def __init__( self.imageIndicesProcessed = [] + self.imgData = imgData + self.thumbnailData = thumbnailData + def run(self): """ Retrieve sketch, project images onto new coordinates. Save new coordinates to h5 file. 
""" - noImgsToProcess = self.num_images//self.size - for batch in range(0,noImgsToProcess,self.batchSize): - self.fetch_and_process_data() + noImgsToProcess = self.num_imgs//self.size +# for currInd, batch in enumerate(range(0,noImgsToProcess,self.batchSize)): + for currInd in range(len(self.imgData)): + self.fetch_and_process_data(currInd) +# print("RANK {} IS DONE".format(self.rank)) +# self.fetch_and_process_data() - def fetch_and_process_data(self): + def fetch_and_process_data(self, currInd): """ Fetch and downsample data, apply projection algorithm """ - startCounter = self.imgGrabber.psi.counter - img_batch = self.imgGrabber.get_formatted_images(self.batchSize) - self.imageIndicesProcessed.append((startCounter, self.imgGrabber.psi.counter)) +# startCounter = self.imgGrabber.psi.counter - toSave_img_batch = self.assembleImgsToSave(self.grabberToSaveImages.get_formatted_images(self.batchSize)) +# stimggrab = time.perf_counter() +# img_batch,img_batchUnformatted = self.imgGrabber.get_formatted_images(self.batchSize,includeUnformatted=True) +# img_batch = self.imgGrabber.get_formatted_images(self.batchSize) +# self.imageIndicesProcessed.append((startCounter, self.imgGrabber.psi.counter)) +# etimggrab = time.perf_counter() +# print("{} Image Grab TIME: ".format(self.rank), etimggrab - stimggrab) + +# stassemble = time.perf_counter() +# toSave_img_batch = self.assembleImgsToSave(self.grabberToSaveImages.get_formatted_images(self.batchSize)) +# toSave_img_batch = self.assembleImgsToSave(img_batchUnformatted) +# etassemble = time.perf_counter() +# print("{} Assemble TIME: ".format(self.rank), etassemble - stassemble) + +# stassemble = time.perf_counter() + + img_batch = self.imgData[currInd] + toSave_img_batch = self.thumbnailData[currInd] if self.smallImgs is None: self.smallImgs = toSave_img_batch @@ -795,25 +875,52 @@ def fetch_and_process_data(self): self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) # self.apply_compression((img_batch.T - self.mean).T) self.apply_compression(img_batch) +# etassemble = time.perf_counter() +# print("{} Apply Compression TIME: ".format(self.rank), etassemble - stassemble) + + +# noImgsToProcess = self.num_images//self.size +# startCounter = self.imgGrabber.psi.counter +# img_batch = self.imgGrabber.get_formatted_images(noImgsToProcess) +# self.imageIndicesProcessed.append((startCounter, self.imgGrabber.psi.counter)) +# st_compress = time.perf_counter() +# self.apply_compression(img_batch) +# et_compress = time.perf_counter() +# print("COMPRESSION TIME: ", et_compress - st_compress#) +# +# st_assemble = time.perf_counter() +# toSave_img_batch = self.assembleImgsToSave(self.grabberToSaveImages.get_formatted_images(noImgsToProcess)) +# if self.smallImgs is None: +# self.smallImgs = toSave_img_batch +# else: +# self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) +# et_assemble = time.perf_counter() +# print("ASSEMBLE TIME: ", et_assemble-st_assemble) + + +# def assembleImgsToSave(self, imgs): +# """ +# Form the images from psana pixel index map and downsample images. 
+# +# Parameters +# ---------- +# imgs: ndarray +# images to downsample +# """ +# pixel_index_map = retrieve_pixel_index_map(self.imgGrabber.psi.det.geometry(self.imgGrabber.psi.run)) +# +# saveMe = [] +# for img in imgs.T: +# imgRe = np.reshape(img, self.imgGrabber.psi.det.shape()) +# imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) +# saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) +# return np.array(saveMe) +## imgsRe = np.reshape(imgs.T, (imgs.shape[1], +## self.imgGrabber.psi.det.shape()[0], +## self.imgGrabber.psi.det.shape()[1], +## self.imgGrabber.psi.det.shape()[2])) +## return assemble_image_stack_batch(imgsRe, pixel_index_map) - def assembleImgsToSave(self, imgs): - """ - Form the images from psana pixel index map and downsample images. - - Parameters - ---------- - imgs: ndarray - images to downsample - """ - pixel_index_map = retrieve_pixel_index_map(self.imgGrabber.psi.det.geometry(self.imgGrabber.psi.run)) - - saveMe = [] - for img in imgs.T: - imgRe = np.reshape(img, self.imgGrabber.psi.det.shape()) - imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) - saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) - saveMe = np.array(saveMe) - return saveMe def apply_compression(self, X): """ @@ -837,7 +944,9 @@ def write(self): with h5py.File(filename, 'w') as hf: hf.create_dataset("ProjectedData", data=self.processedData) hf.create_dataset("SmallImages", data=self.smallImgs) - self.comm.Barrier() + tables.file._open_files.close_all() + print("CREATED FILE: ", filename) + self.comm.barrier() return filename @@ -942,20 +1051,23 @@ def random_unique_numbers_from_range(self, start, end, count): random.shuffle(all_numbers) return all_numbers[:count] - def euclidean_distance(self, p1, p2): - return np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2) +# def euclidean_distance(self, p1, p2): +# return np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2) + +# def compute_medoid(self, points): +# min_total_distance = float('inf') +# medoid = None +# for i, point in enumerate(points): +# total_distance = 0 +# for other_point in points: +# total_distance += self.euclidean_distance(point, other_point) +# if total_distance < min_total_distance: +# min_total_distance = total_distance +# medoid = point +# return medoid def compute_medoid(self, points): - min_total_distance = float('inf') - medoid = None - for i, point in enumerate(points): - total_distance = 0 - for other_point in points: - total_distance += self.euclidean_distance(point, other_point) - if total_distance < min_total_distance: - min_total_distance = total_distance - medoid = point - return medoid + return points[np.argmin(euclidean_distances(points).sum(axis=0))] def genMedoids(self, medoidLabels, clusterPoints): dictMe = {} @@ -1043,9 +1155,12 @@ def genLeftRight(self, endClass): return [*range(endClass+1)], [*range(1, endClass+2)] def genUMAP(self): + for dirval in os.listdir(self.inputFile[:-26]): + print("ITEM IN DIRECTORY:", dirval) imgs = None projections = None for currRank in range(self.nprocs): + print("GETTING CURRENT RANK: ", currRank) with h5py.File(self.inputFile+"_"+str(currRank)+".h5", 'r') as hf: if imgs is None: imgs = hf["SmallImages"][:] @@ -1053,15 +1168,18 @@ def genUMAP(self): else: imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) + tables.file._open_files.close_all() intensities = [] for img in imgs: intensities.append(np.sum(img.flatten())) intensities = np.array(intensities) - 
self.imgs = imgs[:self.numImgsToUse] - self.projections = projections[:self.numImgsToUse] - self.intensities = intensities[:self.numImgsToUse] + skipMe = 4 + self.imgs = imgs[:self.numImgsToUse:skipMe] + self.projections = projections[:self.numImgsToUse:skipMe] + self.intensities = intensities[:self.numImgsToUse:skipMe] + self.numImgsToUse = int(self.numImgsToUse/skipMe) if len(self.imgs)!= self.numImgsToUse: raise TypeError("NUMBER OF IMAGES REQUESTED ({}) EXCEEDS NUMBER OF DATA POINTS PROVIDED ({})".format(len(self.imgs), self.numImgsToUse)) @@ -1076,12 +1194,21 @@ def genUMAP(self): min_samples=int(self.numImgsToUse*0.75//40), min_cluster_size=int(self.numImgsToUse//40), ).fit_predict(self.clusterable_embedding) - exclusionList = np.array([]) self.clustered = np.isin(self.labels, exclusionList, invert=True) + self.opticsClust = OPTICS(min_samples=150, xi=0.05, min_cluster_size=0.05) + self.opticsClust.fit(self.clusterable_embedding) + self.opticsLabels = cluster_optics_dbscan( + reachability=self.opticsClust.reachability_, + core_distances=self.opticsClust.core_distances_, + ordering=self.opticsClust.ordering_, + eps=2, + ) + self.experData_df = pd.DataFrame({'x':self.clusterable_embedding[self.clustered, 0],'y':self.clusterable_embedding[self.clustered, 1]}) self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) + self.experData_df['imgind'] = np.arange(self.numImgsToUse) def genABOD(self): if self.includeABOD: @@ -1130,6 +1257,18 @@ def genLabels(self): medoidBold.append(4) self.experData_df['medoidBold'] = medoidBold + opticsNewLabels = [] + for j in self.opticsLabels[self.clustered]: + doneChecking = False + for grouping in self.userGroupings: + if j in grouping and not doneChecking: + opticsNewLabels.append(min(grouping)) + doneChecking=True + if not doneChecking: + opticsNewLabels.append(j) + opticsNewLabels = list(np.array(opticsNewLabels) + 1) + self.opticsNewLabels = np.array(self.relabel_to_closest_zero(opticsNewLabels)) + def genHTML(self): datasource = ColumnDataSource(self.experData_df) color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) @@ -1146,6 +1285,7 @@ def genHTML(self):
             Cluster #
             @cluster
+            @imgind
""")) @@ -1178,7 +1318,7 @@ def genHTML(self): p.yaxis.axis_label = "Count" indexCDS = ColumnDataSource(dict( - index=[*range(0, self.numImgsToUse, 10)] + index=[*range(0, self.numImgsToUse, 2)] ) ) cols = RangeSlider(title="ET", @@ -1218,7 +1358,8 @@ def genHTML(self): const cluster = datasource.data.cluster const ptColor = datasource.data.ptColor const anomDet = datasource.data.anomDet - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet} + const imgind = datasource.data.imgind + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind} """) cols.js_on_change('value', callback) @@ -1272,9 +1413,37 @@ def genHTML(self): """) toggl.js_on_change('active',toggl_js) - LABELS = ["DBSCAN Clustering", "Anomaly Detection"] + reachabilityDiag = figure( + title='OPTICS Reachability Diag', + tools=('pan, wheel_zoom, reset'), + width = 2000, height = 400 + ) + + space = np.arange(self.numImgsToUse) + reachability = self.opticsClust.reachability_[self.opticsClust.ordering_] + + opticsData_df = pd.DataFrame({'x':space,'y':reachability}) + opticsData_df['cluster'] = [str(x) for x in self.opticsNewLabels] + opticsData_df['ptColor'] = [x for x in opticsData_df['cluster']] + color_mapping2 = CategoricalColorMapper(factors=[str(x) for x in list(set(self.opticsNewLabels))], + palette=Category20[20]) + opticssource = ColumnDataSource(opticsData_df) + + reachabilityDiag.circle( + 'x', + 'y', + source=opticssource, + color=dict(field='ptColor', transform=color_mapping2), + line_alpha=0.6, + fill_alpha=0.6, + legend_field='cluster' + ) + reachabilityDiag.line([0, len(opticsData_df['ptColor'])], [2, 2], line_width=2, color="black", line_dash="dashed") + reachabilityDiag.y_range = Range1d(-1, 10) + + LABELS = ["DBSCAN Clustering", "OPTICS Clustering", "Anomaly Detection"] radio_button_group = RadioButtonGroup(labels=LABELS, active=0) - radioGroup_js = CustomJS(args=dict(datasource=datasource), code=""" + radioGroup_js = CustomJS(args=dict(datasource=datasource, opticssource=opticssource), code=""" console.log(datasource.data.ptColor) const x = datasource.data.x const y = datasource.data.y @@ -1282,26 +1451,37 @@ def genHTML(self): const medoidBold = datasource.data.medoidBold const cluster = datasource.data.cluster const anomDet = datasource.data.anomDet + const imgind = datasource.data.imgind + + const opticsClust = opticssource.data.cluster let ptColor = null if (cb_obj.active==0){ ptColor = cluster } + else if (cb_obj.active==1){ + ptColor = opticsClust + } else{ ptColor = anomDet } - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet} + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind} """) radio_button_group.js_on_change("active", radioGroup_js) - self.viewResults = column(plot_figure, p, imgsPlot, row(cols, toggl, radio_button_group)) + self.viewResults = column(plot_figure, p, imgsPlot, row(cols, toggl, radio_button_group), reachabilityDiag) def fullVisualize(self): + print("here 4") self.genUMAP() + print("here 5") self.genABOD() + print("here 6") self.genLabels() + print("here 7") self.genHTML() + print("here 8") def updateLabels(self): self.genLabels() @@ -1320,13 +1500,32 @@ def userShow(self): output_notebook() show(self.viewResults) +def profile(filename=None, comm=MPI.COMM_WORLD): + def prof_decorator(f): + def wrap_f(*args, **kwargs): + pr = cProfile.Profile() + pr.enable() + result = f(*args, **kwargs) + pr.disable() + + if filename is None: + pr.print_stats() + else: + filename_r = filename + 
".{}".format(comm.rank) + pr.dump_stats(filename_r) + + return result + return wrap_f + return prof_decorator + +def id_generator(size=6, chars=string.ascii_uppercase + string.digits): + return ''.join(random.choice(chars) for _ in range(size)) class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. """ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_components, alpha, rankAdapt, downsample, bin_factor, threshold, normalizeIntensity, noZeroIntensity, samplingFactor, priming, divBy, batchSize): - self.currRun = datetime.now().strftime("%y%m%d%H%M") self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1346,22 +1545,146 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.divBy = divBy self.batchSize = batchSize + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = self.start_offset + self.num_imgs*self.rank//self.size + self.imgsTracked = [] + + if self.rank==0: + self.currRun = datetime.now().strftime("%y%m%d%H%M%S") + else: + self.currRun = None + self.currRun = self.comm.bcast(self.currRun, root=0) + + def assembleImgsToSave(self, imgs): + """ + Form the images from psana pixel index map and downsample images. + + Parameters + ---------- + imgs: ndarray + images to downsample + """ + pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) + + saveMe = [] + for img in imgs: + imgRe = np.reshape(img, self.psi.det.shape()) + imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) + saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) + return np.array(saveMe) +# imgsRe = np.reshape(imgs.T, (imgs.shape[1], +# self.imgGrabber.psi.det.shape()[0], +# self.imgGrabber.psi.det.shape()[1], +# self.imgGrabber.psi.det.shape()[2])) +# return assemble_image_stack_batch(imgsRe, pixel_index_map) + + def get_formatted_images(self, startInd, n, includeThumbnails=False): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. 
+ + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + self.psi.counter = startInd + self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) + + imgs = self.psi.get_images(n, assemble=False) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + num_valid_imgs, p, x, y = imgs.shape + img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + img_batch[img_batch<0] = 0 + nimg_batch = [] + for img in img_batch.T: + if self.threshold: + secondQuartile = np.quantile(img, 0.93) + nimg = (img>secondQuartile)*img +# elu_v = np.vectorize(self.elu) +# nimg = elu_v(img-secondQuartile)+secondQuartile + else: + nimg = img + + currIntensity = np.sum(nimg.flatten(), dtype=np.double) +# print("RANK: {} ***** INTENSITY: {}".format(self.rank, currIntensity)) + if self.noZeroIntensity and currIntensity< (self.bin_factor**2) * 50000: + continue + else: + if currIntensity>=(self.bin_factor**2) * 50000 and self.normalizeIntensity: +# if not self.normalizeIntensity: + nimg_batch.append(nimg/currIntensity) + else: +# nimg_batch.append(nimg) + nimg_batch.append(np.zeros(nimg.shape)) + nimg_batch = np.array(nimg_batch) + if self.downsample: + binned_imgs = bin_data(np.reshape(nimg_batch,(num_valid_imgs, p, x, y)), self.bin_factor) + binned_num_valid_imgs, binned_p, binned_x, binned_y = binned_imgs.shape + binned_imgs = np.reshape(binned_imgs, (binned_num_valid_imgs, binned_p * binned_x * binned_y)).T +# print(binned_imgs.shape) + else: + binned_imgs = nimg_batch.T + if includeThumbnails: + return (binned_imgs, self.assembleImgsToSave(np.reshape(nimg_batch, (num_valid_imgs, p, x, y)))) + else: + return binned_imgs + + @profile(filename="fullFD_profile") def runMe(self): stfull = time.perf_counter() + #DATA RETRIEVAL STEP + ########################################################################################## + self.fullImgData = [] + self.fullThumbnailData = [] + noImgsToProcess = self.num_imgs//self.size + startingPoint = self.start_offset + self.num_imgs*self.rank//self.size + batchSize = int(self.num_components*2//self.samplingFactor) + for batch in range(0, noImgsToProcess, batchSize): + startInd = startingPoint+batch + binned_imgs, thumbnails = self.get_formatted_images(startInd, batchSize, includeThumbnails=True) +# print("aodijwaodijaodij", binned_imgs.shape, thumbnails.shape) + self.fullImgData.append(binned_imgs) + self.fullThumbnailData.append(thumbnails) + print(self.imgsTracked) + + filenameTest0 = random.randint(0, 10) + filenameTest0 = self.comm.allgather(filenameTest0) + print("TEST 0: ", self.rank, filenameTest0) + #SKETCHING STEP ########################################################################################## - freqDir = FreqDir(start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, + freqDir = FreqDir(comm= self.comm, rank=self.rank, size = self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, det_type=self.det_type, output_dir=self.writeToHere, num_components=self.num_components, alpha=self.alpha, rankAdapt=self.rankAdapt, merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, threshold=self.threshold, 
normalizeIntensity=self.normalizeIntensity, noZeroIntensity=self.noZeroIntensity, - currRun = self.currRun, samplingFactor=self.samplingFactor, priming=self.priming) - print("STARTING SKETCHING") + currRun = self.currRun, samplingFactor=self.samplingFactor, priming=self.priming, imgData = self.fullImgData) + print("STARTING SKETCHING FOR {}".format(self.currRun)) st = time.perf_counter() freqDir.run() localSketchFilename = freqDir.write() et = time.perf_counter() - print("Estimated time for frequent directions rank {0}/{1}: {2}".format(freqDir.rank, freqDir.size, et - st)) + print("Estimated time for frequent directions rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) + + filenameTest1 = random.randint(0, 10) + filenameTest1 = self.comm.allgather(filenameTest1) + print("TEST 1: ", self.rank, filenameTest1) #MERGING STEP ########################################################################################## @@ -1372,37 +1695,44 @@ def runMe(self): allNames = [] for j in range(freqDir.size): allNames.append(fullSketchFilename + str(j) + ".h5") - mergeTree = MergeTree(exp=self.exp, run=self.run, det_type=self.det_type, divBy=self.divBy, readFile = localSketchFilename, + mergeTree = MergeTree(comm=self.comm, rank=self.rank, size=self.size, exp=self.exp, run=self.run, det_type=self.det_type, divBy=self.divBy, readFile = localSketchFilename, output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun) #mergeTree = MergeTree(divBy=2, readFile = localSketchFilename, # dir=writeToHere, allWriteDirecs=allNames, currRun = currRun) - st = time.perf_counter() mergeTree.merge() mergedSketchFilename = mergeTree.write() et = time.perf_counter() - print("Estimated time merge tree for rank {0}/{1}: {2}".format(freqDir.rank, freqDir.size, et - st)) - + print("Estimated time merge tree for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) + filenameTest2 = random.randint(0, 10) + filenameTest2 = self.comm.allgather(filenameTest2) + print("TEST 2: ", self.rank, filenameTest2) #PROJECTION STEP ########################################################################################## - appComp = ApplyCompression(start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, + appComp = ApplyCompression(comm=self.comm, rank = self.rank, size=self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, det_type=self.det_type, readFile = mergedSketchFilename, output_dir = self.writeToHere, batchSize=self.batchSize, threshold=self.threshold, normalizeIntensity=self.normalizeIntensity, noZeroIntensity=self.noZeroIntensity, - downsample=self.downsample, bin_factor=self.bin_factor, currRun = self.currRun) + downsample=self.downsample, bin_factor=self.bin_factor, currRun = self.currRun, imgData = self.fullImgData, thumbnailData = self.fullThumbnailData) st = time.perf_counter() appComp.run() appComp.write() et = time.perf_counter() - print("Estimated time projection for rank {0}/{1}: {2}".format(appComp.rank, appComp.size, et - st)) - + print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) + print("Estimated full processing time for rank {0}/{1}: {2}".format(self.rank, self.size, et - stfull)) + + self.comm.barrier() + self.comm.Barrier() + filenameTest3 = random.randint(0, 10) + filenameTest3 = self.comm.allgather(filenameTest3) + print("TEST 3: ", self.rank, filenameTest3) - etfull = time.perf_counter() - print("Estimated full processing time for rank {0}/{1}: {2}".format(appComp.rank, 
appComp.size, etfull - stfull)) ########################################################################################## - - if freqDir.rank==0: + + + if self.rank==0: + print("here 1") st = time.perf_counter() visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), outputFile="./UMAPVis_{}.html".format(self.currRun), @@ -1410,7 +1740,9 @@ def runMe(self): nprocs=freqDir.size, userGroupings=[], includeABOD=True) + print("here 2") visMe.fullVisualize() + print("here 3") visMe.userSave() et = time.perf_counter() print("UMAP HTML Generation Processing time: {}".format(et - st)) From e75809ca4aeebf0642d0c9d08342e01ab1ae7037 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Fri, 1 Sep 2023 19:32:10 -0700 Subject: [PATCH 25/57] 124 hz officially a achieved. Permission denied and h5 truncated, h5 file signature found error resolved (sleep for a couple seconds and a bunch of allgather statements seems to have fixed the issue. --- btx/processing/freqdir.py | 155 ++++++++++++++++++++------------------ 1 file changed, 83 insertions(+), 72 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 00d520bac..f3c4eb0c4 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -36,7 +36,6 @@ from PIL import Image from io import BytesIO import base64 -import tables from datetime import datetime @@ -172,7 +171,7 @@ def __init__( self.sketch = zeros( (self.m, self.d) ) self.nextZeroRow = 0 self.alpha = alpha - self.mean = None +# self.mean = None self.imgsTracked = [] self.rankAdapt = rankAdapt @@ -325,8 +324,9 @@ def fetch_and_update_model(self, n, currInd): number of images to incorporate """ # img_batch = self.get_formatted_images(n) - img_batch = self.imgData[currInd] -# print("1414oiioqdca", img_batch.shape) + print("a90wjufipoamfoawfa09opi", self.imgData.shape) + img_batch = self.imgData[:, currInd*n:currInd*(n+1)] + print("1414oiioqdca", img_batch.shape) if self.samplingFactor <1: psamp = PrioritySampling(int(n*self.samplingFactor), self.d) @@ -334,13 +334,13 @@ def fetch_and_update_model(self, n, currInd): psamp.update(row) img_batch = np.array(psamp.sketch.get()).T - if self.mean is None: - self.mean = np.mean(img_batch, axis=1) - else: -# self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch.T, axis=0))/( +# if self.mean is None: +# self.mean = np.mean(img_batch, axis=1) +# else: +## self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch.T, axis=0))/( +## self.num_incorporated_images + (img_batch.shape[1])) +# self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch, axis=1, dtype=np.double))/( # self.num_incorporated_images + (img_batch.shape[1])) - self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch, axis=1, dtype=np.double))/( - self.num_incorporated_images + (img_batch.shape[1])) # self.update_model((img_batch.T - self.mean).T) self.update_model(img_batch) @@ -606,11 +606,10 @@ def write(self): filename = self.output_dir + '{}_sketch_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) - hf.create_dataset("mean", data=self.mean) +# hf.create_dataset("mean", data=self.mean) hf.create_dataset("imgsTracked", data=np.array(self.imgsTracked)) hf["sketch"].attrs["numImgsIncorp"] = self.num_incorporated_images - tables.file._open_files.close_all() - print("CREATED FILE: ", filename) + print(self.rank, "CREATED FILE: ", filename) 
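# Editorial sketch, not part of the patch: the truncated-file / "file signature
# not found" errors described in this commit happen when a reader rank opens one
# of the per-rank sketch files before the writing rank has finished flushing it.
# The patches work around this with fixed time.sleep() calls and extra allgather
# round-trips before the reads; a bounded retry on open is another way the
# reading side (e.g. the merge step) could tolerate the race.  `path`,
# `max_tries`, and `delay` below are hypothetical names used only here.
import time
import h5py

def open_sketch_with_retry(path, max_tries=10, delay=1.0):
    # Keep retrying while the file is still missing or only partially written.
    for attempt in range(max_tries):
        try:
            return h5py.File(path, 'r')
        except OSError:
            time.sleep(delay)
    raise OSError("could not open {} after {} tries".format(path, max_tries))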
self.comm.barrier() return filename @@ -647,16 +646,16 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output self.divBy = divBy + time.sleep(5) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] - tables.file._open_files.close_all() self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, priming=False, imgData = None) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) - if self.rank==0: - print("BUFFER SIZES: ", self.buffSizes) +# if self.rank==0: +# print("BUFFER SIZES: ", self.buffSizes) # print(self.data.shape) self.fd.update_model(self.data.T) @@ -711,15 +710,14 @@ def merge(self): for readMe in self.allWriteDirecs: with h5py.File(readMe, 'r') as hf: if self.fullMean is None: - self.fullMean = hf["mean"][:] +# self.fullMean = hf["mean"][:] self.fullNumIncorp = hf["sketch"].attrs["numImgsIncorp"] self.fullImgsTracked = hf["imgsTracked"][:] else: - self.fullMean = (self.fullMean*self.fullNumIncorp + hf["mean"][:])/(self.fullNumIncorp - + hf["sketch"].attrs["numImgsIncorp"]) +# self.fullMean = (self.fullMean*self.fullNumIncorp + hf["mean"][:])/(self.fullNumIncorp +# + hf["sketch"].attrs["numImgsIncorp"]) self.fullNumIncorp += hf["sketch"].attrs["numImgsIncorp"] self.fullImgsTracked = np.vstack((self.fullImgsTracked, hf["imgsTracked"][:])) - tables.file._open_files.close_all() return self.fd.get() else: return @@ -730,14 +728,19 @@ def write(self): """ # print("IMAGES TRACKED: ", self.fullNumIncorp, " ******* ", self.fullImgsTracked) filename = self.output_dir + '{}_merge.h5'.format(self.currRun) + if self.rank==0: - with h5py.File(filename, 'w') as hf: - hf.create_dataset("sketch", data=self.fd.sketch[:self.fd.ell, :]) - hf.create_dataset("mean", data=self.fullMean) - hf["sketch"].attrs["numImgsIncorp"] = self.fullNumIncorp - hf.create_dataset("imgsTracked", data=self.fullImgsTracked) - print("CREATED FILE: ", filename) - tables.file._open_files.close_all() + for ind in range(self.size): + filename2 = filename[:-3] + "_"+str(ind)+".h5" + with h5py.File(filename2, 'w') as hf: + hf.create_dataset("sketch", data=self.fd.sketch[:self.fd.ell, :]) +# hf.create_dataset("mean", data=self.fullMean) + hf["sketch"].attrs["numImgsIncorp"] = self.fullNumIncorp + hf.create_dataset("imgsTracked", data=self.fullImgsTracked) +# print("CREATED FILE: ", filename2) + self.comm.send(filename2, dest=ind, tag=ind) + else: + print("RECEIVED FILE NAME: ", self.comm.recv(source=0, tag=self.rank)) self.comm.barrier() return filename @@ -813,13 +816,15 @@ def __init__( self.num_incorporated_images = 0 - print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile, os.path.isfile(readFile))) - while(not os.path.isfile(readFile)): - print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile, self.rank)) - with h5py.File(readFile, 'r') as hf: + readFile2 = readFile[:-3] + "_"+str(self.rank)+".h5" + +# print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) +# while(not os.path.isfile(readFile2)): +# print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) + time.sleep(5) + with h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] - self.mean = hf["mean"][:] - 
tables.file._open_files.close_all() +# self.mean = hf["mean"][:] U, S, Vt = np.linalg.svd(self.data, full_matrices=False) self.components = Vt @@ -837,10 +842,10 @@ def run(self): """ Retrieve sketch, project images onto new coordinates. Save new coordinates to h5 file. """ - noImgsToProcess = self.num_imgs//self.size +# noImgsToProcess = self.num_imgs//self.size # for currInd, batch in enumerate(range(0,noImgsToProcess,self.batchSize)): - for currInd in range(len(self.imgData)): - self.fetch_and_process_data(currInd) +# for currInd in range(len(self.imgData)): + self.fetch_and_process_data(0) # print("RANK {} IS DONE".format(self.rank)) # self.fetch_and_process_data() @@ -866,8 +871,8 @@ def fetch_and_process_data(self, currInd): # stassemble = time.perf_counter() - img_batch = self.imgData[currInd] - toSave_img_batch = self.thumbnailData[currInd] + img_batch = self.imgData + toSave_img_batch = self.thumbnailData if self.smallImgs is None: self.smallImgs = toSave_img_batch @@ -944,8 +949,7 @@ def write(self): with h5py.File(filename, 'w') as hf: hf.create_dataset("ProjectedData", data=self.processedData) hf.create_dataset("SmallImages", data=self.smallImgs) - tables.file._open_files.close_all() - print("CREATED FILE: ", filename) +# print("CREATED FILE: ", filename) self.comm.barrier() return filename @@ -1155,12 +1159,12 @@ def genLeftRight(self, endClass): return [*range(endClass+1)], [*range(1, endClass+2)] def genUMAP(self): - for dirval in os.listdir(self.inputFile[:-26]): - print("ITEM IN DIRECTORY:", dirval) +# for dirval in os.listdir(self.inputFile[:-26]): +# print("ITEM IN DIRECTORY:", dirval) imgs = None projections = None for currRank in range(self.nprocs): - print("GETTING CURRENT RANK: ", currRank) +# print("GETTING CURRENT RANK: ", currRank) with h5py.File(self.inputFile+"_"+str(currRank)+".h5", 'r') as hf: if imgs is None: imgs = hf["SmallImages"][:] @@ -1168,14 +1172,13 @@ def genUMAP(self): else: imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) - tables.file._open_files.close_all() intensities = [] for img in imgs: intensities.append(np.sum(img.flatten())) intensities = np.array(intensities) - skipMe = 4 + skipMe = 8 self.imgs = imgs[:self.numImgsToUse:skipMe] self.projections = projections[:self.numImgsToUse:skipMe] self.intensities = intensities[:self.numImgsToUse:skipMe] @@ -1199,12 +1202,15 @@ def genUMAP(self): self.opticsClust = OPTICS(min_samples=150, xi=0.05, min_cluster_size=0.05) self.opticsClust.fit(self.clusterable_embedding) - self.opticsLabels = cluster_optics_dbscan( - reachability=self.opticsClust.reachability_, - core_distances=self.opticsClust.core_distances_, - ordering=self.opticsClust.ordering_, - eps=2, - ) +# self.opticsLabels = cluster_optics_dbscan( +# reachability=self.opticsClust.reachability_, +# core_distances=self.opticsClust.core_distances_, +# ordering=self.opticsClust.ordering_, +# eps=2, +# ) + +# self.opticsLabels = self.opticsClust.labels_[self.opticsClust.ordering_] + self.opticsLabels = self.opticsClust.labels_ self.experData_df = pd.DataFrame({'x':self.clusterable_embedding[self.clustered, 0],'y':self.clusterable_embedding[self.clustered, 1]}) self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) @@ -1419,8 +1425,10 @@ def genHTML(self): width = 2000, height = 400 ) - space = np.arange(self.numImgsToUse) - reachability = self.opticsClust.reachability_[self.opticsClust.ordering_] +# space = 
np.arange(self.numImgsToUse) + space = np.arange(self.numImgsToUse)[self.opticsClust.ordering_] +# reachability = self.opticsClust.reachability_[self.opticsClust.ordering_] + reachability = self.opticsClust.reachability_ opticsData_df = pd.DataFrame({'x':space,'y':reachability}) opticsData_df['cluster'] = [str(x) for x in self.opticsNewLabels] @@ -1473,15 +1481,15 @@ def genHTML(self): self.viewResults = column(plot_figure, p, imgsPlot, row(cols, toggl, radio_button_group), reachabilityDiag) def fullVisualize(self): - print("here 4") +# print("here 4") self.genUMAP() - print("here 5") +# print("here 5") self.genABOD() - print("here 6") +# print("here 6") self.genLabels() - print("here 7") +# print("here 7") self.genHTML() - print("here 8") +# print("here 8") def updateLabels(self): self.genLabels() @@ -1602,6 +1610,7 @@ def get_formatted_images(self, startInd, n, includeThumbnails=False): """ self.psi.counter = startInd self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) + print(self.imgsTracked) imgs = self.psi.get_images(n, assemble=False) @@ -1651,18 +1660,20 @@ def runMe(self): #DATA RETRIEVAL STEP ########################################################################################## - self.fullImgData = [] - self.fullThumbnailData = [] - noImgsToProcess = self.num_imgs//self.size - startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - batchSize = int(self.num_components*2//self.samplingFactor) - for batch in range(0, noImgsToProcess, batchSize): - startInd = startingPoint+batch - binned_imgs, thumbnails = self.get_formatted_images(startInd, batchSize, includeThumbnails=True) +# self.fullImgData = [] +# self.fullThumbnailData = [] +# noImgsToProcess = self.num_imgs//self.size +# batchSize = int(self.num_components*2//self.samplingFactor) +# for batch in range(0, noImgsToProcess, batchSize): +# startInd = startingPoint+batch +# binned_imgs, thumbnails = self.get_formatted_images(startInd, batchSize, includeThumbnails=True) # print("aodijwaodijaodij", binned_imgs.shape, thumbnails.shape) - self.fullImgData.append(binned_imgs) - self.fullThumbnailData.append(thumbnails) - print(self.imgsTracked) +# self.fullImgData.append(binned_imgs) +# self.fullThumbnailData.append(thumbnails) +# print(self.imgsTracked) + + startingPoint = self.start_offset + self.num_imgs*self.rank//self.size + self.fullImgData, self.fullThumbnailData = self.get_formatted_images(startingPoint, self.num_imgs//self.size, includeThumbnails=True) filenameTest0 = random.randint(0, 10) filenameTest0 = self.comm.allgather(filenameTest0) @@ -1732,7 +1743,7 @@ def runMe(self): if self.rank==0: - print("here 1") +# print("here 1") st = time.perf_counter() visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), outputFile="./UMAPVis_{}.html".format(self.currRun), @@ -1740,9 +1751,9 @@ def runMe(self): nprocs=freqDir.size, userGroupings=[], includeABOD=True) - print("here 2") +# print("here 2") visMe.fullVisualize() - print("here 3") +# print("here 3") visMe.userSave() et = time.perf_counter() print("UMAP HTML Generation Processing time: {}".format(et - st)) From 697723eb3414f567ea1fb231f9993bbcdc6cec3a Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 3 Sep 2023 12:07:01 -0700 Subject: [PATCH 26/57] Everything working and produces 120 hz processing speed. 
Added modularity to image processing (again), fixed optics labeling bug, added visualization parameters, background color of images is now dbscan cluster color --- btx/processing/freqdir.py | 237 ++++++++++++++++++++++++-------------- 1 file changed, 149 insertions(+), 88 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index f3c4eb0c4..6605df459 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -288,31 +288,6 @@ def elu(self,x): # else: # return np.array(nimg_batch).T - ########################################################################### - - #How to use these functions: call each of them on the image. Append the result if it is not "None" to nimg_batch. - def intensityFunc_threshold(img): - if img is None: - return img - else: - secondQuartile = np.sort(img)[-1]//4 - return (img>secondQuartile)*img - - def intensityFunc_removeZeroIntensity(img, currIntensity): - if currIntensity<50000: - return None - else: - return img - - def intensityFunc_normalizeIntensity(img, currIntensity): - if img is None: - return img - - if currIntensity<50000: - return img - else: - return img/currIntensity - ########################################################################### def fetch_and_update_model(self, n, currInd): """ @@ -646,7 +621,7 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output self.divBy = divBy - time.sleep(5) + time.sleep(30) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] @@ -821,7 +796,7 @@ def __init__( # print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) # while(not os.path.isfile(readFile2)): # print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) - time.sleep(5) + time.sleep(30) with h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] # self.mean = hf["mean"][:] @@ -1031,7 +1006,9 @@ class visualizeFD: """ Visualize FD Dimension Reduction using UMAP and DBSCAN """ - def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, userGroupings): + def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, userGroupings, + skipSize, umap_n_neighbors, umap_random_state, hdbscan_min_samples, hdbscan_min_cluster_size, + optics_min_samples, optics_xi, optics_min_cluster_size): self.inputFile = inputFile self.outputFile = outputFile output_file(filename=outputFile, title="Static HTML file") @@ -1040,6 +1017,14 @@ def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, use self.nprocs = nprocs self.includeABOD = includeABOD self.userGroupings = userGroupings + self.skipSize = skipSize + self.umap_n_neighbors = umap_n_neighbors + self.umap_random_state = umap_random_state + self.hdbscan_min_samples=hdbscan_min_samples + self.hdbscan_min_cluster_size=hdbscan_min_cluster_size + self.optics_min_samples=optics_min_samples + self.optics_xi = optics_xi + self.optics_min_cluster_size = optics_min_cluster_size def embeddable_image(self, data): img_data = np.uint8(cm.jet(data/max(data.flatten()))*255) @@ -1178,29 +1163,29 @@ def genUMAP(self): intensities.append(np.sum(img.flatten())) intensities = np.array(intensities) - skipMe = 8 - self.imgs = imgs[:self.numImgsToUse:skipMe] - self.projections = projections[:self.numImgsToUse:skipMe] - self.intensities = intensities[:self.numImgsToUse:skipMe] - self.numImgsToUse = int(self.numImgsToUse/skipMe) + self.imgs = imgs[:self.numImgsToUse:self.skipSize] + self.projections = 
projections[:self.numImgsToUse:self.skipSize] + self.intensities = intensities[:self.numImgsToUse:self.skipSize] + + self.numImgsToUse = int(self.numImgsToUse/self.skipSize) if len(self.imgs)!= self.numImgsToUse: raise TypeError("NUMBER OF IMAGES REQUESTED ({}) EXCEEDS NUMBER OF DATA POINTS PROVIDED ({})".format(len(self.imgs), self.numImgsToUse)) self.clusterable_embedding = umap.UMAP( - n_neighbors=self.numImgsToUse//40, + n_neighbors=self.umap_n_neighbors, + random_state=self.umap_random_state, n_components=2, - random_state=42 ).fit_transform(self.projections) self.labels = hdbscan.HDBSCAN( - min_samples=int(self.numImgsToUse*0.75//40), - min_cluster_size=int(self.numImgsToUse//40), + min_samples = self.hdbscan_min_samples, + min_cluster_size = self.hdbscan_min_cluster_size ).fit_predict(self.clusterable_embedding) exclusionList = np.array([]) self.clustered = np.isin(self.labels, exclusionList, invert=True) - self.opticsClust = OPTICS(min_samples=150, xi=0.05, min_cluster_size=0.05) + self.opticsClust = OPTICS(min_samples=self.optics_min_samples, xi=self.optics_xi, min_cluster_size=self.optics_min_cluster_size) self.opticsClust.fit(self.clusterable_embedding) # self.opticsLabels = cluster_optics_dbscan( # reachability=self.opticsClust.reachability_, @@ -1214,7 +1199,7 @@ def genUMAP(self): self.experData_df = pd.DataFrame({'x':self.clusterable_embedding[self.clustered, 0],'y':self.clusterable_embedding[self.clustered, 1]}) self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) - self.experData_df['imgind'] = np.arange(self.numImgsToUse) + self.experData_df['imgind'] = np.arange(self.numImgsToUse)*self.skipSize def genABOD(self): if self.includeABOD: @@ -1253,6 +1238,7 @@ def genLabels(self): self.newLabels = np.array(self.relabel_to_closest_zero(newLabels)) self.experData_df['cluster'] = [str(x) for x in self.newLabels[self.clustered]] self.experData_df['ptColor'] = [x for x in self.experData_df['cluster']] + self.experData_df['backgroundColor'] = [Category20[20][x] for x in self.newLabels] medoid_lst = self.genMedoids(self.newLabels, self.clusterable_embedding) self.medoidInds = [x[1] for x in medoid_lst] medoidBold = [] @@ -1284,14 +1270,17 @@ def genHTML(self): width = 2000, height = 600 ) plot_figure.add_tools(HoverTool(tooltips=""" -
-            Cluster #
-            @cluster
-            @imgind
+            Cluster #
+            @cluster
+            Image #
+            @imgind
""")) @@ -1365,13 +1354,14 @@ def genHTML(self): const ptColor = datasource.data.ptColor const anomDet = datasource.data.anomDet const imgind = datasource.data.imgind - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind} + const backgroundColor = datasource.data.backgroundColor + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor} """) cols.js_on_change('value', callback) imgsPlot = figure(width=2000, height=150, toolbar_location=None) - imgsPlot.image(image=[self.imgs[imgind][::-1] for imgind in self.medoidInds], + imgsPlot.image(image=[self.imgs[imgindMe][::-1] for imgindMe in self.medoidInds], x=[0.25+xind for xind in range(len(self.medoidInds))], y=0, dw=0.5, dh=1, @@ -1396,10 +1386,10 @@ def genHTML(self): clearInterval(looop); } else if(slider_val1 >= index[index.length - 1]) { - cb_obj.label = '► Play'; +// cb_obj.label = '► Play'; slider.value = [0, slider_val1-slider_val0]; - cb_obj.active = false; - clearInterval(looop); +// cb_obj.active = false; +// clearInterval(looop); } else if(slider_val1 !== index[index.length - 1]){ slider.value = [index.filter((item) => item > slider_val0)[0], index.filter((item) => item > slider_val1)[0]]; @@ -1425,13 +1415,14 @@ def genHTML(self): width = 2000, height = 400 ) -# space = np.arange(self.numImgsToUse) - space = np.arange(self.numImgsToUse)[self.opticsClust.ordering_] + space = np.arange(self.numImgsToUse) +# space = np.arange(self.numImgsToUse)[self.opticsClust.ordering_] # reachability = self.opticsClust.reachability_[self.opticsClust.ordering_] reachability = self.opticsClust.reachability_ opticsData_df = pd.DataFrame({'x':space,'y':reachability}) - opticsData_df['cluster'] = [str(x) for x in self.opticsNewLabels] + opticsData_df['clusterForScatterPlot'] = [str(x) for x in self.opticsNewLabels] + opticsData_df['cluster'] = [str(x) for x in self.opticsNewLabels[self.opticsClust.ordering_]] opticsData_df['ptColor'] = [x for x in opticsData_df['cluster']] color_mapping2 = CategoricalColorMapper(factors=[str(x) for x in list(set(self.opticsNewLabels))], palette=Category20[20]) @@ -1460,8 +1451,9 @@ def genHTML(self): const cluster = datasource.data.cluster const anomDet = datasource.data.anomDet const imgind = datasource.data.imgind + const backgroundColor = datasource.data.backgroundColor - const opticsClust = opticssource.data.cluster + const opticsClust = opticssource.data.clusterForScatterPlot let ptColor = null @@ -1474,7 +1466,7 @@ def genHTML(self): else{ ptColor = anomDet } - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind} + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor} """) radio_button_group.js_on_change("active", radioGroup_js) @@ -1533,7 +1525,7 @@ class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. 
""" - def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_components, alpha, rankAdapt, downsample, bin_factor, threshold, normalizeIntensity, noZeroIntensity, samplingFactor, priming, divBy, batchSize): + def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_components, alpha, rankAdapt, downsample, bin_factor, threshold, normalizeIntensity, noZeroIntensity, samplingFactor, priming, divBy, batchSize, thresholdQuantile): self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1552,6 +1544,7 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.priming=priming self.divBy = divBy self.batchSize = batchSize + self.thresholdQuantile = thresholdQuantile self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() @@ -1567,6 +1560,8 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.currRun = None self.currRun = self.comm.bcast(self.currRun, root=0) + self.imageProcessor = FD_ImageProcessing(minIntensity=(self.bin_factor**2)*50000, thresholdQuantile=self.thresholdQuantile, eluAlpha=0.01) + def assembleImgsToSave(self, imgs): """ Form the images from psana pixel index map and downsample images. @@ -1617,31 +1612,47 @@ def get_formatted_images(self, startInd, n, includeThumbnails=False): imgs = imgs[ [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] ] - num_valid_imgs, p, x, y = imgs.shape + if len(imgs.shape)==4: + num_valid_imgs, p, x, y = imgs.shape + else: + p = 1 + num_valid_imgs, x, y = imgs.shape img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T img_batch[img_batch<0] = 0 nimg_batch = [] for img in img_batch.T: - if self.threshold: - secondQuartile = np.quantile(img, 0.93) - nimg = (img>secondQuartile)*img -# elu_v = np.vectorize(self.elu) -# nimg = elu_v(img-secondQuartile)+secondQuartile - else: - nimg = img - + nimg = img currIntensity = np.sum(nimg.flatten(), dtype=np.double) -# print("RANK: {} ***** INTENSITY: {}".format(self.rank, currIntensity)) - if self.noZeroIntensity and currIntensity< (self.bin_factor**2) * 50000: - continue - else: - if currIntensity>=(self.bin_factor**2) * 50000 and self.normalizeIntensity: -# if not self.normalizeIntensity: - nimg_batch.append(nimg/currIntensity) - else: -# nimg_batch.append(nimg) - nimg_batch.append(np.zeros(nimg.shape)) + if self.threshold: + nimg = self.imageProcessor.threshold(nimg) + if self.noZeroIntensity: + nimg = self.imageProcessor.removeZeroIntensity(nimg, currIntensity) + if self.normalizeIntensity: + nimg = self.imageProcessor.normalizeIntensity(nimg, currIntensity) + if nimg is not None: + nimg_batch.append(nimg) nimg_batch = np.array(nimg_batch) +# self.imageProcessor.normalizeIntensity(self.imageProcessor(removeZeroIntensity(self.imageProcessor.threshold(img)) +# if self.threshold: +# secondQuartile = np.quantile(img, self.thresholdQuantile) +# nimg = (img>secondQuartile)*img +## elu_v = np.vectorize(self.elu) +## nimg = elu_v(img-secondQuartile)+secondQuartile +# else: +# nimg = img +# +# currIntensity = np.sum(nimg.flatten(), dtype=np.double) +## print("RANK: {} ***** INTENSITY: {}".format(self.rank, currIntensity)) +# if self.noZeroIntensity and currIntensity< (self.bin_factor**2) * 50000: +# continue +# else: +# if currIntensity>=(self.bin_factor**2) * 50000 and self.normalizeIntensity: +## if not self.normalizeIntensity: +# nimg_batch.append(nimg/currIntensity) +# else: +## nimg_batch.append(nimg) +# nimg_batch.append(np.zeros(nimg.shape)) +# 
nimg_batch = np.array(nimg_batch) if self.downsample: binned_imgs = bin_data(np.reshape(nimg_batch,(num_valid_imgs, p, x, y)), self.bin_factor) binned_num_valid_imgs, binned_p, binned_x, binned_y = binned_imgs.shape @@ -1675,9 +1686,9 @@ def runMe(self): startingPoint = self.start_offset + self.num_imgs*self.rank//self.size self.fullImgData, self.fullThumbnailData = self.get_formatted_images(startingPoint, self.num_imgs//self.size, includeThumbnails=True) - filenameTest0 = random.randint(0, 10) - filenameTest0 = self.comm.allgather(filenameTest0) - print("TEST 0: ", self.rank, filenameTest0) +# filenameTest0 = random.randint(0, 10) +# filenameTest0 = self.comm.allgather(filenameTest0) +# print("TEST 0: ", self.rank, filenameTest0) #SKETCHING STEP ########################################################################################## @@ -1693,9 +1704,9 @@ def runMe(self): et = time.perf_counter() print("Estimated time for frequent directions rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) - filenameTest1 = random.randint(0, 10) - filenameTest1 = self.comm.allgather(filenameTest1) - print("TEST 1: ", self.rank, filenameTest1) +# filenameTest1 = random.randint(0, 10) +# filenameTest1 = self.comm.allgather(filenameTest1) +# print("TEST 1: ", self.rank, filenameTest1) #MERGING STEP ########################################################################################## @@ -1716,9 +1727,9 @@ def runMe(self): et = time.perf_counter() print("Estimated time merge tree for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) - filenameTest2 = random.randint(0, 10) - filenameTest2 = self.comm.allgather(filenameTest2) - print("TEST 2: ", self.rank, filenameTest2) +# filenameTest2 = random.randint(0, 10) +# filenameTest2 = self.comm.allgather(filenameTest2) +# print("TEST 2: ", self.rank, filenameTest2) #PROJECTION STEP ########################################################################################## @@ -1735,9 +1746,9 @@ def runMe(self): self.comm.barrier() self.comm.Barrier() - filenameTest3 = random.randint(0, 10) - filenameTest3 = self.comm.allgather(filenameTest3) - print("TEST 3: ", self.rank, filenameTest3) +# filenameTest3 = random.randint(0, 10) +# filenameTest3 = self.comm.allgather(filenameTest3) +# print("TEST 3: ", self.rank, filenameTest3) ########################################################################################## @@ -1745,12 +1756,21 @@ def runMe(self): if self.rank==0: # print("here 1") st = time.perf_counter() + + skipSize = 8 + numImgsToUse = int(self.num_imgs/skipSize) visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), outputFile="./UMAPVis_{}.html".format(self.currRun), numImgsToUse=self.num_imgs, - nprocs=freqDir.size, + nprocs=self.size, userGroupings=[], - includeABOD=True) + includeABOD=True, + skipSize = skipSize, + umap_n_neighbors=numImgsToUse//40, + umap_random_state=42, + hdbscan_min_samples=int(numImgsToUse*0.75//40), + hdbscan_min_cluster_size=int(numImgsToUse//40), + optics_min_samples=150, optics_xi = 0.05, optics_min_cluster_size = 0.05) # print("here 2") visMe.fullVisualize() # print("here 3") @@ -1759,4 +1779,45 @@ def runMe(self): print("UMAP HTML Generation Processing time: {}".format(et - st)) print("TOTAL PROCESING TIME: {}".format(et - stfull)) +class FD_ImageProcessing: + #How to use these functions: call each of them on the image. Append the result if it is not "None" to nimg_batch. 
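    # A minimal usage sketch for the chain described above (editorial example,
    # not part of the patch; `proc` and `img_batch` are hypothetical names and
    # the constructor arguments are placeholder values).  Each helper either
    # returns a processed image or None, so the batch is rebuilt by appending
    # only the non-None results:
    #
    #     proc = FD_ImageProcessing(minIntensity=50000, thresholdQuantile=0.93, eluAlpha=0.01)
    #     nimg_batch = []
    #     for img in img_batch.T:
    #         currIntensity = np.sum(img.flatten(), dtype=np.double)
    #         nimg = proc.threshold(img)
    #         nimg = proc.removeZeroIntensity(nimg, currIntensity)
    #         nimg = proc.normalizeIntensity(nimg, currIntensity)
    #         if nimg is not None:
    #             nimg_batch.append(nimg)
    #     nimg_batch = np.array(nimg_batch)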
+ def __init__(self, minIntensity, thresholdQuantile, eluAlpha): + self.minIntensity = minIntensity + self.thresholdQuantile = thresholdQuantile + self.eluAlpha = eluAlpha + + def elu(self,x): + if x > 0: + return x + else: + return self.eluAlpha*(math.exp(x)-1) + + def eluThreshold(self, img): + if img is None: + return img + else: + elu_v = np.vectorize(self.elu) + secondQuartile = np.quantile(img, self.thresholdQuantile) + return(elu_v(img-secondQuartile)+secondQuartile) + + + def threshold(self, img): + if img is None: + return img + else: + secondQuartile = np.quantile(img, self.thresholdQuantile) + return (img>secondQuartile)*img + def removeZeroIntensity(self, img, currIntensity): + if currIntensity Date: Sun, 3 Sep 2023 16:21:51 -0700 Subject: [PATCH 27/57] Fixed UMAP html --- btx/processing/freqdir.py | 93 +++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 42 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 6605df459..7ebb67140 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1215,6 +1215,7 @@ def genABOD(self): else: outlierLabels.append(str(0)) self.experData_df['anomDet'] = outlierLabels + self.experData_df['anom_backgroundColor'] = [Category20[20][int(x)] for x in outlierLabels] def setUserGroupings(self, userGroupings): """ @@ -1238,6 +1239,7 @@ def genLabels(self): self.newLabels = np.array(self.relabel_to_closest_zero(newLabels)) self.experData_df['cluster'] = [str(x) for x in self.newLabels[self.clustered]] self.experData_df['ptColor'] = [x for x in self.experData_df['cluster']] + self.experData_df['dbscan_backgroundColor'] = [Category20[20][x] for x in self.newLabels] self.experData_df['backgroundColor'] = [Category20[20][x] for x in self.newLabels] medoid_lst = self.genMedoids(self.newLabels, self.clusterable_embedding) self.medoidInds = [x[1] for x in medoid_lst] @@ -1260,6 +1262,7 @@ def genLabels(self): opticsNewLabels.append(j) opticsNewLabels = list(np.array(opticsNewLabels) + 1) self.opticsNewLabels = np.array(self.relabel_to_closest_zero(opticsNewLabels)) + self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels[self.opticsClust.ordering_]] def genHTML(self): datasource = ColumnDataSource(self.experData_df) @@ -1270,17 +1273,19 @@ def genHTML(self): width = 2000, height = 600 ) plot_figure.add_tools(HoverTool(tooltips=""" -
-
- +
+
+
-
- Cluster # - @cluster -
-
- Image # - @imgind +
+
+ Cluster + @cluster +
+
+ Image + @imgind +
""")) @@ -1355,7 +1360,10 @@ def genHTML(self): const anomDet = datasource.data.anomDet const imgind = datasource.data.imgind const backgroundColor = datasource.data.backgroundColor - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor} + const dbscan_backgroundColor = datasource.data.dbscan_backgroundColor + const anom_backgroundColor = datasource.data.anom_backgroundColor + const optics_backgroundColor = datasource.data.optics_backgroundColor + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor, dbscan_backgroundColor, anom_backgroundColor, optics_backgroundColor} """) cols.js_on_change('value', callback) @@ -1414,12 +1422,8 @@ def genHTML(self): tools=('pan, wheel_zoom, reset'), width = 2000, height = 400 ) - space = np.arange(self.numImgsToUse) -# space = np.arange(self.numImgsToUse)[self.opticsClust.ordering_] -# reachability = self.opticsClust.reachability_[self.opticsClust.ordering_] reachability = self.opticsClust.reachability_ - opticsData_df = pd.DataFrame({'x':space,'y':reachability}) opticsData_df['clusterForScatterPlot'] = [str(x) for x in self.opticsNewLabels] opticsData_df['cluster'] = [str(x) for x in self.opticsNewLabels[self.opticsClust.ordering_]] @@ -1427,7 +1431,6 @@ def genHTML(self): color_mapping2 = CategoricalColorMapper(factors=[str(x) for x in list(set(self.opticsNewLabels))], palette=Category20[20]) opticssource = ColumnDataSource(opticsData_df) - reachabilityDiag.circle( 'x', 'y', @@ -1451,22 +1454,28 @@ def genHTML(self): const cluster = datasource.data.cluster const anomDet = datasource.data.anomDet const imgind = datasource.data.imgind - const backgroundColor = datasource.data.backgroundColor + const dbscan_backgroundColor = datasource.data.dbscan_backgroundColor + const anom_backgroundColor = datasource.data.anom_backgroundColor + const optics_backgroundColor = datasource.data.optics_backgroundColor const opticsClust = opticssource.data.clusterForScatterPlot let ptColor = null + let backgroundColor = null if (cb_obj.active==0){ ptColor = cluster + backgroundColor = dbscan_backgroundColor } else if (cb_obj.active==1){ ptColor = opticsClust + backgroundColor = optics_backgroundColor } else{ ptColor = anomDet + backgroundColor = anom_backgroundColor } - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor} + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor, dbscan_backgroundColor, anom_backgroundColor, optics_backgroundColor} """) radio_button_group.js_on_change("active", radioGroup_js) @@ -1753,31 +1762,31 @@ def runMe(self): ########################################################################################## - if self.rank==0: +# if self.rank==0: # print("here 1") - st = time.perf_counter() - - skipSize = 8 - numImgsToUse = int(self.num_imgs/skipSize) - visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), - outputFile="./UMAPVis_{}.html".format(self.currRun), - numImgsToUse=self.num_imgs, - nprocs=self.size, - userGroupings=[], - includeABOD=True, - skipSize = skipSize, - umap_n_neighbors=numImgsToUse//40, - umap_random_state=42, - hdbscan_min_samples=int(numImgsToUse*0.75//40), - hdbscan_min_cluster_size=int(numImgsToUse//40), - optics_min_samples=150, optics_xi = 0.05, optics_min_cluster_size = 0.05) -# print("here 2") - visMe.fullVisualize() -# print("here 3") - visMe.userSave() - et = 
time.perf_counter() - print("UMAP HTML Generation Processing time: {}".format(et - st)) - print("TOTAL PROCESING TIME: {}".format(et - stfull)) +# st = time.perf_counter() +# +# skipSize = 8 +# numImgsToUse = int(self.num_imgs/skipSize) +# visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), +# outputFile="./UMAPVis_{}.html".format(self.currRun), +# numImgsToUse=self.num_imgs, +# nprocs=self.size, +# userGroupings=[], +# includeABOD=True, +# skipSize = skipSize, +# umap_n_neighbors=numImgsToUse//40, +# umap_random_state=42, +# hdbscan_min_samples=int(numImgsToUse*0.75//40), +# hdbscan_min_cluster_size=int(numImgsToUse//40), +# optics_min_samples=150, optics_xi = 0.05, optics_min_cluster_size = 0.05) +## print("here 2") +# visMe.fullVisualize() +## print("here 3") +# visMe.userSave() +# et = time.perf_counter() +# print("UMAP HTML Generation Processing time: {}".format(et - st)) +# print("TOTAL PROCESING TIME: {}".format(et - stfull)) class FD_ImageProcessing: #How to use these functions: call each of them on the image. Append the result if it is not "None" to nimg_batch. From 4692ae352b55d8abbea9ad0467da5428937bc5ee Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 3 Sep 2023 21:07:26 -0700 Subject: [PATCH 28/57] Fixed img range processed tracking and counting bug. Also moved data loading into its own class. --- btx/processing/freqdir.py | 318 +++++++++++++++++++++++++------------- 1 file changed, 210 insertions(+), 108 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 7ebb67140..a9e985e89 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -129,6 +129,7 @@ def __init__( output_dir, currRun, imgData, + imgsTracked, alpha=0, rankAdapt=False, merger=False, @@ -152,8 +153,6 @@ def __init__( self.rank= rank self.size = size - self.psi.counter = start_offset + self.num_images*self.rank//self.size - self.currRun = currRun self.output_dir = output_dir @@ -172,7 +171,7 @@ def __init__( self.nextZeroRow = 0 self.alpha = alpha # self.mean = None - self.imgsTracked = [] + self.imgsTracked = imgsTracked self.rankAdapt = rankAdapt self.increaseEll = False @@ -625,7 +624,7 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] - self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, priming=False, imgData = None) + self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, priming=False, imgData = None, imgsTracked=None) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) @@ -684,8 +683,9 @@ def merge(self): fullLen = len(self.allWriteDirecs) for readMe in self.allWriteDirecs: with h5py.File(readMe, 'r') as hf: - if self.fullMean is None: +# if self.fullMean is None: # self.fullMean = hf["mean"][:] + if self.fullNumIncorp==0: self.fullNumIncorp = hf["sketch"].attrs["numImgsIncorp"] self.fullImgsTracked = hf["imgsTracked"][:] else: @@ -714,6 +714,7 @@ def 
write(self): hf.create_dataset("imgsTracked", data=self.fullImgsTracked) # print("CREATED FILE: ", filename2) self.comm.send(filename2, dest=ind, tag=ind) + print("aodiwjaomwdklmduhi22adjdqoi2jd", self.fullImgsTracked) else: print("RECEIVED FILE NAME: ", self.comm.recv(source=0, tag=self.rank)) self.comm.barrier() @@ -1273,16 +1274,16 @@ def genHTML(self): width = 2000, height = 600 ) plot_figure.add_tools(HoverTool(tooltips=""" -
             Cluster
             @cluster
             Image
             @imgind
@@ -1571,108 +1572,110 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.imageProcessor = FD_ImageProcessing(minIntensity=(self.bin_factor**2)*50000, thresholdQuantile=self.thresholdQuantile, eluAlpha=0.01) - def assembleImgsToSave(self, imgs): - """ - Form the images from psana pixel index map and downsample images. - - Parameters - ---------- - imgs: ndarray - images to downsample - """ - pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) - - saveMe = [] - for img in imgs: - imgRe = np.reshape(img, self.psi.det.shape()) - imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) - saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) - return np.array(saveMe) -# imgsRe = np.reshape(imgs.T, (imgs.shape[1], -# self.imgGrabber.psi.det.shape()[0], -# self.imgGrabber.psi.det.shape()[1], -# self.imgGrabber.psi.det.shape()[2])) -# return assemble_image_stack_batch(imgsRe, pixel_index_map) - - def get_formatted_images(self, startInd, n, includeThumbnails=False): - """ - Fetch n - x image segments from run, where x is the number of 'dead' images. - - Parameters - ---------- - n : int - number of images to retrieve - start_index : int - start index of subsection of data to retrieve - end_index : int - end index of subsection of data to retrieve - - Returns - ------- - ndarray, shape (end_index-start_index, n-x) - n-x retrieved image segments of dimension end_index-start_index - """ - self.psi.counter = startInd - self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) - print(self.imgsTracked) + self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, start_offset=start_offset, num_imgs=num_imgs, threshold=threshold, noZeroIntensity=noZeroIntensity, normalizeIntensity=normalizeIntensity, downsample=downsample, bin_factor=bin_factor, thresholdQuantile=thresholdQuantile) - imgs = self.psi.get_images(n, assemble=False) - - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] - if len(imgs.shape)==4: - num_valid_imgs, p, x, y = imgs.shape - else: - p = 1 - num_valid_imgs, x, y = imgs.shape - img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - img_batch[img_batch<0] = 0 - nimg_batch = [] - for img in img_batch.T: - nimg = img - currIntensity = np.sum(nimg.flatten(), dtype=np.double) - if self.threshold: - nimg = self.imageProcessor.threshold(nimg) - if self.noZeroIntensity: - nimg = self.imageProcessor.removeZeroIntensity(nimg, currIntensity) - if self.normalizeIntensity: - nimg = self.imageProcessor.normalizeIntensity(nimg, currIntensity) - if nimg is not None: - nimg_batch.append(nimg) - nimg_batch = np.array(nimg_batch) -# self.imageProcessor.normalizeIntensity(self.imageProcessor(removeZeroIntensity(self.imageProcessor.threshold(img)) -# if self.threshold: -# secondQuartile = np.quantile(img, self.thresholdQuantile) -# nimg = (img>secondQuartile)*img -## elu_v = np.vectorize(self.elu) -## nimg = elu_v(img-secondQuartile)+secondQuartile -# else: -# nimg = img +# def assembleImgsToSave(self, imgs): +# """ +# Form the images from psana pixel index map and downsample images. 
+# +# Parameters +# ---------- +# imgs: ndarray +# images to downsample +# """ +# pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) # +# saveMe = [] +# for img in imgs: +# imgRe = np.reshape(img, self.psi.det.shape()) +# imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) +# saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) +# return np.array(saveMe) +## imgsRe = np.reshape(imgs.T, (imgs.shape[1], +## self.imgGrabber.psi.det.shape()[0], +## self.imgGrabber.psi.det.shape()[1], +## self.imgGrabber.psi.det.shape()[2])) +## return assemble_image_stack_batch(imgsRe, pixel_index_map) +# +# def get_formatted_images(self, startInd, n, includeThumbnails=False): +# """ +# Fetch n - x image segments from run, where x is the number of 'dead' images. +# +# Parameters +# ---------- +# n : int +# number of images to retrieve +# start_index : int +# start index of subsection of data to retrieve +# end_index : int +# end index of subsection of data to retrieve +# +# Returns +# ------- +# ndarray, shape (end_index-start_index, n-x) +# n-x retrieved image segments of dimension end_index-start_index +# """ +# self.psi.counter = startInd +# self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) +# print(self.imgsTracked) +# +# imgs = self.psi.get_images(n, assemble=False) +# +# imgs = imgs[ +# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] +# ] +# if len(imgs.shape)==4: +# num_valid_imgs, p, x, y = imgs.shape +# else: +# p = 1 +# num_valid_imgs, x, y = imgs.shape +# img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T +# img_batch[img_batch<0] = 0 +# nimg_batch = [] +# for img in img_batch.T: +# nimg = img # currIntensity = np.sum(nimg.flatten(), dtype=np.double) -## print("RANK: {} ***** INTENSITY: {}".format(self.rank, currIntensity)) -# if self.noZeroIntensity and currIntensity< (self.bin_factor**2) * 50000: -# continue -# else: -# if currIntensity>=(self.bin_factor**2) * 50000 and self.normalizeIntensity: -## if not self.normalizeIntensity: -# nimg_batch.append(nimg/currIntensity) -# else: -## nimg_batch.append(nimg) -# nimg_batch.append(np.zeros(nimg.shape)) +# if self.threshold: +# nimg = self.imageProcessor.threshold(nimg) +# if self.noZeroIntensity: +# nimg = self.imageProcessor.removeZeroIntensity(nimg, currIntensity) +# if self.normalizeIntensity: +# nimg = self.imageProcessor.normalizeIntensity(nimg, currIntensity) +# if nimg is not None: +# nimg_batch.append(nimg) # nimg_batch = np.array(nimg_batch) - if self.downsample: - binned_imgs = bin_data(np.reshape(nimg_batch,(num_valid_imgs, p, x, y)), self.bin_factor) - binned_num_valid_imgs, binned_p, binned_x, binned_y = binned_imgs.shape - binned_imgs = np.reshape(binned_imgs, (binned_num_valid_imgs, binned_p * binned_x * binned_y)).T -# print(binned_imgs.shape) - else: - binned_imgs = nimg_batch.T - if includeThumbnails: - return (binned_imgs, self.assembleImgsToSave(np.reshape(nimg_batch, (num_valid_imgs, p, x, y)))) - else: - return binned_imgs +## self.imageProcessor.normalizeIntensity(self.imageProcessor(removeZeroIntensity(self.imageProcessor.threshold(img)) +## if self.threshold: +## secondQuartile = np.quantile(img, self.thresholdQuantile) +## nimg = (img>secondQuartile)*img +### elu_v = np.vectorize(self.elu) +### nimg = elu_v(img-secondQuartile)+secondQuartile +## else: +## nimg = img +## +## currIntensity = np.sum(nimg.flatten(), dtype=np.double) +### print("RANK: {} ***** INTENSITY: {}".format(self.rank, currIntensity)) +## if 
self.noZeroIntensity and currIntensity< (self.bin_factor**2) * 50000: +## continue +## else: +## if currIntensity>=(self.bin_factor**2) * 50000 and self.normalizeIntensity: +### if not self.normalizeIntensity: +## nimg_batch.append(nimg/currIntensity) +## else: +### nimg_batch.append(nimg) +## nimg_batch.append(np.zeros(nimg.shape)) +## nimg_batch = np.array(nimg_batch) +# if self.downsample: +# binned_imgs = bin_data(np.reshape(nimg_batch,(num_valid_imgs, p, x, y)), self.bin_factor) +# binned_num_valid_imgs, binned_p, binned_x, binned_y = binned_imgs.shape +# binned_imgs = np.reshape(binned_imgs, (binned_num_valid_imgs, binned_p * binned_x * binned_y)).T +## print(binned_imgs.shape) +# else: +# binned_imgs = nimg_batch.T +# if includeThumbnails: +# return (binned_imgs, self.assembleImgsToSave(np.reshape(nimg_batch, (num_valid_imgs, p, x, y)))) +# else: +# return binned_imgs @profile(filename="fullFD_profile") def runMe(self): @@ -1693,7 +1696,7 @@ def runMe(self): # print(self.imgsTracked) startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - self.fullImgData, self.fullThumbnailData = self.get_formatted_images(startingPoint, self.num_imgs//self.size, includeThumbnails=True) + self.fullImgData, self.fullThumbnailData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, includeThumbnails=True) # filenameTest0 = random.randint(0, 10) # filenameTest0 = self.comm.allgather(filenameTest0) @@ -1705,7 +1708,7 @@ def runMe(self): det_type=self.det_type, output_dir=self.writeToHere, num_components=self.num_components, alpha=self.alpha, rankAdapt=self.rankAdapt, merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, threshold=self.threshold, normalizeIntensity=self.normalizeIntensity, noZeroIntensity=self.noZeroIntensity, - currRun = self.currRun, samplingFactor=self.samplingFactor, priming=self.priming, imgData = self.fullImgData) + currRun = self.currRun, samplingFactor=self.samplingFactor, priming=self.priming, imgData = self.fullImgData, imgsTracked = self.imgsTracked) print("STARTING SKETCHING FOR {}".format(self.currRun)) st = time.perf_counter() freqDir.run() @@ -1830,3 +1833,102 @@ def normalizeIntensity(self, img, currIntensity): return np.zeros(img.shape) else: return img/currIntensity + + +class DataRetriever: + def __init__(self, exp, det_type, run, start_offset, num_imgs, threshold, noZeroIntensity, normalizeIntensity, downsample, bin_factor, thresholdQuantile): + self.exp = exp + self.det_type = det_type + self.run = run + self.start_offset = start_offset + self.num_imgs = num_imgs + self.threshold = threshold + self.noZeroIntensity = noZeroIntensity + self.normalizeIntensity = normalizeIntensity + self.downsample = downsample + self.bin_factor = bin_factor + self.thresholdQuantile = thresholdQuantile + self.imgsTracked = [] + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = self.start_offset + + self.imageProcessor = FD_ImageProcessing(minIntensity=(self.bin_factor**2)*50000, thresholdQuantile=self.thresholdQuantile, eluAlpha=0.01) + + + def assembleImgsToSave(self, imgs): + """ + Form the images from psana pixel index map and downsample images. 
+ + Parameters + ---------- + imgs: ndarray + images to downsample + """ + pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) + + saveMe = [] + for img in imgs: + imgRe = np.reshape(img, self.psi.det.shape()) + imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) + saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) + return np.array(saveMe) + + def get_formatted_images(self, startInd, n, includeThumbnails=False): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. + + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + self.psi.counter = startInd + self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) + print(self.imgsTracked) + + imgs = self.psi.get_images(n, assemble=False) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + if len(imgs.shape)==4: + num_valid_imgs, p, x, y = imgs.shape + else: + p = 1 + num_valid_imgs, x, y = imgs.shape + img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + img_batch[img_batch<0] = 0 + nimg_batch = [] + for img in img_batch.T: + nimg = img + currIntensity = np.sum(nimg.flatten(), dtype=np.double) + if self.threshold: + nimg = self.imageProcessor.threshold(nimg) + if self.noZeroIntensity: + nimg = self.imageProcessor.removeZeroIntensity(nimg, currIntensity) + if self.normalizeIntensity: + nimg = self.imageProcessor.normalizeIntensity(nimg, currIntensity) + if nimg is not None: + nimg_batch.append(nimg) + nimg_batch = np.array(nimg_batch) + if self.downsample: + binned_imgs = bin_data(np.reshape(nimg_batch,(num_valid_imgs, p, x, y)), self.bin_factor) + binned_num_valid_imgs, binned_p, binned_x, binned_y = binned_imgs.shape + binned_imgs = np.reshape(binned_imgs, (binned_num_valid_imgs, binned_p * binned_x * binned_y)).T +# print(binned_imgs.shape) + else: + binned_imgs = nimg_batch.T + if includeThumbnails: + return (binned_imgs, self.assembleImgsToSave(np.reshape(nimg_batch, (num_valid_imgs, p, x, y))), self.imgsTracked) + else: + return (binned_imgs, self.imgsTracked) From aa1b561b6c479b91cc48f86a0ef049ef1249eebc Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 3 Sep 2023 22:14:29 -0700 Subject: [PATCH 29/57] Cleaned up code. Produces 130hz with nice clustering. --- btx/processing/freqdir.py | 436 ++++---------------------------------- 1 file changed, 43 insertions(+), 393 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index a9e985e89..71a0bd6d0 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -188,106 +188,10 @@ def run(self): Perform frequent directions matrix sketching on run subject to initialization parameters. """ - noImgsToProcess = self.num_images//self.size for currInd, batch in enumerate(range(0,noImgsToProcess,int(self.ell*2//self.samplingFactor))): self.fetch_and_update_model(int(self.ell*2//self.samplingFactor), currInd) - def elu(self,x): - if x > 0: - return x - else: - return 0.01*(math.exp(x)-1) - -# def get_formatted_images(self, n, includeUnformatted=False): -# """ -# Fetch n - x image segments from run, where x is the number of 'dead' images. 
-# -# Parameters -# ---------- -# n : int -# number of images to retrieve -# start_index : int -# start index of subsection of data to retrieve -# end_index : int -# end index of subsection of data to retrieve -# -# Returns -# ------- -# ndarray, shape (end_index-start_index, n-x) -# n-x retrieved image segments of dimension end_index-start_index -# """ -# self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) -# # may have to rewrite eventually when number of images becomes large, -# # i.e. streamed setting, either that or downsample aggressively -# imgs = self.psi.get_images(n, assemble=False) -# -# if includeUnformatted: -# imgsCopy = imgs.copy() -# imgsCopy = imgsCopy[ -# [i for i in range(imgsCopy.shape[0]) if not np.isnan(imgsCopy[i : i + 1]).any()] -# ] -# num_valid_imgsCopy, p, x, y = imgsCopy.shape -# img_batchCopy = np.reshape(imgsCopy, (num_valid_imgsCopy, p * x * y)).T -# img_batchCopy[img_batchCopy<0] = 0 -# nimg_batchCopy = [] -# for img in img_batchCopy.T: -# if self.threshold: -# # secondQuartile = np.sort(img)[-1]//4 -# # secondQuartile = np.mean(img) -# # secondQuartile = np.median(img) -# # secondQuartile = np.partition(img, -len(img)//4)[-len(img)//4] -# secondQuartile = np.quantile(img, 0.93) -# nimg = (img>secondQuartile)*img -# # elu_v = np.vectorize(self.elu) -# # nimg = elu_v(img-secondQuartile)+secondQuartile -# else: -# nimg = img -# currIntensity = np.sum(nimg.flatten(), dtype=np.double) -# if self.noZeroIntensity and currIntensity<50000: -# continue -# else: -# if currIntensity>=50000 and self.normalizeIntensity: -# nimg_batchCopy.append(nimg/currIntensity) -# else: -# nimg_batchCopy.append(nimg) -# -# if self.downsample: -# imgs = bin_data(imgs, self.bin_factor) -# imgs = imgs[ -# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] -# ] -# num_valid_imgs, p, x, y = imgs.shape -# img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T -# img_batch[img_batch<0] = 0 -# nimg_batch = [] -# for img in img_batch.T: -# if self.threshold: -## secondQuartile = np.sort(img)[-1]//4 -## secondQuartile = np.mean(img) -## secondQuartile = np.median(img) -## secondQuartile = np.partition(img, -len(img)//4)[-len(img)//4] -# secondQuartile = np.quantile(img, 0.93) -# nimg = (img>secondQuartile)*img -## elu_v = np.vectorize(self.elu) -## nimg = elu_v(img-secondQuartile)+secondQuartile -# else: -# nimg = img -# -# currIntensity = np.sum(nimg.flatten(), dtype=np.double) -# if self.noZeroIntensity and currIntensity<50000: -# continue -# else: -# if currIntensity>=50000 and self.normalizeIntensity: -# nimg_batch.append(nimg/currIntensity) -# else: -# nimg_batch.append(nimg) -# if includeUnformatted: -# return (np.array(nimg_batch).T, np.array(nimg_batchCopy).T) -# else: -# return np.array(nimg_batch).T - - def fetch_and_update_model(self, n, currInd): """ Fetch images and update model. 
@@ -297,10 +201,7 @@ def fetch_and_update_model(self, n, currInd): n : int number of images to incorporate """ -# img_batch = self.get_formatted_images(n) - print("a90wjufipoamfoawfa09opi", self.imgData.shape) img_batch = self.imgData[:, currInd*n:currInd*(n+1)] - print("1414oiioqdca", img_batch.shape) if self.samplingFactor <1: psamp = PrioritySampling(int(n*self.samplingFactor), self.d) @@ -620,7 +521,7 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output self.divBy = divBy - time.sleep(30) + time.sleep(10) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] @@ -628,10 +529,7 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) -# if self.rank==0: -# print("BUFFER SIZES: ", self.buffSizes) -# print(self.data.shape) self.fd.update_model(self.data.T) self.output_dir = output_dir @@ -672,9 +570,8 @@ def merge(self): bufferMe = np.empty(self.buffSizes[proc] * self.data.shape[1], dtype=np.double) self.comm.Recv(bufferMe, source=proc, tag=17) bufferMe = np.reshape(bufferMe, (self.buffSizes[proc], self.data.shape[1])) -# print("BUFFERME SHAPE", bufferMe.shape) -# self.fd.update_model(np.hstack((bufferMe.T, np.zeros((bufferMe.shape[1]))))) - self.fd.update_model(bufferMe.T) + self.fd.update_model(np.hstack((bufferMe.T, np.zeros((bufferMe.shape[1],1))))) +# self.fd.update_model(bufferMe.T) else: bufferMe = self.fd.get().copy().flatten() self.comm.Send(bufferMe, dest=root, tag=17) @@ -701,7 +598,6 @@ def write(self): """ Write merged matrix sketch to h5 file """ -# print("IMAGES TRACKED: ", self.fullNumIncorp, " ******* ", self.fullImgsTracked) filename = self.output_dir + '{}_merge.h5'.format(self.currRun) if self.rank==0: @@ -712,11 +608,9 @@ def write(self): # hf.create_dataset("mean", data=self.fullMean) hf["sketch"].attrs["numImgsIncorp"] = self.fullNumIncorp hf.create_dataset("imgsTracked", data=self.fullImgsTracked) -# print("CREATED FILE: ", filename2) self.comm.send(filename2, dest=ind, tag=ind) - print("aodiwjaomwdklmduhi22adjdqoi2jd", self.fullImgsTracked) else: - print("RECEIVED FILE NAME: ", self.comm.recv(source=0, tag=self.rank)) + print("{} RECEIVED FILE NAME: {}".format(self.rank, self.comm.recv(source=0, tag=self.rank))) self.comm.barrier() return filename @@ -782,14 +676,6 @@ def __init__( self.currRun = currRun -# self.imgGrabber = FreqDir(comm=comm, rank=rank, size=size, start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, -# exp=exp,run=run,det_type=det_type,output_dir="", downsample=downsample, bin_factor=bin_factor, -# threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False, imgData = None) -# self.grabberToSaveImages = FreqDir(comm=comm, rank=rank, size=size, start_offset=start_offset,num_imgs=num_imgs, currRun = currRun, -# exp=exp,run=run,det_type=det_type,output_dir="", downsample=False, bin_factor=0, -# threshold=threshold, normalizeIntensity=normalizeIntensity, noZeroIntensity=noZeroIntensity, priming=False, imgData = None) -# self.batchSize = batchSize - self.num_incorporated_images = 0 readFile2 = readFile[:-3] + "_"+str(self.rank)+".h5" @@ -797,7 +683,7 @@ def __init__( # print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) # while(not os.path.isfile(readFile2)): # print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) - time.sleep(30) + time.sleep(10) with 
h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] # self.mean = hf["mean"][:] @@ -816,92 +702,15 @@ def __init__( def run(self): """ - Retrieve sketch, project images onto new coordinates. Save new coordinates to h5 file. - """ -# noImgsToProcess = self.num_imgs//self.size -# for currInd, batch in enumerate(range(0,noImgsToProcess,self.batchSize)): -# for currInd in range(len(self.imgData)): - self.fetch_and_process_data(0) -# print("RANK {} IS DONE".format(self.rank)) -# self.fetch_and_process_data() - + Retrieve sketch, project images onto new coordinates. Save new coordinates to h5 file. - def fetch_and_process_data(self, currInd): - """ - Fetch and downsample data, apply projection algorithm + Note: If-Else statement is from previous/future work enabling streaming processing. """ -# startCounter = self.imgGrabber.psi.counter - -# stimggrab = time.perf_counter() -# img_batch,img_batchUnformatted = self.imgGrabber.get_formatted_images(self.batchSize,includeUnformatted=True) -# img_batch = self.imgGrabber.get_formatted_images(self.batchSize) -# self.imageIndicesProcessed.append((startCounter, self.imgGrabber.psi.counter)) -# etimggrab = time.perf_counter() -# print("{} Image Grab TIME: ".format(self.rank), etimggrab - stimggrab) - -# stassemble = time.perf_counter() -# toSave_img_batch = self.assembleImgsToSave(self.grabberToSaveImages.get_formatted_images(self.batchSize)) -# toSave_img_batch = self.assembleImgsToSave(img_batchUnformatted) -# etassemble = time.perf_counter() -# print("{} Assemble TIME: ".format(self.rank), etassemble - stassemble) - -# stassemble = time.perf_counter() - - img_batch = self.imgData - toSave_img_batch = self.thumbnailData - if self.smallImgs is None: - self.smallImgs = toSave_img_batch + self.smallImgs = self.thumbnailData else: - self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) -# self.apply_compression((img_batch.T - self.mean).T) - self.apply_compression(img_batch) -# etassemble = time.perf_counter() -# print("{} Apply Compression TIME: ".format(self.rank), etassemble - stassemble) - - -# noImgsToProcess = self.num_images//self.size -# startCounter = self.imgGrabber.psi.counter -# img_batch = self.imgGrabber.get_formatted_images(noImgsToProcess) -# self.imageIndicesProcessed.append((startCounter, self.imgGrabber.psi.counter)) -# st_compress = time.perf_counter() -# self.apply_compression(img_batch) -# et_compress = time.perf_counter() -# print("COMPRESSION TIME: ", et_compress - st_compress#) -# -# st_assemble = time.perf_counter() -# toSave_img_batch = self.assembleImgsToSave(self.grabberToSaveImages.get_formatted_images(noImgsToProcess)) -# if self.smallImgs is None: -# self.smallImgs = toSave_img_batch -# else: -# self.smallImgs = np.concatenate((self.smallImgs, toSave_img_batch), axis=0) -# et_assemble = time.perf_counter() -# print("ASSEMBLE TIME: ", et_assemble-st_assemble) - - -# def assembleImgsToSave(self, imgs): -# """ -# Form the images from psana pixel index map and downsample images. 
-# -# Parameters -# ---------- -# imgs: ndarray -# images to downsample -# """ -# pixel_index_map = retrieve_pixel_index_map(self.imgGrabber.psi.det.geometry(self.imgGrabber.psi.run)) -# -# saveMe = [] -# for img in imgs.T: -# imgRe = np.reshape(img, self.imgGrabber.psi.det.shape()) -# imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) -# saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) -# return np.array(saveMe) -## imgsRe = np.reshape(imgs.T, (imgs.shape[1], -## self.imgGrabber.psi.det.shape()[0], -## self.imgGrabber.psi.det.shape()[1], -## self.imgGrabber.psi.det.shape()[2])) -## return assemble_image_stack_batch(imgsRe, pixel_index_map) - + self.smallImgs = np.concatenate((self.smallImgs, self.thumbnailData), axis=0) + self.apply_compression(self.imgData) def apply_compression(self, X): """ @@ -1041,21 +850,6 @@ def random_unique_numbers_from_range(self, start, end, count): random.shuffle(all_numbers) return all_numbers[:count] -# def euclidean_distance(self, p1, p2): -# return np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2) - -# def compute_medoid(self, points): -# min_total_distance = float('inf') -# medoid = None -# for i, point in enumerate(points): -# total_distance = 0 -# for other_point in points: -# total_distance += self.euclidean_distance(point, other_point) -# if total_distance < min_total_distance: -# min_total_distance = total_distance -# medoid = point -# return medoid - def compute_medoid(self, points): return points[np.argmin(euclidean_distances(points).sum(axis=0))] @@ -1145,12 +939,9 @@ def genLeftRight(self, endClass): return [*range(endClass+1)], [*range(1, endClass+2)] def genUMAP(self): -# for dirval in os.listdir(self.inputFile[:-26]): -# print("ITEM IN DIRECTORY:", dirval) imgs = None projections = None for currRank in range(self.nprocs): -# print("GETTING CURRENT RANK: ", currRank) with h5py.File(self.inputFile+"_"+str(currRank)+".h5", 'r') as hf: if imgs is None: imgs = hf["SmallImages"][:] @@ -1447,7 +1238,6 @@ def genHTML(self): LABELS = ["DBSCAN Clustering", "OPTICS Clustering", "Anomaly Detection"] radio_button_group = RadioButtonGroup(labels=LABELS, active=0) radioGroup_js = CustomJS(args=dict(datasource=datasource, opticssource=opticssource), code=""" - console.log(datasource.data.ptColor) const x = datasource.data.x const y = datasource.data.y const image = datasource.data.image @@ -1483,15 +1273,10 @@ def genHTML(self): self.viewResults = column(plot_figure, p, imgsPlot, row(cols, toggl, radio_button_group), reachabilityDiag) def fullVisualize(self): -# print("here 4") self.genUMAP() -# print("here 5") self.genABOD() -# print("here 6") self.genLabels() -# print("here 7") self.genHTML() -# print("here 8") def updateLabels(self): self.genLabels() @@ -1574,134 +1359,13 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, start_offset=start_offset, num_imgs=num_imgs, threshold=threshold, noZeroIntensity=noZeroIntensity, normalizeIntensity=normalizeIntensity, downsample=downsample, bin_factor=bin_factor, thresholdQuantile=thresholdQuantile) -# def assembleImgsToSave(self, imgs): -# """ -# Form the images from psana pixel index map and downsample images. 
-# -# Parameters -# ---------- -# imgs: ndarray -# images to downsample -# """ -# pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) -# -# saveMe = [] -# for img in imgs: -# imgRe = np.reshape(img, self.psi.det.shape()) -# imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) -# saveMe.append(np.array(Image.fromarray(imgRe).resize((64, 64)))) -# return np.array(saveMe) -## imgsRe = np.reshape(imgs.T, (imgs.shape[1], -## self.imgGrabber.psi.det.shape()[0], -## self.imgGrabber.psi.det.shape()[1], -## self.imgGrabber.psi.det.shape()[2])) -## return assemble_image_stack_batch(imgsRe, pixel_index_map) -# -# def get_formatted_images(self, startInd, n, includeThumbnails=False): -# """ -# Fetch n - x image segments from run, where x is the number of 'dead' images. -# -# Parameters -# ---------- -# n : int -# number of images to retrieve -# start_index : int -# start index of subsection of data to retrieve -# end_index : int -# end index of subsection of data to retrieve -# -# Returns -# ------- -# ndarray, shape (end_index-start_index, n-x) -# n-x retrieved image segments of dimension end_index-start_index -# """ -# self.psi.counter = startInd -# self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) -# print(self.imgsTracked) -# -# imgs = self.psi.get_images(n, assemble=False) -# -# imgs = imgs[ -# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] -# ] -# if len(imgs.shape)==4: -# num_valid_imgs, p, x, y = imgs.shape -# else: -# p = 1 -# num_valid_imgs, x, y = imgs.shape -# img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T -# img_batch[img_batch<0] = 0 -# nimg_batch = [] -# for img in img_batch.T: -# nimg = img -# currIntensity = np.sum(nimg.flatten(), dtype=np.double) -# if self.threshold: -# nimg = self.imageProcessor.threshold(nimg) -# if self.noZeroIntensity: -# nimg = self.imageProcessor.removeZeroIntensity(nimg, currIntensity) -# if self.normalizeIntensity: -# nimg = self.imageProcessor.normalizeIntensity(nimg, currIntensity) -# if nimg is not None: -# nimg_batch.append(nimg) -# nimg_batch = np.array(nimg_batch) -## self.imageProcessor.normalizeIntensity(self.imageProcessor(removeZeroIntensity(self.imageProcessor.threshold(img)) -## if self.threshold: -## secondQuartile = np.quantile(img, self.thresholdQuantile) -## nimg = (img>secondQuartile)*img -### elu_v = np.vectorize(self.elu) -### nimg = elu_v(img-secondQuartile)+secondQuartile -## else: -## nimg = img -## -## currIntensity = np.sum(nimg.flatten(), dtype=np.double) -### print("RANK: {} ***** INTENSITY: {}".format(self.rank, currIntensity)) -## if self.noZeroIntensity and currIntensity< (self.bin_factor**2) * 50000: -## continue -## else: -## if currIntensity>=(self.bin_factor**2) * 50000 and self.normalizeIntensity: -### if not self.normalizeIntensity: -## nimg_batch.append(nimg/currIntensity) -## else: -### nimg_batch.append(nimg) -## nimg_batch.append(np.zeros(nimg.shape)) -## nimg_batch = np.array(nimg_batch) -# if self.downsample: -# binned_imgs = bin_data(np.reshape(nimg_batch,(num_valid_imgs, p, x, y)), self.bin_factor) -# binned_num_valid_imgs, binned_p, binned_x, binned_y = binned_imgs.shape -# binned_imgs = np.reshape(binned_imgs, (binned_num_valid_imgs, binned_p * binned_x * binned_y)).T -## print(binned_imgs.shape) -# else: -# binned_imgs = nimg_batch.T -# if includeThumbnails: -# return (binned_imgs, self.assembleImgsToSave(np.reshape(nimg_batch, (num_valid_imgs, p, x, y)))) -# else: -# return binned_imgs - - @profile(filename="fullFD_profile") 
+# @profile(filename="fullFD_profile") def runMe(self): stfull = time.perf_counter() - #DATA RETRIEVAL STEP - ########################################################################################## -# self.fullImgData = [] -# self.fullThumbnailData = [] -# noImgsToProcess = self.num_imgs//self.size -# batchSize = int(self.num_components*2//self.samplingFactor) -# for batch in range(0, noImgsToProcess, batchSize): -# startInd = startingPoint+batch -# binned_imgs, thumbnails = self.get_formatted_images(startInd, batchSize, includeThumbnails=True) -# print("aodijwaodijaodij", binned_imgs.shape, thumbnails.shape) -# self.fullImgData.append(binned_imgs) -# self.fullThumbnailData.append(thumbnails) -# print(self.imgsTracked) - startingPoint = self.start_offset + self.num_imgs*self.rank//self.size self.fullImgData, self.fullThumbnailData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, includeThumbnails=True) -# filenameTest0 = random.randint(0, 10) -# filenameTest0 = self.comm.allgather(filenameTest0) -# print("TEST 0: ", self.rank, filenameTest0) - #SKETCHING STEP ########################################################################################## freqDir = FreqDir(comm= self.comm, rank=self.rank, size = self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, @@ -1709,17 +1373,13 @@ def runMe(self): merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, threshold=self.threshold, normalizeIntensity=self.normalizeIntensity, noZeroIntensity=self.noZeroIntensity, currRun = self.currRun, samplingFactor=self.samplingFactor, priming=self.priming, imgData = self.fullImgData, imgsTracked = self.imgsTracked) - print("STARTING SKETCHING FOR {}".format(self.currRun)) + print("{} STARTING SKETCHING FOR {}".format(self.rank, self.currRun)) st = time.perf_counter() freqDir.run() localSketchFilename = freqDir.write() et = time.perf_counter() print("Estimated time for frequent directions rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) -# filenameTest1 = random.randint(0, 10) -# filenameTest1 = self.comm.allgather(filenameTest1) -# print("TEST 1: ", self.rank, filenameTest1) - #MERGING STEP ########################################################################################## if freqDir.rank<10: @@ -1731,18 +1391,12 @@ def runMe(self): allNames.append(fullSketchFilename + str(j) + ".h5") mergeTree = MergeTree(comm=self.comm, rank=self.rank, size=self.size, exp=self.exp, run=self.run, det_type=self.det_type, divBy=self.divBy, readFile = localSketchFilename, output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun) - #mergeTree = MergeTree(divBy=2, readFile = localSketchFilename, - # dir=writeToHere, allWriteDirecs=allNames, currRun = currRun) st = time.perf_counter() mergeTree.merge() mergedSketchFilename = mergeTree.write() et = time.perf_counter() print("Estimated time merge tree for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) -# filenameTest2 = random.randint(0, 10) -# filenameTest2 = self.comm.allgather(filenameTest2) -# print("TEST 2: ", self.rank, filenameTest2) - #PROJECTION STEP ########################################################################################## appComp = ApplyCompression(comm=self.comm, rank = self.rank, size=self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, @@ -1755,44 +1409,42 @@ def runMe(self): et = time.perf_counter() print("Estimated time 
projection for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) print("Estimated full processing time for rank {0}/{1}: {2}".format(self.rank, self.size, et - stfull)) - + self.comm.barrier() self.comm.Barrier() -# filenameTest3 = random.randint(0, 10) -# filenameTest3 = self.comm.allgather(filenameTest3) -# print("TEST 3: ", self.rank, filenameTest3) + filenameTest3 = random.randint(0, 10) + filenameTest3 = self.comm.allgather(filenameTest3) + print("TEST 3: ", self.rank, filenameTest3) + #UMAP STEP ########################################################################################## - - -# if self.rank==0: -# print("here 1") -# st = time.perf_counter() -# -# skipSize = 8 -# numImgsToUse = int(self.num_imgs/skipSize) -# visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), -# outputFile="./UMAPVis_{}.html".format(self.currRun), -# numImgsToUse=self.num_imgs, -# nprocs=self.size, -# userGroupings=[], -# includeABOD=True, -# skipSize = skipSize, -# umap_n_neighbors=numImgsToUse//40, -# umap_random_state=42, -# hdbscan_min_samples=int(numImgsToUse*0.75//40), -# hdbscan_min_cluster_size=int(numImgsToUse//40), -# optics_min_samples=150, optics_xi = 0.05, optics_min_cluster_size = 0.05) -## print("here 2") -# visMe.fullVisualize() -## print("here 3") -# visMe.userSave() -# et = time.perf_counter() -# print("UMAP HTML Generation Processing time: {}".format(et - st)) -# print("TOTAL PROCESING TIME: {}".format(et - stfull)) + if self.rank==0: + print("here 1") + st = time.perf_counter() + + skipSize = 8 + numImgsToUse = int(self.num_imgs/skipSize) + visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), + outputFile="./UMAPVis_{}.html".format(self.currRun), + numImgsToUse=self.num_imgs, + nprocs=self.size, + userGroupings=[], + includeABOD=True, + skipSize = skipSize, + umap_n_neighbors=numImgsToUse//40, + umap_random_state=42, + hdbscan_min_samples=int(numImgsToUse*0.75//40), + hdbscan_min_cluster_size=int(numImgsToUse//40), + optics_min_samples=150, optics_xi = 0.05, optics_min_cluster_size = 0.05) +# print("here 2") + visMe.fullVisualize() +# print("here 3") + visMe.userSave() + et = time.perf_counter() + print("UMAP HTML Generation Processing time: {}".format(et - st)) + print("TOTAL PROCESING TIME: {}".format(et - stfull)) class FD_ImageProcessing: - #How to use these functions: call each of them on the image. Append the result if it is not "None" to nimg_batch. 
def __init__(self, minIntensity, thresholdQuantile, eluAlpha): self.minIntensity = minIntensity self.thresholdQuantile = thresholdQuantile @@ -1812,7 +1464,6 @@ def eluThreshold(self, img): secondQuartile = np.quantile(img, self.thresholdQuantile) return(elu_v(img-secondQuartile)+secondQuartile) - def threshold(self, img): if img is None: return img @@ -1894,7 +1545,7 @@ def get_formatted_images(self, startInd, n, includeThumbnails=False): """ self.psi.counter = startInd self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) - print(self.imgsTracked) + print("Images tracked:", self.imgsTracked) imgs = self.psi.get_images(n, assemble=False) @@ -1925,7 +1576,6 @@ def get_formatted_images(self, startInd, n, includeThumbnails=False): binned_imgs = bin_data(np.reshape(nimg_batch,(num_valid_imgs, p, x, y)), self.bin_factor) binned_num_valid_imgs, binned_p, binned_x, binned_y = binned_imgs.shape binned_imgs = np.reshape(binned_imgs, (binned_num_valid_imgs, binned_p * binned_x * binned_y)).T -# print(binned_imgs.shape) else: binned_imgs = nimg_batch.T if includeThumbnails: From 7b182bd4f1913fd20e89d9eed9cc40e8dccb97bb Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 5 Sep 2023 21:02:18 -0700 Subject: [PATCH 30/57] Cleaned up code and made it so that you don't need to evenly divide everything for things to work --- btx/processing/freqdir.py | 179 ++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 106 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 71a0bd6d0..e741a53e8 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -130,23 +130,18 @@ def __init__( currRun, imgData, imgsTracked, - alpha=0, - rankAdapt=False, - merger=False, - mergerFeatures=0, - downsample=False, - bin_factor=2, - threshold=False, - normalizeIntensity=False, - noZeroIntensity=False, - samplingFactor=1.0, - num_components=10, - batch_size = 10, - priming=False + alpha, + rankAdapt, + merger, + mergerFeatures, + downsample, + bin_factor, + samplingFactor, + num_components, ): super().__init__(exp=exp, run=run, det_type=det_type, start_offset=start_offset, - num_images=num_imgs, num_components=num_components, batch_size=batch_size, priming=priming, + num_images=num_imgs, num_components=num_components, batch_size=0, priming=False, downsample=downsample, bin_factor=bin_factor, output_dir=output_dir) self.comm = comm @@ -171,44 +166,27 @@ def __init__( self.nextZeroRow = 0 self.alpha = alpha # self.mean = None - self.imgsTracked = imgsTracked self.rankAdapt = rankAdapt self.increaseEll = False - self.threshold = threshold - self.noZeroIntensity = noZeroIntensity - self.normalizeIntensity=normalizeIntensity self.samplingFactor = samplingFactor self.imgData = imgData + self.imgsTracked = imgsTracked def run(self): """ Perform frequent directions matrix sketching on run subject to initialization parameters. """ - noImgsToProcess = self.num_images//self.size - for currInd, batch in enumerate(range(0,noImgsToProcess,int(self.ell*2//self.samplingFactor))): - self.fetch_and_update_model(int(self.ell*2//self.samplingFactor), currInd) - - def fetch_and_update_model(self, n, currInd): - """ - Fetch images and update model. 
- - Parameters - ---------- - n : int - number of images to incorporate - """ - img_batch = self.imgData[:, currInd*n:currInd*(n+1)] - + img_batch = self.imgData if self.samplingFactor <1: - psamp = PrioritySampling(int(n*self.samplingFactor), self.d) + psamp = PrioritySampling(int((img_batch.shape[1])*self.samplingFactor), self.d) for row in img_batch.T: psamp.update(row) img_batch = np.array(psamp.sketch.get()).T - + self.update_model(img_batch) # if self.mean is None: # self.mean = np.mean(img_batch, axis=1) # else: @@ -217,8 +195,6 @@ def fetch_and_update_model(self, n, currInd): # self.mean = (self.mean*self.num_incorporated_images + np.sum(img_batch, axis=1, dtype=np.double))/( # self.num_incorporated_images + (img_batch.shape[1])) # self.update_model((img_batch.T - self.mean).T) - self.update_model(img_batch) - def update_model(self, X): """ @@ -521,11 +497,11 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output self.divBy = divBy - time.sleep(10) + time.sleep(15) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] - self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, downsample=False, bin_factor=0, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, priming=False, imgData = None, imgsTracked=None) + self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = None, imgsTracked=None, downsample=False, bin_factor=1, samplingFactor=1) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) @@ -629,7 +605,6 @@ class ApplyCompression: normalizeIntensity: whether data should be normalized to have total intensity of one noZeroIntensity: whether data with low total intensity should be discarded readFile: H5 file with matrix sketch - batchSize: Number of images to process at each iteration data: numpy array housing current matrix sketch mean: geometric mean of data processed num_incorporated_images: number of images processed so far @@ -655,15 +630,9 @@ def __init__( det_type, readFile, output_dir, - batchSize, - threshold, - noZeroIntensity, - normalizeIntensity, currRun, imgData, thumbnailData, - downsample=False, - bin_factor=2 ): self.comm = comm @@ -683,7 +652,7 @@ def __init__( # print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) # while(not os.path.isfile(readFile2)): # print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) - time.sleep(10) + time.sleep(15) with h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] # self.mean = hf["mean"][:] @@ -1165,7 +1134,7 @@ def genHTML(self): x=[0.25+xind for xind in range(len(self.medoidInds))], y=0, dw=0.5, dh=1, - palette="Plasma256", level="image") + palette="Turbo256", level="image") imgsPlot.axis.visible = False imgsPlot.grid.visible = False for xind in range(len(self.medoidInds)): @@ -1320,7 +1289,7 @@ class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. 
""" - def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_components, alpha, rankAdapt, downsample, bin_factor, threshold, normalizeIntensity, noZeroIntensity, samplingFactor, priming, divBy, batchSize, thresholdQuantile): + def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_components, alpha, rankAdapt, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile): self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1333,12 +1302,13 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.downsample=downsample self.bin_factor= bin_factor self.threshold= threshold + self.eluThreshold = eluThreshold + self.eluAlpha = eluAlpha self.normalizeIntensity=normalizeIntensity self.noZeroIntensity=noZeroIntensity + self.minIntensity = minIntensity self.samplingFactor=samplingFactor - self.priming=priming self.divBy = divBy - self.batchSize = batchSize self.thresholdQuantile = thresholdQuantile self.comm = MPI.COMM_WORLD @@ -1355,24 +1325,22 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.currRun = None self.currRun = self.comm.bcast(self.currRun, root=0) - self.imageProcessor = FD_ImageProcessing(minIntensity=(self.bin_factor**2)*50000, thresholdQuantile=self.thresholdQuantile, eluAlpha=0.01) - - self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, start_offset=start_offset, num_imgs=num_imgs, threshold=threshold, noZeroIntensity=noZeroIntensity, normalizeIntensity=normalizeIntensity, downsample=downsample, bin_factor=bin_factor, thresholdQuantile=thresholdQuantile) + self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile) + self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) # @profile(filename="fullFD_profile") def runMe(self): stfull = time.perf_counter() startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - self.fullImgData, self.fullThumbnailData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, includeThumbnails=True) + self.fullImgData, self.fullThumbnailData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size) #SKETCHING STEP ########################################################################################## freqDir = FreqDir(comm= self.comm, rank=self.rank, size = self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, det_type=self.det_type, output_dir=self.writeToHere, num_components=self.num_components, alpha=self.alpha, rankAdapt=self.rankAdapt, merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, - threshold=self.threshold, normalizeIntensity=self.normalizeIntensity, noZeroIntensity=self.noZeroIntensity, - currRun = self.currRun, samplingFactor=self.samplingFactor, priming=self.priming, imgData = self.fullImgData, imgsTracked = self.imgsTracked) + currRun = self.currRun, samplingFactor=self.samplingFactor, imgData = self.fullImgData, 
imgsTracked = self.imgsTracked) print("{} STARTING SKETCHING FOR {}".format(self.rank, self.currRun)) st = time.perf_counter() freqDir.run() @@ -1399,10 +1367,7 @@ def runMe(self): #PROJECTION STEP ########################################################################################## - appComp = ApplyCompression(comm=self.comm, rank = self.rank, size=self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, - det_type=self.det_type, readFile = mergedSketchFilename, output_dir = self.writeToHere, - batchSize=self.batchSize, threshold=self.threshold, normalizeIntensity=self.normalizeIntensity, noZeroIntensity=self.noZeroIntensity, - downsample=self.downsample, bin_factor=self.bin_factor, currRun = self.currRun, imgData = self.fullImgData, thumbnailData = self.fullThumbnailData) + appComp = ApplyCompression(comm=self.comm, rank = self.rank, size=self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run,det_type=self.det_type, readFile = mergedSketchFilename, output_dir = self.writeToHere, currRun = self.currRun, imgData = self.fullImgData, thumbnailData = self.fullThumbnailData) st = time.perf_counter() appComp.run() appComp.write() @@ -1445,10 +1410,25 @@ def runMe(self): print("TOTAL PROCESING TIME: {}".format(et - stfull)) class FD_ImageProcessing: - def __init__(self, minIntensity, thresholdQuantile, eluAlpha): + def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile): + self.threshold = threshold + self.eluThreshold = eluThreshold + self.eluAlpha = eluAlpha + self.noZeroIntensity = noZeroIntensity + self.normalizeIntensity = normalizeIntensity self.minIntensity = minIntensity self.thresholdQuantile = thresholdQuantile - self.eluAlpha = eluAlpha + + def processImg(self, nimg, currIntensity): + if self.threshold: + nimg = self.thresholdFunc(nimg) + if self.eluThreshold: + nimg = self.eluThresholdFunc(nimg) + if self.noZeroIntensity: + nimg = self.removeZeroIntensityFunc(nimg, currIntensity) + if self.normalizeIntensity: + nimg = self.normalizeIntensityFunc(nimg, currIntensity) + return nimg def elu(self,x): if x > 0: @@ -1456,7 +1436,7 @@ def elu(self,x): else: return self.eluAlpha*(math.exp(x)-1) - def eluThreshold(self, img): + def eluThresholdFunc(self, img): if img is None: return img else: @@ -1464,48 +1444,42 @@ def eluThreshold(self, img): secondQuartile = np.quantile(img, self.thresholdQuantile) return(elu_v(img-secondQuartile)+secondQuartile) - def threshold(self, img): + def thresholdFunc(self, img): if img is None: return img else: secondQuartile = np.quantile(img, self.thresholdQuantile) return (img>secondQuartile)*img - def removeZeroIntensity(self, img, currIntensity): + def removeZeroIntensityFunc(self, img, currIntensity): if currIntensity Date: Thu, 14 Sep 2023 10:14:17 -0700 Subject: [PATCH 31/57] Checkpoint. 
Not sure what changed --- btx/processing/freqdir.py | 173 ++++++++++++++++++++++++-------------- 1 file changed, 109 insertions(+), 64 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index e741a53e8..318fc5904 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -132,6 +132,7 @@ def __init__( imgsTracked, alpha, rankAdapt, + rankAdaptMinError, merger, mergerFeatures, downsample, @@ -168,6 +169,7 @@ def __init__( # self.mean = None self.rankAdapt = rankAdapt + self.rankAdaptMinError = rankAdaptMinError self.increaseEll = False self.samplingFactor = samplingFactor @@ -244,8 +246,8 @@ def update_model(self, X): copyBatch = self.sketch[self.ell:,:].copy() self.rotate() if canRankAdapt and self.rankAdapt: - reconError = np.sqrt(self.lowMemoryReconstructionErrorUnscaled(copyBatch)) - if (reconError > 0.08): + reconError = np.sqrt(self.lowMemoryReconstructionErrorScaled(copyBatch)) + if (reconError > self.rankAdaptMinError): self.increaseEll = True self.sketch[self.nextZeroRow,:] = row self.nextZeroRow += 1 @@ -326,11 +328,12 @@ def reconstructionError(self, matrixCentered): matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) - def lowMemoryReconstructionError(self, matrixCentered): + def lowMemoryReconstructionErrorScaled(self, matrixCentered): """ Compute the low memory reconstruction error of the matrix sketch - against given data. This si the same as reconstructionError, - but estimates the norm computation and does not scale by the matrix. + against given data. This is the same as reconstructionError, + but estimates the norm computation and does not scale by the + minimum projection matrix, but rather by the matrix norm itself. Parameters ---------- @@ -348,7 +351,7 @@ def lowMemoryReconstructionError(self, matrixCentered): matSketchT = matSketch.T U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) G = U[:,:k] - return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 10)/ + return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 50)/ np.linalg.norm(matrixCenteredT, 'fro')**2) def estimFrobNormSquared(self, addMe, arrs, its): @@ -501,10 +504,11 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] - self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = None, imgsTracked=None, downsample=False, bin_factor=1, samplingFactor=1) + self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, rankAdaptMinError=1, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = None, imgsTracked=None, downsample=False, bin_factor=1, samplingFactor=1) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) + print(self.buffSizes) self.fd.update_model(self.data.T) @@ -787,7 +791,7 @@ class visualizeFD: """ def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, userGroupings, skipSize, umap_n_neighbors, umap_random_state, hdbscan_min_samples, hdbscan_min_cluster_size, - optics_min_samples, optics_xi, optics_min_cluster_size): + 
optics_min_samples, optics_xi, optics_min_cluster_size, outlierQuantile): self.inputFile = inputFile self.outputFile = outputFile output_file(filename=outputFile, title="Static HTML file") @@ -804,6 +808,7 @@ def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, use self.optics_min_samples=optics_min_samples self.optics_xi = optics_xi self.optics_min_cluster_size = optics_min_cluster_size + self.outlierQuantile = outlierQuantile def embeddable_image(self, data): img_data = np.uint8(cm.jet(data/max(data.flatten()))*255) @@ -877,22 +882,32 @@ def fastABOD(self, pts, nsamples): ac = cpt - apt if math.isclose(np.linalg.norm(ab), 0.0) or math.isclose(np.linalg.norm(ac), 0.0): count += 1 + print("TOO CLOSE") continue outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) - abofs.append(np.var(np.array(outlier_factors))) + print("CURRENT POINT: ", pts[a], test_list, outlier_factors, np.var(np.array(outlier_factors))) + if(len(outlier_factors)==0): + abofs.append(np.inf) + else: + abofs.append(np.var(np.array(outlier_factors))) return abofs - def getOutliers(self, lst, divBy): - lstCopy = lst.copy() - lstCopy.sort() - quart10 = lstCopy[len(lstCopy)//divBy] + def getOutliers(self, lst): +# lstCopy = lst.copy() +# lstCopy.sort() +# quart10 = lstCopy[len(lstCopy)//divBy] + + lstQuant = np.quantile(np.array(lst), self.outlierQuantile) + print("AIDJWOIJDAOWIDJWAOIDJAWOIDWJA", lstQuant, lst) outlierInds = [] notOutlierInds = [] for j in range(len(lst)): - if lst[j]lstQuant: outlierInds.append(j) else: notOutlierInds.append(j) + print("OUTLIER INDS: ", outlierInds) + print("NOT OUTLIER INDS: ", notOutlierInds) return np.array(outlierInds), np.array(notOutlierInds) def genHist(self, vals, endClass): @@ -937,6 +952,7 @@ def genUMAP(self): n_neighbors=self.umap_n_neighbors, random_state=self.umap_random_state, n_components=2, + min_dist=0.25, ).fit_transform(self.projections) self.labels = hdbscan.HDBSCAN( @@ -948,15 +964,13 @@ def genUMAP(self): self.opticsClust = OPTICS(min_samples=self.optics_min_samples, xi=self.optics_xi, min_cluster_size=self.optics_min_cluster_size) self.opticsClust.fit(self.clusterable_embedding) -# self.opticsLabels = cluster_optics_dbscan( -# reachability=self.opticsClust.reachability_, -# core_distances=self.opticsClust.core_distances_, -# ordering=self.opticsClust.ordering_, -# eps=2, -# ) - -# self.opticsLabels = self.opticsClust.labels_[self.opticsClust.ordering_] - self.opticsLabels = self.opticsClust.labels_ + self.opticsLabels = cluster_optics_dbscan( + reachability=self.opticsClust.reachability_, + core_distances=self.opticsClust.core_distances_, + ordering=self.opticsClust.ordering_, + eps=2.5, + ) +# self.opticsLabels = self.opticsClust.labels_ self.experData_df = pd.DataFrame({'x':self.clusterable_embedding[self.clustered, 0],'y':self.clusterable_embedding[self.clustered, 1]}) self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) @@ -965,7 +979,7 @@ def genUMAP(self): def genABOD(self): if self.includeABOD: abod = self.fastABOD(self.projections, 10) - outliers, notOutliers = self.getOutliers(abod, 10) + outliers, notOutliers = self.getOutliers(abod) else: outliers = [] notOutliers = [] @@ -1023,7 +1037,8 @@ def genLabels(self): opticsNewLabels.append(j) opticsNewLabels = list(np.array(opticsNewLabels) + 1) self.opticsNewLabels = np.array(self.relabel_to_closest_zero(opticsNewLabels)) - self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in 
self.opticsNewLabels[self.opticsClust.ordering_]] +# self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels[self.opticsClust.ordering_]] + self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels] def genHTML(self): datasource = ColumnDataSource(self.experData_df) @@ -1184,7 +1199,8 @@ def genHTML(self): width = 2000, height = 400 ) space = np.arange(self.numImgsToUse) - reachability = self.opticsClust.reachability_ + reachability = self.opticsClust.reachability_[self.opticsClust.ordering_] +# reachability = self.opticsClust.reachability_ opticsData_df = pd.DataFrame({'x':space,'y':reachability}) opticsData_df['clusterForScatterPlot'] = [str(x) for x in self.opticsNewLabels] opticsData_df['cluster'] = [str(x) for x in self.opticsNewLabels[self.opticsClust.ordering_]] @@ -1289,7 +1305,7 @@ class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. """ - def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_components, alpha, rankAdapt, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile): + def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile): self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1299,6 +1315,7 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.num_components=num_components self.alpha = alpha self.rankAdapt = rankAdapt + self.rankAdaptMinError = rankAdaptMinError self.downsample=downsample self.bin_factor= bin_factor self.threshold= threshold @@ -1318,6 +1335,7 @@ def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, num_ self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) self.psi.counter = self.start_offset + self.num_imgs*self.rank//self.size self.imgsTracked = [] + self.grabImgSteps = grabImgSteps if self.rank==0: self.currRun = datetime.now().strftime("%y%m%d%H%M%S") @@ -1333,12 +1351,12 @@ def runMe(self): stfull = time.perf_counter() startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - self.fullImgData, self.fullThumbnailData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size) + self.fullImgData, self.fullThumbnailData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps) #SKETCHING STEP ########################################################################################## freqDir = FreqDir(comm= self.comm, rank=self.rank, size = self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, - det_type=self.det_type, output_dir=self.writeToHere, num_components=self.num_components, alpha=self.alpha, rankAdapt=self.rankAdapt, + det_type=self.det_type, output_dir=self.writeToHere, num_components=self.num_components, alpha=self.alpha, rankAdapt=self.rankAdapt, rankAdaptMinError = self.rankAdaptMinError, merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, currRun = self.currRun, samplingFactor=self.samplingFactor, imgData = self.fullImgData, imgsTracked = self.imgsTracked) print("{} STARTING 
SKETCHING FOR {}".format(self.rank, self.currRun)) @@ -1400,7 +1418,8 @@ def runMe(self): umap_random_state=42, hdbscan_min_samples=int(numImgsToUse*0.75//40), hdbscan_min_cluster_size=int(numImgsToUse//40), - optics_min_samples=150, optics_xi = 0.05, optics_min_cluster_size = 0.05) + optics_min_samples=150, optics_xi = 0.05, optics_min_cluster_size = 0.05, + outlierQuantile=0.3) # print("here 2") visMe.fullVisualize() # print("here 3") @@ -1473,7 +1492,6 @@ def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, t self.run = run self.downsample = downsample self.bin_factor = bin_factor - self.imgsTracked = [] self.thumbnailHeight = thumbnailHeight self.thumbnailWidth = thumbnailWidth @@ -1499,7 +1517,23 @@ def assembleImgsToSave(self, imgs): saveMe.append(np.array(Image.fromarray(imgRe).resize((self.thumbnailHeight, self.thumbnailWidth)))) return np.array(saveMe) - def get_formatted_images(self, startInd, n): + def split_range(self, start, end, num_tuples): + if start==end: + raise ValueError('Range processing error: start value equals end value, which leads to no images processed.') + return + total_elements = end - start + batch_size = total_elements // num_tuples + tuples = [] + for i in range(num_tuples - 1): + batch_start = start + i * batch_size + batch_end = batch_start + batch_size + tuples.append((batch_start, batch_end)) + last_batch_start = start + (num_tuples - 1) * batch_size + last_batch_end = end + tuples.append((last_batch_start, last_batch_end)) + return tuples + + def get_formatted_images(self, startInd, n, num_steps): """ Fetch n - x image segments from run, where x is the number of 'dead' images. @@ -1517,35 +1551,46 @@ def get_formatted_images(self, startInd, n): ndarray, shape (end_index-start_index, n-x) n-x retrieved image segments of dimension end_index-start_index """ - self.psi.counter = startInd - self.imgsTracked.append((self.psi.counter, self.psi.counter + n)) - print("Images tracked:", self.imgsTracked) - - imgs = self.psi.get_images(n, assemble=False) - - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] - thumbnails = self.assembleImgsToSave(imgs) - - if self.downsample: - imgs = bin_data(imgs, self.bin_factor) - num_valid_imgs, p, x, y = imgs.shape - img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T - img_batch[img_batch<0] = 0 - - num_valid_thumbnails, tx, ty = thumbnails.shape - thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T - - nimg_batch = [] - nthumbnail_batch = [] - for img, thumbnail in zip(img_batch.T, thumbnail_batch.T): - currIntensity = np.sum(img.flatten(), dtype=np.double) - nimg = self.imageProcessor.processImg(img, currIntensity) - nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) - if nimg is not None: - nimg_batch.append(nimg) - nthumbnail_batch.append(nthumbnail) - nimg_batch = np.array(nimg_batch).T - nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) - return (nimg_batch, nthumbnail_batch, self.imgsTracked) + fullimgs = None + fullthumbnails = None + imgsTracked = [] + runs = self.split_range(startInd, startInd+n, num_steps) + for runStart, runEnd in runs: + self.psi.counter = runStart + imgsTracked.append((runStart, runEnd)) + + imgs = self.psi.get_images(runEnd-runStart, assemble=False) + + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + thumbnails = self.assembleImgsToSave(imgs) + + if self.downsample: + imgs = bin_data(imgs, 
self.bin_factor) + num_valid_imgs, p, x, y = imgs.shape + img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T + img_batch[img_batch<0] = 0 + + num_valid_thumbnails, tx, ty = thumbnails.shape + thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T + + nimg_batch = [] + nthumbnail_batch = [] + for img, thumbnail in zip(img_batch.T, thumbnail_batch.T): + currIntensity = np.sum(img.flatten(), dtype=np.double) + nimg = self.imageProcessor.processImg(img, currIntensity) + nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) + if nimg is not None: + nimg_batch.append(nimg) + nthumbnail_batch.append(nthumbnail) + nimg_batch = np.array(nimg_batch).T + nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) + if fullimgs is None: + fullimgs = nimg_batch + fullthumbnails = nthumbnail_batch + else: + fullimgs = np.hstack((fullimgs, nimg_batch)) + fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) + print("Images tracked:", imgsTracked) + return (fullimgs, fullthumbnails, imgsTracked) From 07c3ee8ef4727bfb807ca7ef8b68110df713cdf3 Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Thu, 14 Sep 2023 14:10:13 -0700 Subject: [PATCH 32/57] disabling common mode correction in FredDir DataRetriever. Seems to be a time bottleneck --- btx/interfaces/ipsana.py | 17 +++++++++++++---- btx/processing/freqdir.py | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/btx/interfaces/ipsana.py b/btx/interfaces/ipsana.py index b3f7cedf6..cb5f53e1f 100644 --- a/btx/interfaces/ipsana.py +++ b/btx/interfaces/ipsana.py @@ -11,7 +11,8 @@ class PsanaInterface: def __init__(self, exp, run, det_type, event_receiver=None, event_code=None, event_logic=True, - ffb_mode=False, track_timestamps=False, calibdir=None): + ffb_mode=False, track_timestamps=False, calibdir=None, + no_cmod=False): self.exp = exp # experiment name, str self.hutch = exp[:3] # hutch name, str self.run = run # run number, int @@ -21,10 +22,10 @@ def __init__(self, exp, run, det_type, self.event_receiver = event_receiver # 'evr0' or 'evr1', str self.event_code = event_code # event code, int self.event_logic = event_logic # bool, if True, retain events with event_code; if False, keep all other events - self.set_up(det_type, ffb_mode, calibdir) + self.set_up(det_type, ffb_mode, calibdir, no_cmod) self.counter = 0 - def set_up(self, det_type, ffb_mode, calibdir=None): + def set_up(self, det_type, ffb_mode, calibdir=None, no_cmod=False): """ Instantiate DataSource and Detector objects; use the run functionality to retrieve all psana.EventTimes. @@ -37,6 +38,8 @@ def set_up(self, det_type, ffb_mode, calibdir=None): if True, set up in an FFB-compatible style calibdir: str directory to alternative calibration files + no_cmod: bool + if True, deactivate common mode detector correction """ ds_args=f'exp={self.exp}:run={self.run}:idx' if ffb_mode: @@ -52,16 +55,19 @@ def set_up(self, det_type, ffb_mode, calibdir=None): if calibdir is not None: setOption('psana.calib_dir', calibdir) self._calib_data_available() + self.no_cmod = no_cmod def _calib_data_available(self): """ Check whether calibration data is available. 
""" self.calibrate = True + self.no_cmod = no_cmod evt = self.runner.event(self.times[0]) if (self.det.pedestals(evt) is None) or (self.det.gain(evt) is None): logger.warning("Warning: calibration data unavailable, returning uncalibrated data") self.calibrate = False + self.no_cmod = True def turn_calibration_off(self): """ @@ -361,7 +367,10 @@ def get_images(self, num_images, assemble=True): img = self.det.image(evt=evt) else: if self.calibrate: - img = self.det.calib(evt=evt) + cmod = self.det.common_mode(evt=evt) + if self.no_cmod: + cmod[1] = 0 + img = self.det.calib(evt=evt, cmpars=cmod) else: img = self.det.raw(evt=evt) if self.det_type == 'epix10k2M': diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 318fc5904..194210e91 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1495,7 +1495,7 @@ def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, t self.thumbnailHeight = thumbnailHeight self.thumbnailWidth = thumbnailWidth - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type, no_cmod=True) self.imageProcessor = imageProcessor From 6f5fa51b56c18c14e01a66cb8bf7a34ca4b6a58d Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Thu, 14 Sep 2023 14:12:53 -0700 Subject: [PATCH 33/57] typo --- btx/interfaces/ipsana.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/btx/interfaces/ipsana.py b/btx/interfaces/ipsana.py index cb5f53e1f..e2ae4a52a 100644 --- a/btx/interfaces/ipsana.py +++ b/btx/interfaces/ipsana.py @@ -62,12 +62,10 @@ def _calib_data_available(self): Check whether calibration data is available. """ self.calibrate = True - self.no_cmod = no_cmod evt = self.runner.event(self.times[0]) if (self.det.pedestals(evt) is None) or (self.det.gain(evt) is None): logger.warning("Warning: calibration data unavailable, returning uncalibrated data") self.calibrate = False - self.no_cmod = True def turn_calibration_off(self): """ From 999a1f9330d9511247d517ec330d1e3f99dbc4c6 Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Thu, 14 Sep 2023 14:20:50 -0700 Subject: [PATCH 34/57] fixed cmpars behavior to disable common mode correction if requested --- btx/interfaces/ipsana.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/btx/interfaces/ipsana.py b/btx/interfaces/ipsana.py index e2ae4a52a..f404a276b 100644 --- a/btx/interfaces/ipsana.py +++ b/btx/interfaces/ipsana.py @@ -365,10 +365,10 @@ def get_images(self, num_images, assemble=True): img = self.det.image(evt=evt) else: if self.calibrate: - cmod = self.det.common_mode(evt=evt) + cmpars = None if self.no_cmod: - cmod[1] = 0 - img = self.det.calib(evt=evt, cmpars=cmod) + cmpars = 0 + img = self.det.calib(evt=evt, cmpars) else: img = self.det.raw(evt=evt) if self.det_type == 'epix10k2M': From b4551fe108d2b3f37996f6bef33c12dd5ca40642 Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Thu, 14 Sep 2023 14:51:39 -0700 Subject: [PATCH 35/57] created FD sketch tasks and workflow. 
--- btx/processing/freqdir.py | 5 +++-- dags/frequent_direction_sketch.py | 27 ++++++++++++++++++++++++ scripts/tasks.py | 34 +++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 dags/frequent_direction_sketch.py diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 194210e91..0aedbd523 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1305,7 +1305,7 @@ class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. """ - def __init__(self, start_offset, num_imgs, exp, run, det_type, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile): + def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile): self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1399,6 +1399,7 @@ def runMe(self): filenameTest3 = self.comm.allgather(filenameTest3) print("TEST 3: ", self.rank, filenameTest3) + def visualizeMe(self): #UMAP STEP ########################################################################################## if self.rank==0: @@ -1426,7 +1427,7 @@ def runMe(self): visMe.userSave() et = time.perf_counter() print("UMAP HTML Generation Processing time: {}".format(et - st)) - print("TOTAL PROCESING TIME: {}".format(et - stfull)) + #print("TOTAL PROCESING TIME: {}".format(et - stfull)) class FD_ImageProcessing: def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile): diff --git a/dags/frequent_direction_sketch.py b/dags/frequent_direction_sketch.py new file mode 100644 index 000000000..02049a8d0 --- /dev/null +++ b/dags/frequent_direction_sketch.py @@ -0,0 +1,27 @@ +from datetime import datetime +import os +from airflow import DAG +from plugins.jid import JIDSlurmOperator + +# DAG SETUP +description='BTX frequent direction sketch DAG' +dag_name = os.path.splitext(os.path.basename(__file__))[0] + +dag = DAG( + dag_name, + start_date=datetime( 2022,4,1 ), + schedule_interval=None, + description=description, + ) + + +# Tasks SETUP +task_id='draw_sketch' +draw_sketch = JIDSlurmOperator(task_id=task_id, dag=dag) + +task_id='show_sketch' +show_sketch = JIDSlurmOperator(task_id = task_id, dag=dag) + + +# Draw the DAG +draw_sketch >> show_sketch \ No newline at end of file diff --git a/scripts/tasks.py b/scripts/tasks.py index 3908f4d3e..6e82f3689 100755 --- a/scripts/tasks.py +++ b/scripts/tasks.py @@ -509,3 +509,37 @@ def timetool_correct(config): logger.info('No model found! Will return the nominal delay uncorrected!') tt.timetool_correct(run, nominal, model, figs) + +def draw_sketch(config): + from btx.processing.freqdir import WrapperFullFD + setup = config.setup + task = config.draw_sketch + """ Perform Frequent Direction Sketching on run. 
""" + taskdir = os.path.join(setup.root_dir, 'draw_sketch') + os.makedirs(taskdir, exist_ok=True) + fd = WrapperFullFD(exp=setup.exp, run=setup.run, det_type=setup.det_type, + task.start_offset, task.num_imgs, task.writeToHere, task.grabImgSteps, + task.num_components, task.alpha, task.rankAdapt, task.rankAdaptMinError, + task.downsample, task.bin_factor, task.threshold, task.eluThreshold, + task.eluAlpha, task.normalizeIntensity, task.noZeroIntensity, + task.minIntensity, task.samplingFactor, task.divBy, task.thresholdQuantile) + logger.info(f'Performing Frequent Direction Sketching for run {setup.run} of {setup.exp}...') + fd.runMe() + logger.debug('Done!') + +def show_sketch(): + from btx.processing.freqdir import WrapperFullFD + setup = config.setup + task = config.show_sketch + """ Display Sketch. """ + taskdir = os.path.join(setup.root_dir, 'show_sketch') + os.makedirs(taskdir, exist_ok=True) + fd = WrapperFullFD(exp=setup.exp, run=setup.run, det_type=setup.det_type, + task.start_offset, task.num_imgs, task.writeToHere, task.grabImgSteps, + task.num_components, task.alpha, task.rankAdapt, task.rankAdaptMinError, + task.downsample, task.bin_factor, task.threshold, task.eluThreshold, + task.eluAlpha, task.normalizeIntensity, task.noZeroIntensity, + task.minIntensity, task.samplingFactor, task.divBy, task.thresholdQuantile) + logger.info(f'Display Sketch for run {setup.run} of {setup.exp}...') + fd.visualizeMe() + logger.debug('Done!') \ No newline at end of file From 71748dadc2aca149f16518a9978037a2b511623d Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Thu, 14 Sep 2023 14:55:31 -0700 Subject: [PATCH 36/57] created FD sketch tasks and workflow. --- scripts/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/tasks.py b/scripts/tasks.py index 6e82f3689..7c276b81a 100755 --- a/scripts/tasks.py +++ b/scripts/tasks.py @@ -515,7 +515,7 @@ def draw_sketch(config): setup = config.setup task = config.draw_sketch """ Perform Frequent Direction Sketching on run. """ - taskdir = os.path.join(setup.root_dir, 'draw_sketch') + taskdir = os.path.join(setup.root_dir, 'sketch') os.makedirs(taskdir, exist_ok=True) fd = WrapperFullFD(exp=setup.exp, run=setup.run, det_type=setup.det_type, task.start_offset, task.num_imgs, task.writeToHere, task.grabImgSteps, @@ -532,7 +532,7 @@ def show_sketch(): setup = config.setup task = config.show_sketch """ Display Sketch. """ - taskdir = os.path.join(setup.root_dir, 'show_sketch') + taskdir = os.path.join(setup.root_dir, 'sketch') os.makedirs(taskdir, exist_ok=True) fd = WrapperFullFD(exp=setup.exp, run=setup.run, det_type=setup.det_type, task.start_offset, task.num_imgs, task.writeToHere, task.grabImgSteps, From d0b8545b1d2954de24ca270ee417dc5b6e61f599 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Fri, 15 Sep 2023 12:05:18 -0700 Subject: [PATCH 37/57] Moved psana. Resolved cmpar bug. 430hz processing time not including visualization step. 
--- btx/interfaces/ipsana.py | 4 ++-- btx/processing/dimRed.py | 10 +++++++--- btx/processing/freqdir.py | 29 +++++++++++++++-------------- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/btx/interfaces/ipsana.py b/btx/interfaces/ipsana.py index f404a276b..6570a30d9 100644 --- a/btx/interfaces/ipsana.py +++ b/btx/interfaces/ipsana.py @@ -367,8 +367,8 @@ def get_images(self, num_images, assemble=True): if self.calibrate: cmpars = None if self.no_cmod: - cmpars = 0 - img = self.det.calib(evt=evt, cmpars) + cmpars = [0,0,0] + img = self.det.calib(evt=evt, cmpars=cmpars) else: img = self.det.raw(evt=evt) if self.det_type == 'epix10k2M': diff --git a/btx/processing/dimRed.py b/btx/processing/dimRed.py index 0bd1db85d..493ecc1e8 100644 --- a/btx/processing/dimRed.py +++ b/btx/processing/dimRed.py @@ -39,15 +39,19 @@ def __init__( priming=False, downsample=False, bin_factor=2, - output_dir="" + output_dir="", + psi=None ): self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = start_offset + if psi is None: + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = start_offset + else: + self.psi = psi self.start_offset = start_offset self.priming = priming diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 0aedbd523..dab7adec8 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -138,12 +138,13 @@ def __init__( downsample, bin_factor, samplingFactor, - num_components, + num_components, + psi, ): super().__init__(exp=exp, run=run, det_type=det_type, start_offset=start_offset, num_images=num_imgs, num_components=num_components, batch_size=0, priming=False, - downsample=downsample, bin_factor=bin_factor, output_dir=output_dir) + downsample=downsample, bin_factor=bin_factor, output_dir=output_dir, psi=psi) self.comm = comm self.rank= rank @@ -456,7 +457,7 @@ def write(self): filename : string Name of h5 file where sketch, mean of data, and indices of data processed is written """ - self.comm.barrier() +# self.comm.barrier() filename = self.output_dir + '{}_sketch_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) @@ -493,18 +494,18 @@ class MergeTree: currRun: Current datetime used to identify run """ - def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output_dir, allWriteDirecs, currRun): + def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output_dir, allWriteDirecs, currRun, psi): self.comm = comm self.rank = rank self.size = size self.divBy = divBy - time.sleep(15) +# time.sleep(5) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] - self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, rankAdaptMinError=1, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = None, imgsTracked=None, downsample=False, bin_factor=1, samplingFactor=1) + self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, rankAdaptMinError=1, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = None, imgsTracked=None, downsample=False, 
bin_factor=1, samplingFactor=1, psi=psi) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) @@ -656,7 +657,7 @@ def __init__( # print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) # while(not os.path.isfile(readFile2)): # print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) - time.sleep(15) +# time.sleep(5) with h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] # self.mean = hf["mean"][:] @@ -1358,7 +1359,7 @@ def runMe(self): freqDir = FreqDir(comm= self.comm, rank=self.rank, size = self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, det_type=self.det_type, output_dir=self.writeToHere, num_components=self.num_components, alpha=self.alpha, rankAdapt=self.rankAdapt, rankAdaptMinError = self.rankAdaptMinError, merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, - currRun = self.currRun, samplingFactor=self.samplingFactor, imgData = self.fullImgData, imgsTracked = self.imgsTracked) + currRun = self.currRun, samplingFactor=self.samplingFactor, imgData = self.fullImgData, imgsTracked = self.imgsTracked, psi=self.psi) print("{} STARTING SKETCHING FOR {}".format(self.rank, self.currRun)) st = time.perf_counter() freqDir.run() @@ -1376,7 +1377,7 @@ def runMe(self): for j in range(freqDir.size): allNames.append(fullSketchFilename + str(j) + ".h5") mergeTree = MergeTree(comm=self.comm, rank=self.rank, size=self.size, exp=self.exp, run=self.run, det_type=self.det_type, divBy=self.divBy, readFile = localSketchFilename, - output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun) + output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun, psi=self.psi) st = time.perf_counter() mergeTree.merge() mergedSketchFilename = mergeTree.write() @@ -1393,11 +1394,11 @@ def runMe(self): print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) print("Estimated full processing time for rank {0}/{1}: {2}".format(self.rank, self.size, et - stfull)) - self.comm.barrier() - self.comm.Barrier() - filenameTest3 = random.randint(0, 10) - filenameTest3 = self.comm.allgather(filenameTest3) - print("TEST 3: ", self.rank, filenameTest3) +# self.comm.barrier() +# self.comm.Barrier() +# filenameTest3 = random.randint(0, 10) +# filenameTest3 = self.comm.allgather(filenameTest3) +# print("TEST 3: ", self.rank, filenameTest3) def visualizeMe(self): #UMAP STEP From 8dd6ccc11c3c468ddb67faf02559f720ced800ff Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 17 Sep 2023 14:58:10 -0700 Subject: [PATCH 38/57] Fixed reconstruction error. Removed double psana initialization. Moved thumbnail generation outside of sketching. Other small updates. 
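
The reconstruction-error fix replaces the rank-one norm estimator with a
Gaussian test-matrix estimate: for a matrix M and a k-column matrix Omega of
i.i.d. N(0,1) entries, E||M Omega||_F^2 = k ||M||_F^2, so
sqrt(1/k) * ||M Omega||_F estimates ||M||_F. estimFrobNormJ applies this to
the projection residual without ever forming it densely. A self-contained
sketch of the idea (simplified; the in-tree version threads Omega through a
list of factors):

    import numpy as np

    def estimate_residual_fro(A, G, k=20, seed=0):
        """Estimate ||A - G @ G.T @ A||_F for data A (d x n) and an
        orthonormal basis G (d x ell) from the sketch's left singular vectors."""
        rng = np.random.default_rng(seed)
        omega = rng.standard_normal((A.shape[1], k))  # Gaussian test matrix
        y = A @ omega                                 # residual applied to Omega,
        y_hat = G @ (G.T @ y)                         # evaluated in two cheap pieces
        return np.sqrt(1.0 / k) * np.linalg.norm(y - y_hat, "fro")

Dividing by ||A||_F, as lowMemoryReconstructionErrorScaled does, gives the
relative error that feeds the rank-adaptation check against rankAdaptMinError.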
--- btx/processing/freqdir.py | 369 +++++++++++++++++++++++++------------- 1 file changed, 240 insertions(+), 129 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index dab7adec8..70402e87a 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -248,6 +248,7 @@ def update_model(self, X): self.rotate() if canRankAdapt and self.rankAdapt: reconError = np.sqrt(self.lowMemoryReconstructionErrorScaled(copyBatch)) + print("RANK ADAPT RECON ERROR: ", reconError) if (reconError > self.rankAdaptMinError): self.increaseEll = True self.sketch[self.nextZeroRow,:] = row @@ -329,76 +330,96 @@ def reconstructionError(self, matrixCentered): matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) - def lowMemoryReconstructionErrorScaled(self, matrixCentered): - """ - Compute the low memory reconstruction error of the matrix sketch - against given data. This is the same as reconstructionError, - but estimates the norm computation and does not scale by the - minimum projection matrix, but rather by the matrix norm itself. - - Parameters - ---------- - matrixCentered: ndarray - Data to compare matrix sketch to +# def lowMemoryReconstructionErrorScaled(self, matrixCentered): +# """ +# Compute the low memory reconstruction error of the matrix sketch +# against given data. This is the same as reconstructionError, +# but estimates the norm computation and does not scale by the +# minimum projection matrix, but rather by the matrix norm itself. +# +# Parameters +# ---------- +# matrixCentered: ndarray +# Data to compare matrix sketch to +# +# Returns +# ------- +# float, +# Data subtracted by data projected onto sketched space, scaled by matrix elements +# """ +# matSketch = self.sketch[:self.ell, :] +# print("RANK ADAPTIVE SHAPE:",matrixCentered.shape, matSketch.shape) +## k = 10 +# matrixCenteredT = matrixCentered.T +# matSketchT = matSketch.T +# U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) +## G = U[:,:k] +# G = U +# return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 50)/ +# np.linalg.norm(matrixCenteredT, 'fro')**2) - Returns - ------- - float, - Data subtracted by data projected onto sketched space, scaled by matrix elements - """ - matSketch = self.sketch - k = 10 + def lowMemoryReconstructionErrorScaled(self, matrixCentered): + matSketch = self.sketch[:self.ell, :] matrixCenteredT = matrixCentered.T matSketchT = matSketch.T U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) - G = U[:,:k] - return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 50)/ - np.linalg.norm(matrixCenteredT, 'fro')**2) - - def estimFrobNormSquared(self, addMe, arrs, its): - """ - Estimate the Frobenius Norm of product of arrs matrices - plus addME matrix using its iterations. - - Parameters - ---------- - arrs: list of ndarray - Matrices to multiply together + G = U + return self.estimFrobNormJ(matrixCenteredT, [G,G.T,matrixCenteredT], 20)/np.linalg.norm(matrixCenteredT, 'fro') - addMe: ndarray - Matrix to add to others - - its: int - Number of iterations to average over - - Returns - ------- - sumMe/its*no_rows : float - Estimate of frobenius norm of product - of arrs matrices plus addMe matrix - - Notes - ----- - Frobenius estimation is the expected value of matrix - multiplied by random vector from multivariate normal distribution - based on [1]. 
- - [1] Norm and Trace Estimation with Random Rank-one Vectors - Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix - Analysis and Applications 2021 42:1, 202-223 - """ - no_rows = arrs[-1].shape[1] - v = np.random.normal(size=no_rows) - v_hat = v / np.linalg.norm(v) + def estimFrobNormJ(self, addMe, arrs, k): + m, n = addMe.shape + randMat = np.random.normal(0, 1, size=(n, k)) + minusMe = addMe @ randMat sumMe = 0 - for j in range(its): - v = np.random.normal(size=no_rows) - v_hat = v / np.linalg.norm(v) - v_addMe = addMe @ v_hat - for arr in arrs[::-1]: - v_hat = arr @ v_hat - sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 - return sumMe/its*no_rows + for arr in arrs[::-1]: + randMat = arr @ randMat + sumMe += math.sqrt(1/k) * np.linalg.norm(randMat - minusMe, 'fro') + return sumMe + +# def estimFrobNormSquared(self, addMe, arrs, its): +# """ +# Estimate the Frobenius Norm of product of arrs matrices +# plus addME matrix using its iterations. +# +# Parameters +# ---------- +# arrs: list of ndarray +# Matrices to multiply together +# +# addMe: ndarray +# Matrix to add to others +# +# its: int +# Number of iterations to average over +# +# Returns +# ------- +# sumMe/its*no_rows : float +# Estimate of frobenius norm of product +# of arrs matrices plus addMe matrix +# +# Notes +# ----- +# Frobenius estimation is the expected value of matrix +# multiplied by random vector from multivariate normal distribution +# based on [1]. +# +# [1] Norm and Trace Estimation with Random Rank-one Vectors +# Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix +# Analysis and Applications 2021 42:1, 202-223 +# """ +# no_rows = arrs[-1].shape[1] +# v = np.random.normal(size=no_rows) +# v_hat = v / np.linalg.norm(v) +# sumMe = 0 +# for j in range(its): +# v = np.random.normal(size=no_rows) +# v_hat = v / np.linalg.norm(v) +# v_addMe = addMe @ v_hat +# for arr in arrs[::-1]: +# v_hat = arr @ v_hat +# sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 +# return sumMe/its*no_rows def gatherFreqDirsSerial(self): @@ -509,7 +530,7 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) - print(self.buffSizes) +# print(self.buffSizes) self.fd.update_model(self.data.T) @@ -620,7 +641,6 @@ class ApplyCompression: non-downsampled data for thumbnail generation components: Principal Components of matrix sketch processedData: Data projected onto matrix sketch range - smallImages: Downsampled images for visualization purposes """ def __init__( @@ -637,7 +657,6 @@ def __init__( output_dir, currRun, imgData, - thumbnailData, ): self.comm = comm @@ -657,7 +676,7 @@ def __init__( # print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) # while(not os.path.isfile(readFile2)): # print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) -# time.sleep(5) + time.sleep(5) with h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] # self.mean = hf["mean"][:] @@ -666,12 +685,10 @@ def __init__( self.components = Vt self.processedData = None - self.smallImgs = None self.imageIndicesProcessed = [] self.imgData = imgData - self.thumbnailData = thumbnailData def run(self): @@ -680,11 +697,8 @@ def run(self): Note: If-Else statement is from previous/future work enabling streaming processing. 
""" - if self.smallImgs is None: - self.smallImgs = self.thumbnailData - else: - self.smallImgs = np.concatenate((self.smallImgs, self.thumbnailData), axis=0) self.apply_compression(self.imgData) + return self.data def apply_compression(self, X): """ @@ -707,7 +721,6 @@ def write(self): filename = self.output_dir + '{}_ProjectedData_{}.h5'.format(self.currRun, self.rank) with h5py.File(filename, 'w') as hf: hf.create_dataset("ProjectedData", data=self.processedData) - hf.create_dataset("SmallImages", data=self.smallImgs) # print("CREATED FILE: ", filename) self.comm.barrier() return filename @@ -883,10 +896,10 @@ def fastABOD(self, pts, nsamples): ac = cpt - apt if math.isclose(np.linalg.norm(ab), 0.0) or math.isclose(np.linalg.norm(ac), 0.0): count += 1 - print("TOO CLOSE") +# print("TOO CLOSE") continue outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) - print("CURRENT POINT: ", pts[a], test_list, outlier_factors, np.var(np.array(outlier_factors))) +# print("CURRENT POINT: ", pts[a], test_list, outlier_factors, np.var(np.array(outlier_factors))) if(len(outlier_factors)==0): abofs.append(np.inf) else: @@ -899,7 +912,7 @@ def getOutliers(self, lst): # quart10 = lstCopy[len(lstCopy)//divBy] lstQuant = np.quantile(np.array(lst), self.outlierQuantile) - print("AIDJWOIJDAOWIDJWAOIDJAWOIDWJA", lstQuant, lst) +# print("AIDJWOIJDAOWIDJWAOIDJAWOIDWJA", lstQuant, lst) outlierInds = [] notOutlierInds = [] for j in range(len(lst)): @@ -907,8 +920,8 @@ def getOutliers(self, lst): outlierInds.append(j) else: notOutlierInds.append(j) - print("OUTLIER INDS: ", outlierInds) - print("NOT OUTLIER INDS: ", notOutlierInds) +# print("OUTLIER INDS: ", outlierInds) +# print("NOT OUTLIER INDS: ", notOutlierInds) return np.array(outlierInds), np.array(notOutlierInds) def genHist(self, vals, endClass): @@ -1281,27 +1294,6 @@ def userShow(self): output_notebook() show(self.viewResults) -def profile(filename=None, comm=MPI.COMM_WORLD): - def prof_decorator(f): - def wrap_f(*args, **kwargs): - pr = cProfile.Profile() - pr.enable() - result = f(*args, **kwargs) - pr.disable() - - if filename is None: - pr.print_stats() - else: - filename_r = filename + ".{}".format(comm.rank) - pr.dump_stats(filename_r) - - return result - return wrap_f - return prof_decorator - -def id_generator(size=6, chars=string.ascii_uppercase + string.digits): - return ''.join(random.choice(chars) for _ in range(size)) - class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. @@ -1347,12 +1339,99 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile) self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) -# @profile(filename="fullFD_profile") +# def lowMemoryReconstructionErrorScaled(self, matrixCentered, matSketch): +# """ +# Compute the low memory reconstruction error of the matrix sketch +# against given data. This is the same as reconstructionError, +# but estimates the norm computation and does not scale by the +# minimum projection matrix, but rather by the matrix norm itself. 
+# +# Parameters +# ---------- +# matrixCentered: ndarray +# Data to compare matrix sketch to +# +# Returns +# ------- +# float, +# Data subtracted by data projected onto sketched space, scaled by matrix elements +# """ +## k = 10 +# matrixCenteredT = matrixCentered.T +# matSketchT = matSketch.T +# U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) +## G = U[:,:k] +# G = U +# return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 50)/ +# np.linalg.norm(matrixCenteredT, 'fro')**2) +# +# def estimFrobNormSquared(self, addMe, arrs, its): +# """ +# Estimate the Frobenius Norm of product of arrs matrices +# plus addME matrix using its iterations. +# +# Parameters +# ---------- +# arrs: list of ndarray +# Matrices to multiply together +# +# addMe: ndarray +# Matrix to add to others +## +# its: int +# Number of iterations to average over +# +# Returns +# ------- +# sumMe/its*no_rows : float +# Estimate of frobenius norm of product +# of arrs matrices plus addMe matrix +# +# Notes +# ----- +# Frobenius estimation is the expected value of matrix +# multiplied by random vector from multivariate normal distribution +# based on [1]. +# +# [1] Norm and Trace Estimation with Random Rank-one Vectors +# Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix +# Analysis and Applications 2021 42:1, 202-223 +# """ +# no_rows = arrs[-1].shape[1] +# v = np.random.normal(size=no_rows) +# v_hat = v / np.linalg.norm(v) +# sumMe = 0 +# for j in range(its): +# v = np.random.normal(size=no_rows) +# v_hat = v / np.linalg.norm(v) +# v_addMe = addMe @ v_hat +# for arr in arrs[::-1]: +# v_hat = arr @ v_hat +# sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 +# return sumMe/its*no_rows + + def lowMemoryReconstructionErrorScaled(self, matrixCentered, matSketch): + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) + G = U + return self.estimFrobNormJ(matrixCenteredT, [G,G.T,matrixCenteredT], 20)/np.linalg.norm(matrixCenteredT, 'fro') + + def estimFrobNormJ(self, addMe, arrs, k): + m, n = addMe.shape + randMat = np.random.normal(0, 1, size=(n, k)) + minusMe = addMe @ randMat + sumMe = 0 + for arr in arrs[::-1]: + randMat = arr @ randMat + sumMe += math.sqrt(1/k) * np.linalg.norm(randMat - minusMe, 'fro') + return sumMe + def runMe(self): stfull = time.perf_counter() startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - self.fullImgData, self.fullThumbnailData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps) + self.fullImgData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=False) #SKETCHING STEP ########################################################################################## @@ -1386,9 +1465,9 @@ def runMe(self): #PROJECTION STEP ########################################################################################## - appComp = ApplyCompression(comm=self.comm, rank = self.rank, size=self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run,det_type=self.det_type, readFile = mergedSketchFilename, output_dir = self.writeToHere, currRun = self.currRun, imgData = self.fullImgData, thumbnailData = self.fullThumbnailData) + appComp = ApplyCompression(comm=self.comm, rank = self.rank, size=self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, 
exp=self.exp, run=self.run,det_type=self.det_type, readFile = mergedSketchFilename, output_dir = self.writeToHere, currRun = self.currRun, imgData = self.fullImgData) st = time.perf_counter() - appComp.run() + self.matSketch = appComp.run() appComp.write() et = time.perf_counter() print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) @@ -1400,12 +1479,25 @@ def runMe(self): # filenameTest3 = self.comm.allgather(filenameTest3) # print("TEST 3: ", self.rank, filenameTest3) + def addThumbnailsToProjectH5(self): +# print("Gathering thumbnails") + startingPoint = self.start_offset + self.num_imgs*self.rank//self.size + _,self.fullThumbnailData,_ = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) + file_name = "/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData_{}.h5".format(self.currRun, self.rank) + f1 = h5py.File(file_name, 'r+') + f1.create_dataset("SmallImages", data=self.fullThumbnailData) + f1.close() + self.comm.barrier() + + def visualizeMe(self): + st = time.perf_counter() + self.addThumbnailsToProjectH5() #UMAP STEP ########################################################################################## if self.rank==0: - print("here 1") - st = time.perf_counter() + +# print("here 1") skipSize = 8 numImgsToUse = int(self.num_imgs/skipSize) @@ -1535,7 +1627,7 @@ def split_range(self, start, end, num_tuples): tuples.append((last_batch_start, last_batch_end)) return tuples - def get_formatted_images(self, startInd, n, num_steps): + def get_formatted_images(self, startInd, n, num_steps, getThumbnails): """ Fetch n - x image segments from run, where x is the number of 'dead' images. @@ -1566,7 +1658,8 @@ def get_formatted_images(self, startInd, n, num_steps): imgs = imgs[ [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] ] - thumbnails = self.assembleImgsToSave(imgs) + if getThumbnails: + thumbnails = self.assembleImgsToSave(imgs) if self.downsample: imgs = bin_data(imgs, self.bin_factor) @@ -1574,25 +1667,43 @@ def get_formatted_images(self, startInd, n, num_steps): img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T img_batch[img_batch<0] = 0 - num_valid_thumbnails, tx, ty = thumbnails.shape - thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T - - nimg_batch = [] - nthumbnail_batch = [] - for img, thumbnail in zip(img_batch.T, thumbnail_batch.T): - currIntensity = np.sum(img.flatten(), dtype=np.double) - nimg = self.imageProcessor.processImg(img, currIntensity) - nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) - if nimg is not None: - nimg_batch.append(nimg) - nthumbnail_batch.append(nthumbnail) - nimg_batch = np.array(nimg_batch).T - nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) - if fullimgs is None: - fullimgs = nimg_batch - fullthumbnails = nthumbnail_batch + if getThumbnails: + num_valid_thumbnails, tx, ty = thumbnails.shape + thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T + + if getThumbnails: + nimg_batch = [] + nthumbnail_batch = [] + for img, thumbnail in zip(img_batch.T, thumbnail_batch.T): + currIntensity = np.sum(img.flatten(), dtype=np.double) + nimg = self.imageProcessor.processImg(img, currIntensity) + nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) + if nimg is not None: + nimg_batch.append(nimg) + nthumbnail_batch.append(nthumbnail) + nimg_batch = 
np.array(nimg_batch).T + nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) + if fullimgs is None: + fullimgs = nimg_batch + fullthumbnails = nthumbnail_batch + else: + fullimgs = np.hstack((fullimgs, nimg_batch)) + fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) else: - fullimgs = np.hstack((fullimgs, nimg_batch)) - fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) + nimg_batch = [] + for img in img_batch.T: + currIntensity = np.sum(img.flatten(), dtype=np.double) + nimg = self.imageProcessor.processImg(img, currIntensity) + if nimg is not None: + nimg_batch.append(nimg) + nimg_batch = np.array(nimg_batch).T + if fullimgs is None: + fullimgs = nimg_batch + else: + fullimgs = np.hstack((fullimgs, nimg_batch)) + print("Images tracked:", imgsTracked) - return (fullimgs, fullthumbnails, imgsTracked) + if getThumbnails: + return (fullimgs, fullthumbnails, imgsTracked) + else: + return (fullimgs, imgsTracked) From b8f9ab4d7589d15c8329127c193d951b2d5dbfbc Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 24 Sep 2023 08:39:55 -0700 Subject: [PATCH 39/57] Checkpoint. I don't think there are any major changes. --- btx/processing/freqdir.py | 85 ++++++++++++++++++++++++++++++++------- scripts/tasks.py | 32 ++++++++++----- 2 files changed, 92 insertions(+), 25 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 70402e87a..b4b4fb5d0 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -140,12 +140,28 @@ def __init__( samplingFactor, num_components, psi, + usePSI ): - super().__init__(exp=exp, run=run, det_type=det_type, start_offset=start_offset, - num_images=num_imgs, num_components=num_components, batch_size=0, priming=False, - downsample=downsample, bin_factor=bin_factor, output_dir=output_dir, psi=psi) - +######################## + if usePSI: + super().__init__(exp=exp, run=run, det_type=det_type, start_offset=start_offset, + num_images=num_imgs, num_components=num_components, batch_size=0, priming=False, + downsample=downsample, bin_factor=bin_factor, output_dir=output_dir, psi=psi) + else: + self.start_offset = start_offset + self.downsample = False + self.bin_factor = 0 + self.output_dir = output_dir + self.num_components = num_components + self.num_features,self.num_images = imgData.shape + print("NUM IMAGES: ", self.num_images) + + self.task_durations = dict({}) + + self.num_incorporated_images = 0 + self.outliers, self.pc_data = [], [] +######################## self.comm = comm self.rank= rank self.size = size @@ -526,7 +542,7 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] - self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, rankAdaptMinError=1, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = None, imgsTracked=None, downsample=False, bin_factor=1, samplingFactor=1, psi=psi) + self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, rankAdaptMinError=1, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = None, imgsTracked=None, downsample=False, bin_factor=1, samplingFactor=1, psi=psi, usePSI=True) sendbuf = 
self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) @@ -798,7 +814,6 @@ def update(self, vec): - class visualizeFD: """ Visualize FD Dimension Reduction using UMAP and DBSCAN @@ -1298,7 +1313,7 @@ class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. """ - def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile): + def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, usePSI=True): self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1325,11 +1340,16 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) - self.psi.counter = self.start_offset + self.num_imgs*self.rank//self.size self.imgsTracked = [] self.grabImgSteps = grabImgSteps + self.usePSI = usePSI + if usePSI: + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi.counter = self.start_offset + self.num_imgs*self.rank//self.size + else: + self.psi = None + if self.rank==0: self.currRun = datetime.now().strftime("%y%m%d%H%M%S") else: @@ -1427,18 +1447,43 @@ def estimFrobNormJ(self, addMe, arrs, k): sumMe += math.sqrt(1/k) * np.linalg.norm(randMat - minusMe, 'fro') return sumMe + def retrieveImages(self): + startingPoint = self.start_offset + self.num_imgs*self.rank//self.size + self.fullImgData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=False) + + def genSynthData(self): + self.fullImgData = np.random.rand(70000, 100000//self.size) + self.imgsTracked = [(0, self.rank)] + + def genDecayingSVD(self): + A = np.random.rand(matrixSize, matrixSize)\n + A = A.T @ A\n + eigVals, eigVecs = np.linalg.eig(A)\n + diag_entries = list(np.random.rand(matrixSize))\n + diag_entries.sort()\n + diag_entries = np.array(diag_entries[::-1])\n + D = np.diag(diag_entries) + np.eye(matrixSize)\n + return (eigVecs @ (D) @ eigVecs.T) + def runMe(self): + stfull = time.perf_counter() - startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - self.fullImgData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=False) + #DATA RETRIEVAL STEP + ########################################################################################## + if self.usePSI: + self.retrieveImages() + else: + self.genSynthData() + et = time.perf_counter() + print("Estimated time for data retrieval for rank {0}/{1}: {2}".format(self.rank, self.size, et - stfull)) #SKETCHING STEP ########################################################################################## freqDir = FreqDir(comm= self.comm, rank=self.rank, size = self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run, det_type=self.det_type, output_dir=self.writeToHere, num_components=self.num_components, alpha=self.alpha, rankAdapt=self.rankAdapt, 
rankAdaptMinError = self.rankAdaptMinError, merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, - currRun = self.currRun, samplingFactor=self.samplingFactor, imgData = self.fullImgData, imgsTracked = self.imgsTracked, psi=self.psi) + currRun = self.currRun, samplingFactor=self.samplingFactor, imgData = self.fullImgData, imgsTracked = self.imgsTracked, psi=self.psi, usePSI=self.usePSI) print("{} STARTING SKETCHING FOR {}".format(self.rank, self.currRun)) st = time.perf_counter() freqDir.run() @@ -1489,7 +1534,6 @@ def addThumbnailsToProjectH5(self): f1.close() self.comm.barrier() - def visualizeMe(self): st = time.perf_counter() self.addThumbnailsToProjectH5() @@ -1649,25 +1693,35 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): fullthumbnails = None imgsTracked = [] runs = self.split_range(startInd, startInd+n, num_steps) + print(runs) for runStart, runEnd in runs: +# print("RETRIEVING: [", runStart, ":", runEnd,"]") self.psi.counter = runStart imgsTracked.append((runStart, runEnd)) +# print("getting images") imgs = self.psi.get_images(runEnd-runStart, assemble=False) +# print("Removing nan images") imgs = imgs[ [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] ] + if getThumbnails: +# print("Assembling thumbnails") thumbnails = self.assembleImgsToSave(imgs) if self.downsample: +# print("Downsampling images") imgs = bin_data(imgs, self.bin_factor) +# print("Flattening images") num_valid_imgs, p, x, y = imgs.shape img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T +# print("Image values less than 0 setting to 0") img_batch[img_batch<0] = 0 if getThumbnails: +# print("FLattening thumbnails") num_valid_thumbnails, tx, ty = thumbnails.shape thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T @@ -1693,16 +1747,19 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): nimg_batch = [] for img in img_batch.T: currIntensity = np.sum(img.flatten(), dtype=np.double) +# print("Starting image processing of size {}".format(img_batch.T.shape)) nimg = self.imageProcessor.processImg(img, currIntensity) if nimg is not None: nimg_batch.append(nimg) nimg_batch = np.array(nimg_batch).T +# print("hstacking") if fullimgs is None: + fullimgs = nimg_batch else: fullimgs = np.hstack((fullimgs, nimg_batch)) - print("Images tracked:", imgsTracked) +# print("Images tracked:", imgsTracked) if getThumbnails: return (fullimgs, fullthumbnails, imgsTracked) else: diff --git a/scripts/tasks.py b/scripts/tasks.py index 7c276b81a..0603a2498 100755 --- a/scripts/tasks.py +++ b/scripts/tasks.py @@ -518,11 +518,16 @@ def draw_sketch(config): taskdir = os.path.join(setup.root_dir, 'sketch') os.makedirs(taskdir, exist_ok=True) fd = WrapperFullFD(exp=setup.exp, run=setup.run, det_type=setup.det_type, - task.start_offset, task.num_imgs, task.writeToHere, task.grabImgSteps, - task.num_components, task.alpha, task.rankAdapt, task.rankAdaptMinError, - task.downsample, task.bin_factor, task.threshold, task.eluThreshold, - task.eluAlpha, task.normalizeIntensity, task.noZeroIntensity, - task.minIntensity, task.samplingFactor, task.divBy, task.thresholdQuantile) + start_offset=task.start_offset, num_imgs=task.num_imgs, + writeToHere=task.writeToHere, grabImgSteps=task.grabImgSteps, + num_components=task.num_components, alpha=task.alpha, + rankAdapt=task.rankAdapt, rankAdaptMinError=task.rankAdaptMinError, + downsample=task.downsample, bin_factor=task.bin_factor, + threshold=task.threshold, 
eluThreshold=task.eluThreshold, + eluAlpha=task.eluAlpha, normalizeIntensity=task.normalizeIntensity, + noZeroIntensity=task.noZeroIntensity, minIntensity=task.minIntensity, + samplingFactor=task.samplingFactor, divBy=task.divBy, + thresholdQuantile=task.thresholdQuantile) logger.info(f'Performing Frequent Direction Sketching for run {setup.run} of {setup.exp}...') fd.runMe() logger.debug('Done!') @@ -535,11 +540,16 @@ def show_sketch(): taskdir = os.path.join(setup.root_dir, 'sketch') os.makedirs(taskdir, exist_ok=True) fd = WrapperFullFD(exp=setup.exp, run=setup.run, det_type=setup.det_type, - task.start_offset, task.num_imgs, task.writeToHere, task.grabImgSteps, - task.num_components, task.alpha, task.rankAdapt, task.rankAdaptMinError, - task.downsample, task.bin_factor, task.threshold, task.eluThreshold, - task.eluAlpha, task.normalizeIntensity, task.noZeroIntensity, - task.minIntensity, task.samplingFactor, task.divBy, task.thresholdQuantile) + start_offset=task.start_offset, num_imgs=task.num_imgs, + writeToHere=task.writeToHere, grabImgSteps=task.grabImgSteps, + num_components=task.num_components, alpha=task.alpha, + rankAdapt=task.rankAdapt, rankAdaptMinError=task.rankAdaptMinError, + downsample=task.downsample, bin_factor=task.bin_factor, + threshold=task.threshold, eluThreshold=task.eluThreshold, + eluAlpha=task.eluAlpha, normalizeIntensity=task.normalizeIntensity, + noZeroIntensity=task.noZeroIntensity, minIntensity=task.minIntensity, + samplingFactor=task.samplingFactor, divBy=task.divBy, + thresholdQuantile=task.thresholdQuantile) logger.info(f'Display Sketch for run {setup.run} of {setup.exp}...') fd.visualizeMe() - logger.debug('Done!') \ No newline at end of file + logger.debug('Done!') From 068389651bdb21baabe7167982d29158c98b9ad0 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 3 Oct 2023 12:48:52 -0700 Subject: [PATCH 40/57] Checkpoint. Runs well and synthetic data fixed. 
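
The "synthetic data fixed" part adds compDecayingSVD alongside the
uniform-noise generator: it builds test data with an exponentially decaying
singular spectrum, so the sketch has genuine low-rank structure to recover.
A standalone sketch of the construction (sizes illustrative; the in-tree
version also perturbs the left factor and splits the samples across MPI
ranks):

    import numpy as np

    def decaying_svd_data(n_feats=512, n_samps=2048, seed=0):
        """Toy (n_feats x n_samps) matrix whose singular values fall off
        roughly like 2**(-16*j/n_feats); assumes n_samps >= n_feats."""
        rng = np.random.default_rng(seed)
        q1, _ = np.linalg.qr(rng.random((n_samps, n_feats)))  # orthonormal left factor
        q2, _ = np.linalg.qr(rng.random((n_feats, n_feats)))  # orthogonal right factor
        s = np.sort(rng.random(n_feats))[::-1]                # decreasing base spectrum
        s *= 2.0 ** (-16.0 * np.arange(1, n_feats + 1) / n_feats)
        return (q1 @ np.diag(s) @ q2).T                       # features x samples, like fullImgData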
--- btx/processing/freqdir.py | 227 +++++++++++++++++++++++++++++++------- 1 file changed, 190 insertions(+), 37 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index b4b4fb5d0..762ef85f0 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -308,10 +308,12 @@ def rotate(self): toShrink[-1] = 0 toShrink = sqrt(toShrink) toShrink[:int(self.ell*(1-self.alpha))] = sCopy[:int(self.ell*(1-self.alpha))] - self.sketch[:self.ell:,:] = dot(diag(toShrink), Vt[:self.ell,:]) + #self.sketch[:self.ell:,:] = dot(diag(toShrink), Vt[:self.ell,:]) #JOHN: Removed this extra colon 10/01/2023 + self.sketch[:self.ell,:] = dot(diag(toShrink), Vt[:self.ell,:]) self.sketch[self.ell:,:] = 0 self.nextZeroRow = self.ell else: + print(S.shape, self.ell) self.sketch[:ssize,:] = diag(s) @ Vt[:ssize,:] self.sketch[ssize:,:] = 0 self.nextZeroRow = ssize @@ -531,18 +533,18 @@ class MergeTree: currRun: Current datetime used to identify run """ - def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output_dir, allWriteDirecs, currRun, psi): + def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output_dir, allWriteDirecs, currRun, psi, usePSI): self.comm = comm self.rank = rank self.size = size self.divBy = divBy -# time.sleep(5) + time.sleep(10) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] - self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, rankAdaptMinError=1, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = None, imgsTracked=None, downsample=False, bin_factor=1, samplingFactor=1, psi=psi, usePSI=True) + self.fd = FreqDir(comm=comm, rank=rank, size=size, num_imgs=0, start_offset=0, currRun = currRun, rankAdapt=False, rankAdaptMinError=1, exp=exp, run=run, det_type=det_type, num_components=self.data.shape[0], alpha=0.2, merger=True, mergerFeatures = self.data.shape[1], output_dir=output_dir, imgData = np.random.rand(2, 2), imgsTracked=None, downsample=False, bin_factor=1, samplingFactor=1, psi=psi, usePSI=usePSI) sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) @@ -692,7 +694,7 @@ def __init__( # print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) # while(not os.path.isfile(readFile2)): # print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) - time.sleep(5) + time.sleep(10) with h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] # self.mean = hf["mean"][:] @@ -963,6 +965,8 @@ def genUMAP(self): imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) + print("AOIDWJOIAWDJ", len(imgs), len(projections)) + intensities = [] for img in imgs: intensities.append(np.sum(img.flatten())) @@ -981,7 +985,8 @@ def genUMAP(self): n_neighbors=self.umap_n_neighbors, random_state=self.umap_random_state, n_components=2, - min_dist=0.25, +# min_dist=0.25, + min_dist=0.1, ).fit_transform(self.projections) self.labels = hdbscan.HDBSCAN( @@ -1357,7 +1362,7 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.currRun = self.comm.bcast(self.currRun, root=0) self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = 
self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile) - self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) + self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 150, thumbnailWidth = 150) # def lowMemoryReconstructionErrorScaled(self, matrixCentered, matSketch): # """ @@ -1455,15 +1460,36 @@ def genSynthData(self): self.fullImgData = np.random.rand(70000, 100000//self.size) self.imgsTracked = [(0, self.rank)] - def genDecayingSVD(self): - A = np.random.rand(matrixSize, matrixSize)\n - A = A.T @ A\n - eigVals, eigVecs = np.linalg.eig(A)\n - diag_entries = list(np.random.rand(matrixSize))\n - diag_entries.sort()\n - diag_entries = np.array(diag_entries[::-1])\n - D = np.diag(diag_entries) + np.eye(matrixSize)\n - return (eigVecs @ (D) @ eigVecs.T) +# def genDecayingSVD(self): +# numFeats = 70000 +# numSamps = 100000//self.size +# A = np.random.rand(matrixSize, matrixSize) +## A = A.T @ A +# eigVals, eigVecs = np.linalg.eig(A) +# diag_entries = list(np.random.rand(matrixSize)) +## diag_entries.sort() +# multMe = np.ones(numSamps) +## diag_entries = np.array(diag_entries[::-1]) +# D = np.diag(diag_entries) + np.eye(matrixSize) +# return (eigVecs @ (D) @ eigVecs.T) + + def compDecayingSVD(self, seedMe, a, b): + numFeats = a + numSamps = b//self.size + perturbation = np.random.rand(numSamps, numFeats)*0.1 + np.random.seed(seedMe) + A1 = np.random.rand(numSamps, numFeats) + Q1, R1 = np.linalg.qr(A1) + Q1 = Q1 + perturbation + A2 = np.random.rand(numFeats, numFeats) #Modify + Q2, R2 = np.linalg.qr(A2) + S = list(np.random.rand(numFeats)) #Modify + S.sort() + S = S[::-1] + for j in range(len(S)): #Modify + S[j] = (2**(-16*(j+1)/len(S)))*S[j] + self.fullImgData = (Q1 @ np.diag(S) @ Q2).T + self.imgsTracked = [(0, numSamps)] def runMe(self): @@ -1471,12 +1497,13 @@ def runMe(self): #DATA RETRIEVAL STEP ########################################################################################## - if self.usePSI: - self.retrieveImages() - else: - self.genSynthData() - et = time.perf_counter() - print("Estimated time for data retrieval for rank {0}/{1}: {2}".format(self.rank, self.size, et - stfull)) +# if self.usePSI: +# self.retrieveImages() +# else: +# self.compDecayingSVD() +## self.genSynthData() +# et = time.perf_counter() +# print("Estimated time for data retrieval for rank {0}/{1}: {2}".format(self.rank, self.size, et - stfull)) #SKETCHING STEP ########################################################################################## @@ -1485,11 +1512,11 @@ def runMe(self): merger=False, mergerFeatures=0, downsample=self.downsample, bin_factor=self.bin_factor, currRun = self.currRun, samplingFactor=self.samplingFactor, imgData = self.fullImgData, imgsTracked = self.imgsTracked, psi=self.psi, usePSI=self.usePSI) print("{} STARTING SKETCHING FOR {}".format(self.rank, self.currRun)) - st = time.perf_counter() + st1 = time.perf_counter() freqDir.run() localSketchFilename = freqDir.write() - et = time.perf_counter() - print("Estimated time for frequent directions rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) + et1 = time.perf_counter() + print("Estimated time for frequent directions rank {0}/{1}: {2}".format(self.rank, self.size, 
et1 - st1)) #MERGING STEP ########################################################################################## @@ -1501,22 +1528,24 @@ def runMe(self): for j in range(freqDir.size): allNames.append(fullSketchFilename + str(j) + ".h5") mergeTree = MergeTree(comm=self.comm, rank=self.rank, size=self.size, exp=self.exp, run=self.run, det_type=self.det_type, divBy=self.divBy, readFile = localSketchFilename, - output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun, psi=self.psi) - st = time.perf_counter() + output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun, psi=self.psi, usePSI=self.usePSI) + st2 = time.perf_counter() mergeTree.merge() mergedSketchFilename = mergeTree.write() - et = time.perf_counter() - print("Estimated time merge tree for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) + et2 = time.perf_counter() + print("Estimated time merge tree for rank {0}/{1}: {2}".format(self.rank, self.size, et2 - st2)) #PROJECTION STEP ########################################################################################## appComp = ApplyCompression(comm=self.comm, rank = self.rank, size=self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run,det_type=self.det_type, readFile = mergedSketchFilename, output_dir = self.writeToHere, currRun = self.currRun, imgData = self.fullImgData) - st = time.perf_counter() + st3 = time.perf_counter() self.matSketch = appComp.run() appComp.write() - et = time.perf_counter() - print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et - st)) - print("Estimated full processing time for rank {0}/{1}: {2}".format(self.rank, self.size, et - stfull)) + et3 = time.perf_counter() + print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et3 - st3)) + print("Estimated full processing time for rank {0}/{1}: {2}, {3}".format(self.rank, self.size, (et1 + et2 + et3 - st1 - st2 - st3), et3 - stfull)) + + return (et1 + et2 + et3 - st1 - st2 - st3) # self.comm.barrier() # self.comm.Barrier() @@ -1528,7 +1557,7 @@ def addThumbnailsToProjectH5(self): # print("Gathering thumbnails") startingPoint = self.start_offset + self.num_imgs*self.rank//self.size _,self.fullThumbnailData,_ = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) - file_name = "/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData_{}.h5".format(self.currRun, self.rank) + file_name = self.writeToHere+"{}_ProjectedData_{}.h5".format(self.currRun, self.rank) f1 = h5py.File(file_name, 'r+') f1.create_dataset("SmallImages", data=self.fullThumbnailData) f1.close() @@ -1543,16 +1572,17 @@ def visualizeMe(self): # print("here 1") - skipSize = 8 + skipSize = 1 numImgsToUse = int(self.num_imgs/skipSize) - visMe = visualizeFD(inputFile="/sdf/data/lcls/ds/mfx/mfxp23120/scratch/winnicki/h5writes/{}_ProjectedData".format(self.currRun), + visMe = visualizeFD(inputFile=self.writeToHere+"{}_ProjectedData".format(self.currRun), outputFile="./UMAPVis_{}.html".format(self.currRun), numImgsToUse=self.num_imgs, nprocs=self.size, userGroupings=[], includeABOD=True, skipSize = skipSize, - umap_n_neighbors=numImgsToUse//40, +# umap_n_neighbors=numImgsToUse//40, + umap_n_neighbors=numImgsToUse//4000, umap_random_state=42, hdbscan_min_samples=int(numImgsToUse*0.75//40), hdbscan_min_cluster_size=int(numImgsToUse//40), @@ -1764,3 +1794,126 @@ def get_formatted_images(self, 
startInd, n, num_steps, getThumbnails): return (fullimgs, fullthumbnails, imgsTracked) else: return (fullimgs, imgsTracked) + + +class SinglePanelDataRetriever: + def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, thumbnailHeight, thumbnailWidth): + self.exp = exp + self.det_type = det_type + self.run = run + self.thumbnailHeight = thumbnailHeight + self.thumbnailWidth = thumbnailWidth + + self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + + self.imageProcessor = imageProcessor + + def split_range(self, start, end, num_tuples): + if start==end: + raise ValueError('Range processing error: start value equals end value, which leads to no images processed.') + return + total_elements = end - start + batch_size = total_elements // num_tuples + tuples = [] + for i in range(num_tuples - 1): + batch_start = start + i * batch_size + batch_end = batch_start + batch_size + tuples.append((batch_start, batch_end)) + last_batch_start = start + (num_tuples - 1) * batch_size + last_batch_end = end + tuples.append((last_batch_start, last_batch_end)) + return tuples + + def get_formatted_images(self, startInd, n, num_steps, getThumbnails): + """ + Fetch n - x image segments from run, where x is the number of 'dead' images. + + Parameters + ---------- + n : int + number of images to retrieve + start_index : int + start index of subsection of data to retrieve + end_index : int + end index of subsection of data to retrieve + + Returns + ------- + ndarray, shape (end_index-start_index, n-x) + n-x retrieved image segments of dimension end_index-start_index + """ + fullimgs = None + fullthumbnails = None + imgsTracked = [] + runs = self.split_range(startInd, startInd+n, num_steps) + print(runs) + for runStart, runEnd in runs: +# print("RETRIEVING: [", runStart, ":", runEnd,"]") + self.psi.counter = runStart + imgsTracked.append((runStart, runEnd)) + +# print("getting images") + imgs = self.psi.get_images(runEnd-runStart, assemble=False) + +# print("Removing nan images") + imgs = imgs[ + [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + ] + + if getThumbnails: + saveMe = [] + for img in imgs: + saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) + thumbnails = np.array(saveMe) + print("thumbaaowdijaoiajw", len(imgs), len(thumbnails)) + + num_valid_imgs, x, y = imgs.shape + img_batch = np.reshape(imgs, (num_valid_imgs, x * y)).T +# print("Image values less than 0 setting to 0") + img_batch[img_batch<0] = 0 + + if getThumbnails: +# print("FLattening thumbnails") + num_valid_thumbnails, tx, ty = thumbnails.shape + thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T + + if getThumbnails: + nimg_batch = [] + nthumbnail_batch = [] + for img, thumbnail in zip(img_batch.T, thumbnail_batch.T): + currIntensity = np.sum(img.flatten(), dtype=np.double) + nimg = self.imageProcessor.processImg(img, currIntensity) + nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) + if nimg is not None: + nimg_batch.append(nimg) + nthumbnail_batch.append(nthumbnail) + nimg_batch = np.array(nimg_batch).T + nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) + if fullimgs is None: + fullimgs = nimg_batch + fullthumbnails = nthumbnail_batch + else: + fullimgs = np.hstack((fullimgs, nimg_batch)) + fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) + else: + nimg_batch = [] + for img in img_batch.T: + currIntensity = np.sum(img.flatten(), 
dtype=np.double) +# print("Starting image processing of size {}".format(img_batch.T.shape)) + nimg = self.imageProcessor.processImg(img, currIntensity) + if nimg is not None: + nimg_batch.append(nimg) + nimg_batch = np.array(nimg_batch).T +# print("hstacking") + if fullimgs is None: + + fullimgs = nimg_batch + else: + fullimgs = np.hstack((fullimgs, nimg_batch)) + +# print("Images tracked:", imgsTracked) + if getThumbnails: + print(fullimgs.shape, fullthumbnails.shape, imgsTracked) + return (fullimgs, fullthumbnails, imgsTracked) + else: + return (fullimgs, imgsTracked) From 9ccfb95fce149dbf5f9cfd27b4654e9649e8257a Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 3 Oct 2023 14:32:54 -0700 Subject: [PATCH 41/57] Separated visualization from sketching file --- btx/processing/freqdir.py | 12 +- btx/processing/vizfreq.py | 544 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 551 insertions(+), 5 deletions(-) create mode 100644 btx/processing/vizfreq.py diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 762ef85f0..2ccd0278a 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -39,8 +39,8 @@ from datetime import datetime -import umap -import hdbscan +#import umap +#import hdbscan from sklearn.cluster import OPTICS, cluster_optics_dbscan from matplotlib import colors @@ -815,7 +815,7 @@ def update(self, vec): self.sketch.push(vec, pi, wi) - +''' class visualizeFD: """ Visualize FD Dimension Reduction using UMAP and DBSCAN @@ -1313,6 +1313,7 @@ def userShow(self): from bokeh.io import output_notebook output_notebook() show(self.viewResults) +''' class WrapperFullFD: """ @@ -1544,7 +1545,7 @@ def runMe(self): et3 = time.perf_counter() print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et3 - st3)) print("Estimated full processing time for rank {0}/{1}: {2}, {3}".format(self.rank, self.size, (et1 + et2 + et3 - st1 - st2 - st3), et3 - stfull)) - + self.addThumbnailsToProjectH5() return (et1 + et2 + et3 - st1 - st2 - st3) # self.comm.barrier() @@ -1562,7 +1563,7 @@ def addThumbnailsToProjectH5(self): f1.create_dataset("SmallImages", data=self.fullThumbnailData) f1.close() self.comm.barrier() - +''' def visualizeMe(self): st = time.perf_counter() self.addThumbnailsToProjectH5() @@ -1595,6 +1596,7 @@ def visualizeMe(self): et = time.perf_counter() print("UMAP HTML Generation Processing time: {}".format(et - st)) #print("TOTAL PROCESING TIME: {}".format(et - stfull)) +''' class FD_ImageProcessing: def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile): diff --git a/btx/processing/vizfreq.py b/btx/processing/vizfreq.py new file mode 100644 index 000000000..14340cc3f --- /dev/null +++ b/btx/processing/vizfreq.py @@ -0,0 +1,544 @@ +import sys +sys.path.append("/sdf/home/w/winnicki/btx/") +from btx.processing.dimRed import * + +import os, csv, argparse +import math +import time +import random +from collections import Counter +import h5py + +import numpy as np +from numpy import zeros, sqrt, dot, diag +from numpy.linalg import svd, LinAlgError +from scipy.linalg import svd as scipy_svd +import pandas as pd +from sklearn.neighbors import NearestNeighbors +from sklearn.metrics.pairwise import euclidean_distances +import heapq + +from mpi4py import MPI + +from matplotlib import pyplot as plt +from matplotlib import colors + +from PIL import Image +from io import BytesIO +import base64 + +from datetime import datetime + +import umap +import hdbscan +from 
sklearn.cluster import OPTICS, cluster_optics_dbscan + +from matplotlib import colors +import matplotlib as mpl +from matplotlib import cm + +from bokeh.plotting import figure, show, output_file, save +from bokeh.models import HoverTool, CategoricalColorMapper, LinearColorMapper, ColumnDataSource, CustomJS, Slider, RangeSlider, Toggle, RadioButtonGroup, Range1d, Label +from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3 +from bokeh.layouts import column, row + +import cProfile +import string + +class visualizeFD: + """ + Visualize FD Dimension Reduction using UMAP and DBSCAN + """ + def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, userGroupings, + skipSize, umap_n_neighbors, umap_random_state, hdbscan_min_samples, hdbscan_min_cluster_size, + optics_min_samples, optics_xi, optics_min_cluster_size, outlierQuantile): + self.inputFile = inputFile + self.outputFile = outputFile + output_file(filename=outputFile, title="Static HTML file") + self.viewResults = None + self.numImgsToUse = numImgsToUse + self.nprocs = nprocs + self.includeABOD = includeABOD + self.userGroupings = userGroupings + self.skipSize = skipSize + self.umap_n_neighbors = umap_n_neighbors + self.umap_random_state = umap_random_state + self.hdbscan_min_samples=hdbscan_min_samples + self.hdbscan_min_cluster_size=hdbscan_min_cluster_size + self.optics_min_samples=optics_min_samples + self.optics_xi = optics_xi + self.optics_min_cluster_size = optics_min_cluster_size + self.outlierQuantile = outlierQuantile + + def embeddable_image(self, data): + img_data = np.uint8(cm.jet(data/max(data.flatten()))*255) +# image = Image.fromarray(img_data, mode='RGBA').resize((75, 75), Image.Resampling.BICUBIC) + image = Image.fromarray(img_data, mode='RGBA') + buffer = BytesIO() + image.save(buffer, format='png') + for_encoding = buffer.getvalue() + return 'data:image/png;base64,' + base64.b64encode(for_encoding).decode('utf-8') + + def random_unique_numbers_from_range(self, start, end, count): + all_numbers = list(range(start, end + 1)) + random.shuffle(all_numbers) + return all_numbers[:count] + + def compute_medoid(self, points): + return points[np.argmin(euclidean_distances(points).sum(axis=0))] + + def genMedoids(self, medoidLabels, clusterPoints): + dictMe = {} + for j in set(medoidLabels): + dictMe[j] = [] + for index, class_name in enumerate(medoidLabels): + dictMe[class_name].append((index, clusterPoints[index, 0], clusterPoints[index, 1])) + medoid_lst = [] + for k, v in dictMe.items(): + lst = [(x[1], x[2]) for x in v] + medoid_point = self.compute_medoid(lst) + for test_index, test_point in enumerate(lst): + if math.isclose(test_point[0],medoid_point[0]) and math.isclose(test_point[1], medoid_point[1]): + fin_ind = test_index + medoid_lst.append((k, v[fin_ind][0])) + return medoid_lst + + def relabel_to_closest_zero(self, labels): + unique_labels = sorted(set(labels)) + relabel_dict = {label: new_label for new_label, label in enumerate(unique_labels)} + relabeled = [relabel_dict[label] for label in labels] + return relabeled + + def regABOD(self, pts): + abofs = [] + for a in range(len(pts)): + test_list = [x for x in range(len(pts)) if x != a] + otherPts = [(d, e) for idx, d in enumerate(test_list) for e in test_list[idx + 1:]] + outlier_factors = [] + for b, c in otherPts: + apt = pts[a] + bpt = pts[b] + cpt = pts[c] + ab = bpt - apt + ac = cpt - apt + outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) + 
abofs.append(np.var(np.array(outlier_factors))) + return abofs + + def fastABOD(self, pts, nsamples): + nbrs = NearestNeighbors(n_neighbors=nsamples, algorithm='ball_tree').fit(pts) + k_inds = nbrs.kneighbors(pts)[1] + abofs = [] + count = 0 + for a in range(len(pts)): + test_list = k_inds[a][1:] + otherPts = [(d, e) for idx, d in enumerate(test_list) for e in test_list[idx + 1:]] + outlier_factors = [] + for (b, c) in otherPts: + apt = pts[a] + bpt = pts[b] + cpt = pts[c] + ab = bpt - apt + ac = cpt - apt + if math.isclose(np.linalg.norm(ab), 0.0) or math.isclose(np.linalg.norm(ac), 0.0): + count += 1 +# print("TOO CLOSE") + continue + outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) +# print("CURRENT POINT: ", pts[a], test_list, outlier_factors, np.var(np.array(outlier_factors))) + if(len(outlier_factors)==0): + abofs.append(np.inf) + else: + abofs.append(np.var(np.array(outlier_factors))) + return abofs + + def getOutliers(self, lst): +# lstCopy = lst.copy() +# lstCopy.sort() +# quart10 = lstCopy[len(lstCopy)//divBy] + + lstQuant = np.quantile(np.array(lst), self.outlierQuantile) +# print("AIDJWOIJDAOWIDJWAOIDJAWOIDWJA", lstQuant, lst) + outlierInds = [] + notOutlierInds = [] + for j in range(len(lst)): + if lst[j]>lstQuant: + outlierInds.append(j) + else: + notOutlierInds.append(j) +# print("OUTLIER INDS: ", outlierInds) +# print("NOT OUTLIER INDS: ", notOutlierInds) + return np.array(outlierInds), np.array(notOutlierInds) + + def genHist(self, vals, endClass): + totNum = endClass + 1 + countVals = Counter(vals) + hist = [0]*(totNum) + for val in set(countVals): + hist[val] = countVals[val] + maxval = max(countVals.values()) + return hist, maxval + + def genLeftRight(self, endClass): + return [*range(endClass+1)], [*range(1, endClass+2)] + + def genUMAP(self): + imgs = None + projections = None + for currRank in range(self.nprocs): + with h5py.File(self.inputFile+"_"+str(currRank)+".h5", 'r') as hf: + if imgs is None: + imgs = hf["SmallImages"][:] + projections = hf["ProjectedData"][:] + else: + imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) + projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) + + print("AOIDWJOIAWDJ", len(imgs), len(projections)) + + intensities = [] + for img in imgs: + intensities.append(np.sum(img.flatten())) + intensities = np.array(intensities) + + self.imgs = imgs[:self.numImgsToUse:self.skipSize] + self.projections = projections[:self.numImgsToUse:self.skipSize] + self.intensities = intensities[:self.numImgsToUse:self.skipSize] + + self.numImgsToUse = int(self.numImgsToUse/self.skipSize) + + if len(self.imgs)!= self.numImgsToUse: + raise TypeError("NUMBER OF IMAGES REQUESTED ({}) EXCEEDS NUMBER OF DATA POINTS PROVIDED ({})".format(len(self.imgs), self.numImgsToUse)) + + self.clusterable_embedding = umap.UMAP( + n_neighbors=self.umap_n_neighbors, + random_state=self.umap_random_state, + n_components=2, +# min_dist=0.25, + min_dist=0.1, + ).fit_transform(self.projections) + + self.labels = hdbscan.HDBSCAN( + min_samples = self.hdbscan_min_samples, + min_cluster_size = self.hdbscan_min_cluster_size + ).fit_predict(self.clusterable_embedding) + exclusionList = np.array([]) + self.clustered = np.isin(self.labels, exclusionList, invert=True) + + self.opticsClust = OPTICS(min_samples=self.optics_min_samples, xi=self.optics_xi, min_cluster_size=self.optics_min_cluster_size) + self.opticsClust.fit(self.clusterable_embedding) + self.opticsLabels = cluster_optics_dbscan( + 
reachability=self.opticsClust.reachability_, + core_distances=self.opticsClust.core_distances_, + ordering=self.opticsClust.ordering_, + eps=2.5, + ) +# self.opticsLabels = self.opticsClust.labels_ + + self.experData_df = pd.DataFrame({'x':self.clusterable_embedding[self.clustered, 0],'y':self.clusterable_embedding[self.clustered, 1]}) + self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) + self.experData_df['imgind'] = np.arange(self.numImgsToUse)*self.skipSize + + def genABOD(self): + if self.includeABOD: + abod = self.fastABOD(self.projections, 10) + outliers, notOutliers = self.getOutliers(abod) + else: + outliers = [] + notOutliers = [] + outlierLabels = [] + for j in range(self.numImgsToUse): + if j in outliers: + outlierLabels.append(str(6)) + else: + outlierLabels.append(str(0)) + self.experData_df['anomDet'] = outlierLabels + self.experData_df['anom_backgroundColor'] = [Category20[20][int(x)] for x in outlierLabels] + + def setUserGroupings(self, userGroupings): + """ + Set User Grouping. An adjustment is made at the beginning of this function, + whereby 1 is added to each label. This is because internally, the clusters are stored + starting at -1 rather than 0. + """ + self.userGroupings = [[x-1 for x in grouping] for grouping in userGroupings] + + def genLabels(self): + newLabels = [] + for j in self.labels[self.clustered]: + doneChecking = False + for grouping in self.userGroupings: + if j in grouping and not doneChecking: + newLabels.append(min(grouping)) + doneChecking=True + if not doneChecking: + newLabels.append(j) + newLabels = list(np.array(newLabels) + 1) + self.newLabels = np.array(self.relabel_to_closest_zero(newLabels)) + self.experData_df['cluster'] = [str(x) for x in self.newLabels[self.clustered]] + self.experData_df['ptColor'] = [x for x in self.experData_df['cluster']] + self.experData_df['dbscan_backgroundColor'] = [Category20[20][x] for x in self.newLabels] + self.experData_df['backgroundColor'] = [Category20[20][x] for x in self.newLabels] + medoid_lst = self.genMedoids(self.newLabels, self.clusterable_embedding) + self.medoidInds = [x[1] for x in medoid_lst] + medoidBold = [] + for ind in range(self.numImgsToUse): + if ind in self.medoidInds: + medoidBold.append(12) + else: + medoidBold.append(4) + self.experData_df['medoidBold'] = medoidBold + + opticsNewLabels = [] + for j in self.opticsLabels[self.clustered]: + doneChecking = False + for grouping in self.userGroupings: + if j in grouping and not doneChecking: + opticsNewLabels.append(min(grouping)) + doneChecking=True + if not doneChecking: + opticsNewLabels.append(j) + opticsNewLabels = list(np.array(opticsNewLabels) + 1) + self.opticsNewLabels = np.array(self.relabel_to_closest_zero(opticsNewLabels)) +# self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels[self.opticsClust.ordering_]] + self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels] + + def genHTML(self): + datasource = ColumnDataSource(self.experData_df) + color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) + plot_figure = figure( + title='UMAP projection with DBSCAN clustering of the LCLS dataset', + tools=('pan, wheel_zoom, reset'), + width = 2000, height = 600 + ) + plot_figure.add_tools(HoverTool(tooltips=""" +
+                <div>
+                    <div>
+                        <img src='@image'/>
+                    </div>
+                    <div>
+                        <span>Cluster</span>
+                        <span>@cluster</span>
+                    </div>
+                    <div>
+                        <span>Image</span>
+                        <span>@imgind</span>
+                    </div>
+                </div>
+ """)) + plot_figure.circle( + 'x', + 'y', + source=datasource, + color=dict(field='ptColor', transform=color_mapping), + line_alpha=0.6, + fill_alpha=0.6, + size='medoidBold', + legend_field='cluster' + ) + plot_figure.sizing_mode = 'scale_both' + plot_figure.legend.location = "bottom_right" + plot_figure.legend.title = "Clusters" + + vals = [x for x in self.newLabels] + trueSource = ColumnDataSource(data=dict(vals = vals)) + hist, maxCount = self.genHist(vals, max(vals)) + left, right = self.genLeftRight(max(vals)) + histsource = ColumnDataSource(data=dict(hist=hist, left=left, right=right)) + p = figure(width=2000, height=450, toolbar_location=None, + title="Histogram Testing") + p.quad(source=histsource, top='hist', bottom=0, left='left', right='right', + fill_color='skyblue', line_color="white") + p.y_range = Range1d(0, maxCount) + p.x_range = Range1d(0, max(vals)+1) + p.xaxis.axis_label = "Cluster Label" + p.yaxis.axis_label = "Count" + + indexCDS = ColumnDataSource(dict( + index=[*range(0, self.numImgsToUse, 2)] + ) + ) + cols = RangeSlider(title="ET", + start=0, + end=self.numImgsToUse, + value=(0, self.numImgsToUse-1), + step=1, sizing_mode="stretch_width") + callback = CustomJS(args=dict(cols=cols, trueSource = trueSource, + histsource = histsource, datasource=datasource, indexCDS=indexCDS), code=""" + function countNumbersAtIndices(numbers, startInd, endInd, smallestVal, largestVal) { + let counts = new Array(largestVal-smallestVal); for (let i=0; i= index[index.length - 1]) { +// cb_obj.label = '► Play'; + slider.value = [0, slider_val1-slider_val0]; +// cb_obj.active = false; +// clearInterval(looop); + } + else if(slider_val1 !== index[index.length - 1]){ + slider.value = [index.filter((item) => item > slider_val0)[0], index.filter((item) => item > slider_val1)[0]]; + } + else { + clearInterval(looop); + } + } + if(cb_obj.active == false){ + cb_obj.label = '► Play'; + clearInterval(looop); + } + else { + cb_obj.label = '❚❚ Pause'; + var looop = setInterval(check_and_iterate, 0.1, indexCDS.data['index']); + }; + """) + toggl.js_on_change('active',toggl_js) + + reachabilityDiag = figure( + title='OPTICS Reachability Diag', + tools=('pan, wheel_zoom, reset'), + width = 2000, height = 400 + ) + space = np.arange(self.numImgsToUse) + reachability = self.opticsClust.reachability_[self.opticsClust.ordering_] +# reachability = self.opticsClust.reachability_ + opticsData_df = pd.DataFrame({'x':space,'y':reachability}) + opticsData_df['clusterForScatterPlot'] = [str(x) for x in self.opticsNewLabels] + opticsData_df['cluster'] = [str(x) for x in self.opticsNewLabels[self.opticsClust.ordering_]] + opticsData_df['ptColor'] = [x for x in opticsData_df['cluster']] + color_mapping2 = CategoricalColorMapper(factors=[str(x) for x in list(set(self.opticsNewLabels))], + palette=Category20[20]) + opticssource = ColumnDataSource(opticsData_df) + reachabilityDiag.circle( + 'x', + 'y', + source=opticssource, + color=dict(field='ptColor', transform=color_mapping2), + line_alpha=0.6, + fill_alpha=0.6, + legend_field='cluster' + ) + reachabilityDiag.line([0, len(opticsData_df['ptColor'])], [2, 2], line_width=2, color="black", line_dash="dashed") + reachabilityDiag.y_range = Range1d(-1, 10) + + LABELS = ["DBSCAN Clustering", "OPTICS Clustering", "Anomaly Detection"] + radio_button_group = RadioButtonGroup(labels=LABELS, active=0) + radioGroup_js = CustomJS(args=dict(datasource=datasource, opticssource=opticssource), code=""" + const x = datasource.data.x + const y = datasource.data.y + const image = 
datasource.data.image + const medoidBold = datasource.data.medoidBold + const cluster = datasource.data.cluster + const anomDet = datasource.data.anomDet + const imgind = datasource.data.imgind + const dbscan_backgroundColor = datasource.data.dbscan_backgroundColor + const anom_backgroundColor = datasource.data.anom_backgroundColor + const optics_backgroundColor = datasource.data.optics_backgroundColor + + const opticsClust = opticssource.data.clusterForScatterPlot + + let ptColor = null + let backgroundColor = null + + if (cb_obj.active==0){ + ptColor = cluster + backgroundColor = dbscan_backgroundColor + } + else if (cb_obj.active==1){ + ptColor = opticsClust + backgroundColor = optics_backgroundColor + } + else{ + ptColor = anomDet + backgroundColor = anom_backgroundColor + } + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor, dbscan_backgroundColor, anom_backgroundColor, optics_backgroundColor} + """) + radio_button_group.js_on_change("active", radioGroup_js) + + self.viewResults = column(plot_figure, p, imgsPlot, row(cols, toggl, radio_button_group), reachabilityDiag) + + def fullVisualize(self): + self.genUMAP() + self.genABOD() + self.genLabels() + self.genHTML() + + def updateLabels(self): + self.genLabels() + self.genHTML() + + def userSave(self): + save(self.viewResults) + + def userShow(self): + from IPython.display import display, HTML + display(HTML("")) + display(HTML("")) + display(HTML("")) + display(HTML("")) + from bokeh.io import output_notebook + output_notebook() + show(self.viewResults) From a64ccb0ae011eeea1423d3dedd4cbfd51b027087 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 3 Oct 2023 15:08:08 -0700 Subject: [PATCH 42/57] Reverted separation changes. --- btx/processing/freqdir.py | 11 +- btx/processing/vizfreq.py | 544 -------------------------------------- 2 files changed, 4 insertions(+), 551 deletions(-) delete mode 100644 btx/processing/vizfreq.py diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 2ccd0278a..d986adfed 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -39,8 +39,8 @@ from datetime import datetime -#import umap -#import hdbscan +import umap +import hdbscan from sklearn.cluster import OPTICS, cluster_optics_dbscan from matplotlib import colors @@ -815,7 +815,6 @@ def update(self, vec): self.sketch.push(vec, pi, wi) -''' class visualizeFD: """ Visualize FD Dimension Reduction using UMAP and DBSCAN @@ -1313,7 +1312,6 @@ def userShow(self): from bokeh.io import output_notebook output_notebook() show(self.viewResults) -''' class WrapperFullFD: """ @@ -1563,10 +1561,10 @@ def addThumbnailsToProjectH5(self): f1.create_dataset("SmallImages", data=self.fullThumbnailData) f1.close() self.comm.barrier() -''' + def visualizeMe(self): st = time.perf_counter() - self.addThumbnailsToProjectH5() +# self.addThumbnailsToProjectH5() #UMAP STEP ########################################################################################## if self.rank==0: @@ -1596,7 +1594,6 @@ def visualizeMe(self): et = time.perf_counter() print("UMAP HTML Generation Processing time: {}".format(et - st)) #print("TOTAL PROCESING TIME: {}".format(et - stfull)) -''' class FD_ImageProcessing: def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile): diff --git a/btx/processing/vizfreq.py b/btx/processing/vizfreq.py deleted file mode 100644 index 14340cc3f..000000000 --- a/btx/processing/vizfreq.py +++ /dev/null @@ -1,544 
+0,0 @@ -import sys -sys.path.append("/sdf/home/w/winnicki/btx/") -from btx.processing.dimRed import * - -import os, csv, argparse -import math -import time -import random -from collections import Counter -import h5py - -import numpy as np -from numpy import zeros, sqrt, dot, diag -from numpy.linalg import svd, LinAlgError -from scipy.linalg import svd as scipy_svd -import pandas as pd -from sklearn.neighbors import NearestNeighbors -from sklearn.metrics.pairwise import euclidean_distances -import heapq - -from mpi4py import MPI - -from matplotlib import pyplot as plt -from matplotlib import colors - -from PIL import Image -from io import BytesIO -import base64 - -from datetime import datetime - -import umap -import hdbscan -from sklearn.cluster import OPTICS, cluster_optics_dbscan - -from matplotlib import colors -import matplotlib as mpl -from matplotlib import cm - -from bokeh.plotting import figure, show, output_file, save -from bokeh.models import HoverTool, CategoricalColorMapper, LinearColorMapper, ColumnDataSource, CustomJS, Slider, RangeSlider, Toggle, RadioButtonGroup, Range1d, Label -from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3 -from bokeh.layouts import column, row - -import cProfile -import string - -class visualizeFD: - """ - Visualize FD Dimension Reduction using UMAP and DBSCAN - """ - def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, userGroupings, - skipSize, umap_n_neighbors, umap_random_state, hdbscan_min_samples, hdbscan_min_cluster_size, - optics_min_samples, optics_xi, optics_min_cluster_size, outlierQuantile): - self.inputFile = inputFile - self.outputFile = outputFile - output_file(filename=outputFile, title="Static HTML file") - self.viewResults = None - self.numImgsToUse = numImgsToUse - self.nprocs = nprocs - self.includeABOD = includeABOD - self.userGroupings = userGroupings - self.skipSize = skipSize - self.umap_n_neighbors = umap_n_neighbors - self.umap_random_state = umap_random_state - self.hdbscan_min_samples=hdbscan_min_samples - self.hdbscan_min_cluster_size=hdbscan_min_cluster_size - self.optics_min_samples=optics_min_samples - self.optics_xi = optics_xi - self.optics_min_cluster_size = optics_min_cluster_size - self.outlierQuantile = outlierQuantile - - def embeddable_image(self, data): - img_data = np.uint8(cm.jet(data/max(data.flatten()))*255) -# image = Image.fromarray(img_data, mode='RGBA').resize((75, 75), Image.Resampling.BICUBIC) - image = Image.fromarray(img_data, mode='RGBA') - buffer = BytesIO() - image.save(buffer, format='png') - for_encoding = buffer.getvalue() - return 'data:image/png;base64,' + base64.b64encode(for_encoding).decode('utf-8') - - def random_unique_numbers_from_range(self, start, end, count): - all_numbers = list(range(start, end + 1)) - random.shuffle(all_numbers) - return all_numbers[:count] - - def compute_medoid(self, points): - return points[np.argmin(euclidean_distances(points).sum(axis=0))] - - def genMedoids(self, medoidLabels, clusterPoints): - dictMe = {} - for j in set(medoidLabels): - dictMe[j] = [] - for index, class_name in enumerate(medoidLabels): - dictMe[class_name].append((index, clusterPoints[index, 0], clusterPoints[index, 1])) - medoid_lst = [] - for k, v in dictMe.items(): - lst = [(x[1], x[2]) for x in v] - medoid_point = self.compute_medoid(lst) - for test_index, test_point in enumerate(lst): - if math.isclose(test_point[0],medoid_point[0]) and math.isclose(test_point[1], medoid_point[1]): - fin_ind = test_index - medoid_lst.append((k, 
v[fin_ind][0])) - return medoid_lst - - def relabel_to_closest_zero(self, labels): - unique_labels = sorted(set(labels)) - relabel_dict = {label: new_label for new_label, label in enumerate(unique_labels)} - relabeled = [relabel_dict[label] for label in labels] - return relabeled - - def regABOD(self, pts): - abofs = [] - for a in range(len(pts)): - test_list = [x for x in range(len(pts)) if x != a] - otherPts = [(d, e) for idx, d in enumerate(test_list) for e in test_list[idx + 1:]] - outlier_factors = [] - for b, c in otherPts: - apt = pts[a] - bpt = pts[b] - cpt = pts[c] - ab = bpt - apt - ac = cpt - apt - outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) - abofs.append(np.var(np.array(outlier_factors))) - return abofs - - def fastABOD(self, pts, nsamples): - nbrs = NearestNeighbors(n_neighbors=nsamples, algorithm='ball_tree').fit(pts) - k_inds = nbrs.kneighbors(pts)[1] - abofs = [] - count = 0 - for a in range(len(pts)): - test_list = k_inds[a][1:] - otherPts = [(d, e) for idx, d in enumerate(test_list) for e in test_list[idx + 1:]] - outlier_factors = [] - for (b, c) in otherPts: - apt = pts[a] - bpt = pts[b] - cpt = pts[c] - ab = bpt - apt - ac = cpt - apt - if math.isclose(np.linalg.norm(ab), 0.0) or math.isclose(np.linalg.norm(ac), 0.0): - count += 1 -# print("TOO CLOSE") - continue - outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) -# print("CURRENT POINT: ", pts[a], test_list, outlier_factors, np.var(np.array(outlier_factors))) - if(len(outlier_factors)==0): - abofs.append(np.inf) - else: - abofs.append(np.var(np.array(outlier_factors))) - return abofs - - def getOutliers(self, lst): -# lstCopy = lst.copy() -# lstCopy.sort() -# quart10 = lstCopy[len(lstCopy)//divBy] - - lstQuant = np.quantile(np.array(lst), self.outlierQuantile) -# print("AIDJWOIJDAOWIDJWAOIDJAWOIDWJA", lstQuant, lst) - outlierInds = [] - notOutlierInds = [] - for j in range(len(lst)): - if lst[j]>lstQuant: - outlierInds.append(j) - else: - notOutlierInds.append(j) -# print("OUTLIER INDS: ", outlierInds) -# print("NOT OUTLIER INDS: ", notOutlierInds) - return np.array(outlierInds), np.array(notOutlierInds) - - def genHist(self, vals, endClass): - totNum = endClass + 1 - countVals = Counter(vals) - hist = [0]*(totNum) - for val in set(countVals): - hist[val] = countVals[val] - maxval = max(countVals.values()) - return hist, maxval - - def genLeftRight(self, endClass): - return [*range(endClass+1)], [*range(1, endClass+2)] - - def genUMAP(self): - imgs = None - projections = None - for currRank in range(self.nprocs): - with h5py.File(self.inputFile+"_"+str(currRank)+".h5", 'r') as hf: - if imgs is None: - imgs = hf["SmallImages"][:] - projections = hf["ProjectedData"][:] - else: - imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) - projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) - - print("AOIDWJOIAWDJ", len(imgs), len(projections)) - - intensities = [] - for img in imgs: - intensities.append(np.sum(img.flatten())) - intensities = np.array(intensities) - - self.imgs = imgs[:self.numImgsToUse:self.skipSize] - self.projections = projections[:self.numImgsToUse:self.skipSize] - self.intensities = intensities[:self.numImgsToUse:self.skipSize] - - self.numImgsToUse = int(self.numImgsToUse/self.skipSize) - - if len(self.imgs)!= self.numImgsToUse: - raise TypeError("NUMBER OF IMAGES REQUESTED ({}) EXCEEDS NUMBER OF DATA POINTS PROVIDED ({})".format(len(self.imgs), self.numImgsToUse)) - - 
self.clusterable_embedding = umap.UMAP( - n_neighbors=self.umap_n_neighbors, - random_state=self.umap_random_state, - n_components=2, -# min_dist=0.25, - min_dist=0.1, - ).fit_transform(self.projections) - - self.labels = hdbscan.HDBSCAN( - min_samples = self.hdbscan_min_samples, - min_cluster_size = self.hdbscan_min_cluster_size - ).fit_predict(self.clusterable_embedding) - exclusionList = np.array([]) - self.clustered = np.isin(self.labels, exclusionList, invert=True) - - self.opticsClust = OPTICS(min_samples=self.optics_min_samples, xi=self.optics_xi, min_cluster_size=self.optics_min_cluster_size) - self.opticsClust.fit(self.clusterable_embedding) - self.opticsLabels = cluster_optics_dbscan( - reachability=self.opticsClust.reachability_, - core_distances=self.opticsClust.core_distances_, - ordering=self.opticsClust.ordering_, - eps=2.5, - ) -# self.opticsLabels = self.opticsClust.labels_ - - self.experData_df = pd.DataFrame({'x':self.clusterable_embedding[self.clustered, 0],'y':self.clusterable_embedding[self.clustered, 1]}) - self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) - self.experData_df['imgind'] = np.arange(self.numImgsToUse)*self.skipSize - - def genABOD(self): - if self.includeABOD: - abod = self.fastABOD(self.projections, 10) - outliers, notOutliers = self.getOutliers(abod) - else: - outliers = [] - notOutliers = [] - outlierLabels = [] - for j in range(self.numImgsToUse): - if j in outliers: - outlierLabels.append(str(6)) - else: - outlierLabels.append(str(0)) - self.experData_df['anomDet'] = outlierLabels - self.experData_df['anom_backgroundColor'] = [Category20[20][int(x)] for x in outlierLabels] - - def setUserGroupings(self, userGroupings): - """ - Set User Grouping. An adjustment is made at the beginning of this function, - whereby 1 is added to each label. This is because internally, the clusters are stored - starting at -1 rather than 0. 
- """ - self.userGroupings = [[x-1 for x in grouping] for grouping in userGroupings] - - def genLabels(self): - newLabels = [] - for j in self.labels[self.clustered]: - doneChecking = False - for grouping in self.userGroupings: - if j in grouping and not doneChecking: - newLabels.append(min(grouping)) - doneChecking=True - if not doneChecking: - newLabels.append(j) - newLabels = list(np.array(newLabels) + 1) - self.newLabels = np.array(self.relabel_to_closest_zero(newLabels)) - self.experData_df['cluster'] = [str(x) for x in self.newLabels[self.clustered]] - self.experData_df['ptColor'] = [x for x in self.experData_df['cluster']] - self.experData_df['dbscan_backgroundColor'] = [Category20[20][x] for x in self.newLabels] - self.experData_df['backgroundColor'] = [Category20[20][x] for x in self.newLabels] - medoid_lst = self.genMedoids(self.newLabels, self.clusterable_embedding) - self.medoidInds = [x[1] for x in medoid_lst] - medoidBold = [] - for ind in range(self.numImgsToUse): - if ind in self.medoidInds: - medoidBold.append(12) - else: - medoidBold.append(4) - self.experData_df['medoidBold'] = medoidBold - - opticsNewLabels = [] - for j in self.opticsLabels[self.clustered]: - doneChecking = False - for grouping in self.userGroupings: - if j in grouping and not doneChecking: - opticsNewLabels.append(min(grouping)) - doneChecking=True - if not doneChecking: - opticsNewLabels.append(j) - opticsNewLabels = list(np.array(opticsNewLabels) + 1) - self.opticsNewLabels = np.array(self.relabel_to_closest_zero(opticsNewLabels)) -# self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels[self.opticsClust.ordering_]] - self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels] - - def genHTML(self): - datasource = ColumnDataSource(self.experData_df) - color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) - plot_figure = figure( - title='UMAP projection with DBSCAN clustering of the LCLS dataset', - tools=('pan, wheel_zoom, reset'), - width = 2000, height = 600 - ) - plot_figure.add_tools(HoverTool(tooltips=""" -
-                <div>
-                    <div>
-                        <img src='@image'/>
-                    </div>
-                    <div>
-                        <span>Cluster</span>
-                        <span>@cluster</span>
-                    </div>
-                    <div>
-                        <span>Image</span>
-                        <span>@imgind</span>
-                    </div>
-                </div>
- """)) - plot_figure.circle( - 'x', - 'y', - source=datasource, - color=dict(field='ptColor', transform=color_mapping), - line_alpha=0.6, - fill_alpha=0.6, - size='medoidBold', - legend_field='cluster' - ) - plot_figure.sizing_mode = 'scale_both' - plot_figure.legend.location = "bottom_right" - plot_figure.legend.title = "Clusters" - - vals = [x for x in self.newLabels] - trueSource = ColumnDataSource(data=dict(vals = vals)) - hist, maxCount = self.genHist(vals, max(vals)) - left, right = self.genLeftRight(max(vals)) - histsource = ColumnDataSource(data=dict(hist=hist, left=left, right=right)) - p = figure(width=2000, height=450, toolbar_location=None, - title="Histogram Testing") - p.quad(source=histsource, top='hist', bottom=0, left='left', right='right', - fill_color='skyblue', line_color="white") - p.y_range = Range1d(0, maxCount) - p.x_range = Range1d(0, max(vals)+1) - p.xaxis.axis_label = "Cluster Label" - p.yaxis.axis_label = "Count" - - indexCDS = ColumnDataSource(dict( - index=[*range(0, self.numImgsToUse, 2)] - ) - ) - cols = RangeSlider(title="ET", - start=0, - end=self.numImgsToUse, - value=(0, self.numImgsToUse-1), - step=1, sizing_mode="stretch_width") - callback = CustomJS(args=dict(cols=cols, trueSource = trueSource, - histsource = histsource, datasource=datasource, indexCDS=indexCDS), code=""" - function countNumbersAtIndices(numbers, startInd, endInd, smallestVal, largestVal) { - let counts = new Array(largestVal-smallestVal); for (let i=0; i= index[index.length - 1]) { -// cb_obj.label = '► Play'; - slider.value = [0, slider_val1-slider_val0]; -// cb_obj.active = false; -// clearInterval(looop); - } - else if(slider_val1 !== index[index.length - 1]){ - slider.value = [index.filter((item) => item > slider_val0)[0], index.filter((item) => item > slider_val1)[0]]; - } - else { - clearInterval(looop); - } - } - if(cb_obj.active == false){ - cb_obj.label = '► Play'; - clearInterval(looop); - } - else { - cb_obj.label = '❚❚ Pause'; - var looop = setInterval(check_and_iterate, 0.1, indexCDS.data['index']); - }; - """) - toggl.js_on_change('active',toggl_js) - - reachabilityDiag = figure( - title='OPTICS Reachability Diag', - tools=('pan, wheel_zoom, reset'), - width = 2000, height = 400 - ) - space = np.arange(self.numImgsToUse) - reachability = self.opticsClust.reachability_[self.opticsClust.ordering_] -# reachability = self.opticsClust.reachability_ - opticsData_df = pd.DataFrame({'x':space,'y':reachability}) - opticsData_df['clusterForScatterPlot'] = [str(x) for x in self.opticsNewLabels] - opticsData_df['cluster'] = [str(x) for x in self.opticsNewLabels[self.opticsClust.ordering_]] - opticsData_df['ptColor'] = [x for x in opticsData_df['cluster']] - color_mapping2 = CategoricalColorMapper(factors=[str(x) for x in list(set(self.opticsNewLabels))], - palette=Category20[20]) - opticssource = ColumnDataSource(opticsData_df) - reachabilityDiag.circle( - 'x', - 'y', - source=opticssource, - color=dict(field='ptColor', transform=color_mapping2), - line_alpha=0.6, - fill_alpha=0.6, - legend_field='cluster' - ) - reachabilityDiag.line([0, len(opticsData_df['ptColor'])], [2, 2], line_width=2, color="black", line_dash="dashed") - reachabilityDiag.y_range = Range1d(-1, 10) - - LABELS = ["DBSCAN Clustering", "OPTICS Clustering", "Anomaly Detection"] - radio_button_group = RadioButtonGroup(labels=LABELS, active=0) - radioGroup_js = CustomJS(args=dict(datasource=datasource, opticssource=opticssource), code=""" - const x = datasource.data.x - const y = datasource.data.y - const image = 
datasource.data.image - const medoidBold = datasource.data.medoidBold - const cluster = datasource.data.cluster - const anomDet = datasource.data.anomDet - const imgind = datasource.data.imgind - const dbscan_backgroundColor = datasource.data.dbscan_backgroundColor - const anom_backgroundColor = datasource.data.anom_backgroundColor - const optics_backgroundColor = datasource.data.optics_backgroundColor - - const opticsClust = opticssource.data.clusterForScatterPlot - - let ptColor = null - let backgroundColor = null - - if (cb_obj.active==0){ - ptColor = cluster - backgroundColor = dbscan_backgroundColor - } - else if (cb_obj.active==1){ - ptColor = opticsClust - backgroundColor = optics_backgroundColor - } - else{ - ptColor = anomDet - backgroundColor = anom_backgroundColor - } - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor, dbscan_backgroundColor, anom_backgroundColor, optics_backgroundColor} - """) - radio_button_group.js_on_change("active", radioGroup_js) - - self.viewResults = column(plot_figure, p, imgsPlot, row(cols, toggl, radio_button_group), reachabilityDiag) - - def fullVisualize(self): - self.genUMAP() - self.genABOD() - self.genLabels() - self.genHTML() - - def updateLabels(self): - self.genLabels() - self.genHTML() - - def userSave(self): - save(self.viewResults) - - def userShow(self): - from IPython.display import display, HTML - display(HTML("")) - display(HTML("")) - display(HTML("")) - display(HTML("")) - from bokeh.io import output_notebook - output_notebook() - show(self.viewResults) From 260fe9ac8431eecb6102e068b55a48e127551439 Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Tue, 3 Oct 2023 15:12:36 -0700 Subject: [PATCH 43/57] Removing UMAP and HDBSCAN dependency from freqdir module. Also made draw_sketch task able to run on more than one core. 
--- btx/processing/freqdir.py | 7 +++++-- scripts/elog_submit.sh | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index d986adfed..e0b2c2911 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -39,8 +39,8 @@ from datetime import datetime -import umap -import hdbscan +#import umap +#import hdbscan from sklearn.cluster import OPTICS, cluster_optics_dbscan from matplotlib import colors @@ -953,6 +953,9 @@ def genLeftRight(self, endClass): return [*range(endClass+1)], [*range(1, endClass+2)] def genUMAP(self): + import umap + import hdbscan + imgs = None projections = None for currRank in range(self.nprocs): diff --git a/scripts/elog_submit.sh b/scripts/elog_submit.sh index 5103fd33b..d19d45a40 100755 --- a/scripts/elog_submit.sh +++ b/scripts/elog_submit.sh @@ -131,7 +131,8 @@ CORES=${CORES:=1} if [ ${TASK} != 'find_peaks' ] &&\ [ ${TASK} != 'stream_analysis' ] &&\ [ ${TASK} != 'determine_cell' ] &&\ - [ ${TASK} != 'opt_geom' ]; then + [ ${TASK} != 'opt_geom' ] &&\ + [ ${TASK} != 'draw_sketch' ]; then CORES=1 fi From 0bbad2452aa6a69a4b634d53a13ee67264ee0936 Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Tue, 3 Oct 2023 15:41:08 -0700 Subject: [PATCH 44/57] attempt at only having psana dependency where needed in freqdir --- btx/processing/freqdir.py | 74 +++++++++++++-------------------------- scripts/tasks.py | 31 +++++++++------- 2 files changed, 42 insertions(+), 63 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index e0b2c2911..40f388b88 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -23,15 +23,15 @@ from matplotlib import pyplot as plt from matplotlib import colors -from btx.misc.shortcuts import TaskTimer - -from btx.interfaces.ipsana import ( - PsanaInterface, - bin_data, - bin_pixel_index_map, - retrieve_pixel_index_map, - assemble_image_stack_batch, -) +# from btx.misc.shortcuts import TaskTimer +# +# from btx.interfaces.ipsana import ( +# PsanaInterface, +# bin_data, +# bin_pixel_index_map, +# retrieve_pixel_index_map, +# assemble_image_stack_batch, +# ) from PIL import Image from io import BytesIO @@ -1320,6 +1320,7 @@ class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. 
""" + from btx.interfaces.ipsana import PsanaInterface def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, usePSI=True): self.start_offset = start_offset self.num_imgs = num_imgs @@ -1352,13 +1353,13 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.usePSI = usePSI if usePSI: - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi = self.PsanaInterface(exp=exp, run=run, det_type=det_type) self.psi.counter = self.start_offset + self.num_imgs*self.rank//self.size else: self.psi = None if self.rank==0: - self.currRun = datetime.now().strftime("%y%m%d%H%M%S") + self.currRun = run #datetime.now().strftime("%y%m%d%H%M%S") else: self.currRun = None self.currRun = self.comm.bcast(self.currRun, root=0) @@ -1565,39 +1566,6 @@ def addThumbnailsToProjectH5(self): f1.close() self.comm.barrier() - def visualizeMe(self): - st = time.perf_counter() -# self.addThumbnailsToProjectH5() - #UMAP STEP - ########################################################################################## - if self.rank==0: - -# print("here 1") - - skipSize = 1 - numImgsToUse = int(self.num_imgs/skipSize) - visMe = visualizeFD(inputFile=self.writeToHere+"{}_ProjectedData".format(self.currRun), - outputFile="./UMAPVis_{}.html".format(self.currRun), - numImgsToUse=self.num_imgs, - nprocs=self.size, - userGroupings=[], - includeABOD=True, - skipSize = skipSize, -# umap_n_neighbors=numImgsToUse//40, - umap_n_neighbors=numImgsToUse//4000, - umap_random_state=42, - hdbscan_min_samples=int(numImgsToUse*0.75//40), - hdbscan_min_cluster_size=int(numImgsToUse//40), - optics_min_samples=150, optics_xi = 0.05, optics_min_cluster_size = 0.05, - outlierQuantile=0.3) -# print("here 2") - visMe.fullVisualize() -# print("here 3") - visMe.userSave() - et = time.perf_counter() - print("UMAP HTML Generation Processing time: {}".format(et - st)) - #print("TOTAL PROCESING TIME: {}".format(et - stfull)) - class FD_ImageProcessing: def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile): self.threshold = threshold @@ -1656,6 +1624,12 @@ def normalizeIntensityFunc(self, img, currIntensity): class DataRetriever: + from btx.interfaces.ipsana import ( + PsanaInterface, + bin_data, + retrieve_pixel_index_map, + assemble_image_stack_batch, + ) def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, thumbnailHeight, thumbnailWidth): self.exp = exp self.det_type = det_type @@ -1665,7 +1639,7 @@ def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, t self.thumbnailHeight = thumbnailHeight self.thumbnailWidth = thumbnailWidth - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type, no_cmod=True) + self.psi = self.PsanaInterface(exp=exp, run=run, det_type=det_type, no_cmod=True) self.imageProcessor = imageProcessor @@ -1678,12 +1652,12 @@ def assembleImgsToSave(self, imgs): imgs: ndarray images to downsample """ - pixel_index_map = retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) + pixel_index_map = self.retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) saveMe = [] for img in imgs: imgRe = np.reshape(img, self.psi.det.shape()) - imgRe = assemble_image_stack_batch(imgRe, pixel_index_map) + imgRe = 
self.assemble_image_stack_batch(imgRe, pixel_index_map) saveMe.append(np.array(Image.fromarray(imgRe).resize((self.thumbnailHeight, self.thumbnailWidth)))) return np.array(saveMe) @@ -1745,7 +1719,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): if self.downsample: # print("Downsampling images") - imgs = bin_data(imgs, self.bin_factor) + imgs = self.bin_data(imgs, self.bin_factor) # print("Flattening images") num_valid_imgs, p, x, y = imgs.shape img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T @@ -1786,7 +1760,6 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): nimg_batch = np.array(nimg_batch).T # print("hstacking") if fullimgs is None: - fullimgs = nimg_batch else: fullimgs = np.hstack((fullimgs, nimg_batch)) @@ -1799,6 +1772,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): class SinglePanelDataRetriever: + from btx.interfaces.ipsana import PsanaInterface def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, thumbnailHeight, thumbnailWidth): self.exp = exp self.det_type = det_type @@ -1806,7 +1780,7 @@ def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, t self.thumbnailHeight = thumbnailHeight self.thumbnailWidth = thumbnailWidth - self.psi = PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi = self.PsanaInterface(exp=exp, run=run, det_type=det_type) self.imageProcessor = imageProcessor diff --git a/scripts/tasks.py b/scripts/tasks.py index 0603a2498..cc25ff692 100755 --- a/scripts/tasks.py +++ b/scripts/tasks.py @@ -539,17 +539,22 @@ def show_sketch(): """ Display Sketch. """ taskdir = os.path.join(setup.root_dir, 'sketch') os.makedirs(taskdir, exist_ok=True) - fd = WrapperFullFD(exp=setup.exp, run=setup.run, det_type=setup.det_type, - start_offset=task.start_offset, num_imgs=task.num_imgs, - writeToHere=task.writeToHere, grabImgSteps=task.grabImgSteps, - num_components=task.num_components, alpha=task.alpha, - rankAdapt=task.rankAdapt, rankAdaptMinError=task.rankAdaptMinError, - downsample=task.downsample, bin_factor=task.bin_factor, - threshold=task.threshold, eluThreshold=task.eluThreshold, - eluAlpha=task.eluAlpha, normalizeIntensity=task.normalizeIntensity, - noZeroIntensity=task.noZeroIntensity, minIntensity=task.minIntensity, - samplingFactor=task.samplingFactor, divBy=task.divBy, - thresholdQuantile=task.thresholdQuantile) - logger.info(f'Display Sketch for run {setup.run} of {setup.exp}...') - fd.visualizeMe() + visMe = visualizeFD(inputFile=taskdir + "{}_ProjectedData".format(setup.run), + outputFile="./UMAPVis_{}.html".format(setup.run), + numImgsToUse=task.num_imgs, + nprocs=task.nprocs, + userGroupings=[], + includeABOD=True, + skipSize=task.skip_size, + # umap_n_neighbors=numImgsToUse//40, + umap_n_neighbors=task.num_imgs_to_use // 4000, + umap_random_state=42, + hdbscan_min_samples=int(task.num_imgs_to_use * 0.75 // 40), + hdbscan_min_cluster_size=int(task.num_imgs_to_use // 40), + optics_min_samples=150, optics_xi=0.05, optics_min_cluster_size=0.05, + outlierQuantile=0.3) + # print("here 2") + visMe.fullVisualize() + # print("here 3") + visMe.userSave() logger.debug('Done!') From 704a76e6abeb7e2e8f34c4b63d66d0265a4786e0 Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Wed, 4 Oct 2023 10:37:55 -0700 Subject: [PATCH 45/57] Drafting FD tasks. 
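For reference, the new parse_input()/main() entry point turns freqdir.py into a standalone script that the show_sketch task assembles a command for and submits through JobScheduler. A minimal sketch of such an invocation; the experiment, run, detector, and output-directory values below are placeholders, and only the flags defined in parse_input() are assumed to exist:

    python btx/processing/freqdir.py -e mfxp23120 -r 20 -d epix10k2M -o /path/to/sketch/ \
        --num_imgs=10000 --nprocs=8 --skip_size=1 --num_imgs_to_use=10000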
--- btx/interfaces/ischeduler.py | 2 ++ btx/processing/freqdir.py | 40 ++++++++++++++++++++++++++++++++++++ scripts/tasks.py | 39 ++++++++++++++++++----------------- 3 files changed, 62 insertions(+), 19 deletions(-) diff --git a/btx/interfaces/ischeduler.py b/btx/interfaces/ischeduler.py index 3d771c8e3..505eaa18b 100644 --- a/btx/interfaces/ischeduler.py +++ b/btx/interfaces/ischeduler.py @@ -118,6 +118,8 @@ def _write_dependencies(self, dependencies): if "xgandalf" in dependencies: dep_paths += "export PATH=/reg/g/cfel/crystfel/indexers/xgandalf/include/:$PATH\n" dep_paths += "export PATH=/reg/g/cfel/crystfel/indexers/xgandalf/include/eigen3/Eigen/:$PATH" + if "fdviz" in dependencies: + dep_paths += f"conda activate /sdf/group/lcls/ds/tools/conda_envs/johnw-ana-4.0.48-py3" dep_paths += "\n" with open(self.jobfile, 'a') as jfile: diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 40f388b88..ab21c6279 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1893,3 +1893,43 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): return (fullimgs, fullthumbnails, imgsTracked) else: return (fullimgs, imgsTracked) + +def main(): + """ + Perform Frequent Direction Visualization. + """ + params = parse_input() + os.makedirs(os.path.join(params.outdir, "figs"), exist_ok=True) + visMe = visualizeFD(inputFile=params.outdir + f"{params.run:04}_ProjectedData", + outputFile=params.outdir + f"figs/UMAPVis_{params.run:04}.html", + numImgsToUse=params.num_imgs, + nprocs=params.nprocs, + userGroupings=[], + includeABOD=True, + skipSize=params.skip_size, + umap_n_neighbors=params.num_imgs_to_use // 4000, + umap_random_state=42, + hdbscan_min_samples=int(params.num_imgs_to_use * 0.75 // 40), + hdbscan_min_cluster_size=int(params.num_imgs_to_use // 40), + optics_min_samples=150, optics_xi=0.05, optics_min_cluster_size=0.05, + outlierQuantile=0.3) + visMe.fullVisualize() + visMe.userSave() +def parse_input(): + """ + Parse command line input. + """ + parser = argparse.ArgumentParser() + parser.add_argument('-e', '--exp', help='Experiment name', required=True, type=str) + parser.add_argument('-r', '--run', help='Run number', required=True, type=int) + parser.add_argument('-d', '--det_type', help='Detector name, e.g epix10k2M or jungfrau4M', required=True, type=str) + parser.add_argument('-o', '--outdir', help='Output directory for powders and plots', required=True, type=str) + parser.add_argument('--num_imgs', help='Number of images to process, -1 for full run', required=False, default=-1, type=int) + parser.add_argument('--nprocs', help='Number of cores used for upstream analysis', required=False, type=int) + parser.add_argument('--skip_size', help='Skip size', required=False, type=int) + parser.add_argument('--num_imgs_to_use', help="Number of images to use", required=False, type=int) + + return parser.parse_args() + +if __name__ == '__main__': + main() diff --git a/scripts/tasks.py b/scripts/tasks.py index cc25ff692..2f5c12328 100755 --- a/scripts/tasks.py +++ b/scripts/tasks.py @@ -533,28 +533,29 @@ def draw_sketch(config): logger.debug('Done!') def show_sketch(): - from btx.processing.freqdir import WrapperFullFD + from btx.interfaces.ischeduler import JobScheduler setup = config.setup task = config.show_sketch """ Display Sketch. 
""" taskdir = os.path.join(setup.root_dir, 'sketch') os.makedirs(taskdir, exist_ok=True) - visMe = visualizeFD(inputFile=taskdir + "{}_ProjectedData".format(setup.run), - outputFile="./UMAPVis_{}.html".format(setup.run), - numImgsToUse=task.num_imgs, - nprocs=task.nprocs, - userGroupings=[], - includeABOD=True, - skipSize=task.skip_size, - # umap_n_neighbors=numImgsToUse//40, - umap_n_neighbors=task.num_imgs_to_use // 4000, - umap_random_state=42, - hdbscan_min_samples=int(task.num_imgs_to_use * 0.75 // 40), - hdbscan_min_cluster_size=int(task.num_imgs_to_use // 40), - optics_min_samples=150, optics_xi=0.05, optics_min_cluster_size=0.05, - outlierQuantile=0.3) - # print("here 2") - visMe.fullVisualize() - # print("here 3") - visMe.userSave() + script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../btx/processing/freqdir.py") + command = f"python {script_path}" + command += f" -e {setup.exp} -r {setup.run} -d {setup.det_type} -o {taskdir}" + if task.get('num_imgs') is not None: + command += f" --num_imgs={task.num_imgs}" + if task.get('nprocs') is not None: + command += f" --nprocs={task.nprocs}" + if task.get('skip_size') is not None: + command += f" --skip_size={task.skip_size}" + if task.get('num_imgs_to_use') is not None: + command += f" --num_imgs_to_use={task.num_imgs_to_use}" + js = JobScheduler(os.path.join(".", f'fd_{setup.run:04}.sh'), + queue=setup.queue, + ncores=task.ncores, + jobname=f'fd_{setup.run:04}') + js.write_header() + js.write_main(f"{command}\n", dependencies=['psana','fdviz']) + js.clean_up() + js.submit() logger.debug('Done!') From b4a7793825d179888581e8a5031fa9ee1b25358a Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Wed, 4 Oct 2023 22:11:02 -0700 Subject: [PATCH 46/57] Fixed bug in importing btx and other libraries. Minor other changes I think. 
--- btx/processing/freqdir.py | 46 ++++++++++++++++++++------------------- scripts/tasks.py | 1 + 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index ab21c6279..099f6efa3 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -819,6 +819,8 @@ class visualizeFD: """ Visualize FD Dimension Reduction using UMAP and DBSCAN """ + umap = __import__('umap') + hdbscan = __import__('hdbscan') def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, userGroupings, skipSize, umap_n_neighbors, umap_random_state, hdbscan_min_samples, hdbscan_min_cluster_size, optics_min_samples, optics_xi, optics_min_cluster_size, outlierQuantile): @@ -953,8 +955,6 @@ def genLeftRight(self, endClass): return [*range(endClass+1)], [*range(1, endClass+2)] def genUMAP(self): - import umap - import hdbscan imgs = None projections = None @@ -967,8 +967,6 @@ def genUMAP(self): imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) - print("AOIDWJOIAWDJ", len(imgs), len(projections)) - intensities = [] for img in imgs: intensities.append(np.sum(img.flatten())) @@ -983,7 +981,7 @@ def genUMAP(self): if len(self.imgs)!= self.numImgsToUse: raise TypeError("NUMBER OF IMAGES REQUESTED ({}) EXCEEDS NUMBER OF DATA POINTS PROVIDED ({})".format(len(self.imgs), self.numImgsToUse)) - self.clusterable_embedding = umap.UMAP( + self.clusterable_embedding = self.umap.UMAP( n_neighbors=self.umap_n_neighbors, random_state=self.umap_random_state, n_components=2, @@ -991,7 +989,7 @@ def genUMAP(self): min_dist=0.1, ).fit_transform(self.projections) - self.labels = hdbscan.HDBSCAN( + self.labels = self.hdbscan.HDBSCAN( min_samples = self.hdbscan_min_samples, min_cluster_size = self.hdbscan_min_cluster_size ).fit_predict(self.clusterable_embedding) @@ -1320,7 +1318,8 @@ class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. 
""" - from btx.interfaces.ipsana import PsanaInterface +# from btx.interfaces.ipsana import PsanaInterface + btx = __import__('btx') def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, usePSI=True): self.start_offset = start_offset self.num_imgs = num_imgs @@ -1353,7 +1352,7 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.usePSI = usePSI if usePSI: - self.psi = self.PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi = self.btx.interfaces.ipsana.PsanaInterface(exp=exp, run=run, det_type=det_type) self.psi.counter = self.start_offset + self.num_imgs*self.rank//self.size else: self.psi = None @@ -1365,7 +1364,8 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.currRun = self.comm.bcast(self.currRun, root=0) self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile) - self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 150, thumbnailWidth = 150) +# self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) + self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) # def lowMemoryReconstructionErrorScaled(self, matrixCentered, matSketch): # """ @@ -1624,12 +1624,13 @@ def normalizeIntensityFunc(self, img, currIntensity): class DataRetriever: - from btx.interfaces.ipsana import ( - PsanaInterface, - bin_data, - retrieve_pixel_index_map, - assemble_image_stack_batch, - ) + btx = __import__('btx') +# from btx.interfaces.ipsana import ( +# PsanaInterface, +# bin_data, +# retrieve_pixel_index_map, +# assemble_image_stack_batch, +# ) def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, thumbnailHeight, thumbnailWidth): self.exp = exp self.det_type = det_type @@ -1639,7 +1640,8 @@ def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, t self.thumbnailHeight = thumbnailHeight self.thumbnailWidth = thumbnailWidth - self.psi = self.PsanaInterface(exp=exp, run=run, det_type=det_type, no_cmod=True) +# self.psi = self.PsanaInterface(exp=exp, run=run, det_type=det_type, no_cmod=True) + self.psi = self.btx.interfaces.ipsana.PsanaInterface(exp=exp, run=run, det_type=det_type) self.imageProcessor = imageProcessor @@ -1652,12 +1654,12 @@ def assembleImgsToSave(self, imgs): imgs: ndarray images to downsample """ - pixel_index_map = self.retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) + pixel_index_map = self.btx.interfaces.ipsana.retrieve_pixel_index_map(self.psi.det.geometry(self.psi.run)) saveMe = [] for img in imgs: imgRe = np.reshape(img, self.psi.det.shape()) - imgRe = self.assemble_image_stack_batch(imgRe, pixel_index_map) + imgRe = 
self.btx.interfaces.ipsana.assemble_image_stack_batch(imgRe, pixel_index_map) saveMe.append(np.array(Image.fromarray(imgRe).resize((self.thumbnailHeight, self.thumbnailWidth)))) return np.array(saveMe) @@ -1719,7 +1721,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): if self.downsample: # print("Downsampling images") - imgs = self.bin_data(imgs, self.bin_factor) + imgs = self.btx.interfaces.ipsana.bin_data(imgs, self.bin_factor) # print("Flattening images") num_valid_imgs, p, x, y = imgs.shape img_batch = np.reshape(imgs, (num_valid_imgs, p * x * y)).T @@ -1772,7 +1774,8 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): class SinglePanelDataRetriever: - from btx.interfaces.ipsana import PsanaInterface +# from btx.interfaces.ipsana import PsanaInterface + btx = __import__('btx') def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, thumbnailHeight, thumbnailWidth): self.exp = exp self.det_type = det_type @@ -1780,7 +1783,7 @@ def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, t self.thumbnailHeight = thumbnailHeight self.thumbnailWidth = thumbnailWidth - self.psi = self.PsanaInterface(exp=exp, run=run, det_type=det_type) + self.psi = self.btx.interfaces.ipsana.PsanaInterface(exp=exp, run=run, det_type=det_type) self.imageProcessor = imageProcessor @@ -1841,7 +1844,6 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): for img in imgs: saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) thumbnails = np.array(saveMe) - print("thumbaaowdijaoiajw", len(imgs), len(thumbnails)) num_valid_imgs, x, y = imgs.shape img_batch = np.reshape(imgs, (num_valid_imgs, x * y)).T diff --git a/scripts/tasks.py b/scripts/tasks.py index 2f5c12328..3239c02ca 100755 --- a/scripts/tasks.py +++ b/scripts/tasks.py @@ -529,6 +529,7 @@ def draw_sketch(config): samplingFactor=task.samplingFactor, divBy=task.divBy, thresholdQuantile=task.thresholdQuantile) logger.info(f'Performing Frequent Direction Sketching for run {setup.run} of {setup.exp}...') + fd.retrieveImages() fd.runMe() logger.debug('Done!') From 04195d8fa695e1c701b54123cf58967ab175151b Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Mon, 16 Oct 2023 22:17:12 -0700 Subject: [PATCH 47/57] Checkpoint --- btx/processing/freqdir.py | 81 +++++++++++++++++++++++++++++++-------- 1 file changed, 64 insertions(+), 17 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 099f6efa3..8dd6a3273 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -148,6 +148,7 @@ def __init__( super().__init__(exp=exp, run=run, det_type=det_type, start_offset=start_offset, num_images=num_imgs, num_components=num_components, batch_size=0, priming=False, downsample=downsample, bin_factor=bin_factor, output_dir=output_dir, psi=psi) + self.num_features,self.num_images = imgData.shape else: self.start_offset = start_offset self.downsample = False @@ -155,7 +156,6 @@ def __init__( self.output_dir = output_dir self.num_components = num_components self.num_features,self.num_images = imgData.shape - print("NUM IMAGES: ", self.num_images) self.task_durations = dict({}) @@ -313,7 +313,6 @@ def rotate(self): self.sketch[self.ell:,:] = 0 self.nextZeroRow = self.ell else: - print(S.shape, self.ell) self.sketch[:ssize,:] = diag(s) @ Vt[:ssize,:] self.sketch[ssize:,:] = 0 self.nextZeroRow = ssize @@ -842,6 +841,7 @@ def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, 
includeABOD, use self.optics_min_cluster_size = optics_min_cluster_size self.outlierQuantile = outlierQuantile + def embeddable_image(self, data): img_data = np.uint8(cm.jet(data/max(data.flatten()))*255) # image = Image.fromarray(img_data, mode='RGBA').resize((75, 75), Image.Resampling.BICUBIC) @@ -972,6 +972,9 @@ def genUMAP(self): intensities.append(np.sum(img.flatten())) intensities = np.array(intensities) + if self.numImgsToUse==-1: + self.numImgsToUse = len(imgs) + self.imgs = imgs[:self.numImgsToUse:self.skipSize] self.projections = projections[:self.numImgsToUse:self.skipSize] self.intensities = intensities[:self.numImgsToUse:self.skipSize] @@ -998,13 +1001,13 @@ def genUMAP(self): self.opticsClust = OPTICS(min_samples=self.optics_min_samples, xi=self.optics_xi, min_cluster_size=self.optics_min_cluster_size) self.opticsClust.fit(self.clusterable_embedding) - self.opticsLabels = cluster_optics_dbscan( - reachability=self.opticsClust.reachability_, - core_distances=self.opticsClust.core_distances_, - ordering=self.opticsClust.ordering_, - eps=2.5, - ) -# self.opticsLabels = self.opticsClust.labels_ +# self.opticsLabels = cluster_optics_dbscan( +# reachability=self.opticsClust.reachability_, +# core_distances=self.opticsClust.core_distances_, +# ordering=self.opticsClust.ordering_, +# eps=2.5, +# ) + self.opticsLabels = self.opticsClust.labels_ self.experData_df = pd.DataFrame({'x':self.clusterable_embedding[self.clustered, 0],'y':self.clusterable_embedding[self.clustered, 1]}) self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) @@ -1320,7 +1323,7 @@ class WrapperFullFD: """ # from btx.interfaces.ipsana import PsanaInterface btx = __import__('btx') - def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, usePSI=True): + def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, unitVar, usePSI=True): self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1342,6 +1345,7 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.samplingFactor=samplingFactor self.divBy = divBy self.thresholdQuantile = thresholdQuantile + self.unitVar = unitVar self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() @@ -1363,9 +1367,9 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.currRun = None self.currRun = self.comm.bcast(self.currRun, root=0) - self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile) -# self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) - self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, 
thumbnailHeight = 64, thumbnailWidth = 64) + self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar) + self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) +# self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) # def lowMemoryReconstructionErrorScaled(self, matrixCentered, matSketch): # """ @@ -1567,7 +1571,7 @@ def addThumbnailsToProjectH5(self): self.comm.barrier() class FD_ImageProcessing: - def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile): + def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile, unitVar): self.threshold = threshold self.eluThreshold = eluThreshold self.eluAlpha = eluAlpha @@ -1575,6 +1579,7 @@ def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalize self.normalizeIntensity = normalizeIntensity self.minIntensity = minIntensity self.thresholdQuantile = thresholdQuantile + self.unitVar = unitVar def processImg(self, nimg, currIntensity): if self.threshold: @@ -1585,6 +1590,8 @@ def processImg(self, nimg, currIntensity): nimg = self.removeZeroIntensityFunc(nimg, currIntensity) if self.normalizeIntensity: nimg = self.normalizeIntensityFunc(nimg, currIntensity) + if self.unitVar: + nimg = self.unitVarFunc(nimg, currIntensity) return nimg def elu(self,x): @@ -1622,6 +1629,35 @@ def normalizeIntensityFunc(self, img, currIntensity): else: return img/np.sum(img.flatten(), dtype=np.double) + def unitVarFunc(self, img, currIntensity): + if img is None or currIntensity Date: Fri, 20 Oct 2023 11:38:16 -0700 Subject: [PATCH 48/57] added some logger INFO to ischeduler --- btx/interfaces/ischeduler.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/btx/interfaces/ischeduler.py b/btx/interfaces/ischeduler.py index 505eaa18b..0dc027c21 100644 --- a/btx/interfaces/ischeduler.py +++ b/btx/interfaces/ischeduler.py @@ -124,8 +124,10 @@ def _write_dependencies(self, dependencies): with open(self.jobfile, 'a') as jfile: jfile.write(dep_paths) + logger.info(dep_paths) if 'SIT_PSDM_DATA' in os.environ: jfile.write(f"export SIT_PSDM_DATA={os.environ['SIT_PSDM_DATA']}\n") + logger.info(f"export SIT_PSDM_DATA={os.environ['SIT_PSDM_DATA']}\n") def write_main(self, application, dependencies=[]): """ Write application and source requested dependencies. """ @@ -135,6 +137,7 @@ def write_main(self, application, dependencies=[]): pythonpath = self._find_python_path() with open(self.jobfile, 'a') as jfile: jfile.write(application.replace("python", pythonpath)) + logger.info(application.replace("python", pythonpath)) def submit(self): """ Submit to queue. 
""" @@ -145,3 +148,4 @@ def clean_up(self): """ Add a line to delete submission file.""" with open(self.jobfile, 'a') as jfile: jfile.write(f"if [ -f {self.jobfile} ]; then rm -f {self.jobfile}; fi") + logger.info(f"if [ -f {self.jobfile} ]; then rm -f {self.jobfile}; fi") From c2b7ea55a24bf01b855c02d0add5db9430e95b24 Mon Sep 17 00:00:00 2001 From: fredericpoitevin Date: Fri, 20 Oct 2023 11:53:28 -0700 Subject: [PATCH 49/57] when a conda environment is activated in write_dependencies (in ischeduler), the pythonpath needs to be given as well, otherwise python or mpirun defaults to the original environment. --- btx/interfaces/ischeduler.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/btx/interfaces/ischeduler.py b/btx/interfaces/ischeduler.py index 0dc027c21..fd5e6d094 100644 --- a/btx/interfaces/ischeduler.py +++ b/btx/interfaces/ischeduler.py @@ -41,11 +41,15 @@ def _data_systems_management(self): self.ana_conda_manage = f'{self.ana_conda_dir}conda1/manage/bin/' self.ana_conda_bin = f'{self.ana_conda_dir}conda1/inst/envs/ana-4.0.47-py3/bin/' + self.pythonpath = None def _find_python_path(self): """ Determine the relevant python path. """ pythonpath=None - possible_paths = [f"{self.ana_conda_bin}python"] + if self.pythonpath is None: + possible_paths = [f"{self.ana_conda_bin}python"] + else: + possible_paths = [f"{self.pythonpath}"] try: pythonpath = os.environ['WHICHPYTHON'] @@ -120,6 +124,7 @@ def _write_dependencies(self, dependencies): dep_paths += "export PATH=/reg/g/cfel/crystfel/indexers/xgandalf/include/eigen3/Eigen/:$PATH" if "fdviz" in dependencies: dep_paths += f"conda activate /sdf/group/lcls/ds/tools/conda_envs/johnw-ana-4.0.48-py3" + self.pythonpath = "/sdf/group/lcls/ds/tools/conda_envs/johnw-ana-4.0.48-py3/bin/python" dep_paths += "\n" with open(self.jobfile, 'a') as jfile: From 5de126856a5e37430b66897fa1629307c9cda002 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Wed, 8 Nov 2023 09:53:39 -0800 Subject: [PATCH 50/57] Some of these settings produced good visualization. I think you have to re-enable ROI and the things in the settings (I think no threshold, but throw away zeros and apply unit variance and possibly normalization). 
--- btx/processing/freqdir.py | 84 +++++++++++++++++++++++++++++++-------- scripts/tasks.py | 4 +- 2 files changed, 70 insertions(+), 18 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 8dd6a3273..8eaad726f 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -47,9 +47,11 @@ import matplotlib as mpl from matplotlib import cm +from bokeh.transform import linear_cmap +from bokeh.util.hex import hexbin from bokeh.plotting import figure, show, output_file, save from bokeh.models import HoverTool, CategoricalColorMapper, LinearColorMapper, ColumnDataSource, CustomJS, Slider, RangeSlider, Toggle, RadioButtonGroup, Range1d, Label -from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3 +from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3, Plasma256, Plasma11 from bokeh.layouts import column, row import cProfile @@ -958,14 +960,22 @@ def genUMAP(self): imgs = None projections = None + trueIntensities = None for currRank in range(self.nprocs): with h5py.File(self.inputFile+"_"+str(currRank)+".h5", 'r') as hf: if imgs is None: imgs = hf["SmallImages"][:] projections = hf["ProjectedData"][:] + trueIntensities = hf["TrueIntensities"][:] else: imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) + trueIntensities = np.concatenate((trueIntensities, hf["TrueIntensities"][:]), axis=0) + + for intensMe in trueIntensities: + print(intensMe) + if(np.isnan(intensMe)): + print("This is NAN") intensities = [] for img in imgs: @@ -1013,6 +1023,13 @@ def genUMAP(self): self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) self.experData_df['imgind'] = np.arange(self.numImgsToUse)*self.skipSize +# self.experData_df['trueIntensities'] = [str(int(math.abs(x)/max(np.abs(trueIntensities))*19)) for x in trueIntensities] + self.experData_df['trueIntensities'] = [5 for x in trueIntensities] +# self.experData_df['trueIntensities_backgroundColor'] = [Plasma256[int(math.abs(x)/max(np.abs(trueIntensities))*255)] for x in trueIntensities] + self.experData_df['trueIntensities_backgroundColor'] = [5 for x in trueIntensities] + print("aowdijaoidjwaoij", len(self.experData_df['trueIntensities']), self.experData_df['trueIntensities'], type(self.experData_df['trueIntensities'])) + print(trueIntensities) + def genABOD(self): if self.includeABOD: abod = self.fastABOD(self.projections, 10) @@ -1029,6 +1046,8 @@ def genABOD(self): self.experData_df['anomDet'] = outlierLabels self.experData_df['anom_backgroundColor'] = [Category20[20][int(x)] for x in outlierLabels] + print("2adwjiaomd", len(self.experData_df['anomDet']), self.experData_df['anomDet'], type(self.experData_df['anomDet'])) + def setUserGroupings(self, userGroupings): """ Set User Grouping. 
An adjustment is made at the beginning of this function, @@ -1079,12 +1098,19 @@ def genLabels(self): def genHTML(self): datasource = ColumnDataSource(self.experData_df) - color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) + #JOHN CHANGE 20231020 +# color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) + color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Plasma256[::16]) plot_figure = figure( title='UMAP projection with DBSCAN clustering of the LCLS dataset', tools=('pan, wheel_zoom, reset'), width = 2000, height = 600 ) + +# bins = hexbin(self.clusterable_embedding[self.clustered, 0], self.clusterable_embedding[self.clustered, 1], 0.5) +# plot_figure.hex_tile(q="q", r="r", size=0.5, line_color=None, source=bins, +# fill_color=linear_cmap('counts', 'Viridis256', 0, max(bins.counts))) + plot_figure.add_tools(HoverTool(tooltips="""
@@ -1171,12 +1197,14 @@ def genHTML(self): const cluster = datasource.data.cluster const ptColor = datasource.data.ptColor const anomDet = datasource.data.anomDet + const trueIntensities = datasource.data.trueIntensities const imgind = datasource.data.imgind const backgroundColor = datasource.data.backgroundColor const dbscan_backgroundColor = datasource.data.dbscan_backgroundColor const anom_backgroundColor = datasource.data.anom_backgroundColor const optics_backgroundColor = datasource.data.optics_backgroundColor - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor, dbscan_backgroundColor, anom_backgroundColor, optics_backgroundColor} + const trueIntensities_backgroundColor = datasource.data.trueIntensities_backgroundColor + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, trueIntensities, imgind, backgroundColor, dbscan_backgroundColor, anom_backgroundColor, optics_backgroundColor, trueIntensities_backgroundColor} """) cols.js_on_change('value', callback) @@ -1257,7 +1285,7 @@ def genHTML(self): reachabilityDiag.line([0, len(opticsData_df['ptColor'])], [2, 2], line_width=2, color="black", line_dash="dashed") reachabilityDiag.y_range = Range1d(-1, 10) - LABELS = ["DBSCAN Clustering", "OPTICS Clustering", "Anomaly Detection"] + LABELS = ["DBSCAN Clustering", "OPTICS Clustering", "Anomaly Detection", "True Intensity"] radio_button_group = RadioButtonGroup(labels=LABELS, active=0) radioGroup_js = CustomJS(args=dict(datasource=datasource, opticssource=opticssource), code=""" const x = datasource.data.x @@ -1266,10 +1294,12 @@ def genHTML(self): const medoidBold = datasource.data.medoidBold const cluster = datasource.data.cluster const anomDet = datasource.data.anomDet + const trueIntensities = datasource.data.trueIntensities const imgind = datasource.data.imgind const dbscan_backgroundColor = datasource.data.dbscan_backgroundColor const anom_backgroundColor = datasource.data.anom_backgroundColor const optics_backgroundColor = datasource.data.optics_backgroundColor + const trueIntensities_backgroundColor = datasource.data.trueIntensities_backgroundColor const opticsClust = opticssource.data.clusterForScatterPlot @@ -1284,21 +1314,29 @@ def genHTML(self): ptColor = opticsClust backgroundColor = optics_backgroundColor } - else{ + else if (cb_obj.active==2) { ptColor = anomDet backgroundColor = anom_backgroundColor } - datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, imgind, backgroundColor, dbscan_backgroundColor, anom_backgroundColor, optics_backgroundColor} + else { + ptColor = trueIntensities + backgroundColor = trueIntensities_backgroundColor + } + datasource.data = { x, y, image, cluster, medoidBold, ptColor, anomDet, trueIntensities, imgind, backgroundColor, dbscan_backgroundColor, anom_backgroundColor, optics_backgroundColor, trueIntensities_backgroundColor} """) radio_button_group.js_on_change("active", radioGroup_js) self.viewResults = column(plot_figure, p, imgsPlot, row(cols, toggl, radio_button_group), reachabilityDiag) + def genCSV(self): + self.experData_df.to_csv(self.outputFile[:-4]+"csv") + def fullVisualize(self): self.genUMAP() self.genABOD() self.genLabels() self.genHTML() + self.genCSV() def updateLabels(self): self.genLabels() @@ -1323,7 +1361,7 @@ class WrapperFullFD: """ # from btx.interfaces.ipsana import PsanaInterface btx = __import__('btx') - def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, 
rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, unitVar, usePSI=True): + def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, unitVar=False, usePSI=True): self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1563,10 +1601,11 @@ def runMe(self): def addThumbnailsToProjectH5(self): # print("Gathering thumbnails") startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - _,self.fullThumbnailData,_ = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) + _,self.fullThumbnailData,_,self.trueIntensitiesData = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) file_name = self.writeToHere+"{}_ProjectedData_{}.h5".format(self.currRun, self.rank) f1 = h5py.File(file_name, 'r+') f1.create_dataset("SmallImages", data=self.fullThumbnailData) + f1.create_dataset("TrueIntensities", data=np.array(self.trueIntensitiesData)) f1.close() self.comm.barrier() @@ -1861,7 +1900,8 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): fullthumbnails = None imgsTracked = [] runs = self.split_range(startInd, startInd+n, num_steps) - print(runs) + print(runs) + trueIntensities = [] for runStart, runEnd in runs: # print("RETRIEVING: [", runStart, ":", runEnd,"]") self.psi.counter = runStart @@ -1875,10 +1915,19 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] ] - jimgs = [] - for img in imgs: - jimgs.append(self.imageProcessor.centerImgFunc(self.imageProcessor.thresholdFunc(img),100,100)) - imgs = np.array(jimgs) + origTrueIntensities = [np.sum(img.flatten(), dtype=np.double) for img in imgs] + newTrueIntensities = [] + for j in origTrueIntensities: + if j>0: + newTrueIntensities.append(0) + else: + newTrueIntensities.append(np.log(j)) + origTrueIntensities = newTrueIntensities + +# jimgs = [] +# for img in imgs: +# jimgs.append(self.imageProcessor.centerImgFunc(self.imageProcessor.thresholdFunc(img),100,100)) +# imgs = np.array(jimgs) if getThumbnails: saveMe = [] @@ -1899,13 +1948,15 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): if getThumbnails: nimg_batch = [] nthumbnail_batch = [] - for img, thumbnail in zip(img_batch.T, thumbnail_batch.T): + ntrueIntensity_batch = [] + for img, thumbnail, trueIntens in zip(img_batch.T, thumbnail_batch.T, origTrueIntensities): currIntensity = np.sum(img.flatten(), dtype=np.double) nimg = self.imageProcessor.processImg(img, currIntensity) nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) if nimg is not None: nimg_batch.append(nimg) nthumbnail_batch.append(nthumbnail) + ntrueIntensity_batch.append(trueIntens) else: num_valid_thumbnails -= 1 num_valid_imgs -= 1 @@ -1917,6 +1968,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): elif len(nimg_batch)!=0: fullimgs = np.hstack((fullimgs, nimg_batch)) fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) + trueIntensities += ntrueIntensity_batch else: nimg_batch = [] for 
img in img_batch.T: @@ -1939,7 +1991,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): # print("Images tracked:", imgsTracked) if getThumbnails: print(fullimgs.shape, fullthumbnails.shape, imgsTracked) - return (fullimgs, fullthumbnails, imgsTracked) + return (fullimgs, fullthumbnails, imgsTracked, trueIntensities) else: return (fullimgs, imgsTracked) @@ -1949,7 +2001,7 @@ def main(): """ params = parse_input() os.makedirs(os.path.join(params.outdir, "figs"), exist_ok=True) - visMe = visualizeFD(inputFile=params.outdir + f"{params.run:04}_ProjectedData", + visMe = visualizeFD(inputFile=params.outdir + f"/{params.run:04}_ProjectedData", outputFile=params.outdir + f"figs/UMAPVis_{params.run:04}.html", numImgsToUse=params.num_imgs, nprocs=params.nprocs, diff --git a/scripts/tasks.py b/scripts/tasks.py index 3239c02ca..1708a7548 100755 --- a/scripts/tasks.py +++ b/scripts/tasks.py @@ -533,7 +533,7 @@ def draw_sketch(config): fd.runMe() logger.debug('Done!') -def show_sketch(): +def show_sketch(config): from btx.interfaces.ischeduler import JobScheduler setup = config.setup task = config.show_sketch @@ -553,7 +553,7 @@ def show_sketch(): command += f" --num_imgs_to_use={task.num_imgs_to_use}" js = JobScheduler(os.path.join(".", f'fd_{setup.run:04}.sh'), queue=setup.queue, - ncores=task.ncores, + ncores=task.nprocs, jobname=f'fd_{setup.run:04}') js.write_header() js.write_main(f"{command}\n", dependencies=['psana','fdviz']) From 64cabbf4fc5972bb45484fa1e504f6e2d0f15997 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Fri, 29 Dec 2023 15:09:01 -0800 Subject: [PATCH 51/57] Everything is working. It runs and produces beam profiles using scaling script and run and single panel stuff. --- btx/processing/freqdir.py | 548 ++++++++++++++++++++++++++++++++++---- 1 file changed, 495 insertions(+), 53 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 8eaad726f..bbf3a13be 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -23,15 +23,21 @@ from matplotlib import pyplot as plt from matplotlib import colors -# from btx.misc.shortcuts import TaskTimer -# -# from btx.interfaces.ipsana import ( -# PsanaInterface, -# bin_data, -# bin_pixel_index_map, -# retrieve_pixel_index_map, -# assemble_image_stack_batch, -# ) +########################## +########################## +#JOHN CHANGE BACK AFTER 12/15/2023 +from btx.misc.shortcuts import TaskTimer + +from btx.interfaces.ipsana import ( + PsanaInterface, + bin_data, + bin_pixel_index_map, + retrieve_pixel_index_map, + assemble_image_stack_batch, +) +########################## +########################## + from PIL import Image from io import BytesIO @@ -51,12 +57,15 @@ from bokeh.util.hex import hexbin from bokeh.plotting import figure, show, output_file, save from bokeh.models import HoverTool, CategoricalColorMapper, LinearColorMapper, ColumnDataSource, CustomJS, Slider, RangeSlider, Toggle, RadioButtonGroup, Range1d, Label +from bokeh.models import CustomJS, ColumnDataSource, Span, PreText from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3, Plasma256, Plasma11 from bokeh.layouts import column, row import cProfile import string +import cv2 + class FreqDir(DimRed): """ @@ -816,6 +825,7 @@ def update(self, vec): self.sketch.push(vec, pi, wi) + class visualizeFD: """ Visualize FD Dimension Reduction using UMAP and DBSCAN @@ -844,6 +854,278 @@ def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, use self.outlierQuantile = 
outlierQuantile + def retrieveCircularity(self, fullThumbnailData): + + def rotate_image(image, angle, center=None, scale=1.0): + (h, w) = image.shape[:2] + if center is None: + center = (w // 2, h // 2) + M = cv2.getRotationMatrix2D(center, angle, scale) + rotated = cv2.warpAffine(image, M, (w, h)) + return rotated + + def compute_properties(M): + # Calculate centroid + cx = int(M["m10"] / M["m00"]) + cy = int(M["m01"] / M["m00"]) + + # Calculate orientation + mu20 = M["mu20"] / M["m00"] + mu02 = M["mu02"] / M["m00"] + mu11 = M["mu11"] / M["m00"] + theta = 0.5 * np.arctan2(2 * mu11, mu20 - mu02) + + # Calculate eccentricity + a = 2 * np.sqrt(mu20 + mu02 + np.sqrt(4 * mu11**2 + (mu20 - mu02)**2)) + b = 2 * np.sqrt(mu20 + mu02 - np.sqrt(4 * mu11**2 + (mu20 - mu02)**2)) + eccentricity = np.sqrt(1 - (b / a) ** 2) + + return cx, cy, theta, eccentricity + + def reorientImg(nimg): + M = cv2.moments(nimg) + cx, cy , theta, eccentricity = compute_properties(M) + return rotate_image(nimg, angle=theta*180/math.pi) + + + def denoiseImg(image): + # Threshold the image to get a binary image + _, binary_image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY) + + # Perform connected component labeling + num_labels, labels_im = cv2.connectedComponents(binary_image) + + # Create a mask for components larger than the size threshold + mask = np.zeros_like(image, dtype=bool) + + size_threshold = 500 + + # Iterate through components and update the mask for large components + for label in range(1, num_labels): + if np.sum(labels_im == label) > size_threshold: + mask[labels_im == label] = True + + # Apply the mask to the original grayscale image + masked_image = np.zeros_like(image) + masked_image[mask] = image[mask] + return masked_image + + def center_and_crop_beam(image, threshold_value=127, blur_kernel=(5, 5), desired_size=224, min_contour_area=100): + """ + Centers and crops the main intensity pattern in an image. + + Parameters: + image (numpy.ndarray): The input image. + threshold_value (int): Threshold value for binary thresholding. + blur_kernel (tuple): Kernel size for Gaussian blur. + + Returns: + numpy.ndarray: The cropped image centered around the intensity pattern. 
+ """ + # Normalize or scale the image + if image.dtype != np.uint8: + # If the range of your image is known (e.g., 0 to 5), normalize accordingly + # image = ((image - image.min()) / (image.max() - image.min())) * 255 + + # If the range is not known, scale based on the current min and max + image = 255 * (image - image.min()) / (image.max() - image.min()) + image = image.astype(np.uint8) + + # print(image[100]) + + # Ensure the image is in grayscale + if len(image.shape) == 3: + image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + + # Step 1: Noise Reduction + blurred = cv2.GaussianBlur(image, blur_kernel, 0) + # blurred = image + + # Step 2: Thresholding + # _, thresh = cv2.threshold(blurred, threshold_value, 255, cv2.THRESH_BINARY) + _, thresh = cv2.threshold(blurred, 100, 255, cv2.THRESH_BINARY) + + + # # Step 3: Locate the Beam + # contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + # if not contours: + # return None # No contours found + + # # Step 4: Determine the Bounding Box + # beam = max(contours, key=cv2.contourArea) + # x, y, w, h = cv2.boundingRect(beam) + + # # Step 5: Centering and Cropping + # cropped = image[y:y+h, x:x+w] + # print(x, y, w, h) + + # Locate the Beam + contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + # contours = [c for c in contours if cv2.contourArea(c) > min_contour_area] + if not contours: + return None + + # Determine the Bounding Box + beam = max(contours, key=cv2.contourArea) + x, y, w, h = cv2.boundingRect(beam) + + # Find the center of the bounding box + center_x, center_y = x + w // 2, y + h // 2 + + # Define new bounding box dimensions + new_x = max(center_x - desired_size // 2, 0) + new_y = max(center_y - desired_size // 2, 0) + + # print("new x: ", x, new_x) + # print("new y: ", y, new_y) + + # Adjust if the new box extends beyond the original image + new_x = min(new_x, image.shape[1] - desired_size) + new_y = min(new_y, image.shape[0] - desired_size) + + # print("new x: ", x, new_x) + # print("new y: ", y, new_y) + + # Crop the image to the new bounding box + cropped = image[new_y:new_y + desired_size, new_x:new_x + desired_size] + + return cropped + + # threshVal = 100 + + nimgs = [] + nbws = [] + nbws1 = [] + contours = [] + contourImgs = [] + for j in range(len(fullThumbnailData)): + # currImg = (fullThumbnailData[j]*(255/np.max(fullThumbnailData[j]))).astype('uint8').copy() + # nimg = currImg + nimg = center_and_crop_beam(fullThumbnailData[j]) + # nimg = reorientImg(nimg) + if nimg is None: + continue + nimg = reorientImg(nimg) + nimg = denoiseImg(nimg) + nimgs.append(nimg) + # nbws.append(nimg) + # (thresh, im_bw) = cv2.threshold((fullThumbnailData[j]*(255/np.max(fullThumbnailData[j]))).astype('uint8').copy(), 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU) + # print(nimg) + # print(j, np.max(nimg)) + # np.set_printoptions(threshold=np.inf, linewidth=np.inf) + # print(nimg) + + (thresh, im_bw) = cv2.threshold(nimg, 0, 255, cv2.THRESH_BINARY) + nbws.append(im_bw.copy()) + (thresh1, im_bw1) = cv2.threshold(nimg, 0, 1, cv2.THRESH_BINARY) + nbws1.append(im_bw1.copy()) + + # # Assuming 'im' is your grayscale image + # # Apply Gaussian blur to the image + # blurred = cv2.GaussianBlur(im_bw, (5, 5), 0) + # # Apply binary thresholding on the blurred image + # _, binary = cv2.threshold(blurred, 127, 255, cv2.THRESH_BINARY) + # # Find contours + # contourList, hierarchy = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + # # Find the largest contour based on area + # 
largest_contour = max(contourList, key=cv2.contourArea) + # contours.append(largest_contour) + # canvas = np.zeros(im_bw.shape, dtype='uint8') + # # Draw the largest contour in white + # cv2.drawContours(canvas, [largest_contour], -1, (255), 1) + # contourImgs.append(canvas) + + # # nbws.append(cv2.GaussianBlur(nimg, (5, 5), 0)) + # # nbws.append(im_bw) + # # nbws.append((fullThumbnailData[j]*(255/np.max(fullThumbnailData[j]))).astype('uint8').copy() + + # # ind = 356 + + # # plt.imshow(nimgs[ind]) + # # plt.show() + + # # print(nbws1[ind][80]) + + # # Calculate moments + # M = cv2.moments(nbws1[ind]) + # # Zeroth moment is the area + # area = M['m00'] + # epsilon = 0.01 * cv2.arcLength(contours[ind], True) + # approx = cv2.approxPolyDP(contours[ind], epsilon, True) + # # Calculate the perimeter + # perimeter = cv2.arcLength(approx, True) + # # Calculate circularity using moments + # circularity = 4 * 3.14159 * area / (perimeter * perimeter) + # print(circularity) + + # # Calculate moments + # M = cv2.moments(nbws1[ind]) + # ncirc = (M['m00']**2)/(2*math.pi*(M['mu20'] + M['mu02'])) + # print(ncirc) + + circs = [] + ncircs = [] + + for ind in range(len(nbws)): + # # Calculate moments + # M = cv2.moments(nbws1[ind]) + # # Zeroth moment is the area + # area = M['m00'] + # epsilon = 0.01 * cv2.arcLength(contours[ind], True) + # approx = cv2.approxPolyDP(contours[ind], epsilon, True) + # # Calculate the perimeter + # perimeter = cv2.arcLength(approx, True) + # # Calculate circularity using moments + # circularity = 4 * 3.14159 * area / (perimeter * perimeter) + + # Calculate moments + M = cv2.moments(nbws[ind]) + try: + ncirc = (M['m00']**2)/(2*math.pi*(M['mu20'] + M['mu02'])) + except: + ncirc = 1 + + # circs.append(circularity) + ncircs.append(ncirc) + + sorted_indices = np.argsort(ncircs) + sorted_arrays = [nimgs[i] for i in sorted_indices] + sorted_full = [fullThumbnailData[i] for i in sorted_indices] + + # import matplotlib.pyplot as plt + # import numpy as np + + # # Assuming 'images' is your list of 16 NumPy array images + # # For demonstration, creating 16 random 8x8 grayscale images + # images = [j for j in sorted_arrays[::len(sorted_arrays)//16]] + + # # Create a 4x4 grid of subplots + # fig, axs = plt.subplots(4, 4, figsize=(10, 10)) + + # # Flatten the array of axes for easy iteration + # axs = axs.ravel() + + # # Plot each image and add text + # for i in range(16): + # axs[i].imshow(images[i], cmap='jet', vmin=0, vmax=255) # Assuming grayscale images + # axs[i].text(50, 5, f"Image {i+1}", color='white', ha='center', va='center') + # axs[i].axis('off') # Turn off axis + + # plt.tight_layout() # Adjust subplots to fit into the figure area. 
+ # plt.show() + + # ind=23 + # nimg = center_and_crop_beam(fullThumbnailData[40]) + # # plt.imshow(fullThumbnailData[ind]) + # plt.imshow(nimg) + # plt.show() + + bigOrSmall = [1 if j>len(sorted_arrays)*10//16 else 0 for j in sorted_indices] +# np.savez(saveDir+'circularityImgs_{}.npz'.format(currRun), **{f'array_{i}': arr for i, arr in enumerate(nimgs)}, labels=bigOrSmall) + + return ncircs + + def embeddable_image(self, data): img_data = np.uint8(cm.jet(data/max(data.flatten()))*255) # image = Image.fromarray(img_data, mode='RGBA').resize((75, 75), Image.Resampling.BICUBIC) @@ -874,7 +1156,8 @@ def genMedoids(self, medoidLabels, clusterPoints): for test_index, test_point in enumerate(lst): if math.isclose(test_point[0],medoid_point[0]) and math.isclose(test_point[1], medoid_point[1]): fin_ind = test_index - medoid_lst.append((k, v[fin_ind][0])) +# medoid_lst.append((k, v[fin_ind][0])) + medoid_lst.append((k, v[fin_ind+1][0])) return medoid_lst def relabel_to_closest_zero(self, labels): @@ -956,8 +1239,31 @@ def genHist(self, vals, endClass): def genLeftRight(self, endClass): return [*range(endClass+1)], [*range(1, endClass+2)] + + def float_to_int_percentile(self, float_list): + # Edge case: If the list is empty, return an empty list + if not float_list: + return [] + + # Calculate the percentiles that define the bin edges + percentiles = np.percentile(float_list, [10 * i for i in range(1, 10)]) + + # Function to find the bin for a single value + def find_bin(value): + for i, p in enumerate(percentiles): + if value < p: + return i + return 9 # For values in the highest bin + + # Convert each float to an integer based on its bin + int_list = [find_bin(value) for value in float_list] + + return int_list + + def genUMAP(self): + imgs = None projections = None trueIntensities = None @@ -971,6 +1277,7 @@ def genUMAP(self): imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) trueIntensities = np.concatenate((trueIntensities, hf["TrueIntensities"][:]), axis=0) + print(len(imgs)) for intensMe in trueIntensities: print(intensMe) @@ -998,14 +1305,18 @@ def genUMAP(self): n_neighbors=self.umap_n_neighbors, random_state=self.umap_random_state, n_components=2, -# min_dist=0.25, - min_dist=0.1, + min_dist=0, +# min_dist=0.1, ).fit_transform(self.projections) - self.labels = self.hdbscan.HDBSCAN( - min_samples = self.hdbscan_min_samples, - min_cluster_size = self.hdbscan_min_cluster_size - ).fit_predict(self.clusterable_embedding) +# self.labels = self.hdbscan.HDBSCAN( +# min_samples = self.hdbscan_min_samples, +# min_cluster_size = self.hdbscan_min_cluster_size +# ).fit_predict(self.clusterable_embedding) + + ncircs = self.float_to_int_percentile(self.retrieveCircularity(self.imgs)) + self.labels = np.array(ncircs) + exclusionList = np.array([]) self.clustered = np.isin(self.labels, exclusionList, invert=True) @@ -1023,12 +1334,15 @@ def genUMAP(self): self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) self.experData_df['imgind'] = np.arange(self.numImgsToUse)*self.skipSize -# self.experData_df['trueIntensities'] = [str(int(math.abs(x)/max(np.abs(trueIntensities))*19)) for x in trueIntensities] - self.experData_df['trueIntensities'] = [5 for x in trueIntensities] -# self.experData_df['trueIntensities_backgroundColor'] = [Plasma256[int(math.abs(x)/max(np.abs(trueIntensities))*255)] for x in trueIntensities] - self.experData_df['trueIntensities_backgroundColor'] = [5 for x in 
trueIntensities] - print("aowdijaoidjwaoij", len(self.experData_df['trueIntensities']), self.experData_df['trueIntensities'], type(self.experData_df['trueIntensities'])) - print(trueIntensities) +# self.experData_df['trueIntensities'] = [str(int(abs(x)/max(np.abs(trueIntensities))*19)) for x in trueIntensities] +# self.experData_df['trueIntensities'] = [5 for x in trueIntensities] +# self.experData_df['trueIntensities_backgroundColor'] = [Plasma256[int(abs(x)/max(np.abs(trueIntensities))*255)] for x in trueIntensities] +# self.experData_df['trueIntensities_backgroundColor'] = [5 for x in trueIntensities] +# print("aowdijaoidjwaoij", len(self.experData_df['trueIntensities']), self.experData_df['trueIntensities'], type(self.experData_df['trueIntensities'])) +# print(trueIntensities) + self.experData_df['trueIntensities'] = [1 for x in self.experData_df['imgind']] + self.experData_df['trueIntensities_backgroundColor'] = [1 for x in self.experData_df['imgind']] + def genABOD(self): if self.includeABOD: @@ -1103,7 +1417,7 @@ def genHTML(self): color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Plasma256[::16]) plot_figure = figure( title='UMAP projection with DBSCAN clustering of the LCLS dataset', - tools=('pan, wheel_zoom, reset'), + tools=('pan, wheel_zoom, reset, lasso_select'), width = 2000, height = 600 ) @@ -1142,6 +1456,8 @@ def genHTML(self): plot_figure.legend.location = "bottom_right" plot_figure.legend.title = "Clusters" + density_text = PreText(text='Density_Text') + vals = [x for x in self.newLabels] trueSource = ColumnDataSource(data=dict(vals = vals)) hist, maxCount = self.genHist(vals, max(vals)) @@ -1165,7 +1481,7 @@ def genHTML(self): end=self.numImgsToUse, value=(0, self.numImgsToUse-1), step=1, sizing_mode="stretch_width") - callback = CustomJS(args=dict(cols=cols, trueSource = trueSource, + callback = CustomJS(args=dict(cols=cols, trueSource = trueSource, density_text=density_text, histsource = histsource, datasource=datasource, indexCDS=indexCDS), code=""" function countNumbersAtIndices(numbers, startInd, endInd, smallestVal, largestVal) { let counts = new Array(largestVal-smallestVal); for (let i=0; i i + 1); + + console.log(rightVal-leftVal) + var avg = countCommonElementsInWindow(inds, arrayFrom1ToLength, rightVal-leftVal); + density_text.text = avg.toString(); + """)) + + + self.viewResults = column(row(plot_figure, density_text), p, imgsPlot, row(cols, toggl, radio_button_group), reachabilityDiag) def genCSV(self): self.experData_df.to_csv(self.outputFile[:-4]+"csv") @@ -1405,7 +1783,7 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.currRun = None self.currRun = self.comm.bcast(self.currRun, root=0) - self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar) + self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=150, roi_h = 150) self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, 
imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) # self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) @@ -1610,7 +1988,7 @@ def addThumbnailsToProjectH5(self): self.comm.barrier() class FD_ImageProcessing: - def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile, unitVar): + def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile, unitVar, centerImg, roi_w, roi_h): self.threshold = threshold self.eluThreshold = eluThreshold self.eluAlpha = eluAlpha @@ -1619,12 +1997,23 @@ def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalize self.minIntensity = minIntensity self.thresholdQuantile = thresholdQuantile self.unitVar = unitVar + self.centerImg = centerImg + self.roi_w = roi_w + self.roi_h = roi_h - def processImg(self, nimg, currIntensity): + def processImg(self, nimg, ncurrIntensity): if self.threshold: nimg = self.thresholdFunc(nimg) if self.eluThreshold: nimg = self.eluThresholdFunc(nimg) + if self.centerImg: + nimg = self.centerImgFunc(nimg) + + if nimg is not None: + currIntensity = abs(np.sum(nimg.flatten(), dtype=np.double)) + else: + currIntensity = 0 + if self.noZeroIntensity: nimg = self.removeZeroIntensityFunc(nimg, currIntensity) if self.normalizeIntensity: @@ -1664,8 +2053,10 @@ def normalizeIntensityFunc(self, img, currIntensity): if img is None: return img elif currIntensity=1: + curr_roi_w = int(self.roi_w*rampingFact) + curr_roi_h = int(self.roi_h*rampingFact) + nimg = np.pad(img, max(2*curr_roi_w, 2*curr_roi_h)+1) +# print(rampingFact) + if np.sum(img.flatten(), dtype=np.double)<10000: + cogx, cogy = (curr_roi_w, curr_roi_h) + else: + cogx, cogy = self.calcCenterGrav(nimg) + # return nimg[cogy-(roi_h):cogy+(roi_h//2), cogx-(roi_w):cogx+(roi_w//2)] + nimg = nimg[cogx-(curr_roi_w//2):cogx+(curr_roi_w//2), cogy-(curr_roi_h//2):cogy+(curr_roi_h//2)] + rampingFact -= 0.5 + return nimg def calcCenterGrav(self, grid): @@ -1693,6 +2092,7 @@ def calcCenterGrav(self, grid): row_indices, col_indices = np.indices(grid.shape) X_c = np.sum(row_indices * grid) / M_total Y_c = np.sum(col_indices * grid) / M_total +# print(M_total, X_c, Y_c, grid) return (round(X_c), round(Y_c)) @@ -1812,9 +2212,9 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): nimg_batch = [] nthumbnail_batch = [] for img, thumbnail in zip(img_batch.T, thumbnail_batch.T): - currIntensity = np.sum(img.flatten(), dtype=np.double) - nimg = self.imageProcessor.processImg(img, currIntensity) - nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) +# currIntensity = np.sum(img.flatten(), dtype=np.double) + nimg = self.imageProcessor.processImg(img) #JOHN 011/09/2023 + nthumbnail = self.imageProcessor.processImg(thumbnail) if nimg is not None: nimg_batch.append(nimg) nthumbnail_batch.append(nthumbnail) @@ -1829,9 +2229,9 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): else: nimg_batch = [] for img in img_batch.T: - currIntensity = np.sum(img.flatten(), dtype=np.double) +# currIntensity = np.sum(img.flatten(), dtype=np.double) #JOHN 011/09/2023 # print("Starting image processing of size {}".format(img_batch.T.shape)) - nimg = self.imageProcessor.processImg(img, currIntensity) + nimg = self.imageProcessor.processImg(img) if nimg is 
not None: nimg_batch.append(nimg) nimg_batch = np.array(nimg_batch).T @@ -1918,7 +2318,11 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): origTrueIntensities = [np.sum(img.flatten(), dtype=np.double) for img in imgs] newTrueIntensities = [] for j in origTrueIntensities: - if j>0: +# if j>0: +# newTrueIntensities.append(0) +# else: +# newTrueIntensities.append(np.log(j)) + if j<0: newTrueIntensities.append(0) else: newTrueIntensities.append(np.log(j)) @@ -1932,18 +2336,25 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): if getThumbnails: saveMe = [] for img in imgs: - saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) +# saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) #JOHN 011/09/2023 + saveMe.append(np.array(img)) #JOHN 011/09/2023 thumbnails = np.array(saveMe) - num_valid_imgs, x, y = imgs.shape - img_batch = np.reshape(imgs, (num_valid_imgs, x * y)).T + num_valid_imgs, x, y = imgs.shape #JOHN 11/20/2023 + +# img_batch = np.reshape(imgs, (num_valid_imgs, x * y)).T #JOHN 011/09/2023 + img_batch = imgs.T # print("Image values less than 0 setting to 0") img_batch[img_batch<0] = 0 + +# num_valid_imgs, x, y = img_batch.T.shape #JOHN 11/20/2023 +# print(num_valid_imgs, x, y) if getThumbnails: # print("FLattening thumbnails") num_valid_thumbnails, tx, ty = thumbnails.shape - thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T +# thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T #JOHN 011/09/2023 + thumbnail_batch = thumbnails.T if getThumbnails: nimg_batch = [] @@ -1951,17 +2362,36 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): ntrueIntensity_batch = [] for img, thumbnail, trueIntens in zip(img_batch.T, thumbnail_batch.T, origTrueIntensities): currIntensity = np.sum(img.flatten(), dtype=np.double) - nimg = self.imageProcessor.processImg(img, currIntensity) - nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) + if self.imageProcessor.centerImg: + nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) + else: + nimg = self.imageProcessor.processImg(img, currIntensity) +# nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) #JOHN 011/09/2023 + if nimg is None: + nthumbnail = None + else: + nthumbnail = nimg.copy() +# print(np.array(nimg).shape) +# print(nthumbnail) if nimg is not None: nimg_batch.append(nimg) nthumbnail_batch.append(nthumbnail) ntrueIntensity_batch.append(trueIntens) else: +# nimg_batch.append(np.zeros((x, y))) +# nthumbnail_batch.append(np.zeros((tx, ty))) +# ntrueIntensity_batch.append(0) num_valid_thumbnails -= 1 num_valid_imgs -= 1 - nimg_batch = np.array(nimg_batch).T - nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) + if self.imageProcessor.centerImg: #JOHN 011/09/2023 +# print("a09wupoidkw", np.array(nimg_batch).shape) + nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 + nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, self.imageProcessor.roi_h, self.imageProcessor.roi_w) #JOHN 011/09/2023 + else: #JOHN 011/09/2023 +# print("a09wupoidkw", np.array(nimg_batch).shape) +# print(num_valid_imgs, x, y) + nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T #JOHN 011/09/2023 + nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) 
#JOHN 011/09/2023 if fullimgs is None: fullimgs = nimg_batch fullthumbnails = nthumbnail_batch @@ -1974,12 +2404,23 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): for img in img_batch.T: currIntensity = np.sum(img.flatten(), dtype=np.double) # print("Starting image processing of size {}".format(img_batch.T.shape)) - nimg = self.imageProcessor.processImg(img, currIntensity) + nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) if nimg is not None: nimg_batch.append(nimg) else: +# nimg_batch.append(np.zeros((x, y))) num_valid_imgs -= 1 - nimg_batch = np.array(nimg_batch).T + +# nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 + + #JOHN 11/20/23 + if self.imageProcessor.centerImg: #JOHN 011/09/2023 + nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 + else: #JOHN 011/09/2023 + nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T #JOHN 011/09/2023 + + + # print(nimg_batch.shape) # print("hstacking") if fullimgs is None: @@ -2008,7 +2449,8 @@ def main(): userGroupings=[], includeABOD=True, skipSize=params.skip_size, - umap_n_neighbors=params.num_imgs_to_use // 4000, +# umap_n_neighbors=params.num_imgs_to_use // 4000, + umap_n_neighbors=params.num_imgs_to_use // 10000, umap_random_state=42, hdbscan_min_samples=int(params.num_imgs_to_use * 0.75 // 40), hdbscan_min_cluster_size=int(params.num_imgs_to_use // 40), From e4ac86e218334744f7a6c966060561be24745804 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Sun, 31 Dec 2023 12:19:17 -0800 Subject: [PATCH 52/57] run, script, scalingscript, scalingrun. Fixed bug where images were not being scaled correctly using the new boxing mechanism. --- btx/processing/freqdir.py | 42 +++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index bbf3a13be..8773bda21 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1291,6 +1291,7 @@ def genUMAP(self): if self.numImgsToUse==-1: self.numImgsToUse = len(imgs) + self.logging_numImgsToUse = len(imgs) self.imgs = imgs[:self.numImgsToUse:self.skipSize] self.projections = projections[:self.numImgsToUse:self.skipSize] @@ -1299,7 +1300,7 @@ def genUMAP(self): self.numImgsToUse = int(self.numImgsToUse/self.skipSize) if len(self.imgs)!= self.numImgsToUse: - raise TypeError("NUMBER OF IMAGES REQUESTED ({}) EXCEEDS NUMBER OF DATA POINTS PROVIDED ({})".format(len(self.imgs), self.numImgsToUse)) + raise TypeError("NUMBER OF IMAGES REQUESTED ({}) EXCEEDS NUMBER OF DATA POINTS PROVIDED ({}). 
TRUE LEN IS {}.".format(len(self.imgs), self.numImgsToUse, self.logging_numImgsToUse)) self.clusterable_embedding = self.umap.UMAP( n_neighbors=self.umap_n_neighbors, @@ -1783,7 +1784,9 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.currRun = None self.currRun = self.comm.bcast(self.currRun, root=0) +#JOHN CHANGE 12/30/2023 self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=150, roi_h = 150) + # self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=300, roi_h = 300) self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) # self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) @@ -1980,12 +1983,14 @@ def addThumbnailsToProjectH5(self): # print("Gathering thumbnails") startingPoint = self.start_offset + self.num_imgs*self.rank//self.size _,self.fullThumbnailData,_,self.trueIntensitiesData = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) + # print("FULL THUMBNAIL DATA: ", np.array(self.fullThumbnailData).shape) file_name = self.writeToHere+"{}_ProjectedData_{}.h5".format(self.currRun, self.rank) f1 = h5py.File(file_name, 'r+') f1.create_dataset("SmallImages", data=self.fullThumbnailData) f1.create_dataset("TrueIntensities", data=np.array(self.trueIntensitiesData)) f1.close() self.comm.barrier() + # print("FINISHED AIJOWDAWODIDWJA") class FD_ImageProcessing: def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile, unitVar, centerImg, roi_w, roi_h): @@ -2336,8 +2341,9 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): if getThumbnails: saveMe = [] for img in imgs: -# saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) #JOHN 011/09/2023 - saveMe.append(np.array(img)) #JOHN 011/09/2023 + #JOHN CHANGE 12/30/2023 + saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) #JOHN 011/09/2023 +# saveMe.append(np.array(img)) #JOHN 011/09/2023 thumbnails = np.array(saveMe) num_valid_imgs, x, y = imgs.shape #JOHN 11/20/2023 @@ -2387,18 +2393,36 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): # print("a09wupoidkw", np.array(nimg_batch).shape) nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, self.imageProcessor.roi_h, self.imageProcessor.roi_w) #JOHN 011/09/2023 + + ############################## + # JOHN 12/30/2023 + saveMe = [] + for img in nthumbnail_batch: + 
saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) + nthumbnail_batch = np.array(saveMe) + # print("a09wdjaoimd", nimg_batch.shape, nthumbnail_batch.shape) + # print(nthumbnail_batch.shape) + # JOHN 12/30/2023 + else: #JOHN 011/09/2023 -# print("a09wupoidkw", np.array(nimg_batch).shape) +# print("a09wupoidkw", np.arrayħnimg_batch).shape) # print(num_valid_imgs, x, y) nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T #JOHN 011/09/2023 nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) #JOHN 011/09/2023 - if fullimgs is None: + + + if fullimgs is None and nimg_batch.shape[1]!=0: fullimgs = nimg_batch fullthumbnails = nthumbnail_batch - elif len(nimg_batch)!=0: + # print("FULL IMGS IS NONE.", "nimgbatch shape", nimg_batch.shape, "fullthumbnailshape", fullthumbnails.shape, "nthumbnail shape", nthumbnail_batch.shape) + trueIntensities += ntrueIntensity_batch + # elif len(nimg_batch)!=0: + elif nimg_batch.shape[1]!=0: #JOHN CHANGE 12/31/2023 + # print("nimgbatch shape", nimg_batch.shape, "fullthumbnailshape", fullthumbnails.shape, "nthumbnail shape", nthumbnail_batch.shape) fullimgs = np.hstack((fullimgs, nimg_batch)) fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) - trueIntensities += ntrueIntensity_batch + # print("NEW: nimgbatch shape", nimg_batch.shape, "fullthumbnailshape", fullthumbnails.shape, "nthumbnail shape", nthumbnail_batch.shape) + trueIntensities += ntrueIntensity_batch else: nimg_batch = [] for img in img_batch.T: @@ -2425,7 +2449,8 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): # print("hstacking") if fullimgs is None: fullimgs = nimg_batch - elif len(nimg_batch)!=0: + # elif len(nimg_batch)!=0: #JOHN 12/31/2023 + elif nimg_batch.shape[1]!=0: # print(fullimgs.shape, nimg_batch.shape, nimg_batch) fullimgs = np.hstack((fullimgs, nimg_batch)) @@ -2458,6 +2483,7 @@ def main(): outlierQuantile=0.3) visMe.fullVisualize() visMe.userSave() + def parse_input(): """ Parse command line input. From 8f81c98cfdea5c4020367588a6063a7283acdc63 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Thu, 11 Jan 2024 09:57:50 -0800 Subject: [PATCH 53/57] Checkpoint. This is working nice. --- btx/processing/freqdir.py | 105 +++++++++++++++++++++++++------------- 1 file changed, 69 insertions(+), 36 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 8773bda21..54ef7e47c 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -205,17 +205,22 @@ def __init__( self.imgData = imgData self.imgsTracked = imgsTracked + self.fdTime = 0 + def run(self): """ Perform frequent directions matrix sketching on run subject to initialization parameters. 
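        Notes
        -----
        When samplingFactor < 1, the image block is first reduced with
        PrioritySampling before the Frequent Directions update, mirroring the
        body below (illustrative outline only):

            psamp = PrioritySampling(int(img_batch.shape[1]*samplingFactor), d)
            for row in img_batch.T:
                psamp.update(row)
            img_batch = np.array(psamp.sketch.get()).T

        Time spent in this sampling pass and in each row update is accumulated
        in self.fdTime via time.process_time().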
""" img_batch = self.imgData - if self.samplingFactor <1: + if self.samplingFactor<1: + st = time.process_time() psamp = PrioritySampling(int((img_batch.shape[1])*self.samplingFactor), self.d) for row in img_batch.T: psamp.update(row) img_batch = np.array(psamp.sketch.get()).T + et = time.process_time() + self.fdTime += et - st self.update_model(img_batch) # if self.mean is None: # self.mean = np.mean(img_batch, axis=1) @@ -252,36 +257,42 @@ def update_model(self, X): X: ndarray data to update matrix sketch with """ + + rankAdapt_increaseAmount = 50 + _, numIncorp = X.shape origNumIncorp = numIncorp - with TaskTimer(self.task_durations, "total update"): - if self.rank==0 and not self.merger: - print( - "Factoring {m} sample{s} into {n} sample, {q} component model...".format( - m=numIncorp, s="s" if numIncorp > 1 else "", n=self.num_incorporated_images, q=self.ell - ) + # with TaskTimer(self.task_durations, "total update"): + if self.rank==0 and not self.merger: + print( + "Factoring {m} sample{s} into {n} sample, {q} component model...".format( + m=numIncorp, s="s" if numIncorp > 1 else "", n=self.num_incorporated_images, q=self.ell ) - for row in X.T: - canRankAdapt = numIncorp > (self.ell + 15) - if self.nextZeroRow >= self.m: - if self.increaseEll and canRankAdapt and self.rankAdapt: - self.ell = self.ell + 10 - self.m = 2*self.ell - self.sketch = np.vstack((*self.sketch, np.zeros((20, self.d)))) - self.increaseEll = False - print("Increasing rank of process {} to {}".format(self.rank, self.ell)) - else: - copyBatch = self.sketch[self.ell:,:].copy() - self.rotate() - if canRankAdapt and self.rankAdapt: - reconError = np.sqrt(self.lowMemoryReconstructionErrorScaled(copyBatch)) - print("RANK ADAPT RECON ERROR: ", reconError) - if (reconError > self.rankAdaptMinError): - self.increaseEll = True - self.sketch[self.nextZeroRow,:] = row - self.nextZeroRow += 1 - self.num_incorporated_images += 1 - numIncorp -= 1 + ) + for row in X.T: + st = time.process_time() + canRankAdapt = numIncorp > (self.ell + 15) + if self.nextZeroRow >= self.m: + if self.increaseEll and canRankAdapt and self.rankAdapt: + self.ell = self.ell + rankAdapt_increaseAmount + self.m = 2*self.ell + self.sketch = np.vstack((*self.sketch, np.zeros((2*rankAdapt_increaseAmount, self.d)))) + self.increaseEll = False + print("Increasing rank of process {} to {}".format(self.rank, self.ell)) + else: + copyBatch = self.sketch[self.ell:,:].copy() + self.rotate() + if canRankAdapt and self.rankAdapt: + reconError = np.sqrt(self.lowMemoryReconstructionErrorScaled(copyBatch)) + print("RANK ADAPT RECON ERROR: ", reconError) + if (reconError > self.rankAdaptMinError): + self.increaseEll = True + self.sketch[self.nextZeroRow,:] = row + self.nextZeroRow += 1 + self.num_incorporated_images += 1 + numIncorp -= 1 + et = time.process_time() + self.fdTime += et - st def rotate(self): """ @@ -550,7 +561,7 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output self.divBy = divBy - time.sleep(10) + # time.sleep(10) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] @@ -573,6 +584,8 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output self.currRun = currRun + self.mergeTime = 0 + def merge(self): """ Merge Frequent Direction Components in a tree-like fashion. 
@@ -620,6 +633,7 @@ def merge(self): # + hf["sketch"].attrs["numImgsIncorp"]) self.fullNumIncorp += hf["sketch"].attrs["numImgsIncorp"] self.fullImgsTracked = np.vstack((self.fullImgsTracked, hf["imgsTracked"][:])) + self.mergeTime = self.fd.fdTime return self.fd.get() else: return @@ -704,7 +718,7 @@ def __init__( # print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) # while(not os.path.isfile(readFile2)): # print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) - time.sleep(10) + # time.sleep(10) with h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] # self.mean = hf["mean"][:] @@ -718,6 +732,7 @@ def __init__( self.imgData = imgData + self.compTime = 0 def run(self): """ @@ -737,10 +752,13 @@ def apply_compression(self, X): X: ndarray data to project """ + st = time.process_time() if self.processedData is None: self.processedData = np.dot(X.T, self.components.T) else: self.processedData = np.vstack((self.processedData, np.dot(X.T, self.components.T))) + et = time.process_time() + self.compTime += et - st def write(self): """ @@ -1315,8 +1333,9 @@ def genUMAP(self): # min_cluster_size = self.hdbscan_min_cluster_size # ).fit_predict(self.clusterable_embedding) - ncircs = self.float_to_int_percentile(self.retrieveCircularity(self.imgs)) - self.labels = np.array(ncircs) + # ncircs = self.float_to_int_percentile(self.retrieveCircularity(self.imgs)) + # self.labels = np.array(ncircs) + self.labels = np.array(np.zeros(len(self.imgs))) exclusionList = np.array([]) self.clustered = np.isin(self.labels, exclusionList, invert=True) @@ -1784,6 +1803,9 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.currRun = None self.currRun = self.comm.bcast(self.currRun, root=0) + #JOHN CHANGE 01/08/2024 + self.newBareTime = 0 + #JOHN CHANGE 12/30/2023 self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=150, roi_h = 150) # self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=300, roi_h = 300) @@ -1900,6 +1922,7 @@ def genSynthData(self): # return (eigVecs @ (D) @ eigVecs.T) def compDecayingSVD(self, seedMe, a, b): + #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. 
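        # Descriptive note: this routine builds a synthetic data matrix
        # Q1 @ diag(S) @ Q2 whose spectrum decays geometrically.  With the
        # schedule S[j] *= 2**(-5*(j+1)/len(S)) the smallest factor is 2**-5
        # (about 0.03 of the largest); the commented-out "scaling run"
        # schedule 2**(-16*(j+1)/len(S)) decays much faster, down to roughly
        # 2**-16 (~1.5e-5).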
numFeats = a numSamps = b//self.size perturbation = np.random.rand(numSamps, numFeats)*0.1 @@ -1913,7 +1936,8 @@ def compDecayingSVD(self, seedMe, a, b): S.sort() S = S[::-1] for j in range(len(S)): #Modify - S[j] = (2**(-16*(j+1)/len(S)))*S[j] + # S[j] = (2**(-16*(j+1)/len(S)))*S[j] #SCALING RUN + S[j] = (2**(-5*(j+1)/len(S)))*S[j] #PARALLEL RUN: 01/10/2024 self.fullImgData = (Q1 @ np.diag(S) @ Q2).T self.imgsTracked = [(0, numSamps)] @@ -1940,6 +1964,7 @@ def runMe(self): print("{} STARTING SKETCHING FOR {}".format(self.rank, self.currRun)) st1 = time.perf_counter() freqDir.run() + self.newBareTime += freqDir.fdTime localSketchFilename = freqDir.write() et1 = time.perf_counter() print("Estimated time for frequent directions rank {0}/{1}: {2}".format(self.rank, self.size, et1 - st1)) @@ -1957,6 +1982,7 @@ def runMe(self): output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun, psi=self.psi, usePSI=self.usePSI) st2 = time.perf_counter() mergeTree.merge() + self.newBareTime += mergeTree.mergeTime mergedSketchFilename = mergeTree.write() et2 = time.perf_counter() print("Estimated time merge tree for rank {0}/{1}: {2}".format(self.rank, self.size, et2 - st2)) @@ -1966,11 +1992,12 @@ def runMe(self): appComp = ApplyCompression(comm=self.comm, rank = self.rank, size=self.size, start_offset=self.start_offset, num_imgs=self.num_imgs, exp=self.exp, run=self.run,det_type=self.det_type, readFile = mergedSketchFilename, output_dir = self.writeToHere, currRun = self.currRun, imgData = self.fullImgData) st3 = time.perf_counter() self.matSketch = appComp.run() + self.newBareTime += appComp.compTime appComp.write() et3 = time.perf_counter() print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et3 - st3)) print("Estimated full processing time for rank {0}/{1}: {2}, {3}".format(self.rank, self.size, (et1 + et2 + et3 - st1 - st2 - st3), et3 - stfull)) - self.addThumbnailsToProjectH5() + # self.addThumbnailsToProjectH5() #JOHN CHANGE 01/09/2024. Modifying this just because we don't need this to do our testing. 
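        # The returned value is the summed wall time of the sketching, merge
        # and projection stages (each measured around its compute plus the
        # HDF5 write); image retrieval and the thumbnail export are not included.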
return (et1 + et2 + et3 - st1 - st2 - st3) # self.comm.barrier() @@ -2267,6 +2294,8 @@ def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, t self.imageProcessor = imageProcessor + self.excludedImgs = [] + def split_range(self, start, end, num_tuples): if start==end: raise ValueError('Range processing error: start value equals end value, which leads to no images processed.') @@ -2366,7 +2395,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): nimg_batch = [] nthumbnail_batch = [] ntrueIntensity_batch = [] - for img, thumbnail, trueIntens in zip(img_batch.T, thumbnail_batch.T, origTrueIntensities): + for ind, (img, thumbnail, trueIntens) in enumerate(zip(img_batch.T, thumbnail_batch.T, origTrueIntensities)): currIntensity = np.sum(img.flatten(), dtype=np.double) if self.imageProcessor.centerImg: nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) @@ -2389,6 +2418,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): # ntrueIntensity_batch.append(0) num_valid_thumbnails -= 1 num_valid_imgs -= 1 + self.excludedImgs.append(ind) if self.imageProcessor.centerImg: #JOHN 011/09/2023 # print("a09wupoidkw", np.array(nimg_batch).shape) nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 @@ -2425,7 +2455,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): trueIntensities += ntrueIntensity_batch else: nimg_batch = [] - for img in img_batch.T: + for ind, img in enumerate(img_batch.T): currIntensity = np.sum(img.flatten(), dtype=np.double) # print("Starting image processing of size {}".format(img_batch.T.shape)) nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) @@ -2434,6 +2464,7 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): else: # nimg_batch.append(np.zeros((x, y))) num_valid_imgs -= 1 + self.excludedImgs.append(ind) # nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 @@ -2454,6 +2485,8 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): # print(fullimgs.shape, nimg_batch.shape, nimg_batch) fullimgs = np.hstack((fullimgs, nimg_batch)) + print("EXCLUDING IMAGES: ", self.excludedImgs) + # print("Images tracked:", imgsTracked) if getThumbnails: print(fullimgs.shape, fullthumbnails.shape, imgsTracked) From 931de83ec1fe31574e8a8f705279688c7e7daab6 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Mon, 11 Mar 2024 20:43:51 -0700 Subject: [PATCH 54/57] Pushing as a checkpoint. Not too sure what changed, but this code seems to work. 
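Among the visible changes: MergeTree gains a serialMerge() variant, WrapperFullFD gains a
modified_gram_schmidt() helper, the synthetic-data generator is reduced to plain Gaussian
data (the decaying-SVD variants are kept as comments), and the thumbnail export in runMe()
is re-enabled.

A quick orthonormality check for the new helper (illustrative sketch only; "wrapper" stands
for an already-constructed WrapperFullFD instance and is not defined by this patch):

    import numpy as np
    A = np.random.rand(500, 20)
    Q = wrapper.modified_gram_schmidt(A, 20)
    assert np.allclose(Q.T @ Q, np.eye(20), atol=1e-8)
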
--- btx/processing/freqdir.py | 189 +++++++++++++++++++++++--------------- 1 file changed, 116 insertions(+), 73 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 54ef7e47c..0dc2889ae 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -23,20 +23,21 @@ from matplotlib import pyplot as plt from matplotlib import colors -########################## -########################## -#JOHN CHANGE BACK AFTER 12/15/2023 -from btx.misc.shortcuts import TaskTimer - -from btx.interfaces.ipsana import ( - PsanaInterface, - bin_data, - bin_pixel_index_map, - retrieve_pixel_index_map, - assemble_image_stack_batch, -) -########################## -########################## +#JOHN: COMMENTED OUT AFTER 03/11/2024 +# ########################## +# ########################## +# #JOHN CHANGE BACK AFTER 12/15/2023 +# from btx.misc.shortcuts import TaskTimer + +# from btx.interfaces.ipsana import ( +# PsanaInterface, +# bin_data, +# bin_pixel_index_map, +# retrieve_pixel_index_map, +# assemble_image_stack_batch, +# ) +# ########################## +# ########################## from PIL import Image @@ -637,6 +638,39 @@ def merge(self): return self.fd.get() else: return + + def serialMerge(self): + """ + Merge Frequent Direction Components in a serial fashion. + Returns + ------- + finalSketch : ndarray + Merged matrix sketch of cumulative data + """ + + if self.rank==0: + for currWorkingCore in range(1, self.size): + bufferMe = np.empty(self.buffSizes[currWorkingCore] * self.data.shape[1], dtype=np.double) + self.comm.Recv(bufferMe, source=currWorkingCore, tag=currWorkingCore) + bufferMe = np.reshape(bufferMe, (self.buffSizes[currWorkingCore], self.data.shape[1])) + self.fd.update_model(np.hstack((bufferMe.T, np.zeros((bufferMe.shape[1],1))))) + else: + bufferMe = self.fd.get().copy().flatten() + self.comm.Send(bufferMe, dest=0, tag=self.rank) + + if self.rank==0: + for readMe in self.allWriteDirecs: + with h5py.File(readMe, 'r') as hf: + if self.fullNumIncorp==0: + self.fullNumIncorp = hf["sketch"].attrs["numImgsIncorp"] + self.fullImgsTracked = hf["imgsTracked"][:] + else: + self.fullNumIncorp += hf["sketch"].attrs["numImgsIncorp"] + self.fullImgsTracked = np.vstack((self.fullImgsTracked, hf["imgsTracked"][:])) + self.mergeTime = self.fd.fdTime + return self.fd.get() + else: + return def write(self): """ @@ -907,23 +941,13 @@ def reorientImg(nimg): def denoiseImg(image): - # Threshold the image to get a binary image _, binary_image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY) - - # Perform connected component labeling num_labels, labels_im = cv2.connectedComponents(binary_image) - - # Create a mask for components larger than the size threshold mask = np.zeros_like(image, dtype=bool) - size_threshold = 500 - - # Iterate through components and update the mask for large components for label in range(1, num_labels): if np.sum(labels_im == label) > size_threshold: mask[labels_im == label] = True - - # Apply the mask to the original grayscale image masked_image = np.zeros_like(image) masked_image[mask] = image[mask] return masked_image @@ -940,77 +964,45 @@ def center_and_crop_beam(image, threshold_value=127, blur_kernel=(5, 5), desired Returns: numpy.ndarray: The cropped image centered around the intensity pattern. 
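    Example (illustrative sketch of the intended behaviour, with made-up values):

        img = np.zeros((512, 512), dtype=float)
        img[240:260, 300:320] = 5.0          # bright blob away from the centre
        crop = center_and_crop_beam(img, desired_size=150)
        # crop.shape == (150, 150); None is returned if no contour is found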
""" - # Normalize or scale the image if image.dtype != np.uint8: - # If the range of your image is known (e.g., 0 to 5), normalize accordingly - # image = ((image - image.min()) / (image.max() - image.min())) * 255 - - # If the range is not known, scale based on the current min and max image = 255 * (image - image.min()) / (image.max() - image.min()) image = image.astype(np.uint8) - - # print(image[100]) - - # Ensure the image is in grayscale if len(image.shape) == 3: image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - # Step 1: Noise Reduction blurred = cv2.GaussianBlur(image, blur_kernel, 0) # blurred = image - # Step 2: Thresholding # _, thresh = cv2.threshold(blurred, threshold_value, 255, cv2.THRESH_BINARY) _, thresh = cv2.threshold(blurred, 100, 255, cv2.THRESH_BINARY) - # # Step 3: Locate the Beam # contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # if not contours: # return None # No contours found - # # Step 4: Determine the Bounding Box # beam = max(contours, key=cv2.contourArea) # x, y, w, h = cv2.boundingRect(beam) - # # Step 5: Centering and Cropping # cropped = image[y:y+h, x:x+w] # print(x, y, w, h) - # Locate the Beam contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # contours = [c for c in contours if cv2.contourArea(c) > min_contour_area] if not contours: return None - # Determine the Bounding Box beam = max(contours, key=cv2.contourArea) x, y, w, h = cv2.boundingRect(beam) - - # Find the center of the bounding box center_x, center_y = x + w // 2, y + h // 2 - - # Define new bounding box dimensions new_x = max(center_x - desired_size // 2, 0) new_y = max(center_y - desired_size // 2, 0) - - # print("new x: ", x, new_x) - # print("new y: ", y, new_y) - - # Adjust if the new box extends beyond the original image new_x = min(new_x, image.shape[1] - desired_size) new_y = min(new_y, image.shape[0] - desired_size) - - # print("new x: ", x, new_x) - # print("new y: ", y, new_y) - - # Crop the image to the new bounding box cropped = image[new_y:new_y + desired_size, new_x:new_x + desired_size] return cropped - # threshVal = 100 - nimgs = [] nbws = [] nbws1 = [] @@ -1807,7 +1799,7 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.newBareTime = 0 #JOHN CHANGE 12/30/2023 - self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=150, roi_h = 150) + self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=200, roi_h = 200) # self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=300, roi_h = 300) self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, 
thumbnailHeight = 64, thumbnailWidth = 64) # self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) @@ -1921,26 +1913,76 @@ def genSynthData(self): # D = np.diag(diag_entries) + np.eye(matrixSize) # return (eigVecs @ (D) @ eigVecs.T) + def modified_gram_schmidt(self, A, num_vecs): + m, n = A.shape + Q = np.zeros((m, num_vecs)) + for j in range(num_vecs): + v = A[:, j] + for i in range(j): + rij = Q[:, i].dot(A[:, j]) + v = v - rij * Q[:, i] + rjj = np.linalg.norm(v, 2) + Q[:, j] = v / rjj + print(f"COMPUTED VECTOR {j}/{num_vecs}") + return Q + def compDecayingSVD(self, seedMe, a, b): - #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. numFeats = a numSamps = b//self.size - perturbation = np.random.rand(numSamps, numFeats)*0.1 - np.random.seed(seedMe) - A1 = np.random.rand(numSamps, numFeats) - Q1, R1 = np.linalg.qr(A1) - Q1 = Q1 + perturbation - A2 = np.random.rand(numFeats, numFeats) #Modify - Q2, R2 = np.linalg.qr(A2) - S = list(np.random.rand(numFeats)) #Modify - S.sort() - S = S[::-1] - for j in range(len(S)): #Modify - # S[j] = (2**(-16*(j+1)/len(S)))*S[j] #SCALING RUN - S[j] = (2**(-5*(j+1)/len(S)))*S[j] #PARALLEL RUN: 01/10/2024 - self.fullImgData = (Q1 @ np.diag(S) @ Q2).T + self.fullImgData = np.random.randn(numFeats, numSamps) self.imgsTracked = [(0, numSamps)] + # def compDecayingSVD(self, seedMe, a, b): + # #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. + # print(1) + # np.random.seed(seedMe + self.rank) + # numFeats = a + # numSamps = b//self.size + # # perturbation = np.random.rand(numSamps, numFeats)*0.1 + # # print(2) + # A1 = np.random.rand(numSamps, numFeats) + # print(3) + # Q1 = self.modified_gram_schmidt(A1, numFeats) + # print(5) + # A2 = np.random.rand(numFeats, numFeats) #Modify + # print(6) + # Q2, R2 = np.linalg.qr(A2) + # print(7) + # S = list(np.random.rand(numFeats)) #Modify + # print(8) + # S.sort() + # print(9) + # S = S[::-1] + # print(10) + # for j in range(len(S)): #Modify + # # S[j] = (2**(-16*(j+1)/len(S)))*S[j] #SCALING RUN + # S[j] = (2**(-5*(j+1)/len(S)))*S[j] #PARALLEL RUN: 01/10/2024 + # print(11) + # self.fullImgData = (Q1 @ np.diag(S) @ Q2).T + # print(12) + # self.imgsTracked = [(0, numSamps)] + # print(13) + + # def compDecayingSVD(self, seedMe, a, b): + # #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. 
+ # numFeats = a + # numSamps = b//self.size + # perturbation = np.random.rand(numSamps, numFeats)*0.1 + # np.random.seed(seedMe) + # A1 = np.random.rand(numSamps, numFeats) + # Q1, R1 = np.linalg.qr(A1) + # Q1 = Q1 + perturbation + # A2 = np.random.rand(numFeats, numFeats) #Modify + # Q2, R2 = np.linalg.qr(A2) + # S = list(np.random.rand(numFeats)) #Modify + # S.sort() + # S = S[::-1] + # for j in range(len(S)): #Modify + # # S[j] = (2**(-16*(j+1)/len(S)))*S[j] #SCALING RUN + # S[j] = (2**(-5*(j+1)/len(S)))*S[j] #PARALLEL RUN: 01/10/2024 + # self.fullImgData = (Q1 @ np.diag(S) @ Q2).T + # self.imgsTracked = [(0, numSamps)] + def runMe(self): stfull = time.perf_counter() @@ -1982,6 +2024,7 @@ def runMe(self): output_dir=self.writeToHere, allWriteDirecs=allNames, currRun = self.currRun, psi=self.psi, usePSI=self.usePSI) st2 = time.perf_counter() mergeTree.merge() + # mergeTree.serialMerge() self.newBareTime += mergeTree.mergeTime mergedSketchFilename = mergeTree.write() et2 = time.perf_counter() @@ -1997,7 +2040,7 @@ def runMe(self): et3 = time.perf_counter() print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et3 - st3)) print("Estimated full processing time for rank {0}/{1}: {2}, {3}".format(self.rank, self.size, (et1 + et2 + et3 - st1 - st2 - st3), et3 - stfull)) - # self.addThumbnailsToProjectH5() #JOHN CHANGE 01/09/2024. Modifying this just because we don't need this to do our testing. + self.addThumbnailsToProjectH5() #JOHN CHANGE 01/09/2024. Modifying this just because we don't need this to do our testing. return (et1 + et2 + et3 - st1 - st2 - st3) # self.comm.barrier() From c4abf71a1163832c898168d518e262dc9f9e7928 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 12 Mar 2024 12:01:13 -0700 Subject: [PATCH 55/57] Modified the script to allow for elog submission. --- btx/processing/freqdir.py | 733 ++++++++++++-------------------------- 1 file changed, 226 insertions(+), 507 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 0dc2889ae..b044d61ae 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -23,31 +23,12 @@ from matplotlib import pyplot as plt from matplotlib import colors -#JOHN: COMMENTED OUT AFTER 03/11/2024 -# ########################## -# ########################## -# #JOHN CHANGE BACK AFTER 12/15/2023 -# from btx.misc.shortcuts import TaskTimer - -# from btx.interfaces.ipsana import ( -# PsanaInterface, -# bin_data, -# bin_pixel_index_map, -# retrieve_pixel_index_map, -# assemble_image_stack_batch, -# ) -# ########################## -# ########################## - - from PIL import Image from io import BytesIO import base64 from datetime import datetime -#import umap -#import hdbscan from sklearn.cluster import OPTICS, cluster_optics_dbscan from matplotlib import colors @@ -67,6 +48,12 @@ import cv2 +try: + import umap + import hdbscan +except: + print("UMAP NOT AVAILABLE") + class FreqDir(DimRed): """ @@ -370,33 +357,33 @@ def reconstructionError(self, matrixCentered): matrixCenteredT - G @ G.T @ matrixCenteredT, 'fro')**2)/( (np.linalg.norm(matrixCenteredT - Ak, 'fro'))**2) -# def lowMemoryReconstructionErrorScaled(self, matrixCentered): -# """ -# Compute the low memory reconstruction error of the matrix sketch -# against given data. This is the same as reconstructionError, -# but estimates the norm computation and does not scale by the -# minimum projection matrix, but rather by the matrix norm itself. 
-# -# Parameters -# ---------- -# matrixCentered: ndarray -# Data to compare matrix sketch to -# -# Returns -# ------- -# float, -# Data subtracted by data projected onto sketched space, scaled by matrix elements -# """ -# matSketch = self.sketch[:self.ell, :] -# print("RANK ADAPTIVE SHAPE:",matrixCentered.shape, matSketch.shape) -## k = 10 -# matrixCenteredT = matrixCentered.T -# matSketchT = matSketch.T -# U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) -## G = U[:,:k] -# G = U -# return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 50)/ -# np.linalg.norm(matrixCenteredT, 'fro')**2) + def oldLowMemoryReconstructionErrorScaled(self, matrixCentered): + """ + Compute the low memory reconstruction error of the matrix sketch + against given data. This is the same as reconstructionError, + but estimates the norm computation and does not scale by the + minimum projection matrix, but rather by the matrix norm itself. + + Parameters + ---------- + matrixCentered: ndarray + Data to compare matrix sketch to + + Returns + ------- + float, + Data subtracted by data projected onto sketched space, scaled by matrix elements + """ + matSketch = self.sketch[:self.ell, :] + print("RANK ADAPTIVE SHAPE:",matrixCentered.shape, matSketch.shape) + # k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) + # G = U[:,:k] + G = U + return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 50)/ + np.linalg.norm(matrixCenteredT, 'fro')**2) def lowMemoryReconstructionErrorScaled(self, matrixCentered): matSketch = self.sketch[:self.ell, :] @@ -416,50 +403,50 @@ def estimFrobNormJ(self, addMe, arrs, k): sumMe += math.sqrt(1/k) * np.linalg.norm(randMat - minusMe, 'fro') return sumMe -# def estimFrobNormSquared(self, addMe, arrs, its): -# """ -# Estimate the Frobenius Norm of product of arrs matrices -# plus addME matrix using its iterations. -# -# Parameters -# ---------- -# arrs: list of ndarray -# Matrices to multiply together -# -# addMe: ndarray -# Matrix to add to others -# -# its: int -# Number of iterations to average over -# -# Returns -# ------- -# sumMe/its*no_rows : float -# Estimate of frobenius norm of product -# of arrs matrices plus addMe matrix -# -# Notes -# ----- -# Frobenius estimation is the expected value of matrix -# multiplied by random vector from multivariate normal distribution -# based on [1]. -# -# [1] Norm and Trace Estimation with Random Rank-one Vectors -# Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix -# Analysis and Applications 2021 42:1, 202-223 -# """ -# no_rows = arrs[-1].shape[1] -# v = np.random.normal(size=no_rows) -# v_hat = v / np.linalg.norm(v) -# sumMe = 0 -# for j in range(its): -# v = np.random.normal(size=no_rows) -# v_hat = v / np.linalg.norm(v) -# v_addMe = addMe @ v_hat -# for arr in arrs[::-1]: -# v_hat = arr @ v_hat -# sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 -# return sumMe/its*no_rows + def oldEstimFrobNormSquared(self, addMe, arrs, its): + """ + Estimate the Frobenius Norm of product of arrs matrices + plus addME matrix using its iterations. 
+ + Parameters + ---------- + arrs: list of ndarray + Matrices to multiply together + + addMe: ndarray + Matrix to add to others + + its: int + Number of iterations to average over + + Returns + ------- + sumMe/its*no_rows : float + Estimate of frobenius norm of product + of arrs matrices plus addMe matrix + + Notes + ----- + Frobenius estimation is the expected value of matrix + multiplied by random vector from multivariate normal distribution + based on [1]. + + [1] Norm and Trace Estimation with Random Rank-one Vectors + Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix + Analysis and Applications 2021 42:1, 202-223 + """ + no_rows = arrs[-1].shape[1] + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + sumMe = 0 + for j in range(its): + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + v_addMe = addMe @ v_hat + for arr in arrs[::-1]: + v_hat = arr @ v_hat + sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 + return sumMe/its*no_rows def gatherFreqDirsSerial(self): @@ -518,8 +505,7 @@ def write(self): filename : string Name of h5 file where sketch, mean of data, and indices of data processed is written """ -# self.comm.barrier() - filename = self.output_dir + '{}_sketch_{}.h5'.format(self.currRun, self.rank) + filename = self.output_dir + f'{self.currRun:04}_sketch_{self.rank}.h5' with h5py.File(filename, 'w') as hf: hf.create_dataset("sketch", data=self.sketch[:self.ell, :]) # hf.create_dataset("mean", data=self.mean) @@ -562,7 +548,6 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output self.divBy = divBy - # time.sleep(10) with h5py.File(readFile, 'r') as hf: self.data = hf["sketch"][:] @@ -570,7 +555,6 @@ def __init__(self, comm, rank, size, exp, run, det_type, divBy, readFile, output sendbuf = self.data.shape[0] self.buffSizes = np.array(self.comm.allgather(sendbuf)) -# print(self.buffSizes) self.fd.update_model(self.data.T) @@ -676,7 +660,7 @@ def write(self): """ Write merged matrix sketch to h5 file """ - filename = self.output_dir + '{}_merge.h5'.format(self.currRun) + filename = self.output_dir + f'{self.currRun:04}_merge.h5' if self.rank==0: for ind in range(self.size): @@ -749,10 +733,6 @@ def __init__( readFile2 = readFile[:-3] + "_"+str(self.rank)+".h5" -# print("FOR RANK {}, READFILE: {} HAS THE CURRENT EXISTENCE STATUS {}".format(self.rank, readFile2, os.path.isfile(readFile2))) -# while(not os.path.isfile(readFile2)): -# print("{} DOES NOT CURRENTLY EXIST FOR {}".format(readFile2, self.rank)) - # time.sleep(10) with h5py.File(readFile2, 'r') as hf: self.data = hf["sketch"][:] # self.mean = hf["mean"][:] @@ -798,10 +778,9 @@ def write(self): """ Write projected data and downsampled data to h5 file """ - filename = self.output_dir + '{}_ProjectedData_{}.h5'.format(self.currRun, self.rank) + filename = self.output_dir + f'{self.currRun:04}_ProjectedData_{self.rank}.h5' with h5py.File(filename, 'w') as hf: hf.create_dataset("ProjectedData", data=self.processedData) -# print("CREATED FILE: ", filename) self.comm.barrier() return filename @@ -836,7 +815,6 @@ def get(self): ret = [] while self.queue: curr = heapq.heappop(self.queue)[-1] - #ret.append(curr[0]*max(curr[1], curr[2])/curr[2]) ret.append(curr[0]) return ret @@ -877,13 +855,10 @@ def update(self, vec): self.sketch.push(vec, pi, wi) - class visualizeFD: """ Visualize FD Dimension Reduction using UMAP and DBSCAN """ - umap = __import__('umap') - hdbscan = __import__('hdbscan') def __init__(self, inputFile, outputFile, numImgsToUse, 
nprocs, includeABOD, userGroupings, skipSize, umap_n_neighbors, umap_random_state, hdbscan_min_samples, hdbscan_min_cluster_size, optics_min_samples, optics_xi, optics_min_cluster_size, outlierQuantile): @@ -971,24 +946,9 @@ def center_and_crop_beam(image, threshold_value=127, blur_kernel=(5, 5), desired image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) blurred = cv2.GaussianBlur(image, blur_kernel, 0) - # blurred = image - - # _, thresh = cv2.threshold(blurred, threshold_value, 255, cv2.THRESH_BINARY) _, thresh = cv2.threshold(blurred, 100, 255, cv2.THRESH_BINARY) - - # contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - # if not contours: - # return None # No contours found - - # beam = max(contours, key=cv2.contourArea) - # x, y, w, h = cv2.boundingRect(beam) - - # cropped = image[y:y+h, x:x+w] - # print(x, y, w, h) - contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - # contours = [c for c in contours if cv2.contourArea(c) > min_contour_area] if not contours: return None @@ -1009,136 +969,34 @@ def center_and_crop_beam(image, threshold_value=127, blur_kernel=(5, 5), desired contours = [] contourImgs = [] for j in range(len(fullThumbnailData)): - # currImg = (fullThumbnailData[j]*(255/np.max(fullThumbnailData[j]))).astype('uint8').copy() - # nimg = currImg nimg = center_and_crop_beam(fullThumbnailData[j]) - # nimg = reorientImg(nimg) if nimg is None: continue nimg = reorientImg(nimg) nimg = denoiseImg(nimg) nimgs.append(nimg) - # nbws.append(nimg) - # (thresh, im_bw) = cv2.threshold((fullThumbnailData[j]*(255/np.max(fullThumbnailData[j]))).astype('uint8').copy(), 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU) - # print(nimg) - # print(j, np.max(nimg)) - # np.set_printoptions(threshold=np.inf, linewidth=np.inf) - # print(nimg) - (thresh, im_bw) = cv2.threshold(nimg, 0, 255, cv2.THRESH_BINARY) nbws.append(im_bw.copy()) (thresh1, im_bw1) = cv2.threshold(nimg, 0, 1, cv2.THRESH_BINARY) nbws1.append(im_bw1.copy()) - - # # Assuming 'im' is your grayscale image - # # Apply Gaussian blur to the image - # blurred = cv2.GaussianBlur(im_bw, (5, 5), 0) - # # Apply binary thresholding on the blurred image - # _, binary = cv2.threshold(blurred, 127, 255, cv2.THRESH_BINARY) - # # Find contours - # contourList, hierarchy = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - # # Find the largest contour based on area - # largest_contour = max(contourList, key=cv2.contourArea) - # contours.append(largest_contour) - # canvas = np.zeros(im_bw.shape, dtype='uint8') - # # Draw the largest contour in white - # cv2.drawContours(canvas, [largest_contour], -1, (255), 1) - # contourImgs.append(canvas) - - # # nbws.append(cv2.GaussianBlur(nimg, (5, 5), 0)) - # # nbws.append(im_bw) - # # nbws.append((fullThumbnailData[j]*(255/np.max(fullThumbnailData[j]))).astype('uint8').copy() - - # # ind = 356 - - # # plt.imshow(nimgs[ind]) - # # plt.show() - - # # print(nbws1[ind][80]) - - # # Calculate moments - # M = cv2.moments(nbws1[ind]) - # # Zeroth moment is the area - # area = M['m00'] - # epsilon = 0.01 * cv2.arcLength(contours[ind], True) - # approx = cv2.approxPolyDP(contours[ind], epsilon, True) - # # Calculate the perimeter - # perimeter = cv2.arcLength(approx, True) - # # Calculate circularity using moments - # circularity = 4 * 3.14159 * area / (perimeter * perimeter) - # print(circularity) - - # # Calculate moments - # M = cv2.moments(nbws1[ind]) - # ncirc = (M['m00']**2)/(2*math.pi*(M['mu20'] + M['mu02'])) - # print(ncirc) - circs = 
[] ncircs = [] - for ind in range(len(nbws)): - # # Calculate moments - # M = cv2.moments(nbws1[ind]) - # # Zeroth moment is the area - # area = M['m00'] - # epsilon = 0.01 * cv2.arcLength(contours[ind], True) - # approx = cv2.approxPolyDP(contours[ind], epsilon, True) - # # Calculate the perimeter - # perimeter = cv2.arcLength(approx, True) - # # Calculate circularity using moments - # circularity = 4 * 3.14159 * area / (perimeter * perimeter) - - # Calculate moments M = cv2.moments(nbws[ind]) try: ncirc = (M['m00']**2)/(2*math.pi*(M['mu20'] + M['mu02'])) except: ncirc = 1 - - # circs.append(circularity) ncircs.append(ncirc) - sorted_indices = np.argsort(ncircs) sorted_arrays = [nimgs[i] for i in sorted_indices] sorted_full = [fullThumbnailData[i] for i in sorted_indices] - - # import matplotlib.pyplot as plt - # import numpy as np - - # # Assuming 'images' is your list of 16 NumPy array images - # # For demonstration, creating 16 random 8x8 grayscale images - # images = [j for j in sorted_arrays[::len(sorted_arrays)//16]] - - # # Create a 4x4 grid of subplots - # fig, axs = plt.subplots(4, 4, figsize=(10, 10)) - - # # Flatten the array of axes for easy iteration - # axs = axs.ravel() - - # # Plot each image and add text - # for i in range(16): - # axs[i].imshow(images[i], cmap='jet', vmin=0, vmax=255) # Assuming grayscale images - # axs[i].text(50, 5, f"Image {i+1}", color='white', ha='center', va='center') - # axs[i].axis('off') # Turn off axis - - # plt.tight_layout() # Adjust subplots to fit into the figure area. - # plt.show() - - # ind=23 - # nimg = center_and_crop_beam(fullThumbnailData[40]) - # # plt.imshow(fullThumbnailData[ind]) - # plt.imshow(nimg) - # plt.show() - bigOrSmall = [1 if j>len(sorted_arrays)*10//16 else 0 for j in sorted_indices] -# np.savez(saveDir+'circularityImgs_{}.npz'.format(currRun), **{f'array_{i}': arr for i, arr in enumerate(nimgs)}, labels=bigOrSmall) - return ncircs def embeddable_image(self, data): img_data = np.uint8(cm.jet(data/max(data.flatten()))*255) -# image = Image.fromarray(img_data, mode='RGBA').resize((75, 75), Image.Resampling.BICUBIC) image = Image.fromarray(img_data, mode='RGBA') buffer = BytesIO() image.save(buffer, format='png') @@ -1166,8 +1024,8 @@ def genMedoids(self, medoidLabels, clusterPoints): for test_index, test_point in enumerate(lst): if math.isclose(test_point[0],medoid_point[0]) and math.isclose(test_point[1], medoid_point[1]): fin_ind = test_index -# medoid_lst.append((k, v[fin_ind][0])) - medoid_lst.append((k, v[fin_ind+1][0])) + # medoid_lst.append((k, v[fin_ind+1][0])) + medoid_lst.append((k, v[fin_ind][0])) return medoid_lst def relabel_to_closest_zero(self, labels): @@ -1209,10 +1067,8 @@ def fastABOD(self, pts, nsamples): ac = cpt - apt if math.isclose(np.linalg.norm(ab), 0.0) or math.isclose(np.linalg.norm(ac), 0.0): count += 1 -# print("TOO CLOSE") continue outlier_factors.append(np.dot(ab, ac)/((np.linalg.norm(ab)**2) * (np.linalg.norm(ac)))) -# print("CURRENT POINT: ", pts[a], test_list, outlier_factors, np.var(np.array(outlier_factors))) if(len(outlier_factors)==0): abofs.append(np.inf) else: @@ -1220,12 +1076,7 @@ def fastABOD(self, pts, nsamples): return abofs def getOutliers(self, lst): -# lstCopy = lst.copy() -# lstCopy.sort() -# quart10 = lstCopy[len(lstCopy)//divBy] - lstQuant = np.quantile(np.array(lst), self.outlierQuantile) -# print("AIDJWOIJDAOWIDJWAOIDJAWOIDWJA", lstQuant, lst) outlierInds = [] notOutlierInds = [] for j in range(len(lst)): @@ -1233,8 +1084,6 @@ def getOutliers(self, lst): 
outlierInds.append(j) else: notOutlierInds.append(j) -# print("OUTLIER INDS: ", outlierInds) -# print("NOT OUTLIER INDS: ", notOutlierInds) return np.array(outlierInds), np.array(notOutlierInds) def genHist(self, vals, endClass): @@ -1251,23 +1100,15 @@ def genLeftRight(self, endClass): def float_to_int_percentile(self, float_list): - # Edge case: If the list is empty, return an empty list if not float_list: return [] - - # Calculate the percentiles that define the bin edges percentiles = np.percentile(float_list, [10 * i for i in range(1, 10)]) - - # Function to find the bin for a single value def find_bin(value): for i, p in enumerate(percentiles): if value < p: return i - return 9 # For values in the highest bin - - # Convert each float to an integer based on its bin + return 9 int_list = [find_bin(value) for value in float_list] - return int_list @@ -1312,48 +1153,31 @@ def genUMAP(self): if len(self.imgs)!= self.numImgsToUse: raise TypeError("NUMBER OF IMAGES REQUESTED ({}) EXCEEDS NUMBER OF DATA POINTS PROVIDED ({}). TRUE LEN IS {}.".format(len(self.imgs), self.numImgsToUse, self.logging_numImgsToUse)) - self.clusterable_embedding = self.umap.UMAP( + self.clusterable_embedding = umap.UMAP( n_neighbors=self.umap_n_neighbors, random_state=self.umap_random_state, n_components=2, min_dist=0, -# min_dist=0.1, ).fit_transform(self.projections) -# self.labels = self.hdbscan.HDBSCAN( -# min_samples = self.hdbscan_min_samples, -# min_cluster_size = self.hdbscan_min_cluster_size -# ).fit_predict(self.clusterable_embedding) - - # ncircs = self.float_to_int_percentile(self.retrieveCircularity(self.imgs)) - # self.labels = np.array(ncircs) - self.labels = np.array(np.zeros(len(self.imgs))) + self.labels = hdbscan.HDBSCAN( + min_samples = self.hdbscan_min_samples, + min_cluster_size = self.hdbscan_min_cluster_size + ).fit_predict(self.clusterable_embedding) exclusionList = np.array([]) self.clustered = np.isin(self.labels, exclusionList, invert=True) self.opticsClust = OPTICS(min_samples=self.optics_min_samples, xi=self.optics_xi, min_cluster_size=self.optics_min_cluster_size) self.opticsClust.fit(self.clusterable_embedding) -# self.opticsLabels = cluster_optics_dbscan( -# reachability=self.opticsClust.reachability_, -# core_distances=self.opticsClust.core_distances_, -# ordering=self.opticsClust.ordering_, -# eps=2.5, -# ) self.opticsLabels = self.opticsClust.labels_ self.experData_df = pd.DataFrame({'x':self.clusterable_embedding[self.clustered, 0],'y':self.clusterable_embedding[self.clustered, 1]}) self.experData_df['image'] = list(map(self.embeddable_image, self.imgs[self.clustered])) self.experData_df['imgind'] = np.arange(self.numImgsToUse)*self.skipSize -# self.experData_df['trueIntensities'] = [str(int(abs(x)/max(np.abs(trueIntensities))*19)) for x in trueIntensities] -# self.experData_df['trueIntensities'] = [5 for x in trueIntensities] -# self.experData_df['trueIntensities_backgroundColor'] = [Plasma256[int(abs(x)/max(np.abs(trueIntensities))*255)] for x in trueIntensities] -# self.experData_df['trueIntensities_backgroundColor'] = [5 for x in trueIntensities] -# print("aowdijaoidjwaoij", len(self.experData_df['trueIntensities']), self.experData_df['trueIntensities'], type(self.experData_df['trueIntensities'])) -# print(trueIntensities) - self.experData_df['trueIntensities'] = [1 for x in self.experData_df['imgind']] - self.experData_df['trueIntensities_backgroundColor'] = [1 for x in self.experData_df['imgind']] + self.experData_df['trueIntensities'] = 
[str(int(abs(x)/max(np.abs(trueIntensities))*19)) for x in trueIntensities] + self.experData_df['trueIntensities_backgroundColor'] = [Plasma256[int(abs(x)/max(np.abs(trueIntensities))*255)] for x in trueIntensities] def genABOD(self): @@ -1372,8 +1196,6 @@ def genABOD(self): self.experData_df['anomDet'] = outlierLabels self.experData_df['anom_backgroundColor'] = [Category20[20][int(x)] for x in outlierLabels] - print("2adwjiaomd", len(self.experData_df['anomDet']), self.experData_df['anomDet'], type(self.experData_df['anomDet'])) - def setUserGroupings(self, userGroupings): """ Set User Grouping. An adjustment is made at the beginning of this function, @@ -1419,14 +1241,13 @@ def genLabels(self): opticsNewLabels.append(j) opticsNewLabels = list(np.array(opticsNewLabels) + 1) self.opticsNewLabels = np.array(self.relabel_to_closest_zero(opticsNewLabels)) -# self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels[self.opticsClust.ordering_]] self.experData_df['optics_backgroundColor'] = [Category20[20][x] for x in self.opticsNewLabels] def genHTML(self): datasource = ColumnDataSource(self.experData_df) #JOHN CHANGE 20231020 -# color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) - color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Plasma256[::16]) + color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) + # color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Plasma256[::16]) plot_figure = figure( title='UMAP projection with DBSCAN clustering of the LCLS dataset', tools=('pan, wheel_zoom, reset, lasso_select'), @@ -1745,6 +1566,7 @@ def userShow(self): output_notebook() show(self.viewResults) + class WrapperFullFD: """ Frequent Directions Data Processing Wrapper Class. 
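    Notes
    -----
    runMe() drives the per-rank pipeline: sketch the local image block with
    FreqDir, combine the per-rank sketches with MergeTree, project the data
    onto the merged sketch with ApplyCompression, then append thumbnails to
    the projected-data HDF5 file.  A minimal driver sketch (argument values
    are placeholders and the constructor takes more options than shown):

        wrapper = WrapperFullFD(exp="myexp", run=42, det_type="mydet", ...)
        wrapper.retrieveImages()          # or genSynthData()/compDecayingSVD(...)
        elapsed = wrapper.runMe()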
@@ -1790,90 +1612,87 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.psi = None if self.rank==0: - self.currRun = run #datetime.now().strftime("%y%m%d%H%M%S") + self.currRun = run else: self.currRun = None self.currRun = self.comm.bcast(self.currRun, root=0) - #JOHN CHANGE 01/08/2024 self.newBareTime = 0 -#JOHN CHANGE 12/30/2023 self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=200, roi_h = 200) - # self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=300, roi_h = 300) + # self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = False, roi_w=500, roi_h = 500) self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) -# self.imgRetriever = DataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) - -# def lowMemoryReconstructionErrorScaled(self, matrixCentered, matSketch): -# """ -# Compute the low memory reconstruction error of the matrix sketch -# against given data. This is the same as reconstructionError, -# but estimates the norm computation and does not scale by the -# minimum projection matrix, but rather by the matrix norm itself. -# -# Parameters -# ---------- -# matrixCentered: ndarray -# Data to compare matrix sketch to -# -# Returns -# ------- -# float, -# Data subtracted by data projected onto sketched space, scaled by matrix elements -# """ -## k = 10 -# matrixCenteredT = matrixCentered.T -# matSketchT = matSketch.T -# U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) -## G = U[:,:k] -# G = U -# return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 50)/ -# np.linalg.norm(matrixCenteredT, 'fro')**2) -# -# def estimFrobNormSquared(self, addMe, arrs, its): -# """ -# Estimate the Frobenius Norm of product of arrs matrices -# plus addME matrix using its iterations. -# -# Parameters -# ---------- -# arrs: list of ndarray -# Matrices to multiply together -# -# addMe: ndarray -# Matrix to add to others -## -# its: int -# Number of iterations to average over -# -# Returns -# ------- -# sumMe/its*no_rows : float -# Estimate of frobenius norm of product -# of arrs matrices plus addMe matrix -# -# Notes -# ----- -# Frobenius estimation is the expected value of matrix -# multiplied by random vector from multivariate normal distribution -# based on [1]. 
-# -# [1] Norm and Trace Estimation with Random Rank-one Vectors -# Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix -# Analysis and Applications 2021 42:1, 202-223 -# """ -# no_rows = arrs[-1].shape[1] -# v = np.random.normal(size=no_rows) -# v_hat = v / np.linalg.norm(v) -# sumMe = 0 -# for j in range(its): -# v = np.random.normal(size=no_rows) -# v_hat = v / np.linalg.norm(v) -# v_addMe = addMe @ v_hat -# for arr in arrs[::-1]: -# v_hat = arr @ v_hat -# sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 -# return sumMe/its*no_rows + + def oldLowMemoryReconstructionErrorScaled1(self, matrixCentered, matSketch): + """ + Compute the low memory reconstruction error of the matrix sketch + against given data. This is the same as reconstructionError, + but estimates the norm computation and does not scale by the + minimum projection matrix, but rather by the matrix norm itself. + + Parameters + ---------- + matrixCentered: ndarray + Data to compare matrix sketch to + + Returns + ------- + float, + Data subtracted by data projected onto sketched space, scaled by matrix elements + """ + # k = 10 + matrixCenteredT = matrixCentered.T + matSketchT = matSketch.T + U, S, Vt = np.linalg.svd(matSketchT, full_matrices=False) + # G = U[:,:k] + G = U + return (self.estimFrobNormSquared(matrixCenteredT, [G,G.T,matrixCenteredT], 50)/ + np.linalg.norm(matrixCenteredT, 'fro')**2) + + def oldEstimFrobNormSquared1(self, addMe, arrs, its): + """ + Estimate the Frobenius Norm of product of arrs matrices + plus addME matrix using its iterations. + + Parameters + ---------- + arrs: list of ndarray + Matrices to multiply together + + addMe: ndarray + Matrix to add to others + # + its: int + Number of iterations to average over + + Returns + ------- + sumMe/its*no_rows : float + Estimate of frobenius norm of product + of arrs matrices plus addMe matrix + + Notes + ----- + Frobenius estimation is the expected value of matrix + multiplied by random vector from multivariate normal distribution + based on [1]. 
+ + [1] Norm and Trace Estimation with Random Rank-one Vectors + Zvonimir Bujanovic and Daniel Kressner SIAM Journal on Matrix + Analysis and Applications 2021 42:1, 202-223 + """ + no_rows = arrs[-1].shape[1] + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + sumMe = 0 + for j in range(its): + v = np.random.normal(size=no_rows) + v_hat = v / np.linalg.norm(v) + v_addMe = addMe @ v_hat + for arr in arrs[::-1]: + v_hat = arr @ v_hat + sumMe = sumMe + (np.linalg.norm(v_addMe - v_hat))**2 + return sumMe/its*no_rows def lowMemoryReconstructionErrorScaled(self, matrixCentered, matSketch): matrixCenteredT = matrixCentered.T @@ -1895,12 +1714,13 @@ def estimFrobNormJ(self, addMe, arrs, k): def retrieveImages(self): startingPoint = self.start_offset + self.num_imgs*self.rank//self.size self.fullImgData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=False) + np.save('/sdf/home/w/winnicki/john_20240312.npy', self.fullImgData) def genSynthData(self): self.fullImgData = np.random.rand(70000, 100000//self.size) self.imgsTracked = [(0, self.rank)] -# def genDecayingSVD(self): +# def oldGenDecayingSVD3(self): # numFeats = 70000 # numSamps = 100000//self.size # A = np.random.rand(matrixSize, matrixSize) @@ -1932,56 +1752,44 @@ def compDecayingSVD(self, seedMe, a, b): self.fullImgData = np.random.randn(numFeats, numSamps) self.imgsTracked = [(0, numSamps)] - # def compDecayingSVD(self, seedMe, a, b): - # #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. - # print(1) - # np.random.seed(seedMe + self.rank) - # numFeats = a - # numSamps = b//self.size - # # perturbation = np.random.rand(numSamps, numFeats)*0.1 - # # print(2) - # A1 = np.random.rand(numSamps, numFeats) - # print(3) - # Q1 = self.modified_gram_schmidt(A1, numFeats) - # print(5) - # A2 = np.random.rand(numFeats, numFeats) #Modify - # print(6) - # Q2, R2 = np.linalg.qr(A2) - # print(7) - # S = list(np.random.rand(numFeats)) #Modify - # print(8) - # S.sort() - # print(9) - # S = S[::-1] - # print(10) - # for j in range(len(S)): #Modify - # # S[j] = (2**(-16*(j+1)/len(S)))*S[j] #SCALING RUN - # S[j] = (2**(-5*(j+1)/len(S)))*S[j] #PARALLEL RUN: 01/10/2024 - # print(11) - # self.fullImgData = (Q1 @ np.diag(S) @ Q2).T - # print(12) - # self.imgsTracked = [(0, numSamps)] - # print(13) - - # def compDecayingSVD(self, seedMe, a, b): - # #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. - # numFeats = a - # numSamps = b//self.size - # perturbation = np.random.rand(numSamps, numFeats)*0.1 - # np.random.seed(seedMe) - # A1 = np.random.rand(numSamps, numFeats) - # Q1, R1 = np.linalg.qr(A1) - # Q1 = Q1 + perturbation - # A2 = np.random.rand(numFeats, numFeats) #Modify - # Q2, R2 = np.linalg.qr(A2) - # S = list(np.random.rand(numFeats)) #Modify - # S.sort() - # S = S[::-1] - # for j in range(len(S)): #Modify - # # S[j] = (2**(-16*(j+1)/len(S)))*S[j] #SCALING RUN - # S[j] = (2**(-5*(j+1)/len(S)))*S[j] #PARALLEL RUN: 01/10/2024 - # self.fullImgData = (Q1 @ np.diag(S) @ Q2).T - # self.imgsTracked = [(0, numSamps)] + def oldCompDecayingSVD1(self, seedMe, a, b): + #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. 
+ np.random.seed(seedMe + self.rank) + numFeats = a + numSamps = b//self.size + # perturbation = np.random.rand(numSamps, numFeats)*0.1 + A1 = np.random.rand(numSamps, numFeats) + Q1 = self.modified_gram_schmidt(A1, numFeats) + A2 = np.random.rand(numFeats, numFeats) #Modify + Q2, R2 = np.linalg.qr(A2) + S = list(np.random.rand(numFeats)) #Modify + S.sort() + S = S[::-1] + for j in range(len(S)): #Modify + # S[j] = (2**(-16*(j+1)/len(S)))*S[j] #SCALING RUN + S[j] = (2**(-5*(j+1)/len(S)))*S[j] #PARALLEL RUN: 01/10/2024 + self.fullImgData = (Q1 @ np.diag(S) @ Q2).T + self.imgsTracked = [(0, numSamps)] + + def oldCompDecayingSVD2(self, seedMe, a, b): + #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. + numFeats = a + numSamps = b//self.size + perturbation = np.random.rand(numSamps, numFeats)*0.1 + np.random.seed(seedMe) + A1 = np.random.rand(numSamps, numFeats) + Q1, R1 = np.linalg.qr(A1) + Q1 = Q1 + perturbation + A2 = np.random.rand(numFeats, numFeats) #Modify + Q2, R2 = np.linalg.qr(A2) + S = list(np.random.rand(numFeats)) #Modify + S.sort() + S = S[::-1] + for j in range(len(S)): #Modify + # S[j] = (2**(-16*(j+1)/len(S)))*S[j] #SCALING RUN + S[j] = (2**(-5*(j+1)/len(S)))*S[j] #PARALLEL RUN: 01/10/2024 + self.fullImgData = (Q1 @ np.diag(S) @ Q2).T + self.imgsTracked = [(0, numSamps)] def runMe(self): @@ -2040,27 +1848,18 @@ def runMe(self): et3 = time.perf_counter() print("Estimated time projection for rank {0}/{1}: {2}".format(self.rank, self.size, et3 - st3)) print("Estimated full processing time for rank {0}/{1}: {2}, {3}".format(self.rank, self.size, (et1 + et2 + et3 - st1 - st2 - st3), et3 - stfull)) - self.addThumbnailsToProjectH5() #JOHN CHANGE 01/09/2024. Modifying this just because we don't need this to do our testing. 
+ self.addThumbnailsToProjectH5() return (et1 + et2 + et3 - st1 - st2 - st3) -# self.comm.barrier() -# self.comm.Barrier() -# filenameTest3 = random.randint(0, 10) -# filenameTest3 = self.comm.allgather(filenameTest3) -# print("TEST 3: ", self.rank, filenameTest3) - def addThumbnailsToProjectH5(self): -# print("Gathering thumbnails") startingPoint = self.start_offset + self.num_imgs*self.rank//self.size _,self.fullThumbnailData,_,self.trueIntensitiesData = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) - # print("FULL THUMBNAIL DATA: ", np.array(self.fullThumbnailData).shape) - file_name = self.writeToHere+"{}_ProjectedData_{}.h5".format(self.currRun, self.rank) + file_name = self.writeToHere+f"{self.currRun:04}_ProjectedData_{self.rank}.h5" f1 = h5py.File(file_name, 'r+') f1.create_dataset("SmallImages", data=self.fullThumbnailData) f1.create_dataset("TrueIntensities", data=np.array(self.trueIntensitiesData)) f1.close() self.comm.barrier() - # print("FINISHED AIJOWDAWODIDWJA") class FD_ImageProcessing: def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile, unitVar, centerImg, roi_w, roi_h): @@ -2128,10 +1927,8 @@ def normalizeIntensityFunc(self, img, currIntensity): if img is None: return img elif currIntensity0: -# newTrueIntensities.append(0) -# else: -# newTrueIntensities.append(np.log(j)) if j<0: newTrueIntensities.append(0) else: newTrueIntensities.append(np.log(j)) origTrueIntensities = newTrueIntensities -# jimgs = [] -# for img in imgs: -# jimgs.append(self.imageProcessor.centerImgFunc(self.imageProcessor.thresholdFunc(img),100,100)) -# imgs = np.array(jimgs) - if getThumbnails: saveMe = [] for img in imgs: - #JOHN CHANGE 12/30/2023 - saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) #JOHN 011/09/2023 -# saveMe.append(np.array(img)) #JOHN 011/09/2023 + saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) thumbnails = np.array(saveMe) - num_valid_imgs, x, y = imgs.shape #JOHN 11/20/2023 + num_valid_imgs, x, y = imgs.shape -# img_batch = np.reshape(imgs, (num_valid_imgs, x * y)).T #JOHN 011/09/2023 img_batch = imgs.T -# print("Image values less than 0 setting to 0") img_batch[img_batch<0] = 0 - -# num_valid_imgs, x, y = img_batch.T.shape #JOHN 11/20/2023 -# print(num_valid_imgs, x, y) if getThumbnails: -# print("FLattening thumbnails") num_valid_thumbnails, tx, ty = thumbnails.shape -# thumbnail_batch = np.reshape(thumbnails, (num_valid_thumbnails, tx*ty)).T #JOHN 011/09/2023 thumbnail_batch = thumbnails.T if getThumbnails: @@ -2444,93 +2194,63 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) else: nimg = self.imageProcessor.processImg(img, currIntensity) -# nthumbnail = self.imageProcessor.processImg(thumbnail, currIntensity) #JOHN 011/09/2023 if nimg is None: nthumbnail = None else: nthumbnail = nimg.copy() -# print(np.array(nimg).shape) -# print(nthumbnail) if nimg is not None: nimg_batch.append(nimg) nthumbnail_batch.append(nthumbnail) ntrueIntensity_batch.append(trueIntens) else: -# nimg_batch.append(np.zeros((x, y))) -# nthumbnail_batch.append(np.zeros((tx, ty))) -# ntrueIntensity_batch.append(0) num_valid_thumbnails -= 1 num_valid_imgs -= 1 self.excludedImgs.append(ind) - if self.imageProcessor.centerImg: #JOHN 011/09/2023 -# 
print("a09wupoidkw", np.array(nimg_batch).shape) - nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 - nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, self.imageProcessor.roi_h, self.imageProcessor.roi_w) #JOHN 011/09/2023 + if self.imageProcessor.centerImg: + nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T + nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, self.imageProcessor.roi_h, self.imageProcessor.roi_w) - ############################## - # JOHN 12/30/2023 saveMe = [] for img in nthumbnail_batch: saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) nthumbnail_batch = np.array(saveMe) - # print("a09wdjaoimd", nimg_batch.shape, nthumbnail_batch.shape) - # print(nthumbnail_batch.shape) - # JOHN 12/30/2023 - - else: #JOHN 011/09/2023 -# print("a09wupoidkw", np.arrayħnimg_batch).shape) -# print(num_valid_imgs, x, y) - nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T #JOHN 011/09/2023 - nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) #JOHN 011/09/2023 + + else: + nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T + nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) if fullimgs is None and nimg_batch.shape[1]!=0: fullimgs = nimg_batch fullthumbnails = nthumbnail_batch - # print("FULL IMGS IS NONE.", "nimgbatch shape", nimg_batch.shape, "fullthumbnailshape", fullthumbnails.shape, "nthumbnail shape", nthumbnail_batch.shape) trueIntensities += ntrueIntensity_batch - # elif len(nimg_batch)!=0: - elif nimg_batch.shape[1]!=0: #JOHN CHANGE 12/31/2023 - # print("nimgbatch shape", nimg_batch.shape, "fullthumbnailshape", fullthumbnails.shape, "nthumbnail shape", nthumbnail_batch.shape) + elif nimg_batch.shape[1]!=0: fullimgs = np.hstack((fullimgs, nimg_batch)) fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) - # print("NEW: nimgbatch shape", nimg_batch.shape, "fullthumbnailshape", fullthumbnails.shape, "nthumbnail shape", nthumbnail_batch.shape) trueIntensities += ntrueIntensity_batch else: nimg_batch = [] for ind, img in enumerate(img_batch.T): currIntensity = np.sum(img.flatten(), dtype=np.double) -# print("Starting image processing of size {}".format(img_batch.T.shape)) nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) if nimg is not None: nimg_batch.append(nimg) else: -# nimg_batch.append(np.zeros((x, y))) num_valid_imgs -= 1 self.excludedImgs.append(ind) -# nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 - - #JOHN 11/20/23 - if self.imageProcessor.centerImg: #JOHN 011/09/2023 - nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T #JOHN 011/09/2023 - else: #JOHN 011/09/2023 - nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T #JOHN 011/09/2023 - - + if self.imageProcessor.centerImg: + nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T + else: + nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T -# print(nimg_batch.shape) -# print("hstacking") if fullimgs is None: fullimgs = nimg_batch - # elif len(nimg_batch)!=0: #JOHN 12/31/2023 elif nimg_batch.shape[1]!=0: -# print(fullimgs.shape, 
nimg_batch.shape, nimg_batch) fullimgs = np.hstack((fullimgs, nimg_batch)) print("EXCLUDING IMAGES: ", self.excludedImgs) -# print("Images tracked:", imgsTracked) if getThumbnails: print(fullimgs.shape, fullthumbnails.shape, imgsTracked) return (fullimgs, fullthumbnails, imgsTracked, trueIntensities) @@ -2542,16 +2262,15 @@ def main(): Perform Frequent Direction Visualization. """ params = parse_input() - os.makedirs(os.path.join(params.outdir, "figs"), exist_ok=True) visMe = visualizeFD(inputFile=params.outdir + f"/{params.run:04}_ProjectedData", - outputFile=params.outdir + f"figs/UMAPVis_{params.run:04}.html", + outputFile=params.outdir + f"/UMAPVis_{params.run:04}.html", numImgsToUse=params.num_imgs, nprocs=params.nprocs, userGroupings=[], includeABOD=True, skipSize=params.skip_size, # umap_n_neighbors=params.num_imgs_to_use // 4000, - umap_n_neighbors=params.num_imgs_to_use // 10000, + umap_n_neighbors= 15, umap_random_state=42, hdbscan_min_samples=int(params.num_imgs_to_use * 0.75 // 40), hdbscan_min_cluster_size=int(params.num_imgs_to_use // 40), From a254014fc167eef549892b0195989123ab8ed58d Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Tue, 12 Mar 2024 12:49:54 -0700 Subject: [PATCH 56/57] I don't think I changed anything. --- btx/processing/freqdir.py | 1 - 1 file changed, 1 deletion(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index b044d61ae..523c325ad 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -1714,7 +1714,6 @@ def estimFrobNormJ(self, addMe, arrs, k): def retrieveImages(self): startingPoint = self.start_offset + self.num_imgs*self.rank//self.size self.fullImgData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=False) - np.save('/sdf/home/w/winnicki/john_20240312.npy', self.fullImgData) def genSynthData(self): self.fullImgData = np.random.rand(70000, 100000//self.size) From 1449dac863c193a60b499044f19e0fd82600ddb9 Mon Sep 17 00:00:00 2001 From: John Winnicki Date: Mon, 1 Jul 2024 10:02:53 -0700 Subject: [PATCH 57/57] Not sure what changed --- btx/processing/freqdir.py | 565 ++++++++++++++++++++++++++------------ 1 file changed, 388 insertions(+), 177 deletions(-) diff --git a/btx/processing/freqdir.py b/btx/processing/freqdir.py index 523c325ad..1a355fd50 100644 --- a/btx/processing/freqdir.py +++ b/btx/processing/freqdir.py @@ -40,12 +40,14 @@ from bokeh.plotting import figure, show, output_file, save from bokeh.models import HoverTool, CategoricalColorMapper, LinearColorMapper, ColumnDataSource, CustomJS, Slider, RangeSlider, Toggle, RadioButtonGroup, Range1d, Label from bokeh.models import CustomJS, ColumnDataSource, Span, PreText -from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3, Plasma256, Plasma11 +from bokeh.palettes import Viridis256, Cividis256, Turbo256, Category20, Plasma3, Plasma256, Plasma11, Plasma10, Inferno256 from bokeh.layouts import column, row import cProfile import string +import pickle + import cv2 try: @@ -861,7 +863,7 @@ class visualizeFD: """ def __init__(self, inputFile, outputFile, numImgsToUse, nprocs, includeABOD, userGroupings, skipSize, umap_n_neighbors, umap_random_state, hdbscan_min_samples, hdbscan_min_cluster_size, - optics_min_samples, optics_xi, optics_min_cluster_size, outlierQuantile): + optics_min_samples, optics_xi, optics_min_cluster_size, outlierQuantile, ): self.inputFile = inputFile self.outputFile = outputFile 
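The main() hunk above wires parse_input() into visualizeFD: the per-rank projected-data files are located from outdir and the zero-padded run number, the HTML report is written alongside them, umap_n_neighbors is now pinned at 15, and the HDBSCAN sizes are derived from the number of images used. A small sketch of that wiring follows, assuming params exposes outdir, run, nprocs and num_imgs_to_use as parsed by parse_input(); vis_paths and cluster_params are illustrative names only, not functions in freqdir.py.

    import os

    def vis_paths(outdir, run, nprocs):
        # One HDF5 file per MPI rank, e.g. .../0214_ProjectedData_3.h5
        rank_files = [os.path.join(outdir, f"{run:04}_ProjectedData_{rank}.h5")
                      for rank in range(nprocs)]
        html_out = os.path.join(outdir, f"UMAPVis_{run:04}.html")
        return rank_files, html_out

    def cluster_params(num_imgs_to_use):
        # Heuristics mirrored from the visualizeFD call above
        return {
            "umap_n_neighbors": 15,
            "hdbscan_min_samples": int(num_imgs_to_use * 0.75 // 40),
            "hdbscan_min_cluster_size": int(num_imgs_to_use // 40),
        }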
output_file(filename=outputFile, title="Static HTML file") @@ -1113,27 +1115,58 @@ def find_bin(value): def genUMAP(self): - - imgs = None projections = None trueIntensities = None - for currRank in range(self.nprocs): - with h5py.File(self.inputFile+"_"+str(currRank)+".h5", 'r') as hf: - if imgs is None: - imgs = hf["SmallImages"][:] - projections = hf["ProjectedData"][:] - trueIntensities = hf["TrueIntensities"][:] - else: - imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) - projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) - trueIntensities = np.concatenate((trueIntensities, hf["TrueIntensities"][:]), axis=0) - print(len(imgs)) - - for intensMe in trueIntensities: - print(intensMe) - if(np.isnan(intensMe)): - print("This is NAN") + + runlbs = [] + currlb = 0 + currLen = 0 + + skipNums = [229, 231, 232, 248, 275, 305, 306, 309] + for iirun in range(214, 215, 1): + if iirun in skipNums: + print(f"SKIPPING RUN {iirun}") + continue + else: + for currRank in range(self.nprocs): + with h5py.File(self.inputFile+f"{iirun:04}_ProjectedData" + "_"+str(currRank)+".h5", 'r') as hf: + print(f"PROCESSING: " + self.inputFile+f"{iirun:04}_ProjectedData" + "_"+str(currRank)+".h5") + if imgs is None: + imgs = hf["SmallImages"][:] + projections = hf["ProjectedData"][:] + trueIntensities = hf["TrueIntensities"][:] + else: + imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) + projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) + trueIntensities = np.concatenate((trueIntensities, hf["TrueIntensities"][:]), axis=0) + currLen += len(hf["TrueIntensities"][:]) + for currInd in range(currLen): + runlbs.append(currlb) + currlb += 1 + currLen = 0 + + self.numFiles = currlb + + # colorRunLbs = [int(x/len(set(runlbs))*255) for x in runlbs] + # print(colorRunLbs) + # trueIntensities = None + # for currRank in range(self.nprocs): + # with h5py.File(self.inputFile+"_"+str(currRank)+".h5", 'r') as hf: + # if imgs is None: + # imgs = hf["SmallImages"][:] + # projections = hf["ProjectedData"][:] + # trueIntensities = hf["TrueIntensities"][:] + # else: + # imgs = np.concatenate((imgs, hf["SmallImages"][:]), axis=0) + # projections = np.concatenate((projections, hf["ProjectedData"][:]), axis=0) + # trueIntensities = np.concatenate((trueIntensities, hf["TrueIntensities"][:]), axis=0) + print(f"PROCESSING {len(imgs)} BEAM PROFILES") + + # for intensMe in trueIntensities: + # print(intensMe) + # if(np.isnan(intensMe)): + # print("This is NAN") intensities = [] for img in imgs: @@ -1148,6 +1181,9 @@ def genUMAP(self): self.projections = projections[:self.numImgsToUse:self.skipSize] self.intensities = intensities[:self.numImgsToUse:self.skipSize] + trueIntensities = trueIntensities[:self.numImgsToUse:self.skipSize] + runlbs = np.array(runlbs[:self.numImgsToUse:self.skipSize]) + self.numImgsToUse = int(self.numImgsToUse/self.skipSize) if len(self.imgs)!= self.numImgsToUse: @@ -1160,10 +1196,14 @@ def genUMAP(self): min_dist=0, ).fit_transform(self.projections) - self.labels = hdbscan.HDBSCAN( - min_samples = self.hdbscan_min_samples, - min_cluster_size = self.hdbscan_min_cluster_size - ).fit_predict(self.clusterable_embedding) + ##################### JOHN 05/18/2024 + np.save('clusteringSave.npy', self.clusterable_embedding) + + # self.labels = hdbscan.HDBSCAN( + # min_samples = self.hdbscan_min_samples, + # min_cluster_size = self.hdbscan_min_cluster_size + # ).fit_predict(self.clusterable_embedding) + self.labels = runlbs exclusionList = np.array([]) 
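The rewritten genUMAP above now walks a range of runs, concatenates the per-rank SmallImages, ProjectedData and TrueIntensities datasets, tags every image with a run label, thins the arrays by skipSize, and embeds the projections with UMAP; the HDBSCAN call is commented out and the run labels are used as cluster labels instead. A condensed single-run sketch of that flow is below, assuming the per-rank files written by addThumbnailsToProjectH5 and the umap-learn package; gather_rank_files and embed_projections are illustrative names only.

    import h5py
    import numpy as np
    import umap

    def gather_rank_files(input_prefix, run, nprocs):
        # Concatenate the datasets written by each MPI rank for one run
        imgs, projections, intensities = [], [], []
        for rank in range(nprocs):
            with h5py.File(f"{input_prefix}{run:04}_ProjectedData_{rank}.h5", "r") as hf:
                imgs.append(hf["SmallImages"][:])
                projections.append(hf["ProjectedData"][:])
                intensities.append(hf["TrueIntensities"][:])
        return np.concatenate(imgs), np.concatenate(projections), np.concatenate(intensities)

    def embed_projections(projections, skip_size=1, n_neighbors=15, random_state=42):
        # Thin the data, then embed the sketch-space projections into 2-D for plotting
        sub = projections[::skip_size]
        return umap.UMAP(n_neighbors=n_neighbors, random_state=random_state,
                         n_components=2, min_dist=0).fit_transform(sub)

Keeping the run index as the label (self.labels = runlbs) makes the plotted colours reflect acquisition run rather than HDBSCAN cluster, which is what the Inferno256 palette changes in genLabels and genHTML below rely on.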
self.clustered = np.isin(self.labels, exclusionList, invert=True) @@ -1218,8 +1258,13 @@ def genLabels(self): self.newLabels = np.array(self.relabel_to_closest_zero(newLabels)) self.experData_df['cluster'] = [str(x) for x in self.newLabels[self.clustered]] self.experData_df['ptColor'] = [x for x in self.experData_df['cluster']] - self.experData_df['dbscan_backgroundColor'] = [Category20[20][x] for x in self.newLabels] - self.experData_df['backgroundColor'] = [Category20[20][x] for x in self.newLabels] + + ################# JOHN CHANGE 05/14/2024 + self.experData_df['dbscan_backgroundColor'] = [Inferno256[::(int(256/self.numFiles))][x] for x in self.newLabels] + self.experData_df['backgroundColor'] = [Inferno256[::(int(256/self.numFiles))][x] for x in self.newLabels] + # self.experData_df['dbscan_backgroundColor'] = [Category20[20][x] for x in self.newLabels] + # self.experData_df['backgroundColor'] = [Category20[20][x] for x in self.newLabels] + medoid_lst = self.genMedoids(self.newLabels, self.clusterable_embedding) self.medoidInds = [x[1] for x in medoid_lst] medoidBold = [] @@ -1245,8 +1290,10 @@ def genLabels(self): def genHTML(self): datasource = ColumnDataSource(self.experData_df) + ####################################### JOHN CHANGE 05/14/2024 + color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Inferno256[::(int(256/self.numFiles))]) #JOHN CHANGE 20231020 - color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) + # color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Category20[20]) # color_mapping = CategoricalColorMapper(factors=[str(x) for x in list(set(self.newLabels))],palette=Plasma256[::16]) plot_figure = figure( title='UMAP projection with DBSCAN clustering of the LCLS dataset', @@ -1573,7 +1620,7 @@ class WrapperFullFD: """ # from btx.interfaces.ipsana import PsanaInterface btx = __import__('btx') - def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, unitVar=False, usePSI=True): + def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grabImgSteps, num_components, alpha, rankAdapt, rankAdaptMinError, downsample, bin_factor, threshold, eluThreshold, eluAlpha, normalizeIntensity, noZeroIntensity, minIntensity, samplingFactor, divBy, thresholdQuantile, unitVar=False, usePSI=True, downsampleImg=150, roiLen=800, thumbLen=64, centerImg=True): self.start_offset = start_offset self.num_imgs = num_imgs self.exp = exp @@ -1597,6 +1644,10 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.thresholdQuantile = thresholdQuantile self.unitVar = unitVar + self.downsampleImg = downsampleImg + self.roiLen = roiLen + self.thumbLen = thumbLen + self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() @@ -1604,6 +1655,8 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab self.imgsTracked = [] self.grabImgSteps = grabImgSteps + self.centerImg = centerImg + self.usePSI = usePSI if usePSI: self.psi = self.btx.interfaces.ipsana.PsanaInterface(exp=exp, run=run, det_type=det_type) @@ -1619,9 +1672,9 @@ def __init__(self, exp, run, det_type, start_offset, num_imgs, writeToHere, grab 
self.newBareTime = 0 - self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = True, roi_w=200, roi_h = 200) + self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = self.centerImg, roiLen=self.roiLen, downsampleImg=self.downsampleImg) # self.imageProcessor = FD_ImageProcessing(threshold = self.threshold, eluThreshold = self.eluThreshold, eluAlpha = self.eluAlpha, noZeroIntensity = self.noZeroIntensity, normalizeIntensity=self.normalizeIntensity, minIntensity=self.minIntensity, thresholdQuantile=self.thresholdQuantile, unitVar = self.unitVar, centerImg = False, roi_w=500, roi_h = 500) - self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbnailHeight = 64, thumbnailWidth = 64) + self.imgRetriever = SinglePanelDataRetriever(exp=exp, det_type=det_type, run=run, downsample=downsample, bin_factor=bin_factor, imageProcessor = self.imageProcessor, thumbLen=self.thumbLen) def oldLowMemoryReconstructionErrorScaled1(self, matrixCentered, matSketch): """ @@ -1713,7 +1766,8 @@ def estimFrobNormJ(self, addMe, arrs, k): def retrieveImages(self): startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - self.fullImgData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=False) + # self.fullImgData, self.imgsTracked = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=False) + self.fullImgData, self.imgsTracked = self.imgRetriever.get_fake_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=False) def genSynthData(self): self.fullImgData = np.random.rand(70000, 100000//self.size) @@ -1745,6 +1799,7 @@ def modified_gram_schmidt(self, A, num_vecs): print(f"COMPUTED VECTOR {j}/{num_vecs}") return Q + #JOHN COMMENT 06/27/2024: This is the code you should use for testing speed in parallelization. def compDecayingSVD(self, seedMe, a, b): numFeats = a numSamps = b//self.size @@ -1770,6 +1825,7 @@ def oldCompDecayingSVD1(self, seedMe, a, b): self.fullImgData = (Q1 @ np.diag(S) @ Q2).T self.imgsTracked = [(0, numSamps)] + #JOHN COMMENT 06/27/2024: This is the code you should use for testing performance. It generates synthetic data for error. def oldCompDecayingSVD2(self, seedMe, a, b): #JOHN COMMENT 01/09/2024: YOU MUST HAVE GREATER NUMBER OF COMPONENTS VERSUS NUMBER OF SAMPLES. 
numFeats = a @@ -1852,7 +1908,8 @@ def runMe(self): def addThumbnailsToProjectH5(self): startingPoint = self.start_offset + self.num_imgs*self.rank//self.size - _,self.fullThumbnailData,_,self.trueIntensitiesData = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) + # _,self.fullThumbnailData,_,self.trueIntensitiesData = self.imgRetriever.get_formatted_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) + _,self.fullThumbnailData,_,self.trueIntensitiesData = self.imgRetriever.get_fake_images(startInd=startingPoint, n=self.num_imgs//self.size, num_steps=self.grabImgSteps, getThumbnails=True) file_name = self.writeToHere+f"{self.currRun:04}_ProjectedData_{self.rank}.h5" f1 = h5py.File(file_name, 'r+') f1.create_dataset("SmallImages", data=self.fullThumbnailData) @@ -1861,7 +1918,7 @@ def addThumbnailsToProjectH5(self): self.comm.barrier() class FD_ImageProcessing: - def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile, unitVar, centerImg, roi_w, roi_h): + def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalizeIntensity, minIntensity, thresholdQuantile, unitVar, centerImg, roiLen, downsampleImg): self.threshold = threshold self.eluThreshold = eluThreshold self.eluAlpha = eluAlpha @@ -1871,8 +1928,8 @@ def __init__(self, threshold, eluThreshold, eluAlpha, noZeroIntensity, normalize self.thresholdQuantile = thresholdQuantile self.unitVar = unitVar self.centerImg = centerImg - self.roi_w = roi_w - self.roi_h = roi_h + self.roiLen = roiLen + self.downsampleImg = downsampleImg def processImg(self, nimg, ncurrIntensity): if self.threshold: @@ -1880,6 +1937,7 @@ def processImg(self, nimg, ncurrIntensity): if self.eluThreshold: nimg = self.eluThresholdFunc(nimg) if self.centerImg: + # print("CENTERING IMAGE") nimg = self.centerImgFunc(nimg) if nimg is not None: @@ -1893,6 +1951,8 @@ def processImg(self, nimg, ncurrIntensity): nimg = self.normalizeIntensityFunc(nimg, currIntensity) if self.unitVar: nimg = self.unitVarFunc(nimg, currIntensity) + if self.downsampleImg!=-1: + nimg = self.downsampleFunc(nimg) return nimg def elu(self,x): @@ -1937,30 +1997,69 @@ def unitVarFunc(self, img, currIntensity): return img/img.std(axis=0) def centerImgFunc(self, img): - if img is None: + def find_center_of_gravity(image): + moments = cv2.moments(image) + if moments["m00"] != 0: + cX = int(moments["m10"] / moments["m00"]) + cY = int(moments["m01"] / moments["m00"]) + else: + cX, cY = 0, 0 + return int(cX), int(cY) + def find_bounding_box(array, box_size): + center_y, center_x = find_center_of_gravity(array) + return (center_y-box_size//2, center_x-box_size//2, box_size, box_size) + if img is None: return img else: - nimg = img - rampingFact = 1 - while rampingFact>=1: - curr_roi_w = int(self.roi_w*rampingFact) - curr_roi_h = int(self.roi_h*rampingFact) - nimg = np.pad(img, max(2*curr_roi_w, 2*curr_roi_h)+1) - if np.sum(img.flatten(), dtype=np.double)<10000: - cogx, cogy = (curr_roi_w, curr_roi_h) - else: - cogx, cogy = self.calcCenterGrav(nimg) - nimg = nimg[cogx-(curr_roi_w//2):cogx+(curr_roi_w//2), cogy-(curr_roi_h//2):cogy+(curr_roi_h//2)] - rampingFact -= 0.5 + nimg = np.pad(img, self.roiLen//2, mode='constant', constant_values=0) + bounding_box = find_bounding_box(nimg, self.roiLen) + nimg = nimg[bounding_box[1]:bounding_box[1]+bounding_box[3], 
bounding_box[0]:bounding_box[0]+bounding_box[2]] return nimg + + def downsampleFunc(self, img): + if img is None: + return img + else: + normalized_array = (255 * (img - np.min(img)) / np.ptp(img)).astype(np.uint8) + image = Image.fromarray(normalized_array) + nimg = image.resize((self.downsampleImg, self.downsampleImg), Image.Resampling.LANCZOS) + return np.array(nimg) + - def calcCenterGrav(self, grid): - M_total = np.sum(grid) - row_indices, col_indices = np.indices(grid.shape) - X_c = np.sum(row_indices * grid) / M_total - Y_c = np.sum(col_indices * grid) / M_total -# print(M_total, X_c, Y_c, grid) - return (round(X_c), round(Y_c)) + # def centerImgFunc(self, img): + # if img is None: + # return img + # else: + # nimg = img + # rampingFact = 1 + # while rampingFact>=1: + # curr_roi_w = int(self.roi_w*rampingFact) + # curr_roi_h = int(self.roi_h*rampingFact) + # nimg = np.pad(img, max(2*curr_roi_w, 2*curr_roi_h)+1) + # if np.sum(img.flatten(), dtype=np.double)<10000: + # cogx, cogy = (curr_roi_w, curr_roi_h) + # else: + # cogx, cogy = self.calcCenterGrav(nimg) + # nimg = nimg[cogx-(curr_roi_w//2):cogx+(curr_roi_w//2), cogy-(curr_roi_h//2):cogy+(curr_roi_h//2)] + # rampingFact -= 0.5 + # return nimg + +# def calcCenterGrav(self, grid): +# M_total = np.sum(grid) +# row_indices, col_indices = np.indices(grid.shape) +# X_c = np.sum(row_indices * grid) / M_total +# Y_c = np.sum(col_indices * grid) / M_total +# # print(M_total, X_c, Y_c, grid) +# return (round(X_c), round(Y_c)) + # def find_center_of_gravity(image): + # moments = cv2.moments(image) + # if moments["m00"] != 0: + # cX = int(moments["m10"] / moments["m00"]) + # cY = int(moments["m01"] / moments["m00"]) + # else: + # cX, cY = 0, 0 + + # return cX, cY @@ -2096,12 +2195,11 @@ def get_formatted_images(self, startInd, n, num_steps, getThumbnails): class SinglePanelDataRetriever: btx = __import__('btx') - def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, thumbnailHeight, thumbnailWidth): + def __init__(self, exp, det_type, run, downsample, bin_factor, imageProcessor, thumbLen): self.exp = exp self.det_type = det_type self.run = run - self.thumbnailHeight = thumbnailHeight - self.thumbnailWidth = thumbnailWidth + self.thumbLen = thumbLen self.psi = self.btx.interfaces.ipsana.PsanaInterface(exp=exp, run=run, det_type=det_type) @@ -2125,136 +2223,249 @@ def split_range(self, start, end, num_tuples): tuples.append((last_batch_start, last_batch_end)) return tuples - def get_formatted_images(self, startInd, n, num_steps, getThumbnails): - """ - Fetch n - x image segments from run, where x is the number of 'dead' images. - - Parameters - ---------- - n : int - number of images to retrieve - start_index : int - start index of subsection of data to retrieve - end_index : int - end index of subsection of data to retrieve - - Returns - ------- - ndarray, shape (end_index-start_index, n-x) - n-x retrieved image segments of dimension end_index-start_index - """ - fullimgs = None - fullthumbnails = None - imgsTracked = [] - runs = self.split_range(startInd, startInd+n, num_steps) - print(runs) - trueIntensities = [] - for runStart, runEnd in runs: - self.psi.counter = runStart - imgsTracked.append((runStart, runEnd)) - - imgs = self.psi.get_images(runEnd-runStart, assemble=False) + # def get_formatted_images(self, startInd, n, num_steps, getThumbnails): + # """ + # Fetch n - x image segments from run, where x is the number of 'dead' images. 
+ + # Parameters + # ---------- + # n : int + # number of images to retrieve + # start_index : int + # start index of subsection of data to retrieve + # end_index : int + # end index of subsection of data to retrieve + + # Returns + # ------- + # ndarray, shape (end_index-start_index, n-x) + # n-x retrieved image segments of dimension end_index-start_index + # """ + # fullimgs = None + # imgsTracked = [] + # runs = self.split_range(startInd, startInd+n, num_steps) + # print(runs) + # trueIntensities = [] + + # for runStart, runEnd in runs: + # self.psi.counter = runStart + # imgsTracked.append((runStart, runEnd)) + + # imgs = self.psi.get_images(runEnd-runStart, assemble=False) + + # imgs = imgs[ + # [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] + # ] + + # origTrueIntensities = [np.sum(img.flatten(), dtype=np.double) for img in imgs] + # newTrueIntensities = [] + # for j in origTrueIntensities: + # if j<0: + # newTrueIntensities.append(0) + # else: + # newTrueIntensities.append(np.log(j)) + # origTrueIntensities = newTrueIntensities + + # num_valid_imgs, x, y = imgs.shape + + # imshape = (x, y) + # if self.imageProcessor.centerImg: + # imshape = (self.imageProcessor.roiLen, self.imageProcessor.roiLen) + # if self.imageProcessor.downsampleImg!=-1: + # imshape = (self.imageProcessor.downsampleImg, self.imageProcessor.downsampleImg) + + # img_batch = imgs.T + # img_batch[img_batch<0] = 0 + + # nimg_batch = [] + # ntrueIntensity_batch = [] + # for ind, (img, trueIntens) in enumerate(zip(img_batch.T, origTrueIntensities)): + # currIntensity = np.sum(img.flatten(), dtype=np.double) + # nimg = self.imageProcessor.processImg(img, currIntensity) + # if nimg is not None: + # nimg_batch.append(nimg) + # ntrueIntensity_batch.append(trueIntens) + # else: + # num_valid_imgs -= 1 + # self.excludedImgs.append(ind) + # if self.imageProcessor.centerImg or self.imageProcessor.downsampleImg!=-1: + # nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, imshape[0]*imshape[1]).T + # else: + # nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T + + # if fullimgs is None and nimg_batch.shape[1]!=0: + # fullimgs = nimg_batch + # trueIntensities += ntrueIntensity_batch + # elif nimg_batch.shape[1]!=0: + # fullimgs = np.hstack((fullimgs, nimg_batch)) + # trueIntensities += ntrueIntensity_batch + + # print("EXCLUDING IMAGES: ", self.excludedImgs) + + # if getThumbnails: + # fullThumbs = [] + # for img in fullimgs.T: + # nimg = img.reshape(imshape) + # if self.imageProcessor.downsampleImg==-1: + # normalized_array = (255 * (nimg - np.min(nimg)) / np.ptp(nimg)).astype(np.uint8) + # else: + # normalized_array = nimg + # image = Image.fromarray(normalized_array) + # thumbnail = image.resize((self.thumbLen, self.thumbLen), Image.Resampling.LANCZOS) + # thumbnail_array = np.array(thumbnail) + # fullThumbs.append(thumbnail_array) + # return (fullimgs, fullThumbs, imgsTracked, trueIntensities) + # else: + # return (fullimgs, imgsTracked) + - imgs = imgs[ - [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] - ] + def get_fake_images(self, startInd, n, num_steps, getThumbnails): + fullimgs = np.load('/sdf/data/lcls/ds/xpp/xppx22715/scratch/winnicki/fakeImgData.npy') + with open('/sdf/data/lcls/ds/xpp/xppx22715/scratch/winnicki/fakeThumbData.pkl', 'rb') as f: + fullThumbs = pickle.load(f) + if getThumbnails: + return (fullimgs, fullThumbs, [[0, 10]], [1 for x in range(len(fullimgs))]) + else: + return (fullimgs, [[0, 10]]) + + +################################# 
JOHN OLD VERSION 05/13/2024 ############################################ +# def get_formatted_images(self, startInd, n, num_steps, getThumbnails): +# """ +# Fetch n - x image segments from run, where x is the number of 'dead' images. + +# Parameters +# ---------- +# n : int +# number of images to retrieve +# start_index : int +# start index of subsection of data to retrieve +# end_index : int +# end index of subsection of data to retrieve + +# Returns +# ------- +# ndarray, shape (end_index-start_index, n-x) +# n-x retrieved image segments of dimension end_index-start_index +# """ +# fullimgs = None +# fullthumbnails = None +# imgsTracked = [] +# runs = self.split_range(startInd, startInd+n, num_steps) +# print(runs) +# trueIntensities = [] +# for runStart, runEnd in runs: +# self.psi.counter = runStart +# imgsTracked.append((runStart, runEnd)) + +# imgs = self.psi.get_images(runEnd-runStart, assemble=False) + +# imgs = imgs[ +# [i for i in range(imgs.shape[0]) if not np.isnan(imgs[i : i + 1]).any()] +# ] + +# origTrueIntensities = [np.sum(img.flatten(), dtype=np.double) for img in imgs] +# newTrueIntensities = [] +# for j in origTrueIntensities: +# if j<0: +# newTrueIntensities.append(0) +# else: +# newTrueIntensities.append(np.log(j)) +# origTrueIntensities = newTrueIntensities + +# if getThumbnails: +# saveMe = [] +# for img in imgs: +# saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) +# thumbnails = np.array(saveMe) + +# num_valid_imgs, x, y = imgs.shape + +# img_batch = imgs.T +# img_batch[img_batch<0] = 0 + +# if getThumbnails: +# num_valid_thumbnails, tx, ty = thumbnails.shape +# thumbnail_batch = thumbnails.T + +# if getThumbnails: +# nimg_batch = [] +# nthumbnail_batch = [] +# ntrueIntensity_batch = [] +# for ind, (img, thumbnail, trueIntens) in enumerate(zip(img_batch.T, thumbnail_batch.T, origTrueIntensities)): +# currIntensity = np.sum(img.flatten(), dtype=np.double) +# if self.imageProcessor.centerImg: +# nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) +# else: +# nimg = self.imageProcessor.processImg(img, currIntensity) +# if nimg is None: +# nthumbnail = None +# else: +# nthumbnail = nimg.copy() +# if nimg is not None: +# nimg_batch.append(nimg) +# nthumbnail_batch.append(nthumbnail) +# ntrueIntensity_batch.append(trueIntens) +# else: +# num_valid_thumbnails -= 1 +# num_valid_imgs -= 1 +# self.excludedImgs.append(ind) +# if self.imageProcessor.centerImg: +# nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T +# nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, self.imageProcessor.roi_h, self.imageProcessor.roi_w) + +# saveMe = [] +# for img in nthumbnail_batch: +# saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) +# nthumbnail_batch = np.array(saveMe) + +# else: +# nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T +# nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) + + +# if fullimgs is None and nimg_batch.shape[1]!=0: +# fullimgs = nimg_batch +# fullthumbnails = nthumbnail_batch +# trueIntensities += ntrueIntensity_batch +# elif nimg_batch.shape[1]!=0: +# fullimgs = np.hstack((fullimgs, nimg_batch)) +# fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) +# trueIntensities += ntrueIntensity_batch +# else: +# nimg_batch = [] +# for ind, img in enumerate(img_batch.T): +# currIntensity = np.sum(img.flatten(), 
dtype=np.double) +# nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) +# if nimg is not None: +# nimg_batch.append(nimg) +# else: +# num_valid_imgs -= 1 +# self.excludedImgs.append(ind) - origTrueIntensities = [np.sum(img.flatten(), dtype=np.double) for img in imgs] - newTrueIntensities = [] - for j in origTrueIntensities: - if j<0: - newTrueIntensities.append(0) - else: - newTrueIntensities.append(np.log(j)) - origTrueIntensities = newTrueIntensities +# if self.imageProcessor.centerImg: +# nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T +# else: +# nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T - if getThumbnails: - saveMe = [] - for img in imgs: - saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) - thumbnails = np.array(saveMe) +# if fullimgs is None: +# fullimgs = nimg_batch +# elif nimg_batch.shape[1]!=0: +# fullimgs = np.hstack((fullimgs, nimg_batch)) - num_valid_imgs, x, y = imgs.shape +# print("EXCLUDING IMAGES: ", self.excludedImgs) - img_batch = imgs.T - img_batch[img_batch<0] = 0 - - if getThumbnails: - num_valid_thumbnails, tx, ty = thumbnails.shape - thumbnail_batch = thumbnails.T +# if getThumbnails: +# print(fullimgs.shape, fullthumbnails.shape, imgsTracked) +# return (fullimgs, fullthumbnails, imgsTracked, trueIntensities) +# else: +# return (fullimgs, imgsTracked) - if getThumbnails: - nimg_batch = [] - nthumbnail_batch = [] - ntrueIntensity_batch = [] - for ind, (img, thumbnail, trueIntens) in enumerate(zip(img_batch.T, thumbnail_batch.T, origTrueIntensities)): - currIntensity = np.sum(img.flatten(), dtype=np.double) - if self.imageProcessor.centerImg: - nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) - else: - nimg = self.imageProcessor.processImg(img, currIntensity) - if nimg is None: - nthumbnail = None - else: - nthumbnail = nimg.copy() - if nimg is not None: - nimg_batch.append(nimg) - nthumbnail_batch.append(nthumbnail) - ntrueIntensity_batch.append(trueIntens) - else: - num_valid_thumbnails -= 1 - num_valid_imgs -= 1 - self.excludedImgs.append(ind) - if self.imageProcessor.centerImg: - nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, self.imageProcessor.roi_h*self.imageProcessor.roi_w).T - nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, self.imageProcessor.roi_h, self.imageProcessor.roi_w) - - saveMe = [] - for img in nthumbnail_batch: - saveMe.append(np.array(Image.fromarray(img).resize((self.thumbnailHeight, self.thumbnailWidth)))) - nthumbnail_batch = np.array(saveMe) - else: - nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T - nthumbnail_batch = np.array(nthumbnail_batch).reshape(num_valid_thumbnails, tx, ty) - - - if fullimgs is None and nimg_batch.shape[1]!=0: - fullimgs = nimg_batch - fullthumbnails = nthumbnail_batch - trueIntensities += ntrueIntensity_batch - elif nimg_batch.shape[1]!=0: - fullimgs = np.hstack((fullimgs, nimg_batch)) - fullthumbnails = np.vstack((fullthumbnails, nthumbnail_batch)) - trueIntensities += ntrueIntensity_batch - else: - nimg_batch = [] - for ind, img in enumerate(img_batch.T): - currIntensity = np.sum(img.flatten(), dtype=np.double) - nimg = self.imageProcessor.processImg(img[200:, :], currIntensity) - if nimg is not None: - nimg_batch.append(nimg) - else: - num_valid_imgs -= 1 - self.excludedImgs.append(ind) - if self.imageProcessor.centerImg: - nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, 
self.imageProcessor.roi_h*self.imageProcessor.roi_w).T - else: - nimg_batch = np.array(nimg_batch).reshape(num_valid_imgs, x*y).T - if fullimgs is None: - fullimgs = nimg_batch - elif nimg_batch.shape[1]!=0: - fullimgs = np.hstack((fullimgs, nimg_batch)) - print("EXCLUDING IMAGES: ", self.excludedImgs) - if getThumbnails: - print(fullimgs.shape, fullthumbnails.shape, imgsTracked) - return (fullimgs, fullthumbnails, imgsTracked, trueIntensities) - else: - return (fullimgs, imgsTracked) def main(): """