gruber-sciencelab · fabian178 · Feb 28, 2024
diff --git a/smeagol/matrices.py b/smeagol/matrices.py
@@ -120,10 +120,13 @@ def position_wise_ic(probs):
 # Functions to convert matrix types
 
 
-def ppm_to_pwm(probs):
+def ppm_to_pwm(probs, pseudocount=0.0001):
     """Function to convert a valid PPM into a PWM, using
     the formula: PWM = log2(PPM/B), where the background
-    probability B is set to 0.25.
+    probability B is set to 0.25. If the PPM contains any
+    zero entries, the pseudocount is added to every entry
+    and each row is renormalized to sum to 1 before the
+    conversion, so that log2(0) is never evaluated.
 
     Args:
         probs (np.array): Numpy array containing PPM probability values
@@ -134,6 +137,16 @@ def ppm_to_pwm(probs):
 
     """
     check_ppm(probs)
+    # If there is a 0 entry in the PPM, add the pseudocount to every entry and
+    # renormalize so that all rows sum to 1 again; this removes any 0 entries.
+    # This is to avoid taking log2(0), which returns -inf.
+    # Note that this changes the matrix slightly.
+    # Arguably it is best to do this check and adjustment when reading in the
+    # PPM; it is done here for now because that suffices for the current use.
+    if (probs == 0).any():
+        probs = probs+pseudocount
+        probs = probs / probs.sum(axis=1, keepdims=True)
+
     return np.log2(probs / 0.25)
 
 
@@ -551,3 +564,40 @@ def cluster_pms(df, n_clusters, sims=None, weight_col="weights"):
     ]
     result = {"clusters": cluster_ids, "reps": reps, "min_ncorr": min_ncorrs}
     return result
+
+
+# This function was added because it was missing from this module.
+# Please remove it again if it should not be here.
+def get_rep_mats(sel_pwms, max_clusters=5, threshold=.2):
+    """Function to cluster position matrices based on a specified threshold. 
+    A distance matrix between the matrices is computed using the normalized Pearson
+    correlation metric and agglomerative clustering is used to
+    find clusters. `choose_representative_pm` is called to
+    identify a representative matrix from each cluster.
+
+    Args:
+        sel_pwms (pandas df): Dataframe containing position matrix values
+            (read from its `weights` column) and IDs.
+        max_clusters (int): Maximum number of clusters to try
+        threshold (float): Value the minimum pairwise similarity must exceed
+
+    Returns:
+        result: dictionary with keys `reps` and `min_ncorr`; when the
+                matrices are split into clusters it is the output of
+                `cluster_pms`, which also has the key `clusters`
+
+    """
+    choice = None
+    n_clusters = 1
+    min_ncorr = np.nanmin(pairwise_ncorrs(list(sel_pwms.weights)))
+
+    if min_ncorr > threshold:
+        choice = {'reps': choose_representative_pm(sel_pwms),
+                  'min_ncorr': min_ncorr}
+    else:
+        while ((n_clusters < max_clusters) & (min_ncorr <= threshold)):
+            n_clusters += 1
+            choice = cluster_pms(sel_pwms, n_clusters=n_clusters)
+            min_ncorr = np.min(choice['min_ncorr'])
+
+    return choice
diff --git a/smeagol/visualize.py b/smeagol/visualize.py
@@ -12,13 +12,14 @@
 import scipy.stats as stats
 
 
-def ppm_logo(probs, title="", figsize=(5, 2)):
+def ppm_logo(probs, title="", figsize=(5, 2), fileName=""):
     """Function to visualize the sequence logo of a PPM.
 
     Args:
         probs (np.array): array containing probabilities
         title (str): Title of the logo.
         figsize (tuple): (width, height)
+        fileName (str): path for saving the plot, will not save if empty
 
     Returns:
         PPM logo
@@ -40,56 +41,61 @@ def ppm_logo(probs, title="", figsize=(5, 2)):
                           length_padding=0.01,
                           subticks_frequency=1,
                           highlight={})
+    if not fileName=="":
+        plt.savefig(fileName)
     plt.show()
 
 
-def pwm_logo(weights, title="", figsize=(5, 2)):
+def pwm_logo(weights, title="", figsize=(5, 2), fileName = ""):
     """Function to visualize the sequence logo of a PWM.
 
     Args:
         weights (np.array): array containing PWM weight values
         title (str): Title of the logo.
         figsize (tuple): (width, height)
+        fileName (str): path for saving the plot, will not save if empty
 
     Returns:
         PWM logo
 
     """
     # Convert PWM to PPM
     ppm = pwm_to_ppm(weights)
-    ppm_logo(ppm, title=title, figsize=figsize)
+    ppm_logo(ppm, title=title, figsize=figsize, fileName=fileName)
 
 
-def plot_pwm(pwm_df, Matrix_id, figsize=(5, 2)):
+def plot_pwm(pwm_df, Matrix_id, figsize=(5, 2), fileName = ""):
     """Function to plot sequence logo from PWM
 
     Args:
         pwm_df (pd.DataFrame): Dataframe containing cols weight, Matrix_id
         Matrix_id: ID of PWM to plot (will be used as logo title)
         figsize (tuple): (width, height)
+        fileName (str): path for saving the plot, will not save if empty
 
     Returns:
         Plots PWM
 
     """
     weights = pwm_df.weights.values[pwm_df.Matrix_id == Matrix_id][0]
-    pwm_logo(weights=weights, title=Matrix_id, figsize=figsize)
+    pwm_logo(weights=weights, title=Matrix_id, figsize=figsize, fileName=fileName)
 
 
-def plot_ppm(ppm_df, Matrix_id, figsize=(5, 2)):
+def plot_ppm(ppm_df, Matrix_id, figsize=(5, 2), fileName = ""):
     """
     Function to plot sequence logo from PPM
 
     Args:
         ppm_df (pd.DataFrame): Dataframe containing cols probs, Matrix_id
         Matrix_id: ID of PWM to plot
+        fileName (str): path for saving the plot, will not save if empty
 
     Returns:
         Plots PPM
 
     """
     probs = ppm_df.probs.values[ppm_df.Matrix_id == Matrix_id][0]
-    ppm_logo(probs=probs, title=Matrix_id, figsize=figsize)
+    ppm_logo(probs=probs, title=Matrix_id, figsize=figsize, fileName=fileName)
 
 
 def plot_binned_count_dist(