Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed ppm handling and added figure saving #44

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 52 additions & 2 deletions smeagol/matrices.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,10 +120,13 @@ def position_wise_ic(probs):
# Functions to convert matrix types


def ppm_to_pwm(probs, pseudocount=0.0001):
    """Function to convert a valid PPM into a PWM, using
    the formula: PWM = log2(PPM/B), where the background
    probability B is set to 0.25.

    If the PPM contains any 0 entries, `pseudocount` is added to every
    entry and each row is re-normalized to sum to 1 before the logarithm
    is taken, so that log2(0) = -inf is avoided. This changes the matrix
    slightly. A PPM without 0 entries is converted unchanged.

    Args:
        probs (np.array): Numpy array containing PPM probability values
        pseudocount (float): Value added to every entry of the PPM when
            it contains at least one 0 entry. Must not be negative. With
            a value of 0 the 0 entries are left in place.

    Returns:
        Numpy array containing the PWM values log2(probs / 0.25)

    Raises:
        ValueError: if `pseudocount` is negative.

    """
    check_ppm(probs)
    if pseudocount < 0:
        # A negative pseudocount would push 0 entries below 0 and turn
        # the logarithm into NaN instead of avoiding -inf.
        raise ValueError("pseudocount must be non-negative")

    # NOTE(review): it would probably be better to do this check and
    # adjustment once, when the PPM is read in; it lives here for now.
    if (probs == 0).any():
        # Shift every entry away from 0, then rescale each row so that it
        # sums to 1 again.
        probs = probs + pseudocount
        probs = probs / probs.sum(axis=1, keepdims=True)

    return np.log2(probs / 0.25)


Expand Down Expand Up @@ -551,3 +564,40 @@ def cluster_pms(df, n_clusters, sims=None, weight_col="weights"):
]
result = {"clusters": cluster_ids, "reps": reps, "min_ncorr": min_ncorrs}
return result


# NOTE(review): get_rep_mats was added here because it was missing from this
# module. Please remove it again if it does not belong here.
def get_rep_mats(sel_pwms, max_clusters=5, threshold=.2):
    """Function to choose representative position matrices, clustering
    the input matrices only as far as needed.

    The minimum pairwise normalized correlation between all matrices is
    computed with `pairwise_ncorrs`. If it is above `threshold`, the
    matrices are treated as a single group and `choose_representative_pm`
    picks the representative. Otherwise `cluster_pms` is called with 2, 3,
    ... clusters until the smallest within-cluster similarity exceeds
    `threshold` or `max_clusters` is reached.

    Args:
        sel_pwms (pandas df): Dataframe containing the position matrices;
            the matrix values are read from its `weights` column.
        max_clusters (int): Maximum number of clusters to try.
        threshold (float): Minimum pairwise similarity that must be
            exceeded for the matrices (or the clusters) to be accepted.

    Returns:
        choice: If no clustering was needed, a dictionary with the keys
            'reps' and 'min_ncorr'. Otherwise the dictionary returned by
            the last call to `cluster_pms` (cluster labels, representative
            matrix IDs, and minimum pairwise similarity within each
            cluster). None if the similarity is not above `threshold` and
            `max_clusters` does not allow any clustering attempt.

    """
    min_ncorr = np.nanmin(pairwise_ncorrs(list(sel_pwms.weights)))

    # All matrices are similar enough: a single representative suffices.
    if min_ncorr > threshold:
        return {'reps': choose_representative_pm(sel_pwms),
                'min_ncorr': min_ncorr}

    choice = None
    n_clusters = 1
    # Increase the number of clusters until every cluster is internally
    # similar enough or the cluster budget is used up.
    while n_clusters < max_clusters and min_ncorr <= threshold:
        n_clusters += 1
        choice = cluster_pms(sel_pwms, n_clusters=n_clusters)
        min_ncorr = np.min(choice['min_ncorr'])

    return choice
20 changes: 13 additions & 7 deletions smeagol/visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@
import scipy.stats as stats


def ppm_logo(probs, title="", figsize=(5, 2)):
def ppm_logo(probs, title="", figsize=(5, 2), fileName=""):
"""Function to visualize the sequence logo of a PPM.

Args:
probs (np.array): array containing probabilities
title (str): Title of the logo.
figsize (tuple): (width, height)
fileName (str): path for saving the plot, will not save if empty

Returns:
PPM logo
Expand All @@ -40,56 +41,61 @@ def ppm_logo(probs, title="", figsize=(5, 2)):
length_padding=0.01,
subticks_frequency=1,
highlight={})
if not fileName=="":
plt.savefig(fileName)
plt.show()


def pwm_logo(weights, title="", figsize=(5, 2), fileName=""):
    """Function to visualize the sequence logo of a PWM.

    The PWM is converted to a PPM with `pwm_to_ppm`, and the logo is then
    drawn by `ppm_logo`.

    Args:
        weights (np.array): array containing PWM weight values
        title (str): Title of the logo.
        figsize (tuple): (width, height)
        fileName (str): path for saving the plot, will not save if empty

    Returns:
        None. The PWM logo is displayed (and optionally saved) by
        `ppm_logo`.

    """
    # Convert PWM to PPM
    ppm = pwm_to_ppm(weights)
    ppm_logo(ppm, title=title, figsize=figsize, fileName=fileName)


def plot_pwm(pwm_df, Matrix_id, figsize=(5, 2), fileName=""):
    """Function to plot sequence logo from PWM

    Args:
        pwm_df (pd.DataFrame): Dataframe containing cols weights, Matrix_id
        Matrix_id: ID of PWM to plot (will be used as logo title)
        figsize (tuple): (width, height)
        fileName (str): path for saving the plot, will not save if empty

    Returns:
        None. The PWM logo is drawn by `pwm_logo`.

    Raises:
        IndexError: if no row of pwm_df has the given Matrix_id.

    """
    matches = pwm_df.weights.values[pwm_df.Matrix_id == Matrix_id]
    if len(matches) == 0:
        # Same exception type as indexing the empty selection, but with a
        # message that names the missing ID.
        raise IndexError(
            f"No matrix with Matrix_id {Matrix_id!r} found in pwm_df"
        )
    # If several rows share the ID, the first one is plotted.
    weights = matches[0]
    pwm_logo(weights=weights, title=Matrix_id, figsize=figsize,
             fileName=fileName)


def plot_ppm(ppm_df, Matrix_id, figsize=(5, 2), fileName=""):
    """
    Function to plot sequence logo from PPM

    Args:
        ppm_df (pd.DataFrame): Dataframe containing cols probs, Matrix_id
        Matrix_id: ID of PPM to plot (will be used as logo title)
        figsize (tuple): (width, height)
        fileName (str): path for saving the plot, will not save if empty

    Returns:
        None. The PPM logo is drawn by `ppm_logo`.

    Raises:
        IndexError: if no row of ppm_df has the given Matrix_id.

    """
    matches = ppm_df.probs.values[ppm_df.Matrix_id == Matrix_id]
    if len(matches) == 0:
        # Same exception type as indexing the empty selection, but with a
        # message that names the missing ID.
        raise IndexError(
            f"No matrix with Matrix_id {Matrix_id!r} found in ppm_df"
        )
    # If several rows share the ID, the first one is plotted.
    probs = matches[0]
    ppm_logo(probs=probs, title=Matrix_id, figsize=figsize,
             fileName=fileName)


def plot_binned_count_dist(
Expand Down
Loading