Skip to content

Commit

Permalink
similarity function
Browse files Browse the repository at this point in the history
  • Loading branch information
TahiriNadia authored Nov 8, 2023
1 parent ad0ea1d commit 6fa3ad7
Showing 1 changed file with 64 additions and 3 deletions.
67 changes: 64 additions & 3 deletions aphylogeo/alignement.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@
from Bio.Align.Applications import ClustalwCommandline, MafftCommandline
from Bio.Seq import Seq

from aphylogeo.params import Params

from .params import Params
from .multiProcessor import Multi

import itertools
import textdistance as td

class Alignment:
"""
Expand Down Expand Up @@ -130,6 +131,8 @@ def __init__(
alignment_method=Params().alignment_method,
reference_gene_file=Params().reference_gene_file,
fit_method=Params().fit_method,
rate_similarity=Params().rate_similarity,
method_similarity=Params().method_similarity,
):
"""
Constructor if the alignment object.
Expand Down Expand Up @@ -167,6 +170,8 @@ def __init__(
##todo
self.msa (AlignIO())
self.rate_similarity
self.method_similarity
"""
self.window_size = window_size
self.step_size = step_size
Expand All @@ -176,7 +181,8 @@ def __init__(
self.alignment_method = alignment_method
self.reference_gene_file = reference_gene_file
self.fit_method = fit_method

self.rate_similarity = rate_similarity,
self.method_similarity = method_similarity,
"""
Method that align sequences
"""
Expand Down Expand Up @@ -751,8 +757,15 @@ def slidingWindow(self):
for i in range(0, seq_len, step):
if i + step < seq_len:
windowed_alignment[f"{i}_{i + step - 1}"] = {key: val[i : i + step - 1] for key, val in paddedMSA.items()}
combinations = itertools.combinations(windowed_alignment[f"{i}_{i + step - 1}"].values(),2)
else:
windowed_alignment[f"{i}_{seq_len-1}"] = {key: val[i : i + seq_len - 1] for key, val in paddedMSA.items()}
combinations = itertools.combinations(windowed_alignment[f"{i}_{seq_len-1}"].values(),2)
df = pd.DataFrame(list(combinations))

#if self.rate_similarity[0] < self.similarity(df):
# windowed_alignment.pop(f"{i}_{seq_len-1}")
# print(self.rate_similarity)

# JUST TO MAKE THE DEBUG FILES
if self.makeDebugFiles:
Expand Down Expand Up @@ -852,3 +865,51 @@ def fileToAlignIO(filename, ext):
f.close()
alignIO = AlignIO.read(StringIO(data), "fasta")
return alignIO

def similarity(self, df):
"""
Method that compute similarity in all string in dataframe (df)
Method source:
https://yassineelkhal.medium.com/the-complete-guide-to-string-similarity-algorithms-1290ad07c6b7
arguments:
method (int) similarity method
1: Hamming distance
2: Levenshtein distance
3: Damerau-Levenshtein distance
4: Jaro similarity
5: Jaro-Winkler similarity
6: Smith–Waterman similarity
7: Jaccard similarity
8: Sørensen-Dice similarity
--NOT IMPLEMENTED--
9: Tversky similarity
10: Overlap similarity
11: Cosine similarity
12: N-gram similarity
13: Ratcliff-Obershelp similarity
14: Longest common substring/subsequence similarity
df (dataframe) dataframe of all string sequances
return:
percentage similarity (float) the similarity between the dataframe of all strings elements
"""
rateSimilarity = 0.0
if self.method_similarity[0] == "1":
rateSimilarity = df.apply(lambda x: td.hamming.normalized_similarity(x[0],x[1]), axis=1).mean()*100
elif self.method_similarity[0] == "2":
rateSimilarity = df.apply(lambda x: td.levenshtein.normalized_similarity(x[0],x[1]), axis=1).mean()*100
elif self.method_similarity[0] == "3":
rateSimilarity = df.apply(lambda x: td.damerau_levenshtein.normalized_similarity(x[0],x[1]), axis=1).mean()*100
elif self.method_similarity[0] == "4":
rateSimilarity = df.apply(lambda x: td.jaro(x[0],x[1]), axis=1).mean()*100
elif self.method_similarity[0] == "5":
rateSimilarity = df.apply(lambda x: td.jaro_winkler(x[0],x[1]), axis=1).mean()*100
elif self.method_similarity[0] == "6":
rateSimilarity = df.apply(lambda x: td.smith_waterman(x[0],x[1]), axis=1).mean()*100
elif self.method_similarity[0] == "7":
rateSimilarity = df.apply(lambda x: td.jaccard(x[0],x[1]), axis=1).mean()*100
elif self.method_similarity[0] == "8":
rateSimilarity = df.apply(lambda x: td.sorencen(x[0],x[1]), axis=1).mean()*100

return rateSimilarity

0 comments on commit 6fa3ad7

Please sign in to comment.