From b4fce30302283314fefb1b0f08e4e19cefefbf9c Mon Sep 17 00:00:00 2001
From: Bogdan Buduroiu
Date: Tue, 28 Nov 2023 10:45:36 +0800
Subject: [PATCH 1/3] Functional tests: capture algo performance

---
 semantic_router/layer.py                 | 19 +++----
 semantic_router/linear.py                | 30 +++++++++++
 tests/functional/test_linear.py          | 69 ++++++++++++++++++++++++
 tests/{ => unit}/encoders/test_base.py   |  0
 tests/{ => unit}/encoders/test_cohere.py |  0
 tests/{ => unit}/encoders/test_openai.py |  0
 tests/{ => unit}/test_layer.py           |  0
 tests/{ => unit}/test_schema.py          |  0
 8 files changed, 105 insertions(+), 13 deletions(-)
 create mode 100644 semantic_router/linear.py
 create mode 100644 tests/functional/test_linear.py
 rename tests/{ => unit}/encoders/test_base.py (100%)
 rename tests/{ => unit}/encoders/test_cohere.py (100%)
 rename tests/{ => unit}/encoders/test_openai.py (100%)
 rename tests/{ => unit}/test_layer.py (100%)
 rename tests/{ => unit}/test_schema.py (100%)

diff --git a/semantic_router/layer.py b/semantic_router/layer.py
index 089f2793..dd746d0e 100644
--- a/semantic_router/layer.py
+++ b/semantic_router/layer.py
@@ -2,6 +2,7 @@
 from numpy.linalg import norm
 
 from semantic_router.encoders import BaseEncoder, CohereEncoder, OpenAIEncoder
+from semantic_router.linear import similarity_matrix, top_scores
 from semantic_router.schema import Decision
 
 
@@ -63,18 +64,12 @@ def _query(self, text: str, top_k: int = 5):
         xq = np.squeeze(xq)  # Reduce to 1d array.
 
         if self.index is not None:
-            index_norm = norm(self.index, axis=1)
-            xq_norm = norm(xq.T)
-            sim = np.dot(self.index, xq.T) / (index_norm * xq_norm)
-            # get indices of top_k records
-            top_k = min(top_k, sim.shape[0])
-            idx = np.argpartition(sim, -top_k)[-top_k:]
-            scores = sim[idx]
+            # calculate similarity matrix
+            sim = similarity_matrix(xq, self.index)
+            scores, idx = top_scores(sim, top_k)
             # get the utterance categories (decision names)
             decisions = self.categories[idx] if self.categories is not None else []
-            return [
-                {"decision": d, "score": s.item()} for d, s in zip(decisions, scores)
-            ]
+            return [{"decision": d, "score": s.item()} for d, s in zip(decisions, scores)]
         else:
             return []
 
@@ -89,9 +84,7 @@ def _semantic_classify(self, query_results: list[dict]) -> tuple[str, list[float
                 scores_by_class[decision] = [score]
 
         # Calculate total score for each class
-        total_scores = {
-            decision: sum(scores) for decision, scores in scores_by_class.items()
-        }
+        total_scores = {decision: sum(scores) for decision, scores in scores_by_class.items()}
         top_class = max(total_scores, key=lambda x: total_scores[x], default=None)
 
         # Return the top class and its associated scores
diff --git a/semantic_router/linear.py b/semantic_router/linear.py
new file mode 100644
index 00000000..1c13262f
--- /dev/null
+++ b/semantic_router/linear.py
@@ -0,0 +1,30 @@
+from typing import Tuple
+
+import numpy as np
+from numpy.linalg import norm
+
+
+def similarity_matrix(xq: np.ndarray, index: np.ndarray) -> np.ndarray:
+    """Compute the similarity scores between a query vector and a set of vectors.
+
+    Args:
+        xq: A query vector (1d ndarray)
+        index: A set of vectors.
+
+    Returns:
+        The similarity between the query vector and the set of vectors.
+    """
+
+    index_norm = norm(index, axis=1)
+    xq_norm = norm(xq.T)
+    sim = np.dot(index, xq.T) / (index_norm * xq_norm)
+    return sim
+
+
+def top_scores(sim: np.ndarray, top_k: int = 5) -> Tuple[np.ndarray, np.ndarray]:
+    # get indices of top_k records
+    top_k = min(top_k, sim.shape[0])
+    idx = np.argpartition(sim, -top_k)[-top_k:]
+    scores = sim[idx]
+
+    return scores, idx
diff --git a/tests/functional/test_linear.py b/tests/functional/test_linear.py
new file mode 100644
index 00000000..6771fd9c
--- /dev/null
+++ b/tests/functional/test_linear.py
@@ -0,0 +1,69 @@
+import pytest
+import numpy as np
+
+from semantic_router.linear import similarity_matrix, top_scores
+
+
+@pytest.fixture
+def ident_vector():
+    return np.identity(10)[0]
+
+
+@pytest.fixture
+def test_index():
+    return np.array([[3, 0, 0], [2, 1, 0], [0, 1, 0]])
+
+
+def test_similarity_matrix__dimensionality():
+    """Test that the similarity matrix has one score per index vector."""
+    xq = np.random.random((10,))  # 10-dimensional embedding vector
+    index = np.random.random((100, 10))
+    S = similarity_matrix(xq, index)
+    assert S.shape == (100,)
+
+
+def test_similarity_matrix__is_norm_max(ident_vector):
+    """
+    Using identical vectors should yield a maximum similarity of 1
+    """
+    index = np.repeat(np.atleast_2d(ident_vector), 3, axis=0)
+    sim = similarity_matrix(ident_vector, index)
+    assert sim.max() == 1.0
+
+
+def test_similarity_matrix__is_norm_min(ident_vector):
+    """
+    Using orthogonal vectors should yield a minimum similarity of 0
+    """
+    orth_v = np.roll(np.atleast_2d(ident_vector), 1)
+    index = np.repeat(orth_v, 3, axis=0)
+    sim = similarity_matrix(ident_vector, index)
+    assert sim.min() == 0.0
+
+
+def test_top_scores__is_sorted(test_index):
+    """
+    Test that the top_scores function returns a sorted list of scores.
+    """
+
+    xq = test_index[0]  # should have max similarity
+
+    sim = similarity_matrix(xq, test_index)
+    _, idx = top_scores(sim, 3)
+
+    # Scores and indexes should be sorted ascending
+    assert np.array_equal(idx, np.array([2, 1, 0]))
+
+
+def test_top_scores__scores(test_index):
+    """
+    Test that for a known vector and a known index, the top_scores function
+    returns exactly the expected scores.
+    """
+    xq = test_index[0]  # should have max similarity
+
+    sim = similarity_matrix(xq, test_index)
+    scores, _ = top_scores(sim, 3)
+
+    # Scores and indexes should be sorted ascending
+    assert np.allclose(scores, np.array([0.0, 0.89442719, 1.0]))
diff --git a/tests/encoders/test_base.py b/tests/unit/encoders/test_base.py
similarity index 100%
rename from tests/encoders/test_base.py
rename to tests/unit/encoders/test_base.py
diff --git a/tests/encoders/test_cohere.py b/tests/unit/encoders/test_cohere.py
similarity index 100%
rename from tests/encoders/test_cohere.py
rename to tests/unit/encoders/test_cohere.py
diff --git a/tests/encoders/test_openai.py b/tests/unit/encoders/test_openai.py
similarity index 100%
rename from tests/encoders/test_openai.py
rename to tests/unit/encoders/test_openai.py
diff --git a/tests/test_layer.py b/tests/unit/test_layer.py
similarity index 100%
rename from tests/test_layer.py
rename to tests/unit/test_layer.py
diff --git a/tests/test_schema.py b/tests/unit/test_schema.py
similarity index 100%
rename from tests/test_schema.py
rename to tests/unit/test_schema.py

From c6ffb6cff5e6394821c9778a38cc5410ecdb9673 Mon Sep 17 00:00:00 2001
From: Bogdan Buduroiu
Date: Tue, 28 Nov 2023 10:49:10 +0800
Subject: [PATCH 2/3] Black formatting

---
 semantic_router/layer.py        | 8 ++++++--
 tests/functional/test_linear.py | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/semantic_router/layer.py b/semantic_router/layer.py
index dd746d0e..fcedc979 100644
--- a/semantic_router/layer.py
+++ b/semantic_router/layer.py
@@ -69,7 +69,9 @@ def _query(self, text: str, top_k: int = 5):
             scores, idx = top_scores(sim, top_k)
             # get the utterance categories (decision names)
             decisions = self.categories[idx] if self.categories is not None else []
-            return [{"decision": d, "score": s.item()} for d, s in zip(decisions, scores)]
+            return [
+                {"decision": d, "score": s.item()} for d, s in zip(decisions, scores)
+            ]
         else:
             return []
 
@@ -84,7 +86,9 @@ def _semantic_classify(self, query_results: list[dict]) -> tuple[str, list[float
                 scores_by_class[decision] = [score]
 
         # Calculate total score for each class
-        total_scores = {decision: sum(scores) for decision, scores in scores_by_class.items()}
+        total_scores = {
+            decision: sum(scores) for decision, scores in scores_by_class.items()
+        }
         top_class = max(total_scores, key=lambda x: total_scores[x], default=None)
 
         # Return the top class and its associated scores
diff --git a/tests/functional/test_linear.py b/tests/functional/test_linear.py
index 6771fd9c..210de6d2 100644
--- a/tests/functional/test_linear.py
+++ b/tests/functional/test_linear.py
@@ -1,5 +1,5 @@
-import pytest
 import numpy as np
+import pytest
 
 from semantic_router.linear import similarity_matrix, top_scores
 

From c39d885032921b2483d0233b9add675674e5c9f6 Mon Sep 17 00:00:00 2001
From: Bogdan Buduroiu
Date: Tue, 28 Nov 2023 11:36:03 +0800
Subject: [PATCH 3/3] Removes redundant numpy.linalg.norm in semantic_router.layer

---
 semantic_router/layer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/semantic_router/layer.py b/semantic_router/layer.py
index fcedc979..5add0784 100644
--- a/semantic_router/layer.py
+++ b/semantic_router/layer.py
@@ -1,5 +1,4 @@
 import numpy as np
-from numpy.linalg import norm
 
 from semantic_router.encoders import BaseEncoder, CohereEncoder, OpenAIEncoder
 from semantic_router.linear import similarity_matrix, top_scores
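
Taken together, the helpers added in semantic_router/linear.py replace the inline cosine-similarity code that previously lived in _query(). A minimal usage sketch follows; the function names and module path come from this patch series, while the toy index and query vectors are invented for illustration (in the library the embeddings would come from an encoder):

import numpy as np

from semantic_router.linear import similarity_matrix, top_scores

# Invented 3-row "index" of embeddings and a query embedding.
index = np.array([[3.0, 0.0, 0.0], [2.0, 1.0, 0.0], [0.0, 1.0, 0.0]])
xq = np.array([3.0, 0.0, 0.0])

# Cosine similarity of the query against every row of the index.
sim = similarity_matrix(xq, index)      # array([1.0, 0.894..., 0.0])

# Top-k scores and their row indices, highest score last,
# mirroring what _query() now does internally.
scores, idx = top_scores(sim, top_k=2)  # (array([0.894..., 1.0]), array([1, 0]))

The functional tests in tests/functional/test_linear.py exercise exactly this pair of calls against a known index.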