merge and dependencies upgrade

aurelio-labs · Dec 2, 2023 · 7ca32aa · 7ca32aa
2 parents 35b0c8e + 096991a
commit 7ca32aa
Show file tree

Hide file tree

Showing 15 changed files with 952 additions and 320 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -0,0 +1,41 @@
+name: Test
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+env:
+  POETRY_VERSION: "1.5.1"
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version:
+          - "3.11"
+    steps:
+      - uses: actions/checkout@v4
+      - name: Cache Poetry
+        uses: actions/cache@v3
+        with:
+          path: ~/.poetry
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-poetry-
+      - name: Install poetry
+        run: |
+          pipx install poetry==$POETRY_VERSION
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: poetry
+      - name: Install dependencies
+        run: |
+          poetry install
+      - name: Pytest
+        run: |
+          make test
diff --git a/.gitignore b/.gitignore
@@ -12,3 +12,6 @@ venv/
 .env*.local
 .env
 mac.env
+
+# Code coverage history
+.coverage
diff --git a/.python-version b/.python-version
@@ -0,0 +1 @@
+3.11
diff --git a/Makefile b/Makefile
@@ -9,3 +9,6 @@ lint_diff: PYTHON_FILES=$(shell git diff --name-only --diff-filter=d main | grep
 lint lint_diff:
 	poetry run black $(PYTHON_FILES) --check
 	poetry run ruff .
+
+test:
+	poetry run pytest -vv --cov=semantic_router --cov-report=term-missing --cov-fail-under=100
diff --git a/docs/examples/hybrid-layer.ipynb b/docs/examples/hybrid-layer.ipynb
@@ -46,30 +46,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "import os\n",
     "\n",
-    "os.environ[\"COHERE_API_KEY\"] = \"<<COHERE_API_KEY>>\""
+    "os.environ[\"COHERE_API_KEY\"] = \"<<COHERE_API_KEY>>\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/jamesbriggs/opt/anaconda3/envs/decision-layer/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n",
-      "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "from semantic_router.schema import Decision\n",
     "\n",
@@ -95,7 +85,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -133,7 +123,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -156,7 +146,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -167,40 +157,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'politics'"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "dl(\"don't you love politics?\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'chitchat'"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "dl(\"how's the weather today?\")"
    ]

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,8 @@ authors = [
     "James Briggs <[email protected]>",
     "Siraj Aizlewood <[email protected]>",
     "Simonas Jakubonis <[email protected]>",
-    "Luca Mannini <[email protected]>"
+    "Luca Mannini <[email protected]>",
+    "Bogdan Buduroiu <[email protected]>"
 ]
 readme = "README.md"
 
@@ -15,13 +16,17 @@ python = "^3.10"
 pydantic = "^1.8.2"
 openai = "^0.28.1"
 cohere = "^4.32"
+numpy = "^1.25.2"
 pinecone-text = "^0.7.0"
 
 
 [tool.poetry.group.dev.dependencies]
 ipykernel = "^6.26.0"
 ruff = "^0.1.5"
 black = "^23.11.0"
+pytest = "^7.4.3"
+pytest-mock = "^3.12.0"
+pytest-cov = "^4.1.0"
 
 [build-system]
 requires = ["poetry-core"]

diff --git a/semantic_router/layer.py b/semantic_router/layer.py
@@ -8,6 +8,7 @@
     OpenAIEncoder,
     BM25Encoder
 )
+from semantic_router.linear import similarity_matrix, top_scores
 from semantic_router.schema import Decision
 
 
@@ -69,13 +70,9 @@ def _query(self, text: str, top_k: int = 5):
         xq = np.squeeze(xq)  # Reduce to 1d array.
 
         if self.index is not None:
-            index_norm = norm(self.index, axis=1)
-            xq_norm = norm(xq.T)
-            sim = np.dot(self.index, xq.T) / (index_norm * xq_norm)
-            # get indices of top_k records
-            top_k = min(top_k, sim.shape[0])
-            idx = np.argpartition(sim, -top_k)[-top_k:]
-            scores = sim[idx]
+            # calculate similarity matrix
+            sim = similarity_matrix(xq, self.index)
+            scores, idx = top_scores(sim, top_k)
             # get the utterance categories (decision names)
             decisions = self.categories[idx] if self.categories is not None else []
             return [

diff --git a/semantic_router/linear.py b/semantic_router/linear.py
@@ -0,0 +1,30 @@
+from typing import Tuple
+
+import numpy as np
+from numpy.linalg import norm
+
+
+def similarity_matrix(xq: np.ndarray, index: np.ndarray) -> np.ndarray:
+    """Compute the similarity scores between a query vector and a set of vectors.
+
+    Args:
+        xq: A query vector (1d ndarray)
+        index: A set of vectors (2d ndarray, one vector per row).
+
+    Returns:
+        The cosine similarity between the query vector and each vector in the index.
+    """
+
+    index_norm = norm(index, axis=1)
+    xq_norm = norm(xq.T)
+    sim = np.dot(index, xq.T) / (index_norm * xq_norm)
+    return sim
+
+
+def top_scores(sim: np.ndarray, top_k: int = 5) -> Tuple[np.ndarray, np.ndarray]:
+    # get indices of top_k records
+    top_k = min(top_k, sim.shape[0])
+    idx = np.argpartition(sim, -top_k)[-top_k:]
+    scores = sim[idx]
+
+    return scores, idx
diff --git a/tests/functional/test_linear.py b/tests/functional/test_linear.py
@@ -0,0 +1,69 @@
+import numpy as np
+import pytest
+
+from semantic_router.linear import similarity_matrix, top_scores
+
+
+@pytest.fixture
+def ident_vector():
+    return np.identity(10)[0]
+
+
+@pytest.fixture
+def test_index():
+    return np.array([[3, 0, 0], [2, 1, 0], [0, 1, 0]])
+
+
+def test_similarity_matrix__dimensionality():
+    """Test that the result holds one similarity score per index row."""
+    xq = np.random.random((10,))  # 10-dimensional embedding vector
+    index = np.random.random((100, 10))
+    S = similarity_matrix(xq, index)
+    assert S.shape == (100,)
+
+
+def test_similarity_matrix__is_norm_max(ident_vector):
+    """
+    Using identical vectors should yield a maximum similarity of 1
+    """
+    index = np.repeat(np.atleast_2d(ident_vector), 3, axis=0)
+    sim = similarity_matrix(ident_vector, index)
+    assert sim.max() == 1.0
+
+
+def test_similarity_matrix__is_norm_min(ident_vector):
+    """
+    Using orthogonal vectors should yield a minimum similarity of 0
+    """
+    orth_v = np.roll(np.atleast_2d(ident_vector), 1)
+    index = np.repeat(orth_v, 3, axis=0)
+    sim = similarity_matrix(ident_vector, index)
+    assert sim.min() == 0.0
+
+
+def test_top_scores__is_sorted(test_index):
+    """
+    Test that the top_scores function returns indices ordered by ascending score.
+    """
+
+    xq = test_index[0]  # should have max similarity
+
+    sim = similarity_matrix(xq, test_index)
+    _, idx = top_scores(sim, 3)
+
+    # Indexes should be ordered by ascending similarity score
+    assert np.array_equal(idx, np.array([2, 1, 0]))
+
+
+def test_top_scores__scores(test_index):
+    """
+    Test that for a known vector and a known index, the top_scores function
+    returns exactly the expected scores.
+    """
+    xq = test_index[0]  # should have max similarity
+
+    sim = similarity_matrix(xq, test_index)
+    scores, _ = top_scores(sim, 3)
+
+    # Scores should be sorted ascending
+    assert np.allclose(scores, np.array([0.0, 0.89442719, 1.0]))
diff --git a/tests/unit/encoders/test_base.py b/tests/unit/encoders/test_base.py
@@ -0,0 +1,16 @@
+import pytest
+
+from semantic_router.encoders import BaseEncoder
+
+
+class TestBaseEncoder:
+    @pytest.fixture
+    def base_encoder(self):
+        return BaseEncoder(name="TestEncoder")
+
+    def test_base_encoder_initialization(self, base_encoder):
+        assert base_encoder.name == "TestEncoder", "Initialization of name failed"
+
+    def test_base_encoder_call_method_not_implemented(self, base_encoder):
+        with pytest.raises(NotImplementedError):
+            base_encoder(["some", "texts"])
diff --git a/tests/unit/encoders/test_cohere.py b/tests/unit/encoders/test_cohere.py
@@ -0,0 +1,41 @@
+import pytest
+
+from semantic_router.encoders import CohereEncoder
+
+
+@pytest.fixture
+def cohere_encoder(mocker):
+    mocker.patch("cohere.Client")
+    return CohereEncoder(cohere_api_key="test_api_key")
+
+
+class TestCohereEncoder:
+    def test_initialization_with_api_key(self, cohere_encoder):
+        assert cohere_encoder.client is not None, "Client should be initialized"
+        assert (
+            cohere_encoder.name == "embed-english-v3.0"
+        ), "Default name not set correctly"
+
+    def test_initialization_without_api_key(self, mocker, monkeypatch):
+        monkeypatch.delenv("COHERE_API_KEY", raising=False)
+        mocker.patch("cohere.Client")
+        with pytest.raises(ValueError):
+            CohereEncoder()
+
+    def test_call_method(self, cohere_encoder, mocker):
+        mock_embed = mocker.MagicMock()
+        mock_embed.embeddings = [[0.1, 0.2, 0.3]]
+        cohere_encoder.client.embed.return_value = mock_embed
+
+        result = cohere_encoder(["test"])
+        assert isinstance(result, list), "Result should be a list"
+        assert all(
+            isinstance(sublist, list) for sublist in result
+        ), "Each item in result should be a list"
+        cohere_encoder.client.embed.assert_called_once()
+
+    def test_call_with_uninitialized_client(self, mocker):
+        mocker.patch("cohere.Client", return_value=None)
+        encoder = CohereEncoder(cohere_api_key="test_api_key")
+        with pytest.raises(ValueError):
+            encoder(["test"])