Skip to content

Commit

Permalink
merge and dependencies upgrade
Browse files Browse the repository at this point in the history
  • Loading branch information
jamescalam committed Dec 2, 2023
2 parents 35b0c8e + 096991a commit 7ca32aa
Show file tree
Hide file tree
Showing 15 changed files with 952 additions and 320 deletions.
41 changes: 41 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
name: Test

on:
push:
branches:
- main
pull_request:

env:
POETRY_VERSION: "1.5.1"

jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
python-version:
- "3.11"
steps:
- uses: actions/checkout@v4
- name: Cache Poetry
uses: actions/cache@v3
with:
path: ~/.poetry
key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
restore-keys: |
${{ runner.os }}-poetry-
- name: Install poetry
run: |
pipx install poetry==$POETRY_VERSION
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
cache: poetry
- name: Install dependencies
run: |
poetry install
- name: Pytest
run: |
make test
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@ venv/
.env*.local
.env
mac.env

# Code coverage history
.coverage
1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.11
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@ lint_diff: PYTHON_FILES=$(shell git diff --name-only --diff-filter=d main | grep
lint lint_diff:
poetry run black $(PYTHON_FILES) --check
poetry run ruff .

test:
poetry run pytest -vv --cov=semantic_router --cov-report=term-missing --cov-fail-under=100
60 changes: 14 additions & 46 deletions docs/examples/hybrid-layer.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -46,30 +46,20 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"os.environ[\"COHERE_API_KEY\"] = \"<<COHERE_API_KEY>>\""
"os.environ[\"COHERE_API_KEY\"] = \"BQBiUqqjDRsYl1QKKux4JsqKdDkjyInS5T3Z3eJP\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/jamesbriggs/opt/anaconda3/envs/decision-layer/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from semantic_router.schema import Decision\n",
"\n",
Expand All @@ -95,7 +85,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -133,7 +123,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -156,7 +146,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -167,40 +157,18 @@
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'politics'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dl(\"don't you love politics?\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'chitchat'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dl(\"how's the weather today?\")"
]
Expand Down
692 changes: 426 additions & 266 deletions poetry.lock

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ authors = [
"James Briggs <[email protected]>",
"Siraj Aizlewood <[email protected]>",
"Simonas Jakubonis <[email protected]>",
"Luca Mannini <[email protected]>"
"Luca Mannini <[email protected]>",
"Bogdan Buduroiu <[email protected]>"
]
readme = "README.md"

Expand All @@ -15,13 +16,17 @@ python = "^3.10"
pydantic = "^1.8.2"
openai = "^0.28.1"
cohere = "^4.32"
numpy = "^1.25.2"
pinecone-text = "^0.7.0"


[tool.poetry.group.dev.dependencies]
ipykernel = "^6.26.0"
ruff = "^0.1.5"
black = "^23.11.0"
pytest = "^7.4.3"
pytest-mock = "^3.12.0"
pytest-cov = "^4.1.0"

[build-system]
requires = ["poetry-core"]
Expand Down
11 changes: 4 additions & 7 deletions semantic_router/layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
OpenAIEncoder,
BM25Encoder
)
from semantic_router.linear import similarity_matrix, top_scores
from semantic_router.schema import Decision


Expand Down Expand Up @@ -69,13 +70,9 @@ def _query(self, text: str, top_k: int = 5):
xq = np.squeeze(xq) # Reduce to 1d array.

if self.index is not None:
index_norm = norm(self.index, axis=1)
xq_norm = norm(xq.T)
sim = np.dot(self.index, xq.T) / (index_norm * xq_norm)
# get indices of top_k records
top_k = min(top_k, sim.shape[0])
idx = np.argpartition(sim, -top_k)[-top_k:]
scores = sim[idx]
# calculate similarity matrix
sim = similarity_matrix(xq, self.index)
scores, idx = top_scores(sim, top_k)
# get the utterance categories (decision names)
decisions = self.categories[idx] if self.categories is not None else []
return [
Expand Down
30 changes: 30 additions & 0 deletions semantic_router/linear.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from typing import Tuple

import numpy as np
from numpy.linalg import norm


def similarity_matrix(xq: np.ndarray, index: np.ndarray) -> np.ndarray:
    """Compute the cosine similarity between a query vector and a set of vectors.

    Args:
        xq: A query vector (1d ndarray).
        index: A set of vectors, one vector per row (2d ndarray).

    Returns:
        The cosine similarity between the query vector and each row of
        ``index``. Entries whose denominator is zero (an all-zero query or
        an all-zero index row) are reported as 0.0 instead of NaN.
    """
    index_norm = norm(index, axis=1)
    xq_norm = norm(xq.T)
    denom = index_norm * xq_norm

    # A zero-norm vector has no direction, so its cosine similarity is
    # undefined. Silence the divide warnings and map those entries to 0.0 so
    # NaN never leaks into the ranking step (NaN sorts as the largest value).
    with np.errstate(divide="ignore", invalid="ignore"):
        sim = np.dot(index, xq.T) / denom
    return np.where(denom == 0, 0.0, sim)


def top_scores(sim: np.ndarray, top_k: int = 5) -> Tuple[np.ndarray, np.ndarray]:
    """Select the ``top_k`` highest similarity scores and their positions.

    Args:
        sim: Similarity scores (1d ndarray).
        top_k: Maximum number of results to return. Clamped to the number of
            available scores.

    Returns:
        A ``(scores, idx)`` tuple, both ordered by ascending score so the
        best match is last. Both arrays are empty when ``sim`` is empty or
        ``top_k`` is not positive.
    """
    top_k = min(top_k, sim.shape[0])
    if top_k <= 0:
        # Guard explicitly: ``[-0:]`` would select *every* element, and
        # argpartition raises on an empty array.
        idx = np.empty(0, dtype=np.intp)
        return sim[idx], idx

    # get indices of top_k records
    idx = np.argpartition(sim, -top_k)[-top_k:]
    # argpartition leaves the selected elements in unspecified order, so sort
    # them to make the ascending ordering a real guarantee.
    idx = idx[np.argsort(sim[idx], kind="stable")]
    scores = sim[idx]

    return scores, idx
69 changes: 69 additions & 0 deletions tests/functional/test_linear.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import numpy as np
import pytest

from semantic_router.linear import similarity_matrix, top_scores


@pytest.fixture
def ident_vector():
    """Provide the first row of the 10x10 identity matrix."""
    identity = np.identity(10)
    return identity[0]


@pytest.fixture
def test_index():
    """Provide a small 3x3 integer index used by the top_scores tests."""
    rows = [
        [3, 0, 0],
        [2, 1, 0],
        [0, 1, 0],
    ]
    return np.array(rows)


def test_similarity_matrix__dimensionality():
    """Test that the result holds one similarity score per index row."""
    # Seeded generator keeps the test data reproducible between runs.
    rng = np.random.default_rng(0)
    xq = rng.random((10,))  # 10-dimensional embedding vector
    index = rng.random((100, 10))  # 100 vectors of the same dimensionality
    S = similarity_matrix(xq, index)
    assert S.shape == (100,)


def test_similarity_matrix__is_norm_max(ident_vector):
    """Identical vectors should produce the maximum similarity of 1."""
    # Stack three copies of the query itself to form the index.
    as_row = np.atleast_2d(ident_vector)
    stacked = np.repeat(as_row, 3, axis=0)
    similarities = similarity_matrix(ident_vector, stacked)
    assert similarities.max() == 1.0


def test_similarity_matrix__is_norm_min(ident_vector):
    """Orthogonal vectors should produce the minimum similarity of 0."""
    # Shifting the single non-zero entry by one position yields a vector
    # orthogonal to the original.
    as_row = np.atleast_2d(ident_vector)
    shifted = np.roll(as_row, 1)
    stacked = np.repeat(shifted, 3, axis=0)
    similarities = similarity_matrix(ident_vector, stacked)
    assert similarities.min() == 0.0


def test_top_scores__is_sorted(test_index):
    """Check that top_scores returns indices ordered by ascending score."""
    # The first index row doubles as the query, so row 0 is the best match.
    query = test_index[0]

    similarities = similarity_matrix(query, test_index)
    result = top_scores(similarities, 3)
    ranked_idx = result[1]

    # Lowest-scoring row comes first, best match last.
    expected = np.array([2, 1, 0])
    assert np.array_equal(ranked_idx, expected)


def test_top_scores__scores(test_index):
    """Check top_scores against hand-computed scores for a known index."""
    # The first index row doubles as the query, so row 0 is the best match.
    query = test_index[0]

    similarities = similarity_matrix(query, test_index)
    result = top_scores(similarities, 3)
    ranked_scores = result[0]

    # Expected cosine similarities, lowest first.
    expected = np.array([0.0, 0.89442719, 1.0])
    assert np.allclose(ranked_scores, expected)
16 changes: 16 additions & 0 deletions tests/unit/encoders/test_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pytest

from semantic_router.encoders import BaseEncoder


class TestBaseEncoder:
    """Unit tests for the BaseEncoder interface."""

    @pytest.fixture
    def base_encoder(self):
        """Provide a BaseEncoder named "TestEncoder"."""
        encoder = BaseEncoder(name="TestEncoder")
        return encoder

    def test_base_encoder_initialization(self, base_encoder):
        """The name given at construction is exposed on the instance."""
        actual_name = base_encoder.name
        assert actual_name == "TestEncoder", "Initialization of name failed"

    def test_base_encoder_call_method_not_implemented(self, base_encoder):
        """Calling the base encoder raises NotImplementedError."""
        texts = ["some", "texts"]
        with pytest.raises(NotImplementedError):
            base_encoder(texts)
41 changes: 41 additions & 0 deletions tests/unit/encoders/test_cohere.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pytest

from semantic_router.encoders import CohereEncoder


@pytest.fixture
def cohere_encoder(mocker):
    """Build a CohereEncoder after patching ``cohere.Client`` with a mock."""
    mocker.patch("cohere.Client")
    encoder = CohereEncoder(cohere_api_key="test_api_key")
    return encoder


class TestCohereEncoder:
    """Unit tests for CohereEncoder with ``cohere.Client`` patched out."""

    def test_initialization_with_api_key(self, cohere_encoder):
        """An explicit API key gives a client and the default model name."""
        client = cohere_encoder.client
        assert client is not None, "Client should be initialized"

        model_name = cohere_encoder.name
        assert model_name == "embed-english-v3.0", "Default name not set correctly"

    def test_initialization_without_api_key(self, mocker, monkeypatch):
        """With no key argument and no env var, construction raises ValueError."""
        # Remove the env var first so the encoder has no key to fall back on.
        monkeypatch.delenv("COHERE_API_KEY", raising=False)
        mocker.patch("cohere.Client")
        with pytest.raises(ValueError):
            CohereEncoder()

    def test_call_method(self, cohere_encoder, mocker):
        """Calling the encoder returns a list of lists and calls embed once."""
        fake_response = mocker.MagicMock()
        fake_response.embeddings = [[0.1, 0.2, 0.3]]
        cohere_encoder.client.embed.return_value = fake_response

        result = cohere_encoder(["test"])

        assert isinstance(result, list), "Result should be a list"
        for sublist in result:
            assert isinstance(sublist, list), "Each item in result should be a list"
        cohere_encoder.client.embed.assert_called_once()

    def test_call_with_uninitialized_client(self, mocker):
        """Calling the encoder raises ValueError when the client is None."""
        # Make the patched constructor return None as the client.
        mocker.patch("cohere.Client", return_value=None)
        encoder = CohereEncoder(cohere_api_key="test_api_key")
        with pytest.raises(ValueError):
            encoder(["test"])
Loading

0 comments on commit 7ca32aa

Please sign in to comment.