chore: cleanup set up of operations
Askir committed Dec 12, 2024
1 parent ff28a39 commit 5e76cf9
Showing 6 changed files with 72 additions and 19 deletions.
47 changes: 47 additions & 0 deletions docs/python-integration.md
@@ -180,3 +180,50 @@ context.configure(
```

This should now prevent Alembic from generating tables for these models when you run `alembic revision --autogenerate`.



pgai provides native Alembic operations for managing vectorizers. For them to work, you need to call `register_operations` in your `env.py` file, which registers the pgai operations under the global `op` context:

```python
from pgai.alembic import register_operations

register_operations()
```
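
In a typical `env.py`, the call goes right after the imports, before Alembic reads its configuration. Here is a minimal sketch, assuming the standard `env.py` template generated by `alembic init` (only the top of the file changes):

```python
from logging.config import fileConfig

from alembic import context
from sqlalchemy import engine_from_config, pool

from pgai.alembic import register_operations

# Make pgai's create_vectorizer / drop_vectorizer available on the op context.
register_operations()

config = context.config
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# ... the rest of the standard Alembic env.py (run_migrations_offline/online) stays unchanged ...
```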

Then you can use the `create_vectorizer` operation to create a vectorizer for your model, and the `drop_vectorizer` operation to remove it.

```python
from alembic import op
from pgai.configuration import (
OpenAIEmbeddingConfig,
ChunkingConfig,
DiskANNIndexingConfig
)


def upgrade() -> None:
op.create_vectorizer(
source_table="blog_posts",
target_table="blog_posts_content_embeddings_store",
embedding=OpenAIEmbeddingConfig(
model="text-embedding-3-small",
dimensions=768
),
chunking=ChunkingConfig.recursive_character_text_splitter(
source_column="content",
chunk_size=50,
chunk_overlap=10
),
indexing=DiskANNIndexingConfig(
min_rows=10,
num_dimensions=768
)
)


def downgrade() -> None:
op.drop_vectorizer(vectorizer_id=1, drop_all=True)
```

The `create_vectorizer` operation supports all configuration options available in the [SQL API](vectorizer-api-reference.md).
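
For example, the same vectorizer could instead be configured with HNSW indexing, a fixed schedule, and explicit processing settings. The following is a sketch only; the parameter values are illustrative and the accepted fields mirror the corresponding SQL configuration functions:

```python
from alembic import op
from pgai.configuration import (
    OpenAIEmbeddingConfig,
    ChunkingConfig,
    HNSWIndexingConfig,
    SchedulingConfig,
    ProcessingConfig,
)


def upgrade() -> None:
    op.create_vectorizer(
        source_table="blog_posts",
        target_table="blog_posts_content_embeddings_store",
        embedding=OpenAIEmbeddingConfig(
            model="text-embedding-3-small",
            dimensions=768,
        ),
        chunking=ChunkingConfig.recursive_character_text_splitter(
            source_column="content",
            chunk_size=50,
            chunk_overlap=10,
        ),
        # HNSW index instead of DiskANN (illustrative values)
        indexing=HNSWIndexingConfig(
            min_rows=10,
            ef_construction=64,
        ),
        # run the vectorizer on a fixed schedule (illustrative values)
        scheduling=SchedulingConfig(
            schedule_interval="5m",
            fixed_schedule=True,
            timezone="UTC",
        ),
        # batch size and concurrency for the embedding worker (illustrative values)
        processing=ProcessingConfig(
            batch_size=100,
            concurrency=2,
        ),
    )
```
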
8 changes: 6 additions & 2 deletions projects/pgai/pgai/alembic/__init__.py
@@ -1,3 +1,7 @@
from pgai.alembic.operations import CreateVectorizerOp, DropVectorizerOp
from pgai.alembic.operations import (
CreateVectorizerOp,
DropVectorizerOp,
register_operations,
)

__all__ = ["CreateVectorizerOp", "DropVectorizerOp"]
__all__ = ["CreateVectorizerOp", "DropVectorizerOp", "register_operations"]
18 changes: 14 additions & 4 deletions projects/pgai/pgai/alembic/operations.py
@@ -16,7 +16,6 @@
)


@Operations.register_operation("create_vectorizer")
class CreateVectorizerOp(MigrateOperation):
"""Create a vectorizer for automatic embedding generation."""

@@ -68,7 +67,6 @@ def reverse(self) -> MigrateOperation:
return DropVectorizerOp(None, True)


@Operations.register_operation("drop_vectorizer")
class DropVectorizerOp(MigrateOperation):
"""Drop a vectorizer and its associated objects."""

@@ -93,14 +91,12 @@ def reverse(self) -> MigrateOperation:
return CreateVectorizerOp(None)


@Operations.implementation_for(CreateVectorizerOp)
def create_vectorizer(operations: Operations, operation: CreateVectorizerOp):
"""Implement CREATE VECTORIZER."""
params = operation.params
operations.execute(params.to_sql())


@Operations.implementation_for(DropVectorizerOp)
def drop_vectorizer(operations: Operations, operation: DropVectorizerOp):
"""Implement DROP VECTORIZER with cleanup of dependent objects."""
connection = operations.get_bind()
@@ -111,3 +107,17 @@ def drop_vectorizer(operations: Operations, operation: DropVectorizerOp):
text("SELECT ai.drop_vectorizer(:id, drop_all=>:drop_all)"),
{"id": vectorizer_id, "drop_all": operation.drop_all},
)


_operations_registered = False


def register_operations():
global _operations_registered

if not _operations_registered:
Operations.register_operation("create_vectorizer")(CreateVectorizerOp)
Operations.register_operation("drop_vectorizer")(DropVectorizerOp)
Operations.implementation_for(CreateVectorizerOp)(create_vectorizer)
Operations.implementation_for(DropVectorizerOp)(drop_vectorizer)
_operations_registered = True
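
The module-level flag makes registration idempotent: only the first call registers the operations with Alembic, and later calls return immediately. A small illustration (sketch only):

```python
from pgai.alembic import register_operations

register_operations()  # registers create_vectorizer / drop_vectorizer with Alembic
register_operations()  # no-op: the module-level flag short-circuits repeat calls
```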
14 changes: 2 additions & 12 deletions projects/pgai/pgai/configuration.py
@@ -3,6 +3,7 @@
from typing import Any, Literal, Protocol, runtime_checkable

from alembic.autogenerate.api import AutogenContext

from pgai.vectorizer.processing import ProcessingDefault


@@ -120,15 +121,13 @@ def to_python_arg(self) -> str:

@dataclass
class NoIndexingConfig:

def to_sql_argument(self) -> str:
return ", indexing => ai.indexing_none()"

def to_python_arg(self) -> str:
return format_python_arg("indexing", self)



@dataclass
class DiskANNIndexingConfig:
min_rows: int | None = None
@@ -140,7 +139,6 @@ class DiskANNIndexingConfig:
num_bits_per_dimension: int | None = None
create_when_queue_empty: bool | None = None


def to_sql_argument(self) -> str:
params: list[str] = []
if self.min_rows is not None:
@@ -177,8 +175,6 @@ class HNSWIndexingConfig:
ef_construction: int | None = None
create_when_queue_empty: bool | None = None



def to_sql_argument(self) -> str:
params: list[str] = []
if self.min_rows is not None:
@@ -201,23 +197,20 @@ def to_python_arg(self) -> str:

@dataclass
class NoSchedulingConfig:

def to_sql_argument(self) -> str:
return ", scheduling => ai.scheduling_none()"

def to_python_arg(self) -> str:
return format_python_arg("scheduling", self)



@dataclass
class SchedulingConfig:
schedule_interval: str | None = None
initial_start: str | None = None
fixed_schedule: bool | None = None
timezone: str | None = None


def to_sql_argument(self) -> str:
params: list[str] = []
if self.schedule_interval is not None:
@@ -239,7 +232,6 @@ class ProcessingConfig:
batch_size: int | None = None
concurrency: int | None = None


def to_sql_argument(self) -> str:
params: list[str] = []
if self.batch_size is not None:
@@ -251,7 +243,6 @@ def to_sql_argument(self) -> str:
def to_python_arg(self) -> str:
return format_python_arg("processing", self)


@classmethod
def from_db_config(cls, config: ProcessingDefault) -> "ProcessingConfig":
return cls(
@@ -288,7 +279,6 @@ class CreateVectorizerParams:
grant_to: list[str] | None = None
enqueue_existing: bool = True


def to_sql(self) -> str:
parts = ["SELECT ai.create_vectorizer(", f"'{self.source_table}'::regclass"]

@@ -368,4 +358,4 @@ def to_python(self, autogen_context: AutogenContext) -> str:
f"from pgai.configuration import {import_names}"
)

return "op.create_vectorizer(\n " + ",\n ".join(args) + "\n)"
return "op.create_vectorizer(\n " + ",\n ".join(args) + "\n)"
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from logging.config import fileConfig
from alembic import context
from sqlalchemy import engine_from_config, pool
from pgai.alembic import register_operations

register_operations()

config = context.config
if config.config_file_name is not None:
Original file line number Diff line number Diff line change
@@ -5,7 +5,6 @@ Revises: {revises}
Create Date: {create_date}
"""
from alembic import op
from pgai.alembic import CreateVectorizerOp
from pgai.configuration import (
OpenAIEmbeddingConfig,
ChunkingConfig
