From 60a332700420cae04da325805af29968936cec83 Mon Sep 17 00:00:00 2001 From: Jascha Date: Tue, 3 Dec 2024 12:23:42 +0100 Subject: [PATCH] docs: update docs with simplified vectorizer field --- docs/python-integration.md | 69 +++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 27 deletions(-) diff --git a/docs/python-integration.md b/docs/python-integration.md index aca1a294..53c4ff27 100644 --- a/docs/python-integration.md +++ b/docs/python-integration.md @@ -1,6 +1,6 @@ # SQLAlchemy Integration with pgAI Vectorizer -The `VectorizerField` is a custom SQLAlchemy field type that integrates pgAI's vectorization capabilities directly into your SQLAlchemy models. This allows you to easily create, manage, and query vector embeddings for your text data using familiar SQLAlchemy patterns. +The `VectorizerField` is a SQLAlchemy helper type that integrates pgAI's vectorization capabilities directly into your SQLAlchemy models. This allows you to easily query vector embeddings created by pgai using familiar SQLAlchemy patterns. 
## Installation @@ -15,24 +15,27 @@ pip install "pgai[sqlalchemy]" Here's a basic example of how to use the `VectorizerField`: ```python -from sqlalchemy import Column, Integer, Text -from sqlalchemy.orm import declarative_base -from pgai.sqlalchemy import VectorizerField +from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column +from pgai.sqlalchemy import VectorizerField, EmbeddingModel -Base = declarative_base() +class Base(DeclarativeBase): + pass class BlogPost(Base): __tablename__ = "blog_posts" - id = Column(Integer, primary_key=True) - title = Column(Text, nullable=False) - content = Column(Text, nullable=False) + id: Mapped[int] = mapped_column(primary_key=True) + title: Mapped[str] + content: Mapped[str] # Add vector embeddings for the content field content_embeddings = VectorizerField( dimensions=768, add_relationship=True, ) + + # Optional: Type hint for the relationship + content_embeddings_relation: Mapped[list[EmbeddingModel["BlogPost"]]] ``` ## Configuration @@ -40,7 +43,28 @@ class BlogPost(Base): The `VectorizerField` accepts the following parameters: - `dimensions` (int): The size of the embedding vector (required) -- `add_relationship` (bool): Whether to automatically create a relationship to the embeddings table (default: True) +- `target_schema` (str, optional): Override the schema for the embeddings table. If not provided, inherits from the parent model's schema +- `target_table` (str, optional): Override the table name for embeddings. Default is `{table_name}_{field_name}_store` +- `add_relationship` (bool): Whether to automatically create a relationship to the embeddings table (default: False) + +**Note:** The `VectorizerField` generates a new SQLAlchemy model that is available under the attribute that you specify. If you are using Alembic's autogenerate functionality to generate migrations, you may need to exclude these models from the autogenerate process. 
+They are tagged with `pgai_managed=True` so you can simply exclude them by adding the following to your `env.py`: + +```python +def include_object(object, name, type_, reflected, compare_to): + if object.info.get("pgai_managed", False): + return False + return True + +context.configure( + connection=connection, + target_metadata=target_metadata, + include_object=include_object + ) +``` + +The model is only created at runtime, so depending on how your Alembic migrations are set up this step could be skipped. Simply run `alembic revision --autogenerate` and, if the model is included, add the above code. + ## Setting up the Vectorizer @@ -59,6 +83,8 @@ SELECT ai.create_vectorizer( ); ``` +We recommend adding this to a migration script and running it via Alembic. + ## Querying Embeddings The `VectorizerField` provides several ways to work with embeddings: @@ -77,20 +103,20 @@ for embedding in embeddings: ### 2. Relationship Access -If `add_relationship=True`, you can access embeddings through the model relationship: +If `add_relationship=True`, you can access embeddings through the relationship field: ```python blog_post = session.query(BlogPost).first() -for embedding in blog_post.content_embeddings: +for embedding in blog_post.content_embeddings_relation: # Note: uses _relation suffix print(embedding.chunk) ``` ### 3. Semantic Search -You can perform semantic similarity searches using cosine distance: +You can perform semantic similarity searches using [pgvector-python's](https://github.com/pgvector/pgvector-python) distance functions: ```python -from sqlalchemy import func +from sqlalchemy import func, text similar_posts = ( session.query(BlogPost.content_embeddings) @@ -114,15 +140,12 @@ for embedding in similar_posts: ### 4. 
Join Queries -You can combine embedding queries with regular SQL queries: +You can combine embedding queries with regular SQL queries using the relationship: ```python results = ( session.query(BlogPost, BlogPost.content_embeddings) - .join( - BlogPost.content_embeddings, - BlogPost.id == BlogPost.content_embeddings.id, - ) + .join(BlogPost.content_embeddings_relation) .filter(BlogPost.title.ilike("%search term%")) .all() ) @@ -130,12 +153,4 @@ results = ( for post, embedding in results: print(f"Title: {post.title}") print(f"Chunk: {embedding.chunk}") -``` - -## Generated Tables and Relationships - -When you use a `VectorizerField`, it creates: - -1. A table for storing embeddings (default name: `{table_name}_{field_name}_store`) -2. A one-to-many relationship between your model and the embeddings -3. A relationship from embeddings back to the parent model \ No newline at end of file +``` \ No newline at end of file