feat: use openai's batch processing to create large volumes of embeddings #280

Open · kolaente wants to merge 24 commits into main from feature/openai-batch-processing
(diff shows changes from 23 commits)

Commits (24):
fff2086  feat: create embedding batches using OpenAI's batch api (kolaente, Dec 5, 2024)
9335e66  feat: process batch embeddings submitted to openai (kolaente, Dec 5, 2024)
70d7b7d  fix: only open temp file for writing (kolaente, Dec 5, 2024)
a10b0fd  chore: move table creation to separate function (kolaente, Dec 5, 2024)
b1be455  chore: use OpenAI's batch type (kolaente, Dec 5, 2024)
b0e6a08  feat: generate full chunk id earlier (kolaente, Dec 5, 2024)
9c3e017  fix: correctly use embeddings endpoint (kolaente, Dec 5, 2024)
866e041  fix: properly convert time (kolaente, Dec 5, 2024)
2e57a35  feat: insert all chunks into the db after batch creation (kolaente, Dec 5, 2024)
5ef0491  fix: correctly process batches (kolaente, Dec 5, 2024)
fa10108  fix: return documents (kolaente, Dec 5, 2024)
783a62f  fix: use configured embeddings model (kolaente, Dec 9, 2024)
1be82ac  chore: rename write embeddings function (kolaente, Dec 9, 2024)
f7f6d13  chore: adjust function comment (kolaente, Dec 9, 2024)
d49c352  feat: create batch embedding tables in extension (kolaente, Dec 9, 2024)
cdad4bc  feat: move all queries to cached properties (kolaente, Dec 9, 2024)
2b17edc  Merge branch 'main' into feature/openai-batch-processing (kolaente, Dec 18, 2024)
ba6e179  fix: move batch embedding changes to openai embedder (kolaente, Dec 18, 2024)
9b4f3c3  fix: lint issues (kolaente, Dec 18, 2024)
9af30c1  fix: move batch embedding tables creation to embedding functions (kolaente, Dec 19, 2024)
cf86931  chore: rename text to chunk to match store table (kolaente, Dec 19, 2024)
916f7b2  feat: add total_attempts and next_attempt_after to openai batch table (kolaente, Dec 19, 2024)
ef4c382  feat: make fetching queries concurrently safe (kolaente, Dec 19, 2024)
f96906d  feat: make handling async embeddings more abstract (kolaente, Dec 23, 2024)
32 changes: 32 additions & 0 deletions projects/extension/sql/idempotent/008-embedding.sql
@@ -6,15 +6,31 @@ create or replace function ai.embedding_openai
, dimensions pg_catalog.int4
, chat_user pg_catalog.text default null
, api_key_name pg_catalog.text default 'OPENAI_API_KEY'
, use_batch_api pg_catalog.bool default false
, embedding_batch_schema pg_catalog.name default null
, embedding_batch_table pg_catalog.name default null
, embedding_batch_chunks_table pg_catalog.name default null
) returns pg_catalog.jsonb
as $func$
declare
_vectorizer_id pg_catalog.int4;
begin
_vectorizer_id = pg_catalog.nextval('ai.vectorizer_id_seq'::pg_catalog.regclass);
embedding_batch_schema = coalesce(embedding_batch_schema, 'ai');
embedding_batch_table = coalesce(embedding_batch_table, pg_catalog.concat('_vectorizer_embedding_batches_', _vectorizer_id));
embedding_batch_chunks_table = coalesce(embedding_batch_chunks_table, pg_catalog.concat('_vectorizer_embedding_batch_chunks_', _vectorizer_id));

return json_object
( 'implementation': 'openai'
, 'config_type': 'embedding'
, 'model': model
, 'dimensions': dimensions
, 'user': chat_user
, 'api_key_name': api_key_name
, 'use_batch_api': use_batch_api
, 'embedding_batch_schema': embedding_batch_schema
, 'embedding_batch_table': embedding_batch_table
, 'embedding_batch_chunks_table': embedding_batch_chunks_table
absent on null
);
end;
$func$ language plpgsql volatile security invoker
@@ -81,6 +97,9 @@ as $func$
declare
_config_type pg_catalog.text;
_implementation pg_catalog.text;
_embedding_batch_schema pg_catalog.text;
_embedding_batch_table pg_catalog.text;
_embedding_batch_chunks_table pg_catalog.text;
begin
if pg_catalog.jsonb_typeof(config) operator(pg_catalog.!=) 'object' then
raise exception 'embedding config is not a jsonb object';
@@ -93,6 +112,19 @@
_implementation = config operator(pg_catalog.->>) 'implementation';
case _implementation
when 'openai' then
-- make sure the embedding batch table name is available
_embedding_batch_schema = config operator(pg_catalog.->>) 'embedding_batch_schema';
_embedding_batch_table = config operator(pg_catalog.->>) 'embedding_batch_table';
_embedding_batch_chunks_table = config operator(pg_catalog.->>) 'embedding_batch_chunks_table';
if pg_catalog.to_regclass(pg_catalog.format('%I.%I', _embedding_batch_schema, _embedding_batch_table)) is not null then
raise exception 'an object named %.% already exists. specify an alternate embedding_batch_table explicitly', _embedding_batch_schema, _embedding_batch_table;
end if;

-- make sure the embedding batch chunks table name is available
if pg_catalog.to_regclass(pg_catalog.format('%I.%I', _embedding_batch_schema, _embedding_batch_chunks_table)) is not null then
raise exception 'an object named %.% already exists. specify an alternate embedding_batch_chunks_table explicitly', _embedding_batch_schema, _embedding_batch_chunks_table;
end if;

-- ok
when 'ollama' then
-- ok
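For orientation, a hedged usage sketch of the extended function (the model name is illustrative, and the defaulted table names assume the sequence hands out vectorizer id 1):

-- usage sketch, not part of this diff: enable the batch API and let the
-- batch schema and table names fall back to their defaults
select ai.embedding_openai
( 'text-embedding-3-small'  -- illustrative model
, 768
, use_batch_api => true
);
-- per the defaults above, the resulting jsonb would look like:
-- {"implementation": "openai", "config_type": "embedding",
--  "model": "text-embedding-3-small", "dimensions": 768,
--  "api_key_name": "OPENAI_API_KEY", "use_batch_api": true,
--  "embedding_batch_schema": "ai",
--  "embedding_batch_table": "_vectorizer_embedding_batches_1",
--  "embedding_batch_chunks_table": "_vectorizer_embedding_batch_chunks_1"}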
14 changes: 12 additions & 2 deletions projects/extension/sql/idempotent/013-vectorizer-api.sql
@@ -1,5 +1,3 @@


-------------------------------------------------------------------------------
-- execute_vectorizer
create or replace function ai.execute_vectorizer(vectorizer_id pg_catalog.int4) returns void
@@ -44,6 +42,7 @@ declare
_vectorizer_id pg_catalog.int4;
_sql pg_catalog.text;
_job_id pg_catalog.int8;
_implementation pg_catalog.text;
begin
-- make sure all the roles listed in grant_to exist
if grant_to is not null then
@@ -225,6 +224,17 @@
scheduling = pg_catalog.jsonb_insert(scheduling, array['job_id'], pg_catalog.to_jsonb(_job_id));
end if;

-- create batch embedding tables
_implementation = embedding operator(pg_catalog.->>) 'implementation';
if _implementation = 'openai' then
perform ai._vectorizer_create_embedding_batches_table
( (embedding operator(pg_catalog.->>) 'embedding_batch_schema')::pg_catalog.name
, (embedding operator(pg_catalog.->>) 'embedding_batch_table')::pg_catalog.name
, (embedding operator(pg_catalog.->>) 'embedding_batch_chunks_table')::pg_catalog.name
, grant_to
);
end if;

insert into ai.vectorizer
( id
, source_schema
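To see how this hunk wires in, a hedged sketch of a create_vectorizer call whose embedding config uses the openai implementation; the source table, chunking arguments, and role are illustrative placeholders, not taken from this PR:

-- sketch, not part of this diff: with an openai embedding config,
-- create_vectorizer now also creates the two batch tables and applies
-- the grant_to grants to them
select ai.create_vectorizer
( 'public.blog'::pg_catalog.regclass  -- illustrative source table
, embedding => ai.embedding_openai('text-embedding-3-small', 768, use_batch_api => true)
, chunking => ai.chunking_recursive_character_text_splitter('content')
, grant_to => array['app_user']::pg_catalog.name[]  -- illustrative role
);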
98 changes: 98 additions & 0 deletions projects/extension/sql/idempotent/016-openai-batch-api.sql
@@ -0,0 +1,98 @@
-------------------------------------------------------------------------------
-- _vectorizer_create_embedding_batches_table
create or replace function ai._vectorizer_create_embedding_batches_table
( embedding_batch_schema name
, embedding_batch_table name
, embedding_batch_chunks_table name
, grant_to name[]
) returns void as
$func$
declare
_sql text;
begin
-- create the batches table
select pg_catalog.format
( $sql$create table %I.%I(
openai_batch_id VARCHAR(255) PRIMARY KEY,
input_file_id VARCHAR(255) NOT NULL,
output_file_id VARCHAR(255),
status VARCHAR(255) NOT NULL,
errors JSONB,
created_at TIMESTAMP(0) NOT NULL DEFAULT NOW(),
expires_at TIMESTAMP(0),
completed_at TIMESTAMP(0),
failed_at TIMESTAMP(0),
next_attempt_after TIMESTAMPTZ,
total_attempts BIGINT NOT NULL DEFAULT 0
))$sql$
, embedding_batch_schema
, embedding_batch_table
) into strict _sql
;
execute _sql;

-- create the index
select pg_catalog.format
( $sql$create index on %I.%I (status)$sql$
, embedding_batch_schema, embedding_batch_table
) into strict _sql
;
execute _sql;

-- create the batch chunks table
select pg_catalog.format
( $sql$create table %I.%I(
id VARCHAR(255) PRIMARY KEY,
embedding_batch_id VARCHAR(255) REFERENCES %I.%I (openai_batch_id),
chunk TEXT
))$sql$
, embedding_batch_schema
, embedding_batch_chunks_table
, embedding_batch_schema
, embedding_batch_table
) into strict _sql
;
execute _sql;

if grant_to is not null then
-- grant usage on embedding batch schema to grant_to roles
select pg_catalog.format
( $sql$grant usage on schema %I to %s$sql$
, embedding_batch_schema
, (
select pg_catalog.string_agg(pg_catalog.quote_ident(x), ', ')
from pg_catalog.unnest(grant_to) x
)
) into strict _sql;
execute _sql;

-- grant select, insert, update, delete on batches table to grant_to roles
select pg_catalog.format
( $sql$grant select, insert, update, delete on %I.%I to %s$sql$
, embedding_batch_schema
, embedding_batch_table
, (
select pg_catalog.string_agg(pg_catalog.quote_ident(x), ', ')
from pg_catalog.unnest(grant_to) x
)
) into strict _sql;
execute _sql;

-- grant select, insert, update, delete on batch chunks table to grant_to roles
select pg_catalog.format
( $sql$grant select, insert, update, delete on %I.%I to %s$sql$
, embedding_batch_schema
, embedding_batch_chunks_table
, (
select pg_catalog.string_agg(pg_catalog.quote_ident(x), ', ')
from pg_catalog.unnest(grant_to) x
)
) into strict _sql;
execute _sql;
end if;
end;
$func$
language plpgsql volatile security invoker
set search_path to pg_catalog, pg_temp
;

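For reference, a hedged monitoring sketch against the tables this function creates, assuming the default names for vectorizer id 1:

-- sketch, not part of this diff: batch progress at a glance
select status, count(*)
from ai._vectorizer_embedding_batches_1
group by status;

-- chunks still tied to batches that have not finished
select c.id, c.chunk
from ai._vectorizer_embedding_batch_chunks_1 c
join ai._vectorizer_embedding_batches_1 b
on b.openai_batch_id = c.embedding_batch_id
where b.status not in ('completed', 'failed', 'expired', 'cancelled');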
54 changes: 54 additions & 0 deletions projects/pgai/pgai/vectorizer/embedders/openai.py
@@ -1,4 +1,6 @@
import json
import re
import tempfile
from collections.abc import Sequence
from functools import cached_property
from typing import Any, Literal
@@ -39,12 +41,20 @@ class OpenAI(ApiKeyMixin, BaseModel, Embedder):
        model (str): The name of the OpenAI model used for embeddings.
        dimensions (int | None): Optional dimensions for the embeddings.
        user (str | None): Optional user identifier for OpenAI API usage.
        use_batch (bool): Whether to use OpenAI's Batch API.
        embedding_batch_schema (str | None): The schema where the embedding
            batches are stored.
        embedding_batch_table (str | None): The table where the embedding
            batches are stored.
        embedding_batch_chunks_table (str | None): The table where the
            embedding batch chunks are stored.
    """

    implementation: Literal["openai"]
    model: str
    dimensions: int | None = None
    user: str | None = None
    use_batch: bool = False
    embedding_batch_schema: str | None = None
    embedding_batch_table: str | None = None
    embedding_batch_chunks_table: str | None = None

    @cached_property
    def _openai_dimensions(self) -> int | openai.NotGiven:
@@ -129,6 +139,50 @@ async def embed(
            model_token_length, encoded_documents
        )

    async def create_and_submit_embedding_batch(
        self,
        documents: list[dict[str, Any]],
    ) -> openai.types.Batch:
        """
        Creates a batch of embeddings using OpenAI's batch API, as outlined in
        https://platform.openai.com/docs/guides/batch/batch-api?lang=python

        Args:
            documents (list[dict[str, Any]]): The document chunks to embed;
                each entry carries a unique_full_chunk_id and the chunk text.

        Returns:
            openai.types.Batch: The batch job submitted to OpenAI.
        """
        # Write one embedding request per chunk into a JSONL file, keyed by
        # custom_id so the results can be matched back to their chunks.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".jsonl", mode="w"
        ) as temp_file:
            for document in documents:
                entry = {
                    "custom_id": document["unique_full_chunk_id"],
                    "method": "POST",
                    "url": "/v1/embeddings",
                    "body": {
                        "model": self.model,
                        "input": document["chunk"],
                    },
                }
                temp_file.write(json.dumps(entry) + "\n")

        # TODO there has to be a client already which I could use instead?
        client = openai.OpenAI()

        # Upload the request file, then submit a batch job against the
        # embeddings endpoint with the standard 24h completion window.
        with open(temp_file.name, "rb") as file:
            batch_input_file = client.files.create(
                file=file,
                purpose="batch",
            )

        return client.batches.create(
            input_file_id=batch_input_file.id,
            endpoint="/v1/embeddings",
            completion_window="24h",
        )

    async def _filter_by_length_and_embed(
        self, model_token_length: int, encoded_documents: list[list[int]]
    ) -> Sequence[EmbeddingVector | ChunkEmbeddingError]:
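The openai.types.Batch returned by create_and_submit_embedding_batch above carries the ids and status that feed the batches table created earlier; a hedged sketch of the corresponding row, with illustrative ids ('validating' is the initial status OpenAI reports for a new batch):

-- sketch, not part of this diff: recording a freshly submitted batch so
-- the worker can poll and process it later
insert into ai._vectorizer_embedding_batches_1
( openai_batch_id, input_file_id, status )
values
( 'batch_abc123', 'file-xyz789', 'validating' );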