bump sdk, cleanup code, readme refresh

MeltanoLabs · Jan 18, 2024 · 4918b2c · 4918b2c
1 parent 50e35fa
commit 4918b2c
Show file tree

Hide file tree

Showing 4 changed files with 1,240 additions and 1,014 deletions.
diff --git a/README.md b/README.md
@@ -13,9 +13,11 @@ Built with the [Meltano Singer SDK](https://sdk.meltano.com).
 
 | Setting                   | Required | Default | Description |
 |:--------------------------|:--------:|:-------:|:------------|
-| document_text_property    | False    | page_content |             |
-| document_metadata_property| False    | metadata |             |
+| document_text_property    | False    | page_content | The name of the property containing the document text. |
+| document_metadata_property| False    | metadata | The name of the property containing the document metadata. |
 | openai_api_key            | False    | None    | OpenAI API key. Optional if `OPENAI_API_KEY` env var is set. |
+| splitter_config           | False    | `{"chunk_size": 1000, "chunk_overlap": 200}` | Configuration for the text splitter. |
+| split_documents           | False    | True    | Whether to split each document into chunks. |
 | stream_maps               | False    | None    | Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html). |
 | stream_map_config         | False    | None    | User-defined config values to be used within map expressions. |
 

diff --git a/map_gpt_embeddings/mappers.py b/map_gpt_embeddings/mappers.py
@@ -59,6 +59,21 @@ def map_schema_message(self, message_dict: dict) -> t.Iterable[Message]:
             secret=True,
             description="OpenAI API key. Optional if `OPENAI_API_KEY` env var is set.",
         ),
+        th.Property(
+            "splitter_config",
+            th.ObjectType(),
+            description="Configuration for the text splitter.",
+            default={
+                "chunk_size": 1000,
+                "chunk_overlap": 200,
+            }
+        ),
+        th.Property(
+            "split_documents",
+            th.BooleanType,
+            description="Whether to split document into chunks.",
+            default=True,
+        ),
     ).to_dict()
 
     def _validate_config(
@@ -103,27 +118,21 @@ def split_record(self, record: dict) -> t.Iterable[dict]:
         Yields:
             A generator of record dicts.
         """
-        raw_document_text = record[self.config["document_text_property"]]
-        metadata_dict = record[self.config["document_metadata_property"]]
-
-        if not self.config.get("split_documents", True):
+        if not self.config["split_documents"]:
             yield record
             return
 
-        splitter_config = self.config.get("splitter_config", {})
-        if "chunk_size" not in splitter_config:
-            splitter_config["chunk_size"] = 1000
-        if "chunk_overlap" not in splitter_config:
-            splitter_config["chunk_overlap"] = 200
-        text_splitter = RecursiveCharacterTextSplitter(**splitter_config)
+        raw_document_text = record[self.config["document_text_property"]]
+        metadata_dict = record[self.config["document_metadata_property"]]
+
+        text_splitter = RecursiveCharacterTextSplitter(
+            **self.config["splitter_config"]
+        )
 
         document = Document(page_content=raw_document_text, metadata=metadata_dict)
 
         document_segments = text_splitter.split_documents([document])
 
-        # assert document_segments and len(
-        #     document_segments
-        # ), "No documents output from split."
         if len(document_segments) > 1:
             self.logger.debug("Document split into %s segments", len(document_segments))
         elif len(document_segments) == 1: