
Commit

updated pinecone to version 2x
adhityan committed Feb 7, 2024
1 parent 5b2f8e1 commit c2da0d0
Showing 9 changed files with 217 additions and 190 deletions.
33 changes: 28 additions & 5 deletions README.md
@@ -333,20 +333,28 @@ You can enable Pinecone storage by following these steps -
npm install @pinecone-database/pinecone
```

- Set the pinecone environment variables `PINECONE_API_KEY` and `PINECONE_ENVIRONMENT`. These can be obtained from the **API Keys** section on the Pinecone dashboard.
- Set the pinecone environment variable `PINECONE_API_KEY`. This can be obtained from the **API Keys** section on the Pinecone dashboard.

```bash
PINECONE_API_KEY="e65a4ec0-14f7-40c5-903e-f8529127b817"
PINECONE_ENVIRONMENT="us-west1-gcp-free"
PINECONE_API_KEY=<your api key>
```

- Set the Pinecone database as your choice of `vectorDb`

```TS
.setVectorDb(new PineconeDb({ projectName: 'test', namespace: 'dev' }))
.setVectorDb(new PineconeDb({
projectName: 'test',
namespace: 'dev',
indexSpec: {
pod: {
podType: 'p1.x1',
environment: 'us-east1-gcp',
},
},
}))
```

**Note:** The `projectName` will be used to create the Pinecone index name for this application.
**Note:** Pinecone supports serverless and pod-based index deployments. You control how your index is created through the `indexSpec` attribute, which is mandatory but fully typed.
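
For comparison, a serverless deployment could look like the sketch below. This is an illustration only, assuming `indexSpec` mirrors Pinecone's create-index spec; the `serverless` block with its `cloud` and `region` fields is not shown in this commit.

```TS
// A sketch, not from this commit: assumes indexSpec accepts Pinecone's
// serverless spec the same way it accepts the pod spec above.
.setVectorDb(new PineconeDb({
    projectName: 'test',
    namespace: 'dev',
    indexSpec: {
        serverless: {
            cloud: 'aws',
            region: 'us-west-2',
        },
    },
}))
```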

## LanceDB

@@ -636,6 +644,21 @@ If you want us to add support for a specific embedding model, please create an [

All PRs are welcome.

# Langsmith Integration

Langsmith allows you to keep track of how you use LLMs and embedding models. It logs run histories, token usage and other metadata. Follow these three simple steps to enable it -

- Sign up for an account with [Langsmith](https://smith.langchain.com/)
- Generate an API Key from your admin page
- Set the following environment keys in your project

```bash
export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
export LANGCHAIN_PROJECT="<project name>"
export LANGCHAIN_API_KEY="<api key>"
```

# Azure OpenAI

To use an OpenAI model on Azure, it first needs to be deployed. Please refer to the [Azure OpenAI documentation](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/) on how to deploy a model on Azure. To run this library, you will need to deploy two models -
13 changes: 12 additions & 1 deletion examples/pinecone/src/index.ts
@@ -12,7 +12,18 @@ const llmApplication = await new LLMApplicationBuilder()
.addLoader(new YoutubeLoader({ videoIdOrUrl: 'https://www.youtube.com/watch?v=w2KbwC-s7pY' }))
.addLoader(new TextLoader({ text: 'The best company name for a company making colorful socks is MrSocks' }))
.setCache(new LmdbCache({ path: path.resolve(path.dirname(__filename), '../../../cache') }))
.setVectorDb(new PineconeDb({ projectName: 'test', namespace: 'dev' }))
.setVectorDb(
new PineconeDb({
projectName: 'test',
namespace: 'dev',
indexSpec: {
pod: {
podType: 'p1.x1',
environment: 'us-east1-gcp',
},
},
}),
)
.build();

console.log(await llmApplication.query('What is paxos?'));
184 changes: 90 additions & 94 deletions package-lock.json

Large diffs are not rendered by default.

28 changes: 14 additions & 14 deletions package.json
@@ -1,6 +1,6 @@
{
"name": "@llm-tools/embedjs",
"version": "0.0.53",
"version": "0.0.57",
"description": "A NodeJS RAG framework to easily work with LLMs and custom datasets",
"main": "dist/index.js",
"types": "dist/index.d.ts",
@@ -49,31 +49,31 @@
"homepage": "https://github.com/llm-tools/embedjs#readme",
"dependencies": {
"@huggingface/inference": "^2.6.4",
"@langchain/cohere": "^0.0.3",
"@langchain/cohere": "^0.0.4",
"@langchain/openai": "^0.0.14",
"axios": "^1.6.7",
"confluence.js": "^1.7.2",
"debug": "^4.3.4",
"html-to-text": "^9.0.5",
"langchain": "^0.1.12",
"langchain": "^0.1.13",
"md5": "^2.3.0",
"pdf-parse-fork": "^1.2.0",
"sitemapper": "^3.2.8",
"usetube": "^2.2.7",
"sitemapper": "^3.1.8",
"usetube": "^2.0.2",
"uuid": "^9.0.1",
"youtube-transcript": "^1.0.6"
},
"devDependencies": {
"@pinecone-database/pinecone": "^1.1.2",
"@pinecone-database/pinecone": "^2.0.1",
"@qdrant/js-client-rest": "^1.7.0",
"@tsconfig/recommended": "^1.0.3",
"@types/debug": "^4.1.12",
"@types/html-to-text": "^9.0.4",
"@types/md5": "^2.3.5",
"@types/node": "^20.11.16",
"@types/usetube": "^2.1.2",
"@typescript-eslint/eslint-plugin": "^6.20.0",
"@typescript-eslint/parser": "^6.20.0",
"@typescript-eslint/eslint-plugin": "^6.21.0",
"@typescript-eslint/parser": "^6.21.0",
"chromadb": "^1.8.1",
"cohere-ai": "^7.7.5",
"eslint": "^8.56.0",
@@ -82,28 +82,28 @@
"hnswlib-node": "^2.1.0",
"ioredis": "^5.3.2",
"lmdb": "^2.9.2",
"prettier": "^3.2.4",
"prettier": "^3.2.5",
"rimraf": "^5.0.5",
"typescript": "^5.3.3",
"vectordb": "^0.4.7",
"vectordb": "^0.4.8",
"weaviate-ts-client": "^2.0.0"
},
"peerDependencies": {
"@pinecone-database/pinecone": "^1.1.2",
"@pinecone-database/pinecone": "^2.0.1",
"@qdrant/js-client-rest": "^1.7.0",
"chromadb": "^1.8.1",
"cohere-ai": "^7.7.5",
"hnswlib-node": "^2.1.0",
"ioredis": "^5.3.2",
"lmdb": "^2.9.2",
"vectordb": "^0.4.7",
"vectordb": "^0.4.8",
"weaviate-ts-client": "^2.0.0"
},
"overrides": {
"@pinecone-database/pinecone": "^1.1.2",
"@pinecone-database/pinecone": "^2.0.1",
"weaviate-ts-client": "^2.0.0",
"hnswlib-node": "^2.1.0",
"vectordb": "^0.4.7"
"vectordb": "^0.4.8"
},
"peerDependenciesMeta": {
"@pinecone-database/pinecone": {
61 changes: 41 additions & 20 deletions src/core/llm-application.ts
@@ -2,7 +2,7 @@ import createDebugMessages from 'debug';

import { BaseDb } from '../interfaces/base-db.js';
import { BaseLoader } from '../interfaces/base-loader.js';
import { AddLoaderReturn, Chunk, EmbeddedChunk } from '../global/types.js';
import { AddLoaderReturn, Chunk, EmbeddedChunk, LoaderChunk } from '../global/types.js';
import { LLMApplicationBuilder } from './llm-application-builder.js';
import { DEFAULT_INSERT_BATCH_SIZE } from '../global/constants.js';
import { cleanString, stringFormat } from '../util/strings.js';
@@ -78,28 +78,11 @@ export class LLMApplication {
return this.vectorDb.insertChunks(embedChunks);
}

public async addLoader(loader: BaseLoader): Promise<AddLoaderReturn> {
const uniqueId = loader.getUniqueId();
this.debug('Add loader called for', uniqueId);
await loader.init();

const chunks = await loader.getChunks();
if (this.cache && (await this.cache.hasLoader(uniqueId))) {
const { chunkCount: previousChunkCount } = await this.cache.getLoader(uniqueId);

const chunkIds: string[] = [];
for (let i = 0; i < previousChunkCount; i++) {
chunkIds.push(this.getChunkUniqueId(uniqueId, i));
}

this.debug(`Loader previously run. Deleting previous ${chunkIds.length} keys`, uniqueId);
if (chunkIds.length > 0) await this.vectorDb.deleteKeys(chunkIds);
}

private async batchLoadChunks(uniqueId: string, incrementalGenerator: AsyncGenerator<LoaderChunk, void, void>) {
let batchSize = 0,
newInserts = 0,
formattedChunks: Chunk[] = [];
for await (const chunk of chunks) {
for await (const chunk of incrementalGenerator) {
batchSize++;

const formattedChunk = {
@@ -117,10 +100,48 @@
batchSize = 0;
}
}

newInserts += await this.batchLoadEmbeddings(uniqueId, formattedChunks);
return { newInserts, formattedChunks };
}

private async incrementalLoader(uniqueId: string, incrementalGenerator: AsyncGenerator<LoaderChunk, void, void>) {
this.debug(`incrementalChunkAvailable for loader`, uniqueId);
const { newInserts } = await this.batchLoadChunks(uniqueId, incrementalGenerator);
this.debug(`${newInserts} new incrementalChunks processed`, uniqueId);
}

public async addLoader(loader: BaseLoader): Promise<AddLoaderReturn> {
const uniqueId = loader.getUniqueId();
this.debug('Add loader called for', uniqueId);
await loader.init();

const chunks = await loader.getChunks();
if (this.cache && (await this.cache.hasLoader(uniqueId))) {
const { chunkCount: previousChunkCount } = await this.cache.getLoader(uniqueId);

const chunkIds: string[] = [];
for (let i = 0; i < previousChunkCount; i++) {
chunkIds.push(this.getChunkUniqueId(uniqueId, i));
}

this.debug(`Loader previously run. Deleting previous ${chunkIds.length} keys`, uniqueId);
if (chunkIds.length > 0) await this.vectorDb.deleteKeys(chunkIds);
}

const { newInserts, formattedChunks } = await this.batchLoadChunks(uniqueId, chunks);

if (this.cache) await this.cache.addLoader(uniqueId, formattedChunks.length);
this.debug(`Add loader completed with ${newInserts} new entries for`, uniqueId);

if (loader.canIncrementallyLoad) {
this.debug(`Registering incremental loader`, uniqueId);

loader.on('incrementalChunkAvailable', async (incrementalGenerator) => {
await this.incrementalLoader(uniqueId, incrementalGenerator);
});
}

return { entriesAdded: newInserts, uniqueId };
}

16 changes: 14 additions & 2 deletions src/interfaces/base-loader.ts
@@ -1,27 +1,35 @@
import createDebugMessages from 'debug';
import { EventEmitter } from 'node:events';

import { LoaderChunk } from '../global/types.js';
import { BaseCache } from './base-cache.js';

export abstract class BaseLoader<
T extends Record<string, string | number | boolean> = Record<string, string | number | boolean>,
M extends Record<string, unknown> = Record<string, null>,
> {
> extends EventEmitter {
private static cache?: BaseCache;

public static setCache(cache?: BaseCache) {
BaseLoader.cache = cache;
}

protected readonly uniqueId: string;
private readonly _canIncrementallyLoad: boolean;

constructor(uniqueId: string) {
constructor(uniqueId: string, canIncrementallyLoad: boolean = false) {
super();
this.uniqueId = uniqueId;
this._canIncrementallyLoad = canIncrementallyLoad;
        createDebugMessages('embedjs:loader:BaseLoader')(`New loader class initialized with key ${uniqueId}`);
}

async init() {}

public get canIncrementallyLoad() {
return this._canIncrementallyLoad;
}

getUniqueId(): string {
return this.uniqueId;
}
@@ -45,5 +53,9 @@ export abstract class BaseLoader<
return BaseLoader.cache.loaderCustomHas(this.getCustomCacheKey(key));
}

protected async loadIncrementalChunk(incrementalGenerator: AsyncGenerator<LoaderChunk<T>, void, void>) {
this.emit('incrementalChunkAvailable', incrementalGenerator);
}

abstract getChunks(): AsyncGenerator<LoaderChunk<T>, void, void>;
}
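
For orientation, here is a minimal sketch, not part of this commit, of how a custom loader might opt into the new incremental-loading path: pass `true` for `canIncrementallyLoad` in the constructor and surface late-arriving data through `loadIncrementalChunk`. The chunk fields used (`pageContent`, `metadata`) are assumptions for illustration.

```TS
// Hypothetical loader; the chunk shape below is assumed, not verified.
import { BaseLoader } from './base-loader.js';

class PollingLoader extends BaseLoader<{ type: 'PollingLoader' }> {
    constructor() {
        // Second constructor argument enables canIncrementallyLoad
        super('polling-loader-1', true);
    }

    // Initial chunks, consumed once when addLoader() runs
    async *getChunks() {
        yield { pageContent: 'initial content', metadata: { type: 'PollingLoader' as const } };
    }

    // Call this when new data arrives later; loadIncrementalChunk emits
    // 'incrementalChunkAvailable', which LLMApplication subscribes to
    async pushNewData(text: string) {
        await this.loadIncrementalChunk(
            (async function* () {
                yield { pageContent: text, metadata: { type: 'PollingLoader' as const } };
            })(),
        );
    }
}
```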
1 change: 1 addition & 0 deletions src/loaders/confluence-loader.ts
@@ -75,6 +75,7 @@ export class ConfluenceLoader extends BaseLoader<{ type: 'ConfluenceLoader' }> {
}
}

if (!content.body.view.value) continue;
const webLoader = new WebLoader({ content: content.body.view.value });
for await (const result of await webLoader.getChunks()) {
yield {
47 changes: 0 additions & 47 deletions src/loaders/medusa-loader.ts

This file was deleted.
