From c2da0d0756a0f6cb2204133fdd2001e066ebad4e Mon Sep 17 00:00:00 2001 From: Adhityan K V Date: Wed, 7 Feb 2024 01:15:45 +0100 Subject: [PATCH] updated pinecone to version 2x --- README.md | 33 +++++- examples/pinecone/src/index.ts | 13 ++- package-lock.json | 184 +++++++++++++++---------------- package.json | 28 ++--- src/core/llm-application.ts | 61 ++++++---- src/interfaces/base-loader.ts | 16 ++- src/loaders/confluence-loader.ts | 1 + src/loaders/medusa-loader.ts | 47 -------- src/vectorDb/pinecone-db.ts | 24 ++-- 9 files changed, 217 insertions(+), 190 deletions(-) delete mode 100644 src/loaders/medusa-loader.ts diff --git a/README.md b/README.md index af9fa791..4b43a32c 100644 --- a/README.md +++ b/README.md @@ -333,20 +333,28 @@ You can enable Pinecone storage by following these steps - npm install @pinecone-database/pinecone ``` -- Set the pinecone environment variables `PINECONE_API_KEY` and `PINECONE_ENVIRONMENT`. These can be obtained from the **API Keys** section on the Pinecone dashboard. +- Set the Pinecone environment variable `PINECONE_API_KEY`. This can be obtained from the **API Keys** section on the Pinecone dashboard. ```bash -PINECONE_API_KEY="e65a4ec0-14f7-40c5-903e-f8529127b817" -PINECONE_ENVIRONMENT="us-west1-gcp-free" +PINECONE_API_KEY= ``` - Set the Pinecone database as your choice of `vectorDb` ```TS -.setVectorDb(new PineconeDb({ projectName: 'test', namespace: 'dev' })) +.setVectorDb(new PineconeDb({ + projectName: 'test', + namespace: 'dev', + indexSpec: { + pod: { + podType: 'p1.x1', + environment: 'us-east1-gcp', + }, + }, +})) ``` -**Note:** The `projectName` will be used to create the Pinecone index name for this application. +**Note:** Pinecone supports serverless and pod-based index deployments. You can control how your index is created using the `indexSpec` attribute. This attribute is mandatory and is fully typed. 
## LanceDB @@ -636,6 +644,21 @@ If you want us to add support for a specific embedding model, please create an [ All PRs are welcome. +# Langsmith Integration + +Langsmith allows you to keep track of how you use LLM and embedding models. It logs histories, token usage and other metadata. Follow these three simple steps to enable it - + +- Sign up for an account with [Langsmith](https://smith.langchain.com/) +- Generate an API Key from your admin page +- Set the following environment variables in your project + +```bash +export LANGCHAIN_TRACING_V2=true +export LANGCHAIN_ENDPOINT="https://api.smith.langchain.com" +export LANGCHAIN_PROJECT="" +export LANGCHAIN_API_KEY="" +``` + # Azure OpenAI In order to be able to use an OpenAI model on Azure, it first needs to be deployed. Please refer to [Azure OpenAI documentation](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/) on how to deploy a model on Azure. To run this library, you will need to deploy two models - diff --git a/examples/pinecone/src/index.ts b/examples/pinecone/src/index.ts index 40f9e4eb..4f076ada 100644 --- a/examples/pinecone/src/index.ts +++ b/examples/pinecone/src/index.ts @@ -12,7 +12,18 @@ const llmApplication = await new LLMApplicationBuilder() .addLoader(new YoutubeLoader({ videoIdOrUrl: 'https://www.youtube.com/watch?v=w2KbwC-s7pY' })) .addLoader(new TextLoader({ text: 'The best company name for a company making colorful socks is MrSocks' })) .setCache(new LmdbCache({ path: path.resolve(path.dirname(__filename), '../../../cache') })) - .setVectorDb(new PineconeDb({ projectName: 'test', namespace: 'dev' })) + .setVectorDb( + new PineconeDb({ + projectName: 'test', + namespace: 'dev', + indexSpec: { + pod: { + podType: 'p1.x1', + environment: 'us-east1-gcp', + }, + }, + }), + ) .build(); console.log(await llmApplication.query('What is paxos?')); diff --git a/package-lock.json b/package-lock.json index bb2715b3..41103a54 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,31 +1,31 
@@ { "name": "@llm-tools/embedjs", - "version": "0.0.53", + "version": "0.0.57", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@llm-tools/embedjs", - "version": "0.0.53", + "version": "0.0.57", "license": "Apache-2.0", "dependencies": { "@huggingface/inference": "^2.6.4", - "@langchain/cohere": "^0.0.3", + "@langchain/cohere": "^0.0.4", "@langchain/openai": "^0.0.14", "axios": "^1.6.7", "confluence.js": "^1.7.2", "debug": "^4.3.4", "html-to-text": "^9.0.5", - "langchain": "^0.1.12", + "langchain": "^0.1.13", "md5": "^2.3.0", "pdf-parse-fork": "^1.2.0", - "sitemapper": "^3.2.8", - "usetube": "^2.2.7", + "sitemapper": "^3.1.8", + "usetube": "^2.0.2", "uuid": "^9.0.1", "youtube-transcript": "^1.0.6" }, "devDependencies": { - "@pinecone-database/pinecone": "^1.1.2", + "@pinecone-database/pinecone": "^2.0.1", "@qdrant/js-client-rest": "^1.7.0", "@tsconfig/recommended": "^1.0.3", "@types/debug": "^4.1.12", @@ -33,8 +33,8 @@ "@types/md5": "^2.3.5", "@types/node": "^20.11.16", "@types/usetube": "^2.1.2", - "@typescript-eslint/eslint-plugin": "^6.20.0", - "@typescript-eslint/parser": "^6.20.0", + "@typescript-eslint/eslint-plugin": "^6.21.0", + "@typescript-eslint/parser": "^6.21.0", "chromadb": "^1.8.1", "cohere-ai": "^7.7.5", "eslint": "^8.56.0", @@ -43,24 +43,24 @@ "hnswlib-node": "^2.1.0", "ioredis": "^5.3.2", "lmdb": "^2.9.2", - "prettier": "^3.2.4", + "prettier": "^3.2.5", "rimraf": "^5.0.5", "typescript": "^5.3.3", - "vectordb": "^0.4.7", + "vectordb": "^0.4.8", "weaviate-ts-client": "^2.0.0" }, "engines": { "node": ">= 18.0.0" }, "peerDependencies": { - "@pinecone-database/pinecone": "^1.1.2", + "@pinecone-database/pinecone": "^2.0.1", "@qdrant/js-client-rest": "^1.7.0", "chromadb": "^1.8.1", "cohere-ai": "^7.7.5", "hnswlib-node": "^2.1.0", "ioredis": "^5.3.2", "lmdb": "^2.9.2", - "vectordb": "^0.4.7", + "vectordb": "^0.4.8", "weaviate-ts-client": "^2.0.0" }, "peerDependenciesMeta": { @@ -360,9 +360,9 @@ } }, 
"node_modules/@lancedb/vectordb-darwin-arm64": { - "version": "0.4.7", - "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.7.tgz", - "integrity": "sha512-kACOIytgjBfX8NRwjPKe311XRN3lbSN13B7avT5htMd3kYm3AnnMag9tZhlwoO7lIuvGaXhy7mApygJrjhfJ4g==", + "version": "0.4.8", + "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.8.tgz", + "integrity": "sha512-FpnJaw7KmNdD/FtOw9AcmPL5P+L04AcnfPj9ZyEjN8iCwB/qaOGYgdfBv+EbEtfHIsqA12q/1BRduu9KdB6BIA==", "cpu": [ "arm64" ], @@ -373,9 +373,9 @@ ] }, "node_modules/@lancedb/vectordb-darwin-x64": { - "version": "0.4.7", - "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.7.tgz", - "integrity": "sha512-vb74iK5uPWCwz5E60r3yWp/R/HSg54/Z9AZWYckYXqsPv4w/nfbkM5iZhfRqqR/9uE6JClWJKOtjbk7b8CFRFg==", + "version": "0.4.8", + "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.8.tgz", + "integrity": "sha512-RafOEYyZIgphp8wPGuVLFaTc8aAqo0NCO1LQMx0mB0xV96vrdo0Mooivs+dYN3RFfSHtTKPw9O1Jc957Vp1TLg==", "cpu": [ "x64" ], @@ -386,9 +386,9 @@ ] }, "node_modules/@lancedb/vectordb-linux-arm64-gnu": { - "version": "0.4.7", - "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.7.tgz", - "integrity": "sha512-jHp7THm6S9sB8RaCxGoZXLAwGAUHnawUUilB1K3mvQsRdfB2bBs0f7wDehW+PDhr+Iog4LshaWbcnoQEUJWR+Q==", + "version": "0.4.8", + "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.8.tgz", + "integrity": "sha512-WlbYNfj4+v1hBHUluF+hnlG/A0ZaQFdXBTGDfHQniL11o+n3emWm4ujP5nSAoQHXjSH9DaOTGr/N4Mc9Xe+luw==", "cpu": [ "arm64" ], @@ -399,9 +399,9 @@ ] }, "node_modules/@lancedb/vectordb-linux-x64-gnu": { - "version": "0.4.7", - "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.7.tgz", - "integrity": 
"sha512-LKbVe6Wrp/AGqCCjKliNDmYoeTNgY/wfb2DTLjrx41Jko/04ywLrJ6xSEAn3XD5RDCO5u3fyUdXHHHv5a3VAAQ==", + "version": "0.4.8", + "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.8.tgz", + "integrity": "sha512-z+qFJrDqnNEv4JcwYDyt51PHmWjuM/XaOlSjpBnyyuUImeY+QcwctMuyXt8+Q4zhuqQR1AhLKrMwCU+YmMfk5g==", "cpu": [ "x64" ], @@ -412,9 +412,9 @@ ] }, "node_modules/@lancedb/vectordb-win32-x64-msvc": { - "version": "0.4.7", - "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.7.tgz", - "integrity": "sha512-C5ln4+wafeY1Sm4PeV0Ios9lUaQVVip5Mjl9XU7ngioSEMEuXI/XMVfIdVfDPppVNXPeQxg33wLA272uw88D1Q==", + "version": "0.4.8", + "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.8.tgz", + "integrity": "sha512-VjUryVvEA04r0j4lU9pJy84cmjuQm1GhBzbPc8kwbn5voT4A6BPglrlNsU0Zc+j8Fbjyvauzw2lMEcMsF4F0rw==", "cpu": [ "x64" ], @@ -425,9 +425,9 @@ ] }, "node_modules/@langchain/cohere": { - "version": "0.0.3", - "resolved": "https://registry.npmjs.org/@langchain/cohere/-/cohere-0.0.3.tgz", - "integrity": "sha512-+ph/SESIw8Ut/ngm55iVFIGzmdvitTvI5Y68UJUel5FKKA+y8zXpx6sLLohQ1wZWx83a2OAuGsm0B9Xxs4ERjA==", + "version": "0.0.4", + "resolved": "https://registry.npmjs.org/@langchain/cohere/-/cohere-0.0.4.tgz", + "integrity": "sha512-tu8pQRH8m8CZPQDrRsz2t6hzNqbF1E0shAETqavFOrV8yzgj+eh1pckTKnPcw9noqaonmgkCuVuAqAK+WreaEA==", "dependencies": { "@langchain/core": "~0.1", "cohere-ai": "^7.6.2" @@ -1014,9 +1014,9 @@ } }, "node_modules/@pinecone-database/pinecone": { - "version": "1.1.3", - "resolved": "https://registry.npmjs.org/@pinecone-database/pinecone/-/pinecone-1.1.3.tgz", - "integrity": "sha512-bGldvvoAr4agVZ2ql4RZesXIDjMLjnuqNmKYfMQoVO3UFRYeuO9z+1WJodvanGIPY2iGh1w9yz0jDAkBiT53qw==", + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/@pinecone-database/pinecone/-/pinecone-2.0.1.tgz", + "integrity": 
"sha512-a1ejzrqdSQ2yW+9QUi2TVlKwYUbrvGH+QH6POJhITyaOz9ANE+EhXqToC9af93Ctzq9n87+bOUvBvewLeW++Mw==", "devOptional": true, "dependencies": { "@sinclair/typebox": "^0.29.0", @@ -1307,16 +1307,16 @@ "integrity": "sha512-WUtIVRUZ9i5dYXefDEAI7sh9/O7jGvHg7Df/5O/gtH3Yabe5odI3UWopVR1qbPXQtvOxWu3mM4XxlYeZtMWF4g==" }, "node_modules/@typescript-eslint/eslint-plugin": { - "version": "6.20.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-6.20.0.tgz", - "integrity": "sha512-fTwGQUnjhoYHeSF6m5pWNkzmDDdsKELYrOBxhjMrofPqCkoC2k3B2wvGHFxa1CTIqkEn88nlW1HVMztjo2K8Hg==", + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-6.21.0.tgz", + "integrity": "sha512-oy9+hTPCUFpngkEZUSzbf9MxI65wbKFoQYsgPdILTfbUldp5ovUuphZVe4i30emU9M/kP+T64Di0mxl7dSw3MA==", "dev": true, "dependencies": { "@eslint-community/regexpp": "^4.5.1", - "@typescript-eslint/scope-manager": "6.20.0", - "@typescript-eslint/type-utils": "6.20.0", - "@typescript-eslint/utils": "6.20.0", - "@typescript-eslint/visitor-keys": "6.20.0", + "@typescript-eslint/scope-manager": "6.21.0", + "@typescript-eslint/type-utils": "6.21.0", + "@typescript-eslint/utils": "6.21.0", + "@typescript-eslint/visitor-keys": "6.21.0", "debug": "^4.3.4", "graphemer": "^1.4.0", "ignore": "^5.2.4", @@ -1342,15 +1342,15 @@ } }, "node_modules/@typescript-eslint/parser": { - "version": "6.20.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-6.20.0.tgz", - "integrity": "sha512-bYerPDF/H5v6V76MdMYhjwmwgMA+jlPVqjSDq2cRqMi8bP5sR3Z+RLOiOMad3nsnmDVmn2gAFCyNgh/dIrfP/w==", + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-6.21.0.tgz", + "integrity": "sha512-tbsV1jPne5CkFQCgPBcDOt30ItF7aJoZL997JSF7MhGQqOeT3svWRYxiqlfA5RUdlHN6Fi+EI9bxqbdyAUZjYQ==", "dev": true, "dependencies": { - "@typescript-eslint/scope-manager": "6.20.0", - "@typescript-eslint/types": "6.20.0", - 
"@typescript-eslint/typescript-estree": "6.20.0", - "@typescript-eslint/visitor-keys": "6.20.0", + "@typescript-eslint/scope-manager": "6.21.0", + "@typescript-eslint/types": "6.21.0", + "@typescript-eslint/typescript-estree": "6.21.0", + "@typescript-eslint/visitor-keys": "6.21.0", "debug": "^4.3.4" }, "engines": { @@ -1370,13 +1370,13 @@ } }, "node_modules/@typescript-eslint/scope-manager": { - "version": "6.20.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/scope-manager/-/scope-manager-6.20.0.tgz", - "integrity": "sha512-p4rvHQRDTI1tGGMDFQm+GtxP1ZHyAh64WANVoyEcNMpaTFn3ox/3CcgtIlELnRfKzSs/DwYlDccJEtr3O6qBvA==", + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/@typescript-eslint/scope-manager/-/scope-manager-6.21.0.tgz", + "integrity": "sha512-OwLUIWZJry80O99zvqXVEioyniJMa+d2GrqpUTqi5/v5D5rOrppJVBPa0yKCblcigC0/aYAzxxqQ1B+DS2RYsg==", "dev": true, "dependencies": { - "@typescript-eslint/types": "6.20.0", - "@typescript-eslint/visitor-keys": "6.20.0" + "@typescript-eslint/types": "6.21.0", + "@typescript-eslint/visitor-keys": "6.21.0" }, "engines": { "node": "^16.0.0 || >=18.0.0" @@ -1387,13 +1387,13 @@ } }, "node_modules/@typescript-eslint/type-utils": { - "version": "6.20.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/type-utils/-/type-utils-6.20.0.tgz", - "integrity": "sha512-qnSobiJQb1F5JjN0YDRPHruQTrX7ICsmltXhkV536mp4idGAYrIyr47zF/JmkJtEcAVnIz4gUYJ7gOZa6SmN4g==", + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/@typescript-eslint/type-utils/-/type-utils-6.21.0.tgz", + "integrity": "sha512-rZQI7wHfao8qMX3Rd3xqeYSMCL3SoiSQLBATSiVKARdFGCYSRvmViieZjqc58jKgs8Y8i9YvVVhRbHSTA4VBag==", "dev": true, "dependencies": { - "@typescript-eslint/typescript-estree": "6.20.0", - "@typescript-eslint/utils": "6.20.0", + "@typescript-eslint/typescript-estree": "6.21.0", + "@typescript-eslint/utils": "6.21.0", "debug": "^4.3.4", "ts-api-utils": "^1.0.1" }, @@ -1414,9 +1414,9 @@ } }, 
"node_modules/@typescript-eslint/types": { - "version": "6.20.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-6.20.0.tgz", - "integrity": "sha512-MM9mfZMAhiN4cOEcUOEx+0HmuaW3WBfukBZPCfwSqFnQy0grXYtngKCqpQN339X3RrwtzspWJrpbrupKYUSBXQ==", + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-6.21.0.tgz", + "integrity": "sha512-1kFmZ1rOm5epu9NZEZm1kckCDGj5UJEf7P1kliH4LKu/RkwpsfqqGmY2OOcUs18lSlQBKLDYBOGxRVtrMN5lpg==", "dev": true, "engines": { "node": "^16.0.0 || >=18.0.0" @@ -1427,13 +1427,13 @@ } }, "node_modules/@typescript-eslint/typescript-estree": { - "version": "6.20.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-6.20.0.tgz", - "integrity": "sha512-RnRya9q5m6YYSpBN7IzKu9FmLcYtErkDkc8/dKv81I9QiLLtVBHrjz+Ev/crAqgMNW2FCsoZF4g2QUylMnJz+g==", + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-6.21.0.tgz", + "integrity": "sha512-6npJTkZcO+y2/kr+z0hc4HwNfrrP4kNYh57ek7yCNlrBjWQ1Y0OS7jiZTkgumrvkX5HkEKXFZkkdFNkaW2wmUQ==", "dev": true, "dependencies": { - "@typescript-eslint/types": "6.20.0", - "@typescript-eslint/visitor-keys": "6.20.0", + "@typescript-eslint/types": "6.21.0", + "@typescript-eslint/visitor-keys": "6.21.0", "debug": "^4.3.4", "globby": "^11.1.0", "is-glob": "^4.0.3", @@ -1479,17 +1479,17 @@ } }, "node_modules/@typescript-eslint/utils": { - "version": "6.20.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/utils/-/utils-6.20.0.tgz", - "integrity": "sha512-/EKuw+kRu2vAqCoDwDCBtDRU6CTKbUmwwI7SH7AashZ+W+7o8eiyy6V2cdOqN49KsTcASWsC5QeghYuRDTyOOg==", + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/@typescript-eslint/utils/-/utils-6.21.0.tgz", + "integrity": "sha512-NfWVaC8HP9T8cbKQxHcsJBY5YE1O33+jpMwN45qzWWaPDZgLIbo12toGMWnmhvCpd3sIxkpDw3Wv1B3dYrbDQQ==", "dev": true, "dependencies": { "@eslint-community/eslint-utils": "^4.4.0", 
"@types/json-schema": "^7.0.12", "@types/semver": "^7.5.0", - "@typescript-eslint/scope-manager": "6.20.0", - "@typescript-eslint/types": "6.20.0", - "@typescript-eslint/typescript-estree": "6.20.0", + "@typescript-eslint/scope-manager": "6.21.0", + "@typescript-eslint/types": "6.21.0", + "@typescript-eslint/typescript-estree": "6.21.0", "semver": "^7.5.4" }, "engines": { @@ -1504,12 +1504,12 @@ } }, "node_modules/@typescript-eslint/visitor-keys": { - "version": "6.20.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/visitor-keys/-/visitor-keys-6.20.0.tgz", - "integrity": "sha512-E8Cp98kRe4gKHjJD4NExXKz/zOJ1A2hhZc+IMVD6i7w4yjIvh6VyuRI0gRtxAsXtoC35uGMaQ9rjI2zJaXDEAw==", + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/@typescript-eslint/visitor-keys/-/visitor-keys-6.21.0.tgz", + "integrity": "sha512-JJtkDduxLi9bivAB+cYOVMtbkqdPOhZ+ZI5LC47MIRrDV4Yn2o+ZnW10Nkmr28xRpSpdJ6Sm42Hjf2+REYXm0A==", "dev": true, "dependencies": { - "@typescript-eslint/types": "6.20.0", + "@typescript-eslint/types": "6.21.0", "eslint-visitor-keys": "^3.4.1" }, "engines": { @@ -3503,9 +3503,9 @@ } }, "node_modules/langchain": { - "version": "0.1.12", - "resolved": "https://registry.npmjs.org/langchain/-/langchain-0.1.12.tgz", - "integrity": "sha512-F3WK6KJGeA+gnXIrijKy892yEGzUOpO4pEWWphUrCxrtfjXh1hFcXfj5Oh14qGvaUCmn8ezBqQMJ/LhL6z3DhQ==", + "version": "0.1.13", + "resolved": "https://registry.npmjs.org/langchain/-/langchain-0.1.13.tgz", + "integrity": "sha512-A56Qi5Jlxc3b/G9ny/B6gsqujFp9dFpnT7HIKFCq/mS/5foCb0nQ8P2F/Pwv8COg6b/be9Gwu1WQ97L93uE+dw==", "dependencies": { "@anthropic-ai/sdk": "^0.9.1", "@langchain/community": "~0.0.20", @@ -3555,7 +3555,6 @@ "epub2": "^3.0.1", "fast-xml-parser": "^4.2.7", "google-auth-library": "^8.9.0", - "googleapis": "^126.0.1", "handlebars": "^4.7.8", "html-to-text": "^9.0.5", "ignore": "^5.2.0", @@ -3661,9 +3660,6 @@ "google-auth-library": { "optional": true }, - "googleapis": { - "optional": true - }, "handlebars": { "optional": 
true }, @@ -4449,9 +4445,9 @@ } }, "node_modules/prettier": { - "version": "3.2.4", - "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.2.4.tgz", - "integrity": "sha512-FWu1oLHKCrtpO1ypU6J0SbK2d9Ckwysq6bHj/uaCP26DxrPpppCLQRGVuqAxSTvhF00AcvDRyYrLNW7ocBhFFQ==", + "version": "3.2.5", + "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.2.5.tgz", + "integrity": "sha512-3/GWa9aOC0YeD7LUfvOG2NiDyhOWRvt1k+rcKhOuYnMY24iiCphgneUfJDyFXd6rZCAnuLBv6UeAULtrhT/F4A==", "dev": true, "bin": { "prettier": "bin/prettier.cjs" @@ -5141,9 +5137,9 @@ } }, "node_modules/vectordb": { - "version": "0.4.7", - "resolved": "https://registry.npmjs.org/vectordb/-/vectordb-0.4.7.tgz", - "integrity": "sha512-qrQb1uGPXmhfyup/BB6q43mcP6uG9mSedy38vyUcKj5l89WHhVVvxwisHC18Q7mUQbatC0bZMYrV3lMmRggOuw==", + "version": "0.4.8", + "resolved": "https://registry.npmjs.org/vectordb/-/vectordb-0.4.8.tgz", + "integrity": "sha512-UnxMRhmMfjDoGzIcBMd4oISBYE60UtHXyiSbzWbvYwko6CPqQ6pc3FzpXer2qsfvEVqlNXQX+hDeMZ6gf2mtdQ==", "cpu": [ "x64", "arm64" @@ -5161,11 +5157,11 @@ "axios": "^1.4.0" }, "optionalDependencies": { - "@lancedb/vectordb-darwin-arm64": "0.4.7", - "@lancedb/vectordb-darwin-x64": "0.4.7", - "@lancedb/vectordb-linux-arm64-gnu": "0.4.7", - "@lancedb/vectordb-linux-x64-gnu": "0.4.7", - "@lancedb/vectordb-win32-x64-msvc": "0.4.7" + "@lancedb/vectordb-darwin-arm64": "0.4.8", + "@lancedb/vectordb-darwin-x64": "0.4.8", + "@lancedb/vectordb-linux-arm64-gnu": "0.4.8", + "@lancedb/vectordb-linux-x64-gnu": "0.4.8", + "@lancedb/vectordb-win32-x64-msvc": "0.4.8" } }, "node_modules/weak-lru-cache": { diff --git a/package.json b/package.json index 4853ed59..8e2a5d7e 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@llm-tools/embedjs", - "version": "0.0.53", + "version": "0.0.57", "description": "A NodeJS RAG framework to easily work with LLMs and custom datasets", "main": "dist/index.js", "types": "dist/index.d.ts", @@ -49,22 +49,22 @@ "homepage": 
"https://github.com/llm-tools/embedjs#readme", "dependencies": { "@huggingface/inference": "^2.6.4", - "@langchain/cohere": "^0.0.3", + "@langchain/cohere": "^0.0.4", "@langchain/openai": "^0.0.14", "axios": "^1.6.7", "confluence.js": "^1.7.2", "debug": "^4.3.4", "html-to-text": "^9.0.5", - "langchain": "^0.1.12", + "langchain": "^0.1.13", "md5": "^2.3.0", "pdf-parse-fork": "^1.2.0", - "sitemapper": "^3.2.8", - "usetube": "^2.2.7", + "sitemapper": "^3.1.8", + "usetube": "^2.0.2", "uuid": "^9.0.1", "youtube-transcript": "^1.0.6" }, "devDependencies": { - "@pinecone-database/pinecone": "^1.1.2", + "@pinecone-database/pinecone": "^2.0.1", "@qdrant/js-client-rest": "^1.7.0", "@tsconfig/recommended": "^1.0.3", "@types/debug": "^4.1.12", @@ -72,8 +72,8 @@ "@types/md5": "^2.3.5", "@types/node": "^20.11.16", "@types/usetube": "^2.1.2", - "@typescript-eslint/eslint-plugin": "^6.20.0", - "@typescript-eslint/parser": "^6.20.0", + "@typescript-eslint/eslint-plugin": "^6.21.0", + "@typescript-eslint/parser": "^6.21.0", "chromadb": "^1.8.1", "cohere-ai": "^7.7.5", "eslint": "^8.56.0", @@ -82,28 +82,28 @@ "hnswlib-node": "^2.1.0", "ioredis": "^5.3.2", "lmdb": "^2.9.2", - "prettier": "^3.2.4", + "prettier": "^3.2.5", "rimraf": "^5.0.5", "typescript": "^5.3.3", - "vectordb": "^0.4.7", + "vectordb": "^0.4.8", "weaviate-ts-client": "^2.0.0" }, "peerDependencies": { - "@pinecone-database/pinecone": "^1.1.2", + "@pinecone-database/pinecone": "^2.0.1", "@qdrant/js-client-rest": "^1.7.0", "chromadb": "^1.8.1", "cohere-ai": "^7.7.5", "hnswlib-node": "^2.1.0", "ioredis": "^5.3.2", "lmdb": "^2.9.2", - "vectordb": "^0.4.7", + "vectordb": "^0.4.8", "weaviate-ts-client": "^2.0.0" }, "overrides": { - "@pinecone-database/pinecone": "^1.1.2", + "@pinecone-database/pinecone": "^2.0.1", "weaviate-ts-client": "^2.0.0", "hnswlib-node": "^2.1.0", - "vectordb": "^0.4.7" + "vectordb": "^0.4.8" }, "peerDependenciesMeta": { "@pinecone-database/pinecone": { diff --git a/src/core/llm-application.ts 
b/src/core/llm-application.ts index 45b605ec..9475992b 100644 --- a/src/core/llm-application.ts +++ b/src/core/llm-application.ts @@ -2,7 +2,7 @@ import createDebugMessages from 'debug'; import { BaseDb } from '../interfaces/base-db.js'; import { BaseLoader } from '../interfaces/base-loader.js'; -import { AddLoaderReturn, Chunk, EmbeddedChunk } from '../global/types.js'; +import { AddLoaderReturn, Chunk, EmbeddedChunk, LoaderChunk } from '../global/types.js'; import { LLMApplicationBuilder } from './llm-application-builder.js'; import { DEFAULT_INSERT_BATCH_SIZE } from '../global/constants.js'; import { cleanString, stringFormat } from '../util/strings.js'; @@ -78,28 +78,11 @@ export class LLMApplication { return this.vectorDb.insertChunks(embedChunks); } - public async addLoader(loader: BaseLoader): Promise { - const uniqueId = loader.getUniqueId(); - this.debug('Add loader called for', uniqueId); - await loader.init(); - - const chunks = await loader.getChunks(); - if (this.cache && (await this.cache.hasLoader(uniqueId))) { - const { chunkCount: previousChunkCount } = await this.cache.getLoader(uniqueId); - - const chunkIds: string[] = []; - for (let i = 0; i < previousChunkCount; i++) { - chunkIds.push(this.getChunkUniqueId(uniqueId, i)); - } - - this.debug(`Loader previously run. 
Deleting previous ${chunkIds.length} keys`, uniqueId); - if (chunkIds.length > 0) await this.vectorDb.deleteKeys(chunkIds); - } - + private async batchLoadChunks(uniqueId: string, incrementalGenerator: AsyncGenerator) { let batchSize = 0, newInserts = 0, formattedChunks: Chunk[] = []; - for await (const chunk of chunks) { + for await (const chunk of incrementalGenerator) { batchSize++; const formattedChunk = { @@ -117,10 +100,48 @@ export class LLMApplication { batchSize = 0; } } + newInserts += await this.batchLoadEmbeddings(uniqueId, formattedChunks); + return { newInserts, formattedChunks }; + } + + private async incrementalLoader(uniqueId: string, incrementalGenerator: AsyncGenerator) { + this.debug(`incrementalChunkAvailable for loader`, uniqueId); + const { newInserts } = await this.batchLoadChunks(uniqueId, incrementalGenerator); + this.debug(`${newInserts} new incrementalChunks processed`, uniqueId); + } + + public async addLoader(loader: BaseLoader): Promise { + const uniqueId = loader.getUniqueId(); + this.debug('Add loader called for', uniqueId); + await loader.init(); + + const chunks = await loader.getChunks(); + if (this.cache && (await this.cache.hasLoader(uniqueId))) { + const { chunkCount: previousChunkCount } = await this.cache.getLoader(uniqueId); + + const chunkIds: string[] = []; + for (let i = 0; i < previousChunkCount; i++) { + chunkIds.push(this.getChunkUniqueId(uniqueId, i)); + } + + this.debug(`Loader previously run. 
Deleting previous ${chunkIds.length} keys`, uniqueId); + if (chunkIds.length > 0) await this.vectorDb.deleteKeys(chunkIds); + } + + const { newInserts, formattedChunks } = await this.batchLoadChunks(uniqueId, chunks); if (this.cache) await this.cache.addLoader(uniqueId, formattedChunks.length); this.debug(`Add loader completed with ${newInserts} new entries for`, uniqueId); + + if (loader.canIncrementallyLoad) { + this.debug(`Registering incremental loader`, uniqueId); + + loader.on('incrementalChunkAvailable', async (incrementalGenerator) => { + await this.incrementalLoader(uniqueId, incrementalGenerator); + }); + } + return { entriesAdded: newInserts, uniqueId }; } diff --git a/src/interfaces/base-loader.ts b/src/interfaces/base-loader.ts index 2b3fa2f3..3e84bc44 100644 --- a/src/interfaces/base-loader.ts +++ b/src/interfaces/base-loader.ts @@ -1,4 +1,5 @@ import createDebugMessages from 'debug'; +import { EventEmitter } from 'node:events'; import { LoaderChunk } from '../global/types.js'; import { BaseCache } from './base-cache.js'; @@ -6,7 +7,7 @@ import { BaseCache } from './base-cache.js'; export abstract class BaseLoader< T extends Record = Record, M extends Record = Record, -> { +> extends EventEmitter { private static cache?: BaseCache; public static setCache(cache?: BaseCache) { @@ -14,14 +15,21 @@ export abstract class BaseLoader< } protected readonly uniqueId: string; + private readonly _canIncrementallyLoad: boolean; - constructor(uniqueId: string) { + constructor(uniqueId: string, canIncrementallyLoad: boolean = false) { + super(); this.uniqueId = uniqueId; + this._canIncrementallyLoad = canIncrementallyLoad; createDebugMessages('embedjs:loader:BaseLoader')(`New loader class initalized with key ${uniqueId}`); } async init() {} + public get canIncrementallyLoad() { + return this._canIncrementallyLoad; + } + getUniqueId(): string { return this.uniqueId; } @@ -45,5 +53,9 @@ export abstract class BaseLoader< return 
BaseLoader.cache.loaderCustomHas(this.getCustomCacheKey(key)); } + protected async loadIncrementalChunk(incrementalGenerator: AsyncGenerator, void, void>) { + this.emit('incrementalChunkAvailable', incrementalGenerator); + } + abstract getChunks(): AsyncGenerator, void, void>; } diff --git a/src/loaders/confluence-loader.ts b/src/loaders/confluence-loader.ts index 58625aa4..ad157e65 100644 --- a/src/loaders/confluence-loader.ts +++ b/src/loaders/confluence-loader.ts @@ -75,6 +75,7 @@ export class ConfluenceLoader extends BaseLoader<{ type: 'ConfluenceLoader' }> { } } + if (!content.body.view.value) continue; const webLoader = new WebLoader({ content: content.body.view.value }); for await (const result of await webLoader.getChunks()) { yield { diff --git a/src/loaders/medusa-loader.ts b/src/loaders/medusa-loader.ts deleted file mode 100644 index 1c513ef2..00000000 --- a/src/loaders/medusa-loader.ts +++ /dev/null @@ -1,47 +0,0 @@ -import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; -import createDebugMessages from 'debug'; -import { convert } from 'html-to-text'; -import axios from 'axios'; -import md5 from 'md5'; - -import { BaseLoader } from '../interfaces/base-loader.js'; -import { cleanString } from '../util/strings.js'; - -export class MedusaLoader extends BaseLoader<{ type: 'MedusaLoader' }> { - private readonly debug = createDebugMessages('embedjs:loader:MedusaLoader'); - private readonly url: string; - - constructor({ url }: { url: string }) { - super(`MedusaLoader_${md5(url)}`); - this.url = url; - } - - override async *getChunks() { - const chunker = new RecursiveCharacterTextSplitter({ chunkSize: 2000, chunkOverlap: 0 }); - - try { - const { data } = await axios.get(this.url, { responseType: 'document' }); - const text = convert(data, { - wordwrap: false, - }); - - let i = 0; - const chunks = await chunker.splitText(cleanString(text)); - for (const chunk of chunks) { - yield { - pageContent: chunk, - contentHash: md5(chunk), - 
metadata: { - type: <'MedusaLoader'>'MedusaLoader', - source: this.url, - chunkId: i, - }, - }; - - i++; - } - } catch (e) { - this.debug('Could not parse website url', this.url, e); - } - } -} diff --git a/src/vectorDb/pinecone-db.ts b/src/vectorDb/pinecone-db.ts index 6728f876..dc50e445 100644 --- a/src/vectorDb/pinecone-db.ts +++ b/src/vectorDb/pinecone-db.ts @@ -1,3 +1,4 @@ +import { CreateIndexRequestSpec } from '@pinecone-database/pinecone/dist/pinecone-generated-ts-fetch/index.js'; import { Pinecone, PineconeRecord } from '@pinecone-database/pinecone'; import createDebugMessages from 'debug'; @@ -7,28 +8,37 @@ import { createArrayChunks, mapAsync } from '../util/arrays.js'; export class PineconeDb implements BaseDb { private readonly debug = createDebugMessages('embedjs:vector:PineconeDb'); - private static readonly PINECONE_INSERT_CHUNK_SIZE = 500; + private static readonly PINECONE_INSERT_CHUNK_SIZE = 200; //Pinecone only allows inserting 2MB worth of chunks at a time; this is an approximation private readonly client: Pinecone; private readonly namespace: string; private readonly projectName: string; + private readonly indexSpec: CreateIndexRequestSpec; + + constructor({ + projectName, + namespace, + indexSpec, + }: { + projectName: string; + namespace: string; + indexSpec: CreateIndexRequestSpec; + }) { + this.client = new Pinecone(); - constructor({ projectName, namespace }: { projectName: string; namespace: string }) { - this.client = new Pinecone({ - apiKey: process.env.PINECONE_API_KEY, - environment: process.env.PINECONE_ENVIRONMENT, - }); this.projectName = projectName; this.namespace = namespace; + this.indexSpec = indexSpec; } async init({ dimensions }: { dimensions: number }) { - const list = (await this.client.listIndexes()).map((i) => i.name); + const list = (await this.client.listIndexes()).indexes.map((i) => i.name); if (list.indexOf(this.projectName) > -1) return; await this.client.createIndex({ name: this.projectName, dimension: dimensions, 
+ spec: this.indexSpec, metric: 'cosine', }); }