From 4678d3d7746f5b992a0ad5b55d3c79c99c109802 Mon Sep 17 00:00:00 2001 From: Phil Nash Date: Thu, 26 Sep 2024 22:28:12 +0800 Subject: [PATCH 1/2] Implements the Astra DB vector store. --- package-lock.json | 148 ++++++++++++++++++++++++++++++++++++++- package.json | 10 ++- src/vectorDb/astra-db.ts | 69 ++++++++++++++++++ 3 files changed, 223 insertions(+), 4 deletions(-) create mode 100644 src/vectorDb/astra-db.ts diff --git a/package-lock.json b/package-lock.json index 29c30f9d..20a33fd1 100644 --- a/package-lock.json +++ b/package-lock.json @@ -75,6 +75,7 @@ }, "peerDependencies": { "@azure/cosmos": "^4.1.1", + "@datastax/astra-db-ts": "^1.5.0", "@lancedb/lancedb": "^0.10.0", "@pinecone-database/pinecone": "^3.0.3", "@qdrant/js-client-rest": "^1.11.0", @@ -1926,6 +1927,21 @@ "integrity": "sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw==", "dev": true }, + "node_modules/@datastax/astra-db-ts": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@datastax/astra-db-ts/-/astra-db-ts-1.5.0.tgz", + "integrity": "sha512-Z9pEVyyHfglh8XAKrIASxdvORdei4pLUKDDGarqYvBkA9B9rKdqqdN+4I42Dz8paU5uscu8FwM5mc+Ly/U6jfA==", + "peer": true, + "dependencies": { + "fetch-h2": "^3.0.2", + "safe-stable-stringify": "^2.4.3", + "typed-emitter": "^2.1.0", + "uuidv7": "^0.6.3" + }, + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/@eslint-community/eslint-utils": { "version": "4.4.0", "resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.4.0.tgz", @@ -4606,6 +4622,12 @@ "integrity": "sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==", "dev": true }, + "node_modules/@types/tough-cookie": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.5.tgz", + "integrity": "sha512-/Ad8+nIOV7Rl++6f1BdKxFSMgmoqEoYbHRpPcx3JEfv8VRsQe9Z4mCXeJBzxs7mbHY/XOZZuXlRNfhpVPbs6ZA==", + "peer": true + }, 
"node_modules/@types/usetube": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/@types/usetube/-/usetube-2.1.2.tgz", @@ -4892,6 +4914,12 @@ "node": ">= 8.0.0" } }, + "node_modules/already": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/already/-/already-2.2.1.tgz", + "integrity": "sha512-qk6RIVMS/R1yTvBzfIL1T76PsIL7DIVCINoLuFw2YXKLpLtsTobqdChMs8m3OhuPS3CEE3+Ra5ibYiqdyogbsQ==", + "peer": true + }, "node_modules/ansi-escapes": { "version": "4.3.2", "resolved": "https://registry.npmjs.org/ansi-escapes/-/ansi-escapes-4.3.2.tgz", @@ -5376,6 +5404,12 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/callguard": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/callguard/-/callguard-2.0.0.tgz", + "integrity": "sha512-I3nd+fuj20FK1qu00ImrbH+II+8ULS6ioYr9igqR1xyqySoqc3DiHEyUM0mkoAdKeLGg2CtGnO8R3VRQX5krpQ==", + "peer": true + }, "node_modules/callsites": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", @@ -6645,6 +6679,24 @@ "bser": "2.1.1" } }, + "node_modules/fetch-h2": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/fetch-h2/-/fetch-h2-3.0.2.tgz", + "integrity": "sha512-Lo6UPdMKKc9Ond7yjG2vq0mnocspOLh1oV6+XZdtfdexacvMSz5xm3WoQhTAdoR2+UqPlyMNqcqfecipoD+l/A==", + "peer": true, + "dependencies": { + "@types/tough-cookie": "^4.0.0", + "already": "^2.2.1", + "callguard": "^2.0.0", + "get-stream": "^6.0.1", + "through2": "^4.0.2", + "to-arraybuffer": "^1.0.1", + "tough-cookie": "^4.0.0" + }, + "engines": { + "node": ">=12" + } + }, "node_modules/fflate": { "version": "0.8.1", "resolved": "https://registry.npmjs.org/fflate/-/fflate-0.8.1.tgz", @@ -9728,6 +9780,12 @@ "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" }, + "node_modules/psl": { + "version": "1.9.0", + "resolved": 
"https://registry.npmjs.org/psl/-/psl-1.9.0.tgz", + "integrity": "sha512-E/ZsdU4HLs/68gYzgGTkMicWTLPdAftJLfJFlLUAAKZGkStNU72sZjT66SnMDVOfOWY/YAoiD7Jxa9iHvngcag==", + "peer": true + }, "node_modules/pump": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", @@ -9741,7 +9799,6 @@ "version": "2.3.1", "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", - "devOptional": true, "engines": { "node": ">=6" } @@ -9776,6 +9833,12 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/querystringify": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/querystringify/-/querystringify-2.2.0.tgz", + "integrity": "sha512-FIqgj2EUvTa7R50u0rGsyTftzjYmv/a3hO345bZNrqabNqjtgiDMgmo4mkUjd+nzU5oF3dClKqFIPUKybUyqoQ==", + "peer": true + }, "node_modules/queue-microtask": { "version": "1.2.3", "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", @@ -9902,6 +9965,12 @@ "node": ">=0.10.0" } }, + "node_modules/requires-port": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/requires-port/-/requires-port-1.0.0.tgz", + "integrity": "sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ==", + "peer": true + }, "node_modules/resolve": { "version": "1.22.8", "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.8.tgz", @@ -10075,6 +10144,16 @@ "queue-microtask": "^1.2.2" } }, + "node_modules/rxjs": { + "version": "7.8.1", + "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.1.tgz", + "integrity": "sha512-AA3TVj+0A2iuIoQkWEK/tqFjBq2j+6PO6Y0zJcvzLAFhEFIO3HL0vls9hWLncZbAAbK0mar7oZ4V079I/qPMxg==", + "optional": true, + "peer": true, + "dependencies": { + "tslib": "^2.1.0" + } + }, "node_modules/safe-buffer": { "version": "5.2.1", "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", @@ 
-10094,6 +10173,15 @@ } ] }, + "node_modules/safe-stable-stringify": { + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/safe-stable-stringify/-/safe-stable-stringify-2.5.0.tgz", + "integrity": "sha512-b3rppTKm9T+PsVCBEOUR46GWI7fdOs00VKZ1+9c1EWDaDMvjQc6tUwuFyIprgGgTcWoVHSKrU8H31ZHA2e0RHA==", + "peer": true, + "engines": { + "node": ">=10" + } + }, "node_modules/safer-buffer": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", @@ -10762,6 +10850,12 @@ "integrity": "sha512-3f0uOEAQwIqGuWW2MVzYg8fV/QNnc/IpuJNG837rLuczAaLVHslWHZQj4IGiEl5Hs3kkbhwL9Ab7Hrsmuj+Smw==", "dev": true }, + "node_modules/to-arraybuffer": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/to-arraybuffer/-/to-arraybuffer-1.0.1.tgz", + "integrity": "sha512-okFlQcoGTi4LQBG/PgSYblw9VOyptsz2KJZqc6qtgGdes8VktzUQkj4BI2blit072iS8VODNcMA+tvnS9dnuMA==", + "peer": true + }, "node_modules/to-fast-properties": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/to-fast-properties/-/to-fast-properties-2.0.0.tgz", @@ -10799,6 +10893,21 @@ "url": "https://github.com/sponsors/Borewit" } }, + "node_modules/tough-cookie": { + "version": "4.1.4", + "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.1.4.tgz", + "integrity": "sha512-Loo5UUvLD9ScZ6jh8beX1T6sO1w2/MpCRpEP7V280GKMVUQ0Jzar2U3UJPsrdbziLEMMhu3Ujnq//rhiFuIeag==", + "peer": true, + "dependencies": { + "psl": "^1.1.33", + "punycode": "^2.1.1", + "universalify": "^0.2.0", + "url-parse": "^1.5.3" + }, + "engines": { + "node": ">=6" + } + }, "node_modules/tr46": { "version": "4.1.1", "resolved": "https://registry.npmjs.org/tr46/-/tr46-4.1.1.tgz", @@ -10897,6 +11006,15 @@ "node": ">=4" } }, + "node_modules/typed-emitter": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/typed-emitter/-/typed-emitter-2.1.0.tgz", + "integrity": "sha512-g/KzbYKbH5C2vPkaXGu8DJlHrGKHLsM25Zg9WuC9pMGfuvT+X25tZQWo5fK1BjBm8+UrVE9LDCvaY0CQk+fXDA==", + "peer": true, + 
"optionalDependencies": { + "rxjs": "*" + } + }, "node_modules/typescript": { "version": "5.6.2", "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.6.2.tgz", @@ -10942,6 +11060,15 @@ "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==" }, + "node_modules/universalify": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.2.0.tgz", + "integrity": "sha512-CJ1QgKmNg3CwvAv/kOFmtnEN05f0D/cn9QntgNOQlQF9dgvVTHj3t+8JPdjqawCHk7V/KA+fbUqzZ9XWhcqPUg==", + "peer": true, + "engines": { + "node": ">= 4.0.0" + } + }, "node_modules/update-browserslist-db": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.1.0.tgz", @@ -10986,6 +11113,16 @@ "resolved": "https://registry.npmjs.org/url-join/-/url-join-4.0.1.tgz", "integrity": "sha512-jk1+QP6ZJqyOiuEI9AEWQfju/nB2Pw466kbA0LEZljHwKeMgd9WrAEgEGxjPDD2+TNbbb37rTyhEfrCXfuKXnA==" }, + "node_modules/url-parse": { + "version": "1.5.10", + "resolved": "https://registry.npmjs.org/url-parse/-/url-parse-1.5.10.tgz", + "integrity": "sha512-WypcfiRhfeUP9vvF0j6rw0J3hrWrw6iZv3+22h6iRMJ/8z1Tj6XfLP4DsUix5MhMPnXpiHDoKyoZ/bdCkwBCiQ==", + "peer": true, + "dependencies": { + "querystringify": "^2.1.1", + "requires-port": "^1.0.0" + } + }, "node_modules/usetube": { "version": "2.2.7", "resolved": "https://registry.npmjs.org/usetube/-/usetube-2.2.7.tgz", @@ -11021,6 +11158,15 @@ "uuid": "dist/bin/uuid" } }, + "node_modules/uuidv7": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/uuidv7/-/uuidv7-0.6.3.tgz", + "integrity": "sha512-zV3eW2NlXTsun/aJ7AixxZjH/byQcH/r3J99MI0dDEkU2cJIBJxhEWUHDTpOaLPRNhebPZoeHuykYREkI9HafA==", + "peer": true, + "bin": { + "uuidv7": "cli.js" + } + }, "node_modules/v8-to-istanbul": { "version": "9.3.0", "resolved": 
"https://registry.npmjs.org/v8-to-istanbul/-/v8-to-istanbul-9.3.0.tgz", diff --git a/package.json b/package.json index 016febb7..74e4e342 100644 --- a/package.json +++ b/package.json @@ -82,6 +82,7 @@ }, "devDependencies": { "@azure/cosmos": "^4.1.1", + "@lancedb/lancedb": "^0.10.0", "@pinecone-database/pinecone": "^3.0.3", "@qdrant/js-client-rest": "^1.11.0", "@tsconfig/recommended": "^1.0.7", @@ -114,11 +115,12 @@ "rimraf": "^6.0.1", "ts-jest": "29.2.5", "typescript": "^5.6.2", - "@lancedb/lancedb": "^0.10.0", "weaviate-ts-client": "^2.2.0" }, "peerDependencies": { "@azure/cosmos": "^4.1.1", + "@datastax/astra-db-ts": "^1.5.0", + "@lancedb/lancedb": "^0.10.0", "@pinecone-database/pinecone": "^3.0.3", "@qdrant/js-client-rest": "^1.11.0", "chromadb": "^1.9.2", @@ -127,8 +129,7 @@ "ioredis": "^5.4.1", "lmdb": "^3.1.3", "mongodb": "^6.9.0", - "weaviate-ts-client": "^2.2.0", - "@lancedb/lancedb": "^0.10.0" + "weaviate-ts-client": "^2.2.0" }, "overrides": { "@pinecone-database/pinecone": "^3.0.3", @@ -168,6 +169,9 @@ }, "@azure/cosmos": { "optional": true + }, + "@datastax/astra-db-ts": { + "optional": true } }, "engines": { diff --git a/src/vectorDb/astra-db.ts b/src/vectorDb/astra-db.ts new file mode 100644 index 00000000..fce3b0fb --- /dev/null +++ b/src/vectorDb/astra-db.ts @@ -0,0 +1,69 @@ +import { BaseDb } from '../interfaces/base-db.js'; +import { Collection, DataAPIClient, Db } from '@datastax/astra-db-ts'; +import { InsertChunkData, ExtractChunkData } from '../global/types.js'; +/** Vector store backed by an Astra DB collection, accessed through the @datastax/astra-db-ts Data API client. 
*/ +export class AstraDb implements BaseDb { + private db: Db; + private collectionName: string; + private collection: Collection; + private dimensions: number; + /** Builds the Data API client from `apiKey` and binds it to `endpoint` / `namespace` (defaults to 'default_keyspace'); `collection` stays unset until init() runs. */ + constructor({ + endpoint, + apiKey, + collectionName, + namespace = 'default_keyspace', + }: { + endpoint: string; + apiKey: string; + namespace?: string; + collectionName: string; + }) { + const client = new DataAPIClient(apiKey); + this.db = client.db(endpoint, { namespace }); + this.collectionName = collectionName; + } + /** Stores `dimensions` (reset() re-uses it) and calls createCollection with a cosine-metric vector index of that size. NOTE(review): `checkExists: false` presumably lets this succeed when the collection already exists - confirm against the astra-db-ts docs. */
+ async init({ dimensions }: { dimensions: number }): Promise<void> { + this.dimensions = dimensions; + this.collection = await this.db.createCollection(this.collectionName, { + vector: { dimension: dimensions, metric: 'cosine' }, + checkExists: false, + }); + } + /** Writes one document per chunk ($vector, metadata, pageContent) and resolves to the `insertedCount` reported by insertMany. */ + async insertChunks(chunks: InsertChunkData[]): Promise<number> { + const result = await this.collection.insertMany( + chunks.map((chunk) => ({ + $vector: chunk.vector, + metadata: chunk.metadata, + pageContent: chunk.pageContent, + })), + ); + return result.insertedCount; + } + /** Vector-sorted find limited to `k` documents; with `includeSimilarity` the score is returned on each document as `$similarity` (not `similarity`). */ + async similaritySearch(query: number[], k: number): Promise<ExtractChunkData[]> { + const cursor = this.collection.find({}, { sort: { $vector: query }, limit: k, includeSimilarity: true }); + const results = await cursor.toArray(); + return results.map((result) => ({ + score: result.$similarity, + pageContent: result.pageContent, + metadata: result.metadata, + })); + } + + async getVectorCount(): Promise<number> { + // This gives a very rough estimate of the number of documents in the collection. It is not guaranteed to be accurate, and should not be used as a source of truth for the number of documents in the collection. 
+ return this.collection.estimatedDocumentCount(); + } + /** Deletes every document whose `metadata.uniqueLoaderId` equals `uniqueLoaderId`; resolves to true when at least one document was deleted. */ + async deleteKeys(uniqueLoaderId: string): Promise<boolean> { + const result = await this.collection.deleteMany({ 'metadata.uniqueLoaderId': uniqueLoaderId }); + return result.deletedCount > 0; + } + async reset(): Promise<void> { /* drops the collection, then re-runs init() with the dimensions saved by the previous init() call */ + await this.collection.drop(); + await this.init({ dimensions: this.dimensions }); + } +} From 1fcbfcbcf81373c6cf035ed6b648250f899ced1f Mon Sep 17 00:00:00 2001 From: Phil Nash Date: Mon, 30 Sep 2024 11:03:11 +1000 Subject: [PATCH 2/2] Adds documentation in README for Astra DB --- README.md | 52 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 89febec6..6a8475df 100644 --- a/README.md +++ b/README.md @@ -60,17 +60,17 @@ The author(s) are looking to add core maintainers for this opensource project. R - [Delete loader](#delete-loader) - [Get count of embedded chunks](#get-count-of-embedded-chunks) - [Remove all embeddings / reset](#remove-all-embeddings--reset) - - [Set relevance cutoff](#set-cut-off-for-relevance) - - [Add new loader after init](#add-new-loaders-later) + - [Set cut-off for relevance](#set-cut-off-for-relevance) + - [Add new loaders later](#add-new-loaders-later) - [Loader inference](#loader-inference) - [Loaders supported](#loaders-supported) - [Youtube video](#youtube-video) - [Youtube channel](#youtube-channel) - [Youtube search](#youtube-search) - [PDF file](#pdf-file) - - [Word document](#docx-file) - - [Excel document](#excel-file) - - [Powerpoint document](#powerpoint-file) + - [Docx file](#docx-file) + - [Excel file](#excel-file) + - [Powerpoint file](#powerpoint-file) - [Web page](#web-page) - [Confluence](#confluence) - [Sitemap](#sitemap) @@ -95,7 +95,7 @@ The author(s) are looking to add core maintainers for this opensource project. 
R - [Ada](#ada) - [Cohere](#cohere) - [Gecko Embedding](#gecko-embedding) - - [Ollama Embedding](#ollama-local-embedding) + - [Ollama local embedding](#ollama-local-embedding) - [Use custom embedding model](#use-custom-embedding-model) - [More embedding models coming soon](#more-embedding-models-coming-soon) - [Vector databases supported](#vector-databases-supported) @@ -105,22 +105,23 @@ The author(s) are looking to add core maintainers for this opensource project. R - [HNSWLib](#hnswlib) - [Weaviate](#weaviate) - [Qdrant](#qdrant) - - [MongoDB](#mongodb-vector-database) + - [MongoDB (vector database)](#mongodb-vector-database) + - [Astra DB](#astra-db) - [Bring your own database](#bring-your-own-database) - [More databases coming soon](#more-databases-coming-soon) - [Caches](#caches) - [LMDB](#lmdb) - - [InMemory](#inmemory-cache) + - [InMemory (cache)](#inmemory-cache) - [Redis](#redis) - - [MongoDb](#mongodb-cache) + - [MongoDB (cache)](#mongodb-cache) - [Bring your own cache](#bring-your-own-cache) - [More caches coming soon](#more-caches-coming-soon) - [Conversation history](#conversation-history) - - [InMemory](#inmemory-conversation) - - [MongoDb](#mongodb-conversation) + - [InMemory (conversation)](#inmemory-conversation) + - [MongoDB (conversation)](#mongodb-conversation) - [Langsmith Integration](#langsmith-integration) - [Sample projects](#sample-projects) -- [Contributors](#contributors) +- [Contributing](#contributing) # Getting started @@ -959,6 +960,33 @@ import { MongoDb } from '@llm-tools/embedjs/vectorDb/mongodb'; **Note:** you can also optionally configure the database and collection name the library will use with the constructor parameters `dbName` and `collectionName`. Default values are used if these are not provided. +## Astra DB + +[Astra DB is a document database with a highly performant vector index](https://www.datastax.com/products/datastax-astra) powered by Apache Cassandra and available as a managed service. 
+ +To use Astra DB as your vector database, follow these steps: + +- [Sign up for an Astra DB account](https://astra.datastax.com/signup). It is free to sign up and doesn't require a credit card. +- Create a database (this takes a couple of minutes to provision) +- From the database overview page, get the API Endpoint and generate an Application Token +- Install the Astra DB package in your project: + ```bash + npm install @datastax/astra-db-ts + ``` +- Set Astra DB as your choice of `vectorDb` (pass the optional `namespace` parameter if your collection does not live in `default_keyspace`) + + ```TS + import { AstraDb } from '@llm-tools/embedjs/vectorDb/astra-db'; + + .setVectorDb( + new AstraDb({ + endpoint: process.env.ASTRA_DB_API_ENDPOINT, + apiKey: process.env.ASTRA_DB_APP_TOKEN, + collectionName: "documents" + }), + ) + ``` + ## Bring your own database You can pass along your vector database to the `setVectorDb` method by implementing the interface `BaseDb`. Here's how that would look like -