From 8de829c64ab4f677a9bc9497c9ae8254969afac7 Mon Sep 17 00:00:00 2001 From: Adhityan K V Date: Wed, 24 Jan 2024 12:50:31 +0100 Subject: [PATCH] added confluence loader --- README.md | 72 +++++++-- package-lock.json | 243 +++++++++++++++++++------------ package.json | 24 +-- src/core/llm-application.ts | 2 +- src/index.ts | 2 + src/loaders/confluence-loader.ts | 88 +++++++++++ src/loaders/web-loader.ts | 26 +++- src/vectorDb/lance-db.ts | 5 +- src/vectorDb/qdrant-db.ts | 1 - tsconfig.json | 4 +- 10 files changed, 334 insertions(+), 133 deletions(-) create mode 100644 src/loaders/confluence-loader.ts diff --git a/README.md b/README.md index d4f315fc..29f7a7f3 100644 --- a/README.md +++ b/README.md @@ -52,8 +52,11 @@ The library also supports caches which provide caching for embeddings, loaders a - [Dry run](#get-context) - [Loaders supported](#loaders-supported) - [Youtube](#youtube-video) + - [Youtube channels](#youtube-channel) + - [Youtube search](#youtube-search) - [PDF](#pdf-file) - [Web page](#web-page) + - [Confluence](#confluence) - [Text](#text) - [Custom loader](#add-a-custom-loader) - [How to request more loaders](#more-loaders-coming-soon) @@ -62,6 +65,7 @@ The library also supports caches which provide caching for embeddings, loaders a - [LanceDB](#lancedb) - [Chroma](#chroma) - [HNSWLib](#hnswlib) + - [Weaviate](#weaviate) - [Own Database](#bring-your-own-database) - [How to request new vector databases](#more-databases-coming-soon) - [Caches](#caches) @@ -204,6 +208,22 @@ To add any youtube video to your app, use `YoutubeLoader`. .addLoader(new YoutubeLoader({ videoIdOrUrl: 'w2KbwC-s7pY' })) ``` +## Youtube channel + +To add all videos in a youtube channel, use `YoutubeChannelLoader`. + +```TS +.addLoader(new YoutubeChannelLoader({ channelId: '...' })) +``` + +## Youtube search + +To do a general youtube search and add the popular search results, use `YoutubeSearchLoader`. + +```TS +.addLoader(new YoutubeSearchLoader({ searchString: '...' 
})) +``` + ## PDF file To add a pdf file, use `PdfLoader`. You can add a local file - @@ -234,6 +254,16 @@ To add a web page, use `WebLoader`. .addLoader(new WebLoader({ url: 'https://en.wikipedia.org/wiki/Formula_One' })) ``` +## Confluence + +To add a confluence space, use `ConfluenceLoader`. + +```TS +.addLoader(new ConfluenceLoader({ spaceNames: ['...'] })) +``` + +**Note:** The confluence space name is the value you see in the url in the space overview page `/wiki/spaces/{{ space name }}/overview`. + ## Text To supply your own text, use `TextLoader`. @@ -331,7 +361,7 @@ In this case, the `path` property is used as a prefix to create the temporary di npm install chromadb ``` -- Set Chhroma database as your choice of `vectorDb` +- Set Chroma database as your choice of `vectorDb` ```TS .setVectorDb(new ChromaDb({ url: 'http://localhost:8000' })) @@ -361,6 +391,22 @@ npm install hnswlib-node **Note:** This is a purely in-memory vector store. All values are lost when application is restarted. +## Weaviate + +[Weaviate](https://weaviate.io/) is an open source vector store. You can deploy it locally on docker or use their managed cloud offering. Follow these steps to use Weaviate as your vector database - + +- Install Weaviate package in your project + +```bash +npm install weaviate-ts-client +``` + +- Set Weaviate database as your choice of `vectorDb` + +```TS +.setVectorDb(new WeaviateDb()) +``` + ## Bring your own database You can pass along your vector database to the `setVectorDb` method by implementing the interface `BaseDb`. Here's how that would look like - @@ -540,14 +586,16 @@ Once done, you can pass this class to the `setEmbeddingModel` method like shown ## More embedding models coming soon -If you want us to add support for a specific embedding model, please create an [issue](https://github.com/llm-tools/embedjs/issues) and we will prioritize it. 
Our current priority is to add support for the [HuggingFace's Sentence Transformer](https://huggingface.co/sentence-transformers) model. All PRs are welcome. +If you want us to add support for a specific embedding model, please create an [issue](https://github.com/llm-tools/embedjs/issues) and we will prioritize it. Our current priority is to add support for the [HuggingFace's Models](https://huggingface.co/sentence-transformers). Support for the open source models under HuggingFace is available in alpha - please set `HuggingFace` as your choice of model to test. + +All PRs are welcome. # Azure OpenAI In order to be able to use an OpenAI model on Azure, it first needs to be deployed. Please refer to [Azure OpenAI documentation](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/) on how to deploy a model on Azure. To run this library, you will need to deploy two models - - text-embedding-ada -- GPT-3.5-turbo +- GPT-3.5-turbo (or the 4 series) Once these models are deployed, using Azure OpenAI instead of the regular OpenAI is easy to do. Just follow these steps - @@ -568,25 +616,17 @@ export OPENAI_API_KEY= **NOTE:** At the time of writing this, Azure OpenAI is an invite only program. -# Important dependencies - -EmbedJS is built on top of the fantastic work being done on OpenAI and the open source community behind it. Internally it uses - - -- [Langchain](https://github.com/hwchase17/langchain) a fantastic LLM library -- [OpenAI Ada embedding](https://platform.openai.com/docs/guides/embeddings) to create embeddings -- [OpenAI ChatGPT API](https://platform.openai.com/docs/guides/gpt/chat-completions-api) as the LLM to get answers to prompts. 
- # Projects Here's a list of projects / examples built with EmbedJs -| **Project** | **Description** | **Author** | -| -------------------------------------------------------------------------------- | ----------------------------- | ------------ | -| [nestjs-embedjs-template](https://github.com/llm-tools/nestjs-embedjs-template) | A NestJS server side template | K V Adhityan | -| [nextjs-chatbot-template](https://github.com/llm-tools/chat-bot-nextjs-template) | A NextJS chat bot template | K V Adhityan | +| **Project** | **Description** | +| -------------------------------------------------------------------------------- | ----------------------------- | +| [nestjs-embedjs-template](https://github.com/llm-tools/nestjs-embedjs-template) | A NestJS server side template | +| [nextjs-chatbot-template](https://github.com/llm-tools/chat-bot-nextjs-template) | A NextJS chat bot template | # Contributors - [K V Adhityan](https://adhityan.com/) -Looking for contrbutors to add to the list above. +Looking for contributors to add to the list above. Reach out to me on LinkedIn if you are interested in contributing. 
diff --git a/package-lock.json b/package-lock.json index 1a66b5c9..fd3b0d20 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,21 +1,22 @@ { "name": "@llm-tools/embedjs", - "version": "0.0.43", + "version": "0.0.44", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@llm-tools/embedjs", - "version": "0.0.43", + "version": "0.0.44", "license": "Apache-2.0", "dependencies": { "@huggingface/inference": "^2.6.4", "@langchain/cohere": "^0.0.2", - "@langchain/openai": "^0.0.11", + "@langchain/openai": "^0.0.12", "axios": "^1.6.5", + "confluence.js": "^1.7.2", "debug": "^4.3.4", "html-to-text": "^9.0.5", - "langchain": "^0.1.2", + "langchain": "^0.1.6", "md5": "^2.3.0", "pdf-parse-fork": "^1.2.0", "sitemapper": "^3.2.8", @@ -24,25 +25,25 @@ "youtube-transcript": "^1.0.6" }, "devDependencies": { - "@pinecone-database/pinecone": "^1.1.3", + "@pinecone-database/pinecone": "^1.1.2", "@qdrant/js-client-rest": "^1.7.0", "@tsconfig/recommended": "^1.0.3", "@types/debug": "^4.1.12", "@types/html-to-text": "^9.0.4", "@types/md5": "^2.3.5", - "@types/node": "^20.11.0", + "@types/node": "^20.11.6", "@types/usetube": "^2.1.2", - "@typescript-eslint/eslint-plugin": "^6.18.1", - "@typescript-eslint/parser": "^6.18.1", - "chromadb": "^1.7.3", - "cohere-ai": "^7.6.2", + "@typescript-eslint/eslint-plugin": "^6.19.1", + "@typescript-eslint/parser": "^6.19.1", + "chromadb": "^1.8.1", + "cohere-ai": "^7.7.3", "eslint": "^8.56.0", "eslint-config-prettier": "^9.1.0", "eslint-plugin-prettier": "^5.1.3", "hnswlib-node": "^2.1.0", "ioredis": "^5.3.2", "lmdb": "^2.9.2", - "prettier": "^3.2.1", + "prettier": "^3.2.4", "rimraf": "^5.0.5", "typescript": "^5.3.3", "vectordb": "^0.4.3", @@ -436,11 +437,11 @@ } }, "node_modules/@langchain/community": { - "version": "0.0.15", - "resolved": "https://registry.npmjs.org/@langchain/community/-/community-0.0.15.tgz", - "integrity": "sha512-Dc9IqDa1BpkGQPUsOfSC/sD1a13GpEO6LVe+v4oI8wG+ncx4/5epteXsPPfhg+u/u6sK0jqaOrg18dCjQD1YYw==", + 
"version": "0.0.19", + "resolved": "https://registry.npmjs.org/@langchain/community/-/community-0.0.19.tgz", + "integrity": "sha512-4+vpEINOvCZVqjcVVmctvA7ewMSopRbqJ/leDx7hvkeW1iC/aZJDqzWZPxiYNM0VxmB8qeTPFSMB0l4AKwO74w==", "dependencies": { - "@langchain/core": "~0.1.9", + "@langchain/core": "~0.1.16", "@langchain/openai": "~0.0.10", "flat": "^5.0.2", "langsmith": "~0.0.48", @@ -772,9 +773,9 @@ } }, "node_modules/@langchain/core": { - "version": "0.1.12", - "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.1.12.tgz", - "integrity": "sha512-lBfPEjcizJzkZjVTNJp0j+a85BFaXEjzyiUlsh7GwZRERwkrMEV2vtCSRrujbsnZcHzxN67K2bL02KHNLgWkOg==", + "version": "0.1.17", + "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.1.17.tgz", + "integrity": "sha512-PNmQgyAsDFm3DsZD+Djmm+sxH8xTGMlAryhYNgTg1Wkvhh+ztCqcVVYAv+aWch8CM56FBYMD8Guq0TJuRJJxEA==", "dependencies": { "ansi-styles": "^5.0.0", "camelcase": "6", @@ -785,18 +786,27 @@ "p-queue": "^6.6.2", "p-retry": "4", "uuid": "^9.0.0", - "zod": "^3.22.3" + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" }, "engines": { "node": ">=18" } }, + "node_modules/@langchain/core/node_modules/zod-to-json-schema": { + "version": "3.22.3", + "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.22.3.tgz", + "integrity": "sha512-9isG8SqRe07p+Aio2ruBZmLm2Q6Sq4EqmXOiNpDxp+7f0LV6Q/LX65fs5Nn+FV/CzfF3NLBoksXbS2jNYIfpKw==", + "peerDependencies": { + "zod": "^3.22.4" + } + }, "node_modules/@langchain/openai": { - "version": "0.0.11", - "resolved": "https://registry.npmjs.org/@langchain/openai/-/openai-0.0.11.tgz", - "integrity": "sha512-km0N+b1bcbtHVDmTEGs7LJFsCrjAcOMnkYlBQrhuIcyhJ052iF4zPpfptScQcZ25dW1yJJI8DS6dVVeM76l52w==", + "version": "0.0.12", + "resolved": "https://registry.npmjs.org/@langchain/openai/-/openai-0.0.12.tgz", + "integrity": "sha512-MR9x1xRXwJpdYlVx9Tga89q/MvxPrSTYyA5vy9tQ8dfQHNWnlgmI4gB/hDIsWUu1ooScagD4wW+aTnohTX+g+g==", "dependencies": { - "@langchain/core": "~0.1.9", + 
"@langchain/core": "~0.1.13", "js-tiktoken": "^1.0.7", "openai": "^4.24.2", "zod": "^3.22.3", @@ -1244,9 +1254,9 @@ "dev": true }, "node_modules/@types/node": { - "version": "20.11.0", - "resolved": "https://registry.npmjs.org/@types/node/-/node-20.11.0.tgz", - "integrity": "sha512-o9bjXmDNcF7GbM4CNQpmi+TutCgap/K3w1JyKgxAjqx41zp9qlIAVFi0IhCNsJcXolEqLWhbFbEeL0PvYm4pcQ==", + "version": "20.11.6", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.11.6.tgz", + "integrity": "sha512-+EOokTnksGVgip2PbYbr3xnR7kZigh4LbybAfBAw5BpnQ+FqBYUsvCEjYd70IXKlbohQ64mzEYmMtlWUY8q//Q==", "dependencies": { "undici-types": "~5.26.4" } @@ -1297,16 +1307,16 @@ "integrity": "sha512-WUtIVRUZ9i5dYXefDEAI7sh9/O7jGvHg7Df/5O/gtH3Yabe5odI3UWopVR1qbPXQtvOxWu3mM4XxlYeZtMWF4g==" }, "node_modules/@typescript-eslint/eslint-plugin": { - "version": "6.18.1", - "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-6.18.1.tgz", - "integrity": "sha512-nISDRYnnIpk7VCFrGcu1rnZfM1Dh9LRHnfgdkjcbi/l7g16VYRri3TjXi9Ir4lOZSw5N/gnV/3H7jIPQ8Q4daA==", + "version": "6.19.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-6.19.1.tgz", + "integrity": "sha512-roQScUGFruWod9CEyoV5KlCYrubC/fvG8/1zXuT0WTcxX87GnMMmnksMwSg99lo1xiKrBzw2icsJPMAw1OtKxg==", "dev": true, "dependencies": { "@eslint-community/regexpp": "^4.5.1", - "@typescript-eslint/scope-manager": "6.18.1", - "@typescript-eslint/type-utils": "6.18.1", - "@typescript-eslint/utils": "6.18.1", - "@typescript-eslint/visitor-keys": "6.18.1", + "@typescript-eslint/scope-manager": "6.19.1", + "@typescript-eslint/type-utils": "6.19.1", + "@typescript-eslint/utils": "6.19.1", + "@typescript-eslint/visitor-keys": "6.19.1", "debug": "^4.3.4", "graphemer": "^1.4.0", "ignore": "^5.2.4", @@ -1332,15 +1342,15 @@ } }, "node_modules/@typescript-eslint/parser": { - "version": "6.18.1", - "resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-6.18.1.tgz", - "integrity": 
"sha512-zct/MdJnVaRRNy9e84XnVtRv9Vf91/qqe+hZJtKanjojud4wAVy/7lXxJmMyX6X6J+xc6c//YEWvpeif8cAhWA==", + "version": "6.19.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-6.19.1.tgz", + "integrity": "sha512-WEfX22ziAh6pRE9jnbkkLGp/4RhTpffr2ZK5bJ18M8mIfA8A+k97U9ZyaXCEJRlmMHh7R9MJZWXp/r73DzINVQ==", "dev": true, "dependencies": { - "@typescript-eslint/scope-manager": "6.18.1", - "@typescript-eslint/types": "6.18.1", - "@typescript-eslint/typescript-estree": "6.18.1", - "@typescript-eslint/visitor-keys": "6.18.1", + "@typescript-eslint/scope-manager": "6.19.1", + "@typescript-eslint/types": "6.19.1", + "@typescript-eslint/typescript-estree": "6.19.1", + "@typescript-eslint/visitor-keys": "6.19.1", "debug": "^4.3.4" }, "engines": { @@ -1360,13 +1370,13 @@ } }, "node_modules/@typescript-eslint/scope-manager": { - "version": "6.18.1", - "resolved": "https://registry.npmjs.org/@typescript-eslint/scope-manager/-/scope-manager-6.18.1.tgz", - "integrity": "sha512-BgdBwXPFmZzaZUuw6wKiHKIovms97a7eTImjkXCZE04TGHysG+0hDQPmygyvgtkoB/aOQwSM/nWv3LzrOIQOBw==", + "version": "6.19.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/scope-manager/-/scope-manager-6.19.1.tgz", + "integrity": "sha512-4CdXYjKf6/6aKNMSly/BP4iCSOpvMmqtDzRtqFyyAae3z5kkqEjKndR5vDHL8rSuMIIWP8u4Mw4VxLyxZW6D5w==", "dev": true, "dependencies": { - "@typescript-eslint/types": "6.18.1", - "@typescript-eslint/visitor-keys": "6.18.1" + "@typescript-eslint/types": "6.19.1", + "@typescript-eslint/visitor-keys": "6.19.1" }, "engines": { "node": "^16.0.0 || >=18.0.0" @@ -1377,13 +1387,13 @@ } }, "node_modules/@typescript-eslint/type-utils": { - "version": "6.18.1", - "resolved": "https://registry.npmjs.org/@typescript-eslint/type-utils/-/type-utils-6.18.1.tgz", - "integrity": "sha512-wyOSKhuzHeU/5pcRDP2G2Ndci+4g653V43gXTpt4nbyoIOAASkGDA9JIAgbQCdCkcr1MvpSYWzxTz0olCn8+/Q==", + "version": "6.19.1", + "resolved": 
"https://registry.npmjs.org/@typescript-eslint/type-utils/-/type-utils-6.19.1.tgz", + "integrity": "sha512-0vdyld3ecfxJuddDjACUvlAeYNrHP/pDeQk2pWBR2ESeEzQhg52DF53AbI9QCBkYE23lgkhLCZNkHn2hEXXYIg==", "dev": true, "dependencies": { - "@typescript-eslint/typescript-estree": "6.18.1", - "@typescript-eslint/utils": "6.18.1", + "@typescript-eslint/typescript-estree": "6.19.1", + "@typescript-eslint/utils": "6.19.1", "debug": "^4.3.4", "ts-api-utils": "^1.0.1" }, @@ -1404,9 +1414,9 @@ } }, "node_modules/@typescript-eslint/types": { - "version": "6.18.1", - "resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-6.18.1.tgz", - "integrity": "sha512-4TuMAe+tc5oA7wwfqMtB0Y5OrREPF1GeJBAjqwgZh1lEMH5PJQgWgHGfYufVB51LtjD+peZylmeyxUXPfENLCw==", + "version": "6.19.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-6.19.1.tgz", + "integrity": "sha512-6+bk6FEtBhvfYvpHsDgAL3uo4BfvnTnoge5LrrCj2eJN8g3IJdLTD4B/jK3Q6vo4Ql/Hoip9I8aB6fF+6RfDqg==", "dev": true, "engines": { "node": "^16.0.0 || >=18.0.0" @@ -1417,13 +1427,13 @@ } }, "node_modules/@typescript-eslint/typescript-estree": { - "version": "6.18.1", - "resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-6.18.1.tgz", - "integrity": "sha512-fv9B94UAhywPRhUeeV/v+3SBDvcPiLxRZJw/xZeeGgRLQZ6rLMG+8krrJUyIf6s1ecWTzlsbp0rlw7n9sjufHA==", + "version": "6.19.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-6.19.1.tgz", + "integrity": "sha512-aFdAxuhzBFRWhy+H20nYu19+Km+gFfwNO4TEqyszkMcgBDYQjmPJ61erHxuT2ESJXhlhrO7I5EFIlZ+qGR8oVA==", "dev": true, "dependencies": { - "@typescript-eslint/types": "6.18.1", - "@typescript-eslint/visitor-keys": "6.18.1", + "@typescript-eslint/types": "6.19.1", + "@typescript-eslint/visitor-keys": "6.19.1", "debug": "^4.3.4", "globby": "^11.1.0", "is-glob": "^4.0.3", @@ -1469,17 +1479,17 @@ } }, "node_modules/@typescript-eslint/utils": { - "version": "6.18.1", - "resolved": 
"https://registry.npmjs.org/@typescript-eslint/utils/-/utils-6.18.1.tgz", - "integrity": "sha512-zZmTuVZvD1wpoceHvoQpOiewmWu3uP9FuTWo8vqpy2ffsmfCE8mklRPi+vmnIYAIk9t/4kOThri2QCDgor+OpQ==", + "version": "6.19.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/utils/-/utils-6.19.1.tgz", + "integrity": "sha512-JvjfEZuP5WoMqwh9SPAPDSHSg9FBHHGhjPugSRxu5jMfjvBpq5/sGTD+9M9aQ5sh6iJ8AY/Kk/oUYVEMAPwi7w==", "dev": true, "dependencies": { "@eslint-community/eslint-utils": "^4.4.0", "@types/json-schema": "^7.0.12", "@types/semver": "^7.5.0", - "@typescript-eslint/scope-manager": "6.18.1", - "@typescript-eslint/types": "6.18.1", - "@typescript-eslint/typescript-estree": "6.18.1", + "@typescript-eslint/scope-manager": "6.19.1", + "@typescript-eslint/types": "6.19.1", + "@typescript-eslint/typescript-estree": "6.19.1", "semver": "^7.5.4" }, "engines": { @@ -1494,12 +1504,12 @@ } }, "node_modules/@typescript-eslint/visitor-keys": { - "version": "6.18.1", - "resolved": "https://registry.npmjs.org/@typescript-eslint/visitor-keys/-/visitor-keys-6.18.1.tgz", - "integrity": "sha512-/kvt0C5lRqGoCfsbmm7/CwMqoSkY3zzHLIjdhHZQW3VFrnz7ATecOHR7nb7V+xn4286MBxfnQfQhAmCI0u+bJA==", + "version": "6.19.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/visitor-keys/-/visitor-keys-6.19.1.tgz", + "integrity": "sha512-gkdtIO+xSO/SmI0W68DBg4u1KElmIUo3vXzgHyGPs6cxgB0sa3TlptRAAE0hUY1hM6FcDKEv7aIwiTGm76cXfQ==", "dev": true, "dependencies": { - "@typescript-eslint/types": "6.18.1", + "@typescript-eslint/types": "6.19.1", "eslint-visitor-keys": "^3.4.1" }, "engines": { @@ -1650,6 +1660,18 @@ "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==" }, + "node_modules/atlassian-jwt": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/atlassian-jwt/-/atlassian-jwt-2.0.3.tgz", + "integrity": 
"sha512-G9oO3HHS1UKgsLRXj6nNKv2TY6g3PleBCdzHwbFeVKg+18GBFIMRz+ApxuOuWAgcL7RngNFF5rGNtw1Ss3hvTg==", + "dependencies": { + "jsuri": "^1.3.1", + "lodash": "^4.17.21" + }, + "engines": { + "node": ">= 0.4.0" + } + }, "node_modules/axios": { "version": "1.6.5", "resolved": "https://registry.npmjs.org/axios/-/axios-1.6.5.tgz", @@ -1852,9 +1874,9 @@ } }, "node_modules/chromadb": { - "version": "1.7.3", - "resolved": "https://registry.npmjs.org/chromadb/-/chromadb-1.7.3.tgz", - "integrity": "sha512-3GgvQjpqgk5C89x5EuTDaXKbfrdqYDJ5UVyLQ3ZmwxnpetNc+HhRDGjkvXa5KSvpQ3lmKoyDoqnN4tZepfFkbw==", + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/chromadb/-/chromadb-1.8.1.tgz", + "integrity": "sha512-NpbYydbg4Uqt/9BXKgkZXn0fqpsh2Z1yjhkhKH+rcHMoq0pwI18BFSU2QU7Fk/ZypwGefW2AvqyE/3ZJIgy4QA==", "devOptional": true, "dependencies": { "cliui": "^8.0.1", @@ -1967,9 +1989,9 @@ } }, "node_modules/cohere-ai": { - "version": "7.6.2", - "resolved": "https://registry.npmjs.org/cohere-ai/-/cohere-ai-7.6.2.tgz", - "integrity": "sha512-0cMY8IniquZb7nnQBaLrWacxugNjFNbZQHsn0cJJ5rCZ0LT9OZu0fbKqtoAkcBqkdc9yAFZh5gWnW4ksNhHtpg==", + "version": "7.7.3", + "resolved": "https://registry.npmjs.org/cohere-ai/-/cohere-ai-7.7.3.tgz", + "integrity": "sha512-g6PF/sDusd/RJ+KgWPg+pMq8dO97Le3JcvgS+xV5j/Fr4By5psy/D87OdlHlfnhmQ/40gr2D7wgrOtb3Os/I4Q==", "dependencies": { "form-data": "4.0.0", "js-base64": "3.7.2", @@ -2069,6 +2091,18 @@ "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", "dev": true }, + "node_modules/confluence.js": { + "version": "1.7.2", + "resolved": "https://registry.npmjs.org/confluence.js/-/confluence.js-1.7.2.tgz", + "integrity": "sha512-hCHC4tZNikLonSJdKjcc7CQa5XuWFVyzn5iCKYAtDs7lXWmQyzRoc7mTnKGgMDYNOOCuK+vVIwf9nEcecWXOtw==", + "dependencies": { + "atlassian-jwt": "^2.0.2", + "axios": "^1.6.2", + "form-data": "^4.0.0", + "oauth": "^0.10.0", + "tslib": "^2.6.2" + } + }, "node_modules/cross-fetch": { "version": "3.1.6", 
"resolved": "https://registry.npmjs.org/cross-fetch/-/cross-fetch-3.1.6.tgz", @@ -3452,6 +3486,14 @@ "node": ">=0.10.0" } }, + "node_modules/jsuri": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/jsuri/-/jsuri-1.3.1.tgz", + "integrity": "sha512-LLdAeqOf88/X0hylAI7oSir6QUsz/8kOW0FcJzzu/SJRfORA/oPHycAOthkNp7eLPlTAbqVDFbqNRHkRVzEA3g==", + "engines": { + "node": "*" + } + }, "node_modules/keyv": { "version": "4.5.4", "resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz", @@ -3461,28 +3503,28 @@ } }, "node_modules/langchain": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/langchain/-/langchain-0.1.2.tgz", - "integrity": "sha512-sJbVvIabjmsKDsIdfSiojHOg1EXbVsuVH6Y12rproGwYNRdQZ5zhikW+RttuRSmmEfCVfaGV5++nzYes9/jVAg==", + "version": "0.1.6", + "resolved": "https://registry.npmjs.org/langchain/-/langchain-0.1.6.tgz", + "integrity": "sha512-ju4LLw6vTax3bfSkphcK8CE6yIKe1NXLT95Ys/gvoONRfZkV4FpNMjDc9Y+keUvzmTv2buhiFU/gslbA4eJtrw==", "dependencies": { "@anthropic-ai/sdk": "^0.9.1", - "@langchain/community": "~0.0.15", - "@langchain/core": "~0.1.12", - "@langchain/openai": "~0.0.10", + "@langchain/community": "~0.0.17", + "@langchain/core": "~0.1.16", + "@langchain/openai": "~0.0.12", "binary-extensions": "^2.2.0", "expr-eval": "^2.0.2", "js-tiktoken": "^1.0.7", "js-yaml": "^4.1.0", "jsonpointer": "^5.0.1", "langchainhub": "~0.0.6", - "langsmith": "~0.0.48", + "langsmith": "~0.0.59", "ml-distance": "^4.0.0", "openapi-types": "^12.1.3", "p-retry": "4", "uuid": "^9.0.0", "yaml": "^2.2.1", - "zod": "^3.22.3", - "zod-to-json-schema": "3.20.3" + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" }, "engines": { "node": ">=18" @@ -3499,7 +3541,7 @@ "@google-ai/generativelanguage": "^0.2.1", "@google-cloud/storage": "^6.10.1", "@notionhq/client": "^2.2.10", - "@pinecone-database/pinecone": "^1.1.0", + "@pinecone-database/pinecone": "*", "@supabase/supabase-js": "^2.10.0", "@vercel/kv": "^0.2.3", "@xata.io/client": "^0.28.0", @@ -3699,15 
+3741,23 @@ } } }, + "node_modules/langchain/node_modules/zod-to-json-schema": { + "version": "3.22.3", + "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.22.3.tgz", + "integrity": "sha512-9isG8SqRe07p+Aio2ruBZmLm2Q6Sq4EqmXOiNpDxp+7f0LV6Q/LX65fs5Nn+FV/CzfF3NLBoksXbS2jNYIfpKw==", + "peerDependencies": { + "zod": "^3.22.4" + } + }, "node_modules/langchainhub": { "version": "0.0.6", "resolved": "https://registry.npmjs.org/langchainhub/-/langchainhub-0.0.6.tgz", "integrity": "sha512-SW6105T+YP1cTe0yMf//7kyshCgvCTyFBMTgH2H3s9rTAR4e+78DA/BBrUL/Mt4Q5eMWui7iGuAYb3pgGsdQ9w==" }, "node_modules/langsmith": { - "version": "0.0.56", - "resolved": "https://registry.npmjs.org/langsmith/-/langsmith-0.0.56.tgz", - "integrity": "sha512-nTi18WPEtkb7jqurrbi2K9KFzFGhGkb1HBiLQKtNbfm4wJo/+HIRljNqqWSLeQEgA1S2Rsx86BL5IwK04K5dVA==", + "version": "0.0.62", + "resolved": "https://registry.npmjs.org/langsmith/-/langsmith-0.0.62.tgz", + "integrity": "sha512-OjjlNbxbfEUSgbBLA7JS7Lwg0M+oMZp4ZSwujR9TZBcSKvpv1f3lE2X9e9vTWe9huoUMlUAvwoaSWdDG6w6QLQ==", "dependencies": { "@types/uuid": "^9.0.1", "commander": "^10.0.1", @@ -3780,6 +3830,11 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/lodash": { + "version": "4.17.21", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", + "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==" + }, "node_modules/lodash.assignwith": { "version": "4.2.0", "resolved": "https://registry.npmjs.org/lodash.assignwith/-/lodash.assignwith-4.2.0.tgz", @@ -4093,6 +4148,11 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/oauth": { + "version": "0.10.0", + "resolved": "https://registry.npmjs.org/oauth/-/oauth-0.10.0.tgz", + "integrity": "sha512-1orQ9MT1vHFGQxhuy7E/0gECD3fd2fCC+PIX+/jgmU/gI3EpRocXtmtvxCO5x3WZ443FLTLFWNDjl5MPJf9u+Q==" + }, "node_modules/object-inspect": { "version": "1.13.1", "resolved": 
"https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.1.tgz", @@ -4397,9 +4457,9 @@ } }, "node_modules/prettier": { - "version": "3.2.1", - "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.2.1.tgz", - "integrity": "sha512-qSUWshj1IobVbKc226Gw2pync27t0Kf0EdufZa9j7uBSJay1CC+B3K5lAAZoqgX3ASiKuWsk6OmzKRetXNObWg==", + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.2.4.tgz", + "integrity": "sha512-FWu1oLHKCrtpO1ypU6J0SbK2d9Ckwysq6bHj/uaCP26DxrPpppCLQRGVuqAxSTvhF00AcvDRyYrLNW7ocBhFFQ==", "dev": true, "bin": { "prettier": "bin/prettier.cjs" @@ -4979,8 +5039,7 @@ "node_modules/tslib": { "version": "2.6.2", "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.2.tgz", - "integrity": "sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==", - "devOptional": true + "integrity": "sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==" }, "node_modules/type-check": { "version": "0.4.0", diff --git a/package.json b/package.json index 18bce1b3..f12a218d 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "@llm-tools/embedjs", - "version": "0.0.43", - "description": "A NodeJS framework to easily work with LLMs and custom datasets", + "version": "0.0.44", + "description": "A NodeJS RAG framework to easily work with LLMs and custom datasets", "main": "dist/index.js", "types": "dist/index.d.ts", "type": "module", @@ -50,11 +50,12 @@ "dependencies": { "@huggingface/inference": "^2.6.4", "@langchain/cohere": "^0.0.2", - "@langchain/openai": "^0.0.11", + "@langchain/openai": "^0.0.12", "axios": "^1.6.5", + "confluence.js": "^1.7.2", "debug": "^4.3.4", "html-to-text": "^9.0.5", - "langchain": "^0.1.2", + "langchain": "^0.1.6", "md5": "^2.3.0", "pdf-parse-fork": "^1.2.0", "sitemapper": "^3.2.8", @@ -63,25 +64,25 @@ "youtube-transcript": "^1.0.6" }, "devDependencies": { - "@pinecone-database/pinecone": "^1.1.3", + 
"@pinecone-database/pinecone": "^1.1.2", "@qdrant/js-client-rest": "^1.7.0", "@tsconfig/recommended": "^1.0.3", "@types/debug": "^4.1.12", "@types/html-to-text": "^9.0.4", "@types/md5": "^2.3.5", - "@types/node": "^20.11.0", + "@types/node": "^20.11.6", "@types/usetube": "^2.1.2", - "@typescript-eslint/eslint-plugin": "^6.18.1", - "@typescript-eslint/parser": "^6.18.1", - "chromadb": "^1.7.3", - "cohere-ai": "^7.6.2", + "@typescript-eslint/eslint-plugin": "^6.19.1", + "@typescript-eslint/parser": "^6.19.1", + "chromadb": "^1.8.1", + "cohere-ai": "^7.7.3", "eslint": "^8.56.0", "eslint-config-prettier": "^9.1.0", "eslint-plugin-prettier": "^5.1.3", "hnswlib-node": "^2.1.0", "ioredis": "^5.3.2", "lmdb": "^2.9.2", - "prettier": "^3.2.1", + "prettier": "^3.2.4", "rimraf": "^5.0.5", "typescript": "^5.3.3", "vectordb": "^0.4.3", @@ -131,6 +132,7 @@ "node": ">= 18.0.0" }, "overrides": { + "@pinecone-database/pinecone": "^1.1.2", "weaviate-ts-client": "^2.0.0", "hnswlib-node": "^2.1.0", "vectordb": "^0.4.3" diff --git a/src/core/llm-application.ts b/src/core/llm-application.ts index ced76e68..4bfb7d31 100644 --- a/src/core/llm-application.ts +++ b/src/core/llm-application.ts @@ -91,7 +91,7 @@ export class LLMApplication { } this.debug(`Loader previously run. 
Deleting previous ${chunkIds.length} keys`, uniqueId); - await this.vectorDb.deleteKeys(chunkIds); + if (chunkIds.length > 0) await this.vectorDb.deleteKeys(chunkIds); } let batchSize = 0, diff --git a/src/index.ts b/src/index.ts index 93629bd2..8d76022c 100644 --- a/src/index.ts +++ b/src/index.ts @@ -16,6 +16,7 @@ import { BaseModel } from './interfaces/base-model.js'; import { SIMPLE_MODELS } from './global/constants.js'; import { OpenAi } from './models/openai-model.js'; import { HuggingFace } from './models/huggingface-model.js'; +import { ConfluenceLoader } from './loaders/confluence-loader.js'; export { LLMApplication, @@ -32,6 +33,7 @@ export { YoutubeChannelLoader, YoutubeSearchLoader, SitemapLoader, + ConfluenceLoader, BaseModel, SIMPLE_MODELS, HuggingFace, diff --git a/src/loaders/confluence-loader.ts b/src/loaders/confluence-loader.ts new file mode 100644 index 00000000..61e11f6e --- /dev/null +++ b/src/loaders/confluence-loader.ts @@ -0,0 +1,88 @@ +import { Content } from 'confluence.js/out/api/models/content.js'; +import { ConfluenceClient } from 'confluence.js'; +import createDebugMessages from 'debug'; +import md5 from 'md5'; + +import { BaseLoader } from '../interfaces/base-loader.js'; +import { WebLoader } from './web-loader.js'; + +export class ConfluenceLoader extends BaseLoader<{ type: 'ConfluenceLoader' }> { + private readonly debug = createDebugMessages('embedjs:loader:ConfluenceLoader'); + + private readonly confluence: ConfluenceClient; + private readonly spaceNames: string[]; + + constructor({ + spaceNames, + confluenceBaseUrl, + confluenceUsername, + confluenceToken, + }: { + spaceNames: [string, ...string[]]; + confluenceBaseUrl?: string; + confluenceUsername?: string; + confluenceToken?: string; + }) { + super(`ConfluenceLoader_${md5(spaceNames.join(','))}`); + + this.spaceNames = spaceNames; + this.confluence = new ConfluenceClient({ + host: confluenceBaseUrl ?? 
process.env.CONFLUENCE_BASE_URL, + authentication: { + basic: { + username: confluenceUsername ?? process.env.CONFLUENCE_USER_NAME, + password: confluenceToken ?? process.env.CONFLUENCE_API_TOKEN, + }, + }, + }); + } + + override async *getChunks() { + for (const spaceKey of this.spaceNames) { + try { + let i = 0; + const spaceContent = await this.confluence.space.getContentForSpace({ spaceKey }); + this.debug( + `Confluence space (length ${spaceContent['page'].results.length}) obtained for space`, + spaceKey, + ); + + for await (const result of this.getContentChunks(spaceContent['page'].results)) { + result.metadata['chunkId'] = i; + yield result; + i++; + } + } catch (e) { + this.debug('Could not get space details', spaceKey, e); + continue; + } + } + } + + private async *getContentChunks(contentArray: Content[]) { + for (const { id } of contentArray) { + const content = await this.confluence.content.getContentById({ + id: id, + expand: ['body', 'children.page', 'body.view'], + }); + + if (content.children) { + for await (const result of this.getContentChunks(content.children.page.results)) { + yield result; + } + } + + const webLoader = new WebLoader({ content: content.body.view.value }); + for await (const result of await webLoader.getChunks()) { + yield { + pageContent: result.pageContent, + contentHash: result.contentHash, + metadata: { + type: <'ConfluenceLoader'>'ConfluenceLoader', + source: content.title, + }, + }; + } + } + } +} diff --git a/src/loaders/web-loader.ts b/src/loaders/web-loader.ts index 690bf000..f756c675 100644 --- a/src/loaders/web-loader.ts +++ b/src/loaders/web-loader.ts @@ -5,26 +5,36 @@ import axios from 'axios'; import md5 from 'md5'; import { BaseLoader } from '../interfaces/base-loader.js'; -import { cleanString } from '../util/strings.js'; +import { cleanString, truncateCenterString } from '../util/strings.js'; export class WebLoader extends BaseLoader<{ type: 'WebLoader' }> { private readonly debug = 
createDebugMessages('embedjs:loader:WebLoader'); - private readonly url: string; + private readonly contentOrUrl: string; + private readonly isUrl: boolean; - constructor({ url }: { url: string }) { - super(`WebLoader_${md5(url)}`); - this.url = url; + constructor({ url }: { url: string }); + constructor({ content }: { content: string }); + constructor({ content, url }: { content?: string; url?: string }) { + super(`WebLoader_${md5(content ? `CONTENT_${content}` : `URL_${url}`)}`); + + this.isUrl = content ? false : true; + this.contentOrUrl = content ?? url; } override async *getChunks() { const chunker = new RecursiveCharacterTextSplitter({ chunkSize: 2000, chunkOverlap: 0 }); try { - const { data } = await axios.get(this.url, { responseType: 'document' }); + const data = this.isUrl + ? (await axios.get(this.contentOrUrl, { responseType: 'document' })).data + : this.contentOrUrl; + const text = convert(data, { wordwrap: false, }); + const tuncatedObjectString = this.isUrl ? undefined : truncateCenterString(this.contentOrUrl, 50); + let i = 0; const chunks = await chunker.splitText(cleanString(text)); for (const chunk of chunks) { @@ -33,7 +43,7 @@ export class WebLoader extends BaseLoader<{ type: 'WebLoader' }> { contentHash: md5(chunk), metadata: { type: <'WebLoader'>'WebLoader', - source: this.url, + source: this.isUrl ? 
this.contentOrUrl : tuncatedObjectString, chunkId: i, }, }; @@ -41,7 +51,7 @@ export class WebLoader extends BaseLoader<{ type: 'WebLoader' }> { i++; } } catch (e) { - this.debug('Could not parse website url', this.url, e); + this.debug('Could not parse input', this.contentOrUrl, e); } } } diff --git a/src/vectorDb/lance-db.ts b/src/vectorDb/lance-db.ts index 8e16f73d..95069989 100644 --- a/src/vectorDb/lance-db.ts +++ b/src/vectorDb/lance-db.ts @@ -43,7 +43,8 @@ export class LanceDb implements BaseDb { }; }); - return this.table.add(mapped); + await this.table.add(mapped); + return mapped.length; //TODO: check if vectorDb has addressed the issue where add returns undefined } async similaritySearch(query: number[], k: number): Promise { @@ -70,7 +71,7 @@ export class LanceDb implements BaseDb { } async deleteKeys(keys: string[]): Promise { - await this.table.delete(`id IS IN (${keys.map((key) => `'${key}'`).join(',')})`); + await this.table.delete(`id IN (${keys.map((key) => `'${key}'`).join(',')})`); } async reset(): Promise { diff --git a/src/vectorDb/qdrant-db.ts b/src/vectorDb/qdrant-db.ts index 81af313f..80340ae9 100644 --- a/src/vectorDb/qdrant-db.ts +++ b/src/vectorDb/qdrant-db.ts @@ -53,7 +53,6 @@ export class QdrantDb implements BaseDb { payload: { pageContent: chunk.pageContent, ...chunk.metadata }, }; }); - console.log(upsertCommand); this.debug(`Inserting QDrant batch`); await this.client.upsert(this.projectName, { diff --git a/tsconfig.json b/tsconfig.json index daa06e3f..f25049fc 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -15,8 +15,8 @@ "useDefineForClassFields": true, "strictPropertyInitialization": false, "allowJs": false, - "strict": false + "strict": false, }, "include": ["src/**/*"], - "exclude": ["node_modules", "dist", "docs"] + "exclude": ["node_modules", "dist", "docs"], }