
Commit

updated pinecone to version 2x
adhityan committed Feb 7, 2024
1 parent 5b2f8e1 commit c2da0d0
Showing 9 changed files with 217 additions and 190 deletions.
33 changes: 28 additions & 5 deletions README.md
@@ -333,20 +333,28 @@ You can enable Pinecone storage by following these steps -
npm install @pinecone-database/pinecone
```

- Set the pinecone environment variables `PINECONE_API_KEY` and `PINECONE_ENVIRONMENT`. These can be obtained from the **API Keys** section on the Pinecone dashboard.
- Set the pinecone environment variable `PINECONE_API_KEY`. This can be obtained from the **API Keys** section on the Pinecone dashboard.

```bash
PINECONE_API_KEY="e65a4ec0-14f7-40c5-903e-f8529127b817"
PINECONE_ENVIRONMENT="us-west1-gcp-free"
PINECONE_API_KEY=<your api key>
```

- Set the Pinecone database as your choice of `vectorDb`

```TS
.setVectorDb(new PineconeDb({ projectName: 'test', namespace: 'dev' }))
.setVectorDb(new PineconeDb({
projectName: 'test',
namespace: 'dev',
indexSpec: {
pod: {
podType: 'p1.x1',
environment: 'us-east1-gcp',
},
},
}))
```

**Note:** The `projectName` will be used to create the Pinecone index name for this application.
**Note:** Pinecone supports serverless and pod-based index deployments. You control how your index is created through the `indexSpec` attribute, which is mandatory but fully typed.
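
For comparison, a serverless deployment could look like the sketch below. This is an illustration only, assuming `indexSpec` mirrors Pinecone's create-index spec; the `serverless` block with its `cloud` and `region` fields is not shown in this commit.

```TS
// A sketch, not from this commit: assumes indexSpec accepts Pinecone's
// serverless spec the same way it accepts the pod spec above.
.setVectorDb(new PineconeDb({
    projectName: 'test',
    namespace: 'dev',
    indexSpec: {
        serverless: {
            cloud: 'aws',
            region: 'us-west-2',
        },
    },
}))
```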

## LanceDB

@@ -636,6 +644,21 @@ If you want us to add support for a specific embedding model, please create an [

All PRs are welcome.

# Langsmith Integration

Langsmith allows you to keep track of how you use LLMs and embedding models. It logs run histories, token usage and other metadata. Follow these three simple steps to enable it -

- Sign up for an account with [Langsmith](https://smith.langchain.com/)
- Generate an API Key from your admin page
- Set the following environment keys in your project

```bash
export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
export LANGCHAIN_PROJECT="<project name>"
export LANGCHAIN_API_KEY="<api key>"
```

# Azure OpenAI

To use an OpenAI model on Azure, it first needs to be deployed. Please refer to the [Azure OpenAI documentation](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/) on how to deploy a model on Azure. To run this library, you will need to deploy two models -
13 changes: 12 additions & 1 deletion examples/pinecone/src/index.ts
@@ -12,7 +12,18 @@ const llmApplication = await new LLMApplicationBuilder()
.addLoader(new YoutubeLoader({ videoIdOrUrl: 'https://www.youtube.com/watch?v=w2KbwC-s7pY' }))
.addLoader(new TextLoader({ text: 'The best company name for a company making colorful socks is MrSocks' }))
.setCache(new LmdbCache({ path: path.resolve(path.dirname(__filename), '../../../cache') }))
.setVectorDb(new PineconeDb({ projectName: 'test', namespace: 'dev' }))
.setVectorDb(
new PineconeDb({
projectName: 'test',
namespace: 'dev',
indexSpec: {
pod: {
podType: 'p1.x1',
environment: 'us-east1-gcp',
},
},
}),
)
.build();

console.log(await llmApplication.query('What is paxos?'));
184 changes: 90 additions & 94 deletions package-lock.json

Large diffs are not rendered by default.

28 changes: 14 additions & 14 deletions package.json
@@ -1,6 +1,6 @@
{
"name": "@llm-tools/embedjs",
"version": "0.0.53",
"version": "0.0.57",
"description": "A NodeJS RAG framework to easily work with LLMs and custom datasets",
"main": "dist/index.js",
"types": "dist/index.d.ts",
@@ -49,31 +49,31 @@
"homepage": "https://github.com/llm-tools/embedjs#readme",
"dependencies": {
"@huggingface/inference": "^2.6.4",
"@langchain/cohere": "^0.0.3",
"@langchain/cohere": "^0.0.4",
"@langchain/openai": "^0.0.14",
"axios": "^1.6.7",
"confluence.js": "^1.7.2",
"debug": "^4.3.4",
"html-to-text": "^9.0.5",
"langchain": "^0.1.12",
"langchain": "^0.1.13",
"md5": "^2.3.0",
"pdf-parse-fork": "^1.2.0",
"sitemapper": "^3.2.8",
"usetube": "^2.2.7",
"sitemapper": "^3.1.8",
"usetube": "^2.0.2",
"uuid": "^9.0.1",
"youtube-transcript": "^1.0.6"
},
"devDependencies": {
"@pinecone-database/pinecone": "^1.1.2",
"@pinecone-database/pinecone": "^2.0.1",
"@qdrant/js-client-rest": "^1.7.0",
"@tsconfig/recommended": "^1.0.3",
"@types/debug": "^4.1.12",
"@types/html-to-text": "^9.0.4",
"@types/md5": "^2.3.5",
"@types/node": "^20.11.16",
"@types/usetube": "^2.1.2",
"@typescript-eslint/eslint-plugin": "^6.20.0",
"@typescript-eslint/parser": "^6.20.0",
"@typescript-eslint/eslint-plugin": "^6.21.0",
"@typescript-eslint/parser": "^6.21.0",
"chromadb": "^1.8.1",
"cohere-ai": "^7.7.5",
"eslint": "^8.56.0",
@@ -82,28 +82,28 @@
"hnswlib-node": "^2.1.0",
"ioredis": "^5.3.2",
"lmdb": "^2.9.2",
"prettier": "^3.2.4",
"prettier": "^3.2.5",
"rimraf": "^5.0.5",
"typescript": "^5.3.3",
"vectordb": "^0.4.7",
"vectordb": "^0.4.8",
"weaviate-ts-client": "^2.0.0"
},
"peerDependencies": {
"@pinecone-database/pinecone": "^1.1.2",
"@pinecone-database/pinecone": "^2.0.1",
"@qdrant/js-client-rest": "^1.7.0",
"chromadb": "^1.8.1",
"cohere-ai": "^7.7.5",
"hnswlib-node": "^2.1.0",
"ioredis": "^5.3.2",
"lmdb": "^2.9.2",
"vectordb": "^0.4.7",
"vectordb": "^0.4.8",
"weaviate-ts-client": "^2.0.0"
},
"overrides": {
"@pinecone-database/pinecone": "^1.1.2",
"@pinecone-database/pinecone": "^2.0.1",
"weaviate-ts-client": "^2.0.0",
"hnswlib-node": "^2.1.0",
"vectordb": "^0.4.7"
"vectordb": "^0.4.8"
},
"peerDependenciesMeta": {
"@pinecone-database/pinecone": {
61 changes: 41 additions & 20 deletions src/core/llm-application.ts
@@ -2,7 +2,7 @@ import createDebugMessages from 'debug';

import { BaseDb } from '../interfaces/base-db.js';
import { BaseLoader } from '../interfaces/base-loader.js';
import { AddLoaderReturn, Chunk, EmbeddedChunk } from '../global/types.js';
import { AddLoaderReturn, Chunk, EmbeddedChunk, LoaderChunk } from '../global/types.js';
import { LLMApplicationBuilder } from './llm-application-builder.js';
import { DEFAULT_INSERT_BATCH_SIZE } from '../global/constants.js';
import { cleanString, stringFormat } from '../util/strings.js';
@@ -78,28 +78,11 @@ export class LLMApplication {
return this.vectorDb.insertChunks(embedChunks);
}

public async addLoader(loader: BaseLoader): Promise<AddLoaderReturn> {
const uniqueId = loader.getUniqueId();
this.debug('Add loader called for', uniqueId);
await loader.init();

const chunks = await loader.getChunks();
if (this.cache && (await this.cache.hasLoader(uniqueId))) {
const { chunkCount: previousChunkCount } = await this.cache.getLoader(uniqueId);

const chunkIds: string[] = [];
for (let i = 0; i < previousChunkCount; i++) {
chunkIds.push(this.getChunkUniqueId(uniqueId, i));
}

this.debug(`Loader previously run. Deleting previous ${chunkIds.length} keys`, uniqueId);
if (chunkIds.length > 0) await this.vectorDb.deleteKeys(chunkIds);
}

private async batchLoadChunks(uniqueId: string, incrementalGenerator: AsyncGenerator<LoaderChunk, void, void>) {
let batchSize = 0,
newInserts = 0,
formattedChunks: Chunk[] = [];
for await (const chunk of chunks) {
for await (const chunk of incrementalGenerator) {
batchSize++;

const formattedChunk = {
@@ -117,10 +100,48 @@
batchSize = 0;
}
}

newInserts += await this.batchLoadEmbeddings(uniqueId, formattedChunks);
return { newInserts, formattedChunks };
}

private async incrementalLoader(uniqueId: string, incrementalGenerator: AsyncGenerator<LoaderChunk, void, void>) {
this.debug(`incrementalChunkAvailable for loader`, uniqueId);
const { newInserts } = await this.batchLoadChunks(uniqueId, incrementalGenerator);
this.debug(`${newInserts} new incrementalChunks processed`, uniqueId);
}

public async addLoader(loader: BaseLoader): Promise<AddLoaderReturn> {
const uniqueId = loader.getUniqueId();
this.debug('Add loader called for', uniqueId);
await loader.init();

const chunks = await loader.getChunks();
if (this.cache && (await this.cache.hasLoader(uniqueId))) {
const { chunkCount: previousChunkCount } = await this.cache.getLoader(uniqueId);

const chunkIds: string[] = [];
for (let i = 0; i < previousChunkCount; i++) {
chunkIds.push(this.getChunkUniqueId(uniqueId, i));
}

this.debug(`Loader previously run. Deleting previous ${chunkIds.length} keys`, uniqueId);
if (chunkIds.length > 0) await this.vectorDb.deleteKeys(chunkIds);
}

const { newInserts, formattedChunks } = await this.batchLoadChunks(uniqueId, chunks);

if (this.cache) await this.cache.addLoader(uniqueId, formattedChunks.length);
this.debug(`Add loader completed with ${newInserts} new entries for`, uniqueId);

if (loader.canIncrementallyLoad) {
this.debug(`Registering incremental loader`, uniqueId);

loader.on('incrementalChunkAvailable', async (incrementalGenerator) => {
await this.incrementalLoader(uniqueId, incrementalGenerator);
});
}

return { entriesAdded: newInserts, uniqueId };
}

16 changes: 14 additions & 2 deletions src/interfaces/base-loader.ts
@@ -1,27 +1,35 @@
import createDebugMessages from 'debug';
import { EventEmitter } from 'node:events';

import { LoaderChunk } from '../global/types.js';
import { BaseCache } from './base-cache.js';

export abstract class BaseLoader<
T extends Record<string, string | number | boolean> = Record<string, string | number | boolean>,
M extends Record<string, unknown> = Record<string, null>,
> {
> extends EventEmitter {
private static cache?: BaseCache;

public static setCache(cache?: BaseCache) {
BaseLoader.cache = cache;
}

protected readonly uniqueId: string;
private readonly _canIncrementallyLoad: boolean;

constructor(uniqueId: string) {
constructor(uniqueId: string, canIncrementallyLoad: boolean = false) {
super();
this.uniqueId = uniqueId;
this._canIncrementallyLoad = canIncrementallyLoad;
        createDebugMessages('embedjs:loader:BaseLoader')(`New loader class initialized with key ${uniqueId}`);
}

async init() {}

public get canIncrementallyLoad() {
return this._canIncrementallyLoad;
}

getUniqueId(): string {
return this.uniqueId;
}
@@ -45,5 +53,9 @@ export abstract class BaseLoader<
return BaseLoader.cache.loaderCustomHas(this.getCustomCacheKey(key));
}

protected async loadIncrementalChunk(incrementalGenerator: AsyncGenerator<LoaderChunk<T>, void, void>) {
this.emit('incrementalChunkAvailable', incrementalGenerator);
}

abstract getChunks(): AsyncGenerator<LoaderChunk<T>, void, void>;
}
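
For orientation, here is a minimal sketch, not part of this commit, of how a custom loader might opt into the new incremental-loading path: pass `true` for `canIncrementallyLoad` in the constructor and surface late-arriving data through `loadIncrementalChunk`. The chunk fields used (`pageContent`, `metadata`) are assumptions for illustration.

```TS
// Hypothetical loader; the chunk shape below is assumed, not verified.
import { BaseLoader } from './base-loader.js';

class PollingLoader extends BaseLoader<{ type: 'PollingLoader' }> {
    constructor() {
        // Second constructor argument enables canIncrementallyLoad
        super('polling-loader-1', true);
    }

    // Initial chunks, consumed once when addLoader() runs
    async *getChunks() {
        yield { pageContent: 'initial content', metadata: { type: 'PollingLoader' as const } };
    }

    // Call this when new data arrives later; loadIncrementalChunk emits
    // 'incrementalChunkAvailable', which LLMApplication subscribes to
    async pushNewData(text: string) {
        await this.loadIncrementalChunk(
            (async function* () {
                yield { pageContent: text, metadata: { type: 'PollingLoader' as const } };
            })(),
        );
    }
}
```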
1 change: 1 addition & 0 deletions src/loaders/confluence-loader.ts
@@ -75,6 +75,7 @@ export class ConfluenceLoader extends BaseLoader<{ type: 'ConfluenceLoader' }> {
}
}

if (!content.body.view.value) continue;
const webLoader = new WebLoader({ content: content.body.view.value });
for await (const result of await webLoader.getChunks()) {
yield {
47 changes: 0 additions & 47 deletions src/loaders/medusa-loader.ts

This file was deleted.
