diff --git a/docs/core_docs/.gitignore b/docs/core_docs/.gitignore index 01a0e487fd41..9353e462637e 100644 --- a/docs/core_docs/.gitignore +++ b/docs/core_docs/.gitignore @@ -34,16 +34,16 @@ yarn-error.log* /.quarto/ # AUTO_GENERATED_DOCS +docs/tutorials/summarization.md +docs/tutorials/summarization.mdx +docs/tutorials/sql_qa.md +docs/tutorials/sql_qa.mdx +docs/tutorials/retrievers.md +docs/tutorials/retrievers.mdx docs/tutorials/rag.md docs/tutorials/rag.mdx -docs/tutorials/query_analysis.md -docs/tutorials/query_analysis.mdx docs/tutorials/qa_chat_history.md docs/tutorials/qa_chat_history.mdx -docs/tutorials/pdf_qa.md -docs/tutorials/pdf_qa.mdx -docs/tutorials/local_rag.md -docs/tutorials/local_rag.mdx docs/tutorials/llm_chain.md docs/tutorials/llm_chain.mdx docs/tutorials/graph.md @@ -54,8 +54,6 @@ docs/tutorials/classification.md docs/tutorials/classification.mdx docs/tutorials/chatbot.md docs/tutorials/chatbot.mdx -docs/concepts/t.md -docs/concepts/t.mdx docs/how_to/trim_messages.md docs/how_to/trim_messages.mdx docs/how_to/tools_prompting.md @@ -218,6 +216,8 @@ docs/how_to/assign.md docs/how_to/assign.mdx docs/how_to/agent_executor.md docs/how_to/agent_executor.mdx +docs/concepts/t.md +docs/concepts/t.mdx docs/versions/migrating_memory/conversation_summary_memory.md docs/versions/migrating_memory/conversation_summary_memory.mdx docs/versions/migrating_memory/conversation_buffer_window_memory.md @@ -226,6 +226,12 @@ docs/versions/migrating_memory/chat_history.md docs/versions/migrating_memory/chat_history.mdx docs/troubleshooting/errors/INVALID_TOOL_RESULTS.md docs/troubleshooting/errors/INVALID_TOOL_RESULTS.mdx +docs/integrations/toolkits/vectorstore.md +docs/integrations/toolkits/vectorstore.mdx +docs/integrations/toolkits/sql.md +docs/integrations/toolkits/sql.mdx +docs/integrations/toolkits/openapi.md +docs/integrations/toolkits/openapi.mdx docs/integrations/vectorstores/weaviate.md docs/integrations/vectorstores/weaviate.mdx 
docs/integrations/vectorstores/upstash.md @@ -252,22 +258,10 @@ docs/integrations/vectorstores/elasticsearch.md docs/integrations/vectorstores/elasticsearch.mdx docs/integrations/vectorstores/chroma.md docs/integrations/vectorstores/chroma.mdx -docs/integrations/tools/tavily_search.md -docs/integrations/tools/tavily_search.mdx -docs/integrations/tools/serpapi.md -docs/integrations/tools/serpapi.mdx -docs/integrations/tools/google_scholar.md -docs/integrations/tools/google_scholar.mdx -docs/integrations/tools/exa_search.md -docs/integrations/tools/exa_search.mdx -docs/integrations/tools/duckduckgo_search.md -docs/integrations/tools/duckduckgo_search.mdx -docs/integrations/toolkits/vectorstore.md -docs/integrations/toolkits/vectorstore.mdx -docs/integrations/toolkits/sql.md -docs/integrations/toolkits/sql.mdx -docs/integrations/toolkits/openapi.md -docs/integrations/toolkits/openapi.mdx +docs/integrations/stores/in_memory.md +docs/integrations/stores/in_memory.mdx +docs/integrations/stores/file_system.md +docs/integrations/stores/file_system.mdx docs/integrations/text_embedding/togetherai.md docs/integrations/text_embedding/togetherai.mdx docs/integrations/text_embedding/pinecone.md @@ -290,14 +284,12 @@ docs/integrations/text_embedding/cohere.md docs/integrations/text_embedding/cohere.mdx docs/integrations/text_embedding/cloudflare_ai.md docs/integrations/text_embedding/cloudflare_ai.mdx +docs/integrations/text_embedding/bytedance_doubao.md +docs/integrations/text_embedding/bytedance_doubao.mdx docs/integrations/text_embedding/bedrock.md docs/integrations/text_embedding/bedrock.mdx docs/integrations/text_embedding/azure_openai.md docs/integrations/text_embedding/azure_openai.mdx -docs/integrations/stores/in_memory.md -docs/integrations/stores/in_memory.mdx -docs/integrations/stores/file_system.md -docs/integrations/stores/file_system.mdx docs/integrations/retrievers/tavily.md docs/integrations/retrievers/tavily.mdx docs/integrations/retrievers/kendra-retriever.md @@ 
-360,6 +352,8 @@ docs/integrations/chat/cohere.md docs/integrations/chat/cohere.mdx docs/integrations/chat/cloudflare_workersai.md docs/integrations/chat/cloudflare_workersai.mdx +docs/integrations/chat/cerebras.md +docs/integrations/chat/cerebras.mdx docs/integrations/chat/bedrock_converse.md docs/integrations/chat/bedrock_converse.mdx docs/integrations/chat/bedrock.md @@ -370,6 +364,16 @@ docs/integrations/chat/arcjet.md docs/integrations/chat/arcjet.mdx docs/integrations/chat/anthropic.md docs/integrations/chat/anthropic.mdx +docs/integrations/tools/tavily_search.md +docs/integrations/tools/tavily_search.mdx +docs/integrations/tools/serpapi.md +docs/integrations/tools/serpapi.mdx +docs/integrations/tools/google_scholar.md +docs/integrations/tools/google_scholar.mdx +docs/integrations/tools/exa_search.md +docs/integrations/tools/exa_search.mdx +docs/integrations/tools/duckduckgo_search.md +docs/integrations/tools/duckduckgo_search.mdx docs/integrations/retrievers/self_query/weaviate.md docs/integrations/retrievers/self_query/weaviate.mdx docs/integrations/retrievers/self_query/vectara.md @@ -386,6 +390,16 @@ docs/integrations/retrievers/self_query/hnswlib.md docs/integrations/retrievers/self_query/hnswlib.mdx docs/integrations/retrievers/self_query/chroma.md docs/integrations/retrievers/self_query/chroma.mdx +docs/integrations/document_loaders/file_loaders/unstructured.md +docs/integrations/document_loaders/file_loaders/unstructured.mdx +docs/integrations/document_loaders/file_loaders/text.md +docs/integrations/document_loaders/file_loaders/text.mdx +docs/integrations/document_loaders/file_loaders/pdf.md +docs/integrations/document_loaders/file_loaders/pdf.mdx +docs/integrations/document_loaders/file_loaders/directory.md +docs/integrations/document_loaders/file_loaders/directory.mdx +docs/integrations/document_loaders/file_loaders/csv.md +docs/integrations/document_loaders/file_loaders/csv.mdx docs/integrations/document_loaders/web_loaders/web_puppeteer.md 
docs/integrations/document_loaders/web_loaders/web_puppeteer.mdx docs/integrations/document_loaders/web_loaders/web_cheerio.md @@ -397,14 +411,4 @@ docs/integrations/document_loaders/web_loaders/pdf.mdx docs/integrations/document_loaders/web_loaders/langsmith.md docs/integrations/document_loaders/web_loaders/langsmith.mdx docs/integrations/document_loaders/web_loaders/firecrawl.md -docs/integrations/document_loaders/web_loaders/firecrawl.mdx -docs/integrations/document_loaders/file_loaders/unstructured.md -docs/integrations/document_loaders/file_loaders/unstructured.mdx -docs/integrations/document_loaders/file_loaders/text.md -docs/integrations/document_loaders/file_loaders/text.mdx -docs/integrations/document_loaders/file_loaders/pdf.md -docs/integrations/document_loaders/file_loaders/pdf.mdx -docs/integrations/document_loaders/file_loaders/directory.md -docs/integrations/document_loaders/file_loaders/directory.mdx -docs/integrations/document_loaders/file_loaders/csv.md -docs/integrations/document_loaders/file_loaders/csv.mdx \ No newline at end of file +docs/integrations/document_loaders/web_loaders/firecrawl.mdx \ No newline at end of file diff --git a/docs/core_docs/docs/integrations/text_embedding/bytedance_doubao.ipynb b/docs/core_docs/docs/integrations/text_embedding/bytedance_doubao.ipynb new file mode 100644 index 000000000000..f5f8fcaee35d --- /dev/null +++ b/docs/core_docs/docs/integrations/text_embedding/bytedance_doubao.ipynb @@ -0,0 +1,309 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "afaf8039", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "---\n", + "sidebar_label: ByteDance Doubao\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "9a3d6f34", + "metadata": {}, + "source": [ + "# ByteDanceDoubaoEmbeddings\n", + "\n", + "This will help you get started with ByteDanceDoubao [embedding models](/docs/concepts/embedding_models) using LangChain. 
For detailed documentation on `ByteDanceDoubaoEmbeddings` features and configuration options, please refer to the [API reference](https://api.js.langchain.com/classes/_langchain_community.embeddings_bytedance_doubao.ByteDanceDoubaoEmbeddings.html).\n", + "\n", + "## Overview\n", + "### Integration details\n", + "\n", + "| Class | Package | Local | Py support | Package downloads | Package latest |\n", + "| :--- | :--- | :---: | :---: | :---: | :---: |\n", + "| [ByteDanceDoubaoEmbeddings](https://api.js.langchain.com/classes/_langchain_community.embeddings_bytedance_doubao.ByteDanceDoubaoEmbeddings.html) | [@langchain/community](https://api.js.langchain.com/modules/_langchain_community.html) | ❌ | ❌ | ![NPM - Downloads](https://img.shields.io/npm/dm/@langchain/community?style=flat-square&label=%20&) | ![NPM - Version](https://img.shields.io/npm/v/@langchain/community?style=flat-square&label=%20&) |\n", + "\n", + "## Setup\n", + "\n", + "You'll need to sign up for an [ARK API key](https://console.volcengine.com/ark/region:ark+cn-beijing/apiKey) and set it as an environment variable named `ARK_API_KEY`. 
Then you should [create an entrypoint](https://console.volcengine.com/ark/region:ark+cn-beijing/endpoint) for embedding models, and use the entrypoint's name as `model`.\n", + "\n", + "Then, you'll need to install the [`@langchain/community`](https://www.npmjs.com/package/@langchain/community) package.\n", + "\n", + "### Credentials\n", + "\n", + "If you want to get automated tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:\n", + "\n", + "```bash\n", + "# export LANGCHAIN_TRACING_V2=\"true\"\n", + "# export LANGCHAIN_API_KEY=\"your-api-key\"\n", + "```\n", + "\n", + "### Installation\n", + "\n", + "The LangChain ByteDanceDoubaoEmbeddings integration lives in the `@langchain/community` package:\n", + "\n", + "```{=mdx}\n", + "import IntegrationInstallTooltip from \"@mdx_components/integration_install_tooltip.mdx\";\n", + "import Npm2Yarn from \"@theme/Npm2Yarn\";\n", + "\n", + "<IntegrationInstallTooltip></IntegrationInstallTooltip>\n", + "\n", + "<Npm2Yarn>\n", + " @langchain/community\n", + "</Npm2Yarn>\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "45dd1724", + "metadata": {}, + "source": [ + "## Instantiation\n", + "\n", + "Now we can instantiate our model object and embed text:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9ea7a09b", + "metadata": {}, + "outputs": [], + "source": [ + "import { ByteDanceDoubaoEmbeddings } from \"@langchain/community/embeddings/bytedance_doubao\";\n", + "\n", + "const embeddings = new ByteDanceDoubaoEmbeddings({\n", + " model: 'ep-xxx-xxx' // your entrypoint's name\n", + "});" + ] + }, + { + "cell_type": "markdown", + "id": "77d271b6", + "metadata": {}, + "source": [ + "## Indexing and Retrieval\n", + "\n", + "Embedding models are often used in retrieval-augmented generation (RAG) flows, both as part of indexing data as well as later retrieving it. 
For more detailed instructions, please see our RAG tutorials under the [working with external knowledge tutorials](/docs/tutorials/#working-with-external-knowledge).\n", + "\n", + "Below, see how to index and retrieve data using the `embeddings` object we initialized above. In this example, we will index and retrieve a sample document using the demo [`MemoryVectorStore`](/docs/integrations/vectorstores/memory)." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d817716b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LangChain is the framework for building context-aware reasoning applications\n" + ] + } + ], + "source": [ + "// Create a vector store with a sample text\n", + "import { MemoryVectorStore } from \"langchain/vectorstores/memory\";\n", + "\n", + "const text = \"LangChain is the framework for building context-aware reasoning applications\";\n", + "\n", + "const vectorstore = await MemoryVectorStore.fromDocuments(\n", + " [{ pageContent: text, metadata: {} }],\n", + " embeddings,\n", + ");\n", + "\n", + "// Use the vector store as a retriever that returns a single document\n", + "const retriever = vectorstore.asRetriever(1);\n", + "\n", + "// Retrieve the most similar text\n", + "const retrievedDocuments = await retriever.invoke(\"What is LangChain?\");\n", + "\n", + "retrievedDocuments[0].pageContent;" + ] + }, + { + "cell_type": "markdown", + "id": "e02b9855", + "metadata": {}, + "source": [ + "## Direct Usage\n", + "\n", + "Under the hood, the vectorstore and retriever implementations are calling `embeddings.embedDocuments(...)` and `embeddings.embedQuery(...)` to create embeddings for the text(s) used in `fromDocuments` and the retriever's `invoke` operations, respectively.\n", + "\n", + "You can directly call these methods to get embeddings for your own use cases.\n", + "\n", + "### Embed single texts\n", + "\n", + "You can embed queries for search with `embedQuery`. 
This generates a vector representation specific to the query:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0d2befcd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\n", + " 0.026051683, 0.029081265, -0.040726297, -0.015116953, -0.010691089,\n", + " 0.030181013, -0.0065084146, -0.02079503, 0.013575795, 0.03452527,\n", + " 0.009578291, 0.007026421, -0.030110886, 0.013489622, -0.04294787,\n", + " 0.011141899, -0.043768786, -0.00362867, -0.0081198225, -0.03426076,\n", + " 0.010075142, 0.027787417, -0.09052663, -0.06039698, -0.009462592,\n", + " 0.06232288, 0.051121354, 0.011977532, 0.089046724, 0.059000008,\n", + " 0.031860664, -0.034242127, 0.020339863, 0.011483523, -0.05429335,\n", + " -0.04963588, 0.03263794, -0.05581542, 0.013908403, -0.012356067,\n", + " -0.007802118, -0.010027855, 0.00281217, -0.101886116, -0.079341754,\n", + " 0.011269771, 0.0035983133, -0.027667878, 0.032092705, -0.052843474,\n", + " -0.045283325, 0.0382421, 0.0193055, 0.011050924, 0.021132186,\n", + " -0.037696265, 0.0006107435, 0.0043520257, -0.028798066, 0.049155913,\n", + " 0.03590549, -0.0040995986, 0.019772101, -0.076119535, 0.0031298609,\n", + " 0.03368174, 0.039398745, -0.011813277, -0.019313531, -0.013108803,\n", + " -0.044905286, -0.022326004, -0.01656178, -0.06658457, 0.016789088,\n", + " 0.049952697, 0.006615693, -0.01694402, -0.018105473, 0.0049101883,\n", + " -0.004966945, 0.049762275, -0.03556957, -0.015986584, -0.03190983,\n", + " -0.05336687, -0.0020468342, -0.0016106658, -0.035291273, -0.029783724,\n", + " -0.010153295, 0.052100364, 0.05528949, 0.01379487, -0.024542747,\n", + " 0.028773975, 0.010087022, 0.030448131, -0.042391222, 0.016596776\n", + "]\n" + ] + } + ], + "source": [ + "const singleVector = await embeddings.embedQuery(text);\n", + "\n", + "console.log(singleVector.slice(0, 100));" + ] + }, + { + "cell_type": "markdown", + "id": "1b5a7d03", + "metadata": {}, + "source": [ + "### Embed 
multiple texts\n", + "\n", + "You can embed multiple texts for indexing with `embedDocuments`. The internals used for this method may (but do not have to) differ from embedding queries:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2f4d6e97", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\n", + " 0.026051683, 0.029081265, -0.040726297, -0.015116953, -0.010691089,\n", + " 0.030181013, -0.0065084146, -0.02079503, 0.013575795, 0.03452527,\n", + " 0.009578291, 0.007026421, -0.030110886, 0.013489622, -0.04294787,\n", + " 0.011141899, -0.043768786, -0.00362867, -0.0081198225, -0.03426076,\n", + " 0.010075142, 0.027787417, -0.09052663, -0.06039698, -0.009462592,\n", + " 0.06232288, 0.051121354, 0.011977532, 0.089046724, 0.059000008,\n", + " 0.031860664, -0.034242127, 0.020339863, 0.011483523, -0.05429335,\n", + " -0.04963588, 0.03263794, -0.05581542, 0.013908403, -0.012356067,\n", + " -0.007802118, -0.010027855, 0.00281217, -0.101886116, -0.079341754,\n", + " 0.011269771, 0.0035983133, -0.027667878, 0.032092705, -0.052843474,\n", + " -0.045283325, 0.0382421, 0.0193055, 0.011050924, 0.021132186,\n", + " -0.037696265, 0.0006107435, 0.0043520257, -0.028798066, 0.049155913,\n", + " 0.03590549, -0.0040995986, 0.019772101, -0.076119535, 0.0031298609,\n", + " 0.03368174, 0.039398745, -0.011813277, -0.019313531, -0.013108803,\n", + " -0.044905286, -0.022326004, -0.01656178, -0.06658457, 0.016789088,\n", + " 0.049952697, 0.006615693, -0.01694402, -0.018105473, 0.0049101883,\n", + " -0.004966945, 0.049762275, -0.03556957, -0.015986584, -0.03190983,\n", + " -0.05336687, -0.0020468342, -0.0016106658, -0.035291273, -0.029783724,\n", + " -0.010153295, 0.052100364, 0.05528949, 0.01379487, -0.024542747,\n", + " 0.028773975, 0.010087022, 0.030448131, -0.042391222, 0.016596776\n", + "]\n", + "[\n", + " 0.0558515, 0.028698817, -0.037476595, 0.0048659276, -0.019229038,\n", + " -0.04713716, -0.020947812, 
-0.017550547, 0.01205507, 0.027693441,\n", + " -0.011791304, 0.009862203, 0.019662278, -0.037511427, -0.022662448,\n", + " 0.036224432, -0.051760387, -0.030165697, -0.008899774, -0.024518963,\n", + " 0.010077767, 0.032209765, -0.0854303, -0.038666975, -0.036021013,\n", + " 0.060899545, 0.045867186, 0.003365381, 0.09387081, 0.038216405,\n", + " 0.011449426, -0.016495887, 0.020602569, -0.02368503, -0.014733645,\n", + " -0.065408126, -0.0065152845, -0.027103946, 0.00038956117, -0.08648814,\n", + " 0.029316466, -0.054449145, 0.034129277, -0.055225655, -0.043182302,\n", + " 0.0011148591, 0.044116337, -0.046552557, 0.032423045, -0.03269365,\n", + " -0.05062933, 0.021473562, -0.011019348, -0.019621233, -0.0003149565,\n", + " -0.0046085776, 0.0052610254, -0.0029293327, -0.035793293, 0.034469575,\n", + " 0.037724957, 0.009572597, 0.014198464, -0.0878237, 0.0056973165,\n", + " 0.023563445, 0.030928325, 0.025520306, 0.01836824, -0.016456697,\n", + " -0.061934732, 0.009764942, -0.035812028, -0.04429064, 0.031323086,\n", + " 0.056027107, -0.0019782048, -0.015204176, -0.008684945, -0.0010460864,\n", + " 0.054642987, 0.044149086, -0.032964867, -0.012044753, -0.019075096,\n", + " -0.027932597, 0.018542245, -0.02602878, -0.04645578, -0.020976603,\n", + " 0.018999187, 0.050663687, 0.016725155, 0.0076955976, 0.011448177,\n", + " 0.053931057, -0.03234989, 0.024429373, -0.023123834, 0.02197912\n", + "]\n" + ] + } + ], + "source": [ + "const text2 = \"LangGraph is a library for building stateful, multi-actor applications with LLMs\";\n", + "\n", + "const vectors = await embeddings.embedDocuments([text, text2]);\n", + "\n", + "console.log(vectors[0].slice(0, 100));\n", + "console.log(vectors[1].slice(0, 100));" + ] + }, + { + "cell_type": "markdown", + "id": "b48d41bb", + "metadata": {}, + "source": [ + "## Related\n", + "\n", + "- Embedding model [conceptual guide](/docs/concepts/embedding_models)\n", + "- Embedding model [how-to guides](/docs/how_to/#embedding-models)" + ] + }, + { + 
"cell_type": "markdown", + "id": "eacd89fe", + "metadata": {}, + "source": [ + "## API reference\n", + "\n", + "For detailed documentation of all ByteDanceDoubaoEmbeddings features and configurations head to the API reference: https://api.js.langchain.com/classes/_langchain_community.embeddings_bytedance_doubao.ByteDanceDoubaoEmbeddings.html" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "TypeScript", + "language": "typescript", + "name": "tslab" + }, + "language_info": { + "codemirror_mode": { + "mode": "typescript", + "name": "javascript", + "typescript": true + }, + "file_extension": ".ts", + "mimetype": "text/typescript", + "name": "typescript", + "version": "3.7.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/.env.example b/examples/.env.example index 9aae33991e92..83a320555978 100644 --- a/examples/.env.example +++ b/examples/.env.example @@ -84,7 +84,8 @@ HANA_HOST=HANA_DB_ADDRESS HANA_PORT=HANA_DB_PORT HANA_UID=HANA_DB_USER HANA_PWD=HANA_DB_PASSWORD +ARK_API_KEY=ADD_YOURS_HERE # https://console.volcengine.com/ JIRA_HOST=ADD_YOURS_HERE JIRA_USERNAME=ADD_YOURS_HERE JIRA_ACCESS_TOKEN=ADD_YOURS_HERE -JIRA_PROJECT_KEY=ADD_YOURS_HERE \ No newline at end of file +JIRA_PROJECT_KEY=ADD_YOURS_HERE diff --git a/examples/src/embeddings/bytedance_doubao.ts b/examples/src/embeddings/bytedance_doubao.ts new file mode 100644 index 000000000000..a7a669decb6b --- /dev/null +++ b/examples/src/embeddings/bytedance_doubao.ts @@ -0,0 +1,9 @@ +import { ByteDanceDoubaoEmbeddings } from "@langchain/community/embeddings/bytedance_doubao"; + +const model = new ByteDanceDoubaoEmbeddings({ /* the API key is taken from the ARK_API_KEY environment variable when apiKey is not passed */ + model: "ep-xxx-xxx", /* placeholder: replace with the name of your own embedding entrypoint */ +}); +const res = await model.embedQuery( + "What would be a good company name a company that makes colorful socks?" 
+); +console.log({ res }); /* res is the embedding vector (array of numbers) returned for the query */ diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore index b1cef6d2c9fb..49f87dc328fb 100644 --- a/libs/langchain-community/.gitignore +++ b/libs/langchain-community/.gitignore @@ -146,6 +146,10 @@ embeddings/bedrock.cjs embeddings/bedrock.js embeddings/bedrock.d.ts embeddings/bedrock.d.cts +embeddings/bytedance_doubao.cjs +embeddings/bytedance_doubao.js +embeddings/bytedance_doubao.d.ts +embeddings/bytedance_doubao.d.cts embeddings/cloudflare_workersai.cjs embeddings/cloudflare_workersai.js embeddings/cloudflare_workersai.d.ts diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js index 46853f88792b..dc58963eed75 100644 --- a/libs/langchain-community/langchain.config.js +++ b/libs/langchain-community/langchain.config.js @@ -72,6 +72,7 @@ export const config = { "embeddings/alibaba_tongyi": "embeddings/alibaba_tongyi", "embeddings/baidu_qianfan": "embeddings/baidu_qianfan", "embeddings/bedrock": "embeddings/bedrock", + "embeddings/bytedance_doubao": "embeddings/bytedance_doubao", "embeddings/cloudflare_workersai": "embeddings/cloudflare_workersai", "embeddings/cohere": "embeddings/cohere", "embeddings/deepinfra": "embeddings/deepinfra", diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index 7632c8e7bcbf..56af665d9fa9 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -1051,6 +1051,15 @@ "import": "./embeddings/bedrock.js", "require": "./embeddings/bedrock.cjs" }, + "./embeddings/bytedance_doubao": { + "types": { + "import": "./embeddings/bytedance_doubao.d.ts", + "require": "./embeddings/bytedance_doubao.d.cts", + "default": "./embeddings/bytedance_doubao.d.ts" + }, + "import": "./embeddings/bytedance_doubao.js", + "require": "./embeddings/bytedance_doubao.cjs" + }, "./embeddings/cloudflare_workersai": { 
"types": { "import": "./embeddings/cloudflare_workersai.d.ts", @@ 
-3336,6 +3345,10 @@ "embeddings/bedrock.js", "embeddings/bedrock.d.ts", "embeddings/bedrock.d.cts", + "embeddings/bytedance_doubao.cjs", + "embeddings/bytedance_doubao.js", + "embeddings/bytedance_doubao.d.ts", + "embeddings/bytedance_doubao.d.cts", "embeddings/cloudflare_workersai.cjs", "embeddings/cloudflare_workersai.js", "embeddings/cloudflare_workersai.d.ts", diff --git a/libs/langchain-community/src/embeddings/bytedance_doubao.ts b/libs/langchain-community/src/embeddings/bytedance_doubao.ts new file mode 100644 index 000000000000..d542bc14c989 --- /dev/null +++ b/libs/langchain-community/src/embeddings/bytedance_doubao.ts @@ -0,0 +1,176 @@ +import { getEnvironmentVariable } from "@langchain/core/utils/env"; +import { Embeddings, type EmbeddingsParams } from "@langchain/core/embeddings"; +import { chunkArray } from "@langchain/core/utils/chunk_array"; + +export interface ByteDanceDoubaoEmbeddingsParams extends EmbeddingsParams { + /** Model name to use, i.e. the name of the ARK embedding entrypoint (for example ep-xxx-xxx) */ + model: string; + + /** + * Timeout to use when making requests to ByteDanceDoubao. NOTE(review): nothing in this file reads this field - confirm it is honoured elsewhere. + */ + timeout?: number; + + /** + * The maximum number of documents to embed in a single request. This is + * limited by the ByteDanceDoubao API to a maximum of 2048 (TODO confirm); the class default is 24. + */ + batchSize?: number; + + /** + * Whether to replace new lines in the input text with spaces (class default: true). 
+ */ + stripNewLines?: boolean; +} + +interface EmbeddingCreateParams { /* JSON body POSTed to the embeddings endpoint */ + model: ByteDanceDoubaoEmbeddingsParams["model"]; + input: string[]; + encoding_format?: "float"; +} + +interface EmbeddingResponse { /* success payload; the class reads only data[].embedding */ + data: { + index: number; + embedding: number[]; + }[]; + + usage: { + prompt_tokens: number; + total_tokens: number; + }; + + id: string; +} + +interface EmbeddingErrorResponse { /* error payload; recognised by a truthy code field */ + type: string; + code: string; + param: string; + message: string; +} + +export class ByteDanceDoubaoEmbeddings /* Embeddings implementation that POSTs text batches to the ByteDanceDoubao (ARK) embeddings endpoint */ + extends Embeddings + implements ByteDanceDoubaoEmbeddingsParams +{ + model: string; + + batchSize = 24; + + stripNewLines = true; + + apiKey: string; + + constructor( /* throws when no API key is passed and ARK_API_KEY is unset */ + fields?: Partial & { /* NOTE(review): type arguments (Partial, Promise) look stripped in this view - confirm against the real file */ + verbose?: boolean; + apiKey?: string; + } + ) { + const fieldsWithDefaults = { maxConcurrency: 2, ...fields }; /* a caller-supplied maxConcurrency overrides the default of 2 */ + super(fieldsWithDefaults); + + const apiKey = + fieldsWithDefaults?.apiKey ?? getEnvironmentVariable("ARK_API_KEY"); + + if (!apiKey) throw new Error("ByteDanceDoubao API key not found"); + + this.apiKey = apiKey; + + this.model = fieldsWithDefaults?.model ?? this.model; /* stays undefined when no model is supplied */ + this.batchSize = fieldsWithDefaults?.batchSize ?? this.batchSize; + this.stripNewLines = + fieldsWithDefaults?.stripNewLines ?? this.stripNewLines; + } + + /** + * Method to generate embeddings for an array of documents. Splits the + * documents into batches and makes requests to the ByteDanceDoubao API to generate + * embeddings. 
All batch requests are issued together and awaited with Promise.all. + * @param texts Array of documents to generate embeddings for. + * @returns Promise that resolves to a 2D array of embeddings for each document. + */ + async embedDocuments(texts: string[]): Promise { + const batches = chunkArray( + this.stripNewLines ? 
texts.map((t) => t.replace(/\n/g, " ")) : texts, + this.batchSize + ); + const batchRequests = batches.map((batch) => { + const params = this.getParams(batch); + + return this.embeddingWithRetry(params); + }); + + const batchResponses = await Promise.all(batchRequests); + const embeddings: number[][] = []; + + for (let i = 0; i < batchResponses.length; i += 1) { + const batch = batches[i]; + const batchResponse = batchResponses[i] || []; + for (let j = 0; j < batch.length; j += 1) { + embeddings.push(batchResponse[j]); /* positional match: the j-th vector is taken for the j-th input; the response index field is not consulted */ + } + } + + return embeddings; + } + + /** + * Method to generate an embedding for a single document. Calls the + * embeddingWithRetry method with the document as the only input. + * @param text Document to generate an embedding for. + * @returns Promise that resolves to an embedding for the document. + */ + async embedQuery(text: string): Promise { + const params = this.getParams([ + this.stripNewLines ? text.replace(/\n/g, " ") : text, + ]); + + const embeddings = (await this.embeddingWithRetry(params)) || [[]]; + return embeddings[0]; + } + + /** + * Builds the request body for the embeddings endpoint. + * @param texts Array of documents to generate embeddings for. + * @returns Request body carrying the configured model and the given texts. + */ + private getParams( + texts: EmbeddingCreateParams["input"] + ): EmbeddingCreateParams { + return { + model: this.model, + input: texts, + }; + } + + /** + * Private method to make a request to the ByteDanceDoubao API to generate + * embeddings. NOTE(review): despite the name, no retry logic is visible here; a single + * fetch is issued, and an error is thrown when the JSON response carries a truthy code field. + * @param body Request body to send to the ByteDanceDoubao embeddings endpoint. + * @returns Promise that resolves to the embedding vectors taken from the response data array. 
+ */ + private async embeddingWithRetry(body: EmbeddingCreateParams) { + return fetch("https://ark.cn-beijing.volces.com/api/v3/embeddings", { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }, + body: JSON.stringify(body), + }).then(async (response) => { + const embeddingData: EmbeddingResponse | EmbeddingErrorResponse = + await response.json(); /* the HTTP status is not inspected; only the parsed JSON body is */ + + if ("code" in embeddingData && embeddingData.code) { + throw new Error(`${embeddingData.code}: ${embeddingData.message}`); + } + + return (embeddingData as EmbeddingResponse).data.map( + ({ embedding }) => embedding + ); + }); + } +} diff --git a/libs/langchain-community/src/embeddings/tests/bytedance_doubao.int.test.ts b/libs/langchain-community/src/embeddings/tests/bytedance_doubao.int.test.ts new file mode 100644 index 000000000000..f584e1dfbba2 --- /dev/null +++ b/libs/langchain-community/src/embeddings/tests/bytedance_doubao.int.test.ts @@ -0,0 +1,40 @@ +import { test, expect } from "@jest/globals"; +import { ByteDanceDoubaoEmbeddings } from "../bytedance_doubao.js"; + +const modelName = "ep-xxx-xxx"; /* placeholder entrypoint name; every test below is registered with test.skip */ +test.skip("Test ByteDanceDoubaoEmbeddings.embedQuery", async () => { + const embeddings = new ByteDanceDoubaoEmbeddings({ + model: modelName, + }); + const res = await embeddings.embedQuery("Hello world"); + expect(typeof res[0]).toBe("number"); +}); + +test.skip("Test ByteDanceDoubaoEmbeddings.embedDocuments", async () => { + const embeddings = new ByteDanceDoubaoEmbeddings({ + model: modelName, + }); + const res = await embeddings.embedDocuments(["Hello world", "Bye bye"]); + expect(res).toHaveLength(2); + expect(typeof res[0][0]).toBe("number"); + expect(typeof res[1][0]).toBe("number"); +}); + +test.skip("Test ByteDanceDoubaoEmbeddings concurrency", async () => { + const embeddings = new ByteDanceDoubaoEmbeddings({ + 
model: modelName, + batchSize: 1, + }); + const res = await embeddings.embedDocuments([ + "Hello world", + "Bye bye", + "Hello 
world", + "Bye bye", + "Hello world", + "Bye bye", + ]); + expect(res).toHaveLength(6); + expect(res.find((embedding) => typeof embedding[0] !== "number")).toBe( + undefined + ); +}); diff --git a/libs/langchain-community/src/load/import_map.ts b/libs/langchain-community/src/load/import_map.ts index daafa8c76bfa..2f6f5d86660a 100644 --- a/libs/langchain-community/src/load/import_map.ts +++ b/libs/langchain-community/src/load/import_map.ts @@ -27,6 +27,7 @@ export * as agents__toolkits__base from "../agents/toolkits/base.js"; export * as agents__toolkits__connery from "../agents/toolkits/connery/index.js"; export * as embeddings__alibaba_tongyi from "../embeddings/alibaba_tongyi.js"; export * as embeddings__baidu_qianfan from "../embeddings/baidu_qianfan.js"; +export * as embeddings__bytedance_doubao from "../embeddings/bytedance_doubao.js"; export * as embeddings__deepinfra from "../embeddings/deepinfra.js"; export * as embeddings__fireworks from "../embeddings/fireworks.js"; export * as embeddings__minimax from "../embeddings/minimax.js";