-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path1a_rag_basics.py
50 lines (40 loc) · 1.87 KB
/
1a_rag_basics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import os
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
# One-time ingestion script: on first run, load books/odyssey.txt, split it
# into chunks, embed the chunks with OpenAI and persist them in a Chroma
# vector store under db/chroma_db. On later runs the existing store is reused.

# Define the directory containing the text file and the persistent directory.
# Both paths are resolved relative to this script, not the working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))
file_path = os.path.join(current_dir, "books", "odyssey.txt")
persistent_directory = os.path.join(current_dir, "db", "chroma_db")

# Check if the Chroma vector store already exists.
# NOTE(review): only the directory's existence is tested; a directory left
# behind by an interrupted run would be treated as a finished store — delete
# db/chroma_db manually to force a rebuild.
if not os.path.exists(persistent_directory):
    print("Persistent directory does not exist. Initializing vector store...")

    # Ensure the text file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(
            f"The file {file_path} does not exist. Please check the path."
        )

    # Read the text content from the file. The encoding is passed explicitly
    # so the load does not depend on the platform default (e.g. cp1252 on
    # Windows); assumes the book file is UTF-8 — adjust if it is not.
    loader = TextLoader(file_path, encoding="utf-8")
    documents = loader.load()

    # Split the document into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    # Fail early with a clear message instead of an IndexError on docs[0]
    # below, and before any embedding request is made.
    if not docs:
        raise ValueError(
            f"No text chunks were produced from {file_path}. Is the file empty?"
        )

    # Display information about the split documents
    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(docs)}")
    print(f"Sample chunk:\n{docs[0].page_content}\n")

    # Create embeddings.
    # This only constructs the embedding client; the chunks are handed to it
    # later, inside Chroma.from_documents.
    print("\n--- Creating embeddings ---")
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small"
    )  # Update to a valid embedding model if needed
    print("\n--- Finished creating embeddings ---")

    # Create the vector store and persist it automatically
    print("\n--- Creating vector store ---")
    db = Chroma.from_documents(
        docs, embeddings, persist_directory=persistent_directory
    )
    print("\n--- Finished creating vector store ---")
else:
    print("Vector store already exists. No need to initialize.")