2a_rag_basics_metadata.py
import os

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the directory containing the text files and the persistent directory
current_dir = os.path.dirname(os.path.abspath(__file__))
books_dir = os.path.join(current_dir, "books")
db_dir = os.path.join(current_dir, "db")
persistent_directory = os.path.join(db_dir, "chroma_db_with_metadata")

print(f"Books directory: {books_dir}")
print(f"Persistent directory: {persistent_directory}")

# Check if the Chroma vector store already exists
if not os.path.exists(persistent_directory):
    print("Persistent directory does not exist. Initializing vector store...")

    # Ensure the books directory exists
    if not os.path.exists(books_dir):
        raise FileNotFoundError(
            f"The directory {books_dir} does not exist. Please check the path."
        )

    # List all text files in the directory
    book_files = [f for f in os.listdir(books_dir) if f.endswith(".txt")]

    # Read the text content from each file and store it with metadata
    documents = []
    for book_file in book_files:
        file_path = os.path.join(books_dir, book_file)
        loader = TextLoader(file_path)
        book_docs = loader.load()
        for doc in book_docs:
            # Add metadata to each document indicating its source
            doc.metadata = {"source": book_file}
            documents.append(doc)

    # Split the documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    # Display information about the split documents
    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(docs)}")

    # Create embeddings
    print("\n--- Creating embeddings ---")
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small"
    )  # Update to a valid embedding model if needed
    print("\n--- Finished creating embeddings ---")

    # Create the vector store and persist it
    print("\n--- Creating and persisting vector store ---")
    db = Chroma.from_documents(
        docs, embeddings, persist_directory=persistent_directory
    )
    print("\n--- Finished creating and persisting vector store ---")
else:
    print("Vector store already exists. No need to initialize.")