embed_text.py
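
"""
Generate an OpenAI embedding for each plain-text documentation section in the
input directory and upsert it into a Pinecone index, keyed by the
docs.opensafely.org URL the section maps to.

Illustrative invocation (assumes OPENAI_API_KEY, PINECONE_API_KEY and
PINECONE_INDEX_NAME are set in the environment or a .env file):

    python embed_text.py --input_dir copilot/data/doc-sections
"""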
import argparse
import os
import re
import time
import openai
import pinecone
from dotenv import load_dotenv


def get_docs_link(file_path, input_dir="copilot/data/doc-sections"):
    """
    Converts a file path to a documentation link.

    Args:
        file_path (str): Path to the input file.
        input_dir (str): Path to the input directory.

    Returns:
        str: A documentation link.
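
    Example (illustrative file name, not taken from the repository):
        "copilot/data/doc-sections/actions_section_Scheduling.txt"
        -> "https://docs.opensafely.org/actions/#scheduling"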
"""
# Convert file path to a URL path
converted_path = file_path.replace(f"{input_dir}\\", "")
converted_path = re.sub(r"_section_", "/", converted_path)
converted_path = converted_path.replace(".txt", "")
converted_path = converted_path.replace("_", "-")
converted_path = converted_path.lower()
# Add anchor tag to URL
converted_path = re.sub(r"(.*)/(.*)", r"\1/#\2", converted_path)
# Remove "#no-header" from URL
converted_path = re.sub(r"(.*)/#no-header", r"\1", converted_path)
# Return URL
return f"https://docs.opensafely.org/{converted_path}"


def process_files(input_dir, index_name):
    """
    Processes input files and generates embeddings.

    Args:
        input_dir (str): Path to the input directory.
        index_name (str): Name of the Pinecone index.
    """
    # Get list of input files
    files = os.listdir(input_dir)
    txt_files = [file for file in files if file.endswith(".txt")]
    # Loop through input files and generate embeddings
    for txt_file in txt_files:
        input_file_path = os.path.join(input_dir, txt_file)
        with open(input_file_path, "r", encoding="utf-8") as file:
            # Read input file
            plain_text = file.read()
        # Get documentation link
        link = get_docs_link(input_file_path, input_dir)
        print(f"Processing {input_file_path}...")
        # Generate embedding
        gpt3_embedding = get_embedding(plain_text)
        # Upload embedding to Pinecone
        if gpt3_embedding is not None:
            key = link
            index = pinecone.Index(index_name)
            index.upsert([(key, gpt3_embedding, {"text": plain_text})])
        # Wait for 1 second to avoid rate limit
        time.sleep(1)


def get_embedding(text, model="text-embedding-ada-002"):
    """
    Generates an embedding for a given text using OpenAI API.

    Args:
        text (str): The input text.
        model (str): The name of the OpenAI model to use for generating embeddings.

    Returns:
        list: A list of 1536 floating point numbers representing the embedding.
    """
    # Remove newline characters from text
    text = text.replace("\n", " ")
    # Check if text is empty
    if text == "":
        return None
    # Generate embedding using OpenAI API
    embedding = openai.Embedding.create(input=[text], model=model)
    return embedding["data"][0]["embedding"]


def parse_args():
    """
    Parses command line arguments.

    Returns:
        argparse.Namespace: A namespace object containing the parsed arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_dir",
        type=str,
        default="copilot/data/doc-sections",
        help="Path to the input directory.",
    )
    return parser.parse_args()


def main():
    """
    Main function.
    """
    # Parse command line arguments
    args = parse_args()
    # Set the input directory
    input_dir = args.input_dir
    load_dotenv()
    # Get API keys from environment variables
    openai_api_key = os.getenv("OPENAI_API_KEY")
    pinecone_api_key = os.getenv("PINECONE_API_KEY")
    # Initialize Pinecone and create an index
    pinecone.init(api_key=pinecone_api_key, environment="us-west4-gcp")
    index_name = os.getenv("PINECONE_INDEX_NAME")
    if index_name not in pinecone.list_indexes():
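        # text-embedding-ada-002 embeddings are 1536-dimensional, so the
        # index dimension must match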
        pinecone.create_index(name=index_name, dimension=1536)
    # Initialize OpenAI API
    openai.api_key = openai_api_key
    # Process input files
    process_files(input_dir=input_dir, index_name=index_name)


if __name__ == "__main__":
    main()