diff --git a/backend/README.md b/backend/README.md new file mode 100644 index 000000000..56cccfe39 --- /dev/null +++ b/backend/README.md @@ -0,0 +1,55 @@ +# Flask Backend + +This is the backend for the project, built using Flask. + +## Installation + +1. Install the required dependencies: + + ```bash + conda env create -f environment.yml + conda activate bluebook + ``` + +2. Your IP address must be allowlisted in MongoDB before you can query the database. +3. You will also need to create a `.env` file containing your MongoDB connection string and your OpenAI API key: + +``` +MONGO_URI="mongodb+srv://xxx" +OPENAI_API_KEY="sk-xxx" +``` + +## Usage + +1. Start the Flask server: + + ```bash + python app.py + ``` + +2. The server will listen on `http://localhost:8000`. + +3. Use your favorite API client (e.g., Postman) to send a POST request to `http://localhost:8000/api/chat` with the following JSON payload: + + ```json + { + "role": "user", + "content": "Tell me some courses about personal finance" + } + ``` + + You should receive a response containing the recommended courses, like this: + + ```json + { + "courses": [ + { + "course_code": "ECON 436", + "description": "How much should I be saving at age 35? How much of my portfolio should be invested in stocks at age 50? Which mortgage should I choose, and when should I refinance it? How much can I afford to spend per year in retirement? This course covers prescriptive models of personal saving, asset allocation, borrowing, and spending. The course is designed to answer questions facing anybody who manages their own money or is a manager in an organization that is trying to help clients manage their money.", + "title": "Personal Finance" + }, + ... + ], + "response": "To learn more about personal finance, you can start by taking courses or workshops that focus on financial management, budgeting, investing, and retirement planning. 
Some universities and educational platforms offer online courses on personal finance, such as ECON 436: Personal Finance and ECON 361: Corporate Finance. Additionally, you can explore resources like books, podcasts, and websites dedicated to personal finance advice and tips. It may also be helpful to consult with a financial advisor or planner for personalized guidance on managing your finances effectively." + } + ``` diff --git a/backend/__pycache__/lib.cpython-311.pyc b/backend/__pycache__/lib.cpython-311.pyc new file mode 100644 index 000000000..55266ca8b Binary files /dev/null and b/backend/__pycache__/lib.cpython-311.pyc differ diff --git a/backend/__pycache__/lib.cpython-312.pyc b/backend/__pycache__/lib.cpython-312.pyc new file mode 100644 index 000000000..24c8e2cf1 Binary files /dev/null and b/backend/__pycache__/lib.cpython-312.pyc differ diff --git a/backend/app.py b/backend/app.py index 0f1548ed9..e67011404 100644 --- a/backend/app.py +++ b/backend/app.py @@ -1,100 +1,148 @@ from flask import Flask, request, jsonify +from flask_cors import CORS import os from dotenv import load_dotenv from lib import chat_completion_request, create_embedding import json - from pymongo.mongo_client import MongoClient -uri = "mongodb+srv://bluebookairoot:@bluebookcluster.0hf4pzi.mongodb.net/?retryWrites=true&w=majority&appName=BluebookCluster" +COURSE_QUERY_LIMIT = 5 + +load_dotenv() -# connect to the MongoDB cluster +# database initialization +uri = os.getenv('MONGO_URI') client = MongoClient(uri) -db = client['bluebookai'] -collection = db['course-info'] +db = client['course_db'] +collection = db['parsed_courses'] +# mongo connection try: client.admin.command('ping') print("Pinged your deployment. 
You successfully connected to MongoDB!") except Exception as e: print(e) +# flask app = Flask(__name__) +CORS(app) -load_dotenv() - -@app.route('/chat', methods=['POST']) +@app.route('/api/chat', methods=['POST']) def chat(): + data = request.get_json() - if 'message' not in data: - return jsonify({"error": "Missing 'messages' in request body"}), 400 - user_messages = data['message'] - response = chat_completion_request(messages=user_messages) - message = response.choices[0].message - print(message) - # if message.tool_calls is None: - # return 'success' - # args = json.loads(message.tool_calls[0].function.arguments) - # query_vector = create_embedding(user_messages[-1]['content']) - # database_response = collection.aggregate([ - # { - # '$vectorSearch': { - # 'index': 'course-rating-index', - # 'path': 'embedding', - # 'filter': { - # 'rating': { - # args['operator']: args['rating'] - # } - # }, - # 'queryVector': query_vector, - # 'numCandidates': 5, - # 'limit': 5 - # } - # } - # ]) - # # print(database_response) - - # top_class = list(database_response)[0] - # json_response = { - # 'title': top_class['title'], - # 'rating': top_class['rating'], - # } - # return jsonify(json_response) - - # "{\"operator\":\"$gt\",\"rating\":4}" + + # remove id before sending to OpenAI + for message in user_messages: + if 'id' in message: + del message['id'] + if message['role'] == 'ai': + message['role'] = 'assistant' + + print(user_messages) + + # for safety check, not to be included in final response + user_messages_safety_check = user_messages.copy() + user_messages_safety_check.append({ + 'role': 'user', + 'content': 'Am I asking for help with courses or academics? Answer "yes" or "no".' 
+ }) + + response_safety_check = chat_completion_request(messages=user_messages_safety_check) + response_safety_check = response_safety_check.choices[0].message.content - # ChatCompletionMessage(content=None, role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_Ub07GeA6kaC2OZ8b8KlVmtZz', function=Function(arguments='{\n "subject_code": "CPSC",\n "rating": 3.5,\n "comparison_operator_rating": "$gte",\n "workload": 1,\n "comparison_operator_workload": "$lte"\n}', name='CourseFilter'), type='function')]) + if 'no' in response_safety_check.lower(): + response = 'I am sorry, but I can only assist with questions related to courses or academics at this time.' + json_response = { + 'response': response, + 'courses': [] + } + print('failed safety check') + return jsonify(json_response) + else: + print('passed safety check') + + # adding system message if user message does not include a system message header + if user_messages[0]['role'] != 'system': + user_messages.insert(0, { + 'role': 'system', + 'content': 'Your name is Eli. You are a helpful assistant for Yale University students to ask questions about courses and academics.' + }) + + # checking if database query is necessary + user_messages_database_relevancy_check = user_messages.copy() + user_messages_database_relevancy_check.append({ + 'role': 'user', + 'content': 'Will you be able to better answer my questions with information about specific courses related to the user query at Yale University? You should answer "yes" if you need information about courses at Yale that you don\'t have, otherwise you should answer "no".' 
+ }) + + user_messages_database_relevancy_check = chat_completion_request(messages=user_messages_database_relevancy_check) + response_user_messages_database_relevancy_check = user_messages_database_relevancy_check.choices[0].message.content + if 'no' in response_user_messages_database_relevancy_check.lower(): + response = chat_completion_request(messages=user_messages) + response = response.choices[0].message.content + json_response = { + 'response': response, + 'courses': [] + } + print('no need to query database for course information') + return jsonify(json_response) + else: + print('need to query database for course information') + + # create embedding for user message to query against vector index query_vector = create_embedding(user_messages[-1]['content']) - print(user_messages[-1]) database_response = collection.aggregate([ - { + { '$vectorSearch': { - 'index': 'course-rating-index', + 'index': 'parsed_courses_title_description_index', 'path': 'embedding', + # 'filter': { + # 'rating': { + # args['operator']: args['rating'] + # } + # }, 'queryVector': query_vector, - 'numCandidates': 5, - 'limit': 5 - } + 'numCandidates': 30, + 'limit': COURSE_QUERY_LIMIT } - ]) + } + ]) + + database_response = list(database_response) + + recommended_courses = [ + { + 'course_code': course['course_code'], + 'title': course['title'], + 'description': course['description'], + 'areas': course['areas'] + } for course in database_response + ] + + recommendation_prompt = f'Here are some courses that might be relevant to the user request:\n\n' + for course in recommended_courses: + recommendation_prompt += f'{course["course_code"]}: {course["title"]}\n{course["description"]}\n\n' + recommendation_prompt += 'Provide a response to the user. Incorporate specific course information if it is relevant to the user request.' 
+ + user_messages.append({ + 'role': 'system', + 'content': recommendation_prompt + }) + + response = chat_completion_request(messages=user_messages) + response = response.choices[0].message.content + - classes = list(database_response) - # top_class = classes[0] - print([c['title'] for c in classes]) - top_class = classes[0] json_response = { - # 'message': [{ - # 'role': response.choices[0].message.role, - # 'content': response.choices[0].message.content, - # }] - 'title': top_class['title'], - # 'rating': top_class['rating'], + 'response': response, + 'courses': recommended_courses } return jsonify(json_response) if __name__ == '__main__': - app.run(debug=True) + app.run(debug=True, port=8000) diff --git a/backend/archive/chroma_demo.ipynb b/backend/archive/chroma_demo.ipynb new file mode 100644 index 000000000..fec7a23bd --- /dev/null +++ b/backend/archive/chroma_demo.ipynb @@ -0,0 +1,219 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install chromadb\n", + "!pip install openai\n", + "!pip install python-dotenv" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "import chromadb\n", + "import chromadb.utils.embedding_functions as embedding_functions\n", + "openai_ef = embedding_functions.OpenAIEmbeddingFunction(\n", + " api_key=os.getenv(\"OPENAI_API_KEY\"),\n", + " model_name=\"text-embedding-ada-002\"\n", + " )\n", + "chroma_client = chromadb.Client()\n", + "collection = chroma_client.create_collection(name=\"my_collection\", embedding_function=openai_ef)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "collection.add(\n", + " documents=[\"This class sucks!\", \"I love this class!\", \"I have learned NOTHING from this class\", \"This is a must take\"],\n", + " 
metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}, {\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n", + " ids=[\"id1\", \"id2\", \"id3\", \"id4\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'ids': [['id4', 'id2']], 'distances': [[0.4128364324569702, 0.46640193462371826]], 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]], 'embeddings': None, 'documents': [['This is a must take', 'I love this class!']], 'uris': None, 'data': None}\n" + ] + } + ], + "source": [ + "results = collection.query(\n", + " query_texts=[\"positive reviews\"],\n", + " n_results=2\n", + ")\n", + "print(results)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "collection = chroma_client.create_collection(name=\"course_list\", embedding_function=openai_ef)\n", + "collection.add(\n", + " documents=[\"CPSC 439 Software Engineering\", \"CPSC 484 Human Computer Interaction\", \"CPSC 323 Introduction to Systems Programming\", \"CPSC 471 Trustworthy Deep Learning\"],\n", + " metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}, {\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n", + " ids=[\"id1\", \"id2\", \"id3\", \"id4\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'ids': [['id1']],\n", + " 'distances': [[0.4114447236061096]],\n", + " 'metadatas': [[{'source': 'my_source'}]],\n", + " 'embeddings': None,\n", + " 'documents': [['CPSC 439 Software Engineering']],\n", + " 'uris': None,\n", + " 'data': None}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "collection.query(\n", + " query_texts=[\"devops\"],\n", + " n_results=1\n", + ")" + ] + }, + { + "cell_type": "code", + 
"execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'ids': [['id3']],\n", + " 'distances': [[0.3774021863937378]],\n", + " 'metadatas': [[{'source': 'my_source'}]],\n", + " 'embeddings': None,\n", + " 'documents': [['CPSC 323 Introduction to Systems Programming']],\n", + " 'uris': None,\n", + " 'data': None}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "collection.query(\n", + " query_texts=[\"systems\"],\n", + " n_results=1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'ids': [['id2']],\n", + " 'distances': [[0.3394881784915924]],\n", + " 'metadatas': [[{'source': 'my_source'}]],\n", + " 'embeddings': None,\n", + " 'documents': [['CPSC 484 Human Computer Interaction']],\n", + " 'uris': None,\n", + " 'data': None}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "collection.query(\n", + " query_texts=[\"hci\"],\n", + " n_results=1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'ids': [['id4']],\n", + " 'distances': [[0.4633175730705261]],\n", + " 'metadatas': [[{'source': 'my_source'}]],\n", + " 'embeddings': None,\n", + " 'documents': [['CPSC 471 Trustworthy Deep Learning']],\n", + " 'uris': None,\n", + " 'data': None}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "collection.query(\n", + " query_texts=[\"DL\"],\n", + " n_results=1\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/backend/environment.yml b/backend/environment.yml new file mode 100644 index 000000000..d868718fe --- /dev/null +++ b/backend/environment.yml @@ -0,0 +1,49 @@ +name: bluebook +channels: + - defaults +dependencies: + - bzip2=1.0.8=h80987f9_5 + - ca-certificates=2024.3.11=hca03da5_0 + - expat=2.5.0=h313beb8_0 + - libcxx=14.0.6=h848a8c0_0 + - libffi=3.4.4=hca03da5_0 + - ncurses=6.4=h313beb8_0 + - openssl=3.0.13=h1a28f6b_0 + - pip=23.3.1=py312hca03da5_0 + - python=3.12.2=h99e199e_0 + - readline=8.2=h1a28f6b_0 + - setuptools=68.2.2=py312hca03da5_0 + - sqlite=3.41.2=h80987f9_0 + - tk=8.6.12=hb8d0fd4_0 + - tzdata=2024a=h04d1e81_0 + - wheel=0.41.2=py312hca03da5_0 + - xz=5.4.6=h80987f9_0 + - zlib=1.2.13=h5a0b063_0 + - pip: + - annotated-types==0.6.0 + - anyio==4.3.0 + - blinker==1.7.0 + - certifi==2024.2.2 + - click==8.1.7 + - distro==1.9.0 + - dnspython==2.6.1 + - flask==3.0.2 + - h11==0.14.0 + - httpcore==1.0.5 + - httpx==0.27.0 + - idna==3.6 + - itsdangerous==2.1.2 + - jinja2==3.1.3 + - markupsafe==2.1.5 + - openai==1.14.3 + - pydantic==2.6.4 + - pydantic-core==2.16.3 + - pymongo==4.6.3 + - python-dotenv==1.0.1 + - python-env==1.0.0 + - sniffio==1.3.1 + - tenacity==8.2.3 + - tqdm==4.66.2 + - typing-extensions==4.10.0 + - werkzeug==3.0.1 +prefix: /opt/anaconda3/envs/bluebook diff --git a/backend/lib.py b/backend/lib.py index 7aa98e8d5..21ca9e4c3 100644 --- a/backend/lib.py +++ b/backend/lib.py @@ -2,6 +2,11 @@ import os from openai import OpenAI import json +from dotenv import load_dotenv + +load_dotenv() + +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") # Open the JSON file with open('course_subjects.json', 'r') as file: @@ -18,7 +23,7 @@ "type": "object", "properties": { "subject_code": { - ~ "type": "string", + "type": "string", "enum": [str(key) for key in subjects.keys()], "description": "A code for the subject of 
instruction", }, @@ -47,8 +52,7 @@ } ] -# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) -client = OpenAI(api_key="") +client = OpenAI(api_key=OPENAI_API_KEY) @retry(wait=wait_random_exponential(multiplier=1, max=40), stop=stop_after_attempt(3)) def create_embedding(text, model='text-embedding-3-small'): @@ -64,7 +68,7 @@ def create_embedding(text, model='text-embedding-3-small'): return e @retry(wait=wait_random_exponential(multiplier=1, max=40), stop=stop_after_attempt(3)) -def chat_completion_request(messages, tools=None, tool_choice=None, model='gpt-3.5-turbo'): +def chat_completion_request(messages, tools=None, tool_choice=None, model='gpt-4'): try: response = client.chat.completions.create( model=model, diff --git a/backend/process_data.ipynb b/backend/process_data.ipynb new file mode 100644 index 000000000..76c06d986 --- /dev/null +++ b/backend/process_data.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "from openai import OpenAI\n", + "import json\n", + "from tqdm import tqdm\n", + "import os\n", + "from dotenv import load_dotenv\n", + "load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n", + "client = OpenAI(api_key=OPENAI_API_KEY)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def get_embedding(text):\n", + " response = client.embeddings.create(\n", + " model=\"text-embedding-3-small\",\n", + " input=text\n", + " )\n", + " return response.data[0].embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1206/1206 [04:05<00:00, 4.91it/s]\n" + ] + } + ], + "source": [ + "with open('202501.json', 'r') as file:\n", + " courses = 
json.load(file)\n", + "\n", + "for course in tqdm(courses):\n", + " text_to_embed = f\"{course['short_title']}: {course['description']}\"\n", + " embedding = get_embedding(text_to_embed) \n", + " course['embedding'] = embedding\n", + "\n", + "with open('courses_with_embeddings.json', 'w') as file:\n", + " json.dump(courses, file, indent=4)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}