From f34f552d2b1b1185067bb6bdaec88afa739442e1 Mon Sep 17 00:00:00 2001 From: manikumarreddyu <133508962+manikumarreddyu@users.noreply.github.com> Date: Tue, 4 Jun 2024 13:01:45 +0530 Subject: [PATCH] Github-Automated-Analysis added Github-Automated-Analysis --- Github-Automated-Analysis/.gitignore | 164 +++++++++++++ Github-Automated-Analysis/main.py | 229 +++++++++++++++++++ Github-Automated-Analysis/repo_data.csv | 20 ++ Github-Automated-Analysis/utils/config.py | 6 + Github-Automated-Analysis/utils/constants.py | 28 +++ 5 files changed, 447 insertions(+) create mode 100644 Github-Automated-Analysis/.gitignore create mode 100644 Github-Automated-Analysis/main.py create mode 100644 Github-Automated-Analysis/repo_data.csv create mode 100644 Github-Automated-Analysis/utils/config.py create mode 100644 Github-Automated-Analysis/utils/constants.py diff --git a/Github-Automated-Analysis/.gitignore b/Github-Automated-Analysis/.gitignore new file mode 100644 index 000000000..b845f2a82 --- /dev/null +++ b/Github-Automated-Analysis/.gitignore @@ -0,0 +1,164 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
"""GitHub Automated Analysis Tool (Streamlit entry point).

Lists the public repositories of a GitHub user, exports a per-repository
summary to a CSV file, indexes that CSV with OpenAI embeddings in FAISS and
asks an OpenAI LLM (through a LangChain RetrievalQA chain) which repository
is the most technically complex.
"""
import os
from urllib.parse import quote

import pandas as pd
import requests
import streamlit as st
from dotenv import load_dotenv
from github import Github, GithubException, RateLimitExceededException
from langchain.chains import RetrievalQA
from langchain.document_loaders import CSVLoader, DirectoryLoader, TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

import utils.config as config
from utils.constants import *  # noqa: F401,F403 - star import kept from the original module

GITHUB_API_URL = "https://api.github.com"
GITHUB_PAGE_SIZE = 100  # largest page size the GitHub REST API accepts
REQUEST_TIMEOUT_SECONDS = 10
REPO_CSV_PATH = "repo_data.csv"
USERNAME_KEY = "username"  # st.session_state key of the username text box

# load_dotenv() already copies the .env values into os.environ.  The previous
# `os.environ[name] = os.getenv(name)` lines added nothing when a variable was
# set and raised TypeError (None is not a str) when it was missing, so missing
# variables are now reported from main() instead of crashing at import time.
load_dotenv()

# Must stay the first Streamlit call of the script.
st.set_page_config(
    page_title="GitHub Repositories List",
    page_icon=":computer:",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Instructions prepended to the QA prompt.  The text must not contain curly
# braces because it becomes part of a PromptTemplate format string.
ANALYST_INSTRUCTIONS = """ You are Supersmart Github Repository AI system. You are a superintelligent AI that answers questions about Github Repositories and can understand the technical complexity if the repo.

You are:
    - helpful & friendly
    - good at answering complex questions in simple language
    - an expert in all programming languages
    - able to infer the intent of the user's question


Remember You are an inteelligent CSV Agent who can understand CSV files and their contents. You are given a CSV file with the following columns: name, url, description, language, stars, forks, labels, issues, contents. You are asked to find the most technically complex and challenging repository from the given CSV file.

To measure the technical complexity of a GitHub repository using the provided API endpoints, You will analyze various factors such as the number of commits, branches, pull requests, issues,contents , number of forks , stars , and contributors. Additionally, you will consider the programming languages used, the size of the codebase, and the frequency of updates.
You will Analyze the following GitHub repository factors to determine the technical complexity of the codebase and calculate a complexity score for each project:

1.Description
2.languages used in the repository
3.Number of stars
4.Number of forks
5.Labels of the repository
6.Description of the repository
7.Contents of the repository

You can consider other factors as well if you think they are relevant for determining the technical complexity of a GitHub repository.
Calculate the complexity score for each project by assigning weights to each factor and summing up the weighted scores.

The project with the highest complexity score will be considered the most technically complex.

Here is the approach or chain-of-thought process , you can use to reach to the solution :
Step 1: Analyze each row and it's contents in the CSV file , each Row represents a Github Repository



    """

# `{context}` is filled by RetrievalQA with the retrieved CSV rows and
# `{question}` with the user query; the instructions above are plain text.
PROMPT_TEMPLATE = ANALYST_INSTRUCTIONS + """

    Understand the following to answer the question in an efficient way

    {context}

    Question: {question}
    Now answer the question. Let's think step by step:"""


def _missing_env_vars(names):
    """Return the entries of *names* that are unset or empty in the environment."""
    return [name for name in names if not os.getenv(name)]


@st.cache_data(ttl=600)  # cache briefly so reruns do not refetch, but failures are not kept forever
def fetch_github_repos(username):
    """Fetch the public repositories of *username* from the GitHub REST API.

    Args:
        username: GitHub login whose repositories are listed.

    Returns:
        A list of repository dictionaries as returned by the API.  The list is
        empty when the username is blank, the user does not exist, or the
        first request fails; it is partial when a later page fails.
    """
    username = (username or "").strip()
    if not username:
        return []

    headers = {"Accept": "application/vnd.github+json"}
    token = os.getenv("GITHUB_TOKEN")
    if token:
        headers["Authorization"] = f"Bearer {token}"

    url = f"{GITHUB_API_URL}/users/{quote(username, safe='')}/repos"
    repos = []
    page = 1
    while True:
        try:
            response = requests.get(
                url,
                params={"page": page, "per_page": GITHUB_PAGE_SIZE},
                headers=headers,
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
        except requests.RequestException:
            break
        # Error responses carry a JSON *object*; treating it like a page of
        # repositories is what made the original loop run forever.
        if response.status_code != 200:
            break
        try:
            data = response.json()
        except ValueError:
            break
        if not isinstance(data, list) or not data:
            break
        repos.extend(data)
        if len(data) < GITHUB_PAGE_SIZE:
            break  # a short page is the last page
        page += 1
    return repos


def display_repos(repos):
    """Write one Markdown link per repository to the Streamlit page.

    Args:
        repos: Iterable of repository dictionaries with "name" and "html_url".
    """
    for repo in repos:
        st.write(f"[{repo['name']}]({repo['html_url']})")


def _call_or_default(fetch, default):
    """Run *fetch*; return *default* on GitHub errors other than rate limiting."""
    try:
        return fetch()
    except RateLimitExceededException:
        # Continuing would silently produce an incomplete data set.
        raise
    except GithubException:
        return default


def _summarize_repo(repo):
    """Flatten a PyGithub repository into plain values that serialise to CSV."""
    # Lazy PaginatedList objects are materialised here; storing them directly
    # left the "labels" and "issues" CSV columns without usable data.
    labels = _call_or_default(
        lambda: [label.name for label in repo.get_labels()], []
    )
    issue_count = _call_or_default(
        lambda: repo.get_issues(state="all").totalCount, 0
    )
    # An error here (e.g. nothing to list at the root) yields an empty listing
    # instead of aborting the whole analysis.
    contents = _call_or_default(lambda: repo.get_contents(""), [])
    if not isinstance(contents, list):
        contents = [contents]

    return {
        "name": repo.name,
        "url": repo.html_url,
        "description": repo.description,
        "language": repo.language,
        "stars": repo.stargazers_count,
        "forks": repo.forks_count,
        "labels": ", ".join(labels),
        "issues": issue_count,
        "contents": ", ".join(item.path for item in contents),
    }


def _build_query(username):
    """Return the question sent to the QA chain for *username*."""
    return f"""


Which is the most technically challenging repository from the given CSV file?

Return the name of the repository , the link to the repository and the analysis of the repository showing why it is the most technically challenging/Complex repository.Try to provide a detailed analysis to hold your answer strong

The output should be in the following format:

Repository Name:
Repository Link:
Analysis:

Provide a clickable link to the repository as well like this:
To get the repo url , you can use this format :

The username is : "{username}"


"https://github.com/{username}/repository_name"


[Repository Name](Repository Link) --> This is Important.Don't skip it


Let's think step by step about how to answer this question:

"""


def get_user_repos(username):
    """Analyse the repositories of *username* and display the most complex one.

    Collects a summary of every repository through PyGithub, writes it to
    ``repo_data.csv``, indexes the CSV rows in a FAISS vector store and asks
    an OpenAI LLM which repository is the most technically complex.  The
    answer is written to the Streamlit page.

    Args:
        username: The username of the GitHub user.

    Returns:
        None.  Results and errors are reported through Streamlit.
    """
    token = os.getenv("GITHUB_TOKEN")
    # Authenticated clients get a much higher API rate limit.
    client = Github(token) if token else Github()

    try:
        repo_info = [
            _summarize_repo(repo)
            for repo in client.get_user(username).get_repos()
        ]
    except GithubException as exc:
        st.error(f"Unable to read repositories for {username}: {exc}")
        return

    if not repo_info:
        # An empty CSV cannot be indexed.
        st.warning(f"{username} has no repositories to analyse.")
        return

    pd.DataFrame(repo_info).to_csv(REPO_CSV_PATH)

    documents = CSVLoader(file_path=REPO_CSV_PATH, encoding="utf-8").load()
    vectors = FAISS.from_documents(documents, OpenAIEmbeddings())

    # Create a question-answering chain using the index
    prompt = PromptTemplate(
        template=PROMPT_TEMPLATE, input_variables=["context", "question"]
    )
    chain = RetrievalQA.from_chain_type(
        llm=OpenAI(),
        chain_type="stuff",
        retriever=vectors.as_retriever(),
        input_key="question",
        chain_type_kwargs={"prompt": prompt},
    )

    st.subheader("Most Technically Complex Github Repository is")

    result = chain({"question": _build_query(username)})
    if result is not None:
        st.write(result["result"])
    else:
        st.info("Please wait..")
        st.stop()


def _clear_username():
    """Button callback: empty the username text box."""
    # Assigning to a local variable cannot change a widget; its session-state
    # entry has to be reset, and Streamlit reruns the script after a callback.
    st.session_state[USERNAME_KEY] = ""


# Main app
def main():
    """Render the page and run the analysis when the form is submitted."""
    config.init()
    # Set up the app title and sidebar
    st.title("GitHub Automated Analysis Tool")
    st.sidebar.title("GitHub Automated Analysis Tool")

    # Input field for GitHub username
    username = st.sidebar.text_input(
        "Enter GitHub Username", key=USERNAME_KEY
    ).strip()

    # Submit and clear buttons
    submit_button = st.sidebar.button("Submit")
    st.sidebar.button("Clear", on_click=_clear_username)
    st.sidebar.header("About")
    st.sidebar.info("This Python-based tool , when given a GitHub user's URL, returns the most technically complex and challenging repository from that user's profile. The tool will use GPT and LangChain to assess each repository individually before determining the most technically challenging one.")
    st.divider()
    st.sidebar.write("This tool is created by [MANI KUMAR REDDY U](https://github.com/manikumarreddyu).")

    if not submit_button:
        return

    if not username:
        st.warning("Please enter a GitHub username.")
        return

    # Display the repositories
    st.subheader(f"Repositories for {username}")
    repos = fetch_github_repos(username)
    if not repos:
        # Only reached on failure; previously this error was shown after
        # every submit, including successful ones.
        st.error("Invalid username or unable to fetch repositories")
        return
    display_repos(repos)

    missing = _missing_env_vars(["OPENAI_API_KEY"])
    if missing:
        st.error(
            f"Missing required environment variable(s): {', '.join(missing)}"
        )
        return

    st.info("Analysis of the repositories using LangChain and ChatGPT started. Please wait...")
    get_user_repos(username)


if __name__ == "__main__":
    main()
ContentFile(path=""db.sqlite3""), ContentFile(path=""manage.py""), ContentFile(path=""media""), ContentFile(path=""mydjangoProject1""), ContentFile(path=""posts.json""), ContentFile(path=""users"")]" +3,Books-Recommender-System-Using-Machine-Learning,,Jupyter Notebook,0,0,,,"[ContentFile(path="".gitignore""), ContentFile(path=""Books Recommender.ipynb""), ContentFile(path=""Procfile""), ContentFile(path=""app.py""), ContentFile(path=""artifacts""), ContentFile(path=""data""), ContentFile(path=""demo"")]" +4,brainwave,,JavaScript,0,0,,,"[ContentFile(path="".eslintrc.cjs""), ContentFile(path="".gitignore""), ContentFile(path=""README.md""), ContentFile(path=""index.html""), ContentFile(path=""package-lock.json""), ContentFile(path=""package.json""), ContentFile(path=""postcss.config.js""), ContentFile(path=""public""), ContentFile(path=""src""), ContentFile(path=""tailwind.config.js""), ContentFile(path=""vite.config.js"")]" +5,Crop-Recommendation-System,,Jupyter Notebook,0,0,,,"[ContentFile(path=""Crop_recommendation.csv""), ContentFile(path=""Pickle_RL_Model.pkl""), ContentFile(path=""README.md""), ContentFile(path=""crop_app""), ContentFile(path=""crop_appp""), ContentFile(path=""crop_prediction_app.ipynb"")]" +6,Data-Visualization-with-Seaborn,,Jupyter Notebook,0,0,,,"[ContentFile(path=""Data Visualization with Seaborn.ipynb""), ContentFile(path=""README.md"")]" +7,EDA-and-Diabetes-Prediction-using-KNN,,Jupyter Notebook,0,0,,,"[ContentFile(path=""Diabetes_Prediction_using_KNN(Includes_EDA).ipynb""), ContentFile(path=""README.md""), ContentFile(path=""diabetes.csv"")]" +8,employee-management-spring-angular,,TypeScript,0,0,,,"[ContentFile(path=""README.md""), ContentFile(path=""angular-frontend""), ContentFile(path=""springboot-backend"")]" +9,GitHub-Finder,,JavaScript,0,0,,,"[ContentFile(path=""app.js""), ContentFile(path=""github.js""), ContentFile(path=""index.html""), ContentFile(path=""ui.js"")]" +10,Health-Track,,Python,0,0,,,"[ContentFile(path="".idea""), 
ContentFile(path="".streamlit""), ContentFile(path=""Apps""), ContentFile(path=""Classifier_Models""), ContentFile(path=""Home.py""), ContentFile(path=""README.md""), ContentFile(path=""Run_command.py""), ContentFile(path=""images""), ContentFile(path=""pages""), ContentFile(path=""requirements.txt""), ContentFile(path=""res"")]" +11,Medical-Insurance-Cost-Prediction,,Jupyter Notebook,0,0,,,"[ContentFile(path="".devcontainer""), ContentFile(path=""Medical-Insurance-Cost-Prediction.sav""), ContentFile(path=""README.md""), ContentFile(path=""insurance.csv""), ContentFile(path=""main.ipynb""), ContentFile(path=""main.py""), ContentFile(path=""requirements.txt"")]" +12,Medical-Insurance-Cost-Prediction-2,,Jupyter Notebook,0,0,,,"[ContentFile(path=""README.md""), ContentFile(path=""Regression and EDA .ipynb""), ContentFile(path=""data.csv"")]" +13,Movie-Recommender-System-Using-Machine-Learning,,Jupyter Notebook,0,0,,,"[ContentFile(path="".gitignore""), ContentFile(path=""Movie Recommender System Data Analysis.ipynb""), ContentFile(path=""Procfile""), ContentFile(path=""app.py""), ContentFile(path=""artifacts""), ContentFile(path=""data""), ContentFile(path=""demo"")]" +14,SMILE-DENTAL-CARE,,JavaScript,0,0,,,"[ContentFile(path=""about.html""), ContentFile(path=""appointment.html""), ContentFile(path=""contact.html""), ContentFile(path=""css""), ContentFile(path=""dental-clinic-website-template.jpg""), ContentFile(path=""dentcare-1.0.0""), ContentFile(path=""img""), ContentFile(path=""index.html""), ContentFile(path=""int.html""), ContentFile(path=""js""), ContentFile(path=""lib""), ContentFile(path=""mak.jpg""), ContentFile(path=""manoj.jpg""), ContentFile(path=""maruthi.jpg""), ContentFile(path=""mkr.jpg""), ContentFile(path=""peace.jpg""), ContentFile(path=""prasanna.jpg""), ContentFile(path=""price.html""), ContentFile(path=""scss""), ContentFile(path=""service.html""), ContentFile(path=""team.html""), ContentFile(path=""testimonial.html"")]" 
from pathlib import Path

import pandas as pd


# --- utils/config.py -------------------------------------------------------

def init():
    """Create the module-level ``main_df`` results table.

    Binds the global name ``main_df`` to a new, empty DataFrame whose columns
    are "Repository Name", "Repository Link" and "Analysis".  Every call
    replaces whatever ``main_df`` held before.
    """
    global main_df
    result_columns = ["Repository Name", "Repository Link", "Analysis"]
    main_df = pd.DataFrame(columns=result_columns)


# --- utils/constants.py ----------------------------------------------------

# Application identity.
APP_NAME = "Github Automated Analysis Tool"
MODEL = "gpt-3.5-turbo"
PAGE_ICON = "🤖"

# Numeric tuning values.
K = 10
FETCH_K = 20
CHUNK_SIZE = 1000
TEMPERATURE = 0.5
MAX_TOKENS = 3000
ENABLE_ADVANCED_OPTIONS = True

# Resolved against the working directory at import time.
DATA_PATH = Path.cwd() / "data"

# Markdown help texts.
OPENAI_HELP = """
You can sign-up for OpenAI's API [here](https://openai.com/blog/openai-api).\n
Once you are logged in, you find the API keys [here](https://platform.openai.com/account/api-keys)
"""

ACTIVELOOP_HELP = """
You can create an Activeloop account (including 200GB of free database storage) [here](https://www.activeloop.ai/).\n
Once you are logged in, you find the API token [here](https://app.activeloop.ai/profile/gustavz/apitoken).\n
The organisation name is your username, or you can create new organisations [here](https://app.activeloop.ai/organization/new/create)
"""