Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Github-Automated-Analysis #182

Merged
merged 1 commit into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 164 additions & 0 deletions Github-Automated-Analysis/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/


.vscode
.streamlit
229 changes: 229 additions & 0 deletions Github-Automated-Analysis/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
import streamlit as st
import requests
import os
from dotenv import load_dotenv
import pandas as pd
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.document_loaders import CSVLoader,TextLoader,DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.vectorstores import FAISS
import utils.config as config
from github import Github
from utils.constants import *

load_dotenv()

# load_dotenv() has already exported the .env entries into os.environ, so this
# loop only re-exports values that are actually present. Assigning the result
# of os.getenv() unconditionally raises TypeError when a variable is missing
# (os.environ only accepts str), which crashed the app at import time even for
# tokens that are never used afterwards.
for _env_name in ('OPENAI_API_KEY', 'GITHUB_TOKEN', 'ACTIVELOOP_TOKEN'):
    _env_value = os.getenv(_env_name)
    if _env_value is not None:
        os.environ[_env_name] = _env_value


# Must run before any other Streamlit call renders output on the page.
st.set_page_config(page_title="GitHub Repositories List" , page_icon=":computer:" , layout="wide" , initial_sidebar_state="expanded")




# Function to fetch GitHub repositories
@st.cache_data  # Cache data so that we don't have to fetch it again
def fetch_github_repos(username):
    """Fetch every public repository of a GitHub user via the REST API.

    Pages through ``/users/{username}/repos`` 50 entries at a time until an
    empty page is returned.

    Args:
        username: The GitHub login whose repositories are listed.

    Returns:
        A list of repository dictionaries as returned by the API. The list is
        empty when the user does not exist, the request fails, or the API
        answers with anything other than a JSON array. If a later page fails,
        the repositories collected so far are returned.
    """
    repos = []
    page = 1
    while True:
        url = f"https://api.github.com/users/{username}/repos?page={page}&per_page=50"
        try:
            # A timeout keeps the Streamlit script from hanging on a dead socket.
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            break
        if response.status_code != 200:
            # Unknown user, rate limiting, server error: nothing more to read.
            break
        try:
            data = response.json()
        except ValueError:
            break
        # Error payloads are JSON objects, not arrays. Treating them as a page
        # would loop forever (a dict is truthy) and extend the result with
        # the dict's keys.
        if not isinstance(data, list) or not data:
            break
        repos.extend(data)
        page += 1
    # NOTE: st.cache_data memoises this return value per username, including
    # an empty list produced by a transient failure.
    return repos

# Function to display repositories
def display_repos(repos):
    """Write one Markdown link per repository to the Streamlit page.

    Args:
        repos: Iterable of mappings that each provide a ``"name"`` and an
            ``"html_url"`` entry.
    """
    for entry in repos:
        repo_name, repo_url = entry["name"], entry["html_url"]
        st.write(f"[{repo_name}]({repo_url})")



def _github_value(fetch, default):
    """Return ``fetch()``, or *default* when GitHub reports the data as absent.

    Only "not there" answers (404 not found / empty repository, 409 conflict,
    410 feature disabled) are replaced by *default*; every other API error,
    such as rate limiting or bad credentials, is re-raised.
    """
    # Local import: the file already depends on PyGithub (``from github import Github``).
    from github import GithubException

    try:
        return fetch()
    except GithubException as exc:
        if exc.status in (404, 409, 410):
            return default
        raise


def get_user_repos(username):
    """Analyse a GitHub user's repositories and show the most complex one.

    Collects metadata for each repository of the user, writes it to
    ``repo_data.csv``, indexes the CSV rows in a FAISS store using OpenAI
    embeddings, asks a RetrievalQA chain which repository is the most
    technically challenging, and writes the answer to the Streamlit page.

    Args:
        username: The username of the GitHub user.

    Returns:
        None. The result is rendered through Streamlit.
    """
    # Use the configured token when there is one; an anonymous client is
    # limited to a small hourly request budget and one request is made per
    # repository below.
    client = Github(os.getenv("GITHUB_TOKEN") or None)

    user = client.get_user(username)
    repos = user.get_repos()

    repo_info = []

    for repo in repos:
        repo_info.append({
            "name": repo.name,
            "description": repo.description,
            "language": repo.language,
            "stars": repo.stargazers_count,
            "forks": repo.forks_count,
            # Materialise the API results: the lazy paginated objects would be
            # written to the CSV as meaningless object reprs.
            "labels": _github_value(
                lambda: [label.name for label in repo.get_labels()], []
            ),
            "issues": _github_value(
                lambda: repo.get_issues(state="all").totalCount, 0
            ),
            # An empty repository answers the contents request with an error;
            # record it as having no files instead of aborting the analysis.
            "contents": _github_value(
                lambda: [item.path for item in repo.get_contents("")], []
            ),
        })

    repo_info_df = pd.DataFrame(repo_info)
    repo_info_df.to_csv("repo_data.csv")

    loader = CSVLoader(file_path="repo_data.csv", encoding ="utf-8")
    csv_data = loader.load()
    csv_embeddings = OpenAIEmbeddings()
    vectors = FAISS.from_documents(csv_data, csv_embeddings)

    # Create a question-answering chain using the index

    # NOTE(review): this string is assigned but never passed to the chain; the
    # ``{context}`` slot of the prompt below is presumably filled with the
    # retrieved CSV rows by the "stuff" chain. Confirm whether it was meant
    # to be part of the prompt.
    context = """ You are Supersmart Github Repository AI system. You are a superintelligent AI that answers questions about Github Repositories and can understand the technical complexity if the repo.

You are:
- helpful & friendly
- good at answering complex questions in simple language
- an expert in all programming languages
- able to infer the intent of the user's question


Remember You are an inteelligent CSV Agent who can understand CSV files and their contents. You are given a CSV file with the following columns: Repository Name, Repository Link, Analysis. You are asked to find the most technically complex and challenging repository from the given CSV file.

To measure the technical complexity of a GitHub repository using the provided API endpoints, You will analyze various factors such as the number of commits, branches, pull requests, issues,contents , number of forks , stars , and contributors. Additionally, you will consider the programming languages used, the size of the codebase, and the frequency of updates.
You will Analyze the following GitHub repository factors to determine the technical complexity of the codebase and calculate a complexity score for each project:

1.Description
2.languages used in the repository
3.Number of stars
4.Number of forks
5.Labels of the repository
6.Description of the repository
7.Contents of the repository

You can consider other factors as well if you think they are relevant for determining the technical complexity of a GitHub repository.
Calculate the complexity score for each project by assigning weights to each factor and summing up the weighted scores.

The project with the highest complexity score will be considered the most technically complex.

Here is the approach or chain-of-thought process , you can use to reach to the solution :
Step 1: Analyze each row and it's contents in the CSV file , each Row represents a Github Repository



"""

    prompt_template = """

Understand the following to answer the question in an efficient way

{context}

Question: {question}
Now answer the question. Let's think step by step:"""
    PROMPT = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )


    chain_type_kwargs = {"prompt": PROMPT}

    chain = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=vectors.as_retriever(), input_key="question" , chain_type_kwargs=chain_type_kwargs)


    st.subheader("Most Technically Complex Github Repository is")

    query = f"""


Which is the most technically challenging repository from the given CSV file?

Return the name of the repository , the link to the repository and the analysis of the repository showing why it is the most technically challenging/Complex repository.Try to provide a detailed analysis to hold your answer strong

The output should be in the following format:

Repository Name: <name of the repository>
Repository Link: <link to the repository>
Analysis: <analysis of the repository>

Provide a clickable link to the repository as well like this:
To get the repo url , you can use this format :

The username is : "{username}"


"https://github.com/{username}/repository_name"


[Repository Name](Repository Link) --> This is Important.Don't skip it


Let's think step by step about how to answer this question:

"""
    result = chain({"question": query})
    if result is not None:
        st.write(result['result'])
    else:
        st.info("Please wait..")
        st.stop()




# Main app
def main():
    """Render the Streamlit page and run the analysis for the entered user.

    Calls ``config.init()``, builds the title and sidebar (username field,
    Submit/Clear buttons, About box), and on Submit lists the user's
    repositories before handing over to ``get_user_repos`` for the analysis.
    An error is shown when no repositories could be fetched.
    """
    config.init()
    # Set up the app title and sidebar
    st.title("GitHub Automated Analysis Tool")
    st.sidebar.title("GitHub Automated Analysis Tool")

    # Input field for GitHub username
    username = st.sidebar.text_input("Enter GitHub Username")

    # Submit and clear buttons
    submit_button = st.sidebar.button("Submit")
    clear_button = st.sidebar.button("Clear")
    st.sidebar.header("About")
    st.sidebar.info("This Python-based tool , when given a GitHub user's URL, returns the most technically complex and challenging repository from that user's profile. The tool will use GPT and LangChain to assess each repository individually before determining the most technically challenging one.")
    st.divider()
    # The scheme separator was missing ("https/github.com"), which produced a
    # broken relative link.
    st.sidebar.write("This tool is created by [MANI KUMAR REDDY U](https://github.com/manikumarreddyu).")

    # Display the repositories
    if submit_button:
        st.subheader(f"Repositories for {username}")
        # A blank username can never resolve, so skip the network round-trip.
        repos = fetch_github_repos(username) if username.strip() else []
        if repos:
            display_repos(repos)
            st.info("Analysis of the repositories using LangChain and ChatGPT started. Please wait...")
            get_user_repos(username)
        else:
            # Only report a failure when nothing was fetched; previously this
            # message had no ``else`` and was not tied to the failed-fetch case.
            st.error("Invalid username or unable to fetch repositories")

    # Clear the input field
    if clear_button:
        # NOTE(review): rebinding the local name does not reset the text_input
        # widget itself; the rerun below just re-executes the script. Confirm
        # whether the widget should be cleared through session state.
        username = ""
        st.experimental_rerun()


if __name__ == "__main__":
    main()
Loading
Loading