Skip to content

Commit

Permalink
Merge pull request #229 from DSC-McMaster-U/nicole/automl-finalchanges
Browse files Browse the repository at this point in the history
added final changes to automl page
  • Loading branch information
rawanmahdi authored Mar 30, 2024
2 parents af27bfb + b8d773b commit 11791ec
Show file tree
Hide file tree
Showing 4 changed files with 210 additions and 118 deletions.
2 changes: 1 addition & 1 deletion backend/compute/autoEDA.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import uuid
import pandas as pd
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
#from ydata_profiling import ProfileReport


def generate_eda_plots(df):
Expand Down
43 changes: 35 additions & 8 deletions backend/compute/autoML.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,59 @@
# from pycaret.classification import setup as classification_setup, compare_models as classification_compare_models, finalize_model
# from pycaret.regression import setup as regression_setup, compare_models as regression_compare_models, finalize_model

import pycaret.classification as pycaret_cl
from pycaret.classification import *
import pycaret.regression as pycaret_rg

import pandas as pd
import joblib
import uuid
import matplotlib.pyplot as plt
import os

def perform_classification(data, target_column):
    """Train and select a classification model for `data` with PyCaret.

    Args:
        data: DataFrame (or file-like accepted by pycaret setup) holding the dataset.
        target_column: name of the column to predict.

    Returns:
        (best_model, model_file_path, scoring_grid_filename, plot_filename):
        the fitted best model, the path of the pickled model, the CSV path of
        the decision-tree scoring grid, and a generated plot file name.
    """
    pycaret_cl.setup(data=data, target=target_column)

    # Compare all candidate classifiers and keep the best one.
    best_model = pycaret_cl.compare_models()

    model_file_path = 'classification_model.pkl'
    joblib.dump(best_model, model_file_path)

    # Generate the scoring/accuracy grid from a decision-tree model;
    # pull() returns the metrics table of the last created model.
    dt = pycaret_cl.create_model('dt')
    dt_results = pycaret_cl.pull()
    scoring_grid_filename = os.path.join('tempData', f"scoring_grid_{uuid.uuid4()}.csv")
    dt_results.to_csv(scoring_grid_filename, index=False)

    # Plot the AUC curve of a logistic-regression model into tempData/.
    # NOTE(review): plot_model saves the file under its own name ("AUC.png")
    # inside tempData; plot_filename below does NOT match that file — the
    # caller currently uploads "tempData/AUC.png" explicitly. Confirm intent.
    lr = pycaret_cl.create_model('lr')
    plot_filename = f"plot_{uuid.uuid4()}.png"
    pycaret_cl.plot_model(lr, plot='auc', save='tempData')

    return best_model, model_file_path, scoring_grid_filename, plot_filename

def perform_regression(data, target_column):
    """Train and select a regression model for `data` with PyCaret.

    Args:
        data: DataFrame (or file-like accepted by pycaret setup) holding the dataset.
        target_column: name of the column to predict.

    Returns:
        (best_model, model_file_path, scoring_grid_filename, plot_filename):
        the fitted best model, the path of the pickled model, the CSV path of
        the decision-tree scoring grid, and a generated plot file name.
    """
    pycaret_rg.setup(data=data, target=target_column)

    # Compare all candidate regressors and keep the best one.
    best_model = pycaret_rg.compare_models()

    model_file_path = 'regression_model.pkl'
    joblib.dump(best_model, model_file_path)

    # Generate the scoring/accuracy grid from a decision-tree regressor.
    # Written into tempData/ for consistency with perform_classification,
    # so the caller's tempData cleanup also removes it.
    dt = pycaret_rg.create_model('dt')
    dt_results = pycaret_rg.pull()
    scoring_grid_filename = os.path.join('tempData', f"scoring_grid_{uuid.uuid4()}.csv")
    dt_results.to_csv(scoring_grid_filename, index=False)

    # Plot a linear-regression model. Fixes: 'auc' is a classification-only
    # plot (pycaret regression would reject it) -> use 'residuals';
    # plot_model has no `plot_name` parameter, so it was removed.
    lr = pycaret_rg.create_model('lr')
    plot_filename = f"plot_{uuid.uuid4()}.png"
    pycaret_rg.plot_model(lr, plot='residuals', save=True)

    return best_model, model_file_path, scoring_grid_filename, plot_filename

def generate_model(data, target_column, task):

Expand All @@ -34,10 +62,9 @@ def generate_model(data, target_column, task):

if task == 'C':
perform_classification(df, target_column) # Call classification_setup() before classification_compare_models()
final_model, model_file_path = perform_classification(df, target_column)
final_model, model_file_path, scoring_grid_filename, plot_filename = perform_classification(df, target_column)
elif task == 'R':
perform_regression(df, target_column) # Call regression_setup() before regression_compare_models()
final_model, model_file_path = perform_regression(df, target_column)

return final_model, model_file_path
final_model, model_file_path, scoring_grid_filename, plot_filename = perform_regression(df, target_column)

return final_model, model_file_path, scoring_grid_filename, plot_filename
79 changes: 61 additions & 18 deletions backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from fastapi import Request
from fastapi import HTTPException
from fastapi.responses import StreamingResponse, HTMLResponse
import shutil


# custom functions for EDA and AutoML
Expand All @@ -25,6 +26,7 @@
DATA_BUCKET = "automate-ml-datasets"
GRAPH_BUCKET = "automate_ml_graphs"
MODEL_BUCKET = "automl_gdsc_models"
ML_PLOT_BUCKET = "automl_gdsc_mlplot"
origins = ["*"]

app.add_middleware(
Expand Down Expand Up @@ -278,62 +280,103 @@ async def getProfile(fileName):
# return {}


# Start the AutoML process: fetch the dataset from GCS, train a model,
# upload the model, scoring grid, and plot, and return the results.
@app.get("/api/generateModel")
async def getModel(fileName, column, task):
    """Run AutoML on `<fileName>.csv` predicting `column`.

    Args:
        fileName: dataset name (without .csv) in the data bucket.
        column: target column to predict.
        task: 'C' for classification, 'R' for regression.

    Returns:
        dict with the raw scoring grid CSV text, its JSON form, and the
        public URL of the uploaded model plot; or {"error": ...} on failure.
    """
    plot_filename = ""
    scoreGridLines = ""
    # Defined before the try so the finally-cleanup can never hit a NameError.
    temp_dir = 'tempData'
    try:
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)

        storage_client = storage.Client.from_service_account_json("./credentials.json")

        # Retrieve the dataset from the data bucket into memory.
        data_bucket = storage_client.get_bucket(DATA_BUCKET)
        blob = data_bucket.blob(f"{fileName}.csv")
        byte_stream = BytesIO()
        blob.download_to_file(byte_stream)
        byte_stream.seek(0)

        # Produce the model plus its scoring grid and plot artifacts.
        model, model_file_path, scoring_grid_filename, plot_filename = generate_model(
            byte_stream, column, task
        )

        # Upload the pickled model to the model bucket.
        model_bucket = storage_client.get_bucket(MODEL_BUCKET)
        model_blob = model_bucket.blob(f"{fileName}.pkl")
        with open(model_file_path, "rb") as model_file:
            model_blob.upload_from_file(model_file, content_type="application/octet-stream")

        # Upload the scoring/accuracy grid to the plot bucket.
        scoring_grid_bucket = storage_client.get_bucket(ML_PLOT_BUCKET)
        scoring_grid_blob = scoring_grid_bucket.blob(scoring_grid_filename)
        with open(scoring_grid_filename, "rb") as scoring_grid_file:
            scoring_grid_blob.upload_from_file(scoring_grid_file, content_type="text/csv")

        # Read the grid back and convert the CSV text to JSON rows.
        with scoring_grid_blob.open("r") as f:
            scoreGridLines = f.read()
        scoreGridLines = str(scoreGridLines) if scoreGridLines else None
        csv_reader = csv.DictReader(StringIO(scoreGridLines))
        json_data = [row for row in csv_reader]

        # Upload the model plot to the plot bucket.
        # Fix: content_type was previously set on the stale MODEL_BUCKET blob.
        # NOTE(review): the local file is always "tempData/AUC.png" while the
        # blob is named with the uuid-based plot_filename — confirm intended.
        plot_bucket = storage_client.get_bucket(ML_PLOT_BUCKET)
        plot_blob = plot_bucket.blob(plot_filename)
        plot_blob.upload_from_filename("tempData/AUC.png", content_type="image/png")

        # Public URL of the uploaded plot, for the frontend to display.
        public_url = plot_blob.public_url

        return {"scoring_grid": scoreGridLines, "json": json_data, "plot_model_url": public_url}

    except Exception as e:
        return {"error": f"An error occurred: {str(e)}"}

    finally:
        # Delete the temporary working directory and everything in it.
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)



# Retrieve the stored model from the model bucket and stream it to the client.
@app.get("/api/downloadModel")
async def downloadModel():
    """Download the first model in the model bucket, then delete it.

    Returns:
        StreamingResponse with the pickled model bytes, or {"error": ...}
        when the bucket is empty or the download fails.
    """
    try:
        storage_client = storage.Client.from_service_account_json("./credentials.json")

        # Retrieve the model blob from the bucket.
        bucket = storage_client.get_bucket(MODEL_BUCKET)
        blobs = list(bucket.list_blobs())
        # Fix: blobs[0] raised an opaque IndexError when the bucket was empty.
        if not blobs:
            return {"error": "An error occurred: no model found in bucket"}
        blob = blobs[0]

        byte_stream = BytesIO()
        blob.download_to_file(byte_stream)
        byte_stream.seek(0)

        # Remove the model from the bucket once it has been downloaded.
        blob.delete()

        return StreamingResponse(byte_stream, media_type="application/octet-stream")

    except Exception as e:
        return {"error": f"An error occurred: {str(e)}"}


# big query operations
@app.get("/api/bq")
async def bq(fileName, query=None):
Expand Down
Loading

0 comments on commit 11791ec

Please sign in to comment.