Skip to content

Commit

Permalink
Merge pull request #229 from DSC-McMaster-U/nicole/automl-finalchanges
Browse files Browse the repository at this point in the history
added final changes to automl page
  • Loading branch information
rawanmahdi authored Mar 30, 2024
2 parents af27bfb + b8d773b commit 11791ec
Show file tree
Hide file tree
Showing 4 changed files with 210 additions and 118 deletions.
2 changes: 1 addition & 1 deletion backend/compute/autoEDA.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import uuid
import pandas as pd
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
#from ydata_profiling import ProfileReport


def generate_eda_plots(df):
Expand Down
43 changes: 35 additions & 8 deletions backend/compute/autoML.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,59 @@
# from pycaret.classification import setup as classification_setup, compare_models as classification_compare_models, finalize_model
# from pycaret.regression import setup as regression_setup, compare_models as regression_compare_models, finalize_model

import pycaret.classification as pycaret_cl
from pycaret.classification import *
import pycaret.regression as pycaret_rg

import pandas as pd
import joblib
import uuid
import matplotlib.pyplot as plt
import os

def perform_classification(data, target_column):
    """Train and select a classification model for `data` with PyCaret.

    Args:
        data: DataFrame (or file-like accepted by pycaret setup) holding the dataset.
        target_column: name of the column to predict.

    Returns:
        (best_model, model_file_path, scoring_grid_filename, plot_filename):
        the fitted best model, the path of the pickled model, the CSV path of
        the decision-tree scoring grid, and a generated plot file name.
    """
    pycaret_cl.setup(data=data, target=target_column)

    # Compare all candidate classifiers and keep the best one.
    best_model = pycaret_cl.compare_models()

    model_file_path = 'classification_model.pkl'
    joblib.dump(best_model, model_file_path)

    # Generate the scoring/accuracy grid from a decision-tree model;
    # pull() returns the metrics table of the last created model.
    dt = pycaret_cl.create_model('dt')
    dt_results = pycaret_cl.pull()
    scoring_grid_filename = os.path.join('tempData', f"scoring_grid_{uuid.uuid4()}.csv")
    dt_results.to_csv(scoring_grid_filename, index=False)

    # Plot the AUC curve of a logistic-regression model into tempData/.
    # NOTE(review): plot_model saves the file under its own name ("AUC.png")
    # inside tempData; plot_filename below does NOT match that file — the
    # caller currently uploads "tempData/AUC.png" explicitly. Confirm intent.
    lr = pycaret_cl.create_model('lr')
    plot_filename = f"plot_{uuid.uuid4()}.png"
    pycaret_cl.plot_model(lr, plot='auc', save='tempData')

    return best_model, model_file_path, scoring_grid_filename, plot_filename

def perform_regression(data, target_column):
    """Train and select a regression model for `data` with PyCaret.

    Args:
        data: DataFrame (or file-like accepted by pycaret setup) holding the dataset.
        target_column: name of the column to predict.

    Returns:
        (best_model, model_file_path, scoring_grid_filename, plot_filename):
        the fitted best model, the path of the pickled model, the CSV path of
        the decision-tree scoring grid, and a generated plot file name.
    """
    pycaret_rg.setup(data=data, target=target_column)

    # Compare all candidate regressors and keep the best one.
    best_model = pycaret_rg.compare_models()

    model_file_path = 'regression_model.pkl'
    joblib.dump(best_model, model_file_path)

    # Generate the scoring/accuracy grid from a decision-tree regressor.
    # Written into tempData/ for consistency with perform_classification,
    # so the caller's tempData cleanup also removes it.
    dt = pycaret_rg.create_model('dt')
    dt_results = pycaret_rg.pull()
    scoring_grid_filename = os.path.join('tempData', f"scoring_grid_{uuid.uuid4()}.csv")
    dt_results.to_csv(scoring_grid_filename, index=False)

    # Plot a linear-regression model. Fixes: 'auc' is a classification-only
    # plot (pycaret regression would reject it) -> use 'residuals';
    # plot_model has no `plot_name` parameter, so it was removed.
    lr = pycaret_rg.create_model('lr')
    plot_filename = f"plot_{uuid.uuid4()}.png"
    pycaret_rg.plot_model(lr, plot='residuals', save=True)

    return best_model, model_file_path, scoring_grid_filename, plot_filename

def generate_model(data, target_column, task):

Expand All @@ -34,10 +62,9 @@ def generate_model(data, target_column, task):

if task == 'C':
perform_classification(df, target_column) # Call classification_setup() before classification_compare_models()
final_model, model_file_path = perform_classification(df, target_column)
final_model, model_file_path, scoring_grid_filename, plot_filename = perform_classification(df, target_column)
elif task == 'R':
perform_regression(df, target_column) # Call regression_setup() before regression_compare_models()
final_model, model_file_path = perform_regression(df, target_column)

return final_model, model_file_path
final_model, model_file_path, scoring_grid_filename, plot_filename = perform_regression(df, target_column)

return final_model, model_file_path, scoring_grid_filename, plot_filename
79 changes: 61 additions & 18 deletions backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from fastapi import Request
from fastapi import HTTPException
from fastapi.responses import StreamingResponse, HTMLResponse
import shutil


# custom functions for EDA and AutoML
Expand All @@ -25,6 +26,7 @@
DATA_BUCKET = "automate-ml-datasets"
GRAPH_BUCKET = "automate_ml_graphs"
MODEL_BUCKET = "automl_gdsc_models"
ML_PLOT_BUCKET = "automl_gdsc_mlplot"
origins = ["*"]

app.add_middleware(
Expand Down Expand Up @@ -278,62 +280,103 @@ async def getProfile(fileName):
# return {}


# Start the AutoML process: fetch the dataset from GCS, train a model,
# upload the model, scoring grid, and plot, and return the results.
@app.get("/api/generateModel")
async def getModel(fileName, column, task):
    """Run AutoML on `<fileName>.csv` predicting `column`.

    Args:
        fileName: dataset name (without .csv) in the data bucket.
        column: target column to predict.
        task: 'C' for classification, 'R' for regression.

    Returns:
        dict with the raw scoring grid CSV text, its JSON form, and the
        public URL of the uploaded model plot; or {"error": ...} on failure.
    """
    plot_filename = ""
    scoreGridLines = ""
    # Defined before the try so the finally-cleanup can never hit a NameError.
    temp_dir = 'tempData'
    try:
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)

        storage_client = storage.Client.from_service_account_json("./credentials.json")

        # Retrieve the dataset from the data bucket into memory.
        data_bucket = storage_client.get_bucket(DATA_BUCKET)
        blob = data_bucket.blob(f"{fileName}.csv")
        byte_stream = BytesIO()
        blob.download_to_file(byte_stream)
        byte_stream.seek(0)

        # Produce the model plus its scoring grid and plot artifacts.
        model, model_file_path, scoring_grid_filename, plot_filename = generate_model(
            byte_stream, column, task
        )

        # Upload the pickled model to the model bucket.
        model_bucket = storage_client.get_bucket(MODEL_BUCKET)
        model_blob = model_bucket.blob(f"{fileName}.pkl")
        with open(model_file_path, "rb") as model_file:
            model_blob.upload_from_file(model_file, content_type="application/octet-stream")

        # Upload the scoring/accuracy grid to the plot bucket.
        scoring_grid_bucket = storage_client.get_bucket(ML_PLOT_BUCKET)
        scoring_grid_blob = scoring_grid_bucket.blob(scoring_grid_filename)
        with open(scoring_grid_filename, "rb") as scoring_grid_file:
            scoring_grid_blob.upload_from_file(scoring_grid_file, content_type="text/csv")

        # Read the grid back and convert the CSV text to JSON rows.
        with scoring_grid_blob.open("r") as f:
            scoreGridLines = f.read()
        scoreGridLines = str(scoreGridLines) if scoreGridLines else None
        csv_reader = csv.DictReader(StringIO(scoreGridLines))
        json_data = [row for row in csv_reader]

        # Upload the model plot to the plot bucket.
        # Fix: content_type was previously set on the stale MODEL_BUCKET blob.
        # NOTE(review): the local file is always "tempData/AUC.png" while the
        # blob is named with the uuid-based plot_filename — confirm intended.
        plot_bucket = storage_client.get_bucket(ML_PLOT_BUCKET)
        plot_blob = plot_bucket.blob(plot_filename)
        plot_blob.upload_from_filename("tempData/AUC.png", content_type="image/png")

        # Public URL of the uploaded plot, for the frontend to display.
        public_url = plot_blob.public_url

        return {"scoring_grid": scoreGridLines, "json": json_data, "plot_model_url": public_url}

    except Exception as e:
        return {"error": f"An error occurred: {str(e)}"}

    finally:
        # Delete the temporary working directory and everything in it.
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)



# Retrieve the stored model from the model bucket and stream it to the client.
@app.get("/api/downloadModel")
async def downloadModel():
    """Download the first model in the model bucket, then delete it.

    Returns:
        StreamingResponse with the pickled model bytes, or {"error": ...}
        when the bucket is empty or the download fails.
    """
    try:
        storage_client = storage.Client.from_service_account_json("./credentials.json")

        # Retrieve the model blob from the bucket.
        bucket = storage_client.get_bucket(MODEL_BUCKET)
        blobs = list(bucket.list_blobs())
        # Fix: blobs[0] raised an opaque IndexError when the bucket was empty.
        if not blobs:
            return {"error": "An error occurred: no model found in bucket"}
        blob = blobs[0]

        byte_stream = BytesIO()
        blob.download_to_file(byte_stream)
        byte_stream.seek(0)

        # Remove the model from the bucket once it has been downloaded.
        blob.delete()

        return StreamingResponse(byte_stream, media_type="application/octet-stream")

    except Exception as e:
        return {"error": f"An error occurred: {str(e)}"}


# big query operations
@app.get("/api/bq")
async def bq(fileName, query=None):
Expand Down
Loading

0 comments on commit 11791ec

Please sign in to comment.