"""Train and evaluate a Gradient Boosting classifier for credit-card eligibility.

Loads the Kaggle credit-card-eligibility dataset, explores it, builds a
preprocessing + GradientBoostingClassifier pipeline, tunes hyperparameters
with GridSearchCV, evaluates on a held-out test set, plots the ROC curve
and feature importances, and saves the best model to disk.
"""

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, roc_auc_score, roc_curve)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

FILE_PATH = '/kaggle/input/credit-card-eligibility-data-determining-factors/dataset.csv'


def main():
    """Run the full load → explore → tune → evaluate → persist workflow."""
    data = pd.read_csv(FILE_PATH)

    # --- Exploratory summary ---
    print("Dataset Information:")
    print(data.info())
    print("\nFirst few rows of the dataset:")
    print(data.head())
    print("\nMissing Values:")
    print(data.isnull().sum())
    print("\nClass Distribution:")
    print(data['Target'].value_counts())

    plt.figure(figsize=(8, 6))
    # Pass x= explicitly: the bare-positional Series form is deprecated
    # in seaborn and raises in recent releases.
    sns.countplot(x='Target', data=data)
    plt.title('Class Distribution')
    plt.show()

    print("\nSummary Statistics:")
    print(data.describe())

    # Strip accidental leading/trailing whitespace from column names.
    data.columns = data.columns.str.strip()

    # BUG FIX: the original label-encoded *every* object column up front,
    # which (a) imposed a spurious ordinal relationship on nominal
    # features and (b) left no object columns for the pipeline's
    # OneHotEncoder, making that branch dead code. Only the target is
    # label-encoded here (and only if it is non-numeric); feature
    # categoricals are one-hot encoded inside the pipeline.
    if data['Target'].dtype == object:
        data['Target'] = LabelEncoder().fit_transform(data['Target'])

    X = data.drop(columns=['ID', 'Target'])
    y = data['Target']

    categorical_cols = X.select_dtypes(include=['object']).columns
    numerical_cols = X.select_dtypes(exclude=['object']).columns

    # Scale numeric features; one-hot encode categoricals.
    # handle_unknown='ignore' so categories unseen in training do not
    # raise at predict time.
    preprocessor = ColumnTransformer(transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ])

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', GradientBoostingClassifier(random_state=42)),
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Hyperparameter search over tree count, shrinkage and depth.
    param_grid = {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.1, 0.05, 0.01],
        'model__max_depth': [3, 4, 5],
    }
    grid_search = GridSearchCV(pipeline, param_grid, cv=5,
                               scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print("\nBest Parameters:")
    print(grid_search.best_params_)
    print("\nBest Score:")
    print(grid_search.best_score_)

    # --- Held-out evaluation ---
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nAccuracy: {accuracy}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # ROC curve from positive-class probabilities (binary target assumed
    # by the [:, 1] slice and roc_auc_score usage — matches the original).
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC Curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='best')
    plt.show()

    # Feature importances mapped to post-transform column names.
    # BUG FIX: the fitted ColumnTransformer's get_feature_names_out()
    # always matches the transformed matrix, unlike the original
    # hand-assembled numerical+categorical name list, which broke when
    # the categorical branch received zero columns.
    feature_importances = best_model.named_steps['model'].feature_importances_
    feature_names = (best_model.named_steps['preprocessor']
                     .get_feature_names_out())

    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances,
    }).sort_values(by='Importance', ascending=False)

    print("\nFeature Importances:")
    print(importance_df)

    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df)
    plt.title('Feature Importances')
    plt.show()

    # Persist the tuned pipeline (preprocessing + model) for reuse.
    joblib.dump(best_model, 'best_gbm_model.pkl')


if __name__ == "__main__":
    main()