Merge pull request #154 from Aditi22Bansal/patch-1

Create gbm.py - Solving issue #128
Showing 1 changed file with 147 additions and 0 deletions.

The new gbm.py trains a gradient-boosting classifier on the Kaggle credit-card-eligibility dataset: it explores the data, bundles scaling and one-hot encoding with the model in a scikit-learn Pipeline, tunes hyperparameters with GridSearchCV, evaluates the best estimator on a held-out split, and saves it with joblib.
@@ -0,0 +1,147 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

# Load the dataset
file_path = '/kaggle/input/credit-card-eligibility-data-determining-factors/dataset.csv'
data = pd.read_csv(file_path)

# Display basic information about the dataset
print("Dataset Information:")
print(data.info())

# Display the first few rows of the dataset
print("\nFirst few rows of the dataset:")
print(data.head())

# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())

# Check for class imbalance
print("\nClass Distribution:")
print(data['Target'].value_counts())

# Visualize the class distribution (keyword form, as current seaborn expects)
plt.figure(figsize=(8, 6))
sns.countplot(x='Target', data=data)
plt.title('Class Distribution')
plt.show()

# Display summary statistics
print("\nSummary Statistics:")
print(data.describe())

# Strip leading/trailing spaces from column names
data.columns = data.columns.str.strip()

# Encode the target if it is stored as strings; categorical feature
# columns are left untouched so the pipeline's OneHotEncoder below has
# something to transform (label-encoding every object column here would
# leave no categorical columns for it)
if data['Target'].dtype == 'object':
    data['Target'] = LabelEncoder().fit_transform(data['Target'])

# Define features and target
X = data.drop(columns=['ID', 'Target'])
y = data['Target']

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Preprocessing for numerical data
numerical_transformer = StandardScaler()

# Preprocessing for categorical data; handle_unknown='ignore' skips
# categories that appear only at prediction time
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

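# Note: ColumnTransformer concatenates outputs in the order the
# transformers are listed (numerical columns first, then the one-hot
# columns); the feature-name reconstruction near the end of the script
# relies on that order.
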
# Define the model
model = GradientBoostingClassifier(random_state=42)

# Create the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Split the data into training and testing sets, stratified so both
# splits keep the class balance inspected above
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Perform hyperparameter tuning using GridSearchCV
param_grid = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.1, 0.05, 0.01],
    'model__max_depth': [3, 4, 5]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

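# Note: GridSearchCV keeps refit=True by default, so the best parameter
# combination is refit on the full training split and best_estimator_
# below is a fully trained pipeline.
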
# Display the best parameters and best score
print("\nBest Parameters:")
print(grid_search.best_params_)
print("\nBest Score:")
print(grid_search.best_score_)

# Evaluate the model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"\nAccuracy: {accuracy}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

# Plot ROC curve (assumes a binary target: column 1 of predict_proba is
# the positive-class probability)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()

# Feature importance, mapped back to the transformed column names
feature_importances = best_model.named_steps['model'].feature_importances_
onehot_encoder = best_model.named_steps['preprocessor'].named_transformers_['cat']
feature_names = numerical_cols.tolist() + onehot_encoder.get_feature_names_out(categorical_cols).tolist()

# Create a dataframe for feature importances
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(importance_df)

# Plot feature importances
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importances')
plt.show()

# Save the fitted pipeline (preprocessing + model) for later reuse
joblib.dump(best_model, 'best_gbm_model.pkl')
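
As a quick sanity check of the saved artifact, here is a minimal reload-and-score sketch; best_gbm_model.pkl is the file written above, while new_applicants.csv is a hypothetical input with the same feature columns (everything except ID and Target) as the training data:

import joblib
import pandas as pd

# Reload the fitted pipeline (preprocessing + tuned GBM)
model = joblib.load('best_gbm_model.pkl')

# Score unseen rows; column names must match the training features
new_rows = pd.read_csv('new_applicants.csv')  # hypothetical file
print(model.predict(new_rows))              # predicted classes
print(model.predict_proba(new_rows)[:, 1])  # positive-class probability (binary target)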