Merge pull request #154 from Aditi22Bansal/patch-1
Create gbm.py - Solving issue #128
sanjay-kv authored Jun 3, 2024
2 parents 8b2bb87 + c60e499 commit 5fca2ac
Showing 1 changed file with 147 additions and 0 deletions.
147 changes: 147 additions & 0 deletions GBM model/gbm.py
@@ -0,0 +1,147 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import joblib

# Load the dataset
file_path = '/kaggle/input/credit-card-eligibility-data-determining-factors/dataset.csv'
data = pd.read_csv(file_path)

# Display basic information about the dataset
print("Dataset Information:")
print(data.info())

# Display the first few rows of the dataset
print("\nFirst few rows of the dataset:")
print(data.head())

# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())

# Check for class imbalance
print("\nClass Distribution:")
print(data['Target'].value_counts())

# Visualize the class distribution
plt.figure(figsize=(8, 6))
sns.countplot(x='Target', data=data)
plt.title('Class Distribution')
plt.show()

# Display summary statistics
print("\nSummary Statistics:")
print(data.describe())

# Strip leading/trailing spaces from column names
data.columns = data.columns.str.strip()

# Encode categorical features
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
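# Note: after this loop no object-dtype columns remain, so the OneHotEncoder
# branch of the preprocessor below receives an empty column list.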

# Define features and target
X = data.drop(columns=['ID', 'Target'])
y = data['Target']

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Preprocessing for numerical data
numerical_transformer = StandardScaler()

# Preprocessing for categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model
model = GradientBoostingClassifier(random_state=42)

# Create the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
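# Note: given the class imbalance inspected above, stratify=y could be passed
# here to preserve the class ratio in both the training and test splits.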

# Perform hyperparameter tuning using GridSearchCV
param_grid = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.1, 0.05, 0.01],
    'model__max_depth': [3, 4, 5]
}
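# 2 x 3 x 3 = 18 parameter combinations; with cv=5 this runs 90 fits,
# plus a final refit of the best combination on the full training set.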

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Display the best parameters and best score
print("\nBest Parameters:")
print(grid_search.best_params_)
print("\nBest Score:")
print(grid_search.best_score_)

# Evaluate the model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"\nAccuracy: {accuracy}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

# Plot ROC curve
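# Column 1 of predict_proba holds the probability of the positive class
# (best_model.classes_[1]).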
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()

# Feature importance
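# The fitted ColumnTransformer outputs the scaled numerical columns first,
# followed by any one-hot encoded categorical columns; its
# get_feature_names_out() returns the names in that same order.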
feature_importances = best_model.named_steps['model'].feature_importances_
feature_names = best_model.named_steps['preprocessor'].get_feature_names_out().tolist()

# Create a dataframe for feature importances
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(importance_df)

# Plot feature importances
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importances')
plt.show()

# Save the model
joblib.dump(best_model, 'best_gbm_model.pkl')
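
# Usage sketch: the saved pipeline can later be reloaded for inference.
# 'new_applicants.csv' is a hypothetical file with the same feature columns as X.
#
#   loaded_model = joblib.load('best_gbm_model.pkl')
#   new_applicants = pd.read_csv('new_applicants.csv')
#   predictions = loaded_model.predict(new_applicants[X.columns])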
