Commit 5058ce3

plot std for gp and test mlp

pohaoc2 committed Dec 3, 2024
1 parent 913809f commit 5058ce3

Showing 2 changed files with 257 additions and 5 deletions.
97 changes: 92 additions & 5 deletions sandbox/src/gp.py
@@ -9,6 +9,85 @@
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

def plot_parity_with_uncertainty(
y_true_train, y_pred_train, y_std_train,
y_true_test, y_pred_test, y_std_test,
xlim=(-3, 3), ylim=(-3, 3),
train_r2=None, test_r2=None,
filename="parity_plot_with_smooth_band.png"
):
"""
Plots parity plots with uncertainty bands for training and test data.
Parameters:
- y_true_train: True values for the training data.
- y_pred_train: Predicted values for the training data.
- y_std_train: Predicted standard deviations for the training data.
- y_true_test: True values for the test data.
- y_pred_test: Predicted values for the test data.
- y_std_test: Predicted standard deviations for the test data.
- xlim: Tuple specifying x-axis limits (default: (-3, 3)).
- ylim: Tuple specifying y-axis limits (default: (-3, 3)).
- train_r2: R² score for the training data, shown in the subplot title.
- test_r2: R² score for the test data, shown in the subplot title.
- filename: Name of the file to save the plot (default: "parity_plot_with_smooth_band.png").
"""
# Sort data points for smooth connections
sorted_indices_train = np.argsort(y_true_train[:, 0])
y_true_train_sorted = y_true_train[sorted_indices_train, 0]
y_pred_train_sorted = y_pred_train[sorted_indices_train, 0]
y_std_train_sorted = y_std_train[sorted_indices_train, 0]

sorted_indices_test = np.argsort(y_true_test[:, 0])
y_true_test_sorted = y_true_test[sorted_indices_test, 0]
y_pred_test_sorted = y_pred_test[sorted_indices_test, 0]
y_std_test_sorted = y_std_test[sorted_indices_test, 0]

# Create subplots
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

# Train data parity plot
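# (the shaded band spans the prediction ± one standard deviation)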
ax[0].plot(y_true_train_sorted, y_pred_train_sorted, label="Predicted", color="blue")
ax[0].fill_between(
y_true_train_sorted,
y_pred_train_sorted - y_std_train_sorted,
y_pred_train_sorted + y_std_train_sorted,
color="blue",
alpha=0.2,
label="Uncertainty"
)
ax[0].scatter(y_true_train[:, 0], y_pred_train[:, 0], label="Train Data", color="blue", alpha=0.6)
ax[0].plot(xlim, xlim, 'k--', label="Parity Line")
ax[0].set_title(f"Parity Plot - Train Data (r2 = {train_r2:.3f})")
ax[0].set_xlabel("True Values")
ax[0].set_ylabel("Predicted Values")
ax[0].legend()
ax[0].set_xlim(xlim)
ax[0].set_ylim(ylim)

# Test data parity plot
ax[1].plot(y_true_test_sorted, y_pred_test_sorted, label="Predicted", color="green")
ax[1].fill_between(
y_true_test_sorted,
y_pred_test_sorted - y_std_test_sorted,
y_pred_test_sorted + y_std_test_sorted,
color="green",
alpha=0.2,
label="Uncertainty"
)
ax[1].scatter(y_true_test[:, 0], y_pred_test[:, 0], label="Test Data", color="green", alpha=0.6)
ax[1].plot(xlim, xlim, 'k--', label="Parity Line")
ax[1].set_title(f"Parity Plot - Test Data (r2 = {test_r2:.3f})")
ax[1].set_xlabel("True Values")
ax[1].set_ylabel("Predicted Values")
ax[1].legend()
ax[1].set_xlim(xlim)
ax[1].set_ylim(ylim)

# Final layout adjustments and save the figure
plt.tight_layout()
plt.savefig(filename)


def clean_data(full_data, response):
"""Handle missing or non-numeric data"""

@@ -80,7 +159,7 @@ def clean_data(full_data, response):
#features = spatial_features

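# Two candidate kernels; the second assignment overrides the first. The Matern
# (nu=1.5) kernel is used, with the WhiteKernel noise term commented out in
# favor of the GP's alpha regularization below.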
kernel = C(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))
kernel = C(1.0, (1e-3, 1e3)) * Matern(length_scale=1, length_scale_bounds=(1e-2, 1e2), nu=1.5) + WhiteKernel(noise_level=1e-2, noise_level_bounds=(1e-4, 1e-1))
kernel = C(1.0, (1e-3, 1e3)) * Matern(length_scale=1, length_scale_bounds=(1e-2, 1e2), nu=1.5) #+ WhiteKernel(noise_level=1e-2, noise_level_bounds=(1e-4, 1e-1))
train = True

feature_selection_results = {}
@@ -99,7 +178,6 @@ def clean_data(full_data, response):
X_test = scaler.transform(X_test)
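# Note: re-fitting the same scaler on y overwrites the X statistics; from here
# on, scaler holds the target scaling.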
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)

if train:
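# alpha adds a diagonal noise term to the kernel, playing the role of the
# disabled WhiteKernel above.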
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, alpha=3e-1)
gp.fit(X_train, y_train)
@@ -111,14 +189,14 @@ def clean_data(full_data, response):
y_pred, y_pred_std = gp.predict(X_test, return_std=True)
y_pred_train, y_pred_std_train = gp.predict(X_train, return_std=True)
# Convert back to original scale
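# NOTE: inverse_transform is appropriate for the predicted means, but the
# standard deviations should be rescaled by scaler.scale_ alone, e.g.
#   y_pred_std_orig = y_pred_std * scaler.scale_
# since inverse_transform would also add the mean back.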

"""
y_pred = scaler.inverse_transform(y_pred)
y_pred_std = scaler.inverse_transform(y_pred_std)
y_pred_train = scaler.inverse_transform(y_pred_train)
y_pred_std_train = scaler.inverse_transform(y_pred_std_train)
y_train = scaler.inverse_transform(y_train)
y_test = scaler.inverse_transform(y_test)

"""
for i, name in enumerate(output_names[:1]):
r2_train = r2_score(y_train[:, i], y_pred_train[:, i])
r_train = np.corrcoef(y_train[:, i], y_pred_train[:, i])[0, 1]
@@ -134,7 +212,16 @@ def clean_data(full_data, response):
"MSE Train": f"{mse_train:.3f}",
"MSE Test": f"{mse_test:.3f}",
}

plot_parity_with_uncertainty(y_train,
y_pred_train,
y_pred_std_train,
y_test,
y_pred,
y_pred_std,
train_r2=r2_train,
test_r2=r2_test,
)
raise SystemExit  # intentional early stop: skip the legacy parity plot below
# Plot a parity plot for train and test data
output_name = output_names[0]
output_index = OUTPUT_MAPPING[output_name]
165 changes: 165 additions & 0 deletions sandbox/src/mlp.py
@@ -0,0 +1,165 @@
import random
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, WhiteKernel, ConstantKernel as C
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

def clean_data(full_data, response):
"""Handle missing or non-numeric data"""

# Remove rows with multiple components
full_data = full_data[full_data["COMPONENTS"] == 1]
full_data.reset_index(drop=True, inplace=True)

# Remove response rows with bad values
full_data = full_data.loc[~full_data[response].isin([np.nan, np.inf, -np.inf])]
full_data.reset_index(drop=True, inplace=True)

# Remove feature columns with bad values
numeric_cols = full_data.select_dtypes(include=[np.number]).columns
bad_cols = np.isnan(full_data[numeric_cols]).any(axis=0) | np.isinf(full_data[numeric_cols]).any(axis=0)
full_data = full_data.drop(columns=numeric_cols[bad_cols])
return full_data

OUTPUT_MAPPING = {"ACTIVITY": 0, "GROWTH": 1, "SYMMETRY": 2}

# Load data
data_path = "../../data/ARCADE/C-feature_0.0_metric_15-04032023.csv"
data = pd.read_csv(data_path)
output_names = ["ACTIVITY", "GROWTH", "SYMMETRY"]
features = [
"RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
"NODES", "EDGES", "GRADIUS", "GDIAMETER", "AVG_ECCENTRICITY",
"AVG_SHORTEST_PATH", "AVG_IN_DEGREES", "AVG_OUT_DEGREES",
"AVG_DEGREE", "AVG_CLUSTERING", "AVG_CLOSENESS",
"AVG_BETWEENNESS", "AVG_CORENESS"
]
spatial_features = [
"RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
"NODES", "EDGES", "GRADIUS", "GDIAMETER", "AVG_ECCENTRICITY",
"AVG_SHORTEST_PATH", "AVG_IN_DEGREES", "AVG_OUT_DEGREES",
"AVG_DEGREE", "AVG_CLUSTERING", "AVG_CLOSENESS",
"AVG_BETWEENNESS", "AVG_CORENESS", "GRADIUS:FLOW",
"GDIAMETER:FLOW", "AVG_ECCENTRICITY:FLOW", "AVG_SHORTEST_PATH:FLOW",
"AVG_CLOSENESS:FLOW", "AVG_BETWEENNESS:FLOW", "GRADIUS:WALL",
"GDIAMETER:WALL", "AVG_ECCENTRICITY:WALL", "AVG_SHORTEST_PATH:WALL",
"AVG_CLOSENESS:WALL", "AVG_BETWEENNESS:WALL", "GRADIUS:SHEAR",
"GDIAMETER:SHEAR", "AVG_ECCENTRICITY:SHEAR", "AVG_SHORTEST_PATH:SHEAR",
"AVG_CLOSENESS:SHEAR", "AVG_BETWEENNESS:SHEAR", "GRADIUS:RADIUS",
"GDIAMETER:RADIUS", "AVG_ECCENTRICITY:RADIUS", "AVG_SHORTEST_PATH:RADIUS",
"AVG_CLOSENESS:RADIUS", "AVG_BETWEENNESS:RADIUS", "GRADIUS:PRESSURE_AVG",
"GDIAMETER:PRESSURE_AVG", "AVG_ECCENTRICITY:PRESSURE_AVG",
"AVG_SHORTEST_PATH:PRESSURE_AVG", "AVG_CLOSENESS:PRESSURE_AVG",
"AVG_BETWEENNESS:PRESSURE_AVG", "GRADIUS:PRESSURE_DELTA",
"GDIAMETER:PRESSURE_DELTA", "AVG_ECCENTRICITY:PRESSURE_DELTA",
"AVG_SHORTEST_PATH:PRESSURE_DELTA", "AVG_CLOSENESS:PRESSURE_DELTA",
"AVG_BETWEENNESS:PRESSURE_DELTA", "GRADIUS:OXYGEN_AVG",
"GDIAMETER:OXYGEN_AVG", "AVG_ECCENTRICITY:OXYGEN_AVG",
"AVG_SHORTEST_PATH:OXYGEN_AVG", "AVG_CLOSENESS:OXYGEN_AVG",
"AVG_BETWEENNESS:OXYGEN_AVG", "GRADIUS:OXYGEN_DELTA",
"GDIAMETER:OXYGEN_DELTA", "AVG_ECCENTRICITY:OXYGEN_DELTA",
"AVG_SHORTEST_PATH:OXYGEN_DELTA", "AVG_CLOSENESS:OXYGEN_DELTA",
"AVG_ECCENTRICITY_WEIGHTED",
"AVG_CLOSENESS_WEIGHTED", "AVG_CORENESS_WEIGHTED",
"AVG_BETWEENNESS_WEIGHTED", "AVG_OUT_DEGREES_WEIGHTED",
"AVG_IN_DEGREES_WEIGHTED", "AVG_DEGREE_WEIGHTED",
"GRADIUS:INVERSE_DISTANCE", "GDIAMETER:INVERSE_DISTANCE",
"AVG_ECCENTRICITY:INVERSE_DISTANCE", "AVG_SHORTEST_PATH:INVERSE_DISTANCE",
"AVG_CLOSENESS:INVERSE_DISTANCE", "AVG_BETWEENNESS:INVERSE_DISTANCE"
]
selected_features_indices = [2,0,17,6,7,3,18,4,11,15]
features = np.array(features)#[selected_features_indices]
selected_features = features

#features = spatial_features

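# NOTE: the GP kernel definitions below are unused in this script (the model
# here is an MLP); they appear carried over from gp.py.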
kernel = C(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))
kernel = C(1.0, (1e-3, 1e3)) * Matern(length_scale=1, length_scale_bounds=(1e-2, 1e2), nu=1.5) + WhiteKernel(noise_level=1e-2, noise_level_bounds=(1e-4, 1e-1))
train = True

feature_selection_results = {}
for iteration in range(1):#(len(features)):
print("="*10 + f"Random Feature Set {iteration+1}" + "="*10)
#selected_features = random.sample(features, random.randint(2, 3))
#X = data[selected_features].values
selected_features = np.array(features)#[:iteration+1]
X = data[selected_features].values # Inputs
y = data[output_names].values # Outputs


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
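# As in gp.py, the same scaler instance is re-fit on y, overwriting the X statistics.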
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)

if train:
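# Small fully connected network: hidden layers of 5 and 10 units with logistic
# activations; the lbfgs solver converges well on small datasets.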
mlp = MLPRegressor(hidden_layer_sizes=(5, 10),
activation="logistic",
alpha=0.316,
solver="lbfgs",
max_iter=1000,
random_state=42)
mlp.fit(X_train, y_train)

y_pred = mlp.predict(X_test)
y_pred_train = mlp.predict(X_train)
# Convert back to original scale
"""
y_pred = scaler.inverse_transform(y_pred)
y_pred_train = scaler.inverse_transform(y_pred_train)
y_train = scaler.inverse_transform(y_train)
y_test = scaler.inverse_transform(y_test)
"""

for i, name in enumerate(output_names[:1]):
r2_train = r2_score(y_train[:, i], y_pred_train[:, i])
r_train = np.corrcoef(y_train[:, i], y_pred_train[:, i])[0, 1]
mse_train = mean_squared_error(y_train[:, i], y_pred_train[:, i])
r2_test = r2_score(y_test[:, i], y_pred[:, i])
r_test = np.corrcoef(y_test[:, i], y_pred[:, i])[0, 1]
mse_test = mean_squared_error(y_test[:, i], y_pred[:, i])
# Save results
feature_selection_results[f"Feature Set {iteration+1}"] = {
"Selected Features": selected_features,
"R² Train": f"{r2_train:.3f}",
"R² Test": f"{r2_test:.3f}",
"MSE Train": f"{mse_train:.3f}",
"MSE Test": f"{mse_test:.3f}",
}

# Plot a parity plot for train and test data
output_name = output_names[0]
output_index = OUTPUT_MAPPING[output_name]
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
ax.scatter(y_train[:, output_index], y_pred_train[:, output_index], label="Train")
ax.scatter(y_test[:, output_index], y_pred[:, output_index], label="Test")
ax.set_title(f"{output_name} (R^2 Train: {r2_train:.3f}, R^2 Test: {r2_test:.3f})")
ax.set_xlabel("True")
ax.set_ylabel("Predicted")
ax.set_xlim(-3, 3)
ax.set_ylim(-3, 3)
xlim = ax.get_xlim()
ax.plot(xlim, xlim, 'k--', label="Parity Line")
ax.legend()
plt.tight_layout()

# Save the plot
selected_features_combined = "_".join(selected_features)
plt.savefig(f"parity_plot.png")#_{selected_features_combined}.png")

results_df = pd.DataFrame.from_dict(feature_selection_results, orient="index")
results_df.sort_values(by="R² Test", ascending=False, inplace=True)

# Save results to a CSV
#results_df.to_csv("feature_selection_results.csv", index=False)
print(results_df.head(1))
