Skip to content

Commit

Permalink
add simple feature selection code
Browse files Browse the repository at this point in the history
  • Loading branch information
pohaoc2 committed Dec 5, 2024
1 parent 5058ce3 commit 4614539
Show file tree
Hide file tree
Showing 8 changed files with 272 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ variance_formatted/
overfit/
training_vis.ipynb
variance.ipynb

*.png

# Editors
.vscode/
Expand Down
162 changes: 161 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ black = "23.1.0"
mypy = "1.5.0"
statsmodels = "^0.14.0"
torch = "^2.5.1"
shap = "^0.46.0"

[tool.poetry.dev-dependencies]
black = "^23.1.0"
Expand Down
41 changes: 23 additions & 18 deletions sandbox/src/approximate_bayesian.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,10 @@ def distance_function(y_obs, y_sim=0):
"""
Distance measure between observed and simulated outputs.
"""
sigma_sim = np.std(y_sim)
if sigma_sim == 0: # Handle edge case for zero variance
sigma_sim = 1e-6
return np.sum(np.abs(y_obs - y_sim))
return np.sum(((y_obs - y_sim) ** 2))


# ABC algorithm
def abc(data, y_sims, y_obs, epsilon):
"""
Expand All @@ -33,38 +32,44 @@ def abc(data, y_sims, y_obs, epsilon):

# Compute the distance between observed and simulated outputs
distance = distance_function(y_obs, y_sim)

# Accept or reject based on epsilon
if distance <= epsilon:
accepted_parameters.append(row)
accepted_parameters.append(y_sim[0])

return pd.DataFrame(accepted_parameters)
return accepted_parameters

def main():
# Script entry point: read the ARCADE CSV, histogram the simulated outputs,
# run abc() against a single observed value, and save the resulting figures
# as PNG files in the working directory.
#
# NOTE(review): this block looks like a rendered diff with indentation
# stripped -- pairs of near-identical consecutive statements are presumably
# the removed and added versions of the same line. Confirm against the
# repository which of each pair is live before relying on it.
# Load ABM data
data_path = "../data/ARCADE/C-feature_0.0_metric_15-04032023.csv"
data_path = "../../data/ARCADE/C-feature_0.0_metric_15-04032023.csv"
data = pd.read_csv(data_path)
# Columns handed to abc() as `data` (first argument).
input_feature_names = ["NODES", "EDGES", "GRADIUS"]
input_feature_names = ["ACTIVITY"]
# Columns used as the simulated outputs (`y_sims`) that abc() compares to y_obs.
predicted_output = ["ACTIVITY", "GROWTH", "SYMMETRY"]
predicted_output = ["ACTIVITY"]#, "GROWTH", "SYMMETRY"]
input_features = data[input_feature_names].values
y_sims = data[predicted_output].values
# One histogram per predicted output column, saved to y_sims.png.
fig, ax = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
for i, feature in enumerate(predicted_output):
ax[i].hist(data[feature], bins=50)
ax[i].set_title(feature)
plt.savefig("y_sims.png")
# Two-panel figure: left panel is the histogram of all simulated values,
# right panel is filled in further down with the samples accepted by abc().
fig, ax = plt.subplots(1, 2, figsize=(10, 5), sharey=True)
# Keep the bin edges so the right-hand histogram can reuse the same bins.
_, bins, patch = ax[0].hist(y_sims, bins=20)
ax[0].set_title("Prior - Activity")
ax[0].set_xlim([-1, 1])

# Observed value passed to abc() as y_obs.
y_obs = 1
y_obs = 0.25
print(f"Number of samples: {len(data)}")
# NOTE(review): if this early return is live, nothing below it executes.
# Presumably it belongs to the removed side of the diff -- confirm.
return 0
# ABC setup
# Acceptance threshold passed to abc() as epsilon.
epsilon = 500
epsilon = 0.25

# Run ABC
posterior_samples = abc(input_features, y_sims, y_obs, epsilon)
# posterior_samples.to_csv("posterior_samples.csv", index=False)
print(f"Number of accepted samples: {len(posterior_samples)}")
# Plot the accepted samples
ax[1].hist(posterior_samples, bins=bins)
ax[1].set_title("Posterior - Activity")
ax[1].axvline(x=y_obs, color="red", linestyle="--", label="Observed")
# Plot epsilon
# NOTE(review): these guides are drawn at y_obs +/- epsilon. If the distance
# in use is the squared-difference variant, a threshold of `distance <= epsilon`
# would correspond to |y_sim - y_obs| <= sqrt(epsilon) for a single output, so
# the lines would not mark the real acceptance boundary -- confirm.
ax[1].axvline(x=y_obs + epsilon, color="black", linestyle="--", label="Epsilon")
ax[1].axvline(x=y_obs - epsilon, color="black", linestyle="--")
ax[1].legend()
ax[1].set_xlim([-1, 1])
plt.tight_layout()
plt.savefig("posterior_samples.png")

if __name__ == "__main__":
main()
Loading

0 comments on commit 4614539

Please sign in to comment.