
Commit

Add changes to most recent run
acmiyaguchi committed Dec 10, 2019
1 parent 1582b95 commit 085cc7b
Showing 4 changed files with 51 additions and 40 deletions.
scripts/start-jupyter: 5 changes (0 additions, 5 deletions)
@@ -7,12 +7,7 @@ PYSPARK_DRIVER_PYTHON=jupyter \
PYSPARK_DRIVER_PYTHON_OPTS=notebook \
pyspark \
--master 'local[*]' \
<<<<<<< HEAD
--conf spark.driver.memory=24g \
--conf spark.sql.shuffle.partitions=16 \
=======
--conf spark.driver.memory=8g \
--conf spark.sql.shuffle.partitions=8 \
>>>>>>> 59def8f9b64a5cabc09cf9bc199db4d3a24ae42b
--packages \
graphframes:graphframes:0.7.0-spark2.4-s_2.11
wikicast/experiment_clustering.py: 78 changes (45 additions, 33 deletions)
@@ -2,6 +2,7 @@
from datetime import datetime, timedelta
from itertools import chain, product
from time import time
import warnings

import click
import networkx as nx
@@ -23,6 +24,16 @@
plot_learning_curve,
)

# Turn off warning in this experiment
# FutureWarning: The default value of multioutput (not exposed in score method)
# will change from 'variance_weighted' to 'uniform_average' in 0.23 to keep
# consistent with 'metrics.r2_score'. To specify the default value manually and
# avoid the warning, please either call 'metrics.r2_score' directly or make a
# custom scorer with 'metrics.make_scorer' (the built-in scorer 'r2' uses
# multioutput='uniform_average').
warnings.filterwarnings("ignore")


N_CUTS = 8
START_DATE = datetime.strptime("2018-01-01", "%Y-%m-%d")
END_DATE = datetime.strptime("2019-09-01", "%Y-%m-%d")
@@ -58,30 +69,31 @@ def run_trial(data, output, window_size, num_windows)
summarize("mean", test, (np.ones(test.shape).T * validate.mean(axis=1)).T),
]

pred = run_train_predict(Ridge(alpha=0), train, validate, test, [])
results += [summarize("linear regression", test, pred)]
pred = run_train_predict(Ridge(alpha=0), train[:, -window_size:], validate, test, [])
results += [summarize("linear regression (no history)", test, pred)]

scoring = {
"rmse": make_scorer(rmse, greater_is_better=False),
"mape": make_scorer(mape, greater_is_better=False),
}

ridge = Ridge()
params = dict(alpha=stats.reciprocal(a=1e5, b=1e9))
search_ridge = RandomizedSearchCV(
estimator=ridge,
param_distributions=params,
scoring=scoring,
refit="rmse",
cv=5,
n_iter=10,
return_train_score=False,
)

results += run_ablation(
"ridge regression", search_ridge, train, validate, test, features_dict
)
write_search_results(search_ridge, f"{output}/ridge-random.csv")
print("starting ridge")
ridge = Ridge(solver="lsqr", alpha=1.8e8)
# params = dict(alpha=stats.reciprocal(a=1e5, b=1e9))
# search_ridge = RandomizedSearchCV(
# estimator=ridge,
# param_distributions=params,
# scoring=scoring,
# refit="rmse",
# cv=5,
# n_iter=5,
# return_train_score=False,
# )
search_ridge = ridge
# results += run_ablation(
# "ridge regression", search_ridge, train, validate, test, features_dict
# )
# write_search_results(search_ridge, f"{output}/ridge-random.csv")

# use lbfgs when the dataset is small, does not require a learning rate
solver = "adam"
@@ -128,21 +140,21 @@ def best_nn_random(params, output, **kwargs):
# "hidden_layer_sizes": layers,
# "alpha": [5e5, 5e4, 0.5],
# }
# params = {
# "activation": ["relu"],
# "hidden_layer_sizes": layers,
# "alpha": stats.reciprocal(1e3, 1e6),
# }
# search = best_nn_random(params, f"{output}/nn-grid-layers-random.csv", n_iter=10)

layers = [
np.hstack([train] + features).shape[1],
(128, 8, 128),
(16, 8, 8, 8),
(64, 32, 64, 16),
]
params = {"activation": ["relu"], "hidden_layer_sizes": layers}
search = best_nn_grid(params, f"{output}/nn-grid-layers-best.csv")
layers = [(64, 32, 64, 16)]
params = {
"hidden_layer_sizes": layers,
#"alpha": stats.reciprocal(1e-3, 1e6),
}
search = best_nn_grid(params, f"{output}/nn-grid-no-regularization.csv")

# layers = [
# np.hstack([train] + features).shape[1],
# (128, 8, 128),
# (16, 8, 8, 8),
# (64, 32, 64, 16),
# ]
# params = {"hidden_layer_sizes": layers, "alpha": stats.reciprocal(1e2, 1e8)}
# search = best_nn_random(params, f"{output}/nn-grid-layers-best.csv")

best_nn = search.best_estimator_
results += run_ablation(
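
A note on the blanket warnings.filterwarnings("ignore") added above: it silences every warning raised during the experiment, not only the sklearn FutureWarning quoted in the comment. A narrower sketch, assuming that scorer warning is the only one worth suppressing (the r2_uniform name is illustrative, not part of this commit):

import warnings
from sklearn.metrics import make_scorer, r2_score

# Suppress only FutureWarning rather than every warning category.
warnings.filterwarnings("ignore", category=FutureWarning)

# Or avoid the warning at the source by pinning the multioutput default,
# as the warning text itself suggests.
r2_uniform = make_scorer(r2_score, multioutput="uniform_average")
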
wikicast/trmf_forecast.py: 4 changes (2 additions, 2 deletions)
@@ -27,9 +27,9 @@
# lags = itertools.chain(
# range(1, 8), range(7 * 4, 8 * 4), range(7 * 8, 8 * 8)
# )
lags = list(range(1,8))
lags = list(range(1, 8))
lag_set = sp.array(list(lags), dtype=sp.uint32)
k = 128
k = 64
lambdaI = 2
lambdaAR = 625
lambdaLag = 0.5
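
For context on the lag change above: the commented-out construction mixes daily lags with lags around the four- and eight-week marks, while the committed line keeps only the daily lags. A small sketch of the two index sets, for illustration only:

import itertools

# Lags from the commented-out construction: days 1-7 plus the
# 4-week (28-31) and 8-week (56-63) neighbourhoods.
full_lags = list(itertools.chain(range(1, 8), range(7 * 4, 8 * 4), range(7 * 8, 8 * 8)))

# The committed version keeps only the daily lags.
daily_lags = list(range(1, 8))  # [1, 2, 3, 4, 5, 6, 7]
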
wikicast/util.py: 4 changes (4 additions, 0 deletions)
@@ -33,6 +33,10 @@ def run_ablation(name, model, train, validate, test, features_dict, trial_id=-1)
pred = run_train_predict(model, train, validate, test, list(features_dict.values()))
results.append(summarize(name, test, pred, trial_id=trial_id))

# run without history
pred = run_train_predict(model, train[:, -validate.shape[0]:], validate, test, list(features_dict.values()))
results.append(summarize(f"{name}: without history", test, pred, trial_id=trial_id))

# run without a single feature
for feature_name in features_dict.keys():
features_list = [v for k, v in features_dict.items() if k != feature_name]
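
The new "without history" ablation above trims the training matrix to its most recent columns, keeping as many as the validation window is wide. A minimal sketch of that slicing with hypothetical shapes (the sizes below are illustrative, not taken from the repository):

import numpy as np

# Hypothetical example: 100 pages, 52 weeks of training traffic,
# and an 8-week validation window.
train = np.random.rand(100, 52)
validate = np.random.rand(100, 8)

# Keep only the most recent validate.shape[0] columns of history.
recent_only = train[:, -validate.shape[0]:]
print(recent_only.shape)  # (100, 8)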
