From a39edc4068bf241e336785a21cfbaef8844a96a1 Mon Sep 17 00:00:00 2001
From: Anthony Miyaguchi
Date: Tue, 10 Dec 2019 22:21:13 -0800
Subject: [PATCH] Update clustering

---
 NOTES.md                          |  8 ++++++++
 wikicast/experiment_clustering.py | 20 ++++++++++----------
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/NOTES.md b/NOTES.md
index 47ce76d..bb8659b 100644
--- a/NOTES.md
+++ b/NOTES.md
@@ -145,3 +145,11 @@ for i in {0..6}; do scripts/run-command subgraph pageview --artifact-path sample
 ```bash
 gsutil -m rsync -d -r data/design_matrix gs://wiki-forecast-data/design_matrix
 ```
+
+```bash
+docker run -v `pwd`:/app -v `realpath ../wikipedia-dump`:/app/data -it trmf
+
+pip install pandas pyarrow
+pip install -e external/exp-trmf-nips16/python
+python -m wikicast.trmf_forecast
+```
diff --git a/wikicast/experiment_clustering.py b/wikicast/experiment_clustering.py
index 9997b84..e269fdb 100644
--- a/wikicast/experiment_clustering.py
+++ b/wikicast/experiment_clustering.py
@@ -90,9 +90,9 @@ def run_trial(data, output, window_size, num_windows):
     # return_train_score=False,
     # )
     search_ridge = ridge
-    # results += run_ablation(
-    #     "ridge regression", search_ridge, train, validate, test, features_dict
-    # )
+    results += run_ablation(
+        "ridge regression", search_ridge, train, validate, test, features_dict
+    )
     # write_search_results(search_ridge, f"{output}/ridge-random.csv")
 
     # use lbfgs when the dataset is small, does not require a learning rate
@@ -143,18 +143,18 @@ def best_nn_random(params, output, **kwargs):
     layers = [(64, 32, 64, 16)]
     params = {
         "hidden_layer_sizes": layers,
-        #"alpha": stats.reciprocal(1e-3, 1e6),
+        "alpha": [0.002, 20],
     }
-    search = best_nn_grid(params, f"{output}/nn-grid-no-regularization.csv")
+    search = best_nn_grid(params, f"{output}/nn-grid-regularization.csv")
 
     # layers = [
-    #     np.hstack([train] + features).shape[1],
-    #     (128, 8, 128),
-    #     (16, 8, 8, 8),
+    #     #np.hstack([train] + features).shape[1],
+    #     #(128, 8, 128),
+    #     #(16, 8, 8, 8),
     #     (64, 32, 64, 16),
     # ]
-    # params = {"hidden_layer_sizes": layers, "alpha": stats.reciprocal(1e2, 1e8)}
-    # search = best_nn_random(params, f"{output}/nn-grid-layers-best.csv")
+    # params = {"hidden_layer_sizes": layers, "alpha": stats.reciprocal(1e-4, 1e2)}
+    # search = best_nn_random(params, f"{output}/nn-grid-layers-best.csv", n_iter=10)
 
     best_nn = search.best_estimator_
     results += run_ablation(