-
Notifications
You must be signed in to change notification settings - Fork 21
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Ml sequencing #282
base: main
Are you sure you want to change the base?
Ml sequencing #282
Changes from 8 commits
1c45d4e
9256184
54b3552
e188e75
4faa7e3
f2dd427
aad10b7
889488c
c09170c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,4 +38,5 @@ mike-*.yml | |
.ipynb_checkpoints | ||
examples | ||
**/outputs/ | ||
**/tmp/ | ||
**/tmp/ | ||
temp/ |
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,7 +12,10 @@ prettytable >= 3, < 4 | |
python-Levenshtein >= 0.21, < 0.26 | ||
rich >= 12, < 14 | ||
Rtree >= 1, < 2 | ||
seaborn < 0.14 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should go into |
||
s2sphere < 0.3 | ||
scikit-learn >= 1.2, < 2 | ||
shapely >= 1, < 3 | ||
tensorflow < 2.17 | ||
tensorflow-probability < 0.25 | ||
xlrd >= 2, < 3 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
from typing import Optional | ||
|
||
import numpy as np | ||
from tensorflow import keras | ||
import tensorflow_probability as tfp | ||
import tf_keras as tfk | ||
|
||
tfd = tfp.distributions | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you can do: |
||
tfpl = tfp.layers | ||
tfkl = tfk.layers | ||
|
||
from pam.core import Population | ||
from pam.planner.encoder import PlansSequenceEncoder | ||
|
||
|
||
class ScheduleModelSimple:
    """Sequence-to-sequence LSTM model predicting activity durations from activity sequences.

    An encoder LSTM reads the embedded activity tokens of each plan; its final hidden and
    cell states initialise a two-layer decoder LSTM, followed by a single-unit dense layer,
    which is trained to output the next duration given the previous ones.
    """

    def __init__(
        self,
        population: Population,
        n_units: Optional[int] = 50,
        dropout: Optional[float] = 0.1,
        *,
        loss: str = "mean_squared_error",
        optimizer: str = "adam",
    ) -> None:
        """Encode the population and build (but do not fit) the model.

        Args:
            population (Population): A PAM population used to build the training arrays.
            n_units (Optional[int], optional): Units of every LSTM layer. Defaults to 50.
            dropout (Optional[float], optional): Dropout rate of the decoder LSTM layers. Defaults to 0.1.
            loss (str, optional): Loss passed to ``model.compile``. Defaults to "mean_squared_error".
            optimizer (str, optional): Optimizer passed to ``model.compile``. Defaults to "adam".
        """
        self.encoder = PlansSequenceEncoder(population=population)

        # Encoder: embed the activity tokens (token 0 is masked) and keep only the
        # final LSTM states, which seed the decoder.
        input_acts = keras.layers.Input(shape=[self.encoder.acts.shape[1]])
        emb_acts = keras.layers.Embedding(
            len(self.encoder.activity_encoder.labels), 1, mask_zero=True, name="emb"
        )(input_acts)
        _, encoder_h, encoder_c = keras.layers.LSTM(
            n_units, return_state=True, name="encoder_h1"
        )(emb_acts)
        encoder_state = [encoder_h, encoder_c]

        # Decoder: consumes the durations shifted by one step (hence the "- 1").
        decoder_input = keras.layers.Input(shape=[self.encoder.durations.shape[1] - 1, 1])
        decoder_h1 = keras.layers.LSTM(
            n_units, name="decoder_h1", dropout=dropout, return_sequences=True
        )(decoder_input, initial_state=encoder_state)
        decoder_h2 = keras.layers.LSTM(
            n_units, name="decoder_h2", dropout=dropout, return_sequences=True
        )(decoder_h1)
        decoder_output = keras.layers.Dense(1, activation="relu", name="decoder_output")(decoder_h2)
        model = keras.models.Model(inputs=[input_acts, decoder_input], outputs=[decoder_output])

        # NOTE(review): "accuracy" is presumably not informative for a continuous target;
        # it is kept so that the keys of the training history stay unchanged.
        model.compile(loss=loss, optimizer=optimizer, metrics=["accuracy"])
        model.summary()

        self.model = model

    def fit(self, epochs: int = 500) -> None:
        """Fit the scheduling model.

        The activity sequences are fed in reverse order. The decoder receives every duration
        but the last one and is trained against every duration but the first one.

        Args:
            epochs (int, optional): Number of epochs to run. Defaults to 500.
        """
        X = self.encoder.acts[:, ::-1]
        durations = self.encoder.durations
        self.history = self.model.fit([X, durations[:, :-1]], durations[:, 1:], epochs=epochs)

    def predict(self, population: Population) -> np.ndarray:
        """Predict the activity durations of a population.

        Durations are generated one step at a time: the predictions made so far are fed back
        to the decoder and the output of its last step becomes the next duration.

        Args:
            population (Population): A PAM population.

        Returns:
            np.ndarray: Durations array. Each row represents a plan.
        """
        # Reuse the fitted activity encoder so tokens match the ones seen in training.
        encoder = PlansSequenceEncoder(
            population=population, activity_encoder=self.encoder.activity_encoder
        )
        X = encoder.acts[:, ::-1]
        y_pred = np.zeros(shape=encoder.durations.shape)
        for i in range(1, y_pred.shape[1]):
            y_pred[:, i] = self.model.predict([X, y_pred[:, :i]])[:, -1, 0]

        return y_pred
|
||
|
||
class ActivityDurationRegression:
    """Feed-forward neural network regressing activity durations on activity tokens."""

    def __init__(
        self, acts: np.ndarray, durations: np.ndarray, hidden_units: tuple = (50, 20)
    ) -> None:
        """Model to predict durations of a set of activities.

        Args:
            acts (np.ndarray): Activity tokens. Shape: (n, 1).
            durations (np.ndarray): Durations. Shape: (n, 1).
            hidden_units (tuple, optional): Number of units of each hidden dense layer,
                in order. Defaults to (50, 20).
        """
        self.acts = acts
        self.durations = durations

        # set up model: a stack of ReLU dense layers ending in a single ReLU output unit
        inputs = keras.layers.Input(shape=(1,))
        hidden = inputs
        for units in hidden_units:
            hidden = keras.layers.Dense(units, activation="relu")(hidden)
        outputs = keras.layers.Dense(1, activation="relu")(hidden)
        model = keras.models.Model(inputs=inputs, outputs=outputs)
        model.compile(optimizer="adam", loss="mean_squared_error", metrics=["accuracy"])
        model.summary()

        self.model = model

    def fit(self, epochs: int = 20) -> None:
        """Fit the Neural Network model on the arrays passed to the constructor.

        Args:
            epochs (int, optional): Number of epochs to run. Defaults to 20.
        """
        self.history = self.model.fit(self.acts, self.durations, epochs=epochs)

    def predict(self, acts: np.ndarray) -> np.ndarray:
        """Predict durations.

        Args:
            acts (np.ndarray): Act tokens. Shape: (n, 1).

        Returns:
            np.ndarray: Durations. Shape: (n, 1)
        """
        return self.model.predict(acts)
|
||
|
||
class ActivityDurationMixture:
    """Mixture Density Network modelling activity durations as a Gaussian mixture."""

    def __init__(
        self,
        acts: np.ndarray,
        durations: np.ndarray,
        n_components: Optional[int] = 2,
        hidden_units: tuple = (50, 20),
    ) -> None:
        """Mixture Density Model for predicting durations of a set of activities as a multimodal distribution.

        Args:
            acts (np.ndarray): Activity tokens. Shape: (n, 1).
            durations (np.ndarray): Durations. Shape: (n, 1).
            n_components (Optional[int], optional): Number of components of the Gaussian Mixture. Defaults to 2.
            hidden_units (tuple, optional): Number of units of each hidden dense layer,
                in order. Defaults to (50, 20).
        """
        self.acts = acts
        self.durations = durations
        self.n_components = n_components

        event_shape = [1]
        # Number of raw parameters the mixture layer needs from the preceding dense layer.
        params_size = tfpl.MixtureNormal.params_size(n_components, event_shape)

        inputs = tfkl.Input(shape=(1,))
        hidden = inputs
        for units in hidden_units:
            hidden = tfkl.Dense(units, activation="relu")(hidden)
        mixture_params = tfkl.Dense(params_size, activation=None)(hidden)
        outputs = tfpl.MixtureNormal(n_components, event_shape, name="output")(mixture_params)

        def negative_log_likelihood(y, distribution):
            # Unlike the squared-error models, this model estimates the underlying
            # probability distribution, so it is fitted by Maximum Likelihood Estimation.
            return -distribution.log_prob(y)

        model = tfk.Model(inputs=inputs, outputs=outputs)
        model.compile(optimizer="adam", loss=negative_log_likelihood, metrics=["accuracy"])
        model.summary()

        self.model = model

    def fit(self, epochs: Optional[int] = 20) -> None:
        """Fit the Mixture Density Network model on the arrays passed to the constructor.

        Args:
            epochs (Optional[int], optional): Number of epochs to run. Defaults to 20.
        """
        self.history = self.model.fit(self.acts, self.durations, epochs=epochs)

    def predict(self, acts: np.ndarray) -> np.ndarray:
        """Predict durations.

        NOTE(review): the output layer is a distribution, so the returned values are
        presumably samples drawn from the predicted mixture - confirm.

        Args:
            acts (np.ndarray): Act tokens. Shape: (n, 1).

        Returns:
            np.ndarray: Durations. Shape: (n, 1)
        """
        return self.model.predict(acts)
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -4,22 +4,24 @@ | |||||
|
||||||
if TYPE_CHECKING: | ||||||
from pam.activity import Plan | ||||||
from pam.core import Population | ||||||
|
||||||
from datetime import timedelta as td | ||||||
from itertools import groupby | ||||||
from typing import List, Optional, Union | ||||||
|
||||||
import numpy as np | ||||||
import pandas as pd | ||||||
|
||||||
from pam import activity | ||||||
from pam.variables import START_OF_DAY | ||||||
|
||||||
|
||||||
class Encoder: | ||||||
def __init__(self, labels: List[str], travel_act="travel") -> None: | ||||||
self.labels = set(labels) | ||||||
self.labels = list(labels) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I assume that labels was previously converted to a set to remove duplicates, although it also removes order. Could this change have unintended effects on existing classes that subclass this Encoder? |
||||||
if travel_act not in self.labels: | ||||||
self.labels.add(travel_act) | ||||||
self.labels.append(travel_act) | ||||||
self.label_code = self.get_mapping(self.labels) | ||||||
self.code_label = {v: k for k, v in self.label_code.items()} | ||||||
|
||||||
|
@@ -169,3 +171,63 @@ class PlansOneHotEncoder(PlansEncoder): | |||||
""" | ||||||
|
||||||
plans_encoder_class = PlanOneHotEncoder | ||||||
|
||||||
|
||||||
class PlansSequenceEncoder:
    """Encode the plans of a population as padded arrays of activity tokens and durations.

    Every plan is wrapped in two marker tokens: "SOS" (start of sequence) before the first
    activity and "EOS" (end of sequence) after the last one. Plans shorter than the longest
    one are padded with zeros.
    """

    # Reserved token codes. They mirror the positions of "NA", "SOS" and "EOS" at the head
    # of the default label list built in __init__.
    # NOTE(review): a custom `activity_encoder` presumably has to use the same codes - confirm.
    PAD_TOKEN = 0
    SOS_TOKEN = 1
    EOS_TOKEN = 2

    def __init__(
        self, population: "Population", activity_encoder: Optional["Encoder"] = None
    ) -> None:
        """Encodes the plans of a population into arrays representing sequences of activities and durations.

        Args:
            population (Population): A PAM population.
            activity_encoder (Optional[Encoder], optional): Encoder of activity types.
                Defaults to None, in which case a new encoder is built from the
                population's activity classes.
        """
        self.population = population
        # "NA" = padding, "SOS" = start of sequence, "EOS" = end of sequence.
        act_labels = ["NA", "SOS", "EOS"] + list(population.activity_classes)

        if activity_encoder is None:
            self.activity_encoder = StringIntEncoder(act_labels)
        else:
            self.activity_encoder = activity_encoder

        self.acts = None
        self.acts_labels = None
        self.durations = None

        self.encode_plans()

    def encode_plans(self) -> None:
        """Encode sequences of activities and durations into numpy arrays.

        Sets three attributes:
            acts: integer array with one row per plan holding the start-of-sequence token,
                the encoded activities, the end-of-sequence token and zero padding.
            acts_labels: list with the activity labels of each plan (no marker tokens).
            durations: float array aligned with ``acts``. Each row is normalised to sum
                to 1; rows whose total duration is zero are left as zeros.
        """
        full_day = pd.Timedelta(hours=24)

        acts = []
        acts_labels = []
        durations = []
        for _, _, person in self.population.people():
            activities = list(person.activities)
            tokens = [self.activity_encoder.encode(act.act) for act in activities]
            day_shares = [act.duration / full_day for act in activities]

            # the marker tokens carry no duration
            acts.append([self.SOS_TOKEN, *tokens, self.EOS_TOKEN])
            acts_labels.append([act.act for act in activities])
            durations.append([0, *day_shares, 0])

        # convert to arrays; the DataFrame pads shorter plans with NaN
        acts = pd.DataFrame(acts).fillna(self.PAD_TOKEN).values.astype(int)
        durations = pd.DataFrame(durations).fillna(0).to_numpy(dtype=float)

        # normalise each plan so that its durations sum to 1 (i.e. one full day);
        # skip zero totals to avoid producing NaN rows
        totals = durations.sum(axis=1, keepdims=True)
        durations = np.divide(
            durations, totals, out=np.zeros_like(durations), where=totals != 0
        )

        # store
        self.acts = acts
        self.acts_labels = acts_labels
        self.durations = durations
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import numpy as np | ||
import pytest | ||
from pam.planner.choice_scheduling import ( | ||
ActivityDurationMixture, | ||
ActivityDurationRegression, | ||
ScheduleModelSimple, | ||
) | ||
from pam.planner.encoder import PlansSequenceEncoder | ||
from tensorflow import keras | ||
|
||
|
||
@pytest.fixture
def model_simple(population_simple) -> ScheduleModelSimple:
    """Unfitted scheduling model built on the simple test population."""
    model = ScheduleModelSimple(population_simple)
    return model
|
||
|
||
@pytest.fixture
def plans_encoded(population_simple) -> PlansSequenceEncoder:
    """Sequence encoding of the plans of the simple test population."""
    encoded = PlansSequenceEncoder(population_simple)
    return encoded
|
||
|
||
def test_start_end_tokens(model_simple):
    """The start- and end-of-sequence labels are encoded as 1 and 2."""
    codes = model_simple.encoder.activity_encoder.label_code
    assert codes["SOS"] == 1
    assert codes["EOS"] == 2
|
||
|
||
def test_prediction_shape_matches_input(model_simple, population_simple):
    """Predictions have the same shape as the encoded training durations."""
    model_simple.fit(epochs=2)
    predicted = model_simple.predict(population_simple)
    expected_shape = model_simple.encoder.durations.shape
    np.testing.assert_equal(predicted.shape, expected_shape)
|
||
|
||
def test_model_built(model_simple):
    """The constructor leaves a Keras model on the instance."""
    built = model_simple.model
    assert isinstance(built, keras.models.Model)
|
||
|
||
def test_activity_duration_regression(plans_encoded):
    """The regression model returns one duration per input token."""
    first_tokens = plans_encoded.acts[:, [0]]
    first_durations = plans_encoded.durations[:, 0]
    regressor = ActivityDurationRegression(acts=first_tokens, durations=first_durations)
    regressor.fit(epochs=1)
    predicted = regressor.predict(first_tokens)
    assert predicted.shape == (len(first_durations), 1)
|
||
|
||
def test_activity_duration_mixture(plans_encoded):
    """The mixture density model returns one duration per input token."""
    first_tokens = plans_encoded.acts[:, [0]]
    first_durations = plans_encoded.durations[:, 0]
    mixture = ActivityDurationMixture(acts=first_tokens, durations=first_durations)
    mixture.fit(epochs=1)
    predicted = mixture.predict(first_tokens)
    assert predicted.shape == (len(first_durations), 1)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just use
tmp
for temporary folders, then it would be covered by `**/tmp/`