addressing issues 5, 8, (part of 13) - new #17

Merged
merged 19 commits on Oct 2, 2020
73 changes: 38 additions & 35 deletions alphazero/config.py
@@ -1,38 +1,41 @@

class AlphaZeroConfig:

def __init__(self):

# Molecule
self.max_atoms = 10 # max atoms in molecule
self.min_atoms = 4 # min atoms in molecule

# MCTS / rollout
self.lru_cache_maxsize = 100000
self.num_rollouts = 1000 # should we limit, if so how much?
self.num_simulations = 256 # number of simulations used by MCTS per game step
self.root_dirichlet_alpha = 0.0 # 0.3 chess, 0.03 Go, 0.15 shogi
self.root_exploration_fraction = 0.25
self.pb_c_base = 1 # 19652 in pseudocode
self.pb_c_init = 1.25
# Molecule
max_atoms = 10 # max atoms in molecule
min_atoms = 4 # min atoms in molecule

# Network
self.l2_regularization_coef = 1e-4
self.features = 16 # used by all network layers
self.num_messages = 1
self.num_heads = 4 # Number of attention heads
self.batch_size = 32 # for gradient updates
self.checkpoint_frequency = 1 # save new model file every N batches
self.batch_update_frequency = 10 # get most recent data every N updates
self.gradient_steps_per_batch = 32 # num steps per batch
self.training_iterations = int(1e06) # training iterations for NN

assert self.features % self.num_heads == 0, \
"dimension mismatch for attention heads"
# MCTS / rollout
lru_cache_maxsize = 100000
num_rollouts = 1000 # should we limit, if so how much?
num_simulations = 256 # number of simulations used by MCTS per game step
root_dirichlet_alpha = 0.0 # 0.3 chess, 0.03 Go, 0.15 shogi
root_exploration_fraction = 0.25
pb_c_base = 1 # 19652 in pseudocode
pb_c_init = 1.25
min_reward = -1. # Minimum reward to return for invalid actions
reward_buffer = 25 # 250 in the R2 paper

# Buffers
self.ranked_reward_alpha = 0.9
self.buffer_max_size = 512
# Network
l2_regularization_coef = 1e-4
features = 16 # used by all network layers
num_messages = 1
num_heads = 4 # Number of attention heads
batch_size = 32 # for gradient updates
checkpoint_frequency = 1 # save new model file every N batches
batch_update_frequency = 10 # get most recent data every N updates
gradient_steps_per_batch = 32 # num steps per batch
training_iterations = int(1e06) # training iterations for NN

# Training
self.training_steps = 100
#assert self.features % self.num_heads == 0, \
# "dimension mismatch for attention heads"

# Buffers
ranked_reward_alpha = 0.9
buffer_max_size = 512

# Training
training_steps = 100

# DB tables
sql_basename = "Stable"

# Experiment id
experiment_id = "0001"
17 changes: 11 additions & 6 deletions alphazero/game.py
@@ -4,13 +4,11 @@

from rdkit import Chem

import alphazero.config as config
from alphazero.node import Node
from alphazero.policy import policy_model
from alphazero.config import AlphaZeroConfig

model = policy_model()
config = AlphaZeroConfig()


class Game(nx.DiGraph):

@@ -74,11 +72,18 @@ def expand(self, parent: Node):
"""

# Create the children nodes and add them to the graph
self.add_edges_from(((parent, child) for child in parent.build_children()))
children = list(parent.build_children())

if not children:
parent.terminal = True
parent._reward = config.min_reward
return parent._reward

self.add_edges_from(((parent, child) for child in children))

# Run the policy network to get value and prior_logit predictions
values, prior_logits = model(parent.policy_inputs_with_children())
prior_logits = prior_logits[1:].numpy().flatten()
values, prior_logits = model.predict(parent.policy_inputs_with_children())
Collaborator comment: this seems to fix those retracing errors we were seeing

prior_logits = prior_logits[1:].flatten()

# if we're adding noise, perturb the logits
if self.dirichlet_noise:
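
To illustrate the retracing comment above (a toy sketch, not the project's model): calling a Keras model directly returns tf.Tensors and, when input signatures vary across calls, can emit tf.function retracing warnings, whereas `model.predict()` accepts and returns NumPy arrays through Keras's cached prediction function, which is also why the `.numpy()` conversion disappears in the diff.

```python
import numpy as np
import tensorflow as tf

# Minimal stand-in model (3 input features, 1 output)
inp = tf.keras.Input(shape=(3,))
out = tf.keras.layers.Dense(1)(inp)
toy_model = tf.keras.Model(inp, out)

batch = np.random.rand(5, 3).astype(np.float32)

# Direct call: returns a tf.Tensor, so downstream code needs .numpy()
direct = toy_model(batch).numpy().flatten()

# predict(): returns a NumPy array directly
predicted = toy_model.predict(batch).flatten()
```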
7 changes: 7 additions & 0 deletions alphazero/mod.py
@@ -0,0 +1,7 @@
import alphazero.config as config
Collaborator comment: we can directly modify config variables in the run scripts, so mod.py is not needed

Collaborator (author) reply: ok, mod.py was moved to the stable_radical_optimization folder and renamed to stable_rad_config.py. Loading of mod.py was also removed from all scripts in the alphazero folder.


# DB table names modified by the user according to their wish
config.sql_basename = "StableES"

# Experiment id
config.experiment_id = "0001"
15 changes: 6 additions & 9 deletions alphazero/node.py
@@ -8,13 +8,10 @@
import rdkit.Chem
from tensorflow.keras.preprocessing.sequence import pad_sequences

from alphazero.config import AlphaZeroConfig
import alphazero.config as config
from alphazero.molecule import build_molecules, build_radicals
from alphazero.preprocessor import preprocessor

CONFIG = AlphaZeroConfig()


class Node(rdkit.Chem.Mol):

def __init__(self, *args, graph: nx.DiGraph=None, terminal: bool=False, **kwargs):
@@ -61,15 +58,15 @@ def build_children(self):
if self.terminal:
raise RuntimeError("Attemping to get children of terminal node")

if self.GetNumAtoms() < CONFIG.max_atoms:
if self.GetNumAtoms() < config.max_atoms:
for mol in build_molecules(self, stereoisomers=False):
if self.G.has_node(mol):
# Check if the graph already has the current mol
yield self.G.nodes[mol]
else:
yield self.__class__(mol, graph=self.G)

if self.GetNumAtoms() >= CONFIG.min_atoms:
if self.GetNumAtoms() >= config.min_atoms:
for radical in build_radicals(self):
yield self.__class__(radical, graph=self.G, terminal=True)

@@ -114,8 +111,8 @@ def value(self):

def ucb_score(self, parent):

pb_c = np.log((parent.visits + CONFIG.pb_c_base + 1) /
CONFIG.pb_c_base) + CONFIG.pb_c_init
pb_c = np.log((parent.visits + config.pb_c_base + 1) /
config.pb_c_base) + config.pb_c_init

pb_c *= np.sqrt(parent.visits) / (self.visits + 1)

@@ -217,4 +214,4 @@ def get_reward(self):
"""This should get overwritten by a subclass's reward function.
(Should this be using ranked rewards?)
"""
pass
pass
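
For context, the pb_c computation shown in ucb_score above is the exploration term of the standard AlphaZero PUCT rule; the rest of the method is collapsed in this diff, but assuming it follows the DeepMind pseudocode it combines that term with the child's prior and value, roughly:

```python
import numpy as np

def puct_score(parent_visits, child_visits, child_prior, child_value,
               pb_c_base=1, pb_c_init=1.25):
    """Sketch of the standard AlphaZero PUCT score using the config defaults
    above; not necessarily identical to the collapsed part of Node.ucb_score."""
    pb_c = np.log((parent_visits + pb_c_base + 1) / pb_c_base) + pb_c_init
    pb_c *= np.sqrt(parent_visits) / (child_visits + 1)
    prior_score = pb_c * child_prior
    return prior_score + child_value
```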
18 changes: 8 additions & 10 deletions alphazero/policy.py
@@ -2,10 +2,8 @@
from tensorflow.keras import layers
import nfp

import alphazero.config as config
from alphazero.preprocessor import preprocessor
from alphazero.config import AlphaZeroConfig

CONFIG = AlphaZeroConfig()

# two models:
# first, a policy model that predicts value, pi_logits from a batch of molecule inputs
@@ -21,24 +19,24 @@ def policy_model():
input_tensors = [atom_class, bond_class, connectivity]

# Initialize the atom states
atom_state = layers.Embedding(preprocessor.atom_classes, CONFIG.features,
atom_state = layers.Embedding(preprocessor.atom_classes, config.features,
name='atom_embedding', mask_zero=True)(atom_class)

# Initialize the bond states
bond_state = layers.Embedding(preprocessor.bond_classes, CONFIG.features,
bond_state = layers.Embedding(preprocessor.bond_classes, config.features,
name='bond_embedding', mask_zero=True)(bond_class)

units = CONFIG.features//CONFIG.num_heads
global_state = nfp.GlobalUpdate(units=units, num_heads=CONFIG.num_heads)(
units = config.features//config.num_heads
global_state = nfp.GlobalUpdate(units=units, num_heads=config.num_heads)(
[atom_state, bond_state, connectivity])

for _ in range(CONFIG.num_messages): # Do the message passing
for _ in range(config.num_messages): # Do the message passing
bond_state = nfp.EdgeUpdate()([atom_state, bond_state, connectivity, global_state])
atom_state = nfp.NodeUpdate()([atom_state, bond_state, connectivity, global_state])
global_state = nfp.GlobalUpdate(units=units, num_heads=CONFIG.num_heads)(
global_state = nfp.GlobalUpdate(units=units, num_heads=config.num_heads)(
[atom_state, bond_state, connectivity, global_state])

value = layers.Dense(1, activation='tanh')(global_state)
pi_logit = layers.Dense(1)(global_state)

return tf.keras.Model(input_tensors, [value, pi_logit], name='policy_model')
return tf.keras.Model(input_tensors, [value, pi_logit], name='policy_model')
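
Note that `units = config.features // config.num_heads` relies on the divisibility check that the config.py diff comments out; with the default settings this is 16 // 4 = 4 units per attention head. A run script could reinstate the guard, for example:

```python
import alphazero.config as config

# The constructor-time assert no longer runs with module-level config values,
# so re-check feature/head compatibility before building the model:
assert config.features % config.num_heads == 0, (
    "features (%d) must be divisible by num_heads (%d)"
    % (config.features, config.num_heads))

units_per_head = config.features // config.num_heads   # 16 // 4 == 4
```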