NREL · pstjohn · Oct 2, 2020 · Sep 30, 2020 · Sep 30, 2020 · Sep 30, 2020
diff --git a/alphazero/config.py b/alphazero/config.py
@@ -1,38 +1,41 @@
-
-class AlphaZeroConfig:
-
-    def __init__(self):
-
-        # Molecule 
-        self.max_atoms = 10  # max atoms in molecule
-        self.min_atoms = 4  # max atoms in molecule
-
-        # MCTS / rollout
-        self.lru_cache_maxsize = 100000 
-        self.num_rollouts = 1000   # should we limit, if so how much?
-        self.num_simulations = 256  # number of simulations used by MCTS per game step
-        self.root_dirichlet_alpha = 0.0  # 0.3 chess, 0.03 Go, 0.15 shogi
-        self.root_exploration_fraction = 0.25
-        self.pb_c_base = 1   # 19652 in pseudocode
-        self.pb_c_init = 1.25
+# Molecule 
+max_atoms = 10  # max atoms in molecule
+min_atoms = 4  # min atoms in molecule
 
-        # Network
-        self.l2_regularization_coef = 1e-4  
-        self.features = 16     # used by all network layers
-        self.num_messages = 1
-        self.num_heads = 4        # Number of attention heads
-        self.batch_size = 32           # for gradient updates
-        self.checkpoint_frequency = 1      # save new model file every N batches
-        self.batch_update_frequency = 10   # get most recent data every N updates
-        self.gradient_steps_per_batch = 32  # num step per batch
-        self.training_iterations = int(1e06) # training iterations for NN
-
-        assert self.features % self.num_heads == 0, \
-            "dimension mismatch for attention heads"
+# MCTS / rollout
+lru_cache_maxsize = 100000 
+num_rollouts = 1000   # should we limit, if so how much?
+num_simulations = 256  # number of simulations used by MCTS per game step
+root_dirichlet_alpha = 0.0  # 0.3 chess, 0.03 Go, 0.15 shogi
+root_exploration_fraction = 0.25
+pb_c_base = 1   # 19652 in pseudocode
+pb_c_init = 1.25
+min_reward = -1.  # Minimum reward to return for invalid actions
+reward_buffer = 25  # 250 in the R2 paper
 
-        # Buffers
-        self.ranked_reward_alpha = 0.9
-        self.buffer_max_size = 512
+# Network
+l2_regularization_coef = 1e-4  
+features = 16     # used by all network layers
+num_messages = 1
+num_heads = 4        # Number of attention heads
+batch_size = 32           # for gradient updates
+checkpoint_frequency = 1      # save new model file every N batches
+batch_update_frequency = 10   # get most recent data every N updates
+gradient_steps_per_batch = 32  # number of gradient steps per batch
+training_iterations = int(1e06) # training iterations for NN
 
-        # Training
-        self.training_steps = 100
+#assert features % num_heads == 0, \
+#   "dimension mismatch for attention heads"
+
+# Buffers
+ranked_reward_alpha = 0.9
+buffer_max_size = 512
+
+# Training
+training_steps = 100
+
+# DB tables
+sql_basename = "Stable"
+
+# Experiment id
+experiment_id = "0001"
diff --git a/alphazero/game.py b/alphazero/game.py
@@ -4,13 +4,11 @@
 
 from rdkit import Chem
 
+import alphazero.config as config
 from alphazero.node import Node
 from alphazero.policy import policy_model
-from alphazero.config import AlphaZeroConfig
 
 model = policy_model()
-config = AlphaZeroConfig()
-
 
 class Game(nx.DiGraph):
 
@@ -74,11 +72,18 @@ def expand(self, parent: Node):
         """
 
         # Create the children nodes and add them to the graph
-        self.add_edges_from(((parent, child) for child in parent.build_children()))
+        children = list(parent.build_children())
+
+        if not children:
+            parent.terminal = True
+            parent._reward = config.min_reward
+            return parent._reward
+
+        self.add_edges_from(((parent, child) for child in children))
 
         # Run the policy network to get value and prior_logit predictions
-        values, prior_logits = model(parent.policy_inputs_with_children())
-        prior_logits = prior_logits[1:].numpy().flatten()
+        values, prior_logits = model.predict(parent.policy_inputs_with_children())
+        prior_logits = prior_logits[1:].flatten()
 
         # if we're adding noise, perturb the logits
         if self.dirichlet_noise:

diff --git a/alphazero/mod.py b/alphazero/mod.py
@@ -0,0 +1,7 @@
+import alphazero.config as config
+
+# DB table basename; users may override it here to suit their own setup
+config.sql_basename = "StableES"
+
+# Experiment id
+config.experiment_id = "0001"
diff --git a/alphazero/node.py b/alphazero/node.py
@@ -8,13 +8,10 @@
 import rdkit.Chem
 from tensorflow.keras.preprocessing.sequence import pad_sequences
 
-from alphazero.config import AlphaZeroConfig
+import alphazero.config as config
 from alphazero.molecule import build_molecules, build_radicals
 from alphazero.preprocessor import preprocessor
 
-CONFIG = AlphaZeroConfig()
-
-
 class Node(rdkit.Chem.Mol):
 
     def __init__(self, *args, graph: nx.DiGraph=None, terminal: bool=False, **kwargs):
@@ -61,15 +58,15 @@ def build_children(self):
         if self.terminal:
             raise RuntimeError("Attemping to get children of terminal node")
 
-        if self.GetNumAtoms() < CONFIG.max_atoms:
+        if self.GetNumAtoms() < config.max_atoms:
             for mol in build_molecules(self, stereoisomers=False):
                 if self.G.has_node(mol):
                     # Check if the graph already has the current mol
                     yield self.G.nodes[mol]
                 else:
                     yield self.__class__(mol, graph=self.G)
 
-        if self.GetNumAtoms() >= CONFIG.min_atoms:
+        if self.GetNumAtoms() >= config.min_atoms:
             for radical in build_radicals(self):
                 yield self.__class__(radical, graph=self.G, terminal=True)
 
@@ -114,8 +111,8 @@ def value(self):
 
     def ucb_score(self, parent):
 
-        pb_c = np.log((parent.visits + CONFIG.pb_c_base + 1) /
-                      CONFIG.pb_c_base) + CONFIG.pb_c_init
+        pb_c = np.log((parent.visits + config.pb_c_base + 1) /
+                      config.pb_c_base) + config.pb_c_init
 
         pb_c *= np.sqrt(parent.visits) / (self.visits + 1)
 
@@ -217,4 +214,4 @@ def get_reward(self):
         """This should get overwritten by a subclass's reward function.
         (Should this be using ranked rewards?)
         """
-        pass
+        pass
diff --git a/alphazero/policy.py b/alphazero/policy.py
@@ -2,10 +2,8 @@
 from tensorflow.keras import layers
 import nfp
 
+import alphazero.config as config
 from alphazero.preprocessor import preprocessor
-from alphazero.config import AlphaZeroConfig
-
-CONFIG = AlphaZeroConfig()
 
 # two models: 
 # first, a policy model that predicts value, pi_logits from a batch of molecule inputs
@@ -21,24 +19,24 @@ def policy_model():
     input_tensors = [atom_class, bond_class, connectivity]
 
     # Initialize the atom states
-    atom_state = layers.Embedding(preprocessor.atom_classes, CONFIG.features,
+    atom_state = layers.Embedding(preprocessor.atom_classes, config.features,
                                   name='atom_embedding', mask_zero=True)(atom_class)
 
     # Initialize the bond states
-    bond_state = layers.Embedding(preprocessor.bond_classes, CONFIG.features,
+    bond_state = layers.Embedding(preprocessor.bond_classes, config.features,
                                   name='bond_embedding', mask_zero=True)(bond_class)
 
-    units = CONFIG.features//CONFIG.num_heads
-    global_state = nfp.GlobalUpdate(units=units, num_heads=CONFIG.num_heads)(
+    units = config.features//config.num_heads
+    global_state = nfp.GlobalUpdate(units=units, num_heads=config.num_heads)(
         [atom_state, bond_state, connectivity])
 
-    for _ in range(CONFIG.num_messages):  # Do the message passing
+    for _ in range(config.num_messages):  # Do the message passing
         bond_state = nfp.EdgeUpdate()([atom_state, bond_state, connectivity, global_state])
         atom_state = nfp.NodeUpdate()([atom_state, bond_state, connectivity, global_state])
-        global_state = nfp.GlobalUpdate(units=units, num_heads=CONFIG.num_heads)(
+        global_state = nfp.GlobalUpdate(units=units, num_heads=config.num_heads)(
             [atom_state, bond_state, connectivity, global_state])
 
     value = layers.Dense(1, activation='tanh')(global_state)
     pi_logit = layers.Dense(1)(global_state)
 
-    return tf.keras.Model(input_tensors, [value, pi_logit], name='policy_model')
+    return tf.keras.Model(input_tensors, [value, pi_logit], name='policy_model')