From 57cb327d3d34aacb83e51da2e4955d447f726eb6 Mon Sep 17 00:00:00 2001
From: Hamda <93351861+HamdaHmida@users.noreply.github.com>
Date: Fri, 18 Oct 2024 09:18:04 +0100
Subject: [PATCH] Up variable encoding (#414)

- Add a variable space that allows for variable-length state arrays in
  reinforcement learning
- Update the Unified Planning domain to allow PDDL states to be represented
  as variable-length arrays when used in reinforcement learning
---
 examples/up_native_solvers.py |  25 ++++++
 pyproject.toml                |   2 +-
 skdecide/hub/domain/up/up.py  | 161 +++++++++++++++++++++++++++++++---
 skdecide/hub/space/gym/gym.py |  46 ++++++++++
 4 files changed, 222 insertions(+), 12 deletions(-)

diff --git a/examples/up_native_solvers.py b/examples/up_native_solvers.py
index 08d3c20e3a..8e32cf9727 100644
--- a/examples/up_native_solvers.py
+++ b/examples/up_native_solvers.py
@@ -150,3 +150,28 @@
     max_framerate=30,
     outcome_formatter=None,
 )
+
+# Example 3: Solving the same numeric example with the variable state encoding of UPDomain
+
+domain_factory = lambda: UPDomain(
+    problem,
+    state_encoding="variable",
+    action_encoding="int",
+)
+
+print("Initialise Solver ... \n")
+solver = RayRLlib(
+    domain_factory=domain_factory,
+    algo_class=DQN,
+    train_iterations=1,
+)
+
+solver.solve()
+
+rollout(
+    domain_factory(),
+    solver,
+    num_episodes=1,
+    max_steps=100,
+    outcome_formatter=None,
+)
diff --git a/pyproject.toml b/pyproject.toml
index 3482231a5f..9b8c0b6dca 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -68,7 +68,7 @@
 up-enhsp = { version = ">=0.0.25", python = ">=3.10", optional = true }
 up-pyperplan = { version = ">=1.1.0", python = ">=3.10", optional = true }
 cartopy = { version = ">=0.22.0", python = ">=3.9", optional = true }
 pygrib = [
-    { version = ">=2.1.5", platform = "linux", optional = true },
+    { version = "<=2.1.5", platform = "linux", optional = true },
     { version = ">=2.1.5", platform = "darwin", optional = true },
 ]
diff --git a/skdecide/hub/domain/up/up.py b/skdecide/hub/domain/up/up.py
index e8facb5a00..398ad6f19a 100644
--- a/skdecide/hub/domain/up/up.py
+++ b/skdecide/hub/domain/up/up.py
@@ -25,7 +25,7 @@
 from skdecide.core import EmptySpace, ImplicitSpace, Space, Value
 from skdecide.domains import DeterministicPlanningDomain
 from skdecide.hub.space.gym import ListSpace, SetSpace
-from skdecide.hub.space.gym.gym import BoxSpace, DictSpace, DiscreteSpace, GymSpace
+from skdecide.hub.space.gym.gym import BoxSpace, DictSpace, DiscreteSpace, VariableSpace
 from skdecide.utils import logger

@@ -167,6 +167,8 @@ def __init__(
         fluent_domains: dict[FNode, tuple[Union[int, float], Union[int, float]]] = None,
         state_encoding: str = "native",
         action_encoding: str = "native",
+        max_len: int = 2000,
+        max_actions: int = 20,
         **simulator_params,
     ):
         """Initialize UPDomain.

         # Parameters
         problem: The Unified Planning problem (Problem) to wrap.
         fluent_domains: Dictionary of min and max fluent values by fluent represented as a Unified Planning's FNode (must be provided only if get_observation_space() is used)
-        state_encoding: Encoding of the state (observation) which must be one of "native", "dictionary" or "vector" (warning: if action_masking is "vector" then the state automatically becomes a dictionary which separates the action masking vector from the real state as defined here)
+        state_encoding: Encoding of the state (observation) which must be one of "native", "dictionary", "vector" or "variable" (warning: if action_masking is "vector" then the state automatically becomes a dictionary which separates the action masking vector from the real state as defined here)
         action_encoding: Encoding of the action which must be either "native" or "int"
+        max_len: Maximum number of fluents in the case of using variable state encoding
+        max_actions: Maximum number of actions in the case of using variable state encoding
         simulator_params: Optional parameters to pass to the UP sequential simulator
         """
         self._problem = problem
@@ -204,17 +208,21 @@
         self._states_np2up = None
         self._actions_up2np = None
         self._actions_np2up = None
-        if self._state_encoding != "native":
-            if self._state_encoding not in ["dictionary", "vector"]:
-                raise RuntimeError(
-                    "State encoding must be one of 'native', 'dictionary' or 'vector'"
-                )
-            self._init_state_encoding_()
+        self.max_len = max_len  # used only in the variable state encoding
+        self.max_actions = max_actions  # used only in the variable state encoding
+
         if self._action_encoding != "native":
             if self._action_encoding != "int":
                 raise RuntimeError("Action encoding must be either 'native' or 'int'")
             self._init_action_encoding_()
+        if self._state_encoding != "native":
+            if self._state_encoding not in ["dictionary", "vector", "variable"]:
+                raise RuntimeError(
+                    "State encoding must be one of 'native', 'dictionary', 'vector' or 'variable'"
+                )
+            self._init_state_encoding_()
+
     def _init_state_encoding_(self):
         def fnode_lower_bound(fn):
             if fn.fluent().type.lower_bound is not None:
@@ -240,12 +248,56 @@ def fnode_upper_bound(fn):
         self._fnodes_vars_ordering = []
         self._states_up2np = {}
         self._states_np2up = {}
+
+        if self._state_encoding == "variable":
+            self.objects = []
+            self.max_param = 0
+            for i, a in enumerate(self._actions_np2up):
+                if len(a.up_parameters) > self.max_param:
+                    self.max_param = len(a.up_parameters)
+                for p in a.up_parameters:
+                    if p not in self.objects:
+                        self.objects.append(p)
+
+            self.n2id = {
+                i.name: self._problem.fluents.index(i) + 1
+                for i in self._problem.fluents
+            }
+            self.id2n = {
+                self._problem.fluents.index(i) + 1: i.name
+                for i in self._problem.fluents
+            }
+
+            self.variable_mapping = {}
+            self.inv_mapping = {}
+            self.bools = []
+            self.bool_val = {}
+            self.non_bool_val = {}
+
         init_state = self._simulator.get_initial_state()
         static_fluents = self._problem.get_static_fluents()
         self._static_fluent_values = {}
         ci = init_state
         while ci is not None:
             for fn, fv in ci._values.items():
+                if self._state_encoding == "variable":
+                    if fn.fluent().type.is_bool_type():
+                        if self.n2id[fn.fluent().name] not in self.bools:
+                            self.bools.append(self.n2id[fn.fluent().name])
+                        if int(fv.constant_value()) not in self.bool_val.keys():
+                            self.bool_val[int(fv.constant_value())] = fv
+                        else:
+                            self.bool_val[int(fv.constant_value())] = fv
+                    fluent = np.array([-1 for _ in range(self.max_param + 2)])
+                    fluent[0] = self.n2id[fn.fluent().name]
+                    fluent[-1] = int(fv.constant_value())
+                    c = 1
+                    for j in fn._content.args:
+                        fluent[c] = self.objects.index(j)
+                        c += 1
+
+                    self.variable_mapping[(fn, fv)] = (fluent[:-1], fluent[-1])
+                    self.inv_mapping[tuple(fluent[:-1])] = fn
                 if (
                     fn.fluent() not in static_fluents
                     and fn.fluent().name != "total-cost"
@@ -295,11 +347,14 @@ def fnode_upper_bound(fn):
                 elif fn.fluent().type.is_time_type():
                     raise RuntimeError("Time types not handled by UPDomain")
                 elif fn.fluent().name != "total-cost":
-                    self._static_fluent_values[fn] = fv
+                    if self._state_encoding != "variable":
+                        self._static_fluent_values[fn] = fv
             ci = ci._father

     def _convert_to_skup_state_(self, state):
-        if self._state_encoding == "native":
+        if state is None:
+            return None
+        elif self._state_encoding == "native":
             return state
         elif self._state_encoding == "dictionary":
             kstate = frozenset(state.items())
@@ -327,6 +382,25 @@
             self._states_up2np[skup_state] = state
             self._states_np2up[kstate] = skup_state
             return skup_state
+        elif self._state_encoding == "variable":
+            values = {}
+            for fluent in state:
+                if tuple(fluent[:-1]) in self.inv_mapping.keys():
+                    k = self.inv_mapping[tuple(fluent[:-1])]
+                    if fluent[0] in self.bools:
+                        values[k] = self.bool_val[fluent[-1]]
+                    else:
+                        values[k] = Int(int(fluent[-1]))
+                else:
+                    for k in self.variable_mapping.keys():
+                        if np.array_equal(self.variable_mapping[k][0], fluent[:-1]):
+                            if fluent[0] in self.bools:
+                                values[k[0]] = self.bool_val[fluent[-1]]
+                            else:
+                                values[k[0]] = Int(int(fluent[-1]))
+
+            values.update(self._static_fluent_values)
+            return SkUPState(UPState(values))
         else:
             return None
@@ -367,6 +441,33 @@ def _convert_from_skup_state_(self, skup_state: SkUPState):
             self._states_np2up[tuple(state.flatten())] = skup_state
             self._states_up2np[skup_state] = state
             return state
+        elif self._state_encoding == "variable":
+            state = []
+            try:
+                ci = skup_state.up_state
+            except AttributeError:
+                ci = skup_state
+            while ci is not None:
+                for fn, val in ci._values.items():
+                    if (fn, val) in self.variable_mapping.keys():
+                        state.append(
+                            np.append(
+                                self.variable_mapping[(fn, val)][0],
+                                self.variable_mapping[(fn, val)][1],
+                            )
+                        )
+                    else:
+                        fluent = np.array([-1 for _ in range(self.max_param + 2)])
+                        fluent[0] = self.n2id[fn.fluent().name]
+                        fluent[-1] = int(val.constant_value())
+                        c = 1
+                        for j in fn._content.args:
+                            fluent[c] = self.objects.index(j)
+                            c += 1
+                        state.append(fluent)
+                        self.variable_mapping[(fn, val)] = (fluent[:-1], fluent[-1])
+                        self.inv_mapping[tuple(fluent[:-1])] = fn
+                return state
         else:
             return None
@@ -407,6 +508,11 @@ def _get_next_state(
         next_state = SkUPState(
             self._simulator.apply(state.up_state, act.up_action, act.up_parameters)
         )
+        if (self._state_encoding == "variable") and (next_state.up_state is not None):
+            for fn, fv in state.up_state._values.items():
+                if fn not in next_state.up_state._values.keys():
+                    next_state.up_state._values[fn] = fv
+
         if self._total_cost is not None:
             cost = (
                 next_state.up_state.get_value(self._total_cost).constant_value() - cost
@@ -478,7 +584,10 @@ def _get_action_space_(self) -> D.T_agent[Space[D.T_event]]:
             self._init_action_encoding_()
             self._action_space = ListSpace(self._actions_np2up)
         elif self._action_encoding == "int":
-            self._action_space = DiscreteSpace(len(self._actions_np2up))
+            if self._state_encoding != "variable":
+                self._action_space = DiscreteSpace(len(self._actions_np2up))
+            else:
+                self._action_space = DiscreteSpace(self.max_actions)
         else:
             return None
         return self._action_space
@@ -573,6 +682,36 @@ def _get_observation_space_(self) -> D.T_agent[Space[D.T_observation]]:
                     else np.int32
                 ),
             )
+        elif self._state_encoding == "variable":
+            self._observation_space = VariableSpace(
+                BoxSpace(
+                    low=-1
+                    if np.array(
+                        [
+                            self._fnodes_variables_map[fn][0]
+                            for fn in self._fnodes_vars_ordering
+                        ]
+                    ).min()
+                    > -1
+                    else np.array(
+                        [
+                            self._fnodes_variables_map[fn][0]
+                            for fn in self._fnodes_vars_ordering
+                        ]
+                    ).min(),
+                    high=1000000,
+                    shape=(self.max_param + 2,),
+                    dtype=(
+                        np.float32
+                        if any(
+                            fn.fluent().type.is_real_type()
+                            for fn in self._fnodes_vars_ordering
+                        )
+                        else np.int32
+                    ),
+                ),
+                max_len=self.max_len,
+            )
         else:
             return None
         return self._observation_space
diff --git a/skdecide/hub/space/gym/gym.py b/skdecide/hub/space/gym/gym.py
index 6af959df6a..2df488c060 100644
--- a/skdecide/hub/space/gym/gym.py
+++ b/skdecide/hub/space/gym/gym.py
@@ -510,3 +510,49 @@ def to_unwrapped(self, sample_n: Iterable[T]) -> Iterable[dict]:
     def from_unwrapped(self, sample_n: Iterable[dict]) -> Iterable[T]:
         # TODO: convert to simple types (get rid of ndarray created by gym dict space...)?
         return [self._data_class(**sample) for sample in sample_n]
+
+
+class VariableSpace(GymSpace[T]):
+
+    """This class wraps a gymnasium Space (gym.spaces.Space) to allow dynamic length of elements."""
+
+    def __init__(
+        self,
+        space: gym.Space,
+        max_len: int,
+        **kwargs,
+    ):
+        self._gym_space = space
+        self.max_len = max_len
+        self.size = (self.max_len, self._gym_space._shape[0])
+
+    def sample(self):
+        length = self.max_len
+        return list(np.array(self._gym_space.sample()) for _ in range(length))
+
+    def unwrapped(self):
+        return gym.spaces.Box(
+            low=self._gym_space.low.min(),
+            high=self._gym_space.high.max(),
+            shape=self.size,
+        )
+
+    def to_unwrapped(self, sample_n: Iterable[T]) -> Iterable:
+        return [
+            np.pad(
+                np.array(v),
+                ((0, self.max_len - len(v)), (0, 0)),
+                mode="constant",
+                constant_values=0,
+            )
+            for v in sample_n
+        ]
+
+    def from_unwrapped(self, sample_n: Iterable) -> Iterable[T]:
+        return [
+            np.array(ligne)
+            for ligne in [row for row in sample_n if not np.all(row == 0)]
+        ]
+
+    def __repr__(self):
+        return f"VariableSpace({self._gym_space}, max_len={self.max_len})"
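
Usage note (illustrative sketch, not applied by this patch): a minimal example of how the new VariableSpace added in skdecide/hub/space/gym/gym.py is expected to round-trip a variable-length state, assuming a raw gymnasium Box is passed as the per-fluent space. The per-fluent width, max_len and the fluent vectors below are made-up values chosen only for illustration.

    import gymnasium as gym
    import numpy as np

    from skdecide.hub.space.gym.gym import VariableSpace

    # Per the UPDomain "variable" encoding, each fluent becomes a fixed-width
    # vector [fluent_id, object_indices..., value]; width 4 here is arbitrary.
    per_fluent = gym.spaces.Box(low=-1, high=1000000, shape=(4,), dtype=np.int32)
    space = VariableSpace(per_fluent, max_len=6)

    # A state holding only two fluent vectors (hypothetical values).
    state = [np.array([1, 0, -1, 1]), np.array([2, 0, 1, 5])]

    padded = space.to_unwrapped([state])[0]   # (6, 4) array, zero rows appended as padding
    restored = space.from_unwrapped(padded)   # all-zero padding rows stripped again
    assert len(restored) == 2

to_unwrapped pads each sample with all-zero rows up to max_len so that fixed-shape RL backends (e.g. RayRLlib in Example 3 above) can consume the observation, and from_unwrapped drops those all-zero rows to recover the variable-length list of fluent vectors; since fluent ids start at 1, a genuine fluent row is never all zeros.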