Merge pull request #13 from alkemics/search
v0.0.3
alk-lbinet authored May 10, 2020
2 parents 5a5619e + 4243297 commit 164138e
Showing 21 changed files with 321 additions and 159 deletions.
5 changes: 4 additions & 1 deletion pandagg/node/query/_parameter_clause.py
@@ -22,7 +22,7 @@ def __init__(self, value):
def line_repr(self, depth, **kwargs):
return "%s=%s" % (self.KEY, json.dumps(self.body["value"]))

def serialize(self, with_name=True):
def to_dict(self, with_name=True):
return {self.KEY: self.body["value"]}


@@ -131,6 +131,9 @@ def __init__(self, *args, **kwargs):
)
super(ParentParameterClause, self).__init__(_children=children)

def to_dict(self, with_name=True):
return {self.KEY: [n.to_dict() for n in self._children]}


class Filter(ParentParameterClause):
KEY = "filter"
6 changes: 3 additions & 3 deletions pandagg/node/query/abstract.py
@@ -97,7 +97,7 @@ def name(self):
def _identifier_prefix(self):
return "%s_" % self.KEY

def serialize(self, with_name=True):
def to_dict(self, with_name=True):
b = self.body.copy()
if with_name and self._named:
b["_name"] = self.name
@@ -113,9 +113,9 @@ def __str__(self):

def __eq__(self, other):
if isinstance(other, self.__class__):
return other.serialize() == self.serialize()
return other.to_dict() == self.to_dict()
# make sure we still equal to a dict with the same data
return other == self.serialize()
return other == self.to_dict()


class LeafQueryClause(QueryClause):
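Since the `__eq__` override above now compares through `to_dict`, a clause should compare equal both to another instance and to its raw dict form. A minimal sketch, assuming the standard Elasticsearch term body shape (the field name is illustrative):

from pandagg.query import Term

q = Term(field="user", value="john")

# comparing two instances goes through to_dict() on both sides
assert q == Term(field="user", value="john")
# comparing against a plain dict falls back to `other == self.to_dict()`
assert q == q.to_dict()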
22 changes: 15 additions & 7 deletions pandagg/node/query/compound.py
@@ -12,20 +12,22 @@


class CompoundClause(QueryClause):
"""Compound clauses can encapsulate other query clauses.
"""Compound clauses can encapsulate other query clauses::
{
"<query_type>" : {
<query_body>
<children_clauses>
}
}
Note: the children attribute's only purpose is instantiation with the following syntax:
>>> from pandagg.query import Bool, Term
>>> query = Bool(
>>> filter=Term(field='some_path', value=3),
>>> _name='bool_id',
>>> )
{
"<query_type>" : {
<query_body>
<children_clauses>
}
}
"""

DEFAULT_OPERATOR = None
Expand Down Expand Up @@ -67,6 +69,12 @@ def params(cls, parent_only=False):
or not issubclass(cls.get_dsl_class(p, "_param_"), SimpleParameter)
}

def to_dict(self, with_name=True):
d = {}
for c in self._children:
d.update(c.to_dict())
return {self.KEY: d}


class Bool(CompoundClause):
DEFAULT_OPERATOR = Must
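The added CompoundClause.to_dict merges the dict of each parameter child under the clause key, so the Bool example from the docstring above serializes into a regular Elasticsearch bool body. A hedged sketch (the exact output shape is assumed, not taken from this diff):

from pandagg.query import Bool, Term

query = Bool(
    filter=Term(field="some_path", value=3),
    _name="bool_id",
)

# each parameter child (here the "filter" clause) contributes its own
# {key: [children]} dict, merged into the compound clause body, e.g. roughly
# {"bool": {"filter": [{"term": {"some_path": {"value": 3}}}]}}
print(query.to_dict())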
2 changes: 1 addition & 1 deletion pandagg/node/query/term_level.py
@@ -29,7 +29,7 @@ def __init__(self, values, _name=None):
self.values = values
super(Ids, self).__init__(_name=_name, values=values)

def serialize(self, with_name=True):
def to_dict(self, with_name=True):
b = {"values": self.values}
if with_name and self._named:
b["_name"] = self.name
3 changes: 2 additions & 1 deletion pandagg/search.py
@@ -201,7 +201,8 @@ def __getitem__(self, n):
return s

def size(self, size):
"""Equivalent to::
"""
Equivalent to::
s = Search().params(size=size)
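According to the reworked docstring, size() is only a shortcut for params(size=...). A minimal usage sketch (assuming Search is importable from pandagg.search, as the file path suggests):

from pandagg.search import Search

# both declarations are equivalent per the docstring above
s1 = Search().size(10)
s2 = Search().params(size=10)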
134 changes: 98 additions & 36 deletions pandagg/tree/aggs.py
@@ -24,16 +24,57 @@

@python_2_unicode_compatible
class Aggs(Tree):
"""Tree combination of aggregation nodes.
r"""
Combination of aggregation clauses. This class provides convenient methods to build an aggregation (see
:func:`~pandagg.tree.aggs.Aggs.aggs` and :func:`~pandagg.tree.aggs.Aggs.groupby`), and is also used
to parse aggregation responses into handy formats.
Mapping declaration is optional, but doing so validates aggregation validity.
Mapping declaration is optional, but doing so validates aggregation validity and automatically handles missing
nested clauses.
All of the following syntaxes are identical:
From a dict:
>>> Aggs({"per_user":{"terms":{"field":"user"}}})
Using shortcut declaration: first argument is the aggregation type, other arguments are aggregation body parameters:
>>> Aggs('terms', name='per_user', field='user')
Using DSL class:
>>> from pandagg.aggs import Terms
>>> Aggs(Terms('per_user', field='user'))
Dict and DSL class syntaxes allow providing multi-clause aggregations:
>>> Aggs({"per_user":{"terms":{"field":"user"}, "aggs": {"avg_age": {"avg": {"field": "age"}}}}})
Which is similar to:
>>> from pandagg.aggs import Terms, Avg
>>> Aggs(Terms('per_user', field='user', aggs=Avg('avg_age', field='age')))
:Keyword Arguments:
* *mapping* (``dict`` or ``pandagg.tree.mapping.Mapping``) --
Mapping of requested index(es). Providing it validates the aggregation against the mapping, and adds required
nested clauses if missing.
* *nested_autocorrect* (``bool``) --
If True, missing nested clauses are automatically added to the aggregation; otherwise an error is
raised.
* remaining kwargs:
Used as body in aggregation
"""

node_class = AggNode
_crafted_root_name = "root"

def __init__(self, *args, **kwargs):
self.mapping = Mapping(kwargs.pop("mapping", None))
self.nested_autocorrect = kwargs.pop("nested_autocorrect", False)
super(Aggs, self).__init__()
if args or kwargs:
self._fill(*args, **kwargs)
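The equivalent declaration syntaxes listed in the docstring above, gathered into one sketch (aggregation names and fields are the docstring's own):

from pandagg.aggs import Aggs, Terms, Avg

# single clause: dict syntax, shortcut syntax, DSL class
a1 = Aggs({"per_user": {"terms": {"field": "user"}}})
a2 = Aggs("terms", name="per_user", field="user")
a3 = Aggs(Terms("per_user", field="user"))

# multiple clauses: dict syntax vs DSL classes
a4 = Aggs({"per_user": {"terms": {"field": "user"},
                        "aggs": {"avg_age": {"avg": {"field": "age"}}}}})
a5 = Aggs(Terms("per_user", field="user", aggs=Avg("avg_age", field="age")))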
@@ -43,15 +84,6 @@ def __nonzero__(self):

__bool__ = __nonzero__

@classmethod
def deserialize(cls, *args, **kwargs):
mapping = kwargs.pop("mapping", None)
if len(args) == 1 and isinstance(args[0], Aggs):
return args[0]

new = cls(mapping=mapping)
return new._fill(*args, **kwargs)

def _fill(self, *args, **kwargs):
if args:
node_hierarchy = self.node_class._type_deserializer(*args, **kwargs)
@@ -63,7 +95,10 @@ def _fill(self, *args, **kwargs):
return self

def _clone_init(self, deep=False):
return Aggs(mapping=self.mapping.clone(deep=deep))
return Aggs(
mapping=self.mapping.clone(deep=deep),
nested_autocorrect=self.nested_autocorrect,
)

def _is_eligible_grouping_node(self, nid):
"""Return whether node can be used as grouping node."""
@@ -77,7 +112,8 @@ def _is_eligible_grouping_node(self, nid):

@property
def deepest_linear_bucket_agg(self):
"""Return deepest bucket aggregation node (pandagg.nodes.abstract.BucketAggNode) of that aggregation that
"""
Return deepest bucket aggregation node (pandagg.nodes.abstract.BucketAggNode) of that aggregation that
neither has siblings, nor has an ancestor with siblings.
"""
if not self.root or not self._is_eligible_grouping_node(self.root):
@@ -101,7 +137,8 @@ def deepest_linear_bucket_agg(self):
return last_bucket_agg_name

def _validate_aggs_parent_id(self, pid):
"""If pid is not None, ensure that pid belongs to tree, and that it refers to a bucket aggregation.
"""
If pid is not None, ensure that pid belongs to tree, and that it refers to a bucket aggregation.
Else, if not provided, return deepest bucket aggregation if there is no ambiguity (linear aggregations).
KO: non-ambiguous::
@@ -131,7 +168,8 @@ def _validate_aggs_parent_id(self, pid):
return leaves[0].identifier

def groupby(self, *args, **kwargs):
r"""Arrange passed aggregations in vertical/nested manner, above or below another agg clause.
r"""
Arrange passed aggregations in vertical/nested manner, above or below another agg clause.
Given the initial aggregation::
Expand All @@ -140,12 +178,12 @@ def groupby(self, *args, **kwargs):
If `insert_below` = 'A'::
A──> by──> B
A──> new──> B
└──> C
If `insert_above` = 'B'::
A──> by──> B
A──> new──> B
└──> C
`by` argument accepts single occurrence or sequence of following formats:
Expand All @@ -163,11 +201,10 @@ def groupby(self, *args, **kwargs):
└──> C
Accepted declarations for single aggregation:
Accepted all Aggs.__init__ syntaxes
Official DSL like:
>>> Aggs().groupby('terms', name='per_user_id', field='user_id')
>>> Aggs()\
>>> .groupby('terms', name='per_user_id', field='user_id')
{"terms_on_my_field":{"terms":{"field":"some_field"}}}
Passing a dict:
@@ -221,19 +258,19 @@
raise ValueError(
"Kwargs not allowed when passing multiple aggregations in args."
)
inserted_aggs = [self.deserialize(arg) for arg in args]
inserted_aggs = [Aggs(arg) for arg in args]
# groupby([{}, {}])
elif len(args) == 1 and isinstance(args[0], (list, tuple)):
if kwargs:
raise ValueError(
"Kwargs not allowed when passing multiple aggregations in args."
)
inserted_aggs = [self.deserialize(arg) for arg in args[0]]
inserted_aggs = [Aggs(arg) for arg in args[0]]
# groupby({})
# groupby(Terms())
# groupby('terms', name='per_tag', field='tag')
else:
inserted_aggs = [self.deserialize(*args, **kwargs)]
inserted_aggs = [Aggs(*args, **kwargs)]

if insert_above is not None:
parent = new_agg.parent(insert_above, id_only=False)
@@ -265,35 +302,52 @@ def groupby(self, *args, **kwargs):
return new_agg

def aggs(self, *args, **kwargs):
"""Arrange passed aggregations in `arg` arguments "horizontally".
r"""
Arrange passed aggregations "horizontally".
Those will be placed under the `insert_below` aggregation clause id if provided, else under the deepest linear
bucket aggregation if there is no ambiguity:
Given the initial aggregation::
A──> B
└──> C
If passing multiple aggregations with `insert_below` = 'A'::
A──> B
└──> C
└──> new1
└──> new2
Note: those will be placed under the `insert_below` aggregation clause id if provided, else under the deepest
linear bucket aggregation if there is no ambiguity:
OK::
A──> B ─> C ─> arg
A──> B ─> C ─> new
KO::
A──> B
└──> C
`arg` argument accepts single occurrence or sequence of following formats:
`args` accepts single occurrence or sequence of following formats:
* string (for terms agg concise declaration)
* regular Elasticsearch dict syntax
* AggNode instance (for instance Terms, Filters etc)
:param arg: aggregation(s) clauses to insert "horizontally"
:param insert_below: parent aggregation id under which these aggregations should be placed
:param kwargs: agg body arguments when using "string" syntax for terms aggregation
:Keyword Arguments:
* *insert_below* (``string``) --
Parent aggregation name under which these aggregations should be placed
* remaining kwargs:
Used as body in aggregation
:rtype: pandagg.aggs.Aggs
"""
insert_below = self._validate_aggs_parent_id(kwargs.pop("insert_below", None))
new_agg = self.clone(with_tree=True)
deserialized = self.deserialize(*args, mapping=self.mapping, **kwargs)
deserialized = Aggs(*args, **kwargs)
deserialized_root = deserialized.get(deserialized.root)
if isinstance(deserialized_root, ShadowRoot):
new_agg.merge(deserialized, nid=insert_below)
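Taken together, the groupby and aggs methods from the docstrings above chain as follows; a sketch reusing the docstring's own shortcut syntax (the resulting request body is not asserted here):

from pandagg.aggs import Aggs, Avg

a = (
    Aggs()
    # vertical arrangement: groupby nests clauses one below another
    .groupby("terms", name="per_user_id", field="user_id")
    # horizontal arrangement: placed under the deepest linear bucket (per_user_id)
    .aggs(Avg("avg_age", field="age"))
)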
@@ -335,7 +389,7 @@ def applied_nested_path_at_node(self, nid):
def _insert_node_below(self, node, parent_id, with_children=True):
"""If mapping is provided, nested aggregations are automatically applied.
"""
if isinstance(node, ShadowRoot):
if isinstance(node, ShadowRoot) and parent_id is not None:
for child in node._children or []:
super(Aggs, self)._insert_node_below(
child, parent_id=parent_id, with_children=with_children
@@ -346,7 +400,6 @@ def _insert_node_below(self, node, parent_id, with_children=True):
isinstance(node, Nested)
or isinstance(node, ReverseNested)
or not self.mapping
or parent_id is None
or not hasattr(node, "field")
):
return super(Aggs, self)._insert_node_below(
@@ -357,11 +410,20 @@

# from deepest to highest
required_nested_level = self.mapping.nested_at_field(node.field)
current_nested_level = self.applied_nested_path_at_node(parent_id)

if self.is_empty():
current_nested_level = None
else:
current_nested_level = self.applied_nested_path_at_node(parent_id)
if current_nested_level == required_nested_level:
return super(Aggs, self)._insert_node_below(
node, parent_id, with_children=with_children
)
if not self.nested_autocorrect:
raise ValueError(
"Invalid %s agg on %s field. Invalid nested: expected %s, current %s."
% (node.KEY, node.field, required_nested_level, current_nested_level)
)
if current_nested_level and (
required_nested_level or "" in current_nested_level
):
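A sketch of the nested handling shown above: with a mapping provided, a missing nested clause is either auto-inserted (nested_autocorrect=True) or rejected with the ValueError raised in _insert_node_below. The mapping layout below is an assumption (a plain Elasticsearch mapping dict with one nested field), not taken from this diff:

from pandagg.aggs import Aggs, Avg

# hypothetical mapping with a nested "comments" object
mapping = {
    "properties": {
        "user": {"type": "keyword"},
        "comments": {
            "type": "nested",
            "properties": {"stars": {"type": "integer"}},
        },
    }
}

# with autocorrection, aggregating on comments.stars should get the missing
# "nested" clause inserted automatically
a = Aggs(mapping=mapping, nested_autocorrect=True).aggs(
    Avg("avg_stars", field="comments.stars")
)

# with the default nested_autocorrect=False, the same call is expected to raise:
# ValueError: Invalid avg agg on comments.stars field. Invalid nested: ...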
4 changes: 2 additions & 2 deletions pandagg/tree/mapping.py
@@ -44,7 +44,7 @@ def __nonzero__(self):

__bool__ = __nonzero__

def serialize(self, from_=None, depth=None):
def to_dict(self, from_=None, depth=None):
if self.root is None:
return None
from_ = self.root if from_ is None else from_
@@ -54,7 +54,7 @@ def serialize(self, from_=None, depth=None):
if depth is not None:
depth -= 1
for child_node in self.children(node.identifier, id_only=False):
children_queries[child_node.name] = self.serialize(
children_queries[child_node.name] = self.to_dict(
from_=child_node.identifier, depth=depth
)
serialized_node = node.body
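The renamed Mapping.to_dict recursively rebuilds a mapping dict, keyed by field name, from the node tree. A minimal round-trip sketch, assuming Mapping accepts a raw Elasticsearch mapping dict as the Aggs docstring above suggests:

from pandagg.tree.mapping import Mapping

# illustrative mapping; the exact accepted layout is an assumption
m = Mapping({"properties": {"user": {"type": "keyword"}}})

# to_dict (formerly serialize) walks children recursively; the optional depth
# argument limits how many levels below `from_` are reconstructed
print(m.to_dict())
print(m.to_dict(depth=1))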