diff --git a/docs/source/advanced-usage.rst b/docs/source/advanced-usage.rst deleted file mode 100644 index 16b8ed43..00000000 --- a/docs/source/advanced-usage.rst +++ /dev/null @@ -1,11 +0,0 @@ - -############## -Advanced usage -############## - -.. note:: - - This is a work in progress. Some sections still need to be furnished. - - * node and tree deserialization order - * compound query insertion diff --git a/docs/source/index.rst b/docs/source/index.rst index c5ef9c3e..af78a3b9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -13,7 +13,6 @@ pandagg introduction user-guide - advanced-usage Tutorial dataset API reference Contributing @@ -43,8 +42,7 @@ Alternatively, you can grab the latest source code from `GitHub `_. diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst index 3777c1e2..fc48a3ff 100644 --- a/docs/source/introduction.rst +++ b/docs/source/introduction.rst @@ -2,10 +2,6 @@ Principles ########## -.. note:: - - This is a work in progress. Some sections still need to be furnished. 
- This library focuses on two principles: diff --git a/docs/source/user-guide.aggs.rst b/docs/source/user-guide.aggs.rst new file mode 100644 index 00000000..7a56e17c --- /dev/null +++ b/docs/source/user-guide.aggs.rst @@ -0,0 +1,122 @@ +*********** +Aggregation +*********** + +The :class:`~pandagg.tree.aggs.aggs.Aggs` class provides : + +- multiple syntaxes to declare and update an aggregation +- aggregation clause validation +- ability to insert clauses at specific locations (and not just below last manipulated clause) + + +Declaration +=========== + +From native "dict" query +------------------------ + +Given the following aggregation: + + >>> expected_aggs = { + >>> "decade": { + >>> "histogram": {"field": "year", "interval": 10}, + >>> "aggs": { + >>> "genres": { + >>> "terms": {"field": "genres", "size": 3}, + >>> "aggs": { + >>> "max_nb_roles": { + >>> "max": {"field": "nb_roles"} + >>> }, + >>> "avg_rank": { + >>> "avg": {"field": "rank"} + >>> } + >>> } + >>> } + >>> } + >>> } + >>> } + +To declare :class:`~pandagg.tree.aggs.aggs.Aggs`, simply pass "dict" query as argument: + + >>> from pandagg.aggs import Aggs + >>> a = Aggs(expected_aggs) + +A visual representation of the query is available with :func:`~pandagg.tree.aggs.aggs.Aggs.show`: + + >>> a.show() + + decade + └── genres + ├── max_nb_roles + └── avg_rank + + +Call :func:`~pandagg.tree.aggs.aggs.Aggs.to_dict` to convert it to native dict: + + >>> a.to_dict() == expected_aggs + True + +With DSL classes +---------------- + +Pandagg provides a DSL to declare this query in a quite similar fashion: + + >>> from pandagg.aggs import Histogram, Terms, Max, Avg + >>> + >>> a = Histogram("decade", field='year', interval=10, aggs=[ + >>> Terms("genres", field="genres", size=3, aggs=[ + >>> Max("max_nb_roles", field="nb_roles"), + >>> Avg("avg_rank", field="rank") + >>> ]), + >>> ]) + +All these classes inherit from :class:`~pandagg.tree.aggs.aggs.Aggs` and thus provide the same interface. 
+ + >>> from pandagg.aggs import Aggs + >>> isinstance(a, Aggs) + True + +With flattened syntax +--------------------- + +In the flattened syntax, the first argument is the aggregation name, the second argument is the aggregation type, the +following keyword arguments define the aggregation body: + + >>> from pandagg.aggs import Aggs + >>> a = Aggs('genres', 'terms', field='genres', size=3) + >>> a.to_dict() + {'genres': {'terms': {'field': 'genres', 'size': 3}}} + + +Aggregations enrichment +======================= + +Aggregations can be enriched using two methods: + +- :func:`~pandagg.tree.aggs.aggs.Aggs.aggs` +- :func:`~pandagg.tree.aggs.aggs.Aggs.groupby` + +Both methods return a new :class:`~pandagg.tree.aggs.aggs.Aggs` instance, and keep unchanged the initial Aggregation. + +For instance: + + >>> from pandagg.aggs import Aggs + >>> initial_a = Aggs() + >>> enriched_a = initial_a.aggs('genres_agg', 'terms', field='genres') + + >>> initial_a.to_dict() + None + + >>> enriched_a.to_dict() + {'genres_agg': {'terms': {'field': 'genres'}}} + +.. note:: + + Calling :func:`~pandagg.tree.aggs.aggs.Aggs.to_dict` on an empty Aggregation returns `None` + + >>> from pandagg.aggs import Aggs + >>> Aggs().to_dict() + None + + +TODO diff --git a/docs/source/user-guide.interactive.rst b/docs/source/user-guide.interactive.rst new file mode 100644 index 00000000..cd4ae928 --- /dev/null +++ b/docs/source/user-guide.interactive.rst @@ -0,0 +1,246 @@ + +******************** +Interactive features +******************** + +Features described in this module are primarily designed for interactive usage, for instance in an +`ipython shell <https://ipython.org/>`_, since one of the key features is the intuitive usage provided by auto-completion. 
+ +Cluster indices discovery +========================= + +:func:`~pandagg.discovery.discover` function list all indices on a cluster matching a provided pattern: + + >>> from elasticsearch import Elasticsearch + >>> from pandagg.discovery import discover + >>> client = Elasticsearch(hosts=['xxx']) + >>> indices = discover(client, index='mov*') + >>> indices + ['movies', 'movies_fake'] + +Each of the indices is accessible via autocompletion: + + >>> indices.movies + + + +An :class:`~pandagg.discovery.Index` exposes: settings, mapping (interactive), aliases and name: + + >>> movies = indices.movies + >>> movies.settings + {'index': {'creation_date': '1591824202943', + 'number_of_shards': '1', + 'number_of_replicas': '1', + 'uuid': 'v6Amj9x1Sk-trBShI-188A', + 'version': {'created': '7070199'}, + 'provided_name': 'movies'}} + + >>> movies.mapping + + _ + ├── directors [Nested] + │ ├── director_id Keyword + │ ├── first_name Text + │ │ └── raw ~ Keyword + │ ├── full_name Text + │ │ └── raw ~ Keyword + │ ├── genres Keyword + │ └── last_name Text + │ └── raw ~ Keyword + ├── genres Keyword + ├── movie_id Keyword + ├── name Text + │ └── raw ~ Keyword + ├── nb_directors Integer + ├── nb_roles Integer + ├── rank Float + ├── roles [Nested] + │ ├── actor_id Keyword + │ ├── first_name Text + │ │ └── raw ~ Keyword + │ ├── full_name Text + │ │ └── raw ~ Keyword + │ ├── gender Keyword + │ ├── last_name Text + │ │ └── raw ~ Keyword + │ └── role Keyword + └── year Integer + + +Navigable mapping +================= + +The :class:`~pandagg.discovery.Index` **mapping** attribute returns a :class:`~pandagg.interactive.mapping.IMapping` +instance that provides navigation features with autocompletion to quickly discover a large +mapping: + + + >>> movies.roles + + roles [Nested] + ├── actor_id Integer + ├── first_name Text + │ └── raw ~ Keyword + ├── gender Keyword + ├── last_name Text + │ └── raw ~ Keyword + └── role Keyword + >>> movies.roles.first_name + + first_name Text + └── raw ~ 
Keyword + + +.. note:: + + a navigable mapping can be obtained directly using :class:`~pandagg.interactive.mapping.IMapping` class without + using discovery module: + + >>> from pandagg.mapping import IMapping + >>> from examples.imdb.load import mapping + >>> m = IMapping(mapping) + >>> m.roles.first_name + + first_name Text + └── raw ~ Keyword + + +To get the complete field definition, just call it: + + >>> movies.roles.first_name() + of type text: + { + "type": "text", + "fields": { + "raw": { + "type": "keyword" + } + } + } + +A **IMapping** instance can be bound to an Elasticsearch client to get quick access to aggregations computation on mapping fields. + +Suppose you have the following client: + + >>> from elasticsearch import Elasticsearch + >>> client = Elasticsearch(hosts=['localhost:9200']) + +Client can be bound at instantiation: + + >>> movies = IMapping(mapping, client=client, index_name='movies') + +Doing so will generate a **a** attribute on mapping fields, this attribute will list all available aggregation for that +field type (with autocompletion): + + >>> movies.roles.gender.a.terms() + [('M', {'key': 'M', 'doc_count': 2296792}), + ('F', {'key': 'F', 'doc_count': 1135174})] + + +.. note:: + + Nested clauses will be automatically taken into account. + + +Navigable aggregation response +============================== + +When executing a :class:`~pandagg.search.Search` request with aggregations, resulting aggregations can be parsed in +multiple formats as described :doc:`user-guide.response`. 
+ +Suppose we execute the following search request: + + >>> from elasticsearch import Elasticsearch + >>> from pandagg.search import Search + >>> + >>> client = Elasticsearch(hosts=['localhost:9200']) + >>> response = Search(using=client, index='movies')\ + >>> .size(0)\ + >>> .groupby('decade', 'histogram', interval=10, field='year')\ + >>> .groupby('genres', size=3)\ + >>> .aggs('avg_rank', 'avg', field='rank')\ + >>> .aggs('avg_nb_roles', 'avg', field='nb_roles')\ + >>> .filter('range', year={"gte": 1990})\ + >>> .execute() + +One of the available serialization methods for aggregations, :func:`~pandagg.response.Aggregations.to_interactive_tree`, +generates an interactive tree of class :class:`~pandagg.interactive.response.IResponse`: + + >>> tree = response.aggregations.to_interactive_tree() + >>> tree + + root + ├── decade=1990 79495 + │ ├── genres=Documentary 8393 + │ │ ├── avg_nb_roles 3.7789824854045038 + │ │ └── avg_rank 6.517093241977517 + │ ├── genres=Drama 12232 + │ │ ├── avg_nb_roles 18.518067364290385 + │ │ └── avg_rank 5.981429367965072 + │ └── genres=Short 12197 + │ ├── avg_nb_roles 3.023284414200213 + │ └── avg_rank 6.311325829450123 + └── decade=2000 57649 + ├── genres=Documentary 8639 + │ ├── avg_nb_roles 5.581433036231045 + │ └── avg_rank 6.980897812811443 + ├── genres=Drama 11500 + │ ├── avg_nb_roles 14.385391304347825 + │ └── avg_rank 6.269675415719865 + └── genres=Short 13451 + ├── avg_nb_roles 4.053081555274701 + └── avg_rank 6.83625304327684 + +This tree provides auto-completion on each node to select a subpart of the tree: + + >>> tree.decade_1990 + + decade=1990 79495 + ├── genres=Documentary 8393 + │ ├── avg_nb_roles 3.7789824854045038 + │ └── avg_rank 6.517093241977517 + ├── genres=Drama 12232 + │ ├── avg_nb_roles 18.518067364290385 + │ └── avg_rank 5.981429367965072 + └── genres=Short 12197 + ├── avg_nb_roles 3.023284414200213 + └── avg_rank 6.311325829450123 + + >>> tree.genres_Drama + + genres=Drama 12232 + ├── avg_nb_roles 
18.518067364290385 + └── avg_rank 5.981429367965072 + +:func:`~pandagg.interactive.response.IResponse.get_bucket_filter` returns the query that filters documents belonging +to the given bucket: + + >>> tree.decade_1990.genres_Drama.get_bucket_filter() + {'bool': { + 'must': [ + {'term': {'genres': {'value': 'Drama'}}}, + {'range': {'year': {'gte': 1990.0, 'lt': 2000.0}}} + ], + 'filter': [{'range': {'year': {'gte': 1990}}}] + } + } + +:func:`~pandagg.interactive.response.IResponse.list_documents` method actually executes this query to list the documents +belonging to the bucket: + + >>> tree.decade_1990.genres_Drama.list_documents(size=2, _source={"include": ['name']}) + {'took': 10, + 'timed_out': False, + '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, + 'hits': {'total': {'value': 10000, 'relation': 'gte'}, + 'max_score': 2.4539857, + 'hits': [{'_index': 'movies', + '_type': '_doc', + '_id': '706', + '_score': 2.4539857, + '_source': {'name': '100 meter fri'}}, + {'_index': 'movies', + '_type': '_doc', + '_id': '714', + '_score': 2.4539857, + '_source': {'name': '100 Proof'}}]}} diff --git a/docs/source/user-guide.query.rst b/docs/source/user-guide.query.rst new file mode 100644 index 00000000..820b6808 --- /dev/null +++ b/docs/source/user-guide.query.rst @@ -0,0 +1,271 @@ +***** +Query +***** + +The :class:`~pandagg.tree.query.abstract.Query` class provides : + +- multiple syntaxes to declare and update a query +- query validation (with nested clauses validation) +- ability to insert clauses at specific points +- tree-like visual representation + +Declaration +=========== + +From native "dict" query +------------------------ + +Given the following query: + + >>> expected_query = {'bool': {'must': [ + >>> {'terms': {'genres': ['Action', 'Thriller']}}, + >>> {'range': {'rank': {'gte': 7}}}, + >>> {'nested': { + >>> 'path': 'roles', + >>> 'query': {'bool': {'must': [ + >>> {'term': {'roles.gender': {'value': 'F'}}}, + >>> {'term': {'roles.role': 
{'value': 'Reporter'}}}]} + >>> } + >>> }} + >>> ]}} + +To instantiate :class:`~pandagg.tree.query.abstract.Query`, simply pass "dict" query as argument: + + >>> from pandagg.query import Query + >>> q = Query(expected_query) + +A visual representation of the query is available with :func:`~pandagg.tree.query.abstract.Query.show`: + + >>> q.show() + + bool + └── must + ├── nested, path="roles" + │ └── query + │ └── bool + │ └── must + │ ├── term, field=roles.gender, value="F" + │ └── term, field=roles.role, value="Reporter" + ├── range, field=rank, gte=7 + └── terms, genres=["Action", "Thriller"] + + +Call :func:`~pandagg.tree.query.abstract.Query.to_dict` to convert it to native dict: + + >>> q.to_dict() + {'bool': { + 'must': [ + {'range': {'rank': {'gte': 7}}}, + {'terms': {'genres': ['Action', 'Thriller']}}, + {'bool': {'must': [ + {'term': {'roles.role': {'value': 'Reporter'}}}, + {'term': {'roles.gender': {'value': 'F'}}}]}}}} + ]} + ] + }} + + >>> from pandagg.utils import equal_queries + >>> equal_queries(q.to_dict(), expected_query) + True + + +.. note:: + `equal_queries` function won't consider order of clauses in must/should parameters since it actually doesn't matter + in Elasticsearch execution, ie + + >>> equal_queries({'must': [A, B]}, {'must': [B, A]}) + True + +With DSL classes +---------------- + +Pandagg provides a DSL to declare this query in a quite similar fashion: + + >>> from pandagg.query import Nested, Bool, Range, Term, Terms + + >>> q = Bool(must=[ + >>> Terms(genres=['Action', 'Thriller']), + >>> Range(rank={"gte": 7}), + >>> Nested( + >>> path='roles', + >>> query=Bool(must=[ + >>> Term(roles__gender='F'), + >>> Term(roles__role='Reporter') + >>> ]) + >>> ) + >>> ]) + +All these classes inherit from :class:`~pandagg.tree.query.abstract.Query` and thus provide the same interface. 
+ + >>> from pandagg.query import Query + >>> isinstance(q, Query) + True + +With flattened syntax +--------------------- + +In the flattened syntax, the query clause type is used as first argument: + + >>> from pandagg.query import Query + >>> q = Query('terms', genres=['Action', 'Thriller']) + + +Query enrichment +================ + +All methods described below return a new :class:`~pandagg.tree.query.abstract.Query` instance, and keep unchanged the +initial query. + +For instance: + + >>> from pandagg.query import Query + >>> initial_q = Query() + >>> enriched_q = initial_q.query('terms', genres=['Comedy', 'Short']) + + >>> initial_q.to_dict() + None + + >>> enriched_q.to_dict() + {'terms': {'genres': ['Comedy', 'Short']}} + +.. note:: + + Calling :func:`~pandagg.tree.query.abstract.Query.to_dict` on an empty Query returns `None` + + >>> from pandagg.query import Query + >>> Query().to_dict() + None + + +query() method +-------------- + +The base method to enrich a :class:`~pandagg.tree.query.abstract.Query` is :func:`~pandagg.tree.query.abstract.Query.query`. 
+ + +Considering this query: + + >>> from pandagg.query import Query + >>> q = Query() + +:func:`~pandagg.tree.query.abstract.Query.query` accepts the following syntaxes: + +from dictionary:: + + + >>> q.query({"terms": {"genres": ['Comedy', 'Short']}}) + +flattened syntax:: + + + >>> q.query("terms", genres=['Comedy', 'Short']) + + +from Query instance (this includes DSL classes):: + + >>> from pandagg.query import Terms + >>> q.query(Terms(genres=['Action', 'Thriller'])) + + +Compound clauses specific methods +--------------------------------- + +:class:`~pandagg.tree.query.abstract.Query` instance also exposes the following methods for specific compound queries: + +(TODO: detail allowed syntaxes) + +Specific to bool queries: + +- :func:`~pandagg.tree.query.abstract.Query.bool` +- :func:`~pandagg.tree.query.abstract.Query.filter` +- :func:`~pandagg.tree.query.abstract.Query.must` +- :func:`~pandagg.tree.query.abstract.Query.must_not` +- :func:`~pandagg.tree.query.abstract.Query.should` + +Specific to other compound queries: + +- :func:`~pandagg.tree.query.abstract.Query.nested` +- :func:`~pandagg.tree.query.abstract.Query.constant_score` +- :func:`~pandagg.tree.query.abstract.Query.dis_max` +- :func:`~pandagg.tree.query.abstract.Query.function_score` +- :func:`~pandagg.tree.query.abstract.Query.has_child` +- :func:`~pandagg.tree.query.abstract.Query.has_parent` +- :func:`~pandagg.tree.query.abstract.Query.parent_id` +- :func:`~pandagg.tree.query.abstract.Query.pinned_query` +- :func:`~pandagg.tree.query.abstract.Query.script_score` +- :func:`~pandagg.tree.query.abstract.Query.boost` + + +Inserted clause location +------------------------ + +On all insertion methods detailed above, by default, the inserted clause is placed at the top level of your query, and +generates a bool clause if necessary. 
+ +Considering the following query: + + >>> from pandagg.query import Query + >>> q = Query('terms', genres=['Action', 'Thriller']) + >>> q.show() + + terms, genres=["Action", "Thriller"] + +A bool query will be created: + + >>> q = q.query('range', rank={"gte": 7}) + >>> q.show() + + bool + └── must + ├── range, field=rank, gte=7 + └── terms, genres=["Action", "Thriller"] + +And reused if necessary: + + >>> q = q.must_not('range', year={"lte": 1970}) + >>> q.show() + + bool + ├── must + │ ├── range, field=rank, gte=7 + │ └── terms, genres=["Action", "Thriller"] + └── must_not + └── range, field=year, lte=1970 + +Specifying a specific location requires to `name queries `_ : + + >>> from pandagg.query import Nested + + >>> q = q.nested(path='roles', _name='nested_roles', query=Term('roles.gender', value='F')) + >>> q.show() + + bool + ├── must + │ ├── nested, _name=nested_roles, path="roles" + │ │ └── query + │ │ └── term, field=roles.gender, value="F" + │ ├── range, field=rank, gte=7 + │ └── terms, genres=["Action", "Thriller"] + └── must_not + └── range, field=year, lte=1970 + +Doing so allows to insert clauses above/below given clause using `parent`/`child` parameters: + + >>> q = q.query('term', roles__role='Reporter', parent='nested_roles') + >>> q.show() + + bool + ├── must + │ ├── nested, _name=nested_roles, path="roles" + │ │ └── query + │ │ └── bool + │ │ └── must + │ │ ├── term, field=roles.role, value="Reporter" + │ │ └── term, field=roles.gender, value="F" + │ ├── range, field=rank, gte=7 + │ └── terms, genres=["Action", "Thriller"] + └── must_not + └── range, field=year, lte=1970 + + +TODO: explain `parent_param`, `child_param`, `mode` merging strategies on same named clause etc.. 
diff --git a/docs/source/user-guide.response.rst b/docs/source/user-guide.response.rst new file mode 100644 index 00000000..88440d3c --- /dev/null +++ b/docs/source/user-guide.response.rst @@ -0,0 +1,252 @@ +******** +Response +******** + +When executing a search request via :func:`~pandagg.search.Search.execute` method of :class:`~pandagg.search.Search`, +a :class:`~pandagg.response.Response` instance is returned. + + >>> from elasticsearch import Elasticsearch + >>> from pandagg.search import Search + >>> + >>> client = Elasticsearch(hosts=['localhost:9200']) + >>> response = Search(using=client, index='movies')\ + >>> .size(2)\ + >>> .filter('term', 'genres', 'Documentary')\ + >>> .aggs('avg_rank', 'avg', field='rank')\ + >>> .execute() + + >>> response + took 9ms, success: True, total result >=10000, contains 2 hits + + >>> response.__class__ + pandagg.response.Response + + +Elasticsearch raw dict response is available under `data` attribute: + + >>> response.data + { + 'took': 9, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, + 'hits': {'total': {'value': 10000, 'relation': 'gte'}, + 'max_score': 0.0, + 'hits': [{'_index': 'movies', ...}]}, + 'aggregations': {'avg_rank': {'value': 6.496829211219546}} + } + +Hits +==== + +Hits are available under `hits` attribute: + + >>> response.hits + total: >10000, contains 2 hits + + >>> response.hits.total + {'value': 10000, 'relation': 'gte'} + + >>> response.hits.hits + [ score=0.00, score=0.00] + +Those hits are instances of :class:`~pandagg.response.Hit`. 
+ +Directly iterating over :class:`~pandagg.response.Response` will return those hits: + + >>> list(response) + [ score=0.00, score=0.00] + + >>> hit = next(iter(response)) + +Each hit contains the raw dict under `data` attribute: + + >>> hit.data + {'_index': 'movies', + '_type': '_doc', + '_id': '642', + '_score': 0.0, + '_source': {'movie_id': 642, + 'name': '10 Tage in Calcutta', + 'year': 1984, + 'genres': ['Documentary'], + 'roles': None, + 'nb_roles': 0, + 'directors': [{'director_id': 33096, + 'first_name': 'Reinhard', + 'last_name': 'Hauff', + 'full_name': 'Reinhard Hauff', + 'genres': ['Documentary', 'Drama', 'Musical', 'Short']}], + 'nb_directors': 1, + 'rank': None}} + + >>> hit._index + 'movies' + + >>> hit._source + {'movie_id': 642, + 'name': '10 Tage in Calcutta', + 'year': 1984, + 'genres': ['Documentary'], + 'roles': None, + 'nb_roles': 0, + 'directors': [{'director_id': 33096, + 'first_name': 'Reinhard', + 'last_name': 'Hauff', + 'full_name': 'Reinhard Hauff', + 'genres': ['Documentary', 'Drama', 'Musical', 'Short']}], + 'nb_directors': 1, + 'rank': None} + + +Aggregations +============ + +Aggregations are handled differently, the `aggregations` attribute of a :class:`~pandagg.response.Response` returns +a :class:`~pandagg.response.Aggregations` instance, that provides specific parsing abilities in addition to exposing +raw aggregations response under `data` attribute. + +Let's build a bit more complex aggregation query to showcase its functionalities: + + >>> from elasticsearch import Elasticsearch + >>> from pandagg.search import Search + >>> + >>> client = Elasticsearch(hosts=['localhost:9200']) + >>> response = Search(using=client, index='movies')\ + >>> .size(0)\ + >>> .groupby('decade', 'histogram', interval=10, field='year')\ + >>> .groupby('genres', size=3)\ + >>> .aggs('avg_rank', 'avg', field='rank')\ + >>> .aggs('avg_nb_roles', 'avg', field='nb_roles')\ + >>> .filter('range', year={"gte": 1990})\ + >>> .execute() + +.. 
note:: + for more details about how to build aggregation query, consult :doc:`user-guide.aggs` section + + +Using `data` attribute: + + >>> response.aggregations.data + {'decade': {'buckets': [{'key': 1990.0, + 'doc_count': 79495, + 'genres': {'doc_count_error_upper_bound': 0, + 'sum_other_doc_count': 38060, + 'buckets': [{'key': 'Drama', + 'doc_count': 12232, + 'avg_nb_roles': {'value': 18.518067364290385}, + 'avg_rank': {'value': 5.981429367965072}}, + {'key': 'Short', + ... + + +Tree serialization +------------------ + +Using :func:`~pandagg.response.Aggregations.to_normalized`: + + >>> response.aggregations.to_normalized() + {'level': 'root', + 'key': None, + 'value': None, + 'children': [{'level': 'decade', + 'key': 1990.0, + 'value': 79495, + 'children': [{'level': 'genres', + 'key': 'Drama', + 'value': 12232, + 'children': [{'level': 'avg_rank', + 'key': None, + 'value': 5.981429367965072}, + {'level': 'avg_nb_roles', 'key': None, 'value': 18.518067364290385}]}, + {'level': 'genres', + 'key': 'Short', + 'value': 12197, + 'children': [{'level': 'avg_rank', + 'key': None, + 'value': 6.311325829450123}, + ... 
+ + +Using :func:`~pandagg.response.Aggregations.to_interactive_tree`: + + >>> response.aggregations.to_interactive_tree() + + root + ├── decade=1990 79495 + │ ├── genres=Documentary 8393 + │ │ ├── avg_nb_roles 3.7789824854045038 + │ │ └── avg_rank 6.517093241977517 + │ ├── genres=Drama 12232 + │ │ ├── avg_nb_roles 18.518067364290385 + │ │ └── avg_rank 5.981429367965072 + │ └── genres=Short 12197 + │ ├── avg_nb_roles 3.023284414200213 + │ └── avg_rank 6.311325829450123 + └── decade=2000 57649 + ├── genres=Documentary 8639 + │ ├── avg_nb_roles 5.581433036231045 + │ └── avg_rank 6.980897812811443 + ├── genres=Drama 11500 + │ ├── avg_nb_roles 14.385391304347825 + │ └── avg_rank 6.269675415719865 + └── genres=Short 13451 + ├── avg_nb_roles 4.053081555274701 + └── avg_rank 6.83625304327684 + + +Tabular serialization +--------------------- + +Doing so requires to identify a level that will draw the line between: + +- grouping levels: those which will be used to identify rows (here decades, and genres), and provide **doc_count** per row +- columns levels: those which will be used to populate columns and cells (here avg_nb_roles and avg_rank) + +The tabular format will suit especially well aggregations with a T shape. 
+ + +Using :func:`~pandagg.response.Aggregations.to_dataframe`: + + >>> response.aggregations.to_dataframe() + avg_nb_roles avg_rank doc_count + decade genres + 1990.0 Drama 18.518067 5.981429 12232 + Short 3.023284 6.311326 12197 + Documentary 3.778982 6.517093 8393 + 2000.0 Short 4.053082 6.836253 13451 + Drama 14.385391 6.269675 11500 + Documentary 5.581433 6.980898 8639 + + +Using :func:`~pandagg.response.Aggregations.to_tabular`: + + >>> response.aggregations.to_tabular() + (['decade', 'genres'], + {(1990.0, 'Drama'): {'doc_count': 12232, + 'avg_rank': 5.981429367965072, + 'avg_nb_roles': 18.518067364290385}, + (1990.0, 'Short'): {'doc_count': 12197, + 'avg_rank': 6.311325829450123, + 'avg_nb_roles': 3.023284414200213}, + (1990.0, 'Documentary'): {'doc_count': 8393, + 'avg_rank': 6.517093241977517, + 'avg_nb_roles': 3.7789824854045038}, + (2000.0, 'Short'): {'doc_count': 13451, + 'avg_rank': 6.83625304327684, + 'avg_nb_roles': 4.053081555274701}, + (2000.0, 'Drama'): {'doc_count': 11500, + 'avg_rank': 6.269675415719865, + 'avg_nb_roles': 14.385391304347825}, + (2000.0, 'Documentary'): {'doc_count': 8639, + 'avg_rank': 6.980897812811443, + 'avg_nb_roles': 5.581433036231045}}) + + +.. note:: + + TODO - explain parameters: + + - index_orient + - grouped_by + - expand_columns + - expand_sep + - normalize + - with_single_bucket_groups diff --git a/docs/source/user-guide.rst b/docs/source/user-guide.rst index bb8677af..ad2b9c8e 100644 --- a/docs/source/user-guide.rst +++ b/docs/source/user-guide.rst @@ -3,486 +3,107 @@ User Guide ########## -.. note:: - - Examples will be based on :doc:`IMDB` data. - This is a work in progress. Some sections still need to be furnished. 
- - -***** -Query -***** - -The :class:`~pandagg.tree.query.abstract.Query` class provides : - -- multiple syntaxes to declare and udpate a query -- query validation (with nested clauses validation) -- ability to insert clauses at specific points -- tree-like visual representation - -Declaration -=========== - -From native "dict" query ------------------------- - -Given the following query: - - >>> expected_query = {'bool': {'must': [ - >>> {'terms': {'genres': ['Action', 'Thriller']}}, - >>> {'range': {'rank': {'gte': 7}}}, - >>> {'nested': { - >>> 'path': 'roles', - >>> 'query': {'bool': {'must': [ - >>> {'term': {'roles.gender': {'value': 'F'}}}, - >>> {'term': {'roles.role': {'value': 'Reporter'}}}]} - >>> } - >>> }} - >>> ]}} - -To instantiate :class:`~pandagg.tree.query.abstract.Query`, simply pass "dict" query as argument: - - >>> from pandagg.query import Query - >>> q = Query(expected_query) - -A visual representation of the query is available with :func:`~pandagg.tree.query.abstract.Query.show`: - - >>> q.show() - - bool - └── must - ├── nested, path="roles" - │ └── query - │ └── bool - │ └── must - │ ├── term, field=roles.gender, value="F" - │ └── term, field=roles.role, value="Reporter" - ├── range, field=rank, gte=7 - └── terms, genres=["Action", "Thriller"] - - -Call :func:`~pandagg.tree.query.abstract.Query.to_dict` to convert it to native dict: - - >>> q.to_dict() - {'bool': { - 'must': [ - {'range': {'rank': {'gte': 7}}}, - {'terms': {'genres': ['Action', 'Thriller']}}, - {'bool': {'must': [ - {'term': {'roles.role': {'value': 'Reporter'}}}, - {'term': {'roles.gender': {'value': 'F'}}}]}}}} - ]} - ] - }} - - >>> from pandagg.utils import equal_queries - >>> equal_queries(q.to_dict(), expected_query) - True - - -.. 
note:: - `equal_queries` function won't consider order of clauses in must/should parameters since it actually doesn't matter - in Elasticsearch execution, ie - - >>> equal_queries({'must': [A, B]}, {'must': [B, A]}) - True - -With DSL classes ----------------- - -Pandagg provides a DSL to declare this query in a quite similar fashion: - - >>> from pandagg.query import Nested, Bool, Range, Term, Terms - - >>> q = Bool(must=[ - >>> Terms(genres=['Action', 'Thriller']), - >>> Range(rank={"gte": 7}), - >>> Nested( - >>> path='roles', - >>> query=Bool(must=[ - >>> Term(roles__gender='F'), - >>> Term(roles__role='Reporter') - >>> ]) - >>> ) - >>> ]) - -All these classes inherit from :class:`~pandagg.tree.query.abstract.Query` and thus provide the same interface. - - >>> from pandagg.query import Query - >>> isinstance(q, Query) - True - -With flattened syntax ---------------------- - -In the flattened syntax, the query clause type is used as first argument: - - >>> from pandagg.query import Query - >>> q = Query('terms', genres=['Action', 'Thriller']) - - -Query enrichment -================ - -All methods described below return a new :class:`~pandagg.tree.query.abstract.Query` instance, and keep unchanged the -initial query. +**pandagg** library provides interfaces to perform **read** operations on cluster. -For instance: - >>> from pandagg.query import Query - >>> initial_q = Query() - >>> enriched_q = initial_q.query('terms', genres=['Comedy', 'Short']) +.. toctree:: + :maxdepth: 2 - >>> initial_q.to_dict() - None + user-guide.search + user-guide.query + user-guide.aggs + user-guide.response + user-guide.interactive - >>> enriched_q.to_dict() - {'terms': {'genres': ['Comedy', 'Short']}} .. 
note:: - Calling :func:`~pandagg.tree.query.abstract.Query.to_dict` on an empty Query returns `None` - - >>> from pandagg.query import Query - >>> Query().to_dict() - None - - -query() method --------------- - -The base method to enrich a :class:`~pandagg.tree.query.abstract.Query` is :func:`~pandagg.tree.query.abstract.Query.query`. - - -Considering this query: - - >>> from pandagg.query import Query - >>> q = Query() - -:func:`~pandagg.tree.query.abstract.Query.query` accepts following syntaxes: - -from dictionnary:: - - - >>> q.query({"terms": {"genres": ['Comedy', 'Short']}) - -flattened syntax:: - - - >>> q.query("terms", genres=['Comedy', 'Short']) - - -from Query instance (this includes DSL classes):: - - >>> from pandagg.query import Terms - >>> q.query(Terms(genres=['Action', 'Thriller'])) - - -Compound clauses specific methods ---------------------------------- - -:class:`~pandagg.tree.query.abstract.Query` instance also exposes following methods for specific compound queries: - -(TODO: detail allowed syntaxes) - -Specific to bool queries: - -- :func:`~pandagg.tree.query.abstract.Query.bool` -- :func:`~pandagg.tree.query.abstract.Query.filter` -- :func:`~pandagg.tree.query.abstract.Query.must` -- :func:`~pandagg.tree.query.abstract.Query.must_not` -- :func:`~pandagg.tree.query.abstract.Query.should` - -Specific to other compound queries: - -- :func:`~pandagg.tree.query.abstract.Query.nested` -- :func:`~pandagg.tree.query.abstract.Query.constant_score` -- :func:`~pandagg.tree.query.abstract.Query.dis_max` -- :func:`~pandagg.tree.query.abstract.Query.function_score` -- :func:`~pandagg.tree.query.abstract.Query.has_child` -- :func:`~pandagg.tree.query.abstract.Query.has_parent` -- :func:`~pandagg.tree.query.abstract.Query.parent_id` -- :func:`~pandagg.tree.query.abstract.Query.pinned_query` -- :func:`~pandagg.tree.query.abstract.Query.script_score` -- :func:`~pandagg.tree.query.abstract.Query.boost` - - -Inserted clause location ------------------------- - 
-On all insertion methods detailed above, by default, the inserted clause is placed at the top level of your query, and -generates a bool clause if necessary. - -Considering the following query: - - >>> from pandagg.query import Query - >>> q = Query('terms', genres=['Action', 'Thriller']) - >>> q.show() - - terms, genres=["Action", "Thriller"] - -A bool query will be created: - - >>> q = q.query('range', rank={"gte": 7}) - >>> q.show() - - bool - └── must - ├── range, field=rank, gte=7 - └── terms, genres=["Action", "Thriller"] + Examples will be based on :doc:`IMDB` data. -And reused if necessary: - >>> q = q.must_not('range', year={"lte": 1970}) - >>> q.show() - - bool - ├── must - │ ├── range, field=rank, gte=7 - │ └── terms, genres=["Action", "Thriller"] - └── must_not - └── range, field=year, lte=1970 -Specifying a specific location requires to `name queries `_ : +:class:`~pandagg.search.Search` class is intended to perform request (see :doc:`user-guide.search`) - >>> from pandagg.query import Nested + >>> from pandagg.search import Search + >>> + >>> client = ElasticSearch(hosts=['localhost:9200']) + >>> search = Search(using=client, index='movies')\ + >>> .size(2)\ + >>> .groupby('decade', 'histogram', interval=10, field='year')\ + >>> .groupby('genres', size=3)\ + >>> .aggs('avg_rank', 'avg', field='rank')\ + >>> .aggs('avg_nb_roles', 'avg', field='nb_roles')\ + >>> .filter('range', year={"gte": 1990}) + + >>> search + { + "query": { + "bool": { + "filter": [ + { + "range": { + "year": { + "gte": 1990 + } + } + } + ] + } + }, + "aggs": { + "decade": { + "histogram": { + "field": "year", + "interval": 10 + }, + "aggs": { + "genres": { + "terms": { + ... + ..truncated.. + ... 
+ } + } + }, + "size": 2 + } - >>> q = q.nested(path='roles', _name='nested_roles', query=Term('roles.gender', value='F')) - >>> q.show() - - bool - ├── must - │ ├── nested, _name=nested_roles, path="roles" - │ │ └── query - │ │ └── term, field=roles.gender, value="F" - │ ├── range, field=rank, gte=7 - │ └── terms, genres=["Action", "Thriller"] - └── must_not - └── range, field=year, lte=1970 +It relies on: -Doing so allows to insert clauses above/below given clause using `parent`/`child` parameters: +- :class:`~pandagg.query.Query` to build queries (see :doc:`user-guide.query`), +- :class:`~pandagg.aggs.Aggs` to build aggregations (see :doc:`user-guide.aggs`) - >>> q = q.query('term', roles__role='Reporter', parent='nested_roles') - >>> q.show() + >>> search._query.show() bool - ├── must - │ ├── nested, _name=nested_roles, path="roles" - │ │ └── query - │ │ └── bool - │ │ └── must - │ │ ├── term, field=roles.role, value="Reporter" - │ │ └── term, field=roles.gender, value="F" - │ ├── range, field=rank, gte=7 - │ └── terms, genres=["Action", "Thriller"] - └── must_not - └── range, field=year, lte=1970 - - -TODO: explain `parent_param`, `child_param`, `mode` merging strategies on same named clause etc.. 
- -*********** -Aggregation -*********** - -The :class:`~pandagg.tree.aggs.aggs.Aggs` class provides : - -- multiple syntaxes to declare and udpate a aggregation -- aggregation clause validation -- ability to insert clauses at specific locations (and not just below last manipulated clause) - - -Declaration -=========== - -From native "dict" query ------------------------- - -Given the following aggregation: + └── filter + └── range, field=year, gte=1990 - >>> expected_aggs = { - >>> "decade": { - >>> "histogram": {"field": "year", "interval": 10}, - >>> "aggs": { - >>> "genres": { - >>> "terms": {"field": "genres", "size": 3}, - >>> "aggs": { - >>> "max_nb_roles": { - >>> "max": {"field": "nb_roles"} - >>> }, - >>> "avg_rank": { - >>> "avg": {"field": "rank"} - >>> } - >>> } - >>> } - >>> } - >>> } - >>> } - -To declare :class:`~pandagg.tree.aggs.aggs.Aggs`, simply pass "dict" query as argument: - - >>> from pandagg.aggs import Aggs - >>> a = Aggs(expected_aggs) - -A visual representation of the query is available with :func:`~pandagg.tree.aggs.aggs.Aggs.show`: - - >>> a.show() + >>> search._aggs.show() decade └── genres - ├── max_nb_roles + ├── avg_nb_roles └── avg_rank +Executing a :class:`~pandagg.search.Search` request using :func:`~pandagg.search.Search.execute` will return a +:class:`~pandagg.response.Response` instance (see :doc:`user-guide.response`). 
-Call :func:`~pandagg.tree.aggs.aggs.Aggs.to_dict` to convert it to native dict: - - >>> a.to_dict() == expected_aggs - True - -With DSL classes ----------------- - -Pandagg provides a DSL to declare this query in a quite similar fashion: - - >>> from pandagg.aggs import Histogram, Terms, Max, Avg - >>> - >>> a = Histogram("decade", field='year', interval=10, aggs=[ - >>> Terms("genres", field="genres", size=3, aggs=[ - >>> Max("max_nb_roles", field="nb_roles"), - >>> Avg("avg_rank", field="range") - >>> ]), - >>> ]) - -All these classes inherit from :class:`~pandagg.tree.aggs.aggs.Aggs` and thus provide the same interface. - - >>> from pandagg.aggs import Aggs - >>> isinstance(a, Aggs) - True - -With flattened syntax ---------------------- - -In the flattened syntax, the first argument is the aggregation name, the second argument is the aggregation type, the -following keyword arguments define the aggregation body: - - >>> from pandagg.query import Aggs - >>> a = Aggs('genres', 'terms', size=3) - >>> a.to_dict() - {'genres': {'terms': {'field': 'genres', 'size': 3}}} - - -Aggregations enrichment -======================= - -Aggregations can be enriched using two methods: - -- :func:`~pandagg.tree.aggs.aggs.Aggs.aggs` -- :func:`~pandagg.tree.aggs.aggs.Aggs.groupby` - -Both methods return a new :class:`~pandagg.tree.aggs.aggs.Aggs` instance, and keep unchanged the initial Aggregation. - -For instance: - - >>> from pandagg.aggs import Aggs - >>> initial_a = Aggs() - >>> enriched_a = initial_a.aggs('genres_agg', 'terms', field='genres') - - >>> initial_q.to_dict() - None - - >>> enriched_q.to_dict() - {'genres_agg': {'terms': {'field': 'genres'}}} - -.. 
note:: - - Calling :func:`~pandagg.tree.aggs.aggs.Aggs.to_dict` on an empty Aggregation returns `None` - - >>> from pandagg.aggs import Aggs - >>> Aggs().to_dict() - None - - -TODO - -******** -Response -******** - -TODO - -****** -Search -****** - -TODO - -******* -Mapping -******* - -Interactive mapping -=================== - -In interactive context, the :class:`~pandagg.interactive.mapping.IMapping` class provides navigation features with autocompletion to quickly discover a large -mapping: - - >>> from pandagg.mapping import IMapping - >>> from examples.imdb.load import mapping - >>> m = IMapping(imdb_mapping) - >>> m.roles - - roles [Nested] - ├── actor_id Integer - ├── first_name Text - │ └── raw ~ Keyword - ├── gender Keyword - ├── last_name Text - │ └── raw ~ Keyword - └── role Keyword - >>> m.roles.first_name - - first_name Text - └── raw ~ Keyword - -To get the complete field definition, just call it: - - >>> m.roles.first_name() - of type text: - { - "type": "text", - "fields": { - "raw": { - "type": "keyword" - } - } - } - -A **IMapping** instance can be bound to an Elasticsearch client to get quick access to aggregations computation on mapping fields. - -Suppose you have the following client: - - >>> from elasticsearch import Elasticsearch - >>> client = Elasticsearch(hosts=['localhost:9200']) - -Client can be bound at instantiation: - - >>> m = IMapping(imdb_mapping, client=client, index_name='movies') - -Doing so will generate a **a** attribute on mapping fields, this attribute will list all available aggregation for that -field type (with autocompletion): - - >>> m.roles.gender.a.terms() - [('M', {'key': 'M', 'doc_count': 2296792}), - ('F', {'key': 'F', 'doc_count': 1135174})] - - -.. note:: - - Nested clauses will be automatically taken into account. 
+ >>> response = search.execute() + >>> response + took 58ms, success: True, total result >=10000, contains 2 hits + >>> response.hits.hits + [ score=0.00, score=0.00] -************************* -Cluster indices discovery -************************* + >>> response.aggregations.to_dataframe() + avg_nb_roles avg_rank doc_count + decade genres + 1990.0 Drama 18.518067 5.981429 12232 + Short 3.023284 6.311326 12197 + Documentary 3.778982 6.517093 8393 + 2000.0 Short 4.053082 6.836253 13451 + Drama 14.385391 6.269675 11500 + Documentary 5.581433 6.980898 8639 -TODO +On top of that some interactive features are available (see :doc:`user-guide.interactive`). diff --git a/docs/source/user-guide.search.rst b/docs/source/user-guide.search.rst new file mode 100644 index 00000000..b36cec63 --- /dev/null +++ b/docs/source/user-guide.search.rst @@ -0,0 +1,220 @@ + +****** +Search +****** + +:class:`~pandagg.search.Search` class is intended to perform requests, and refers to +Elasticsearch `search api `_: + + >>> from pandagg.search import Search + >>> + >>> client = Elasticsearch(hosts=['localhost:9200']) + >>> search = Search(using=client, index='movies')\ + >>> .size(2)\ + >>> .groupby('decade', 'histogram', interval=10, field='year')\ + >>> .groupby('genres', size=3)\ + >>> .aggs('avg_rank', 'avg', field='rank')\ + >>> .aggs('avg_nb_roles', 'avg', field='nb_roles')\ + >>> .filter('range', year={"gte": 1990}) + + >>> search + { + "query": { + "bool": { + "filter": [ + { + "range": { + "year": { + "gte": 1990 + } + } + } + ] + } + }, + "aggs": { + "decade": { + "histogram": { + "field": "year", + "interval": 10 + }, + "aggs": { + "genres": { + "terms": { + "field": "genres", + "size": 3 + }, + "aggs": { + "avg_rank": { + "avg": { + "field": "rank" + } + }, + "avg_nb_roles": { + "avg": { + "field": "nb_roles" + } + } + } + } + } + } + }, + "size": 2 + } + +It relies on: + +- :class:`~pandagg.query.Query` to build queries, **query** or **post_filter** (see 
:doc:`user-guide.query`), +- :class:`~pandagg.aggs.Aggs` to build aggregations (see :doc:`user-guide.aggs`) + + +.. note:: + + All methods described below return a new :class:`~pandagg.search.Search` instance, and keep unchanged the + initial search request. + + >>> from pandagg.search import Search + >>> initial_s = Search() + >>> enriched_s = initial_s.query('terms', genres=['Comedy', 'Short']) + + >>> initial_s.to_dict() + {} + + >>> enriched_s.to_dict() + {'query': {'terms': {'genres': ['Comedy', 'Short']}}} + + + +Query part +========== + +The **query** or **post_filter** parts of a :class:`~pandagg.search.Search` instance are available respectively +under **_query** and **_post_filter** attributes. + + >>> search._query.__class__ + pandagg.tree.query.abstract.Query + >>> search._query.show() + + bool + └── filter + └── range, field=year, gte=1990 + + +To enrich **query** of a search request, methods are exactly the same as for a +:class:`~pandagg.query.Query` instance. + + >>> Search().must_not('range', year={'lt': 1980}) + { + "query": { + "bool": { + "must_not": [ + { + "range": { + "year": { + "lt": 1980 + } + } + } + ] + } + } + } + +See section :doc:`user-guide.query` for more details. + + +To enrich **post_filter** of a search request, use :func:`~pandagg.search.Search.post_filter`: + + >>> Search().post_filter('term', genres='Short') + { + "post_filter": { + "term": { + "genres": { + "value": "Short" + } + } + } + } + + +Aggregations part +================= + +The **aggregations** part of a :class:`~pandagg.search.Search` instance is available under **_aggs** attribute. + + >>> search._aggs.__class__ + pandagg.tree.aggs.aggs.Aggs + >>> search._aggs.show() + + decade + └── genres + ├── avg_nb_roles + └── avg_rank + + +To enrich **aggregations** of a search request, methods are exactly the same as for a +:class:`~pandagg.aggs.Aggs` instance. 
+ + >>> Search()\ + >>> .groupby('decade', 'histogram', interval=10, field='year')\ + >>> .aggs('avg_rank', 'avg', field='rank') + { + "aggs": { + "decade": { + "histogram": { + "field": "year", + "interval": 10 + }, + "aggs": { + "avg_rank": { + "avg": { + "field": "rank" + } + } + } + } + } + } + + +See section :doc:`user-guide.aggs` for more details. + +Other search request parameters +=============================== + +**size**, **sources**, **limit** etc, all those parameters are documented in :class:`~pandagg.search.Search` +documentation and their usage is quite self-explanatory. + + +Request execution +================= + + +To execute a search request, you must first bind it to an Elasticsearch client: + + >>> from elasticsearch import Elasticsearch + >>> client = Elasticsearch(hosts=['localhost:9200']) + +Either at instantiation: + + >>> from pandagg.search import Search + >>> search = Search(using=client, index='movies') + +Or with the :func:`~pandagg.search.Search.using` +method: + + >>> from pandagg.search import Search + >>> search = Search()\ + >>> .using(client=client)\ + >>> .index('movies') + +Executing a :class:`~pandagg.search.Search` request using :func:`~pandagg.search.Search.execute` will return a +:class:`~pandagg.response.Response` instance (see more in :doc:`user-guide.response`). + + + >>> response = search.execute() + >>> response + took 58ms, success: True, total result >=10000, contains 2 hits + >>> response.__class__ + pandagg.response.Response diff --git a/examples/imdb/IMDB exploration.py b/examples/imdb/IMDB exploration.py deleted file mode 100644 index fe42f98c..00000000 --- a/examples/imdb/IMDB exploration.py +++ /dev/null @@ -1,506 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# # IMDB exploration with Pandagg - -# This tutorial will guide you in some of pandagg functionalities, exploring IMDB data. -# -# 1. Cluster indices discovery -# 2. Mapping exploration -# 3. Aggregations -# 4. 
Queries -# - -# In[1]: - - -# requires to be declared on top -import matplotlib.pyplot as plt -import seaborn - - -# ## 1. Cluster indices discovery - -# In[2]: - - -# instanciate client just as you would do with regular elastic client -from pandagg import Elasticsearch - -client = Elasticsearch(hosts=["localhost:9300"]) - - -# In[3]: - - -# indices instance lists available indices of cluster, with their mappings and settings -indices = client.fetch_indices() -indices - - -# Indices are accessible with autocompletion -# -# ![autocomplete](ressources/autocomplete_index.png) - -# In[4]: - - -movies = indices.movies -movies - - -# ## 2. Mapping exploration - -# In[5]: - - -# mapping is accessible via "mapping" attribute of "Index" instance -m = movies.mapping - -# this is equivalent to instanciate Mapping from dict as follow: -from examples.imdb.load import mapping as imdb_mapping -from pandagg.mapping import IMapping - -m = IMapping(imdb_mapping, client=client, index_name="movies") - -# Note: client and index_name arguments are optional, but doing so provides the ability to -# compute "quick-access" aggregations on fields (will be detailed below) - - -# In[6]: - - -# you can navigate (with help of tab autocomplete) into your mapping -m.directors - - -# In[7]: - - -# going deeper -m.directors.full_name - - -# In[8]: - - -# calling instance will display mapping definition -m.directors.full_name() - - -# ### Quick access aggregations from mapping - -# Mapping leaves (here genres) all have a "a" attribute (for aggregation). 
-# Autocomplete will display all possible aggregations types on this field -# -# ![autocomplete](ressources/autocomplete_agg.png) -# - -# In[9]: - - -# passed parameters will be added to aggregation body -m.genres.a.terms(missing="N/A", size=5) - - -# In[10]: - - -m.rank.a.stats() - - -# In[11]: - - -# query parameter enable to filter on some conditions -# documentaries are overall better ranked than average of movies -m.rank.a.stats(query={"term": {"genres": "Documentary"}}) - - -# ## 3. Aggregations - -# Let's compute the number of movies per decade per genre. -# - -# ### Regular declaration - -# In[12]: - - -regular_syntax = { - "genres": { - "terms": {"field": "genres", "size": 3}, - "aggs": { - "movie_decade": { - "date_histogram": {"field": "year", "fixed_interval": "3650d"} - } - }, - } -} - -from pandagg.aggs import Aggs - -agg = Aggs(regular_syntax) - -assert agg.to_dict() == regular_syntax - -agg - - -# ### DSL syntax -# The following syntaxes are strictly equivalent to the above one: - -# In[13]: - - -from pandagg.aggs import DateHistogram, Terms - -agg_dsl = Aggs( - Terms( - "genres", - field="genres", - size=3, - aggs=DateHistogram(name="movie_decade", field="year", fixed_interval="3650d"), - ) -) - -# or using groupby method: the listed aggregations will be placed from top to bottom: - -agg_variant = Aggs().groupby( - [ - Terms("genres", field="genres", size=3), - DateHistogram("movie_decade", field="year", fixed_interval="3650d"), - ] -) - - -assert agg_dsl.to_dict() == agg_variant.to_dict() -assert agg_dsl.to_dict() == regular_syntax - -# decade = DateHistogram('movie_decade', field='year', fixed_interval='3650d') -# per_decate_genres = movies.groupby(['genres', decade],size=3).execute() -agg_dsl - - -# #### About groupby and agg methods -# -# - `groupby` method will arrange passed aggregations clauses "vertically" (nested manner), -# - `agg` method will arrange them "horizontally" - -# In[14]: - - -Aggs().groupby( - [ - Terms("genres", 
field="genres", size=3), - DateHistogram("movie_decade", field="year", fixed_interval="3650d"), - ] -) - - -# In[15]: - - -Aggs().aggs( - [ - Terms("genres", field="genres", size=3), - DateHistogram("movie_decade", field="year", fixed_interval="3650d"), - ] -) - - -# Both `groupby` and `agg` will place provided aggregations under the `insert_below` (parent id) aggregation clause if `insert_below` is provided, else under the deepest bucket aggregation if there is no ambiguity: -# ``` -# OK: A──> B ─> C ─> NEW_AGGS -# -# KO: A──> B -# └──> C -# ``` - -# In[16]: - - -# taking again this example -example_agg = Aggs(regular_syntax) -example_agg - - -# In[17]: - - -# groupby behaviour -example_agg.groupby(["roles.role", "roles.gender"], insert_below="genres") - - -# In[18]: - - -# agg behaviour -example_agg.aggs(["roles.role", "roles.gender"], insert_below="genres") - - -# ### Aggregation execution and parsing - -# Aggregation instance can be bound to an Elasticsearch client, either at `__init__`, either using `bind` method. - -# In[19]: - - -agg_dsl.bind(client=client, index_name="movies") - - -# Doing so provides the ability to execute aggregation request, and parse the response in multiple formats. Formats will be detailed in next example, here we use the dataframe format: -# -# *Note: requires to install **pandas** dependency* - -# In[20]: - - -per_decate_genres = agg_dsl.execute(output="dataframe") -per_decate_genres.unstack() - - -# In[21]: - - -per_decate_genres.unstack().T.plot(figsize=(12, 12)) - - -# **Another example:** -# who are the actors who have played in the highest number of movies between 1990 and 2000, and what was the average ranking of the movies they played in per genre? 
-# - -# In[22]: - - -from datetime import datetime -from pandagg.aggs import Aggs, Avg, Min, Max -from pandagg.query import Range - - -# in groupby and agg methods, -agg = ( - Aggs(client=client, index_name="movies", mapping=imdb_mapping) - .groupby(["roles.full_name.raw", "genres"], size=2) - .aggs( - [ - Avg("avg_rank", field="rank"), - Min("min_date", field="year"), - Max("max_date", field="year"), - ] - ) - .query(Range(field="year", gte="2000", lt="2010")) -) - -print(agg) -r = agg.execute() - -r["min_year"] = r.min_date.apply(lambda x: datetime.fromtimestamp(x / 1000.0).year) -r["max_year"] = r.max_date.apply(lambda x: datetime.fromtimestamp(x / 1000.0).year) -r - - -# #### As raw output - -# In[23]: - - -# agg.execute(output='raw') - - -# #### As interactive tree - -# In[24]: - - -t = agg.execute(output="tree") -t - - -# #### Navigation with autocompletion - -# In[25]: - - -t.roles_full_name_raw_Grey_DeLisle__599599_ - - -# #### List documents in given bucket (with autocompletion) - -# In[26]: - - -delisle_adventure = t.roles_full_name_raw_Grey_DeLisle__599599_.reverse_nested_below_roles_full_name_raw.genres_Adventure.list_documents( - _source=["id", "genres", "name"], size=2 -) - - -# In[27]: - - -delisle_adventure - - -# ## 4. 
Queries - -# Suppose I want: -# - actions or thriller movies -# - with ranking >= 7 -# - with a female actor playing a reporter role - -# ### Regular syntax - -# We would perform the following request: - -# In[28]: - - -expected_query = { - "bool": { - "must": [ - {"terms": {"genres": ["Action", "Thriller"]}}, - {"range": {"rank": {"gte": 7}}}, - { - "nested": { - "path": "roles", - "query": { - "bool": { - "must": [ - {"term": {"roles.gender": {"value": "F"}}}, - {"term": {"roles.role": {"value": "Reporter"}}}, - ] - } - }, - } - }, - ] - } -} - - -# We can build our Query instance using this regular syntax: - -# In[29]: - - -from pandagg.query import Query - -q = Query(expected_query) -q - - -# ### DSL syntax - -# With pandagg DSL syntax, it could also be declared this way: - -# In[30]: - - -from pandagg.query import Nested, Bool, Query, Range, Term, Terms as TermsFilter - -# warning, pandagg.query.Terms and pandagg.agg.Terms classes have same name, but one is a filter, the other an aggreggation - -q = Query( - Bool( - must=[ - TermsFilter("genres", terms=["Action", "Thriller"]), - Range("rank", gte=7), - Nested( - path="roles", - query=Bool( - must=[ - Term("roles.gender", value="F"), - Term("roles.role", value="Reporter"), - ] - ), - ), - ] - ) -) - - -# In[31]: - - -# query computation -q.to_dict() == expected_query - - -# Suppose you want to expose a route to your customers with actionable filters, it is easy to add query clauses at specific places in your query by chaining your clauses: -# - -# In[32]: - - -# accepts mix of DSL and dict syntax - -my_query = ( - Query() - .must(TermsFilter("genres", terms=["Action", "Thriller"])) - .must({"range": {"rank": {"gte": 7}}}) - .must( - Nested( - path="roles", - query=Bool( - must=[ - {"term": {"roles.gender": {"value": "F"}}}, - {"term": {"roles.role": {"value": "Reporter"}}}, - ] - ), - ) - ) -) - -my_query.to_dict() == expected_query - - -# ### Advanced query declaration using _named queries_ -# -# We can take 
advantage of [named queries](https://www.elastic.co/guide/en/elasticsearch/reference/6.8/search-request-named-queries-and-filters.html) to specifically declare where we want to insert a clause. -# -# A simple use case could be to expose some filters to a client among which some apply to nested clauses (for instance nested 'roles'). - -# In[33]: - - -from pandagg.utils import equal_queries - -# suppose API exposes those filters -genres_in = ["Action", "Thriller"] -rank_above = 7 -filter_role_gender = "F" -filter_role = "Reporter" - - -q = Query() - - -if genres_in is not None: - q = q.must(TermsFilter("genres", terms=genres_in)) -if rank_above is not None: - q = q.must(Range("rank", gte=rank_above)) - -# we name the nested query that we would potentially use -q = q.query(Nested(_name="nested_roles", path="roles")) -# a compound clause (bool, nested etc..) without any children clauses is not serialized -assert q.to_dict() == { - "bool": { - "must": [ - {"terms": {"genres": ["Action", "Thriller"]}}, - {"range": {"rank": {"gte": 7}}}, - ] - } -} - - -# we declare that those clauses must be placed below 'nested_roles' condition -if filter_role_gender is not None: - q = q.query(Term("roles.gender", value=filter_role_gender), parent="nested_roles") -if filter_role is not None: - q = q.query(Term("roles.role", value=filter_role), parent="nested_roles") - -assert equal_queries(q.to_dict(), expected_query) -q - - -# In[ ]: diff --git a/examples/imdb/README.md b/examples/imdb/README.md index 8a3cc1a4..c4a86c7f 100644 --- a/examples/imdb/README.md +++ b/examples/imdb/README.md @@ -97,8 +97,22 @@ _ ## Steps to start playing with your index -Note to Elastic, if you have a spare cluster to prepare demo indices on which you could let your community perform read -operations we could skip this step ;) + +You can either directly use the demo index available [here](https://beba020ee88d49488d8f30c163472151.eu-west-2.aws.cloud.es.io:9243/) +with credentials user: `pandagg`, password: 
`pandagg`: + +Access it with following client instantiation: +``` +from elasticsearch import Elasticsearch +client = Elasticsearch( + hosts=['https://beba020ee88d49488d8f30c163472151.eu-west-2.aws.cloud.es.io:9243/'], + http_auth=('pandagg', 'pandagg') +) +``` + + +Or follow below steps to install it yourself locally. +In this case, you can either generate yourself the files, or download them from [here](https://drive.google.com/file/d/1po3T18l9QoYxPEGh-iKV4oN3DslWGu8-/view?usp=sharing) (file md5 `b363dee23720052501e24d15361ed605`). #### Dump tables Follow instruction on bottom of https://relational.fit.cvut.cz/dataset/IMDb page and dump following tables in a diff --git a/pandagg/interactive/_field_agg_factory.py b/pandagg/interactive/_field_agg_factory.py index f99aaffc..94d84ebe 100644 --- a/pandagg/interactive/_field_agg_factory.py +++ b/pandagg/interactive/_field_agg_factory.py @@ -59,7 +59,7 @@ def _operate(self, agg_node, index, raw_output, query): return result keys = map(itemgetter(0), result) raw_values = map(itemgetter(1), result) - return pd.DataFrame(index=keys, data=raw_values) + return pd.DataFrame(index=list(keys), data=list(raw_values)) def field_type_klass_factory(field_type): diff --git a/pandagg/tree/aggs/aggs.py b/pandagg/tree/aggs/aggs.py index 3ea1486c..9e629ac3 100644 --- a/pandagg/tree/aggs/aggs.py +++ b/pandagg/tree/aggs/aggs.py @@ -131,7 +131,9 @@ def _fill(self, *args, **kwargs): self.insert_node(shadow_root) pid = shadow_root.identifier for ar in arg: - if not isinstance(ar, (AggNode, Aggs)): + if isinstance(ar, string_types): + ar = Terms(ar, field=ar) + elif not isinstance(ar, (AggNode, Aggs)): raise ValueError("Invalid type %s, %s" % (type(ar), ar)) self.insert(ar, parent_id=pid) @@ -331,11 +333,8 @@ def groupby(self, *args, **kwargs): inserted_aggs = [Aggs(arg) for arg in args] # groupby([{}, {}]) elif len(args) == 1 and isinstance(args[0], (list, tuple)): - if kwargs: - raise ValueError( - "Kwargs not allowed when passing multiple 
aggregations in args." - ) - inserted_aggs = [Aggs(arg) for arg in args[0]] + # kwargs applied on all + inserted_aggs = [Aggs(arg, **kwargs) for arg in args[0]] # groupby({}) # groupby(Terms()) # groupby('terms', name='per_tag', field='tag') diff --git a/setup.py b/setup.py index 62e032e6..981f7863 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -__version__ = "0.1.1" +__version__ = "0.1.2" import os diff --git a/tests/tree/aggs/test_aggs.py b/tests/tree/aggs/test_aggs.py index 32746fe2..a33f7e11 100644 --- a/tests/tree/aggs/test_aggs.py +++ b/tests/tree/aggs/test_aggs.py @@ -548,6 +548,15 @@ def test_aggs_at_root(self): }, ) + def test_aggs_strings(self): + self.assertEqual( + Aggs().aggs(["yolo1", "yolo2"]).to_dict(), + { + "yolo1": {"terms": {"field": "yolo1"}}, + "yolo2": {"terms": {"field": "yolo2"}}, + }, + ) + def test_validate_aggs_parent_id(self): """