diff --git a/.github/workflows/python-3-tests.yml b/.github/workflows/python-3-tests.yml index ab06e3e0..9d3f5366 100644 --- a/.github/workflows/python-3-tests.yml +++ b/.github/workflows/python-3-tests.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.5, 3.6, 3.7, 3.8] + python-version: [3.6, 3.7, 3.8] env: PYTHON: ${{ matrix.python-version }} OS: 'ubuntu-latest' @@ -37,6 +37,9 @@ jobs: flake8 --count --ignore=W503,W605 --show-source --statistics pandagg # on tests, more laxist: allow "missing whitespace after ','" and "line too long" flake8 --count --ignore=W503,W605,E231,E501 --show-source --statistics tests + - name: Lint with black + run: | + black --check . - name: Test with pytest and generate coverage report run: pytest --cov=./pandagg --cov-report=xml diff --git a/.gitignore b/.gitignore index f6b3884e..e8ac7865 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .* !.github +!.pre-commit-config.yaml *.py[co] *.egg *.egg-info diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..c1e0602c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,11 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace + - repo: https://github.com/psf/black + rev: 19.3b0 + hooks: + - id: black diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9f956916..0f73e00f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,7 +16,7 @@ We actively welcome your pull requests. 5. Make sure your code lints. ## Any contributions you make will be under the MIT Software License -In short, when you submit code changes, your submissions are understood to be under the same [MIT License](http://choosealicense.com/licenses/mit/) that covers the project. +In short, when you submit code changes, your submissions are understood to be under the same [MIT License](http://choosealicense.com/licenses/mit/) that covers the project. Feel free to contact the maintainers if that's a concern. ## Issues diff --git a/README.md b/README.md index 133fbc86..966ecdb3 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ [![PyPI Latest Release](https://img.shields.io/pypi/v/pandagg.svg)](https://pypi.org/project/pandagg/) -[![License](https://img.shields.io/pypi/l/pandagg.svg)](https://github.com/leonardbinet/pandagg/blob/master/LICENSE) +[![License](https://img.shields.io/pypi/l/pandagg.svg)](https://github.com/alkemics/pandagg/blob/master/LICENSE) ![Python package](https://github.com/alkemics/pandagg/workflows/Python%203%20Tests/badge.svg) ![Python package](https://github.com/alkemics/pandagg/workflows/Python%202%20Tests/badge.svg) [![Coverage](https://codecov.io/github/alkemics/pandagg/coverage.svg?branch=master)](https://codecov.io/gh/alkemics/pandagg) @@ -11,8 +11,8 @@ **pandagg** is a Python package providing a simple interface to manipulate ElasticSearch queries and aggregations. Its goal is to make it the easiest possible to explore data indexed in an Elasticsearch cluster. -Some of its interactive features are inspired by [pandas](https://github.com/pandas-dev/pandas) library, hence the name **pandagg** which aims to apply **panda**s to Elasticsearch -**agg**regations. +Some of its interactive features are inspired by [pandas](https://github.com/pandas-dev/pandas) library, hence the name **pandagg** which aims to apply **panda**s to Elasticsearch +**agg**regations. **pandagg** is also greatly inspired by the official high level python client [elasticsearch-dsl](https://github.com/elastic/elasticsearch-dsl-py), and is intended to make it more convenient to deal with deeply nested queries and aggregations. diff --git a/docs/Makefile b/docs/Makefile index c87edd12..be884f85 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -29,4 +29,4 @@ generate: clean api-doc build # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md index 144737b0..7f1d6ab4 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,9 +1,9 @@ ## Sphinx documentation Documentation stems from 3 sources: -- automatically generated based on repository sources in `docs/source/reference` directory +- automatically generated based on repository sources in `docs/source/reference` directory - manually written documentation in all other files of `docs/source` directory -- a jupyter notebook file generated following procedure in `example/imdb`, then running notebook and exporting +- a jupyter notebook file generated following procedure in `example/imdb`, then running notebook and exporting html file #### Procedure diff --git a/docs/source/conf.py b/docs/source/conf.py index 413c5bff..21fb7ff8 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -130,7 +130,7 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, "pandagg.tex", u"pandagg Documentation", u"Léonard Binet", "manual"), + (master_doc, "pandagg.tex", u"pandagg Documentation", u"Léonard Binet", "manual") ] @@ -155,7 +155,7 @@ "pandagg", "One line description of project.", "Miscellaneous", - ), + ) ] diff --git a/docs/source/user-guide.rst b/docs/source/user-guide.rst index bcba162f..f5691571 100644 --- a/docs/source/user-guide.rst +++ b/docs/source/user-guide.rst @@ -378,4 +378,3 @@ Cluster indices discovery ************************* TODO - diff --git a/examples/imdb/README.md b/examples/imdb/README.md index c893989e..8a3cc1a4 100644 --- a/examples/imdb/README.md +++ b/examples/imdb/README.md @@ -8,7 +8,7 @@ In this case, relational databases (SQL) are a good fit to store with consistenc Yet indexing some of this data in a optimized search engine will allow more powerful queries. ## Query requirements -In this example, we'll suppose most usage/queries requirements will be around the concept of movie (rather than usages +In this example, we'll suppose most usage/queries requirements will be around the concept of movie (rather than usages focused on fetching actors or directors, even though it will still be possible with this data structure). The index should provide good performances trying to answer these kind question (non-exhaustive): @@ -16,7 +16,7 @@ The index should provide good performances trying to answer these kind question - what movies genres were most popular among decades? - which actors have played in best-rated movies, or worst-rated movies? - which actors movies directors prefer to cast in their movies? -- which are best ranked movies of last decade in Action or Documentary genres? +- which are best ranked movies of last decade in Action or Documentary genres? - ... @@ -25,7 +25,7 @@ I exported following SQL tables from MariaDB [following these instructions](http Relational schema is the following: -![imdb tables](ressources/imdb_ijs.svg) +![imdb tables](ressources/imdb_ijs.svg) ## Index mapping @@ -46,9 +46,9 @@ Movie: #### Which fields require nesting? Since genres contain a single keyword field, in no case we need it to be stored as a nested field. -On the contrary, actor roles and directors require a nested mapping if we consider applying multiple -simultanous query clauses on their sub-fields (for instance search movie in which actor is a woman AND whose role is -nurse). +On the contrary, actor roles and directors require a nested mapping if we consider applying multiple +simultanous query clauses on their sub-fields (for instance search movie in which actor is a woman AND whose role is +nurse). More information on distinction between array and nested fields [here]( https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html). @@ -101,7 +101,7 @@ Note to Elastic, if you have a spare cluster to prepare demo indices on which yo operations we could skip this step ;) #### Dump tables -Follow instruction on bottom of https://relational.fit.cvut.cz/dataset/IMDb page and dump following tables in a +Follow instruction on bottom of https://relational.fit.cvut.cz/dataset/IMDb page and dump following tables in a directory: - movies.csv - movies_genres.csv diff --git a/examples/imdb/load.py b/examples/imdb/load.py index 670550c4..6215a4f4 100644 --- a/examples/imdb/load.py +++ b/examples/imdb/load.py @@ -2,14 +2,7 @@ from os.path import join from elasticsearch import Elasticsearch, helpers from examples.imdb.conf import ES_HOST, ES_USE_AUTH, ES_PASSWORD, ES_USER, DATA_DIR -from pandagg.mapping import ( - Mapping, - Keyword, - Text, - Float, - Nested, - Integer, -) +from pandagg.mapping import Mapping, Keyword, Text, Float, Nested, Integer index_name = "movies" mapping = Mapping( diff --git a/pandagg/interactive/mapping.py b/pandagg/interactive/mapping.py index f573fa07..9501dd60 100644 --- a/pandagg/interactive/mapping.py +++ b/pandagg/interactive/mapping.py @@ -52,6 +52,6 @@ def _set_agg_property_if_required(self): def __call__(self, *args, **kwargs): print( json.dumps( - self._tree.to_dict(), indent=2, sort_keys=True, separators=(",", ": "), + self._tree.to_dict(), indent=2, sort_keys=True, separators=(",", ": ") ) ) diff --git a/pandagg/tree/aggs/aggs.py b/pandagg/tree/aggs/aggs.py index 1174e868..236d83d3 100644 --- a/pandagg/tree/aggs/aggs.py +++ b/pandagg/tree/aggs/aggs.py @@ -11,11 +11,7 @@ from pandagg.tree._tree import Tree from pandagg.tree.mapping import Mapping -from pandagg.node.aggs.abstract import ( - BucketAggNode, - AggNode, - ShadowRoot, -) +from pandagg.node.aggs.abstract import BucketAggNode, AggNode, ShadowRoot from pandagg.node.aggs.bucket import Nested, ReverseNested, Terms from pandagg.node.aggs.pipeline import BucketSelector, BucketSort diff --git a/pandagg/tree/query/abstract.py b/pandagg/tree/query/abstract.py index 3ce6769c..a834f935 100644 --- a/pandagg/tree/query/abstract.py +++ b/pandagg/tree/query/abstract.py @@ -567,13 +567,13 @@ def _compound_update(self, name, new_compound, mode): ) if not existing_param: self.insert( - item=new_compound.subtree(param_node.identifier), parent_id=name, + item=new_compound.subtree(param_node.identifier), parent_id=name ) continue if mode == REPLACE: self.drop_node(existing_param.identifier) self.insert( - item=new_compound.subtree(param_node.identifier), parent_id=name, + item=new_compound.subtree(param_node.identifier), parent_id=name ) continue if mode == ADD: diff --git a/pandagg/tree/response.py b/pandagg/tree/response.py index e9984518..df315b83 100644 --- a/pandagg/tree/response.py +++ b/pandagg/tree/response.py @@ -27,7 +27,7 @@ def __init__(self, aggs, index): self.__index = index def _clone_init(self, deep=False): - return AggsResponseTree(aggs=self.__aggs.clone(deep=deep), index=self.__index,) + return AggsResponseTree(aggs=self.__aggs.clone(deep=deep), index=self.__index) def parse(self, raw_response): """Build response tree from ElasticSearch aggregation response @@ -70,7 +70,7 @@ def _parse_node_with_children(self, agg_node, raw_response, pid=None): self.insert_node(bucket, pid) for child in self.__aggs.children(agg_node.name, id_only=False): self._parse_node_with_children( - agg_node=child, raw_response=raw_value, pid=bucket.identifier, + agg_node=child, raw_response=raw_value, pid=bucket.identifier ) def bucket_properties(self, bucket, properties=None, end_level=None, depth=None): diff --git a/requirements-test-2.txt b/requirements-test-2.txt index 253650d8..e692a358 100644 --- a/requirements-test-2.txt +++ b/requirements-test-2.txt @@ -4,4 +4,4 @@ pytest-cov # last mock version compatible with P2 (will drop constraint when removing support for P2) mock<=3.0.5 # idem, last pandas compatible version with P2 -pandas<=0.23.1 \ No newline at end of file +pandas<=0.23.1 diff --git a/requirements-test.txt b/requirements-test.txt index 2c441dda..1ce53a8e 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,5 +1,7 @@ +pre-commit +black flake8 pytest pytest-cov mock -pandas \ No newline at end of file +pandas diff --git a/setup.py b/setup.py index 86d967d6..8ec48e76 100644 --- a/setup.py +++ b/setup.py @@ -12,12 +12,7 @@ here = os.path.abspath(os.path.dirname(__file__)) README = open(os.path.join(here, "README.md")).read() -install_requires = [ - "six", - "future", - "lighttree==0.0.8", - "elasticsearch>=7.0.0,<8.0.0", -] +install_requires = ["six", "future", "lighttree==0.0.8", "elasticsearch>=7.0.0,<8.0.0"] setup( diff --git a/tests/interactive/test_mapping.py b/tests/interactive/test_mapping.py index 654031b8..fc7f5211 100644 --- a/tests/interactive/test_mapping.py +++ b/tests/interactive/test_mapping.py @@ -154,7 +154,7 @@ def test_quick_agg(self): mapping_tree = Mapping(MAPPING) client_bound_mapping = IMapping( - mapping_tree, client=client_mock, index="classification_report_index_name", + mapping_tree, client=client_mock, index="classification_report_index_name" ) workflow_field = client_bound_mapping.workflow @@ -169,7 +169,7 @@ def test_quick_agg(self): ) self.assertEqual( response, - [(1, {"doc_count": 25, "key": 1}), (2, {"doc_count": 50, "key": 2}),], + [(1, {"doc_count": 25, "key": 1}), (2, {"doc_count": 50, "key": 2})], ) client_mock.search.assert_called_once() client_mock.search.assert_called_with( @@ -188,7 +188,7 @@ def test_quick_agg_nested(self): client_mock = Mock(spec=["search"]) es_response_mock = { "_shards": {"failed": 0, "successful": 135, "total": 135}, - "aggregations": {"local_metrics": {"avg_agg": {"value": 23},},}, + "aggregations": {"local_metrics": {"avg_agg": {"value": 23}}}, "hits": {"hits": [], "max_score": 0.0, "total": 300}, "timed_out": False, "took": 30, @@ -197,7 +197,7 @@ def test_quick_agg_nested(self): mapping_tree = Mapping(MAPPING) client_bound_mapping = IMapping( - mapping_tree, client=client_mock, index="classification_report_index_name", + mapping_tree, client=client_mock, index="classification_report_index_name" ) local_train_support = client_bound_mapping.local_metrics.dataset.support_train @@ -209,9 +209,7 @@ def test_quick_agg_nested(self): raw_output=True, query={"term": {"classification_type": "multiclass"}}, ) - self.assertEqual( - response, [(None, {"value": 23}),], - ) + self.assertEqual(response, [(None, {"value": 23})]) client_mock.search.assert_called_once() client_mock.search.assert_called_with( body={ diff --git a/tests/node/agg/test_bucket.py b/tests/node/agg/test_bucket.py index 5eff22ef..730e3d48 100644 --- a/tests/node/agg/test_bucket.py +++ b/tests/node/agg/test_bucket.py @@ -72,7 +72,7 @@ def test_filter(self): buckets, [ # key -> bucket - (None, {"doc_count": 12, "sub_aggs": {}}), + (None, {"doc_count": 12, "sub_aggs": {}}) ], ) @@ -98,7 +98,7 @@ def test_nested(self): buckets, [ # key -> bucket - (None, {"doc_count": 12, "sub_aggs": {}}), + (None, {"doc_count": 12, "sub_aggs": {}}) ], ) @@ -106,7 +106,7 @@ def test_nested(self): self.assertEqual(Nested.extract_bucket_value({"doc_count": 12}), 12) # test get_filter - nested_agg = Nested(name="some_agg", path="nested_path",) + nested_agg = Nested(name="some_agg", path="nested_path") self.assertEqual(nested_agg.get_filter(None), None) # test query dict diff --git a/tests/node/query/test_full_text.py b/tests/node/query/test_full_text.py index 02ecd4a6..48832353 100644 --- a/tests/node/query/test_full_text.py +++ b/tests/node/query/test_full_text.py @@ -127,8 +127,7 @@ def test_match_bool_prefix_clause(self): q3 = MatchBoolPrefix(message="quick brown f") self.assertEqual(q3.body, {"message": {"query": "quick brown f"}}) self.assertEqual( - q3.to_dict(), - {"match_bool_prefix": {"message": {"query": "quick brown f"}}}, + q3.to_dict(), {"match_bool_prefix": {"message": {"query": "quick brown f"}}} ) self.assertEqual( q3.line_repr(depth=None), @@ -155,7 +154,7 @@ def test_match_phrase_clause(self): q3 = MatchPhrase(message="this is a test") self.assertEqual(q3.body, {"message": {"query": "this is a test"}}) self.assertEqual( - q3.to_dict(), {"match_phrase": {"message": {"query": "this is a test"}}}, + q3.to_dict(), {"match_phrase": {"message": {"query": "this is a test"}}} ) self.assertEqual( q3.line_repr(depth=None), diff --git a/tests/node/query/test_term_level.py b/tests/node/query/test_term_level.py index 28520a2d..be11ad07 100644 --- a/tests/node/query/test_term_level.py +++ b/tests/node/query/test_term_level.py @@ -129,7 +129,7 @@ def test_terms_clause(self): self.assertEqual(q.body, body) self.assertEqual(q.to_dict(), expected) self.assertEqual( - q.line_repr(depth=None), 'terms, boost=1, user=["kimchy", "elasticsearch"]', + q.line_repr(depth=None), 'terms, boost=1, user=["kimchy", "elasticsearch"]' ) def test_terms_set_clause(self): diff --git a/tests/test_discovery.py b/tests/test_discovery.py index ebb77fd0..70926746 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -39,9 +39,7 @@ def test_pandagg_wrapper(self, indice_get_mock): self.assertTrue(hasattr(indices, "classification_report_one")) report_index = indices.classification_report_one self.assertIsInstance(report_index, Index) - self.assertEqual( - report_index.__str__(), "", - ) + self.assertEqual(report_index.__str__(), "") self.assertEqual(report_index.name, "classification_report_one") # ensure mapping presence diff --git a/tests/test_response.py b/tests/test_response.py index 6d3736f6..d78d2a36 100644 --- a/tests/test_response.py +++ b/tests/test_response.py @@ -247,7 +247,7 @@ def test_parse_as_tabular_multiple_roots(self): "avg_f1_score": {"value": 0.815}, } index_names, index_values = Aggregations( - data=raw_response, aggs=my_agg, index=None, client=None, query=None, + data=raw_response, aggs=my_agg, index=None, client=None, query=None ).to_tabular(index_orient=True, expand_sep=" || ") self.assertEqual(index_names, []) diff --git a/tests/test_search.py b/tests/test_search.py index 65eacd2e..05cb1af0 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -294,7 +294,7 @@ def test_source_on_clone(self): { "_source": {"includes": ["foo.bar.*"], "excludes": ["foo.one"]}, "query": { - "bool": {"filter": [{"term": {"title": {"value": "python"}}}],} + "bool": {"filter": [{"term": {"title": {"value": "python"}}}]} }, }, Search() @@ -307,7 +307,7 @@ def test_source_on_clone(self): { "_source": False, "query": { - "bool": {"filter": [{"term": {"title": {"value": "python"}}}],} + "bool": {"filter": [{"term": {"title": {"value": "python"}}}]} }, }, Search().source(False).filter("term", title="python").to_dict(), diff --git a/tests/tree/aggs/test_aggs.py b/tests/tree/aggs/test_aggs.py index 52b3ab05..32746fe2 100644 --- a/tests/tree/aggs/test_aggs.py +++ b/tests/tree/aggs/test_aggs.py @@ -111,8 +111,7 @@ def test_add_node_with_mapping(self): # try to add field aggregation on non-existing field will fail with self.assertRaises(AbsentMappingFieldError): with_mapping.aggs( - Terms("imaginary_agg", field="imaginary_field"), - insert_below="workflow", + Terms("imaginary_agg", field="imaginary_field"), insert_below="workflow" ) self.assertEqual(len(with_mapping.list()), 1) @@ -370,7 +369,7 @@ def test_insert_tree_without_mapping(self): "interval": "1w", } } - }, + } ) self.assertEqual({n.identifier for n in initial_agg_1.list()}, {"week"}) @@ -545,7 +544,7 @@ def test_aggs_at_root(self): "one": { "terms": {"field": "terms_one"}, "aggs": {"two": {"terms": {"field": "terms_two"}}}, - }, + } }, ) @@ -707,7 +706,7 @@ def test_groupby_at_root(self): "two": { "terms": {"field": "terms_two"}, "aggs": {"one": {"terms": {"field": "terms_one"}}}, - }, + } }, ) @@ -723,7 +722,7 @@ def test_groupby_at_root(self): "one": { "terms": {"field": "terms_one"}, "aggs": {"two": {"terms": {"field": "terms_two"}}}, - }, + } }, ) diff --git a/tests/tree/query/test_query.py b/tests/tree/query/test_query.py index e1644615..73c694d2 100644 --- a/tests/tree/query/test_query.py +++ b/tests/tree/query/test_query.py @@ -264,9 +264,7 @@ def test_not_possible_parent_child(self): # ABOVE non-existing with self.assertRaises(ValueError) as e: initial_q.must( - {"term": {"new_field": 2}}, - child="not_existing_node", - _name="somewhere", + {"term": {"new_field": 2}}, child="not_existing_node", _name="somewhere" ) self.assertEqual( e.exception.args, @@ -275,9 +273,7 @@ def test_not_possible_parent_child(self): with self.assertRaises(ValueError) as e: initial_q.must( - {"term": {"new_field": 2}}, - child="not_existing_node", - _name="somewhere", + {"term": {"new_field": 2}}, child="not_existing_node", _name="somewhere" ) self.assertEqual( e.exception.args, @@ -432,7 +428,7 @@ def test_replace_existing_bool(self): def test_must_at_root(self): q_i1 = Query() q1 = q_i1.must( - Term(field="some_field", value=2, _name="term_nid"), _name="bool_nid", + Term(field="some_field", value=2, _name="term_nid"), _name="bool_nid" ) self.assertEqual(len(q_i1.list()), 0) bool_ = next((c for c in q1.list() if isinstance(c, BoolNode)))