diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 00000000..5c67cc32 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @alk-lbinet diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-2-tests.yml similarity index 63% rename from .github/workflows/python-package.yml rename to .github/workflows/python-2-tests.yml index e1ad68ea..16ab3112 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-2-tests.yml @@ -1,7 +1,7 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions -name: Python package +name: Python 2 Tests on: push: @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [2.7, 3.5, 3.6, 3.7, 3.8] + python-version: [2.7] steps: - uses: actions/checkout@v2 @@ -26,14 +26,14 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install flake8 pytest + pip install -r requirements-test-2.txt python setup.py develop - name: Lint with flake8 run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + # ignore "line break before binary operator", and "invalid escape sequence '\_'" useful for doc + flake8 --count --ignore=W503,W605 --show-source --statistics pandagg + # on tests, more laxist: allow "missing whitespace after ','" and "line too long" + flake8 --count --ignore=W503,W605,E231,E501 --show-source --statistics tests - name: Test with pytest run: | pytest diff --git a/.github/workflows/python-3-tests.yml b/.github/workflows/python-3-tests.yml new file mode 100644 index 00000000..2560900c --- /dev/null +++ b/.github/workflows/python-3-tests.yml @@ -0,0 +1,39 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python 3 Tests + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.5, 3.6, 3.7, 3.8] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-test.txt + python setup.py develop + - name: Lint with flake8 + run: | + # ignore "line break before binary operator", and "invalid escape sequence '\_'" useful for doc + flake8 --count --ignore=W503,W605 --show-source --statistics pandagg + # on tests, more laxist: allow "missing whitespace after ','" and "line too long" + flake8 --count --ignore=W503,W605,E231,E501 --show-source --statistics tests + - name: Test with pytest + run: | + pytest diff --git a/Jenkinsfile b/Jenkinsfile deleted file mode 100644 index 5d1b1bdd..00000000 --- a/Jenkinsfile +++ /dev/null @@ -1,17 +0,0 @@ -Map config = [ - "app_root": 
"pandagg", - "ut_create_database": false, - "ut_push_config": false, - "ut_check_coverage": true, - "ut_python_3": true, - "ut_on_each_pr": true -] -timestamps { - node("master") { - fileLoader.withGit('git@github.com:alkemics/lib-groovy-jenkins.git', 'master', 'github-read', '') { - workflow = fileLoader.load("Workflow") - } - } - - workflow.launch(config) -} diff --git a/LICENCE b/LICENSE similarity index 100% rename from LICENCE rename to LICENSE diff --git a/Makefile b/Makefile index 039bad93..d7525c3b 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY : develop check clean clean_pyc doc lint-diff black doc-references coverage +.PHONY : develop check clean clean_pyc doc lint lint-diff black doc-references coverage clean: -python setup.py clean @@ -10,6 +10,12 @@ clean_pyc: lint-diff: git diff upstream/master --name-only -- "*.py" | xargs flake8 +lint: + # ignore "line break before binary operator", and "invalid escape sequence '\_'" useful for doc + flake8 --count --ignore=W503,W605 --show-source --statistics pandagg + # on tests, more laxist: allow "missing whitespace after ','" and "line too long" + flake8 --count --ignore=W503,W605,E231,E501 --show-source --statistics tests + black: black examples docs pandagg tests setup.py diff --git a/README.md b/README.md index 7c8b66ad..626e8c4a 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,8 @@ +[![PyPI Latest Release](https://img.shields.io/pypi/v/pandagg.svg)](https://pypi.org/project/pandagg/) +[![License](https://img.shields.io/pypi/l/pandagg.svg)](https://github.com/alkemics/pandagg/blob/master/LICENSE) +![Python package](https://github.com/alkemics/pandagg/workflows/Python%20package/badge.svg) + + ## What is it? **pandagg** is a Python package providing a simple interface to manipulate ElasticSearch queries and aggregations. 
Its goal is to make it @@ -50,7 +55,7 @@ All contributions, bug reports, bug fixes, documentation improvements, enhanceme ## Roadmap -- implement CI workflow: python2/3 tests, coverage +- improve CI workflow with coverage report - on aggregation `nodes`, ensure all allowed `fields` are listed - expand functionalities: proper ORM similar to elasticsearch-dsl Document classes, index managing operations - package versions for different ElasticSearch versions diff --git a/docs/requirements.txt b/docs/requirements.txt index 1411a4a0..fe471a7a 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1 +1,3 @@ -pandas \ No newline at end of file +pandas +sphinx_rtd_theme +recommonmark diff --git a/docs/source/user-guide.rst b/docs/source/user-guide.rst index bf03bee1..bcba162f 100644 --- a/docs/source/user-guide.rst +++ b/docs/source/user-guide.rst @@ -124,12 +124,10 @@ In the flattened syntax, the query clause type is used as first argument: Query enrichment ================ -query() method --------------- +All methods described below return a new :class:`~pandagg.tree.query.abstract.Query` instance, and keep unchanged the +initial query. -The base method to enrich a :class:`~pandagg.tree.query.abstract.Query` is :func:`~pandagg.tree.query.abstract.Query.query`. - -It returns a new instance, and keep unchanged the initial query: +For instance: >>> from pandagg.query import Query >>> initial_q = Query() @@ -150,8 +148,11 @@ It returns a new instance, and keep unchanged the initial query: None -Available syntaxes -^^^^^^^^^^^^^^^^^^ +query() method +-------------- + +The base method to enrich a :class:`~pandagg.tree.query.abstract.Query` is :func:`~pandagg.tree.query.abstract.Query.query`. 
+ Considering this query: @@ -160,23 +161,18 @@ Considering this query: :func:`~pandagg.tree.query.abstract.Query.query` accepts following syntaxes: -from dictionnary -"""""""""""""""" +from dictionary:: >>> q.query({"terms": {"genres": ['Comedy', 'Short']}) -flattened syntax -"""""""""""""""" +flattened syntax:: >>> q.query("terms", genres=['Comedy', 'Short']) -from Query instance -""""""""""""""""""" - -This includes DSL classes: +from Query instance (this includes DSL classes):: >>> from pandagg.query import Terms >>> q.query(Terms(genres=['Action', 'Thriller'])) diff --git a/pandagg/_decorators.py b/pandagg/_decorators.py new file mode 100644 index 00000000..cf086f1f --- /dev/null +++ b/pandagg/_decorators.py @@ -0,0 +1,95 @@ +from textwrap import dedent + + +# Substitution and Appender are copied from pandas.util._decorators +# https://github.com/pandas-dev/pandas/blob/master/LICENSE + + +class Substitution: + """ + A decorator to take a function's docstring and perform string + substitution on it. + + This decorator should be robust even if func.__doc__ is None + (for example, if -OO was passed to the interpreter) + + Usage: construct a docstring.Substitution with a sequence or + dictionary suitable for performing substitution; then + decorate a suitable function with the constructed object. e.g. + + sub_author_name = Substitution(author='Jason') + + @sub_author_name + def some_function(x): + "%(author)s wrote this function" + + # note that some_function.__doc__ is now "Jason wrote this function" + + One can also use positional arguments. 
+ + sub_first_last_names = Substitution('Edgar Allen', 'Poe') + + @sub_first_last_names + def some_function(x): + "%s %s wrote the Raven" + """ + + def __init__(self, *args, **kwargs): + if args and kwargs: + raise AssertionError("Only positional or keyword args are allowed") + + self.params = args or kwargs + + def __call__(self, func): + func.__doc__ = func.__doc__ and func.__doc__ % self.params + return func + + def update(self, *args, **kwargs): + """ + Update self.params with supplied args. + """ + + if isinstance(self.params, dict): + self.params.update(*args, **kwargs) + + +class Appender: + """ + A function decorator that will append an addendum to the docstring + of the target function. + + This decorator should be robust even if func.__doc__ is None + (for example, if -OO was passed to the interpreter). + + Usage: construct a docstring.Appender with a string to be joined to + the original docstring. An optional 'join' parameter may be supplied + which will be used to join the docstring and addendum. e.g. 
+ + add_copyright = Appender("Copyright (c) 2009", join='\n') + + @add_copyright + def my_dog(has='fleas'): + "This docstring will have a copyright below" + pass + """ + + def __init__(self, addendum, join="", indents=0): + if indents > 0: + self.addendum = indent(addendum, indents=indents) + else: + self.addendum = addendum + self.join = join + + def __call__(self, func): + func.__doc__ = func.__doc__ if func.__doc__ else "" + self.addendum = self.addendum if self.addendum else "" + docitems = [func.__doc__, self.addendum] + func.__doc__ = dedent(self.join.join(docitems)) + return func + + +def indent(text, indents=1): + if not text or not isinstance(text, str): + return "" + jointext = "".join(["\n"] + [" "] * indents) + return jointext.join(text.split("\n")) diff --git a/pandagg/node/query/compound.py b/pandagg/node/query/compound.py index 3e834bc6..d974592f 100644 --- a/pandagg/node/query/compound.py +++ b/pandagg/node/query/compound.py @@ -2,7 +2,9 @@ class CompoundClause(QueryClause): - """Compound clauses can encapsulate other query clauses:: + """Compound clauses can encapsulate other query clauses: + + .. 
code-block:: { "" : { @@ -10,6 +12,7 @@ class CompoundClause(QueryClause): } } + """ _default_operator = None diff --git a/pandagg/response.py b/pandagg/response.py index 36cfff4a..b0126302 100644 --- a/pandagg/response.py +++ b/pandagg/response.py @@ -363,7 +363,7 @@ def to_dataframe( index = (None,) * len(index) else: index = pd.MultiIndex.from_tuples(index, names=index_names) - return pd.DataFrame(index=index, data=values) + return pd.DataFrame(index=index, data=list(values)) def to_normalized(self): children = [] diff --git a/pandagg/tree/query/abstract.py b/pandagg/tree/query/abstract.py index 136ae02c..3ce6769c 100644 --- a/pandagg/tree/query/abstract.py +++ b/pandagg/tree/query/abstract.py @@ -7,6 +7,7 @@ from future.utils import python_2_unicode_compatible, iteritems, string_types +from pandagg._decorators import Substitution from pandagg.node.query._parameter_clause import ParentParameterClause from pandagg.node.query.abstract import QueryClause, LeafQueryClause from pandagg.node.query.compound import CompoundClause, Bool as BoolNode @@ -19,6 +20,32 @@ REPLACE = "replace" REPLACE_ALL = "replace_all" +sub_insertion = Substitution( + insertion_doc=""" + * *parent* (``str``) -- + named query clause under which the inserted clauses should be placed. + + * *parent_param* (``str`` optional parameter when using *parent* param) -- + parameter under which inserted clauses will be placed. For instance if *parent* clause is a boolean, can be + 'must', 'filter', 'should', 'must_not'. + + * *child* (``str``) -- + named query clause above which the inserted clauses should be placed. + + * *child_param* (``str`` optional parameter when using *child* param) -- + parameter of inserted boolean clause under which child clauses will be placed. For instance if inserted clause + is a boolean, can be 'must', 'filter', 'should', 'must_not'. + + * *mode* (``str`` one of 'add', 'replace', 'replace_all') -- + merging strategy when inserting clauses on an existing compound clause. 
+ + - 'add' (default) : adds new clauses keeping initial ones + - 'replace' : for each parameter (for instance in 'bool' case : 'filter', 'must', 'must_not', 'should'), + replace existing clauses under this parameter, by new ones only if declared in inserted compound query + - 'replace_all' : existing compound clause is completely replaced by the new one +""" +) + @python_2_unicode_compatible class Query(Tree): @@ -167,7 +194,10 @@ def applied_nested_path_at_node(self, nid): return None def to_dict(self, from_=None, with_name=True): - """Return None if no query clause. + r"""Serialize query as native dict. + :param from\_: optional, + :param with_name: optional + :return: serialized query as dict, or None if query has no clause. """ if self.root is None: return None @@ -199,7 +229,37 @@ def to_dict(self, from_=None, with_name=True): return {node.KEY: serialized_children} return {node.KEY: serialized_children[0]} + @sub_insertion def query(self, *args, **kwargs): + r"""Insert new clause(s) in current query. + + Inserted clause accepts the following syntaxes. 
+ + Given an empty query: + + >>> from pandagg.query import Query + >>> q = Query() + + flat syntax: clause type, followed by query clause body as keyword arguments: + + >>> q.query('term', some_field=23) + {'term': {'some_field': 23}} + + from regular Elasticsearch dict query: + + >>> q.query({'term': {'some_field': 23}}) + {'term': {'some_field': 23}} + + using pandagg DSL: + + >>> from pandagg.query import Term + >>> q.query(Term(some_field=23)) + {'term': {'some_field': 23}} + + :Keyword Arguments: + %(insertion_doc)s + + """ mode = kwargs.pop("mode", ADD) parent = kwargs.pop("parent", None) parent_param = kwargs.pop("parent_param", None) diff --git a/requirements-test-2.txt b/requirements-test-2.txt new file mode 100644 index 00000000..4a1ec43a --- /dev/null +++ b/requirements-test-2.txt @@ -0,0 +1,7 @@ +flake8 +coverage +pytest +# last mock version compatible with Python 2 (constraint will be dropped when Python 2 support is removed) +mock<=3.0.5 +# likewise, pin pandas to a version compatible with Python 2 +pandas<=0.23.1 \ No newline at end of file diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 00000000..43a961d8 --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1,5 @@ +flake8 +coverage +pytest +mock +pandas \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index d8e802df..c65e833e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,6 @@ +[aliases] +test=pytest + [flake8] max-line-length = 120 ignore = diff --git a/setup.py b/setup.py index 8abee1c9..3bc24707 100644 --- a/setup.py +++ b/setup.py @@ -12,22 +12,13 @@ here = os.path.abspath(os.path.dirname(__file__)) README = open(os.path.join(here, "README.md")).read() -tests_require = [ - "coverage", - "pytest", - "mock", - "pandas", -] - install_requires = [ "six", "future", - "lighttree==0.0.6", + "lighttree==0.0.8", "elasticsearch>=7.0.0,<8.0.0", ] -extras_require = {"test": tests_require, "pandas": ["pandas"]} - setup( name="pandagg", @@ -44,6 +35,5 @@ test_suite="pandagg.tests", 
zip_safe=False, install_requires=install_requires, - tests_require=tests_require, - extras_require=extras_require, + setup_requires=["pytest-runner"], ) diff --git a/tests/test_response.py b/tests/test_response.py index ac50c8bf..6d3736f6 100644 --- a/tests/test_response.py +++ b/tests/test_response.py @@ -160,11 +160,6 @@ def test_parse_as_tabular(self): "avg_nb_classes": 18.71, "doc_count": 128, }, - ("multilabel", "gpc"): { - "avg_f1_micro": 0.95, - "avg_nb_classes": 183.21, - "doc_count": 119, - }, ("multilabel", "preservationmethods"): { "avg_f1_micro": 0.8, "avg_nb_classes": 9.97, @@ -205,13 +200,6 @@ def test_parse_as_tabular(self): "doc_count": 128, "global_metrics.field.name": "ispracticecompatible", }, - { - "avg_f1_micro": 0.95, - "avg_nb_classes": 183.21, - "classification_type": "multilabel", - "doc_count": 119, - "global_metrics.field.name": "gpc", - }, { "avg_f1_micro": 0.8, "avg_nb_classes": 9.97, @@ -304,11 +292,6 @@ def test_parse_as_dataframe(self): "avg_nb_classes": 206.5, "doc_count": 370, }, - ("multilabel", "gpc"): { - "avg_f1_micro": 0.95, - "avg_nb_classes": 183.21, - "doc_count": 119, - }, ("multilabel", "ispracticecompatible"): { "avg_f1_micro": 0.72, "avg_nb_classes": 18.71, diff --git a/tests/testing_samples/data_sample.py b/tests/testing_samples/data_sample.py index f1a4bfec..4740e2eb 100644 --- a/tests/testing_samples/data_sample.py +++ b/tests/testing_samples/data_sample.py @@ -80,12 +80,6 @@ def get_node_hierarchy(): "doc_count": 128, "key": "ispracticecompatible", }, - { - "avg_f1_micro": {"value": 0.95}, - "avg_nb_classes": {"value": 183.21}, - "doc_count": 119, - "key": "gpc", - }, { "avg_f1_micro": {"value": 0.80}, "avg_nb_classes": {"value": 9.97}, @@ -135,9 +129,6 @@ def get_node_hierarchy(): │ ├── avg_f1_micro 0.89 │ └── avg_nb_classes 206.5 └── classification_type=multilabel 1797 - ├── global_metrics.field.name=gpc 119 - │ ├── avg_f1_micro 0.95 - │ └── avg_nb_classes 183.21 ├── global_metrics.field.name=ispracticecompatible 
128 │ ├── avg_f1_micro 0.72 │ └── avg_nb_classes 18.71 @@ -162,15 +153,6 @@ def get_node_hierarchy(): "level": "global_metrics.field.name", "value": 128, }, - { - "children": [ - {"key": None, "level": "avg_nb_classes", "value": 183.21}, - {"key": None, "level": "avg_f1_micro", "value": 0.95}, - ], - "key": "gpc", - "level": "global_metrics.field.name", - "value": 119, - }, { "children": [ {"key": None, "level": "avg_nb_classes", "value": 9.97}, diff --git a/tests/tree/query/test_query.py b/tests/tree/query/test_query.py index d7ca27dc..e1644615 100644 --- a/tests/tree/query/test_query.py +++ b/tests/tree/query/test_query.py @@ -6,7 +6,6 @@ from mock import patch from pandagg.node.query._parameter_clause import Must -from pandagg.node.query.joining import Nested from pandagg.query import Query, Range, Prefix, Ids, Term, Terms, Nested from pandagg.node.query.term_level import Term as TermNode, Exists as ExistsNode from pandagg.node.query.joining import Nested as NestedNode diff --git a/tests/tree/test_response.py b/tests/tree/test_response.py index 52774bc4..849d5b6a 100644 --- a/tests/tree/test_response.py +++ b/tests/tree/test_response.py @@ -19,9 +19,9 @@ def test_response_tree(self, uuid_mock): sample.ES_AGG_RESPONSE ) self.assertEqual(response_tree.__str__(), sample.EXPECTED_RESPONSE_TREE_REPR) - self.assertEqual(len(response_tree.list()), 18) + self.assertEqual(len(response_tree.list()), 15) - multilabel_gpc_bucket = next( + multiclass_gpc_bucket = next( ( b for b in response_tree.list() @@ -31,11 +31,11 @@ def test_response_tree(self, uuid_mock): # bucket properties will give parents levels and keys self.assertEqual( - response_tree.bucket_properties(multilabel_gpc_bucket), + response_tree.bucket_properties(multiclass_gpc_bucket), OrderedDict( [ ("global_metrics.field.name", "gpc"), - ("classification_type", "multilabel"), + ("classification_type", "multiclass"), ] ), ) @@ -64,12 +64,12 @@ def test_client_bound_response(self, uuid_mock): 
self.assertIn("classification_type_multiclass", dir(response)) self.assertIn("classification_type_multilabel", dir(response)) - multilabel = response.classification_type_multilabel - self.assertIsInstance(multilabel, IResponse) - self.assertIs(multilabel._initial_tree, response._tree) + multiclass = response.classification_type_multiclass + self.assertIsInstance(multiclass, IResponse) + self.assertIs(multiclass._initial_tree, response._tree) - self.assertIn("global_metrics_field_name_gpc", dir(multilabel)) - gpc = multilabel.global_metrics_field_name_gpc + self.assertIn("global_metrics_field_name_gpc", dir(multiclass)) + gpc = multiclass.global_metrics_field_name_gpc self.assertIsInstance(gpc, IResponse) self.assertIs(gpc._initial_tree, response._tree) @@ -81,7 +81,7 @@ def test_client_bound_response(self, uuid_mock): "bool": { "must": [ {"term": {"global_metrics.field.name": {"value": "gpc"}}}, - {"term": {"classification_type": {"value": "multilabel"}}}, + {"term": {"classification_type": {"value": "multiclass"}}}, {"term": {"some_field": {"value": 1}}}, ] }