alkemics · leonardbinet · Feb 14, 2022 · Feb 14, 2022 · Feb 16, 2022 · Mar 8, 2022
diff --git a/.github/workflows/python-3-tests.yml b/.github/workflows/python-3-tests.yml
@@ -3,9 +3,9 @@ name: Python 3 Tests
 
 on:
   push:
-    branches: [ master ]
+    branches: [ master, dev ]
   pull_request:
-    branches: [ master ]
+    branches: [ master, dev ]
 
 jobs:
   static_analysis:
@@ -32,6 +32,10 @@ jobs:
         pip install mypy
         pip install -e ".[develop]"
         mypy --install-types --non-interactive pandagg
+    - name: Isort check
+      run: |
+        pip install isort
+        isort pandagg examples tests -c
 
   run_tests:
     runs-on: ubuntu-latest

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -17,3 +17,9 @@ repos:
           pass_filenames: false
           language: system
           types: [ python ]
+  -   repo: https://github.com/pycqa/isort
+      rev: 5.10.1
+      hooks:
+        - id: isort
+          name: isort (python)
+          args: ["--profile", "black", "--filter-files"]
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,34 @@
+
+# Change Log
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](http://keepachangelog.com/)
+and this project adheres to [Semantic Versioning](http://semver.org/).
+
+## [Unreleased] - 2022-02-11
+
+### Added
+- Use isort to automatically sort imports
+
+### Changed
+
+### Fixed
+
+## [0.2.4] - 2022-02-11
+
+Introduction of the repository changelog.
+
+### Added
+
+- GitHub Actions now run on pushes and pull requests targeting the `dev` branch ([#113](https://github.com/alkemics/pandagg/pull/113)).
+- `match_all` and `match_none` query clauses ([#103](https://github.com/alkemics/pandagg/issues/103#issuecomment-1040425685), [#112](https://github.com/alkemics/pandagg/pull/112)).
+
+### Changed
+
+- Handle deprecation warnings introduced in [elasticsearch-py](https://github.com/elastic/elasticsearch-py/issues/1698) ([#109](https://github.com/alkemics/pandagg/pull/109)).
+- Improved the IMDB and NY-restaurants examples: each can now be ingested into a client cluster with a single command ([#116](https://github.com/alkemics/pandagg/pull/116)).
+
+### Fixed
+
+- Fix aggregation scan via composite aggregation: the first batch was not yielded ([#101](https://github.com/alkemics/pandagg/issues/101), [#110](https://github.com/alkemics/pandagg/pull/110)).
+- Fix search scan by allowing parameters to be passed ([#103](https://github.com/alkemics/pandagg/issues/103#issuecomment-1040445479), [#111](https://github.com/alkemics/pandagg/pull/111)).
diff --git a/Makefile b/Makefile
@@ -13,14 +13,18 @@ lint-diff:
 	git diff upstream/master --name-only -- "*.py" | xargs flake8
 
 lint:
-	# ignore "line break before binary operator", and "invalid escape sequence '\_'" useful for doc
-	flake8 --count --ignore=W503,W605 --show-source --statistics pandagg
+	flake8 --count --show-source --statistics pandagg
 	# on tests, more laxist: allow "missing whitespace after ','" and "line too long"
 	flake8 --count --ignore=W503,W605,E231,E501 --show-source --statistics tests
 
 black:
 	black examples docs pandagg tests setup.py
 
+isort:
+	isort examples docs pandagg tests setup.py
+
+format: isort black lint
+
 develop:
 	-python -m pip install -e ".[develop]"
 

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -19,14 +19,14 @@
 
 # -- Project information -----------------------------------------------------
 
-project = u"pandagg"
-copyright = u"2020, Léonard Binet"
-author = u"Léonard Binet"
+project = "pandagg"
+copyright = "2020, Léonard Binet"
+author = "Léonard Binet"
 
 # The short X.Y version
-version = u""
+version = ""
 # The full version, including alpha/beta/rc tags
-release = u"0.1"
+release = "0.1"
 
 
 # -- General configuration ---------------------------------------------------
@@ -130,15 +130,15 @@
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, "pandagg.tex", u"pandagg Documentation", u"Léonard Binet", "manual")
+    (master_doc, "pandagg.tex", "pandagg Documentation", "Léonard Binet", "manual")
 ]
 
 
 # -- Options for manual page output ------------------------------------------
 
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
-man_pages = [(master_doc, "pandagg", u"pandagg Documentation", [author], 1)]
+man_pages = [(master_doc, "pandagg", "pandagg Documentation", [author], 1)]
 
 
 # -- Options for Texinfo output ----------------------------------------------
@@ -150,7 +150,7 @@
     (
         master_doc,
         "pandagg",
-        u"pandagg Documentation",
+        "pandagg Documentation",
         author,
         "pandagg",
         "One line description of project.",

diff --git a/examples/NY-restaurants/ingest.py b/examples/NY-restaurants/ingest.py
@@ -3,11 +3,11 @@
 """Script that downloads a public dataset and streams it to an Elasticsearch cluster"""
 
 import csv
-from os.path import abspath, join, dirname, exists
+from os.path import abspath, dirname, exists, join
+
 import urllib3
 from elasticsearch import Elasticsearch
-
-from pandagg.index import DeclarativeIndex
+from model import NYCRestaurants
 
 NYC_RESTAURANTS = (
     "https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv?accessType=DOWNLOAD"
@@ -16,22 +16,6 @@
 CHUNK_SIZE = 16384
 
 
-class NYCRestaurants(DeclarativeIndex):
-    name = "nyc-restaurants"
-    mappings = {
-        "properties": {
-            "name": {"type": "text"},
-            "borough": {"type": "keyword"},
-            "cuisine": {"type": "keyword"},
-            "grade": {"type": "keyword"},
-            "score": {"type": "integer"},
-            "location": {"type": "geo_point"},
-            "inspection_date": {"type": "date", "format": "MM/dd/yyyy"},
-        }
-    }
-    settings = {"number_of_shards": 1}
-
-
 def download_dataset():
     """Downloads the public dataset if not locally downloaded
     and returns the number of rows are in the .csv file.

diff --git a/examples/NY-restaurants/model.py b/examples/NY-restaurants/model.py
@@ -0,0 +1,31 @@
+from pandagg.document import DocumentSource
+from pandagg.index import DeclarativeIndex
+from pandagg.mappings import Date, GeoPoint, Integer, Keyword, Text
+
+
+class Inspection(DocumentSource):
+    name = Text()
+    borough = Keyword()
+    cuisine = Keyword()
+    grade = Keyword()
+    score = Integer()
+    location = GeoPoint()
+    inspection_date = Date(format="MM/dd/yyyy")
+
+
+class NYCRestaurants(DeclarativeIndex):
+    name = "nyc-restaurants"
+    document = Inspection
+    # Note: the "mappings" attribute takes precedence over the "document" attribute in the mappings definition
+    mappings = {
+        "properties": {
+            "name": {"type": "text"},
+            "borough": {"type": "keyword"},
+            "cuisine": {"type": "keyword"},
+            "grade": {"type": "keyword"},
+            "score": {"type": "integer"},
+            "location": {"type": "geo_point"},
+            "inspection_date": {"type": "date", "format": "MM/dd/yyyy"},
+        }
+    }
+    settings = {"number_of_shards": 1}
diff --git a/examples/imdb/README.md b/examples/imdb/README.md
@@ -21,7 +21,7 @@ The index should provide good performances trying to answer these kind question
 
 
 ## Data source
-I exported following SQL tables from MariaDB [following these instructions](https://relational.fit.cvut.cz/dataset/IMDb).
+Source dataset: <https://relational.fit.cvut.cz/dataset/IMDb>
 
 Relational schema is the following:
 
@@ -98,66 +98,21 @@ _
 ## Steps to start playing with your index
 
 
-You can either directly use the demo index available [here]('https://beba020ee88d49488d8f30c163472151.eu-west-2.aws.cloud.es.io:9243/')
-with credentials user: `pandagg`, password: `pandagg`:
+Follow the steps below to install it yourself locally.
 
-Access it with following client instantiation:
-```
-from elasticsearch import Elasticsearch
-client = Elasticsearch(
-    hosts=['https://beba020ee88d49488d8f30c163472151.eu-west-2.aws.cloud.es.io:9243/'],
-    http_auth=('pandagg', 'pandagg')
-)
-```
-
-
-Or follow below steps to install it yourself locally.
-In this case, you can either generate yourself the files, or download them from [here](https://drive.google.com/file/d/1po3T18l9QoYxPEGh-iKV4oN3DslWGu8-/view?usp=sharing) (file md5 `b363dee23720052501e24d15361ed605`).
-
-#### Dump tables
-Follow instruction on bottom of https://relational.fit.cvut.cz/dataset/IMDb page and dump following tables in a
-directory:
-- movies.csv
-- movies_genres.csv
-- movies_directors.csv
-- directors.csv
-- directors_genres.csv
-- roles.csv
-- actors.csv
-
-#### Clone pandagg and setup environment
 ```
+# clone repo
 git clone git@github.com:alkemics/pandagg.git
 cd pandagg
 
+# create and activate your virtual environment using virtualenv or any similar tool
 virtualenv env
-python setup.py develop
-pip install pandas simplejson jupyter seaborn
-```
-Then copy `conf.py.dist` file into `conf.py` and edit variables as suits you, for instance:
-```
-# your cluster address
-ES_HOST = 'localhost:9200'
+source env/bin/activate
 
-# where your table dumps are stored, and where serialized output will be written
-DATA_DIR = '/path/to/dumps/'
-OUTPUT_FILE_NAME = 'serialized.json'
-```
-
-#### Serialize movie documents and insert them
-
-```
-# generate serialized movies documents, ready to be inserted in ES
-# can take a while
-python examples/imdb/serialize.py
+# install dependencies for this example
+make develop
+pip install pandas simplejson mysqlclient mariadb
 
-# create index with mappings if necessary, bulk insert documents in ES
-python examples/imdb/load.py
+# run ingestion script (type `python examples/imdb/ingest.py --help` for options)
+python examples/imdb/ingest.py
 ```
-
-
-#### Explore pandagg notebooks
-
-An example notebook is available to showcase some of `pandagg` functionalities: [here it is](https://gistpreview.github.io/?4cedcfe49660cd6757b94ba491abb95a).
-
-Code is present in `examples/imdb/IMDB exploration.py` file.
diff --git a/examples/imdb/conf.py.dist b/examples/imdb/conf.py.dist