diff --git a/.github/workflows/python-3-tests.yml b/.github/workflows/python-3-tests.yml index 60f3824c..25e6286a 100644
--- a/.github/workflows/python-3-tests.yml
+++ b/.github/workflows/python-3-tests.yml
@@ -3,9 +3,9 @@ name: Python 3 Tests
 
 on:
   push:
-    branches: [ master ]
+    branches: [ master, dev ]
   pull_request:
-    branches: [ master ]
+    branches: [ master, dev ]
 
 jobs:
   static_analysis:
@@ -32,6 +32,10 @@ jobs:
         pip install mypy
         pip install -e ".[develop]"
         mypy --install-types --non-interactive pandagg
+    - name: Isort check
+      run: |
+        pip install isort
+        isort pandagg examples tests -c
 
   run_tests:
     runs-on: ubuntu-latest
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ea40e16b..3a473966 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,3 +17,9 @@ repos:
         pass_filenames: false
         language: system
         types: [ python ]
+  - repo: https://github.com/pycqa/isort
+    rev: 5.10.1
+    hooks:
+      - id: isort
+        name: isort (python)
+        args: ["--profile", "black", "--filter-files"]
diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..af97dfa8
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,34 @@
+
+# Change Log
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](http://keepachangelog.com/)
+and this project adheres to [Semantic Versioning](http://semver.org/).
+
+## [Unreleased] - 2022-02-11
+
+### Added
+- Use isort to automatically sort imports.
+
+### Changed
+
+### Fixed
+
+## [0.2.4] - 2022-02-11
+
+Introduction of the repository changelog.
+
+### Added
+
+- GitHub actions run when pushing changes or opening a pull request to the `dev` branch ([#113](https://github.com/alkemics/pandagg/pull/113)).
+- `match_all` and `match_none` query clauses ([#103](https://github.com/alkemics/pandagg/issues/103#issuecomment-1040425685), [#112](https://github.com/alkemics/pandagg/pull/112)).
+
+### Changed
+
+- Handle deprecation warnings introduced in [elasticsearch-py](https://github.com/elastic/elasticsearch-py/issues/1698) ([#109](https://github.com/alkemics/pandagg/pull/109)).
+- Improved IMDB and NY-restaurants examples: they can now be ingested into a client cluster with a single command line ([#116](https://github.com/alkemics/pandagg/pull/116)).
+
+### Fixed
+
+- Fix aggregation scan via composite aggregation: the first batch was not yielded ([#101](https://github.com/alkemics/pandagg/issues/101), [#110](https://github.com/alkemics/pandagg/pull/110)).
+- Fix search scan by allowing extra parameters to be passed to the underlying `scan` helper ([#103](https://github.com/alkemics/pandagg/issues/103#issuecomment-1040445479), [#111](https://github.com/alkemics/pandagg/pull/111)).
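The `match_all` / `match_none` changelog entry above refers to the query clauses added later in this diff (`pandagg/node/query/match_all.py`, re-exported from `pandagg.query`). A minimal usage sketch, assuming `Search.query` accepts clause instances the way the rest of this diff suggests:

```python
from pandagg.query import MatchAll, MatchNone
from pandagg.search import Search

# MatchAll matches every document, MatchNone matches none of them.
everything = Search().index("movies").query(MatchAll())
nothing = Search().index("movies").query(MatchNone())

# The query part is expected to serialize to {"match_all": {}} / {"match_none": {}}.
print(everything.to_dict())
print(nothing.to_dict())
```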
diff --git a/Makefile b/Makefile index b08d7777..af1e33a4 100644 --- a/Makefile +++ b/Makefile @@ -13,14 +13,18 @@ lint-diff: git diff upstream/master --name-only -- "*.py" | xargs flake8 lint: - # ignore "line break before binary operator", and "invalid escape sequence '\_'" useful for doc - flake8 --count --ignore=W503,W605 --show-source --statistics pandagg + flake8 --count --show-source --statistics pandagg # on tests, more laxist: allow "missing whitespace after ','" and "line too long" flake8 --count --ignore=W503,W605,E231,E501 --show-source --statistics tests black: black examples docs pandagg tests setup.py +isort: + isort examples docs pandagg tests setup.py + +format: isort black lint + develop: -python -m pip install -e ".[develop]" diff --git a/docs/source/conf.py b/docs/source/conf.py index 21fb7ff8..e4add382 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -19,14 +19,14 @@ # -- Project information ----------------------------------------------------- -project = u"pandagg" -copyright = u"2020, Léonard Binet" -author = u"Léonard Binet" +project = "pandagg" +copyright = "2020, Léonard Binet" +author = "Léonard Binet" # The short X.Y version -version = u"" +version = "" # The full version, including alpha/beta/rc tags -release = u"0.1" +release = "0.1" # -- General configuration --------------------------------------------------- @@ -130,7 +130,7 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, "pandagg.tex", u"pandagg Documentation", u"Léonard Binet", "manual") + (master_doc, "pandagg.tex", "pandagg Documentation", "Léonard Binet", "manual") ] @@ -138,7 +138,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [(master_doc, "pandagg", u"pandagg Documentation", [author], 1)] +man_pages = [(master_doc, "pandagg", "pandagg Documentation", [author], 1)] # -- Options for Texinfo output ---------------------------------------------- @@ -150,7 +150,7 @@ ( master_doc, "pandagg", - u"pandagg Documentation", + "pandagg Documentation", author, "pandagg", "One line description of project.", diff --git a/examples/NY-restaurants/ingest.py b/examples/NY-restaurants/ingest.py index 8be76bd3..2b2de25c 100644 --- a/examples/NY-restaurants/ingest.py +++ b/examples/NY-restaurants/ingest.py @@ -3,11 +3,11 @@ """Script that downloads a public dataset and streams it to an Elasticsearch cluster""" import csv -from os.path import abspath, join, dirname, exists +from os.path import abspath, dirname, exists, join + import urllib3 from elasticsearch import Elasticsearch - -from pandagg.index import DeclarativeIndex +from model import NYCRestaurants NYC_RESTAURANTS = ( "https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv?accessType=DOWNLOAD" @@ -16,22 +16,6 @@ CHUNK_SIZE = 16384 -class NYCRestaurants(DeclarativeIndex): - name = "nyc-restaurants" - mappings = { - "properties": { - "name": {"type": "text"}, - "borough": {"type": "keyword"}, - "cuisine": {"type": "keyword"}, - "grade": {"type": "keyword"}, - "score": {"type": "integer"}, - "location": {"type": "geo_point"}, - "inspection_date": {"type": "date", "format": "MM/dd/yyyy"}, - } - } - settings = {"number_of_shards": 1} - - def download_dataset(): """Downloads the public dataset if not locally downloaded and returns the number of rows are in the .csv file. 
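The NY-restaurants ingest script above now imports `NYCRestaurants` from the new `model` module introduced just below. A hypothetical end-to-end sketch (not part of this diff) of how that declaration is meant to be driven, assuming a cluster reachable on localhost:9200:

```python
from elasticsearch import Elasticsearch
from model import NYCRestaurants  # declared in examples/NY-restaurants/model.py below

client = Elasticsearch()
restaurants = NYCRestaurants(client)

if not restaurants.exists():
    restaurants.save()  # create the index with the declared mappings and settings

# Sample rows standing in for the CSV records parsed by the real script.
rows = [
    {"name": "Almighty burger", "borough": "Brooklyn", "cuisine": "Burgers"},
    {"name": "Central perk", "borough": "Manhattan", "cuisine": "Coffee"},
]
restaurants.docs.bulk(
    actions=({"_source": row} for row in rows),
    _op_type_overwrite="index",
).perform()
restaurants.refresh()
```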
diff --git a/examples/NY-restaurants/model.py b/examples/NY-restaurants/model.py new file mode 100644 index 00000000..6451f75e
--- /dev/null
+++ b/examples/NY-restaurants/model.py
@@ -0,0 +1,31 @@
+from pandagg.document import DocumentSource
+from pandagg.index import DeclarativeIndex
+from pandagg.mappings import Date, GeoPoint, Integer, Keyword, Text
+
+
+class Inspection(DocumentSource):
+    name = Text()
+    borough = Keyword()
+    cuisine = Keyword()
+    grade = Keyword()
+    score = Integer()
+    location = GeoPoint()
+    inspection_date = Date(format="MM/dd/yyyy")
+
+
+class NYCRestaurants(DeclarativeIndex):
+    name = "nyc-restaurants"
+    document = Inspection
+    # Note: the "mappings" attribute takes precedence over the "document" attribute in the mappings definition
+    mappings = {
+        "properties": {
+            "name": {"type": "text"},
+            "borough": {"type": "keyword"},
+            "cuisine": {"type": "keyword"},
+            "grade": {"type": "keyword"},
+            "score": {"type": "integer"},
+            "location": {"type": "geo_point"},
+            "inspection_date": {"type": "date", "format": "MM/dd/yyyy"},
+        }
+    }
+    settings = {"number_of_shards": 1}
diff --git a/examples/imdb/README.md b/examples/imdb/README.md index ed694829..62bdd400 100644
--- a/examples/imdb/README.md
+++ b/examples/imdb/README.md
@@ -21,7 +21,7 @@ The index should provide good performances trying to answer these kind question
 
 ## Data source
 
-I exported following SQL tables from MariaDB [following these instructions](https://relational.fit.cvut.cz/dataset/IMDb).
+Data is downloaded from https://relational.fit.cvut.cz/dataset/IMDb.
 
 Relational schema is the following:
 
@@ -98,66 +98,21 @@ _
 
 ## Steps to start playing with your index
 
-You can either directly use the demo index available [here]('https://beba020ee88d49488d8f30c163472151.eu-west-2.aws.cloud.es.io:9243/')
-with credentials user: `pandagg`, password: `pandagg`:
+Follow the steps below to install it locally.
 
-Access it with following client instantiation:
-```
-from elasticsearch import Elasticsearch
-client = Elasticsearch(
-    hosts=['https://beba020ee88d49488d8f30c163472151.eu-west-2.aws.cloud.es.io:9243/'],
-    http_auth=('pandagg', 'pandagg')
-)
-```
-
-
-Or follow below steps to install it yourself locally.
-In this case, you can either generate yourself the files, or download them from [here](https://drive.google.com/file/d/1po3T18l9QoYxPEGh-iKV4oN3DslWGu8-/view?usp=sharing) (file md5 `b363dee23720052501e24d15361ed605`).
- -#### Dump tables -Follow instruction on bottom of https://relational.fit.cvut.cz/dataset/IMDb page and dump following tables in a -directory: -- movies.csv -- movies_genres.csv -- movies_directors.csv -- directors.csv -- directors_genres.csv -- roles.csv -- actors.csv - -#### Clone pandagg and setup environment ``` +# clone repo git clone git@github.com:alkemics/pandagg.git cd pandagg +# create and activate your virtual environment using virtualenv or any similar tool virtualenv env -python setup.py develop -pip install pandas simplejson jupyter seaborn -``` -Then copy `conf.py.dist` file into `conf.py` and edit variables as suits you, for instance: -``` -# your cluster address -ES_HOST = 'localhost:9200' +source env/bin/activate -# where your table dumps are stored, and where serialized output will be written -DATA_DIR = '/path/to/dumps/' -OUTPUT_FILE_NAME = 'serialized.json' -``` - -#### Serialize movie documents and insert them - -``` -# generate serialized movies documents, ready to be inserted in ES -# can take a while -python examples/imdb/serialize.py +# install dependencies for this example +make develop +pip install pandas simplejson mysqlclient mariadb -# create index with mappings if necessary, bulk insert documents in ES -python examples/imdb/load.py +# run ingestion script (type `python examples/imdb/ingest.py --help` for options) +python examples/imdb/ingest.py ``` - - -#### Explore pandagg notebooks - -An example notebook is available to showcase some of `pandagg` functionalities: [here it is](https://gistpreview.github.io/?4cedcfe49660cd6757b94ba491abb95a). - -Code is present in `examples/imdb/IMDB exploration.py` file. diff --git a/examples/imdb/conf.py.dist b/examples/imdb/conf.py.dist deleted file mode 100644 index f0c82beb..00000000 --- a/examples/imdb/conf.py.dist +++ /dev/null @@ -1,10 +0,0 @@ -# your cluster address, if local probably 'localhost:9200' -ES_HOST = '__FILL_ME__' -ES_USE_AUTH = False -# if auth = True: -ES_USER = 'xxx' -ES_PASSWORD = 'xxx' - -# where your table dumps are stored, and where serialized output will be written -DATA_DIR = '__FILL_ME__' -OUTPUT_FILE_NAME = 'serialized.json' diff --git a/examples/imdb/ingest.py b/examples/imdb/ingest.py new file mode 100644 index 00000000..534410a9 --- /dev/null +++ b/examples/imdb/ingest.py @@ -0,0 +1,302 @@ +import argparse +import csv +import json +import logging +import os +from os import path, remove +from os.path import join +from typing import Iterator, List + +import mariadb +import numpy as np +import pandas as pd +import simplejson +from elasticsearch import Elasticsearch + +from examples.imdb.model import Movies +from pandagg.index import Action + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) + + +class NpEncoder(simplejson.JSONEncoder): + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return super(NpEncoder, self).default(obj) + + +def download_table_as_csv( + data_dir: str, overwrite: bool, cursor, table_name: str, column_names: List[str] +): + cursor.execute("SELECT * FROM %s" % table_name) + + file_path = join(data_dir, "%s.csv" % table_name) + if path.exists(file_path): + if overwrite: + remove(file_path) + else: + return + + with open(file_path, "w") as f: + csv_dump = csv.writer(f) + csv_dump.writerow(column_names) + csv_dump.writerows(cursor.fetchall()) + + +def download_tables(data_dir: 
str, overwrite: bool) -> None: + + # Using credentials defined here https://relational.fit.cvut.cz/dataset/IMDb + conn = mariadb.connect( + user="guest", + password="relational", + host="relational.fit.cvut.cz", + port=3306, + database="imdb_ijs", + ) + cursor = conn.cursor() + + download_table_as_csv( + data_dir=data_dir, + overwrite=overwrite, + cursor=cursor, + table_name="directors_genres", + column_names=["director_id", "genre", "prob"], + ) + download_table_as_csv( + data_dir=data_dir, + overwrite=overwrite, + cursor=cursor, + table_name="directors", + column_names=["id", "first_name", "last_name"], + ) + download_table_as_csv( + data_dir=data_dir, + overwrite=overwrite, + cursor=cursor, + table_name="movies_directors", + column_names=["director_id", "movie_id"], + ) + download_table_as_csv( + data_dir=data_dir, + overwrite=overwrite, + cursor=cursor, + table_name="movies_genres", + column_names=["movie_id", "genre"], + ) + download_table_as_csv( + data_dir=data_dir, + overwrite=overwrite, + cursor=cursor, + table_name="movies", + column_names=["id", "name", "year", "rank"], + ) + download_table_as_csv( + data_dir=data_dir, + overwrite=overwrite, + cursor=cursor, + table_name="roles", + column_names=["actor_id", "movie_id", "role"], + ) + download_table_as_csv( + data_dir=data_dir, + overwrite=overwrite, + cursor=cursor, + table_name="actors", + column_names=["id", "first_name", "last_name", "gender"], + ) + + +def serialize_documents(data_dir: str, limit: int) -> None: + # tables imports + reader_kwargs = { + "encoding": "utf-8", + "sep": ",", + "quotechar": '"', + "escapechar": "\\", + } + movies = pd.read_csv( + join(data_dir, "movies.csv"), index_col="id", **reader_kwargs + ).sample(n=limit) + movies_genres = pd.read_csv(join(data_dir, "movies_genres.csv"), **reader_kwargs) + movies_directors = pd.read_csv( + join(data_dir, "movies_directors.csv"), **reader_kwargs + ) + movies_directors = movies_directors[ + movies_directors.movie_id.isin(movies.index.values) + ] + directors = pd.read_csv( + join(data_dir, "directors.csv"), index_col="id", **reader_kwargs + ) + director_genres = pd.read_csv( + join(data_dir, "directors_genres.csv"), **reader_kwargs + ) + roles = pd.read_csv(join(data_dir, "roles.csv"), **reader_kwargs) + roles = roles[roles.movie_id.isin(movies.index.values)] + actors = pd.read_csv(join(data_dir, "actors.csv"), index_col="id", **reader_kwargs) + + # actors + actor_roles = pd.merge(actors, roles, left_index=True, right_on="actor_id") + actor_roles["serialized_roles"] = actor_roles.apply( + lambda x: { + "actor_id": x.actor_id, + "first_name": x.first_name, + "last_name": x.last_name, + "full_name": "%s %s" % (x.first_name, x.last_name), + "gender": x.gender, + "role": x.role, + }, + axis=1, + ) + movie_serialized_actors = actor_roles.groupby("movie_id").serialized_roles.apply( + list + ) + + # directors + directors_grouped_genres = pd.DataFrame( + director_genres.groupby("director_id").genre.apply(list) + ) + movie_directors_extended = pd.merge( + movies_directors, directors, left_on="director_id", right_index=True + ) + movie_directors_extended = pd.merge( + movie_directors_extended, + directors_grouped_genres, + how="left", + left_on="director_id", + right_index=True, + ) + movie_directors_extended["serialized_directors"] = movie_directors_extended.apply( + lambda x: { + "director_id": x.director_id, + "first_name": x.first_name, + "last_name": x.last_name, + "full_name": "%s %s" % (x.first_name, x.last_name), + "genres": x.genre, + }, + axis=1, + ) + 
movie_serialized_directors = pd.DataFrame( + movie_directors_extended.groupby("movie_id").serialized_directors.apply(list) + ) + + # movie genres + movie_serialized_genres = movies_genres.groupby("movie_id").genre.apply(list) + + # merge + enriched_movies = pd.merge( + movies, movie_serialized_actors, how="left", left_index=True, right_index=True + ) + enriched_movies = pd.merge( + enriched_movies, + movie_serialized_directors, + how="left", + left_index=True, + right_index=True, + ) + enriched_movies = pd.merge( + enriched_movies, + movie_serialized_genres, + how="left", + left_index=True, + right_index=True, + ) + + enriched_movies["nb_directors"] = enriched_movies.serialized_directors.apply( + lambda x: len(x) if isinstance(x, list) else 0 + ) + enriched_movies["nb_roles"] = enriched_movies.serialized_roles.apply( + lambda x: len(x) if isinstance(x, list) else 0 + ) + + serialized = enriched_movies.apply( + lambda x: { + "movie_id": x.name, + "name": x.loc["name"], + "year": x.year, + "genres": x.genre, + "roles": x.serialized_roles, + "nb_roles": x.nb_roles, + "directors": x.serialized_directors, + "nb_directors": x.nb_directors, + "rank": x.loc["rank"], + }, + axis=1, + ) + # write + with open(join(data_dir, "index_documents.json"), "w") as f: + for s in serialized: + f.write(simplejson.dumps(s, ignore_nan=True, cls=NpEncoder) + "\n") + + +def operations_iterator(data_dir) -> Iterator[Action]: + with open(join(data_dir, "index_documents.json"), "r") as f: + for line in f.readlines(): + d = json.loads(line) + yield {"_source": d, "_id": d["movie_id"]} + + +def setup_index_and_index_documents(data_dir: str, reset_index: bool) -> None: + movies = Movies(client) + if reset_index and movies.exists(): + movies.delete() + movies.save() + movies.docs.bulk( + actions=operations_iterator(data_dir), _op_type_overwrite="index" + ).perform() + movies.refresh() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Download and ingest IMDB data in an Elasticsearch cluster.\nBy default the used cluster will be " + "localhost:9200, overwrite client options in this file to customize this behavior." 
+ ) + parser.add_argument( + "--dir", + type=str, + default="temp-data", + help="path to directory where the downloaded files will be stored (default 'temp-data')", + ) + parser.add_argument( + "--overwrite", + action="store_const", + const=True, + default=False, + help="if True, re-download files even if they are already present (default False)", + ) + parser.add_argument( + "--resetindex", + action="store_const", + const=True, + default=False, + help="if True, index is deleted and recreated even though it already exists (default False)", + ) + parser.add_argument( + "--limit", + type=int, + default=20000, + help="number of movies that will be ingested (default 20000)", + ) + args = parser.parse_args() + + client = Elasticsearch( + # specify your config here + ) + + if not path.exists(args.dir): + os.mkdir(args.dir) + + logging.info("Download tables from remote sql database") + download_tables(data_dir=args.dir, overwrite=args.overwrite) + logging.info("Serialize documents") + serialize_documents(data_dir=args.dir, limit=args.limit) + logging.info("Index documents in cluster") + setup_index_and_index_documents(data_dir=args.dir, reset_index=args.resetindex) diff --git a/examples/imdb/load.py b/examples/imdb/model.py similarity index 55% rename from examples/imdb/load.py rename to examples/imdb/model.py index a6e7edea..e234cae4 100644 --- a/examples/imdb/load.py +++ b/examples/imdb/model.py @@ -1,15 +1,40 @@ -import json -from collections import Iterator -from os.path import join -from elasticsearch import Elasticsearch -from examples.imdb.conf import ES_HOST, ES_USE_AUTH, ES_PASSWORD, ES_USER, DATA_DIR +from pandagg.document import DocumentSource, InnerDocSource +from pandagg.index import DeclarativeIndex +from pandagg.mappings import Float, Integer, Keyword, Nested, Text -from pandagg.index import DeclarativeIndex, Action -from pandagg.mappings import Keyword, Text, Float, Nested, Integer + +class Role(InnerDocSource): + role = Keyword() + actor_id = Keyword() + gender = Keyword() + first_name = Text() + last_name = Text() + full_name = Text() + + +class Director(InnerDocSource): + director_id = Keyword() + first_name = Text() + last_name = Text() + full_name = Text() + genres = Keyword() + + +class MovieEntry(DocumentSource): + movie_id = Keyword() + name = Text(fields={"raw": Keyword()}) + year = Integer() + rank = Float() + genres = Keyword() + roles = Nested(Role) + directors = Nested(Director) + nb_directors = Integer() + nb_roles = Integer() class Movies(DeclarativeIndex): name = "movies" + document = MovieEntry mappings = { "dynamic": False, "properties": { @@ -41,28 +66,3 @@ class Movies(DeclarativeIndex): "nb_roles": Integer(), }, } - - -def operations_iterator() -> Iterator[Action]: - with open(join(DATA_DIR, "serialized.json"), "r") as f: - for line in f.readlines(): - d = json.loads(line) - yield {"_source": d, "_id": d["id"]} - - -if __name__ == "__main__": - client_kwargs = {"hosts": [ES_HOST]} - if ES_USE_AUTH: - client_kwargs["http_auth"] = (ES_USER, ES_PASSWORD) - client = Elasticsearch(**client_kwargs) - - movies = Movies(client) - - print("Index creation") - movies.save() - - print("Write documents") - movies.docs.bulk( - actions=operations_iterator(), _op_type_overwrite="index" - ).perform() - movies.refresh() diff --git a/examples/imdb/serialize.py b/examples/imdb/serialize.py deleted file mode 100644 index 5644af88..00000000 --- a/examples/imdb/serialize.py +++ /dev/null @@ -1,147 +0,0 @@ -import logging -from os.path import join -import simplejson -import pandas as 
pd -import numpy as np - -from examples.imdb.conf import DATA_DIR, OUTPUT_FILE_NAME - -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) - - -class NpEncoder(simplejson.JSONEncoder): - def default(self, obj): - if isinstance(obj, np.integer): - return int(obj) - elif isinstance(obj, np.floating): - return float(obj) - elif isinstance(obj, np.ndarray): - return obj.tolist() - else: - return super(NpEncoder, self).default(obj) - - -if __name__ == "__main__": - - logging.info("LOADING TABLES") - # tables imports - reader_kwargs = { - "encoding": "utf-8", - "sep": ",", - "quotechar": '"', - "escapechar": "\\", - } - movies = pd.read_csv(join(DATA_DIR, "movies.csv"), index_col="id", **reader_kwargs) - movies_genres = pd.read_csv(join(DATA_DIR, "movies_genres.csv"), **reader_kwargs) - movies_directors = pd.read_csv( - join(DATA_DIR, "movies_directors.csv"), **reader_kwargs - ) - directors = pd.read_csv( - join(DATA_DIR, "directors.csv"), index_col="id", **reader_kwargs - ) - director_genres = pd.read_csv( - join(DATA_DIR, "directors_genres.csv"), **reader_kwargs - ) - roles = pd.read_csv(join(DATA_DIR, "roles.csv"), **reader_kwargs) - actors = pd.read_csv(join(DATA_DIR, "actors.csv"), index_col="id", **reader_kwargs) - - # actors - logging.info("SERIALIZE ACTORS") - actor_roles = pd.merge(actors, roles, left_index=True, right_on="actor_id") - actor_roles["serialized_roles"] = actor_roles.apply( - lambda x: { - "actor_id": x.actor_id, - "first_name": x.first_name, - "last_name": x.last_name, - "full_name": "%s %s" % (x.first_name, x.last_name), - "gender": x.gender, - "role": x.role, - }, - axis=1, - ) - movie_serialized_actors = actor_roles.groupby("movie_id").serialized_roles.apply( - list - ) - - # directors - logging.info("SERIALIZE DIRECTORS") - directors_grouped_genres = pd.DataFrame( - director_genres.groupby("director_id").genre.apply(list) - ) - movie_directors_extended = pd.merge( - movies_directors, directors, left_on="director_id", right_index=True - ) - movie_directors_extended = pd.merge( - movie_directors_extended, - directors_grouped_genres, - how="left", - left_on="director_id", - right_index=True, - ) - movie_directors_extended["serialized_directors"] = movie_directors_extended.apply( - lambda x: { - "director_id": x.director_id, - "first_name": x.first_name, - "last_name": x.last_name, - "full_name": "%s %s" % (x.first_name, x.last_name), - "genres": x.genre, - }, - axis=1, - ) - movie_serialized_directors = pd.DataFrame( - movie_directors_extended.groupby("movie_id").serialized_directors.apply(list) - ) - - # movie genres - logging.info("SERIALIZE MOVIE GENRES") - movie_serialized_genres = movies_genres.groupby("movie_id").genre.apply(list) - - # merge - logging.info("MERGE DATASETS") - enriched_movies = pd.merge( - movies, movie_serialized_actors, how="left", left_index=True, right_index=True - ) - enriched_movies = pd.merge( - enriched_movies, - movie_serialized_directors, - how="left", - left_index=True, - right_index=True, - ) - enriched_movies = pd.merge( - enriched_movies, - movie_serialized_genres, - how="left", - left_index=True, - right_index=True, - ) - - enriched_movies["nb_directors"] = enriched_movies.serialized_directors.apply( - lambda x: len(x) if isinstance(x, list) else 0 - ) - enriched_movies["nb_roles"] = enriched_movies.serialized_roles.apply( - lambda x: len(x) if isinstance(x, list) else 0 - ) - - serialized = enriched_movies.apply( - lambda x: { - "movie_id": x.name, - "name": x.loc["name"], - "year": 
x.year, - "genres": x.genre, - "roles": x.serialized_roles, - "nb_roles": x.nb_roles, - "directors": x.serialized_directors, - "nb_directors": x.nb_directors, - "rank": x.loc["rank"], - }, - axis=1, - ) - - # write - logging.info("WRITE SERIALIZED DOCUMENTS") - with open(join(DATA_DIR, OUTPUT_FILE_NAME), "w") as f: - for s in serialized: - f.write(simplejson.dumps(s, ignore_nan=True, cls=NpEncoder) + "\n") diff --git a/examples/jupyter_notebook_config.py b/examples/jupyter_notebook_config.py deleted file mode 100644 index c1ae2262..00000000 --- a/examples/jupyter_notebook_config.py +++ /dev/null @@ -1,16 +0,0 @@ -# taken from https://www.svds.com/jupyter-notebook-best-practices-for-data-science/ - -import os -from subprocess import check_call - - -def post_save(model, os_path, contents_manager): - """post-save hook for converting notebooks to .py scripts""" - if model["type"] != "notebook": - return # only do this for notebooks - d, fname = os.path.split(os_path) - check_call(["jupyter", "nbconvert", "--to", "script", fname], cwd=d) - check_call(["jupyter", "nbconvert", "--to", "html", fname], cwd=d) - - -c.FileContentsManager.post_save_hook = post_save diff --git a/pandagg/__init__.py b/pandagg/__init__.py index 18517883..8d532a6a 100644 --- a/pandagg/__init__.py +++ b/pandagg/__init__.py @@ -1,11 +1,10 @@ import logging - from logging import NullHandler +from typing import List # ensures all required classes are registered in DSLMeta from .interactive import * from .search import * -from typing import List # Inspired by https://python-guide-pt-br.readthedocs.io/fr/latest/writing/logging.html#logging-in-a-library # Set default logging handler to avoid "No handler found" warnings. diff --git a/pandagg/_decorators.py b/pandagg/_decorators.py index 90d306cc..44585d62 100644 --- a/pandagg/_decorators.py +++ b/pandagg/_decorators.py @@ -1,7 +1,6 @@ from textwrap import dedent from typing import Any, Callable - # Substitution and Appender are copied from pandas.util._decorators # https://github.com/pandas-dev/pandas/blob/master/LICENSE diff --git a/pandagg/aggs.py b/pandagg/aggs.py index ab9838c6..95ec2157 100644 --- a/pandagg/aggs.py +++ b/pandagg/aggs.py @@ -1,67 +1,63 @@ from pandagg.node.aggs.bucket import ( - Terms, - Filters, - Histogram, + AdjacencyMatrix, + AutoDateHistogram, + Children, DateHistogram, - Global, + DiversifiedSampler, Filter, - Nested, - ReverseNested, - Range, - Missing, - MatchAll, - GeoHashGrid, + Filters, GeoDistance, - AdjacencyMatrix, - AutoDateHistogram, - VariableWidthHistogram, - SignificantTerms, - RareTerms, + GeoHashGrid, GeoTileGrid, + Global, + Histogram, IPRange, - Sampler, - DiversifiedSampler, - Children, + MatchAll, + Missing, + MultiTerms, + Nested, Parent, + Range, + RareTerms, + ReverseNested, + Sampler, + SignificantTerms, SignificantText, - MultiTerms, + Terms, + VariableWidthHistogram, ) - from pandagg.node.aggs.composite import Composite - from pandagg.node.aggs.metric import ( Avg, - Max, - Min, - Sum, Cardinality, - Stats, ExtendedStats, - Percentiles, - PercentileRanks, GeoBound, GeoCentroid, + Max, + Min, + PercentileRanks, + Percentiles, + Stats, + Sum, TopHits, ValueCount, ) - from pandagg.node.aggs.pipeline import ( AvgBucket, + BucketScript, + BucketSelector, + BucketSort, + CumulativeSum, Derivative, + ExtendedStatsBucket, MaxBucket, MinBucket, - SumBucket, - StatsBucket, - ExtendedStatsBucket, - PercentilesBucket, MovingAvg, - CumulativeSum, - BucketScript, - BucketSelector, - BucketSort, + PercentilesBucket, SerialDiff, + StatsBucket, 
+    SumBucket,
 )
-
 from pandagg.tree.aggs import Aggs
 
 __all__ = [
diff --git a/pandagg/discovery.py b/pandagg/discovery.py index a1e3bd9a..3976bbc1 100644
--- a/pandagg/discovery.py
+++ b/pandagg/discovery.py
@@ -1,8 +1,8 @@
 from dataclasses import dataclass
-from typing import Dict, Any, Optional
+from typing import Any, Dict, Optional
 
-from lighttree.interactive import Obj
 from elasticsearch import Elasticsearch
+from lighttree.interactive import Obj
 
 from pandagg import Mappings, MappingsDict
 from pandagg.interactive.mappings import IMappings
@@ -46,9 +46,11 @@ class Indices(Obj):
 
 
 def discover(using: Elasticsearch, index: str = "*") -> Indices:
-    """
-    :param using: Elasticsearch client
+    """Scan cluster and return indices that match the provided index pattern.
+
+    :param using: Elasticsearch client.
     :param index: Comma-separated list or wildcard expression of index names used to limit the request.
+    :rtype: Indices
     """
     indices = Indices()
     for index_name, index_detail in using.indices.get(index=index).items():
diff --git a/pandagg/document.py b/pandagg/document.py index 13988edb..39cc94ac 100644
--- a/pandagg/document.py
+++ b/pandagg/document.py
@@ -1,11 +1,43 @@
-from typing import Tuple, Dict, Any
+from typing import Any, Dict, Tuple
 
 from pandagg import Mappings
-from pandagg.node.mappings import Field, ComplexField
+from pandagg.node.mappings import ComplexField, Field
 
 
 class DocumentMeta(type):
+
+    """`DocumentSource` metaclass.
+
+    Note: you shouldn't have to use it directly, see `DocumentSource` instead.
+    """
+
     def __new__(cls, name: str, bases: Tuple, attrs: Dict) -> "DocumentMeta":
+        """The document metaclass is responsible for:
+
+        - registering `Fields`:
+            - as a `Mappings` instance in the `_mappings_` attribute
+            - as a dict of `Fields` in `_field_attrs_`
+        - keeping other attributes and methods as is
+        - building `__init__` to accept only the declared fields at `DocumentSource` instantiation
+
+        In the following example::
+
+            class RestaurantInspection(DocumentSource):
+
+                description = "Represent a new-york restaurant inspection."
+
+                name = Text()
+                borough = Keyword()
+                cuisine = Keyword()
+
+            document = RestaurantInspection(
+                name="Almighty burger",
+                borough="Brooklyn"
+            )
+
+        The `description` attribute isn't a `Field` instance, and won't be considered a valid input at document
+        instantiation.
+        """
         regular_attrs: Dict = {}
         field_attrs: Dict = {}
         for k, v in attrs.items():
@@ -32,31 +64,88 @@ def __init__(self: "DocumentSource", **kwargs: Any) -> None:
                 setattr(self, k, v)
             self._post_init_()
 
+        def __str__(self: "DocumentSource") -> str:
+            return "{}({})".format(
+                self.__class__.__name__,
+                ", ".join(
+                    "{}={!r}".format(key, getattr(self, key))
+                    for key in field_attrs.keys()
+                ),
+            )
+
         regular_attrs["__init__"] = __init__
+        regular_attrs["__str__"] = __str__
+        regular_attrs["__repr__"] = __str__
 
         return super(DocumentMeta, cls).__new__(cls, name, bases, regular_attrs)
 
 
 class DocumentSource(metaclass=DocumentMeta):
-    # __init__ is overidden by metaclass, this is only so that pycharm doesn't highlights fake errors when instantiating
-    # documents
+    """
+    Model-like class for persisting documents in Elasticsearch.
+
+    It is both used for mappings declaration and for document manipulation and persistence::
+
+        class RestaurantInspection(DocumentSource):
+            name = Text()
+            borough = Keyword()
+            cuisine = Keyword()
+            grade = Keyword()
+            score = Integer()
+            location = GeoPoint()
+            inspection_date = Date(format="MM/dd/yyyy")
+
+    This document can be referenced in a DeclarativeIndex class to declare index mappings::
+
+        class NYCRestaurants(DeclarativeIndex):
+            name = "nyc-restaurants"
+            document = RestaurantInspection
+
+    Note: these mappings will be used to create the index mappings when using the DeclarativeIndex `save` method.
+
+    Mappings can be serialized via the `_mappings.to_dict` method::
+
+        >>> NYCRestaurants._mappings.to_dict()
+        {
+            'properties': {
+                'name': {'type': 'text'},
+                'borough': {'type': 'keyword'},
+                'cuisine': {'type': 'keyword'},
+                'grade': {'type': 'keyword'},
+                'score': {'type': 'integer'},
+                'location': {'type': 'geo_point'},
+                'inspection_date': {'format': 'MM/dd/yyyy', 'type': 'date'}
+            }
+        }
+
+    """
+
     def __init__(self, **kwargs: Any) -> None:
-        pass
+        """Overridden by the metaclass; declared here only so that pycharm doesn't highlight fake errors when
+        instantiating documents.
+        """
 
     def _post_init_(self) -> None:
-        # intended to be overwritten
-        # apply transformations after instantiation
-        # note: it applies both when instantiating manually documents, and when documents are instantiated while
-        # deserializing ElasticSearch search response
-        pass
+        """Intended to be overwritten.
+        Apply transformations after document instantiation. Executed both when instantiating documents manually, and
+        when documents are instantiated while deserializing an Elasticsearch search response.
+        """
 
    def _pre_save_op_(self) -> None:
-        # intended to be overwritten
-        # apply transformations before any persisting operation (create / index / update)
-        # for instance to update 'last_updated_at' date
-        pass
+        """Intended to be overwritten.
+        Apply transformations before any persisting operation (create / index / update).
+
+        Example: update a 'last_updated_at' date.
+        """
 
     def _to_dict_(self, with_empty_keys: bool = False) -> Dict[str, Any]:
+        """
+        Serialize the document as a JSON-compatible python dict.
+
+        :param with_empty_keys: if True, empty fields will be serialized with a `None` value.
+        """
         d: Dict[str, Any] = {}
         k: str
         field: Field
@@ -80,6 +169,14 @@ def _to_dict_(self, with_empty_keys: bool = False) -> Dict[str, Any]:
     def _from_dict_(
         cls, source: Dict, strict: bool = True, path: str = ""
     ) -> "DocumentSource":
+        """
+        Deserialize a document source into a Document instance.
+
+        :param source: document source (python dict) to deserialize.
+        :param strict: if True, check that fields declared with `multiple=True` or `multiple=False` have the intended
+        shape (fields with `multiple=True` are expected to contain a list of values, fields with `multiple=False` are
+        expected to contain a single value).
+        """
         doc = cls()
         k: str
         field: Field
@@ -121,4 +218,7 @@ def _from_dict_(
 
 
 class InnerDocSource(DocumentSource):
-    pass
+
+    """
+    Common class for inner documents like Object or Nested.
+    """
diff --git a/pandagg/index.py b/pandagg/index.py index 1e1b6235..fb951a74 100644
--- a/pandagg/index.py
+++ b/pandagg/index.py
@@ -1,17 +1,17 @@
 # adapted from elasticsearch-dsl-py
 import dataclasses
 
-from itertools import chain
 from copy import deepcopy
-from typing import Optional, Any, List, Dict, Tuple, Union, Iterator, Iterable
-from typing_extensions import TypedDict, Literal
+from itertools import chain
+from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union
 
 from elasticsearch import Elasticsearch, helpers
+from typing_extensions import Literal, TypedDict
 
 from pandagg import Mappings, Search
-from pandagg.document import DocumentSource, DocumentMeta
+from pandagg.document import DocumentMeta, DocumentSource
 from pandagg.tree.mappings import MappingsDictOrNode
-from pandagg.types import SettingsDict, IndexAliases, DocSource, Action, OpType
+from pandagg.types import Action, DocSource, IndexAliases, OpType, SettingsDict
 from pandagg.utils import _cast_pre_save_source, get_action_modifier, is_subset
@@ -23,6 +23,11 @@ class Template(TypedDict, total=False):
 
 @dataclasses.dataclass
 class DocumentBulkWriter:
+
+    """Class responsible for performing document operations in batches.
+    Not intended to be used directly (it is used within `DeclarativeIndex`).
+    """
+
     _index: "DeclarativeIndex"
     _operations: Iterator[Action] = dataclasses.field(
         init=False, default_factory=lambda: iter([])
@@ -158,6 +163,15 @@ def _deepcopy_mutable_attrs(attrs: Dict[str, Any], attrs_names: List[str]) -> No
 
 
 class IndexMeta(type):
+    """DeclarativeIndex metaclass.
+
+    - ensure `DeclarativeIndex` has a `name`
+    - ensure the Mappings declaration is valid, and that `mappings` and `document` do not conflict
+    - build a `Mappings` instance into the `_mappings` attribute, based on `mappings` if provided, else on `document` if
+      provided
+    - deepcopy the `mappings` and `settings` attributes so that they cannot be mutated by side effects
+    """
+
     # global flag to apply changes to descendants of DeclarativeIndex class (and not the abstract class itself)
     _abstract_index_initialized = False
@@ -288,6 +302,58 @@ def exists(self, **kwargs: Any) -> bool:
 
 
 class DeclarativeIndex(metaclass=IndexMeta):
+    """Express an index structure. Provide methods to persist this structure to the cluster, and to perform queries on
+    it.
+
+    The only compulsory class attribute is the index name.
+    Index settings and aliases can simply be declared via the `settings` and `aliases` attributes.
+    Mappings, though, can be declared using the `mappings` and/or `document` attributes.
+
+    - the `mappings` attribute sticks more strictly to Elasticsearch syntax, and takes precedence over the `document` attribute
+    - the `document` attribute accepts a `DocumentSource` class and provides ORM capabilities, see `DocumentSource`
+
+    Example of index declaration::
+
+        from pandagg.index import DeclarativeIndex
+
+        class Inspection(DocumentSource):
+            name = Text()
+            borough = Keyword()
+            score = Integer()
+            location = GeoPoint()
+            inspection_date = Date(format="MM/dd/yyyy")
+
+
+        class NYCRestaurants(DeclarativeIndex):
+            name = "nyc-restaurants"
+            document = Inspection
+            mappings = {
+                "properties": {
+                    "name": {"type": "text"},
+                    "borough": {"type": "keyword"},
+                    "score": {"type": "integer"},
+                    "location": {"type": "geo_point"},
+                    "inspection_date": {"type": "date", "format": "MM/dd/yyyy"},
+                }
+            }
+            settings = {"number_of_shards": 2}
+
+    This declaration can be used to persist the expected state to the cluster::
+
+        from elasticsearch import Elasticsearch
+        client = Elasticsearch()
+        index = NYCRestaurants(client=client)
+
+        if not index.exists():
+            index.save()
+
+    Search this index using the `search` method:
+
+    >>> index.search().execute()
+    took 3ms, success: True, total result >=10000, contains 10 hits
+
+    """
+
     name: str
     mappings: Optional[MappingsDictOrNode] = None
     settings: Optional[SettingsDict] = None
@@ -322,10 +388,17 @@ def _get_connection(self) -> Elasticsearch:
 
     def search(
         self,
+        deserialize_source: bool = False,
         nested_autocorrect: bool = False,
         repr_auto_execute: bool = False,
-        deserialize_source: bool = False,
     ) -> Search:
+        """Return a Search instance.
+        :param deserialize_source: if True, hits sources are deserialized into `DocumentSource` instances (based on
+        the class provided in the `DeclarativeIndex` `document` attribute).
+        :param nested_autocorrect: if True, missing nested clauses will automatically be injected.
+        :param repr_auto_execute: intended for interactive usage only, if True, the request is automatically executed when the instance representation is displayed.
+        :rtype: Search
+        """
         if deserialize_source is True and self.document is None:
             raise ValueError(
                 "'%s' DeclarativeIndex doesn't have any declared document to deserialize hits' sources"
@@ -348,7 +421,7 @@ def create(self, **kwargs: Any) -> Any:
         ``Elasticsearch.indices.create`` unchanged.
""" return self._get_connection().indices.create( - index=self.name, body=self.to_dict(), **kwargs + index=self.name, **self.to_dict(), **kwargs ) def is_closed(self) -> bool: diff --git a/pandagg/interactive/_field_agg_factory.py b/pandagg/interactive/_field_agg_factory.py index c9c0dfeb..45af19b7 100644 --- a/pandagg/interactive/_field_agg_factory.py +++ b/pandagg/interactive/_field_agg_factory.py @@ -1,10 +1,9 @@ import dataclasses -from typing import List, Type, Any, Callable, Dict - -from pandagg.search import Search +from typing import Any, Callable, Dict, List, Type from pandagg.node.aggs.abstract import AggClause, BucketAggClause from pandagg.node.types import MAPPING_TYPES +from pandagg.search import Search from pandagg.types import FieldType diff --git a/pandagg/interactive/mappings.py b/pandagg/interactive/mappings.py index 86f167f0..84a48b68 100644 --- a/pandagg/interactive/mappings.py +++ b/pandagg/interactive/mappings.py @@ -1,13 +1,12 @@ import json -from typing import Optional, List +from typing import List, Optional from elasticsearch import Elasticsearch - from lighttree import TreeBasedObj from lighttree.node import NodeId -from pandagg.tree.mappings import Mappings from pandagg.interactive._field_agg_factory import field_classes_per_name +from pandagg.tree.mappings import Mappings from pandagg.utils import DSLMixin diff --git a/pandagg/mappings.py b/pandagg/mappings.py index 1fb8e616..1385f1e3 100644 --- a/pandagg/mappings.py +++ b/pandagg/mappings.py @@ -1,60 +1,60 @@ -from pandagg.tree.mappings import Mappings from pandagg.interactive.mappings import IMappings from pandagg.node.mappings.field_datatypes import ( - IpRange, - Text, - Keyword, - ConstantKeyword, - WildCard, - Long, - Integer, - Short, + IP, + Alias, + Binary, + Boolean, Byte, - Double, - HalfFloat, - ScaledFloat, + Completion, + ConstantKeyword, Date, DateNanos, - Boolean, - Binary, - IntegerRange, + DateRange, + DenseVector, + Double, + DoubleRange, + Flattened, Float, FloatRange, - LongRange, - DoubleRange, - DateRange, - Object, - Nested, GeoPoint, GeoShape, - IP, - Completion, - TokenCount, - MapperMurMur3, + HalfFloat, + Histogram, + Integer, + IntegerRange, + IpRange, + Join, + Keyword, + Long, + LongRange, MapperAnnotatedText, + MapperMurMur3, + Nested, + Object, Percolator, - Join, RankFeature, RankFeatures, - DenseVector, - SparseVector, + ScaledFloat, SearchAsYouType, - Alias, - Flattened, Shape, - Histogram, + Short, + SparseVector, + Text, + TokenCount, + WildCard, ) from pandagg.node.mappings.meta_fields import ( - Index, - Type, - Id, FieldNames, - Source, - Size, + Id, Ignored, - Routing, + Index, Meta, + Routing, + Size, + Source, + Type, ) +from pandagg.tree.mappings import Mappings __all__ = [ "Mappings", diff --git a/pandagg/node/__init__.py b/pandagg/node/__init__.py index 2b4a6567..a40c40d6 100644 --- a/pandagg/node/__init__.py +++ b/pandagg/node/__init__.py @@ -1,4 +1,4 @@ # make sure all classes are loaded and available through metaclass -import pandagg.node.query import pandagg.node.aggs import pandagg.node.mappings +import pandagg.node.query diff --git a/pandagg/node/_node.py b/pandagg/node/_node.py index 583b7605..fcc3355c 100644 --- a/pandagg/node/_node.py +++ b/pandagg/node/_node.py @@ -1,7 +1,8 @@ +from typing import Any, Dict + from lighttree import AutoIdNode as OriginalNode from pandagg.utils import DSLMixin -from typing import Dict, Any class Node(DSLMixin, OriginalNode): diff --git a/pandagg/node/aggs/abstract.py b/pandagg/node/aggs/abstract.py index 64f153a7..5feda0be 
100644 --- a/pandagg/node/aggs/abstract.py +++ b/pandagg/node/aggs/abstract.py @@ -1,21 +1,20 @@ import json +from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union from pandagg.node._node import Node -from typing import Optional, List, Union, Dict, Any, Tuple, Iterator, Type - from pandagg.types import ( - Meta, - BucketKey, - BucketDict, AggClauseDict, - AggType, - Script, - GapPolicy, - AggName, AggClauseResponseDict, - BucketsWrapperDict, + AggName, + AggType, + BucketDict, + BucketKey, BucketKeyAtom, BucketsDict, + BucketsWrapperDict, + GapPolicy, + Meta, + Script, ) diff --git a/pandagg/node/aggs/bucket.py b/pandagg/node/aggs/bucket.py index 07412c89..3b90555c 100644 --- a/pandagg/node/aggs/bucket.py +++ b/pandagg/node/aggs/bucket.py @@ -1,10 +1,10 @@ # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket.html -from typing import Any, Optional, Dict, Union, List +from typing import Any, Dict, List, Optional, Union -from pandagg.node.types import NUMERIC_TYPES from pandagg.node.aggs.abstract import MultipleBucketAgg, UniqueBucketAgg -from pandagg.types import Meta, QueryClauseDict, RangeDict, DistanceType, ExecutionHint +from pandagg.node.types import NUMERIC_TYPES +from pandagg.types import DistanceType, ExecutionHint, Meta, QueryClauseDict, RangeDict class Global(UniqueBucketAgg): @@ -39,6 +39,8 @@ def __init__( class MatchAll(Filter): + KEY = "match_all" + def __init__(self, **body: Any): super(MatchAll, self).__init__(filter={"match_all": {}}, **body) diff --git a/pandagg/node/aggs/composite.py b/pandagg/node/aggs/composite.py index 159e1e3f..91e22618 100644 --- a/pandagg/node/aggs/composite.py +++ b/pandagg/node/aggs/composite.py @@ -1,14 +1,16 @@ -from .abstract import BucketAggClause -from typing import Optional, Any, Dict, List, Iterator, Tuple +from typing import Any, Dict, Iterator, List, Optional, Tuple + from pandagg.types import ( AfterKey, - CompositeSource, - BucketDict, AggClauseResponseDict, AggName, + BucketDict, CompositeBucketKey, + CompositeSource, ) +from .abstract import BucketAggClause + class Composite(BucketAggClause): diff --git a/pandagg/node/aggs/metric.py b/pandagg/node/aggs/metric.py index 61183436..2eb8f693 100644 --- a/pandagg/node/aggs/metric.py +++ b/pandagg/node/aggs/metric.py @@ -1,7 +1,7 @@ from typing import Any, List -from pandagg.node.types import NUMERIC_TYPES from pandagg.node.aggs.abstract import FieldOrScriptMetricAgg, MetricAgg +from pandagg.node.types import NUMERIC_TYPES class TopHits(MetricAgg): diff --git a/pandagg/node/mappings/abstract.py b/pandagg/node/mappings/abstract.py index 71efc6a0..05dcf187 100644 --- a/pandagg/node/mappings/abstract.py +++ b/pandagg/node/mappings/abstract.py @@ -1,14 +1,13 @@ from __future__ import annotations import json +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Type, Union from pandagg.node._node import Node -from typing import Optional, Any, Tuple, Dict, Type, Union, TYPE_CHECKING - from pandagg.types import FieldType if TYPE_CHECKING: - from pandagg.document import DocumentSource, DocumentMeta + from pandagg.document import DocumentMeta, DocumentSource class Field(Node): @@ -72,7 +71,7 @@ class ComplexField(Field): def __init__( self, properties: Optional[Union[Dict, Type[DocumentSource]]] = None, - **body: Any + **body: Any, ) -> None: properties = properties or {} if not isinstance(properties, dict): diff --git a/pandagg/node/query/__init__.py b/pandagg/node/query/__init__.py index d9dedddf..90e2c4e4 100644 --- 
a/pandagg/node/query/__init__.py +++ b/pandagg/node/query/__init__.py @@ -4,6 +4,7 @@ from .full_text import * from .geo import * from .joining import * +from .match_all import * from .shape import * from .span import * from .specialized import * diff --git a/pandagg/node/query/abstract.py b/pandagg/node/query/abstract.py index 10a20db6..ff785ea4 100644 --- a/pandagg/node/query/abstract.py +++ b/pandagg/node/query/abstract.py @@ -1,9 +1,8 @@ import json +from typing import Any, Dict, List, Optional, Tuple, Type, Union from pandagg.node._node import Node -from typing import Optional, Union, Dict, Any, Tuple, List, Type - -from pandagg.types import QueryType, QueryClauseDict +from pandagg.types import QueryClauseDict, QueryType class QueryClause(Node): diff --git a/pandagg/node/query/compound.py b/pandagg/node/query/compound.py index 0af1b6a1..590f7046 100644 --- a/pandagg/node/query/compound.py +++ b/pandagg/node/query/compound.py @@ -1,7 +1,7 @@ -from typing import List, Optional, Any +from typing import Any, List, Optional -from pandagg.types import QueryName from pandagg.node.query.abstract import QueryClause +from pandagg.types import QueryName class CompoundClause(QueryClause): diff --git a/pandagg/node/query/full_text.py b/pandagg/node/query/full_text.py index b0e9f689..76972f69 100644 --- a/pandagg/node/query/full_text.py +++ b/pandagg/node/query/full_text.py @@ -1,4 +1,4 @@ -from .abstract import LeafQueryClause, KeyFieldQueryClause, MultiFieldsQueryClause +from .abstract import KeyFieldQueryClause, LeafQueryClause, MultiFieldsQueryClause class Intervals(KeyFieldQueryClause): diff --git a/pandagg/node/query/geo.py b/pandagg/node/query/geo.py index 89be4ff6..b29c8d27 100644 --- a/pandagg/node/query/geo.py +++ b/pandagg/node/query/geo.py @@ -1,8 +1,9 @@ from typing import Any, Optional, Tuple -from .abstract import KeyFieldQueryClause, AbstractSingleFieldQueryClause from pandagg.types import DistanceType, ValidationMethod +from .abstract import AbstractSingleFieldQueryClause, KeyFieldQueryClause + class GeoBoundingBox(KeyFieldQueryClause): KEY = "geo_bounding_box" diff --git a/pandagg/node/query/match_all.py b/pandagg/node/query/match_all.py new file mode 100644 index 00000000..f4e12582 --- /dev/null +++ b/pandagg/node/query/match_all.py @@ -0,0 +1,18 @@ +from typing import Any + +from pandagg.node.query import LeafQueryClause + + +class MatchAll(LeafQueryClause): + + KEY = "match_all" + + def __init__(self, **body: Any) -> None: + super(MatchAll, self).__init__(**body) + + +class MatchNone(LeafQueryClause): + KEY = "match_none" + + def __init__(self, **body: Any) -> None: + super(MatchNone, self).__init__(**body) diff --git a/pandagg/node/query/specialized.py b/pandagg/node/query/specialized.py index 9ce6829d..360098e2 100644 --- a/pandagg/node/query/specialized.py +++ b/pandagg/node/query/specialized.py @@ -1,4 +1,4 @@ -from .abstract import LeafQueryClause, FlatFieldQueryClause, MultiFieldsQueryClause +from .abstract import FlatFieldQueryClause, LeafQueryClause, MultiFieldsQueryClause class DistanceFeature(FlatFieldQueryClause): diff --git a/pandagg/node/query/term_level.py b/pandagg/node/query/term_level.py index 4c39d9d9..20b4f2de 100644 --- a/pandagg/node/query/term_level.py +++ b/pandagg/node/query/term_level.py @@ -1,11 +1,12 @@ -from typing import Optional, Any, Tuple, List, Union +from typing import Any, List, Optional, Tuple, Union + +from pandagg.types import QueryName from .abstract import ( - LeafQueryClause, AbstractSingleFieldQueryClause, KeyFieldQueryClause, + 
LeafQueryClause, ) -from pandagg.types import QueryName class Exists(LeafQueryClause): diff --git a/pandagg/query.py b/pandagg/query.py index f3f9b187..a0f2d878 100644 --- a/pandagg/query.py +++ b/pandagg/query.py @@ -1,37 +1,25 @@ -from pandagg.node.query.shape import Shape -from pandagg.node.query.term_level import ( - Exists, - Fuzzy, - Ids, - Prefix, - Range, - Regexp, - Term, - Terms, - TermsSet, - Type, - Wildcard, +from pandagg.node.query.compound import ( + Bool, + Boosting, + ConstantScore, + DisMax, + FunctionScore, ) from pandagg.node.query.full_text import ( + Common, Intervals, Match, MatchBoolPrefix, MatchPhrase, MatchPhrasePrefix, MultiMatch, - Common, QueryString, SimpleQueryString, ) -from pandagg.node.query.compound import ( - Bool, - Boosting, - ConstantScore, - FunctionScore, - DisMax, -) -from pandagg.node.query.joining import Nested, HasChild, HasParent, ParentId -from pandagg.node.query.geo import GeoShape, GeoPolygone, GeoDistance, GeoBoundingBox +from pandagg.node.query.geo import GeoBoundingBox, GeoDistance, GeoPolygone, GeoShape +from pandagg.node.query.joining import HasChild, HasParent, Nested, ParentId +from pandagg.node.query.match_all import MatchAll, MatchNone +from pandagg.node.query.shape import Shape from pandagg.node.query.specialized import ( DistanceFeature, MoreLikeThis, @@ -40,7 +28,20 @@ Script, Wrapper, ) -from pandagg.node.query.specialized_compound import ScriptScore, PinnedQuery +from pandagg.node.query.specialized_compound import PinnedQuery, ScriptScore +from pandagg.node.query.term_level import ( + Exists, + Fuzzy, + Ids, + Prefix, + Range, + Regexp, + Term, + Terms, + TermsSet, + Type, + Wildcard, +) from pandagg.tree.query import Query __all__ = [ @@ -78,6 +79,9 @@ "HasParent", "HasChild", "ParentId", + # match_all + "MatchAll", + "MatchNone", # shape "Shape", # geo diff --git a/pandagg/response.py b/pandagg/response.py index 28daa5d3..4105ec8b 100644 --- a/pandagg/response.py +++ b/pandagg/response.py @@ -2,47 +2,48 @@ import copy import dataclasses -from typing_extensions import Literal, TypedDict from typing import ( - Iterator, - Optional, - List, TYPE_CHECKING, + Any, Dict, + Iterator, + List, + Optional, Tuple, Union, overload, - Any, ) from elasticsearch import Elasticsearch from lighttree.node import NodeId +from typing_extensions import Literal, TypedDict -from pandagg.query import Query from pandagg.aggs import Aggs, Composite -from pandagg.node.aggs.abstract import UniqueBucketAgg, MetricAgg, Root, AggClause +from pandagg.node.aggs.abstract import AggClause, MetricAgg, Root, UniqueBucketAgg from pandagg.node.aggs.bucket import Nested, ReverseNested +from pandagg.query import Query from pandagg.types import ( - HitDict, - HitsDict, - DocSource, - TotalDict, - SearchResponseDict, - ShardsDict, - AggregationsResponseDict, AggName, - ProfileDict, + AggregationsResponseDict, BucketDict, BucketKey, - CompositeBucketKey, BucketKeyAtom, + CompositeBucketKey, + DocSource, + HitDict, + HitsDict, + ProfileDict, + SearchResponseDict, + ShardsDict, + TotalDict, ) if TYPE_CHECKING: import pandas as pd - from pandagg.search import Search + from pandagg import DocumentMeta from pandagg.document import DocumentSource + from pandagg.search import Search GroupingKeysDict = Dict[AggName, BucketKeyAtom] diff --git a/pandagg/search.py b/pandagg/search.py index 6ac575dd..f6b5a4e5 100644 --- a/pandagg/search.py +++ b/pandagg/search.py @@ -5,15 +5,15 @@ import copy import json from typing import ( - Optional, - Union, - Tuple, - List, + TYPE_CHECKING, 
Any, - TypeVar, Dict, Iterator, - TYPE_CHECKING, + List, + Optional, + Tuple, + TypeVar, + Union, ) from elasticsearch import Elasticsearch @@ -21,31 +21,32 @@ from pandagg.node.aggs.abstract import TypeOrAgg from pandagg.query import Bool -from pandagg.response import SearchResponse, Hit, Aggregations -from pandagg.tree.mappings import _mappings, Mappings +from pandagg.response import Aggregations, Hit, SearchResponse +from pandagg.tree.aggs import Aggs, AggsDictOrNode +from pandagg.tree.mappings import Mappings, _mappings from pandagg.tree.query import ( - Query, ADD, - TypeOrQuery, InsertionModes, + Query, SingleOrMultipleQueryClause, + TypeOrQuery, ) -from pandagg.tree.aggs import Aggs, AggsDictOrNode from pandagg.types import ( - MappingsDict, - QueryName, - ClauseBody, + AfterKey, AggName, - SearchResponseDict, + BucketDict, + ClauseBody, DeleteByQueryResponse, + MappingsDict, + QueryName, SearchDict, - BucketDict, - AfterKey, + SearchResponseDict, ) from pandagg.utils import DSLMixin if TYPE_CHECKING: import pandas as pd + from pandagg.document import DocumentMeta # because Search.bool method shadows bool typing @@ -100,7 +101,7 @@ def index(self: T, *index: Union[str, List[str], Tuple[str]]) -> T: """ Set the index for the search. If called empty it will remove all information. - Example: + Example:: s = Search() s = s.index('twitter-2015.01.01', 'twitter-2015.01.02') @@ -200,7 +201,7 @@ def query( on: Optional[QueryName] = None, mode: InsertionModes = ADD, compound_param: str = None, - **body: Any + **body: Any, ) -> "Search": s = self._clone() s._query = s._query.query( @@ -209,7 +210,7 @@ def query( on=on, mode=mode, compound_param=compound_param, - **body + **body, ) return s @@ -224,7 +225,7 @@ def bool( insert_below: Optional[QueryName] = None, on: Optional[QueryName] = None, mode: InsertionModes = ADD, - **body: Any + **body: Any, ) -> "Search": s = self._clone() s._query = s._query.bool( @@ -235,7 +236,7 @@ def bool( insert_below=insert_below, on=on, mode=mode, - **body + **body, ) return s @@ -248,7 +249,7 @@ def filter( on: Optional[QueryName] = None, mode: InsertionModes = ADD, bool_body: ClauseBody = None, - **body: Any + **body: Any, ) -> "Search": s = self._clone() s._query = s._query.filter( @@ -257,7 +258,7 @@ def filter( on=on, mode=mode, bool_body=bool_body, - **body + **body, ) return s @@ -270,7 +271,7 @@ def must_not( on: Optional[QueryName] = None, mode: InsertionModes = ADD, bool_body: ClauseBody = None, - **body: Any + **body: Any, ) -> "Search": s = self._clone() s._query = s._query.must_not( @@ -279,7 +280,7 @@ def must_not( on=on, mode=mode, bool_body=bool_body, - **body + **body, ) return s @@ -292,7 +293,7 @@ def should( on: Optional[QueryName] = None, mode: InsertionModes = ADD, bool_body: ClauseBody = None, - **body: Any + **body: Any, ) -> "Search": s = self._clone() s._query = s._query.should( @@ -301,7 +302,7 @@ def should( on=on, mode=mode, bool_body=bool_body, - **body + **body, ) return s @@ -314,7 +315,7 @@ def must( on: Optional[QueryName] = None, mode: InsertionModes = ADD, bool_body: ClauseBody = None, - **body: Any + **body: Any, ) -> "Search": s = self._clone() s._query = s._query.must( @@ -323,7 +324,7 @@ def must( on=on, mode=mode, bool_body=bool_body, - **body + **body, ) return s @@ -336,7 +337,7 @@ def exclude( on: Optional[QueryName] = None, mode: InsertionModes = ADD, bool_body: ClauseBody = None, - **body: Any + **body: Any, ) -> "Search": """Must not wrapped in filter context.""" s = self._clone() @@ -356,7 +357,7 @@ def 
post_filter( on: Optional[QueryName] = None, mode: InsertionModes = ADD, compound_param: str = None, - **body: Any + **body: Any, ) -> "Search": s = self._clone() s._post_filter = s._post_filter.query( @@ -365,7 +366,7 @@ def post_filter( on=on, mode=mode, compound_param=compound_param, - **body + **body, ) return s @@ -375,7 +376,7 @@ def agg( type_or_agg: Optional[TypeOrAgg] = None, insert_below: Optional[AggName] = None, at_root: bool_ = False, - **body: Any + **body: Any, ) -> "Search": s = self._clone() s._aggs = s._aggs.agg( @@ -383,7 +384,7 @@ def agg( type_or_agg=type_or_agg, insert_below=insert_below, at_root=at_root, - **body + **body, ) return s @@ -407,7 +408,7 @@ def groupby( type_or_agg: Optional[TypeOrAgg] = None, insert_below: Optional[AggName] = None, at_root: bool_ = False, - **body: Any + **body: Any, ) -> "Search": s = self._clone() s._aggs = s._aggs.groupby( @@ -415,7 +416,7 @@ def groupby( type_or_agg=type_or_agg, insert_below=insert_below, at_root=at_root, - **body + **body, ) return s @@ -791,7 +792,7 @@ def execute(self) -> SearchResponse: the data. """ es = self._get_connection() - raw_data = es.search(index=self._index, body=self.to_dict()) + raw_data = es.search(index=self._index, **self.to_dict()) # type: ignore return SearchResponse(data=raw_data, _search=self) # type: ignore def scan_composite_agg(self, size: int) -> Iterator[BucketDict]: @@ -803,6 +804,7 @@ def scan_composite_agg(self, size: int) -> Iterator[BucketDict]: buckets: List[BucketDict] = r.aggregations.data[a_name][ # type: ignore "buckets" ] + yield from buckets after_key: AfterKey = r.aggregations.data[a_name]["after_key"] # type: ignore init: bool = True @@ -812,8 +814,7 @@ def scan_composite_agg(self, size: int) -> Iterator[BucketDict]: r = s.execute() agg_clause_response = r.aggregations.data[a_name] buckets = agg_clause_response["buckets"] # type: ignore - for bucket in buckets: - yield bucket + yield from buckets if "after_key" in agg_clause_response: after_key = agg_clause_response["after_key"] # type: ignore else: @@ -831,18 +832,18 @@ def scan_composite_agg_at_once(self, size: int) -> Aggregations: # artificially merge all buckets as if they were returned in a single query return Aggregations(_search=s, data={agg_name: {"buckets": all_buckets}}) - def scan(self) -> Iterator[Hit]: + def scan(self, **kwargs: Any) -> Iterator[Hit]: """ Turn the search into a scan search and return a generator that will iterate over all the documents matching the query. 
- Use ``params`` method to specify any additional arguments you with to - pass to the underlying ``scan`` helper from ``elasticsearch-py`` - + Use ``kwargs`` to specify any additional arguments to pass to the underlying ``scan`` helper from + ``elasticsearch-py`` - https://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.scan """ es = self._get_connection() - for hit in scan(es, query=self.to_dict(), index=self._index): + for hit in scan(es, query=self.to_dict(), index=self._index, **kwargs): yield Hit(hit, _document_class=self._document_class) def delete(self) -> DeleteByQueryResponse: diff --git a/pandagg/tree/__init__.py b/pandagg/tree/__init__.py index b4364d8e..644e958d 100644 --- a/pandagg/tree/__init__.py +++ b/pandagg/tree/__init__.py @@ -1,3 +1,3 @@ import pandagg.tree.aggs -import pandagg.tree.query import pandagg.tree.mappings +import pandagg.tree.query diff --git a/pandagg/tree/aggs.py b/pandagg/tree/aggs.py index c751a71b..f83a0911 100644 --- a/pandagg/tree/aggs.py +++ b/pandagg/tree/aggs.py @@ -1,23 +1,23 @@ import json -from typing import Optional, Union, Any, Dict, Tuple +from typing import Any, Dict, Optional, Tuple, Union from lighttree import Key, Tree from lighttree.node import NodeId -from pandagg.node.aggs import Composite -from pandagg.tree._tree import TreeReprMixin -from pandagg.tree.mappings import _mappings, Mappings, MappingsDict +from pandagg.node.aggs import Composite from pandagg.node.aggs.abstract import ( + A, + AggClause, + AggClauseDict, BucketAggClause, Root, - A, TypeOrAgg, - AggClauseDict, - AggClause, ) from pandagg.node.aggs.bucket import Nested, ReverseNested from pandagg.node.aggs.pipeline import BucketSelector, BucketSort -from pandagg.types import AggName, NamedAggsDict, AfterKey +from pandagg.tree._tree import TreeReprMixin +from pandagg.tree.mappings import Mappings, MappingsDict, _mappings +from pandagg.types import AfterKey, AggName, NamedAggsDict # {"my_agg": {"terms": "some_field"}} or {"my_agg": Terms(field="some_field")} AggsDictOrNode = Dict[AggName, Union[AggClauseDict, AggClause]] diff --git a/pandagg/tree/mappings.py b/pandagg/tree/mappings.py index d9a6a3b1..f07fb1da 100644 --- a/pandagg/tree/mappings.py +++ b/pandagg/tree/mappings.py @@ -1,21 +1,20 @@ from __future__ import annotations -from typing_extensions import TypedDict -from typing import Optional, Union, Any, List, Dict, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union -from lighttree.node import NodeId from lighttree import Tree - -from pandagg.node.aggs.abstract import AggClause -from pandagg.node.mappings import Object, Nested -from pandagg.node.mappings.abstract import Field, RegularField, ComplexField, Root +from lighttree.node import NodeId +from typing_extensions import TypedDict from pandagg.exceptions import ( AbsentMappingFieldError, InvalidOperationMappingFieldError, ) +from pandagg.node.aggs.abstract import AggClause +from pandagg.node.mappings import Nested, Object +from pandagg.node.mappings.abstract import ComplexField, Field, RegularField, Root from pandagg.tree._tree import TreeReprMixin -from pandagg.types import DocSource, MappingsDict, FieldName, FieldClauseDict +from pandagg.types import DocSource, FieldClauseDict, FieldName, MappingsDict if TYPE_CHECKING: from pandagg.document import DocumentSource @@ -45,7 +44,7 @@ def __init__( self, properties: Optional[FieldPropertiesDictOrNode] = None, dynamic: Optional[bool] = None, - **body: Any + **body: Any, ) -> None: super(Mappings, self).__init__() 
# a Mappings always has a root after __init__ @@ -119,7 +118,7 @@ def validate_agg_clause(self, agg_clause: AggClause, exc: bool = True) -> bool: nid = self.get_node_id_by_path(agg_field.split(".")) except Exception: raise AbsentMappingFieldError( - u"Agg of type <%s> on non-existing field <%s>." + "Agg of type <%s> on non-existing field <%s>." % (agg_clause.KEY, agg_field) ) _, field_node = self.get(nid) @@ -129,7 +128,7 @@ def validate_agg_clause(self, agg_clause: AggClause, exc: bool = True) -> bool: if not exc: return False raise InvalidOperationMappingFieldError( - u"Agg of type <%s> not possible on field of type <%s>." + "Agg of type <%s> not possible on field of type <%s>." % (agg_clause.KEY, field_type) ) return True @@ -156,7 +155,7 @@ def mapping_type_of_field(self, field_path: str) -> str: nid = self.get_node_id_by_path(field_path.split(".")) except ValueError: raise AbsentMappingFieldError( - u"<%s field is not present in mappings>" % field_path + "<%s field is not present in mappings>" % field_path ) _, node = self.get(nid) return node.KEY diff --git a/pandagg/tree/query.py b/pandagg/tree/query.py index 73a72a63..8e70df9f 100644 --- a/pandagg/tree/query.py +++ b/pandagg/tree/query.py @@ -1,24 +1,24 @@ import json -from typing import Optional, Union, Any, List, Dict -from typing_extensions import Literal +from typing import Any, Dict, List, Optional, Union from lighttree import Key, Tree from lighttree.node import NodeId +from typing_extensions import Literal + from pandagg._decorators import Substitution from pandagg.node.query._parameter_clause import ParentParameterClause from pandagg.node.query.abstract import ( - QueryClause, LeafQueryClause, Q, + QueryClause, QueryClauseDict, QueryType, TypeOrQuery_, ) -from pandagg.node.query.compound import CompoundClause, Bool +from pandagg.node.query.compound import Bool, CompoundClause from pandagg.node.query.joining import Nested - -from pandagg.tree.mappings import _mappings, MappingsDict, Mappings -from pandagg.types import QueryName, ClauseBody +from pandagg.tree.mappings import Mappings, MappingsDict, _mappings +from pandagg.types import ClauseBody, QueryName # because a method `bool` shadows the real bool bool_ = bool diff --git a/pandagg/types.py b/pandagg/types.py index f728d195..3a315114 100644 --- a/pandagg/types.py +++ b/pandagg/types.py @@ -1,5 +1,6 @@ -from typing_extensions import TypedDict, Literal -from typing import Optional, Dict, Any, List, Union +from typing import Any, Dict, List, Optional, Union + +from typing_extensions import Literal, TypedDict ClauseName = str ClauseType = str diff --git a/pandagg/utils.py b/pandagg/utils.py index d80345af..ed577ead 100644 --- a/pandagg/utils.py +++ b/pandagg/utils.py @@ -1,9 +1,9 @@ # adapted from https://github.com/elastic/elasticsearch-dsl-py/blob/master/elasticsearch_dsl/utils.py#L162 from __future__ import annotations -from typing import Dict, Tuple, Any, Union, Callable, Optional, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union -from pandagg.types import DocSource, Action, IndexName, OpType +from pandagg.types import Action, DocSource, IndexName, OpType if TYPE_CHECKING: from pandagg.document import DocumentSource diff --git a/setup.cfg b/setup.cfg index 405d6f86..7a04af35 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,8 +4,8 @@ test=pytest [flake8] max-line-length = 120 ignore = - E402, # module level import not at top of file - W503 + E402, # Module level import not at top of file + W503 # Line break occurred before a binary 
operator exclude = *__init__.py @@ -14,3 +14,11 @@ disallow_untyped_defs = True [mypy-pandas.*] ignore_missing_imports = True + +[tool:pytest] +filterwarnings = + ignore:.*Elasticsearch built-in security features are not enabled: + +[isort] +profile = black +src_paths = pandagg,tests,examples diff --git a/setup.py b/setup.py index 8d9837f7..c3ea6257 100644 --- a/setup.py +++ b/setup.py @@ -1,22 +1,21 @@ -__version__ = "0.2.3" +__version__ = "0.2.4" import os -from setuptools import setup -from setuptools import find_packages - +from setuptools import find_packages, setup here = os.path.abspath(os.path.dirname(__file__)) README = open(os.path.join(here, "README.md")).read() install_requires = [ - "lighttree==1.3.4", + "lighttree==1.3.5", "elasticsearch>=7.0.0,<8.0.0", "typing_extensions", ] develop_requires = [ "pre-commit", + "isort", "black", "coverage", "flake8", @@ -27,6 +26,7 @@ "pandas", "Sphinx", "twine", + "simplejson", ] setup( diff --git a/tests/conftest.py b/tests/conftest.py index 7376b4f0..c1a31645 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,16 +1,14 @@ # adapted from elasticsearch-dsl.py import re - -from mock import Mock from unittest import SkipTest -from pytest import fixture, skip from elasticsearch.helpers import bulk from elasticsearch.helpers.test import get_test_client +from mock import Mock +from pytest import fixture, skip - -from .test_data import TEST_GIT_DATA, create_git_index, GIT_MAPPINGS +from .test_data import GIT_MAPPINGS, TEST_GIT_DATA, create_git_index @fixture(scope="session") @@ -24,8 +22,9 @@ def client(): @fixture def write_client(client): yield client - client.indices.delete("test-*", ignore=404) - client.indices.delete_index_template("test-template", ignore=404) + client.indices.delete(index="test-git", ignore=404) + client.indices.delete(index="test-post", ignore=404) + client.indices.delete_index_template(name="test-template", ignore=404) @fixture(scope="session") @@ -52,13 +51,13 @@ def git_mappings(): @fixture(scope="session") def data_client(client): - client.indices.delete("git", ignore=(404,)) + client.indices.delete(index="git", ignore=(404,)) # create mappings create_git_index(client, "git") # load data bulk(client, TEST_GIT_DATA, raise_on_error=True, refresh=True) yield client - client.indices.delete("git") + client.indices.delete(index="git") @fixture @@ -109,4 +108,4 @@ def updatable_index(client): create_git_index(client, index) bulk(client, TEST_GIT_DATA, raise_on_error=True, refresh=True) yield index - client.indices.delete(index, ignore=404) + client.indices.delete(index=index, ignore=404) diff --git a/tests/interactive/test_mapping.py b/tests/interactive/test_mapping.py index 34449243..56dbd0e9 100644 --- a/tests/interactive/test_mapping.py +++ b/tests/interactive/test_mapping.py @@ -1,10 +1,8 @@ from pandagg import Search - -from pandagg.mappings import Keyword, Text, Nested, Object, Integer -from pandagg.tree.mappings import Mappings from pandagg.interactive._field_agg_factory import field_classes_per_name from pandagg.interactive.mappings import IMappings - +from pandagg.mappings import Integer, Keyword, Nested, Object, Text +from pandagg.tree.mappings import Mappings from tests.testing_samples.mapping_example import MAPPINGS diff --git a/tests/node/agg/test_bucket.py b/tests/node/agg/test_bucket.py index 01ead4ff..5e96a80b 100644 --- a/tests/node/agg/test_bucket.py +++ b/tests/node/agg/test_bucket.py @@ -1,27 +1,27 @@ from pandagg.aggs import ( - Terms, + AdjacencyMatrix, + AutoDateHistogram, + Children, + 
DateHistogram, + DiversifiedSampler, Filter, Filters, - DateHistogram, - Nested, - Range, - Histogram, GeoDistance, GeoHashGrid, - AdjacencyMatrix, - AutoDateHistogram, - VariableWidthHistogram, - SignificantTerms, - RareTerms, GeoTileGrid, - IPRange, - Sampler, - DiversifiedSampler, Global, - Children, + Histogram, + IPRange, + MultiTerms, + Nested, Parent, + Range, + RareTerms, + Sampler, + SignificantTerms, SignificantText, - MultiTerms, + Terms, + VariableWidthHistogram, ) diff --git a/tests/node/mapping/test_field.py b/tests/node/mapping/test_field.py index f05e676e..2a3a29ee 100644 --- a/tests/node/mapping/test_field.py +++ b/tests/node/mapping/test_field.py @@ -1,5 +1,5 @@ from pandagg.document import DocumentSource, InnerDocSource -from pandagg.node.mappings import Text, Keyword, Nested, Object +from pandagg.node.mappings import Keyword, Nested, Object, Text from pandagg.node.mappings.abstract import Root diff --git a/tests/node/query/test_match_all.py b/tests/node/query/test_match_all.py new file mode 100644 index 00000000..ed7bc540 --- /dev/null +++ b/tests/node/query/test_match_all.py @@ -0,0 +1,20 @@ +from pandagg.node.query import MatchAll, MatchNone + + +def test_match_all_clause(): + q = MatchAll() + assert q.body == {} + assert q.to_dict() == {"match_all": {}} + assert q.line_repr(depth=None) == ("match_all", "") + + q = MatchAll(boost=0.5) + assert q.body == {"boost": 0.5} + assert q.to_dict() == {"match_all": {"boost": 0.5}} + assert q.line_repr(depth=None) == ("match_all", "boost=0.5") + + +def test_match_none_clause(): + q = MatchNone() + assert q.body == {} + assert q.to_dict() == {"match_none": {}} + assert q.line_repr(depth=None) == ("match_none", "") diff --git a/tests/node/query/test_specialized.py b/tests/node/query/test_specialized.py index ac32740d..edf80ac3 100644 --- a/tests/node/query/test_specialized.py +++ b/tests/node/query/test_specialized.py @@ -1,10 +1,10 @@ from pandagg.query import ( - Wrapper, DistanceFeature, MoreLikeThis, Percolate, RankFeature, Script, + Wrapper, ) diff --git a/tests/node/query/test_term_level.py b/tests/node/query/test_term_level.py index 2395464a..b6edff32 100644 --- a/tests/node/query/test_term_level.py +++ b/tests/node/query/test_term_level.py @@ -1,12 +1,12 @@ from pandagg.query import ( - Terms, - Term, - Fuzzy, Exists, + Fuzzy, Ids, Prefix, Range, Regexp, + Term, + Terms, TermsSet, Wildcard, ) diff --git a/tests/test_data.py b/tests/test_data.py index c8983a77..57f3f8a3 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,10 +1,9 @@ # adapated from elasticsearch-dsl.py -from typing import List, Any, Dict +from typing import Any, Dict, List from typing_extensions import TypedDict - GIT_MAPPINGS = { "dynamic": False, "properties": { @@ -39,34 +38,32 @@ def create_git_index(client, index): client.indices.create( index=index, - body={ - "settings": { - # just one shard, no replicas for testing - "number_of_shards": 1, - "number_of_replicas": 0, - # custom analyzer for analyzing file paths - "analysis": { - "analyzer": { - "file_path": { - "type": "custom", - "tokenizer": "path_hierarchy", - "filter": ["lowercase"], - } + settings={ + # just one shard, no replicas for testing + "number_of_shards": 1, + "number_of_replicas": 0, + # custom analyzer for analyzing file paths + "analysis": { + "analyzer": { + "file_path": { + "type": "custom", + "tokenizer": "path_hierarchy", + "filter": ["lowercase"], } - }, + } }, - "mappings": GIT_MAPPINGS, }, + mappings=GIT_MAPPINGS, ) -class TestDocument(TypedDict): +class 
_TestDocument(TypedDict): _id: str _source: Dict[str, Any] _index: str -TEST_GIT_DATA: List[TestDocument] = [ +TEST_GIT_DATA: List[_TestDocument] = [ { "_id": "3ca6e1e73a071a705b4babd2f581c91a2a3e5037", "_source": { diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 72282f0d..38d5a270 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -1,9 +1,8 @@ from elasticsearch import Elasticsearch from elasticsearch.client import IndicesClient - -from pandagg.discovery import discover, Index from mock import patch +from pandagg.discovery import Index, discover from pandagg.interactive.mappings import IMappings from tests.test_data import GIT_MAPPINGS from tests.testing_samples.mapping_example import MAPPINGS diff --git a/tests/test_document.py b/tests/test_document.py index e105e811..3e45aded 100644 --- a/tests/test_document.py +++ b/tests/test_document.py @@ -1,8 +1,8 @@ import pytest from pandagg import Mappings -from pandagg.document import InnerDocSource, DocumentSource -from pandagg.mappings import Text, Long, Date, Keyword, Object, Nested +from pandagg.document import DocumentSource, InnerDocSource +from pandagg.mappings import Date, Keyword, Long, Nested, Object, Text from pandagg.node.mappings import Boolean @@ -58,6 +58,14 @@ def test_document_init(): ) +def test_doc_repr(): + user = User(id=1, signed_up="2021-01-01") + assert ( + user.__repr__() + == "User(id=1, signed_up='2021-01-01', username=None, email=None, location=None)" + ) + + def test_pre_save(write_client): class AutoDatePost(DocumentSource): diff --git a/tests/test_index.py b/tests/test_index.py index fc737375..90df4fa9 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -4,8 +4,8 @@ from pandagg import Mappings, Search from pandagg.document import DocumentSource -from pandagg.mappings import Keyword, Text, Date from pandagg.index import DeclarativeIndex, DeclarativeIndexTemplate +from pandagg.mappings import Date, Keyword, Text class Post(DeclarativeIndex): @@ -62,11 +62,11 @@ def test_index_without_client_raises_error_on_write_op(): def test_create_index(write_client): - assert not write_client.indices.exists("test-post") + assert not write_client.indices.exists(index="test-post") index = Post(client=write_client) index.save() - assert write_client.indices.exists("test-post") - persisted_index = write_client.indices.get("test-post")["test-post"] + assert write_client.indices.exists(index="test-post") + persisted_index = write_client.indices.get(index="test-post")["test-post"] assert persisted_index["aliases"] == {"post": {}} assert persisted_index["mappings"] == { "properties": {"published_from": {"type": "date"}, "title": {"type": "text"}} @@ -260,7 +260,7 @@ def test_template_save(write_client): _source={"title": "salut", "published_from": "2021-01-01"}, ).perform(refresh=True) assert post_index.exists() - auto_created_index = write_client.indices.get("test-post")["test-post"] + auto_created_index = write_client.indices.get(index="test-post")["test-post"] assert auto_created_index["mappings"] == { "properties": {"published_from": {"type": "date"}, "title": {"type": "text"}} } diff --git a/tests/test_response.py b/tests/test_response.py index 4e0a671d..bb476896 100644 --- a/tests/test_response.py +++ b/tests/test_response.py @@ -1,14 +1,13 @@ -from pandagg.document import DocumentSource -from pandagg.node.mappings import Keyword -from tests import PandaggTestCase import pandas as pd +import tests.testing_samples.data_sample as sample +from pandagg.document import DocumentSource 
+from pandagg.node.mappings import Keyword +from pandagg.response import Aggregations, Hit, Hits, SearchResponse from pandagg.search import Search -from pandagg.response import SearchResponse, Hits, Hit, Aggregations from pandagg.tree.aggs import Aggs - -import tests.testing_samples.data_sample as sample from pandagg.utils import ordered +from tests import PandaggTestCase from tests.testing_samples.mapping_example import MAPPINGS diff --git a/tests/test_search.py b/tests/test_search.py index 7d2ccf2f..ccf78dcd 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -1,15 +1,15 @@ from copy import deepcopy - -from mock import patch +from typing import List from elasticsearch import Elasticsearch +from mock import patch -from pandagg import Aggregations -from pandagg.node.aggs import Max, DateHistogram, Sum +from pandagg import Aggregations, Hit +from pandagg.node.aggs import DateHistogram, Max, Sum +from pandagg.query import Bool, Match, Query from pandagg.search import Search -from pandagg.query import Query, Bool, Match from pandagg.tree.mappings import Mappings -from pandagg.utils import ordered, equal_queries +from pandagg.utils import equal_queries, ordered def test_expand__to_dot_is_respected(): @@ -96,7 +96,7 @@ def test_search_index(): assert s._index == ["i"] s = s.index("i2") assert s._index == ["i", "i2"] - s = s.index(u"i3") + s = s.index("i3") assert s._index == ["i", "i2", "i3"] s = s.index() assert s._index is None @@ -410,13 +410,13 @@ def test_repr_execution(client_search): s.size(2).__repr__() client_search.assert_called_once() - client_search.assert_any_call(body={"size": 2}, index=["yolo"]) + client_search.assert_any_call(size=2, index=["yolo"]) client_search.reset_mock() s.size(2)._repr_html_() client_search.assert_called_once() - client_search.assert_any_call(body={"size": 2}, index=["yolo"]) + client_search.assert_any_call(size=2, index=["yolo"]) @patch.object(Elasticsearch, "search") @@ -457,19 +457,26 @@ def test_repr_aggs_execution(client_search): s.__repr__() client_search.assert_called_once() client_search.assert_any_call( - body={ - "size": 0, - "aggs": { - "toto_terms": { - "terms": {"field": "toto"}, - "aggs": {"toto_avg_price": {"avg": {"field": "price"}}}, - } - }, + size=0, + aggs={ + "toto_terms": { + "terms": {"field": "toto"}, + "aggs": {"toto_avg_price": {"avg": {"field": "price"}}}, + } }, index=["yolo"], ) +def test_scan(data_client): + search = Search(using=data_client, index="git") + hits_it = search.scan(scroll="1m", size=30) + assert hasattr(hits_it, "__iter__") + hits: List[Hit] = list(hits_it) + assert len(hits) == 52 + assert all(isinstance(h, Hit) for h in hits) + + def test_scan_composite_agg(data_client, git_mappings): search = ( Search(using=data_client, index="git", mappings=git_mappings) @@ -486,6 +493,31 @@ def test_scan_composite_agg(data_client, git_mappings): assert hasattr(bucket_iterator, "__iter__") buckets = list(bucket_iterator) assert buckets == [ + { + "doc_count": 2, + "insertions_sum": {"value": 91.0}, + "key": {"compatible_histogram": 1393804800000}, + }, + { + "doc_count": 1, + "insertions_sum": {"value": 692.0}, + "key": {"compatible_histogram": 1393891200000}, + }, + { + "doc_count": 3, + "insertions_sum": {"value": 134.0}, + "key": {"compatible_histogram": 1393977600000}, + }, + { + "doc_count": 3, + "insertions_sum": {"value": 179.0}, + "key": {"compatible_histogram": 1394064000000}, + }, + { + "doc_count": 9, + "insertions_sum": {"value": 344.0}, + "key": {"compatible_histogram": 1394150400000}, + }, { 
"doc_count": 2, "insertions_sum": {"value": 120.0}, @@ -567,6 +599,11 @@ def test_scan_composite_agg_at_once(data_client, git_mappings): assert agg_response.to_tabular(index_orient=True) == ( ["compatible_histogram", "author"], { + (1393804800000, "Honza Král"): {"doc_count": 2, "insertions_sum": 91.0}, + (1393891200000, "Honza Král"): {"doc_count": 1, "insertions_sum": 692.0}, + (1393977600000, "Honza Král"): {"doc_count": 3, "insertions_sum": 134.0}, + (1394064000000, "Honza Král"): {"doc_count": 3, "insertions_sum": 179.0}, + (1394150400000, "Honza Král"): {"doc_count": 9, "insertions_sum": 344.0}, (1394409600000, "Honza Král"): {"doc_count": 2, "insertions_sum": 120.0}, (1394841600000, "Honza Král"): {"doc_count": 4, "insertions_sum": 45.0}, (1395360000000, "Honza Král"): {"doc_count": 2, "insertions_sum": 34.0}, @@ -608,6 +645,11 @@ def test_scan_composite_agg_at_once(data_client, git_mappings): assert agg_response.to_tabular(index_orient=True) == ( ["commit_date", "author_name"], { + (1393804800000, "Honza Král"): {"doc_count": 2, "insertions_sum": 91.0}, + (1393891200000, "Honza Král"): {"doc_count": 1, "insertions_sum": 692.0}, + (1393977600000, "Honza Král"): {"doc_count": 3, "insertions_sum": 134.0}, + (1394064000000, "Honza Král"): {"doc_count": 3, "insertions_sum": 179.0}, + (1394150400000, "Honza Král"): {"doc_count": 9, "insertions_sum": 344.0}, (1394409600000, "Honza Král"): {"doc_count": 2, "insertions_sum": 120.0}, (1394841600000, "Honza Král"): {"doc_count": 4, "insertions_sum": 45.0}, (1395360000000, "Honza Král"): {"doc_count": 2, "insertions_sum": 34.0}, diff --git a/tests/test_utils.py b/tests/test_utils.py index a07cffad..d2d7a416 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,8 +1,8 @@ import pytest from pandagg.document import DocumentSource -from pandagg.node.mappings import Text, Keyword -from pandagg.utils import equal_queries, equal_search, is_subset, get_action_modifier +from pandagg.node.mappings import Keyword, Text +from pandagg.utils import equal_queries, equal_search, get_action_modifier, is_subset def test_equal(): diff --git a/tests/testing_samples/data_nested_sample.py b/tests/testing_samples/data_nested_sample.py index d026e9ce..1e0e9543 100644 --- a/tests/testing_samples/data_nested_sample.py +++ b/tests/testing_samples/data_nested_sample.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- -EXPECTED_REPR = u"""""" +EXPECTED_REPR = """""" EXPECTED_AGG_QUERY = { "week": { diff --git a/tests/testing_samples/data_sample.py b/tests/testing_samples/data_sample.py index 399efdff..7dcaeb55 100644 --- a/tests/testing_samples/data_sample.py +++ b/tests/testing_samples/data_sample.py @@ -8,7 +8,6 @@ from pandagg.aggs import Aggs, Avg, Terms from tests.testing_samples.mapping_example import MAPPINGS - EXPECTED_AGG_QUERY = { "classification_type": { "aggs": { diff --git a/tests/tree/aggs/test_aggs.py b/tests/tree/aggs/test_aggs.py index 26756dfb..dcbb7e68 100644 --- a/tests/tree/aggs/test_aggs.py +++ b/tests/tree/aggs/test_aggs.py @@ -3,15 +3,13 @@ import pytest from mock import patch -from pandagg.tree.aggs import Aggs +import tests.testing_samples.data_sample as sample +from pandagg.aggs import Avg, DateHistogram, Filter, Min, Terms from pandagg.exceptions import ( - InvalidOperationMappingFieldError, AbsentMappingFieldError, + InvalidOperationMappingFieldError, ) -from pandagg.aggs import DateHistogram, Terms, Avg, Min, Filter - -import tests.testing_samples.data_sample as sample - +from pandagg.tree.aggs import Aggs from 
tests.testing_samples.mapping_example import MAPPINGS diff --git a/tests/tree/mapping/test_mappings.py b/tests/tree/mapping/test_mappings.py index a8510561..2fa823e9 100644 --- a/tests/tree/mapping/test_mappings.py +++ b/tests/tree/mapping/test_mappings.py @@ -1,9 +1,9 @@ import pytest from pandagg.exceptions import AbsentMappingFieldError -from pandagg.mappings import Keyword, Object, Text, Nested, Integer, Mappings +from pandagg.mappings import Integer, Keyword, Mappings, Nested, Object, Text from pandagg.node.mappings.abstract import Field -from tests.testing_samples.mapping_example import MAPPINGS, EXPECTED_MAPPING_TREE_REPR +from tests.testing_samples.mapping_example import EXPECTED_MAPPING_TREE_REPR, MAPPINGS def test_deserialization(): diff --git a/tests/tree/query/test_query.py b/tests/tree/query/test_query.py index 436c10ce..2af598b0 100644 --- a/tests/tree/query/test_query.py +++ b/tests/tree/query/test_query.py @@ -1,13 +1,12 @@ -from mock import patch - from lighttree.tree import NotFoundNodeError +from mock import patch from pandagg.node.query._parameter_clause import _Must -from pandagg.query import Query, Range, Prefix, Ids, Term, Terms, Nested -from pandagg.node.query.term_level import Term as TermNode, Exists as ExistsNode -from pandagg.node.query.joining import Nested as NestedNode from pandagg.node.query.compound import Bool - +from pandagg.node.query.joining import Nested as NestedNode +from pandagg.node.query.term_level import Exists as ExistsNode +from pandagg.node.query.term_level import Term as TermNode +from pandagg.query import Ids, Nested, Prefix, Query, Range, Term, Terms from pandagg.utils import equal_queries, ordered from tests import PandaggTestCase
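The changes above can be exercised end to end with a short snippet. This is a minimal usage sketch, not part of the patch: the client host and index name are illustrative, and it assumes a reachable Elasticsearch 7.x cluster holding a "git" index similar to the one loaded by the test fixtures.

    from elasticsearch import Elasticsearch

    from pandagg.query import MatchAll, MatchNone
    from pandagg.search import Search

    # The new match_all / match_none leaf clauses serialize like any other query clause.
    assert MatchAll(boost=0.5).to_dict() == {"match_all": {"boost": 0.5}}
    assert MatchNone().to_dict() == {"match_none": {}}

    # Illustrative client and index name; adjust to your own cluster.
    client = Elasticsearch(hosts=["localhost:9200"])
    search = Search(using=client, index="git").query(MatchAll())

    # scan() now forwards keyword arguments to the elasticsearch-py scan helper,
    # so scroll duration and batch size can be tuned per call.
    hits = list(search.scan(scroll="1m", size=30))

Note that Search.execute() now expands the request into keyword arguments (es.search(index=..., **self.to_dict())) instead of passing a single body= payload, as reflected in the updated test_repr_execution assertions, so the same Search object keeps working unchanged against the new client call style.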