From 259a8742c56f0749d36df0bac1f2c2f1ca4c09f6 Mon Sep 17 00:00:00 2001
From: Mark
Date: Thu, 9 Nov 2023 13:54:35 +0300
Subject: [PATCH] Refactor typing stubs (#155)

* added black

* bumped mypy to 1.5.1

* updated typing usages

* added py.typed and specified that the insight pkg is typed at build time
---
 .pre-commit-config.yaml                      |  15 +-
 pyproject.toml                               |  10 +-
 src/insight/__init__.py                      |   2 +-
 src/insight/alembic/env.py                   |  21 +--
 src/insight/alembic/main.py                  |   5 +-
 .../9aca5ae68ff5_add_initial_tables.py       |  81 ++++++----
 .../d2198fd60b0e_added_result_run_id.py      |  16 +-
 src/insight/check.py                         |  35 ++---
 src/insight/database/schema.py               |   1 +
 src/insight/database/utils.py                |  35 +++--
 src/insight/metrics/__init__.py              |  25 ++-
 src/insight/metrics/base.py                  | 126 ++++++++-------
 src/insight/metrics/confidence_interval.py   |  78 ++++++----
 src/insight/metrics/metrics.py               |  95 ++++++++----
 src/insight/metrics/metrics_usage.py         |  18 +--
 src/insight/metrics/utils.py                 |  44 ++++--
 src/insight/plot.py                          | 128 ++++++++--------
 src/insight/py.typed                         |   2 +
 tests/test_database/test_db.py               |  22 ++-
 tests/test_metrics/test_check.py             |  46 ++++--
 .../test_metrics/test_confidence_interval.py |  23 ++-
 tests/test_metrics/test_metrics.py           | 145 ++++++++++--------
 tests/test_metrics/test_metrics_usage.py     |  60 ++++++--
 tests/test_metrics/test_plotting.py          |  48 +++---
 24 files changed, 642 insertions(+), 439 deletions(-)
 create mode 100644 src/insight/py.typed

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index bee1666c..87e9848e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -14,8 +14,16 @@ repos:
       - id: mixed-line-ending
       - id: requirements-txt-fixer
       - id: trailing-whitespace
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+  - repo: https://github.com/psf/black
+    rev: 22.3.0
+    hooks:
+      - id: black
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v0.961
+    rev: v1.5.1
     hooks:
       - id: mypy
         files: src
@@ -26,8 +34,3 @@ repos:
         args: [--install-types, --non-interactive]
         # Note that using the --install-types is problematic if running in
         # parallel as mutating the pre-commit env at runtime breaks cache.
-  - repo: https://github.com/pycqa/isort
-    rev: 5.12.0
-    hooks:
-      - id: isort
-...
diff --git a/pyproject.toml b/pyproject.toml
index 96552c9b..83ef0cea 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -90,7 +90,7 @@ dependencies = [
     write_to = "src/insight/version.py"

 [tool.setuptools.package-data]
-    insight = ["src/insight/fonts/SourceSansPro-Regular.ttf"]
+    insight = ["src/insight/fonts/SourceSansPro-Regular.ttf", "py.typed"]

 [tool.pytest.ini_options]
     junit_suite_name = "unit"
@@ -102,6 +102,8 @@ dependencies = [

 [tool.mypy]
     plugins = "numpy.typing.mypy_plugin, sqlalchemy.ext.mypy.plugin"
+    disallow_any_generics = false
+    disable_error_code = "type-arg"

 [[tool.mypy.overrides]]
     module = "pandas.*"
@@ -120,9 +122,13 @@ dependencies = [
     ignore_missing_imports = true

 [tool.isort]
+    profile = "black"
     multi_line_output = 3
     include_trailing_comma = true
     force_grid_wrap = 0
     use_parentheses = true
     ensure_newline_before_comments = true
-    line_length = 120
+    line_length = 100
+
+    [tool.black]
+    line-length = 100
diff --git a/src/insight/__init__.py b/src/insight/__init__.py
index ca2b3d7c..a17ce80a 100644
--- a/src/insight/__init__.py
+++ b/src/insight/__init__.py
@@ -1,4 +1,4 @@
 from . 
import metrics, plot from .check import ColumnCheck -__all__ = ['ColumnCheck', 'plot', 'metrics'] +__all__ = ["ColumnCheck", "plot", "metrics"] diff --git a/src/insight/alembic/env.py b/src/insight/alembic/env.py index ecae9355..166d78ba 100644 --- a/src/insight/alembic/env.py +++ b/src/insight/alembic/env.py @@ -63,13 +63,16 @@ def run_migrations_online() -> None: if url is None: raise ValueError("No sqlalchemy.url specified in config file") - config.set_main_option("sqlalchemy.url", url.format( - POSTGRES_USER=POSTGRES_USER, - POSTGRES_PASSWORD=POSTGRES_PASSWORD, - POSTGRES_HOST=POSTGRES_HOST, - POSTGRES_PORT=POSTGRES_PORT, - POSTGRES_DATABASE=POSTGRES_DATABASE - )) + config.set_main_option( + "sqlalchemy.url", + url.format( + POSTGRES_USER=POSTGRES_USER, + POSTGRES_PASSWORD=POSTGRES_PASSWORD, + POSTGRES_HOST=POSTGRES_HOST, + POSTGRES_PORT=POSTGRES_PORT, + POSTGRES_DATABASE=POSTGRES_DATABASE, + ), + ) connectable = engine_from_config( config.get_section(config.config_ini_section) or {}, prefix="sqlalchemy.", @@ -77,9 +80,7 @@ def run_migrations_online() -> None: ) with connectable.connect() as connection: - context.configure( - connection=connection, target_metadata=target_metadata - ) + context.configure(connection=connection, target_metadata=target_metadata) with context.begin_transaction(): context.run_migrations() diff --git a/src/insight/alembic/main.py b/src/insight/alembic/main.py index 0a9b1363..13c8cc0d 100644 --- a/src/insight/alembic/main.py +++ b/src/insight/alembic/main.py @@ -7,9 +7,10 @@ here = os.path.dirname(os.path.abspath(__file__)) args = sys.argv[1:] -args = args if len(args) > 0 else ["upgrade", "head"] # default +args = args if len(args) > 0 else ["upgrade", "head"] # default alembic_args = [ - '-c', os.path.join(here, 'alembic.ini'), + "-c", + os.path.join(here, "alembic.ini"), ] + args diff --git a/src/insight/alembic/versions/9aca5ae68ff5_add_initial_tables.py b/src/insight/alembic/versions/9aca5ae68ff5_add_initial_tables.py index 9ce62566..80211c87 100644 --- a/src/insight/alembic/versions/9aca5ae68ff5_add_initial_tables.py +++ b/src/insight/alembic/versions/9aca5ae68ff5_add_initial_tables.py @@ -9,7 +9,7 @@ from alembic import op # revision identifiers, used by Alembic. -revision = '9aca5ae68ff5' +revision = "9aca5ae68ff5" down_revision = None branch_labels = None depends_on = None @@ -17,46 +17,59 @@ def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### - op.create_table('dataset', - sa.Column('id', sa.INTEGER(), nullable=False), - sa.Column('name', sa.VARCHAR(length=200), nullable=False), - sa.Column('num_rows', sa.INTEGER(), nullable=True), - sa.Column('num_columns', sa.INTEGER(), nullable=True), - sa.Column('created_at', sa.TIMESTAMP(), nullable=True), - sa.PrimaryKeyConstraint('id') + op.create_table( + "dataset", + sa.Column("id", sa.INTEGER(), nullable=False), + sa.Column("name", sa.VARCHAR(length=200), nullable=False), + sa.Column("num_rows", sa.INTEGER(), nullable=True), + sa.Column("num_columns", sa.INTEGER(), nullable=True), + sa.Column("created_at", sa.TIMESTAMP(), nullable=True), + sa.PrimaryKeyConstraint("id"), ) - op.create_table('metric', - sa.Column('id', sa.INTEGER(), nullable=False), - sa.Column('name', sa.VARCHAR(length=100), nullable=False), - sa.Column('category', sa.VARCHAR(length=100), nullable=True), - sa.Column('created_at', sa.TIMESTAMP(), nullable=True), - sa.PrimaryKeyConstraint('id') + op.create_table( + "metric", + sa.Column("id", sa.INTEGER(), nullable=False), + sa.Column("name", sa.VARCHAR(length=100), nullable=False), + sa.Column("category", sa.VARCHAR(length=100), nullable=True), + sa.Column("created_at", sa.TIMESTAMP(), nullable=True), + sa.PrimaryKeyConstraint("id"), ) - op.create_table('version', - sa.Column('id', sa.INTEGER(), nullable=False), - sa.Column('name', sa.VARCHAR(length=50), nullable=True), - sa.Column('created_at', sa.TIMESTAMP(), nullable=True), - sa.PrimaryKeyConstraint('id') + op.create_table( + "version", + sa.Column("id", sa.INTEGER(), nullable=False), + sa.Column("name", sa.VARCHAR(length=50), nullable=True), + sa.Column("created_at", sa.TIMESTAMP(), nullable=True), + sa.PrimaryKeyConstraint("id"), ) - op.create_table('result', - sa.Column('id', sa.INTEGER(), nullable=False), - sa.Column('metric_id', sa.INTEGER(), nullable=True), - sa.Column('dataset_id', sa.INTEGER(), nullable=True), - sa.Column('version_id', sa.INTEGER(), nullable=True), - sa.Column('value', sa.FLOAT(), nullable=True), - sa.Column('created_at', sa.TIMESTAMP(), nullable=True), - sa.ForeignKeyConstraint(['dataset_id'], ['dataset.id'], ), - sa.ForeignKeyConstraint(['metric_id'], ['metric.id'], ), - sa.ForeignKeyConstraint(['version_id'], ['version.id'], ), - sa.PrimaryKeyConstraint('id') + op.create_table( + "result", + sa.Column("id", sa.INTEGER(), nullable=False), + sa.Column("metric_id", sa.INTEGER(), nullable=True), + sa.Column("dataset_id", sa.INTEGER(), nullable=True), + sa.Column("version_id", sa.INTEGER(), nullable=True), + sa.Column("value", sa.FLOAT(), nullable=True), + sa.Column("created_at", sa.TIMESTAMP(), nullable=True), + sa.ForeignKeyConstraint( + ["dataset_id"], + ["dataset.id"], + ), + sa.ForeignKeyConstraint( + ["metric_id"], + ["metric.id"], + ), + sa.ForeignKeyConstraint( + ["version_id"], + ["version.id"], + ), + sa.PrimaryKeyConstraint("id"), ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### - op.drop_table('result') - op.drop_table('version') - op.drop_table('metric') - op.drop_table('dataset') + op.drop_table("result") + op.drop_table("version") + op.drop_table("metric") + op.drop_table("dataset") # ### end Alembic commands ### diff --git a/src/insight/alembic/versions/d2198fd60b0e_added_result_run_id.py b/src/insight/alembic/versions/d2198fd60b0e_added_result_run_id.py index dba926ec..2ff658e1 100644 --- a/src/insight/alembic/versions/d2198fd60b0e_added_result_run_id.py +++ b/src/insight/alembic/versions/d2198fd60b0e_added_result_run_id.py @@ -9,25 +9,21 @@ from alembic import op # revision identifiers, used by Alembic. -revision = 'd2198fd60b0e' -down_revision = '9aca5ae68ff5' +revision = "d2198fd60b0e" +down_revision = "9aca5ae68ff5" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### - op.add_column('result', sa.Column('run_id', sa.VARCHAR(length=50), nullable=True, default=None)) - op.alter_column('version', 'name', - existing_type=sa.VARCHAR(length=50), - nullable=False) + op.add_column("result", sa.Column("run_id", sa.VARCHAR(length=50), nullable=True, default=None)) + op.alter_column("version", "name", existing_type=sa.VARCHAR(length=50), nullable=False) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### - op.alter_column('version', 'name', - existing_type=sa.VARCHAR(length=50), - nullable=True) - op.drop_column('result', 'run_id') + op.alter_column("version", "name", existing_type=sa.VARCHAR(length=50), nullable=True) + op.drop_column("result", "run_id") # ### end Alembic commands ### diff --git a/src/insight/check.py b/src/insight/check.py index 5db91b83..a2483012 100644 --- a/src/insight/check.py +++ b/src/insight/check.py @@ -1,5 +1,5 @@ +import typing as ty from abc import ABC, abstractmethod -from typing import cast import numpy as np import pandas as pd @@ -27,13 +27,13 @@ def infer_dtype(self, sr: pd.Series) -> pd.Series: n_empty_str = 0 if not pd.api.types.is_string_dtype(col) else sr.eq("").sum() # Try to convert it to numeric - if col.dtype.kind not in ("i", "u", "f") and col.dtype.kind != 'M': + if col.dtype.kind not in ("i", "u", "f") and col.dtype.kind != "M": col_num = pd.to_numeric(col, errors="coerce") if col_num.isna().sum() == n_nans + n_empty_str: col = col_num # Try to convert it to date - if col.dtype.kind == "O" or col.dtype.kind == 'M': + if col.dtype.kind == "O" or col.dtype.kind == "M": try: col_date = pd.to_datetime(col, errors="coerce") except TypeError: @@ -48,7 +48,7 @@ def infer_dtype(self, sr: pd.Series) -> pd.Series: elif out_dtype in ("i", "u", "f", "f8", "i8", "u8"): return pd.to_numeric(col, errors="coerce") - return cast(pd.Series, col.astype(out_dtype, errors="ignore")) + return ty.cast(pd.Series, col.astype(out_dtype, errors="ignore")) # type: ignore @abstractmethod def continuous(self, sr: pd.Series): @@ -83,9 +83,8 @@ class ColumnCheck(Check): Categorical threshold log multiplier. Default value is 2.5. 
""" - def __init__(self, - min_num_unique: int = 10, - ctl_mult: float = 2.5): + + def __init__(self, min_num_unique: int = 10, ctl_mult: float = 2.5): self.min_num_unique = min_num_unique self.ctl_mult = ctl_mult @@ -95,9 +94,9 @@ def continuous(self, sr: pd.Series) -> bool: """ sr = self.infer_dtype(sr) sr_dtype = str(sr.dtype) - if len(sr.unique()) >= max(self.min_num_unique, - self.ctl_mult * np.log(len(sr)))\ - and sr_dtype in ("float64", "int64"): + if len(sr.unique()) >= max( + self.min_num_unique, self.ctl_mult * np.log(len(sr)) + ) and sr_dtype in ("float64", "int64"): return True return False @@ -106,7 +105,9 @@ def categorical(self, sr: pd.Series) -> bool: if pd.api.types.is_categorical_dtype(sr) is True: return True sr = self.infer_dtype(sr) - if sr.dtype.kind == "M": # TODO: Need to implement ability to deal with dates well in metrics + if ( + sr.dtype.kind == "M" + ): # TODO: Need to implement ability to deal with dates well in metrics return False if not self.continuous(sr): @@ -119,14 +120,13 @@ def ordinal(self, sr: pd.Series) -> bool: Columns which are categorical in nature, but contain numbers/dates/bool are ordinal too. E.g. [2, 1, 1, 2, 7, 2, 1, 7] """ - if (pd.api.types.is_categorical_dtype(sr) is True - and sr.cat.ordered is True)\ - or pd.api.types.is_bool_dtype(sr) is True: + if ( + pd.api.types.is_categorical_dtype(sr) is True and sr.cat.ordered is True + ) or pd.api.types.is_bool_dtype(sr) is True: return True sr_inferred = self.infer_dtype(sr) - if sr_inferred.dtype in ("float64", "int64")\ - or sr_inferred.dtype.kind in ("M", "m"): + if sr_inferred.dtype in ("float64", "int64") or sr_inferred.dtype.kind in ("M", "m"): return True return False @@ -135,7 +135,6 @@ def affine(self, sr: pd.Series) -> bool: Continuous columns along with the columns of type DateTime and the Timedelta are affine """ - if self.continuous(sr) is True\ - or self.infer_dtype(sr).dtype.kind in ("M", "m"): + if self.continuous(sr) is True or self.infer_dtype(sr).dtype.kind in ("M", "m"): return True return False diff --git a/src/insight/database/schema.py b/src/insight/database/schema.py index fe392cb2..cbec19d4 100644 --- a/src/insight/database/schema.py +++ b/src/insight/database/schema.py @@ -5,6 +5,7 @@ Base = declarative_base() mapped_column = Column + class Dataset(Base): __tablename__ = "dataset" diff --git a/src/insight/database/utils.py b/src/insight/database/utils.py index 51f7050b..1ff70dbf 100644 --- a/src/insight/database/utils.py +++ b/src/insight/database/utils.py @@ -1,7 +1,7 @@ """Utils for fetching information from the backend DB.""" import os import re -import typing +import typing as ty import pandas as pd from sqlalchemy import create_engine @@ -10,7 +10,7 @@ import insight.database.schema as model -NamedModelType = typing.TypeVar('NamedModelType', model.Dataset, model.Metric, model.Version) +NamedModelType = ty.TypeVar("NamedModelType", model.Dataset, model.Metric, model.Version) _database_fail_note = "Failure to communicate with the database" @@ -25,7 +25,10 @@ def get_df(url_or_path: str): def get_df_id( - df_name: str, session: Session, num_rows: int = None, num_columns: int = None + df_name: str, + session: Session, + num_rows: ty.Optional[int] = None, + num_columns: ty.Optional[int] = None, ) -> int: """Get the id of a dataframe in the database. If it doesn't exist, create it. 
@@ -47,7 +50,7 @@ def get_df_id( return int(dataset.id) -def get_metric_id(metric: str, session: Session, category: str = None) -> int: +def get_metric_id(metric: str, session: Session, category: ty.Optional[str] = None) -> int: """Get the id of a metric in the database. If it doesn't exist, create it. Args: @@ -86,14 +89,14 @@ def get_version_id(version: str, session: Session) -> int: def get_object_from_db_by_name( - name: str, session: Session, model_cls: typing.Type[NamedModelType] -) -> typing.Union[NamedModelType, None]: + name: str, session: Session, model_cls: ty.Type[NamedModelType] +) -> ty.Union[NamedModelType, None]: """Get an object from the database by name. Args: name (str): The name of the object. session (Session): The database session. - model_cls (typing.Type[NamedModelType]): The class of the object. + model_cls (ty.Type[NamedModelType]): The class of the object. """ with session: result = session.execute( @@ -102,7 +105,7 @@ def get_object_from_db_by_name( return result -def get_session() -> typing.Optional[Session]: +def get_session() -> ty.Optional[Session]: """ If a database exists, returns a sessionmaker object. Else returns None. Returns: sessionmaker object that can be used to access the database. @@ -114,13 +117,15 @@ def get_session() -> typing.Optional[Session]: POSTGRES_PORT = os.environ.get("POSTGRES_PORT", "5432") POSTGRES_USER = os.environ.get("POSTGRES_USER", "postgres") POSTGRES_DATABASE = os.environ.get("POSTGRES_DATABASE", "postgres") - db_url = "postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@" \ - "{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DATABASE}".format( - POSTGRES_HOST=POSTGRES_HOST, - POSTGRES_PORT=POSTGRES_PORT, - POSTGRES_USER=POSTGRES_USER, - POSTGRES_PASSWORD=POSTGRES_PASSWORD, - POSTGRES_DATABASE=POSTGRES_DATABASE + db_url = ( + "postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@" + "{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DATABASE}".format( + POSTGRES_HOST=POSTGRES_HOST, + POSTGRES_PORT=POSTGRES_PORT, + POSTGRES_USER=POSTGRES_USER, + POSTGRES_PASSWORD=POSTGRES_PASSWORD, + POSTGRES_DATABASE=POSTGRES_DATABASE, + ) ) engine = create_engine(db_url, future=True) diff --git a/src/insight/metrics/__init__.py b/src/insight/metrics/__init__.py index 37c3369b..a1fc7dd8 100644 --- a/src/insight/metrics/__init__.py +++ b/src/insight/metrics/__init__.py @@ -15,7 +15,24 @@ ) from .metrics_usage import CorrMatrix, DiffCorrMatrix, OneColumnMap, TwoColumnMap -__all__ = ['OneColumnMetric', 'TwoColumnMetric', 'OneColumnMap', 'TwoColumnMap', 'CorrMatrix', 'DiffCorrMatrix', - 'CramersV', 'EarthMoversDistance', 'Mean', 'StandardDeviation', 'KendallTauCorrelation', 'Norm', 'TwoDataFrameMetric', - 'EarthMoversDistanceBinned', 'JensenShannonDivergence', 'KullbackLeiblerDivergence', 'HellingerDistance', - 'BhattacharyyaCoefficient', 'TotalVariationDistance'] +__all__ = [ + "OneColumnMetric", + "TwoColumnMetric", + "OneColumnMap", + "TwoColumnMap", + "CorrMatrix", + "DiffCorrMatrix", + "CramersV", + "EarthMoversDistance", + "Mean", + "StandardDeviation", + "KendallTauCorrelation", + "Norm", + "TwoDataFrameMetric", + "EarthMoversDistanceBinned", + "JensenShannonDivergence", + "KullbackLeiblerDivergence", + "HellingerDistance", + "BhattacharyyaCoefficient", + "TotalVariationDistance", +] diff --git a/src/insight/metrics/base.py b/src/insight/metrics/base.py index ce82dc1b..32802c93 100644 --- a/src/insight/metrics/base.py +++ b/src/insight/metrics/base.py @@ -1,7 +1,7 @@ """This module contains the base classes for the metrics used across 
synthesized.""" import os +import typing as ty from abc import ABC, abstractmethod -from typing import Any, Dict, Optional, Type, Union import pandas as pd @@ -23,9 +23,10 @@ class _Metric(ABC): """ An abstract base class from which more detailed metrics are derived. """ - name: Optional[str] = None - _registry: Dict[str, Type] = {} - _session: Optional[Session] = utils.get_session() if utils is not None else None + + name: ty.Optional[str] = None + _registry: ty.Dict[str, ty.Type] = {} + _session: ty.Optional[Session] = utils.get_session() if utils is not None else None def __init_subclass__(cls): if cls.name is not None and cls.name not in _Metric._registry: @@ -37,15 +38,15 @@ def __repr__(self): def __str__(self): return f"{self.name}" - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> ty.Dict[str, ty.Any]: """ Converts the metric into a dictionary representation of itself. Returns: a dictionary with key value pairs that represent the metric. """ - return {'name': self.name} + return {"name": self.name} @classmethod - def from_dict(cls, bluprnt: Dict[str, Any], check: Check = None): + def from_dict(cls, bluprnt: ty.Dict[str, ty.Any], check: ty.Optional[Check] = None): """ Given a dictionary, builds and returns a metric that corresponds to the specified metric with the given metric parameters. @@ -56,19 +57,21 @@ def from_dict(cls, bluprnt: Dict[str, Any], check: Check = None): 'metric_param2': param2 ...} """ - bluprnt_params = {key: val for key, val in bluprnt.items() if key != 'name'} + bluprnt_params = {key: val for key, val in bluprnt.items() if key != "name"} if check is not None: - bluprnt.update({'check': check}) + bluprnt.update({"check": check}) - metric = _Metric._registry[bluprnt['name']](**bluprnt_params) + metric = _Metric._registry[bluprnt["name"]](**bluprnt_params) return metric - def _add_to_database(self, - value, - dataset_name: str, - dataset_rows: int = None, - dataset_cols: int = None, - category: str = None): + def _add_to_database( + self, + value, + dataset_name: str, + dataset_rows: ty.Optional[int] = None, + dataset_cols: ty.Optional[int] = None, + category: ty.Optional[str] = None, + ): """ Adds the metric result to the database. The metric result should be specified as value. Args: @@ -85,7 +88,9 @@ def _add_to_database(self, run_id = os.getenv("RUN_ID") if model is None or utils is None: - raise ModuleNotFoundError("The database module is not available. Please install it using the command: pip install 'insight[db]'") + raise ModuleNotFoundError( + "The database module is not available. 
Please install it using the command: pip install 'insight[db]'" + ) if self._session is None: raise RuntimeError("Called a database function when no database exists.") @@ -93,15 +98,21 @@ def _add_to_database(self, if self.name is None: raise AttributeError("Every initializeable subclass of _Metric must have a name string") - if hasattr(value, 'item'): + if hasattr(value, "item"): value = value.item() with self._session as session: metric_id = utils.get_metric_id(self.name, session, category=category) version_id = utils.get_version_id(version, session) - dataset_id = utils.get_df_id(dataset_name, session, num_rows=dataset_rows, num_columns=dataset_cols) + dataset_id = utils.get_df_id( + dataset_name, session, num_rows=dataset_rows, num_columns=dataset_cols + ) result = model.Result( - metric_id=metric_id, dataset_id=dataset_id, version_id=version_id, value=value, run_id=run_id + metric_id=metric_id, + dataset_id=dataset_id, + version_id=version_id, + value=value, + run_id=run_id, ) session.add(result) session.commit() @@ -156,9 +167,7 @@ def check_column_types(cls, sr: pd.Series, check: Check = ColumnCheck()) -> bool def _compute_metric(self, sr: pd.Series): ... - def __call__(self, - sr: pd.Series, - dataset_name: str = None): + def __call__(self, sr: pd.Series, dataset_name: ty.Optional[str] = None): if not self.check_column_types(sr, self.check): value = None else: @@ -166,11 +175,13 @@ def __call__(self, if self._upload_to_database: dataset_name = "Series_" + str(sr.name) if dataset_name is None else dataset_name - self._add_to_database(value, - dataset_name, - dataset_rows=len(sr), - category='OneColumnMetric', - dataset_cols=1) + self._add_to_database( + value, + dataset_name, + dataset_rows=len(sr), + category="OneColumnMetric", + dataset_cols=1, + ) return value @@ -226,7 +237,7 @@ def check_column_types(cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = Col def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series): ... - def __call__(self, sr_a: pd.Series, sr_b: pd.Series, dataset_name: str = None): + def __call__(self, sr_a: pd.Series, sr_b: pd.Series, dataset_name: ty.Optional[str] = None): if not self.check_column_types(sr_a, sr_b, self.check): value = None else: @@ -234,11 +245,13 @@ def __call__(self, sr_a: pd.Series, sr_b: pd.Series, dataset_name: str = None): if self._upload_to_database: dataset_name = "Series_" + str(sr_a.name) if dataset_name is None else dataset_name - self._add_to_database(value, - dataset_name, - dataset_rows=len(sr_a), - category='TwoColumnMetric', - dataset_cols=1) + self._add_to_database( + value, + dataset_name, + dataset_rows=len(sr_a), + category="TwoColumnMetric", + dataset_cols=1, + ) return value @@ -259,7 +272,9 @@ class DataFrameMetric(_Metric): 3 """ - def __call__(self, df: pd.DataFrame, dataset_name: str = None) -> Union[pd.DataFrame, None]: + def __call__( + self, df: pd.DataFrame, dataset_name: ty.Optional[str] = None + ) -> ty.Union[pd.DataFrame, None]: result = self._compute_result(df) dataset_rows = df.shape[0] dataset_cols = df.shape[1] @@ -268,13 +283,16 @@ def __call__(self, df: pd.DataFrame, dataset_name: str = None) -> Union[pd.DataF dataset_name = df.attrs.get("name") # Explicit cast for mypy. 
if dataset_name is None: raise AttributeError( - "Must specify the name of the dataset name as a parameter to upload to database.") - - self._add_to_database(self.summarize_result(result), - dataset_name, - dataset_rows=dataset_rows, - dataset_cols=dataset_cols, - category='DataFrameMetric') + "Must specify the name of the dataset name as a parameter to upload to database." + ) + + self._add_to_database( + self.summarize_result(result), + dataset_name, + dataset_rows=dataset_rows, + dataset_cols=dataset_cols, + category="DataFrameMetric", + ) return result @abstractmethod @@ -308,10 +326,9 @@ class TwoDataFrameMetric(_Metric): """ - def __call__(self, - df_old: pd.DataFrame, - df_new: pd.DataFrame, - dataset_name: str = None) -> Union[pd.DataFrame, None]: + def __call__( + self, df_old: pd.DataFrame, df_new: pd.DataFrame, dataset_name: ty.Optional[str] = None + ) -> ty.Union[pd.DataFrame, None]: result = self._compute_result(df_old, df_new) dataset_rows = df_old.shape[0] dataset_cols = df_old.shape[1] @@ -320,13 +337,16 @@ def __call__(self, dataset_name = df_old.attrs.get("name") # Explicit cast for mypy. if dataset_name is None: raise AttributeError( - "Must specify the name of the dataset name as a parameter to upload to database.") - - self._add_to_database(self.summarize_result(result), - dataset_name, - dataset_cols=dataset_cols, - dataset_rows=dataset_rows, - category='TwoDataFrameMetrics') + "Must specify the name of the dataset name as a parameter to upload to database." + ) + + self._add_to_database( + self.summarize_result(result), + dataset_name, + dataset_cols=dataset_cols, + dataset_rows=dataset_rows, + category="TwoDataFrameMetrics", + ) return result @abstractmethod diff --git a/src/insight/metrics/confidence_interval.py b/src/insight/metrics/confidence_interval.py index b95edd86..a518a737 100644 --- a/src/insight/metrics/confidence_interval.py +++ b/src/insight/metrics/confidence_interval.py @@ -1,4 +1,4 @@ -from typing import NamedTuple, Tuple, Union, cast +import typing as ty import numpy as np import pandas as pd @@ -8,16 +8,18 @@ from .utils import bootstrap_binned_statistic, bootstrap_statistic -class ConfidenceInterval(NamedTuple): - limits: Tuple +class ConfidenceInterval(ty.NamedTuple): + limits: ty.Tuple level: float -def compute_bootstrap_interval(metric_cls_obj: Union[OneColumnMetric, TwoColumnMetric], - sr_a: pd.Series, - sr_b: pd.Series = None, - confidence_level: float = 0.95, - binned: bool = False) -> ConfidenceInterval: +def compute_bootstrap_interval( + metric_cls_obj: ty.Union[OneColumnMetric, TwoColumnMetric], + sr_a: pd.Series, + sr_b: ty.Optional[pd.Series] = None, + confidence_level: float = 0.95, + binned: bool = False, +) -> ConfidenceInterval: """Return a frequentist confidence interval for this metric obtained, via bootstrap resampling Args: @@ -33,25 +35,37 @@ def compute_bootstrap_interval(metric_cls_obj: Union[OneColumnMetric, TwoColumnM if isinstance(metric_cls_obj, OneColumnMetric): metric_value = metric_cls_obj(sr_a) - one_col_metric: OneColumnMetric = metric_cls_obj # Need explicit casting because of mypy bug/issue (#2608) + one_col_metric: OneColumnMetric = ( + metric_cls_obj # Need explicit casting because of mypy bug/issue (#2608) + ) samples = bootstrap_statistic((sr_a,), lambda x: one_col_metric(x)) else: - metric_value = metric_cls_obj(sr_a, cast(pd.Series, sr_b)) - two_col_metric: TwoColumnMetric = metric_cls_obj # Need explicit casting because of mypy bug/issue (#2608) + metric_value = metric_cls_obj(sr_a, ty.cast(pd.Series, sr_b)) 
+            two_col_metric: TwoColumnMetric = (
+                metric_cls_obj  # Need explicit casting because of mypy bug/issue (#2608)
+            )
         if not binned:
-            samples = bootstrap_statistic((sr_a, cast(pd.Series, sr_b)), lambda x, y: two_col_metric(x, y))
+            samples = bootstrap_statistic(
+                (sr_a, ty.cast(pd.Series, sr_b)), lambda x, y: two_col_metric(x, y)
+            )
         else:
-            samples = bootstrap_binned_statistic((sr_a, cast(pd.Series, sr_b)), lambda x, y: two_col_metric(x, y))
+            samples = bootstrap_binned_statistic(
+                (sr_a, ty.cast(pd.Series, sr_b)), lambda x, y: two_col_metric(x, y)
+            )
 
     percentiles = 100 * (1 - confidence_level) / 2, 100 * (1 - (1 - confidence_level) / 2)
     d1, d2 = np.percentile(samples, percentiles)
 
-    return ConfidenceInterval(limits=(2 * metric_value - d2, 2 * metric_value - d1), level=confidence_level)
+    return ConfidenceInterval(
+        limits=(2 * metric_value - d2, 2 * metric_value - d1), level=confidence_level
+    )
 
 
-def binomial_proportion_interval(success_prop: float,
-                                 num_samples: int,
-                                 method: str = 'clopper-pearson',
-                                 confidence_level: float = 0.95) -> ConfidenceInterval:
+def binomial_proportion_interval(
+    success_prop: float,
+    num_samples: int,
+    method: str = "clopper-pearson",
+    confidence_level: float = 0.95,
+) -> ConfidenceInterval:
     """
     Calculate an approximate confidence interval for a binomial proportion of a sample.
     Should only be used for binomial distribution
@@ -59,7 +73,7 @@ def binomial_proportion_interval(success_prop: float,
     Args:
         success_prop: Proportion of successes.
         num_samples: Sample size.
         method: Optional; The approximation method used to calculate the interval.
             One of 'normal', 'clopper-pearson', 'agresti-coull'.
         confidence_level: Level on which confidence interval is computed
@@ -71,36 +85,40 @@ def binomial_proportion_interval(success_prop: float,
     alpha = 1 - confidence_level
     z = norm.ppf(1 - alpha / 2)
 
-    if method == 'normal':
+    if method == "normal":
         low = success_prop - z * np.sqrt(success_prop * (1 - success_prop) / num_samples)
         high = success_prop + z * np.sqrt(success_prop * (1 - success_prop) / num_samples)
 
-    elif method == 'clopper-pearson':
+    elif method == "clopper-pearson":
         low = beta.ppf(alpha / 2, k, num_samples - k + 1)
         high = beta.ppf(1 - alpha / 2, k + 1, num_samples - k)
 
-    elif method == 'agresti-coull':
+    elif method == "agresti-coull":
         n_ = num_samples + z**2
         p_ = 1 / n_ * (k + z**2 / 2)
         low = p_ - z * np.sqrt(p_ * (1 - p_) / n_)
         high = p_ + z * np.sqrt(p_ * (1 - p_) / n_)
 
     else:
-        raise ValueError("'method' argument must be one of 'normal', 'clopper-pearson', 'agresti-coull'.")
+        raise ValueError(
+            "'method' argument must be one of 'normal', 'clopper-pearson', 'agresti-coull'."
+        )
 
     return ConfidenceInterval(limits=(low, high), level=confidence_level)
 
 
-def compute_binomial_interval(sr_a: pd.Series,
-                              sr_b: pd.Series,
-                              method: str = 'clopper-pearson',
-                              confidence_level: float = 0.95) -> ConfidenceInterval:
+def compute_binomial_interval(
+    sr_a: pd.Series,
+    sr_b: pd.Series,
+    method: str = "clopper-pearson",
+    confidence_level: float = 0.95,
+) -> ConfidenceInterval:
     """
     Calculate a confidence interval for this distance metric. 
 
     Args:
         sr_a: value of a binary variable
         sr_b: value of a binary variable
         method: Optional; default is 'clopper-pearson'
         confidence_level: Level on which confidence interval is computed
 
     Returns:
@@ -109,5 +127,7 @@ def compute_binomial_interval(sr_a: pd.Series,
     p = sr_a.mean()
     n = len(sr_a)
     interval = binomial_proportion_interval(p, n, method, confidence_level)
-    cinterval = ConfidenceInterval((interval.limits[0] - sr_b.mean(), interval.limits[1] - sr_b.mean()), interval.level)
+    cinterval = ConfidenceInterval(
+        (interval.limits[0] - sr_b.mean(), interval.limits[1] - sr_b.mean()), interval.level
+    )
     return cinterval
diff --git a/src/insight/metrics/metrics.py b/src/insight/metrics/metrics.py
index de17683c..3de3a633 100644
--- a/src/insight/metrics/metrics.py
+++ b/src/insight/metrics/metrics.py
@@ -1,5 +1,5 @@
 """This module contains various metrics used across synthesized."""
-from typing import Any, Dict, Optional, Sequence, Union, cast
+import typing as ty
 
 import numpy as np
 import pandas as pd
@@ -21,8 +21,8 @@ def check_column_types(cls, sr: pd.Series, check: Check = ColumnCheck()):
         return True
 
     def _compute_metric(self, sr: pd.Series):
-        mean = np.nanmean(sr.values - np.array(0, dtype=sr.dtype))
-        return mean + np.array(0, dtype=sr.dtype)
+        mean = np.nanmean(sr.values - np.array(0, dtype=sr.dtype))  # type: ignore
+        return mean + np.array(0, dtype=sr.dtype)  # type: ignore
 
 
 class StandardDeviation(OneColumnMetric):
@@ -32,9 +32,9 @@ def __init__(self, check: Check = ColumnCheck(), remove_outliers: float = 0.0):
         super().__init__(check)
         self.remove_outliers = remove_outliers
 
-    def to_dict(self) -> Dict[str, Any]:
+    def to_dict(self) -> ty.Dict[str, ty.Any]:
         dictionary = super().to_dict()
-        dictionary.update({'remove_outliers': self.remove_outliers})
+        dictionary.update({"remove_outliers": self.remove_outliers})
         return dictionary
 
     @classmethod
@@ -45,13 +45,15 @@ def check_column_types(cls, sr: pd.Series, check: Check = ColumnCheck()):
 
     def _compute_metric(self, sr: pd.Series):
         values = np.sort(sr.values)  # type: ignore
-        values = values[int(len(sr) * self.remove_outliers):int(len(sr) * (1.0 - self.remove_outliers))]
+        values = values[
+            int(len(sr) * self.remove_outliers) : int(len(sr) * (1.0 - self.remove_outliers))
+        ]
         trimmed_sr = pd.Series(values, name=sr.name)
 
         affine_mean = Mean(upload_to_database=False)
         d = trimmed_sr - affine_mean(trimmed_sr)
         u = d / np.array(1, dtype=d.dtype)
-        s = np.sqrt(np.sum(u ** 2))
+        s = np.sqrt(np.sum(u**2))
 
         return s * np.array(1, dtype=d.dtype)
 
@@ -100,6 +102,7 @@ class CramersV(TwoColumnMetric):
     The statistic ranges from 0 to 1, where a value of 0 indicates there is no association between the variables,
     and 1 indicates maximal association (i.e. one variable is completely determined by the other).
     """
+
    name = "cramers_v"
 
     @classmethod
@@ -126,9 +129,7 @@ def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series):
         col = pd.Series(data=col, index=table_orig.columns)
         itab = np.outer(row, col)
 
-        probs = pd.DataFrame(
-            data=itab, index=table_orig.index, columns=table_orig.columns
-        )
+        probs = pd.DataFrame(data=itab, index=table_orig.index, columns=table_orig.columns)
 
         fit = table.sum() * probs
         expected = fit.to_numpy()
@@ -147,10 +148,13 @@ class EarthMoversDistance(TwoColumnMetric):
     The statistic ranges from 0 to 1, where a value of 0 indicates the two variables follow identical
     distributions, and a value of 1 indicates they follow completely different distributions. 
""" + name = "earth_movers_distance" @classmethod - def check_column_types(cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck()) -> bool: + def check_column_types( + cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck() + ) -> bool: if not check.categorical(sr_a) or not check.categorical(sr_b): return False return True @@ -191,10 +195,13 @@ class KullbackLeiblerDivergence(TwoColumnMetric): The statistic ranges from 0 to 1, where a value of 0 indicates the two variables follow identical distributions, and a value of 1 indicates they follow completely different distributions. """ + name = "kullback_leibler_divergence" @classmethod - def check_column_types(cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck()) -> bool: + def check_column_types( + cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck() + ) -> bool: if check.continuous(sr_a) and check.continuous(sr_b): return True if check.categorical(sr_a) and check.categorical(sr_b): @@ -219,10 +226,13 @@ class JensenShannonDivergence(TwoColumnMetric): The statistic ranges from 0 to 1, where a value of 0 indicates the two variables follow identical distributions, and a value of 1 indicates they follow completely different distributions. """ + name = "jensen_shannon_divergence" @classmethod - def check_column_types(cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck()) -> bool: + def check_column_types( + cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck() + ) -> bool: if check.continuous(sr_a) and check.continuous(sr_b): return True if check.categorical(sr_a) and check.categorical(sr_b): @@ -247,10 +257,13 @@ class HellingerDistance(TwoColumnMetric): The statistic ranges from 0 to 1, where a value of 0 indicates the two variables follow identical distributions, and a value of 1 indicates they follow completely different distributions. """ + name = "hellinger_distance" @classmethod - def check_column_types(cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck()) -> bool: + def check_column_types( + cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck() + ) -> bool: if check.continuous(sr_a) and check.continuous(sr_b): return True if check.categorical(sr_a) and check.categorical(sr_b): @@ -266,7 +279,9 @@ def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series): The hellinger distance between sr_a and sr_b. 
""" (p, q) = zipped_hist((sr_a, sr_b), check=self.check) - return np.linalg.norm(np.sqrt(cast(pd.Series, p)) - np.sqrt(cast(pd.Series, q))) / np.sqrt(2) + return np.linalg.norm( + np.sqrt(ty.cast(pd.Series, p)) - np.sqrt(ty.cast(pd.Series, q)) + ) / np.sqrt(2) class Norm(TwoColumnMetric): @@ -294,19 +309,22 @@ class Norm(TwoColumnMetric): >>> norm(sr1, sr2) 0.0 """ + name = "norm" def __init__(self, check: Check = ColumnCheck(), ord: float = 2.0): super().__init__(check) self.ord = ord - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> ty.Dict[str, ty.Any]: dictionary = super().to_dict() - dictionary.update({'ord': self.ord}) + dictionary.update({"ord": self.ord}) return dictionary @classmethod - def check_column_types(cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck()) -> bool: + def check_column_types( + cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck() + ) -> bool: if check.continuous(sr_a) and check.continuous(sr_b): return True if check.categorical(sr_a) and check.categorical(sr_b): @@ -323,7 +341,7 @@ def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series): """ (p, q) = zipped_hist((sr_a, sr_b), check=self.check) if p is not None and q is not None: - return np.linalg.norm(cast(pd.Series, p) - cast(pd.Series, q), ord=self.ord) # type: ignore + return np.linalg.norm(ty.cast(pd.Series, p) - ty.cast(pd.Series, q), ord=self.ord) # type: ignore return None @@ -334,7 +352,7 @@ class EarthMoversDistanceBinned(TwoColumnMetric): an ordinal range. If the latter, they must have equal binning. Args: - bin_edges: Optional; If given, this must be an iterable of bin edges for x and y, + bin_edges: ty.Optional; If given, this must be an iterable of bin edges for x and y, i.e. the output of np.histogram_bin_edges. If None, then it is assumed that the data represent counts of nominal categories, with no meaningful distance between bins. @@ -371,21 +389,26 @@ class EarthMoversDistanceBinned(TwoColumnMetric): 0.06876915155978315 """ + name = "earth_movers_distance_binned" - def __init__(self, - check: Check = ColumnCheck(), - bin_edges: Optional[Union[pd.Series, Sequence, np.ndarray]] = None): + def __init__( + self, + check: Check = ColumnCheck(), + bin_edges: ty.Optional[ty.Union[pd.Series, ty.Sequence, np.ndarray]] = None, + ): super().__init__(check) self.bin_edges = bin_edges - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> ty.Dict[str, ty.Any]: dictionary = super().to_dict() - dictionary.update({'bin_edges': self.bin_edges}) + dictionary.update({"bin_edges": self.bin_edges}) return dictionary @classmethod - def check_column_types(cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck()) -> bool: + def check_column_types( + cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck() + ) -> bool: # Histograms can appear to be continuous even if they are categorical in nature return True @@ -400,23 +423,23 @@ def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series): The earth mover's distance. """ if sr_a.sum() == 0 and sr_b.sum() == 0: - return 0. + return 0.0 elif sr_a.sum() == 0 or sr_b.sum() == 0: - return 1. 
+            return 1.0
 
         # normalise counts for consistency with scipy.stats.wasserstein
-        with np.errstate(divide='ignore', invalid='ignore'):
+        with np.errstate(divide="ignore", invalid="ignore"):
             x = np.nan_to_num(sr_a / sr_a.sum())
             y = np.nan_to_num(sr_b / sr_b.sum())
 
         if self.bin_edges is None:
             # if bins not given, histograms are assumed to be counts of nominal categories,
             # and therefore distances between bins are meaningless. Set all distances to
             # unity to model this.
             distance = 0.5 * np.sum(np.abs(x.astype(np.float64) - y.astype(np.float64)))
         else:
             # otherwise, use pair-wise euclidean distances between bin centers for scale data
-            bin_centers = self.bin_edges[:-1] + np.diff(self.bin_edges) / 2.
+            bin_centers = self.bin_edges[:-1] + np.diff(self.bin_edges) / 2.0
             distance = wasserstein_distance(bin_centers, bin_centers, u_weights=x, v_weights=y)
 
         return distance
@@ -428,6 +451,7 @@ class BhattacharyyaCoefficient(TwoColumnMetric):
     and 0 indicates lack of overlap between the distributions. Bhattacharyya coefficient is closely related to
     Hellinger distance.
     """
+
     name = "bhattacharyya_coefficient"
 
     @classmethod
@@ -440,7 +464,7 @@ def check_column_types(cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = Col
 
     def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series):
         (p, q) = zipped_hist((sr_a, sr_b), check=self.check)
-        return np.sum(np.sqrt(cast(pd.Series, p) * cast(pd.Series, q)))
+        return np.sum(np.sqrt(ty.cast(pd.Series, p) * ty.cast(pd.Series, q)))
 
 
 class TotalVariationDistance(TwoColumnMetric):
@@ -449,10 +473,13 @@ class TotalVariationDistance(TwoColumnMetric):
     The statistic ranges from 0 to 1, where a value of 0 indicates the two variables follow identical
     distributions, and a value of 1 indicates they follow completely different distributions. 
""" + name = "total_variation_distance" @classmethod - def check_column_types(cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck()) -> bool: + def check_column_types( + cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck() + ) -> bool: if check.continuous(sr_a) and check.continuous(sr_b): return True if check.categorical(sr_a) and check.categorical(sr_b): @@ -461,4 +488,4 @@ def check_column_types(cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = Col def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series): (p, q) = zipped_hist((sr_a, sr_b), check=self.check) - return np.linalg.norm(cast(pd.Series, p) - cast(pd.Series, q), ord=1) / 2 + return np.linalg.norm(ty.cast(pd.Series, p) - ty.cast(pd.Series, q), ord=1) / 2 diff --git a/src/insight/metrics/metrics_usage.py b/src/insight/metrics/metrics_usage.py index 8bf4bac6..f0740da3 100644 --- a/src/insight/metrics/metrics_usage.py +++ b/src/insight/metrics/metrics_usage.py @@ -1,5 +1,5 @@ +import typing as ty from itertools import permutations -from typing import Union import numpy as np import pandas as pd @@ -27,14 +27,10 @@ def __init__(self, metric: OneColumnMetric): def _compute_result(self, df: pd.DataFrame) -> pd.DataFrame: columns_map = { - col: self._metric( - df[col], dataset_name=df.attrs.get("name", "") + f"_{col}" - ) + col: self._metric(df[col], dataset_name=df.attrs.get("name", "") + f"_{col}") for col in df.columns } - result = pd.DataFrame( - data=columns_map.values(), index=df.columns, columns=["metric_val"] - ) + result = pd.DataFrame(data=columns_map.values(), index=df.columns, columns=["metric_val"]) result.name = self._metric.name return result @@ -82,7 +78,7 @@ def summarize_result(self, result: pd.DataFrame): Args: result: the result of the metric computation. """ - return result.abs().max().max() # max().max() = max in each col -> max across cols + return result.abs().max().max() # max().max() = max in each col -> max across cols def __init__(self, metric: TwoColumnMetric): self._corr_matrix = CorrMatrix(metric) @@ -90,7 +86,7 @@ def __init__(self, metric: TwoColumnMetric): def _compute_result( self, df_old: pd.DataFrame, df_new: pd.DataFrame - ) -> Union[pd.DataFrame, None]: + ) -> ty.Union[pd.DataFrame, None]: corr_matrix_old = self._corr_matrix(df=df_old) corr_matrix_new = self._corr_matrix(df=df_new) @@ -116,9 +112,7 @@ def __init__(self, metric: TwoColumnMetric): self._metric = metric self.name = f"{metric.name}_map" - def _compute_result( - self, df_old: pd.DataFrame, df_new: pd.DataFrame - ) -> pd.DataFrame: + def _compute_result(self, df_old: pd.DataFrame, df_new: pd.DataFrame) -> pd.DataFrame: columns_map = { col: self._metric( df_old[col], diff --git a/src/insight/metrics/utils.py b/src/insight/metrics/utils.py index b9da0c46..19ad2bdc 100644 --- a/src/insight/metrics/utils.py +++ b/src/insight/metrics/utils.py @@ -1,4 +1,4 @@ -from typing import Callable, Optional, Tuple, Union +import typing as ty import numpy as np import pandas as pd @@ -7,28 +7,30 @@ def zipped_hist( - data: Tuple[pd.Series, ...], + data: ty.Tuple[pd.Series, ...], check: Check = ColumnCheck(), - bin_edges: Optional[np.ndarray] = None, + bin_edges: ty.Optional[np.ndarray] = None, normalize: bool = True, ret_bins: bool = False, -) -> Union[Tuple[pd.Series, ...], Tuple[Tuple[pd.Series, ...], Optional[np.ndarray]]]: +) -> ty.Union[ + ty.Tuple[pd.Series, ...], ty.Tuple[ty.Tuple[pd.Series, ...], ty.Optional[np.ndarray]] +]: """Bins a tuple of series' and returns the aligned histograms. 
 
     Args:
         data (ty.Tuple[pd.Series, ...]):
            A tuple consisting of the series' to be binned. All series' must have the same dtype.
         bin_edges (ty.Optional[np.ndarray], optional):
             Bin edges to bin continuous data by. Defaults to None.
         normalize (bool, optional):
             Normalize the histograms, turning them into pdfs. Defaults to True.
         ret_bins (bool, optional):
             Returns the bin edges used in the histogram. Defaults to False.
         distr_type (ty.Optional[str]):
             The type of distribution of the target attribute. Can be "categorical" or "continuous".
             If None the type of distribution is inferred based on the data in the column.
             Defaults to None.
 
     Returns:
         ty.Union[ty.Tuple[np.ndarray, ...], ty.Tuple[ty.Tuple[np.ndarray, ...], ty.Optional[np.ndarray]]]:
             A tuple of np.ndarrays consisting of each histogram for the input data.
             Additionally, returns bins if ret_bins is True.
     """
@@ -55,7 +57,7 @@ def zipped_hist(
     space = joint.unique()
 
     dicts = [sr.value_counts(normalize=normalize) for sr in data]
-    hists = [np.array([d.get(val, 0) for val in space]) for d in dicts]
+    hists = [np.array([d.get(val, 0) for val in space]) for d in dicts]  # type: ignore[arg-type]
 
     ps = [pd.Series(hist) for hist in hists]
 
@@ -65,16 +67,21 @@ def zipped_hist(
     return tuple(ps)
 
 
-def bootstrap_statistic(data: Union[Tuple[pd.Series], Tuple[pd.Series, pd.Series]],
-                        statistic: Union[Callable[[pd.Series, pd.Series], float], Callable[[pd.Series], float]],
-                        n_samples: int = 1000, sample_size=None) -> np.ndarray:
+def bootstrap_statistic(
+    data: ty.Union[ty.Tuple[pd.Series], ty.Tuple[pd.Series, pd.Series]],
+    statistic: ty.Union[
+        ty.Callable[[pd.Series, pd.Series], float], ty.Callable[[pd.Series], float]
+    ],
+    n_samples: int = 1000,
+    sample_size=None,
+) -> np.ndarray:
     """
     Compute the samples of a statistic estimate using the bootstrap method.
 
     Args:
         data: Data on which to compute the statistic.
         statistic: Function that computes the statistic.
         n_samples: Optional; Number of bootstrap samples to perform.
 
     Returns:
         The bootstrap samples.
@@ -92,15 +99,18 @@ def get_sample_idx(x):
     return statistic_samples
 
 
-def bootstrap_binned_statistic(data: Tuple[pd.Series, pd.Series], statistic: Callable[[pd.Series, pd.Series], float],
-                               n_samples: int = 1000) -> np.ndarray:
+def bootstrap_binned_statistic(
+    data: ty.Tuple[pd.Series, pd.Series],
+    statistic: ty.Callable[[pd.Series, pd.Series], float],
+    n_samples: int = 1000,
+) -> np.ndarray:
     """
     Compute the samples of a binned statistic estimate using the bootstrap method.
 
     Args:
         data: Data for which to compute the statistic.
         statistic: Function that computes the statistic.
         n_samples: Optional; Number of bootstrap samples to perform.
 
     Returns:
         The bootstrap samples. 
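These two helpers are what compute_bootstrap_interval in confidence_interval.py drives. A short usage sketch, with made-up data rather than anything from the test suite:

    import numpy as np
    import pandas as pd
    from insight.metrics.utils import bootstrap_statistic

    # Resample the mean of a single series 1000 times (values are illustrative).
    sr = pd.Series(np.random.normal(0.0, 1.0, size=500))
    samples = bootstrap_statistic((sr,), lambda x: float(x.mean()), n_samples=1000)
    low, high = np.percentile(samples, [2.5, 97.5])  # a 95% percentile interval
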
@@ -108,7 +118,7 @@ def bootstrap_binned_statistic(data: Tuple[pd.Series, pd.Series], statistic: Cal
 
     statistic_samples = np.empty(n_samples)
 
-    with np.errstate(divide='ignore', invalid='ignore'):
+    with np.errstate(divide="ignore", invalid="ignore"):
         p_x = np.nan_to_num(data[0] / data[0].sum())
         p_y = np.nan_to_num(data[1] / data[1].sum())
 
diff --git a/src/insight/plot.py b/src/insight/plot.py
index 3950833e..fa17a2c6 100644
--- a/src/insight/plot.py
+++ b/src/insight/plot.py
@@ -2,8 +2,8 @@
 import logging
 import math
 import os
+import typing as ty
 import warnings
-from typing import Any, Dict, List, Tuple, Union
 
 import matplotlib as mpl
 import matplotlib.pyplot as plt
@@ -55,7 +55,7 @@ def set_plotting_style():
     )
 
 
-def obtain_figure(ax: Axes = None, figsize: Tuple[int, int] = DEFAULT_FIGSIZE):
+def obtain_figure(ax: Axes = None, figsize: ty.Tuple[int, int] = DEFAULT_FIGSIZE):
     """
     Obtains the figure that is associated with the passed in Axes object. If no ax object is specified,
     generates a new figure along with a new ax object. Returns the figure and the ax as a tuple.
@@ -72,30 +72,30 @@ def axes_grid(
-    col_titles: List[str],
-    row_titles: List[str],
-    ax: Union[Axes, SubplotBase] = None,
+    col_titles: ty.List[str],
+    row_titles: ty.List[str],
+    ax: ty.Union[Axes, SubplotBase] = None,
     sharey: bool = True,
-    wspace: float = None,
-    hspace: float = None,
-    height_ratios: float = None,
-    width_ratios: float = None,
+    wspace: ty.Optional[float] = None,
+    hspace: ty.Optional[float] = None,
+    height_ratios: ty.Optional[float] = None,
+    width_ratios: ty.Optional[float] = None,
 ):
     """
-    Subdivides the given axes into a grid len(col_titles) by len(row_titles) and labels each section.
-    Args:
-        col_titles: Title for each column.
-        row_titles: Title for each row.
-        ax: The axes to subdivide.
-        sharey: If all the figures should share the same y-axis.
-        wspace: Horizontal space between the figures.
-        hspace: Vertical space between the figures.
-        height_ratios: A list that details the relative height of each section.
-        width_ratios: A list that details the relative height of each section.
-
-    Returns:
-        List of the newly generated axes.
-    """
+        Subdivides the given axes into a grid len(col_titles) by len(row_titles) and labels each section.
+        Args:
+            col_titles: Title for each column.
+            row_titles: Title for each row.
+            ax: The axes to subdivide.
+            sharey: If all the figures should share the same y-axis.
+            wspace: Horizontal space between the figures.
+            hspace: Vertical space between the figures.
+            height_ratios: A list that details the relative height of each section.
+            width_ratios: A list that details the relative height of each section.
+
+        Returns:
+            List of the newly generated axes. 
+ """ cols = len(col_titles) rows = len(row_titles) assert cols > 0 and rows > 0 @@ -104,10 +104,16 @@ def axes_grid( ax.set_axis_off() sp_spec = ax.get_subplotspec() - sgs = sp_spec.subgridspec(rows, cols, wspace=wspace, hspace=hspace, height_ratios=height_ratios, - width_ratios=width_ratios) + sgs = sp_spec.subgridspec( + rows, + cols, + wspace=wspace, + hspace=hspace, + height_ratios=height_ratios, + width_ratios=width_ratios, + ) fig = ax.figure - col_axes: List[mpl.axes.Axes] = list() + col_axes: ty.List[mpl.axes.Axes] = list() ax = fig.add_subplot(sgs[:, 0]) ax.set_title(col_titles[0]) @@ -174,7 +180,7 @@ def adjust_tick_labels(ax: Axes): ax.tick_params("y", length=3, width=1, which="major", color="#D7E0FE") -def text_only(text: str, ax: Union[Axes, SubplotBase] = None) -> Figure: +def text_only(text: str, ax: ty.Union[Axes, SubplotBase] = None) -> Figure: """ Plots the given text over the provided ax. If no ax is provided, generates a new ax. Args: @@ -240,11 +246,11 @@ def cross_table(counts: pd.DataFrame, title: str, ax: Axes = None) -> Figure: def cross_tables( - df_test: pd.DataFrame, - df_synth: pd.DataFrame, - col_a: str, - col_b: str, - figsize: Tuple[float, float] = (15, 11), + df_test: pd.DataFrame, + df_synth: pd.DataFrame, + col_a: str, + col_b: str, + figsize: ty.Tuple[float, float] = (15, 11), ) -> Figure: """ Plots two cross tables for two datasets for easier comparison of the datasets. @@ -292,7 +298,7 @@ def cross_tables( return f -def _get_color_scheme(color_scheme, num_cols) -> List[str]: +def _get_color_scheme(color_scheme, num_cols) -> ty.List[str]: """ Completes the color scheme based on the current matplotlib style. If the color scheme has enough colors to draw the specified number of columns, returns the color_scheme unmodified. @@ -310,7 +316,7 @@ def _get_color_scheme(color_scheme, num_cols) -> List[str]: return color_scheme -def _get_series_names(source_names, num_cols) -> List[str]: +def _get_series_names(source_names, num_cols) -> ty.List[str]: """ Gives names to the unnamed columns. If source names is less than the number of columns, names each column as unnamed with an index. @@ -322,18 +328,18 @@ def _get_series_names(source_names, num_cols) -> List[str]: A list of strings that represents the names of the serieses. """ if source_names is None: - source_names = ["orig", "synth"] + [f'unnamed{i}' for i in range(num_cols - 2)] + source_names = ["orig", "synth"] + [f"unnamed{i}" for i in range(num_cols - 2)] elif len(source_names) < num_cols: - source_names += [f'unnamed{i}' for i in range(num_cols - 2)] + source_names += [f"unnamed{i}" for i in range(num_cols - 2)] return source_names def categorical( - cols: List[pd.Series], - sample_size=10_000, - ax: Union[Axes, SubplotBase] = None, - color_scheme: List[str] = None, - series_source_names: List[str] = None + cols: ty.List[pd.Series], + sample_size=10_000, + ax: ty.Optional[ty.Union[Axes, SubplotBase]] = None, + color_scheme: ty.Optional[ty.List[str]] = None, + series_source_names: ty.Optional[ty.List[str]] = None, ) -> Figure: """ Plots the categorical distribution of the specified series side by side. 
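For reference, a hedged usage sketch of the reformatted categorical() signature above; the series contents are invented for illustration:

    import numpy as np
    import pandas as pd
    from insight import plot

    orig = pd.Series(np.random.choice(["A", "B", "C"], size=1000), name="grade")
    synth = pd.Series(np.random.choice(["A", "B", "C"], size=1000), name="grade")
    fig = plot.categorical([orig, synth], series_source_names=["orig", "synth"])
    fig.savefig("grade_distributions.png")
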
@@ -358,7 +364,7 @@ def categorical( for i, col in enumerate(cols): cols[i] = col.dropna() if len(cols[i]) == 0: - return text_only(f'Column at index {i} is either empty of full of Na\'s') + return text_only(f"Column at index {i} is either empty of full of Na's") df_cols.append(pd.DataFrame(cols[i])) sample_size = min(sample_size, min([len(col) for col in cols])) @@ -377,7 +383,7 @@ def categorical( alpha=0.7, palette=color_palette, ax=ax, - ec='#ffffff' + ec="#ffffff", ) adjust_tick_labels(ax) @@ -389,10 +395,10 @@ def categorical( def continuous_column( col: pd.Series, - col_name: str = None, - color: str = None, - kde_kws: Dict[Any, Any] = None, - ax: Axes = None + col_name: ty.Optional[str] = None, + color: ty.Optional[str] = None, + kde_kws: ty.Optional[ty.Dict[ty.Any, ty.Any]] = None, + ax: ty.Optional[Axes] = None, ) -> Figure: """Plots a pdf of the given continuous series. @@ -407,7 +413,7 @@ def continuous_column( A figure with a pdf of the series plotted on top of it. """ kde_kws = {} if kde_kws is None else kde_kws - col_name = col.name if col_name is None else col_name + label = col.name if col_name is None else col_name fig, ax = obtain_figure(ax) try: @@ -417,7 +423,7 @@ def continuous_column( lw=1, alpha=0.5, fill=True, - label=col_name, + label=label, ax=ax, **kde_kws, ) @@ -428,7 +434,7 @@ def continuous_column( alpha=0.5, fill=True, color=color, - label=col_name, + label=label, ax=ax, **kde_kws, ) @@ -439,12 +445,12 @@ def continuous_column( def continuous( - cols: List[pd.Series], + cols: ty.List[pd.Series], remove_outliers: float = 0.0, sample_size=10_000, - ax: Union[Axes, SubplotBase] = None, - color_scheme: List[str] = None, - series_source_names: List[str] = None + ax: ty.Optional[ty.Union[Axes, SubplotBase]] = None, + color_scheme: ty.Optional[ty.List[str]] = None, + series_source_names: ty.Optional[ty.List[str]] = None, ) -> Figure: """ plot the pdfs of all the serieses specified by cols. @@ -469,9 +475,9 @@ def continuous( percentiles = [remove_outliers * 100.0 / 2, 100 - remove_outliers * 100.0 / 2] for i, col in enumerate(cols): - cols[i] = pd.to_numeric(col.dropna(), errors='coerce').dropna() + cols[i] = pd.to_numeric(col.dropna(), errors="coerce").dropna() if len(cols[i]) == 0: - return text_only(f'Column at index {i} is empty of full of Na\'s.') + return text_only(f"Column at index {i} is empty of full of Na's.") cols[i] = cols[i].sample(min(sample_size, len(cols[i]))) start, end = np.percentile(cols[0], percentiles) @@ -481,7 +487,7 @@ def continuous( for i, col in enumerate(cols): cols[i] = col[(start <= col) & (col <= end)] if len(cols[i]) == 0: - return text_only(f'Column at index {i} is out of range of the first column.') + return text_only(f"Column at index {i} is out of range of the first column.") if not all([col.nunique() >= 2 for col in cols]): kde_kws = {} @@ -500,14 +506,14 @@ def continuous( def dataset( - dfs: List[pd.DataFrame], + dfs: ty.List[pd.DataFrame], columns=None, remove_outliers: float = 0.0, - figsize: Tuple[float, float] = None, + figsize: ty.Optional[ty.Tuple[float, float]] = None, figure_cols: int = 2, sample_size: int = 10_000, max_categories: int = 10, - check=ColumnCheck() + check=ColumnCheck(), ) -> Figure: """ Plot the columns of all the data-frames that are passed in. 
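And likewise for dataset(), which fans out over every column of the given frames; a small sketch with invented frames:

    import pandas as pd
    from insight import plot

    df_real = pd.DataFrame({"age": [23, 31, 45, 52], "height": [165.0, 180.0, 172.0, 169.0]})
    df_synth = pd.DataFrame({"age": [25, 29, 47, 50], "height": [163.0, 178.0, 175.0, 170.0]})
    fig = plot.dataset([df_real, df_synth], figure_cols=2)
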
@@ -532,7 +538,7 @@ def dataset( figure_rows = math.ceil(len(columns) / figure_cols) figsize = (6 * figure_cols + 2, 5 * figure_rows + 2) if figsize is None else figsize fig = plt.figure(figsize=figsize) - fig.suptitle('Distributions') + fig.suptitle("Distributions") if len(columns) == 0: return fig diff --git a/src/insight/py.typed b/src/insight/py.typed new file mode 100644 index 00000000..ff0c4e17 --- /dev/null +++ b/src/insight/py.typed @@ -0,0 +1,2 @@ +# This must be at the top of the py.typed file +partial diff --git a/tests/test_database/test_db.py b/tests/test_database/test_db.py index 3b101195..a16ccfdb 100644 --- a/tests/test_database/test_db.py +++ b/tests/test_database/test_db.py @@ -71,7 +71,7 @@ def column(dataset): """ Gives a column from the dataset that is used for testing. """ - return dataset['age'] + return dataset["age"] def verify_results_table(session, result): @@ -81,7 +81,7 @@ def verify_results_table(session, result): results_table = session.execute(text("SELECT * FROM Result")).fetchall()[0] assert results_table[0] == results_table[1] == results_table[2] == results_table[3] == 1 assert isclose(results_table[4], result) - assert datetime.strptime(results_table[5], '%Y-%m-%d %H:%M:%S').date() == date.today() + assert datetime.strptime(results_table[5], "%Y-%m-%d %H:%M:%S").date() == date.today() def verify_dataset_table(session, table_name, num_rows, num_cols): @@ -119,7 +119,7 @@ def test_one_column_metric_queries_default_params(db_session, column): metric(column) verify_results_table(db_session, metric(column)) verify_dataset_table(db_session, "Series_age", len(column), 1) - verify_metric_table(db_session, 'mean', 'OneColumnMetric') + verify_metric_table(db_session, "mean", "OneColumnMetric") verify_version_table(db_session, "Unversioned") @@ -148,7 +148,9 @@ def test_dataframe_queries_default_params(db_session, dataset): metric = metrics.OneColumnMap(metrics.Mean(upload_to_database=False)) result = metric.summarize_result(metric(dataset, dataset_name="test_dataset")) verify_results_table(db_session, result) - verify_dataset_table(db_session, "test_dataset", num_rows=dataset.shape[0], num_cols=dataset.shape[1]) + verify_dataset_table( + db_session, "test_dataset", num_rows=dataset.shape[0], num_cols=dataset.shape[1] + ) verify_metric_table(db_session, "mean_map", "DataFrameMetric") verify_version_table(db_session, "Unversioned") @@ -156,14 +158,18 @@ def test_dataframe_queries_default_params(db_session, dataset): def test_dataframe_queries_modified_params(db_session, dataset): metric = metrics.OneColumnMap(metrics.Mean(upload_to_database=False)) metric(dataset, dataset_name="test_dataset") - verify_dataset_table(db_session, "test_dataset", num_rows=dataset.shape[0], num_cols=dataset.shape[1]) + verify_dataset_table( + db_session, "test_dataset", num_rows=dataset.shape[0], num_cols=dataset.shape[1] + ) def test_two_dataframe_queries_default_params(db_session, dataset): metric = metrics.TwoColumnMap(metrics.KullbackLeiblerDivergence(upload_to_database=False)) result = metric.summarize_result(metric(dataset, dataset, dataset_name="test_dataset")) verify_results_table(db_session, result) - verify_dataset_table(db_session, "test_dataset", num_rows=dataset.shape[0], num_cols=dataset.shape[1]) + verify_dataset_table( + db_session, "test_dataset", num_rows=dataset.shape[0], num_cols=dataset.shape[1] + ) verify_metric_table(db_session, "kullback_leibler_divergence_map", "TwoDataFrameMetrics") verify_version_table(db_session, "Unversioned") @@ -171,4 +177,6 @@ def 
test_two_dataframe_queries_default_params(db_session, dataset): def test_two_dataframe_queries_modified_params(db_session, dataset): metric = metrics.TwoColumnMap(metrics.KullbackLeiblerDivergence(upload_to_database=False)) metric(dataset, dataset, dataset_name="test_dataset") - verify_dataset_table(db_session, "test_dataset", num_rows=dataset.shape[0], num_cols=dataset.shape[1]) + verify_dataset_table( + db_session, "test_dataset", num_rows=dataset.shape[0], num_cols=dataset.shape[1] + ) diff --git a/tests/test_metrics/test_check.py b/tests/test_metrics/test_check.py index ad3c1a2a..8c2bad2a 100644 --- a/tests/test_metrics/test_check.py +++ b/tests/test_metrics/test_check.py @@ -7,17 +7,25 @@ from insight import ColumnCheck -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def df(): - df = pd.DataFrame({ - 'string_col': np.random.choice(['A', 'B', 'C', 'D', 'E'], size=1000), - 'bool_col': np.random.choice([False, True], size=1000).astype('?'), - 'date_col': pd.to_datetime(18_000 + np.random.normal(500, 50, size=1000).astype(int), unit='D'), - 'int_col': np.random.randint(10, 100000, 1000), - 'float_col': np.random.normal(0.0, 1.0, size=1000), - 'int_bool_col': np.random.choice([0, 1], size=1000), - 'ordered_cat_col': pd.Categorical(np.random.choice(["b", "d", "c"], size=1000), categories=["b", "c", "d"], ordered=True) - }) + df = pd.DataFrame( + { + "string_col": np.random.choice(["A", "B", "C", "D", "E"], size=1000), + "bool_col": np.random.choice([False, True], size=1000).astype("?"), + "date_col": pd.to_datetime( + 18_000 + np.random.normal(500, 50, size=1000).astype(int), unit="D" + ), + "int_col": np.random.randint(10, 100000, 1000), + "float_col": np.random.normal(0.0, 1.0, size=1000), + "int_bool_col": np.random.choice([0, 1], size=1000), + "ordered_cat_col": pd.Categorical( + np.random.choice(["b", "d", "c"], size=1000), + categories=["b", "c", "d"], + ordered=True, + ), + } + ) return df @@ -51,10 +59,12 @@ def verify_columns_types(df, categorical_cols, continuous_cols, affine_cols, ord def test_column_check(df): - categorical_cols = set(['string_col', 'bool_col', 'int_bool_col', 'ordered_cat_col']) - continuous_cols = set(['int_col', 'float_col']) - affine_cols = set(['date_col', 'int_col', 'float_col']) - ordinal_cols = set(['ordered_cat_col', 'date_col', 'float_col', 'int_col', 'int_bool_col', 'bool_col']) + categorical_cols = set(["string_col", "bool_col", "int_bool_col", "ordered_cat_col"]) + continuous_cols = set(["int_col", "float_col"]) + affine_cols = set(["date_col", "int_col", "float_col"]) + ordinal_cols = set( + ["ordered_cat_col", "date_col", "float_col", "int_col", "int_bool_col", "bool_col"] + ) verify_columns_types(df.copy(), categorical_cols, continuous_cols, affine_cols, ordinal_cols) @@ -78,5 +88,11 @@ def test_check_ordinal(): sr = pd.Series([3 for _ in range(100)]) assert check.ordinal(sr) is True - sr = pd.Series(pd.Categorical(np.random.choice(["t2", "t1", "t0"], size=100), categories=["t0", "t1", "t2"], ordered=True)) + sr = pd.Series( + pd.Categorical( + np.random.choice(["t2", "t1", "t0"], size=100), + categories=["t0", "t1", "t2"], + ordered=True, + ) + ) assert check.ordinal(sr) is True diff --git a/tests/test_metrics/test_confidence_interval.py b/tests/test_metrics/test_confidence_interval.py index 63a81f85..13c0dbf9 100644 --- a/tests/test_metrics/test_confidence_interval.py +++ b/tests/test_metrics/test_confidence_interval.py @@ -2,14 +2,19 @@ import pytest from insight.metrics import EarthMoversDistance, HellingerDistance -from 
insight.metrics.confidence_interval import compute_binomial_interval, compute_bootstrap_interval +from insight.metrics.confidence_interval import ( + compute_binomial_interval, + compute_bootstrap_interval, +) @pytest.mark.parametrize( - 'metric, data', [ - (EarthMoversDistance(), (pd.Series(['a', 'b', 'c']), pd.Series(['c', 'b', 'a']))), - (HellingerDistance(), (pd.Series([1, 2, 3]), pd.Series([0, 0, 0]))) - ]) + "metric, data", + [ + (EarthMoversDistance(), (pd.Series(["a", "b", "c"]), pd.Series(["c", "b", "a"]))), + (HellingerDistance(), (pd.Series([1, 2, 3]), pd.Series([0, 0, 0]))), + ], +) def test_bootstrap_interval(metric, data): conf_interval = compute_bootstrap_interval(metric, data[0], data[0]) assert conf_interval.level == 0.95 @@ -21,11 +26,15 @@ def test_bootstrap_interval(metric, data): def test_binomial_interval(): - conf_interval = compute_binomial_interval(pd.Series([1, 1]), pd.Series([0, 0]), confidence_level=0.99) + conf_interval = compute_binomial_interval( + pd.Series([1, 1]), pd.Series([0, 0]), confidence_level=0.99 + ) assert conf_interval.level == 0.99 assert conf_interval.limits[0] is not None and conf_interval.limits[1] is not None - conf_interval = compute_binomial_interval(pd.Series([1, 0]), pd.Series([1, 0]), confidence_level=0.80) + conf_interval = compute_binomial_interval( + pd.Series([1, 0]), pd.Series([1, 0]), confidence_level=0.80 + ) assert conf_interval.level == 0.80 assert conf_interval.limits[0] is not None and conf_interval.limits[1] is not None diff --git a/tests/test_metrics/test_metrics.py b/tests/test_metrics/test_metrics.py index 47d1cba3..70fccfa1 100644 --- a/tests/test_metrics/test_metrics.py +++ b/tests/test_metrics/test_metrics.py @@ -34,13 +34,15 @@ total_variation_distance = TotalVariationDistance() -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def df(): - df = pd.read_csv("https://raw.githubusercontent.com/synthesized-io/datasets/master/tabular/biased/compas.csv") + df = pd.read_csv( + "https://raw.githubusercontent.com/synthesized-io/datasets/master/tabular/biased/compas.csv" + ) return df -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def group1(df): pred1 = df["Ethnicity"] == "Caucasian" target_attr = "RawScore" @@ -48,7 +50,7 @@ def group1(df): return group1 -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def group2(df): pred2 = df["Ethnicity"] == "African-American" target_attr = "RawScore" @@ -56,9 +58,9 @@ def group2(df): return group2 -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def group3(group2): - group3 = group2.sort_values()[len(group2) // 2:] + group3 = group2.sort_values()[len(group2) // 2 :] return group3 @@ -73,15 +75,17 @@ def data2(): def test_mean(): - sr_a = pd.Series(np.arange(100), name='a') + sr_a = pd.Series(np.arange(100), name="a") val_a = mean(sr=sr_a) assert val_a == 49.5 - sr_b = pd.Series(np.datetime64('2020-01-01') + np.arange(0, 3, step=1).astype('m8[D]'), name='b') + sr_b = pd.Series( + np.datetime64("2020-01-01") + np.arange(0, 3, step=1).astype("m8[D]"), name="b" + ) val_b = mean(sr=sr_b) - assert val_b == np.datetime64('2020-01-02') + assert val_b == np.datetime64("2020-01-02") - sr_c = pd.Series(['a', 'b', 'c', 'd'], name='c') + sr_c = pd.Series(["a", "b", "c", "d"], name="c") val_c = mean(sr=sr_c) assert val_c is None @@ -91,73 +95,73 @@ def test_base_to_dict(): Tests the basic variation of _Metric.to_dict(). 
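    Every serialized form carries at least a "name" key identifying the
    metric, e.g. mean.to_dict()["name"] == "mean"; the assertions below
    check that key for each of the metrics under test.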
""" dict_mean = mean.to_dict() - assert dict_mean['name'] == 'mean' + assert dict_mean["name"] == "mean" dict_kendalltau = kendall_tau.to_dict() - assert dict_kendalltau['name'] == 'kendall_tau_correlation' + assert dict_kendalltau["name"] == "kendall_tau_correlation" dict_cramers_v = cramers_v.to_dict() - assert dict_cramers_v['name'] == 'cramers_v' + assert dict_cramers_v["name"] == "cramers_v" dict_emd = emd.to_dict() - assert dict_emd['name'] == 'earth_movers_distance' + assert dict_emd["name"] == "earth_movers_distance" dict_kl_divergence = kl_divergence.to_dict() - assert dict_kl_divergence['name'] == 'kullback_leibler_divergence' + assert dict_kl_divergence["name"] == "kullback_leibler_divergence" dict_js_divergence = js_divergence.to_dict() - assert dict_js_divergence['name'] == 'jensen_shannon_divergence' + assert dict_js_divergence["name"] == "jensen_shannon_divergence" dict_hellinger_distance = hellinger_distance.to_dict() - assert dict_hellinger_distance['name'] == 'hellinger_distance' + assert dict_hellinger_distance["name"] == "hellinger_distance" dict_bc_coef = bhattacharyya_coefficient.to_dict() - assert dict_bc_coef['name'] == 'bhattacharyya_coefficient' + assert dict_bc_coef["name"] == "bhattacharyya_coefficient" def test_base_from_dict(): """ Tests the basic variation of _Metric.from_dict. """ - dict_mean = {'name': 'mean'} + dict_mean = {"name": "mean"} new_mean = Mean.from_dict(dict_mean) assert isinstance(new_mean, Mean) - dict_kendall_tau = {'name': 'kendall_tau_correlation'} + dict_kendall_tau = {"name": "kendall_tau_correlation"} new_kendall_tau = KendallTauCorrelation.from_dict(dict_kendall_tau) assert isinstance(new_kendall_tau, KendallTauCorrelation) - dict_cramers_v = {'name': 'cramers_v'} + dict_cramers_v = {"name": "cramers_v"} new_cramers_v = CramersV.from_dict(dict_cramers_v) assert isinstance(new_cramers_v, CramersV) - dict_emd = {'name': 'earth_movers_distance'} + dict_emd = {"name": "earth_movers_distance"} new_emd = EarthMoversDistance.from_dict(dict_emd) assert isinstance(new_emd, EarthMoversDistance) - dict_kl_divergence = {'name': 'kullback_leibler_divergence'} + dict_kl_divergence = {"name": "kullback_leibler_divergence"} new_kl_divergence = KullbackLeiblerDivergence.from_dict(dict_kl_divergence) assert isinstance(new_kl_divergence, KullbackLeiblerDivergence) - dict_js_divergence = {'name': 'jensen_shannon_divergence'} + dict_js_divergence = {"name": "jensen_shannon_divergence"} new_js_divergence = JensenShannonDivergence.from_dict(dict_js_divergence) assert isinstance(new_js_divergence, JensenShannonDivergence) - dict_hellinger_distance = {'name': 'hellinger_distance'} + dict_hellinger_distance = {"name": "hellinger_distance"} new_hellinger_distance = HellingerDistance.from_dict(dict_hellinger_distance) assert isinstance(new_hellinger_distance, HellingerDistance) - dict_bc_coef = {'name': 'bhattacharyya_coefficient'} + dict_bc_coef = {"name": "bhattacharyya_coefficient"} new_bc_coef = BhattacharyyaCoefficient.from_dict(dict_bc_coef) assert isinstance(new_bc_coef, BhattacharyyaCoefficient) def test_from_dict_different_class(): - dict_mean = {'name': 'mean'} + dict_mean = {"name": "mean"} new_mean = BhattacharyyaCoefficient.from_dict(dict_mean) assert isinstance(new_mean, Mean) - dict_norm = {'name': 'norm', 'ord': 1} + dict_norm = {"name": "norm", "ord": 1} new_norm = HellingerDistance.from_dict(dict_norm) assert isinstance(new_norm, Norm) @@ -165,15 +169,17 @@ def test_from_dict_different_class(): def test_standard_deviation(): - sr_a = 
pd.Series(np.random.normal(0, 1, 100), name='a') + sr_a = pd.Series(np.random.normal(0, 1, 100), name="a") val_a = std_dev(sr=sr_a) assert val_a is not None - sr_b = pd.Series(np.datetime64('2020-01-01') + np.arange(0, 20, step=1).astype('m8[D]'), name='b') + sr_b = pd.Series( + np.datetime64("2020-01-01") + np.arange(0, 20, step=1).astype("m8[D]"), name="b" + ) val_b = std_dev(sr=sr_b) assert val_b is not None - sr_c = pd.Series(['a', 'b', 'c', 'd'], name='c') + sr_c = pd.Series(["a", "b", "c", "d"], name="c") val_c = std_dev(sr=sr_c) assert val_c is None @@ -183,25 +189,29 @@ def test_standard_deviation_to_dict(): Tests the to_dict method that is specific to StandardDeviation metric. """ dict_std_dev = std_dev.to_dict() - assert dict_std_dev['name'] == 'standard_deviation' - assert np.isclose(dict_std_dev['remove_outliers'], 0) + assert dict_std_dev["name"] == "standard_deviation" + assert np.isclose(dict_std_dev["remove_outliers"], 0) def test_em_distance(): - sr_a = pd.Series(np.random.normal(0, 1, 100), name='a') - sr_b = pd.Series(['a', 'b', 'c', 'd'], name='b') + sr_a = pd.Series(np.random.normal(0, 1, 100), name="a") + sr_b = pd.Series(["a", "b", "c", "d"], name="b") assert emd(sr_a, sr_a) is None assert emd(sr_b, sr_b) is not None def test_kt_correlation(): - sr_a = pd.Series(np.random.normal(0, 1, 100), name='a') - sr_b = pd.Series(np.random.normal(0, 1, 5), name='b') - sr_c = pd.Series(sr_b.values + np.random.normal(0, 0.8, 5), name='c') - sr_d = pd.Series(['a', 'b', 'c', 'd'], name='d') - sr_e = pd.Series(list("abbccc"), dtype=pd.CategoricalDtype(categories=list("abc"), ordered=True)) - sr_f = pd.Series(list("feeddd"), dtype=pd.CategoricalDtype(categories=list("fed"), ordered=True)) + sr_a = pd.Series(np.random.normal(0, 1, 100), name="a") + sr_b = pd.Series(np.random.normal(0, 1, 5), name="b") + sr_c = pd.Series(sr_b.values + np.random.normal(0, 0.8, 5), name="c") + sr_d = pd.Series(["a", "b", "c", "d"], name="d") + sr_e = pd.Series( + list("abbccc"), dtype=pd.CategoricalDtype(categories=list("abc"), ordered=True) + ) + sr_f = pd.Series( + list("feeddd"), dtype=pd.CategoricalDtype(categories=list("fed"), ordered=True) + ) kt_corr = KendallTauCorrelation() @@ -212,13 +222,15 @@ def test_kt_correlation(): def test_cramers_v_basic(): - sr_a = pd.Series([1, 2, 3, 1, 2, 3, 1, 2, 3] * 100, name='a') - sr_b = pd.Series([1, 2, 3, 2, 3, 1, 3, 1, 2] * 100, name='b') + sr_a = pd.Series([1, 2, 3, 1, 2, 3, 1, 2, 3] * 100, name="a") + sr_b = pd.Series([1, 2, 3, 2, 3, 1, 3, 1, 2] * 100, name="b") - assert cramers_v(sr_a, sr_a) > 0.99 # This metric -> cramers_v for large N (makes it more robust to outliers) - assert cramers_v(sr_a, sr_b) == 0. 
+    assert (
+        cramers_v(sr_a, sr_a) > 0.99
+    )  # This metric -> cramers_v for large N (makes it more robust to outliers)
+    assert cramers_v(sr_a, sr_b) == 0.0

-    sr_c = pd.Series(np.random.normal(0, 1, 1000), name='c')
+    sr_c = pd.Series(np.random.normal(0, 1, 1000), name="c")
     assert cramers_v(sr_c, sr_c) is None
@@ -233,18 +245,18 @@
             categorical_cols.append(col)

     for col_grp in combinations(categorical_cols, 2):
-        assert (cramers_v(df[col_grp[0]], df[col_grp[1]]) is not None)
+        assert cramers_v(df[col_grp[0]], df[col_grp[1]]) is not None


 def test_repr():
     metric = EarthMoversDistance()
-    assert repr(metric) == 'EarthMoversDistance()'
+    assert repr(metric) == "EarthMoversDistance()"


 def test_str():
     metric = EarthMoversDistance()
-    metric.name = 'earth_movers_distance'
-    assert str(metric) == 'earth_movers_distance'
+    metric.name = "earth_movers_distance"
+    assert str(metric) == "earth_movers_distance"


 def test_kl_divergence(group1):
@@ -255,7 +267,6 @@ def test_kl_divergence(group1):


 def test_kl_divergence_with_custom_check():
-
     class CustomCheck(ColumnCheck):
         def infer_dtype(self, sr: pd.Series) -> pd.Series:
             col = sr.copy()
@@ -296,8 +307,8 @@ def test_norm_to_dict_ord_one():
     parameter.
     """
     dict_norm_one = norm_ord1.to_dict()
-    assert dict_norm_one['name'] == 'norm'
-    assert np.isclose(dict_norm_one['ord'], 1)
+    assert dict_norm_one["name"] == "norm"
+    assert np.isclose(dict_norm_one["ord"], 1)


 def test_norm_to_dict_ord_default():
@@ -306,12 +317,12 @@ def test_norm_to_dict_ord_default():
     parameter.
     """
     dict_norm_two = norm.to_dict()
-    assert dict_norm_two['name'] == 'norm'
-    assert np.isclose(dict_norm_two['ord'], 2)
+    assert dict_norm_two["name"] == "norm"
+    assert np.isclose(dict_norm_two["ord"], 2)


 def test_norm_from_dict_ord_one():
-    dict_norm_one = {'name': 'norm', 'ord': 1}
+    dict_norm_one = {"name": "norm", "ord": 1}
     new_norm_one = Norm.from_dict(dict_norm_one)

     assert isinstance(new_norm_one, Norm)
@@ -319,7 +330,7 @@ def test_norm_from_dict_ord_one():


 def test_norm_from_dict_ord_default():
-    dict_norm = {'name': 'norm'}
+    dict_norm = {"name": "norm"}
     new_norm = Norm.from_dict(dict_norm)

     assert isinstance(new_norm, Norm)
@@ -362,8 +373,8 @@ def test_emd_distance_binned_no_bins_to_dict():
     """
     emdb = EarthMoversDistanceBinned()
     dict_emdb = emdb.to_dict()
-    assert dict_emdb['name'] == 'earth_movers_distance_binned'
-    assert dict_emdb['bin_edges'] is None
+    assert dict_emdb["name"] == "earth_movers_distance_binned"
+    assert dict_emdb["bin_edges"] is None


 def test_emd_distance_binned_no_bins_from_dict():
     """
     Tests that the binned earth mover's distance can successfully be built from a
     dictionary when no bin edges are specified.
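    For example, from_dict({"name": "earth_movers_distance_binned"}) should
    yield a usable instance with no bin edges configured.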
""" - dict_emdb = {'name': 'earth_movers_distance_binned'} + dict_emdb = {"name": "earth_movers_distance_binned"} new_emdb = EarthMoversDistanceBinned.from_dict(dict_emdb) assert isinstance(new_emdb, EarthMoversDistanceBinned) @@ -387,8 +398,8 @@ def test_emd_distance_binned_to_dict(): emdb = EarthMoversDistanceBinned(bin_edges=bin_edges) dict_emdb = emdb.to_dict() - assert dict_emdb['name'] == 'earth_movers_distance_binned' - assert np.allclose(dict_emdb['bin_edges'], bin_edges) + assert dict_emdb["name"] == "earth_movers_distance_binned" + assert np.allclose(dict_emdb["bin_edges"], bin_edges) def test_emd_distance_binned_from_dict(): @@ -398,7 +409,7 @@ def test_emd_distance_binned_from_dict(): """ a = pd.Series(np.random.normal(loc=10, scale=1.0, size=10000)) bin_edges = np.histogram_bin_edges(a, bins=100) - dict_emdb = {'name': 'earth_movers_distance_binned', 'bin_edges': bin_edges} + dict_emdb = {"name": "earth_movers_distance_binned", "bin_edges": bin_edges} new_emdb = EarthMoversDistanceBinned.from_dict(dict_emdb) assert isinstance(new_emdb, EarthMoversDistanceBinned) @@ -432,7 +443,9 @@ def test_bhattacharyya_coefficient_hellinger_distance_relation(group1, group2, g """ Tests that the BhattacharyyaCoefficient conforms to its relationship with hellinger_distance. """ - assert np.isclose(bhattacharyya_coefficient(group1, group3), 1 - hellinger_distance(group1, group3) ** 2) + assert np.isclose( + bhattacharyya_coefficient(group1, group3), 1 - hellinger_distance(group1, group3) ** 2 + ) def test_total_variation_distance_complete_overlap(group1, group2, group3): @@ -462,4 +475,6 @@ def test_total_variation_distance_hellinger_inequality_preserved(group1, group3) Tests that the TotalVariation distance preserves its inequality relationship with hellinger distance. 
""" assert total_variation_distance(group1, group3) > hellinger_distance(group1, group3) ** 2 - assert total_variation_distance(group1, group3) < hellinger_distance(group1, group3) * np.sqrt(2) + assert total_variation_distance(group1, group3) < hellinger_distance(group1, group3) * np.sqrt( + 2 + ) diff --git a/tests/test_metrics/test_metrics_usage.py b/tests/test_metrics/test_metrics_usage.py index b3269fdd..ab695c38 100644 --- a/tests/test_metrics/test_metrics_usage.py +++ b/tests/test_metrics/test_metrics_usage.py @@ -6,9 +6,15 @@ from insight.metrics import CorrMatrix, CramersV, DiffCorrMatrix, EarthMoversDistance, TwoColumnMap -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def data(): - df = pd.read_csv('https://raw.githubusercontent.com/synthesized-io/datasets/master/tabular/templates/credit.csv').dropna().reset_index(drop=True) + df = ( + pd.read_csv( + "https://raw.githubusercontent.com/synthesized-io/datasets/master/tabular/templates/credit.csv" + ) + .dropna() + .reset_index(drop=True) + ) categorical_cols = [] continuous_cols = [] @@ -31,11 +37,11 @@ def test_two_column_map(data): col_map = TwoColumnMap(emd) emd_map_df = col_map(df1, df2) - assert col_map.name == f'{str(emd)}_map' + assert col_map.name == f"{str(emd)}_map" - assert set(emd_map_df.columns.to_list()) == set(['metric_val']) - assert all(not np.isnan(emd_map_df['metric_val'][cat]) for cat in categorical_cols) - assert all(np.isnan(emd_map_df['metric_val'][cont]) for cont in continuous_cols) + assert set(emd_map_df.columns.to_list()) == set(["metric_val"]) + assert all(not np.isnan(emd_map_df["metric_val"][cat]) for cat in categorical_cols) + assert all(np.isnan(emd_map_df["metric_val"][cont]) for cont in continuous_cols) def test_metric_matrix(data): @@ -45,16 +51,42 @@ def test_metric_matrix(data): cmv = CramersV() cmt = CorrMatrix(cmv) - assert cmt.name == f'{str(cmv)}_matrix' + assert cmt.name == f"{str(cmv)}_matrix" cmv_val_df = cmt(df) - assert all(np.isnan(cmv_val_df[cont1][cont2]) and np.isnan(cmv_val_df[cont2][cont1]) for cont1 in continuous_cols for cont2 in continuous_cols) - assert all(np.isnan(cmv_val_df[cat][cont]) and np.isnan(cmv_val_df[cont][cat]) for cat in categorical_cols for cont in continuous_cols) - assert all(not np.isnan(cmv_val_df[cat1][cat2]) and not np.isnan(cmv_val_df[cat2][cat1]) for cat1 in categorical_cols for cat2 in categorical_cols if cat1 != cat2) + assert all( + np.isnan(cmv_val_df[cont1][cont2]) and np.isnan(cmv_val_df[cont2][cont1]) + for cont1 in continuous_cols + for cont2 in continuous_cols + ) + assert all( + np.isnan(cmv_val_df[cat][cont]) and np.isnan(cmv_val_df[cont][cat]) + for cat in categorical_cols + for cont in continuous_cols + ) + assert all( + not np.isnan(cmv_val_df[cat1][cat2]) and not np.isnan(cmv_val_df[cat2][cat1]) + for cat1 in categorical_cols + for cat2 in categorical_cols + if cat1 != cat2 + ) cmv_diff_mat = DiffCorrMatrix(cmv) diff = cmv_diff_mat(df1, df2) - assert cmv_diff_mat.name == f'diff_{str(cmv)}' - assert all(np.isnan(diff[cont1][cont2]) and np.isnan(diff[cont2][cont1]) for cont1 in continuous_cols for cont2 in continuous_cols) - assert all(np.isnan(diff[cat][cont]) and np.isnan(diff[cont][cat]) for cat in categorical_cols for cont in continuous_cols) - assert all(not np.isnan(diff[cat1][cat2]) and not np.isnan(diff[cat2][cat1]) for cat1 in categorical_cols for cat2 in categorical_cols if cat1 != cat2) + assert cmv_diff_mat.name == f"diff_{str(cmv)}" + assert all( + np.isnan(diff[cont1][cont2]) and np.isnan(diff[cont2][cont1]) 
+ for cont1 in continuous_cols + for cont2 in continuous_cols + ) + assert all( + np.isnan(diff[cat][cont]) and np.isnan(diff[cont][cat]) + for cat in categorical_cols + for cont in continuous_cols + ) + assert all( + not np.isnan(diff[cat1][cat2]) and not np.isnan(diff[cat2][cat1]) + for cat1 in categorical_cols + for cat2 in categorical_cols + if cat1 != cat2 + ) assert not np.isnan(cmv_diff_mat.summarize_result(diff)) diff --git a/tests/test_metrics/test_plotting.py b/tests/test_metrics/test_plotting.py index 2f6ef939..19e0d637 100644 --- a/tests/test_metrics/test_plotting.py +++ b/tests/test_metrics/test_plotting.py @@ -4,59 +4,61 @@ from insight.plot import categorical, continuous, cross_table, cross_tables, dataset, text_only -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def df(): - df = pd.read_csv('https://raw.githubusercontent.com/synthesized-io/datasets/master/tabular/templates/adult.csv') + df = pd.read_csv( + "https://raw.githubusercontent.com/synthesized-io/datasets/master/tabular/templates/adult.csv" + ) return df -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def df_half_a(df): - df_half_a = df.iloc[:len(df) // 2, :] + df_half_a = df.iloc[: len(df) // 2, :] return df_half_a @pytest.fixture(scope="module") def df_half_b(df): - df_half_b = df.iloc[len(df) // 2:, :] + df_half_b = df.iloc[len(df) // 2 :, :] return df_half_b def test_plot_two_cross_tables_not_equal_not_none(df_half_a, df_half_b): - categories_a = pd.concat((df_half_a['workclass'], df_half_b['workclass'])).unique() - categories_b = pd.concat((df_half_a['marital-status'], df_half_b['marital-status'])).unique() + categories_a = pd.concat((df_half_a["workclass"], df_half_b["workclass"])).unique() + categories_b = pd.concat((df_half_a["marital-status"], df_half_b["marital-status"])).unique() categories_a.sort() categories_b.sort() fig_a = cross_table( pd.crosstab( - pd.Categorical(df_half_a['workclass'], categories_a, ordered=True), - pd.Categorical(df_half_b['marital-status'], categories_b, ordered=True), - dropna=False + pd.Categorical(df_half_a["workclass"], categories_a, ordered=True), + pd.Categorical(df_half_b["marital-status"], categories_b, ordered=True), + dropna=False, ), - title='test' - ); + title="test", + ) assert fig_a is not None fig_b = cross_table( pd.crosstab( - pd.Categorical(df_half_a['workclass'], categories_a, ordered=True), - pd.Categorical(df_half_b['workclass'], categories_a, ordered=True), - dropna=False + pd.Categorical(df_half_a["workclass"], categories_a, ordered=True), + pd.Categorical(df_half_b["workclass"], categories_a, ordered=True), + dropna=False, ), - title='test' - ); + title="test", + ) assert fig_b is not None assert fig_a != fig_b def test_plot_two_pairs_cross_tables_not_equal_not_none(df_half_a, df_half_b): - fig_a = cross_tables(df_half_a, df_half_b, col_a='workclass', col_b='income') + fig_a = cross_tables(df_half_a, df_half_b, col_a="workclass", col_b="income") assert fig_a is not None - fig_b = cross_tables(df_half_a, df_half_b, col_a='marital-status', col_b='income') + fig_b = cross_tables(df_half_a, df_half_b, col_a="marital-status", col_b="income") assert fig_b is not None assert fig_a != fig_b @@ -68,20 +70,20 @@ def test_plot_text_only_not_none(df_half_a, df_half_b): def test_plot_two_categorical_distribution_not_equal_not_none(df_half_a, df_half_b): - fig_a = categorical([df_half_a['workclass'], df_half_b['workclass']]) + fig_a = categorical([df_half_a["workclass"], df_half_b["workclass"]]) assert fig_a is not None - fig_b = 
categorical([df_half_a['workclass']]) + fig_b = categorical([df_half_a["workclass"]]) assert fig_b is not None assert fig_a != fig_b def test_plot_two_continuous_not_equal_not_none(df_half_a, df_half_b): - fig_a = continuous([df_half_a['fnlwgt'], df_half_b['fnlwgt']]) + fig_a = continuous([df_half_a["fnlwgt"], df_half_b["fnlwgt"]]) assert fig_a is not None - fig_b = continuous([df_half_a['fnlwgt']]) + fig_b = continuous([df_half_a["fnlwgt"]]) assert fig_b is not None assert fig_a != fig_b
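
[Editor's note — illustrative usage sketch, not part of the patch. A minimal
example of the metrics-usage API exercised in
tests/test_metrics/test_metrics_usage.py; the single-column frames below are
invented data.]

    import pandas as pd

    from insight.metrics import EarthMoversDistance, TwoColumnMap

    df_orig = pd.DataFrame({"col": ["a", "b", "a", "c"]})
    df_synth = pd.DataFrame({"col": ["b", "b", "a", "c"]})

    # Apply the metric column by column; the result is a DataFrame with a
    # single "metric_val" column indexed by column name.
    col_map = TwoColumnMap(EarthMoversDistance())
    result = col_map(df_orig, df_synth)
    print(result["metric_val"]["col"])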