Refactor typing stubs (#155)

* Added black
* Bumped mypy to 1.5.1
* Updated typing usages
* Added py.typed and declared at build time that the insight package is typed
synthesized-io · Nov 9, 2023 · 259a874 · 259a874
1 parent c97ba2a
commit 259a874
Show file tree

Hide file tree

Showing 24 changed files with 642 additions and 439 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -14,8 +14,16 @@ repos:
       - id: mixed-line-ending
       - id: requirements-txt-fixer
       - id: trailing-whitespace
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+  - repo: https://github.com/psf/black
+    rev: 22.3.0
+    hooks:
+      - id: black
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v0.961
+    rev: v1.5.1
     hooks:
       - id: mypy
         files: src
@@ -26,8 +34,3 @@ repos:
         args: [--install-types, --non-interactive]
         # Note that using the --install-types is problematic if running in
         # parallel as mutating the pre-commit env at runtime breaks cache.
-  - repo: https://github.com/pycqa/isort
-    rev: 5.12.0
-    hooks:
-      - id: isort
-...
diff --git a/pyproject.toml b/pyproject.toml
@@ -90,7 +90,7 @@ dependencies = [
   write_to = "src/insight/version.py"
 
   [tool.setuptools.package-data]
-  insight = ["src/insight/fonts/SourceSansPro-Regular.ttf"]
+  insight = ["src/insight/fonts/SourceSansPro-Regular.ttf", "py.typed"]
 
   [tool.pytest.ini_options]
   junit_suite_name = "unit"
@@ -102,6 +102,8 @@ dependencies = [
 
   [tool.mypy]
   plugins = "numpy.typing.mypy_plugin, sqlalchemy.ext.mypy.plugin"
+  disallow_any_generics = false
+  disable_error_code = "type-arg"
 
   [[tool.mypy.overrides]]
   module = "pandas.*"
@@ -120,9 +122,13 @@ dependencies = [
   ignore_missing_imports = true
 
   [tool.isort]
+  profile = "black"
   multi_line_output = 3
   include_trailing_comma = true
   force_grid_wrap = 0
   use_parentheses = true
   ensure_newline_before_comments = true
-  line_length = 120
+  line_length = 100
+
+  [tool.black]
+  line-length = 100
diff --git a/src/insight/__init__.py b/src/insight/__init__.py
@@ -1,4 +1,4 @@
 from . import metrics, plot
 from .check import ColumnCheck
 
-__all__ = ['ColumnCheck', 'plot', 'metrics']
+__all__ = ["ColumnCheck", "plot", "metrics"]
diff --git a/src/insight/alembic/env.py b/src/insight/alembic/env.py
@@ -63,23 +63,24 @@ def run_migrations_online() -> None:
     if url is None:
         raise ValueError("No sqlalchemy.url specified in config file")
 
-    config.set_main_option("sqlalchemy.url", url.format(
-        POSTGRES_USER=POSTGRES_USER,
-        POSTGRES_PASSWORD=POSTGRES_PASSWORD,
-        POSTGRES_HOST=POSTGRES_HOST,
-        POSTGRES_PORT=POSTGRES_PORT,
-        POSTGRES_DATABASE=POSTGRES_DATABASE
-    ))
+    config.set_main_option(
+        "sqlalchemy.url",
+        url.format(
+            POSTGRES_USER=POSTGRES_USER,
+            POSTGRES_PASSWORD=POSTGRES_PASSWORD,
+            POSTGRES_HOST=POSTGRES_HOST,
+            POSTGRES_PORT=POSTGRES_PORT,
+            POSTGRES_DATABASE=POSTGRES_DATABASE,
+        ),
+    )
     connectable = engine_from_config(
         config.get_section(config.config_ini_section) or {},
         prefix="sqlalchemy.",
         poolclass=pool.NullPool,
     )
 
     with connectable.connect() as connection:
-        context.configure(
-            connection=connection, target_metadata=target_metadata
-        )
+        context.configure(connection=connection, target_metadata=target_metadata)
 
         with context.begin_transaction():
             context.run_migrations()

diff --git a/src/insight/alembic/main.py b/src/insight/alembic/main.py
@@ -7,9 +7,10 @@
 here = os.path.dirname(os.path.abspath(__file__))
 
 args = sys.argv[1:]
-args = args if len(args) > 0 else ["upgrade", "head"] # default
+args = args if len(args) > 0 else ["upgrade", "head"]  # default
 alembic_args = [
-    '-c', os.path.join(here, 'alembic.ini'),
+    "-c",
+    os.path.join(here, "alembic.ini"),
 ] + args
 
 

diff --git a/src/insight/alembic/versions/9aca5ae68ff5_add_initial_tables.py b/src/insight/alembic/versions/9aca5ae68ff5_add_initial_tables.py
@@ -9,54 +9,67 @@
 from alembic import op
 
 # revision identifiers, used by Alembic.
-revision = '9aca5ae68ff5'
+revision = "9aca5ae68ff5"
 down_revision = None
 branch_labels = None
 depends_on = None
 
 
 def upgrade() -> None:
     # ### commands auto generated by Alembic - please adjust! ###
-    op.create_table('dataset',
-    sa.Column('id', sa.INTEGER(), nullable=False),
-    sa.Column('name', sa.VARCHAR(length=200), nullable=False),
-    sa.Column('num_rows', sa.INTEGER(), nullable=True),
-    sa.Column('num_columns', sa.INTEGER(), nullable=True),
-    sa.Column('created_at', sa.TIMESTAMP(), nullable=True),
-    sa.PrimaryKeyConstraint('id')
+    op.create_table(
+        "dataset",
+        sa.Column("id", sa.INTEGER(), nullable=False),
+        sa.Column("name", sa.VARCHAR(length=200), nullable=False),
+        sa.Column("num_rows", sa.INTEGER(), nullable=True),
+        sa.Column("num_columns", sa.INTEGER(), nullable=True),
+        sa.Column("created_at", sa.TIMESTAMP(), nullable=True),
+        sa.PrimaryKeyConstraint("id"),
     )
-    op.create_table('metric',
-    sa.Column('id', sa.INTEGER(), nullable=False),
-    sa.Column('name', sa.VARCHAR(length=100), nullable=False),
-    sa.Column('category', sa.VARCHAR(length=100), nullable=True),
-    sa.Column('created_at', sa.TIMESTAMP(), nullable=True),
-    sa.PrimaryKeyConstraint('id')
+    op.create_table(
+        "metric",
+        sa.Column("id", sa.INTEGER(), nullable=False),
+        sa.Column("name", sa.VARCHAR(length=100), nullable=False),
+        sa.Column("category", sa.VARCHAR(length=100), nullable=True),
+        sa.Column("created_at", sa.TIMESTAMP(), nullable=True),
+        sa.PrimaryKeyConstraint("id"),
     )
-    op.create_table('version',
-    sa.Column('id', sa.INTEGER(), nullable=False),
-    sa.Column('name', sa.VARCHAR(length=50), nullable=True),
-    sa.Column('created_at', sa.TIMESTAMP(), nullable=True),
-    sa.PrimaryKeyConstraint('id')
+    op.create_table(
+        "version",
+        sa.Column("id", sa.INTEGER(), nullable=False),
+        sa.Column("name", sa.VARCHAR(length=50), nullable=True),
+        sa.Column("created_at", sa.TIMESTAMP(), nullable=True),
+        sa.PrimaryKeyConstraint("id"),
     )
-    op.create_table('result',
-    sa.Column('id', sa.INTEGER(), nullable=False),
-    sa.Column('metric_id', sa.INTEGER(), nullable=True),
-    sa.Column('dataset_id', sa.INTEGER(), nullable=True),
-    sa.Column('version_id', sa.INTEGER(), nullable=True),
-    sa.Column('value', sa.FLOAT(), nullable=True),
-    sa.Column('created_at', sa.TIMESTAMP(), nullable=True),
-    sa.ForeignKeyConstraint(['dataset_id'], ['dataset.id'], ),
-    sa.ForeignKeyConstraint(['metric_id'], ['metric.id'], ),
-    sa.ForeignKeyConstraint(['version_id'], ['version.id'], ),
-    sa.PrimaryKeyConstraint('id')
+    op.create_table(
+        "result",
+        sa.Column("id", sa.INTEGER(), nullable=False),
+        sa.Column("metric_id", sa.INTEGER(), nullable=True),
+        sa.Column("dataset_id", sa.INTEGER(), nullable=True),
+        sa.Column("version_id", sa.INTEGER(), nullable=True),
+        sa.Column("value", sa.FLOAT(), nullable=True),
+        sa.Column("created_at", sa.TIMESTAMP(), nullable=True),
+        sa.ForeignKeyConstraint(
+            ["dataset_id"],
+            ["dataset.id"],
+        ),
+        sa.ForeignKeyConstraint(
+            ["metric_id"],
+            ["metric.id"],
+        ),
+        sa.ForeignKeyConstraint(
+            ["version_id"],
+            ["version.id"],
+        ),
+        sa.PrimaryKeyConstraint("id"),
     )
     # ### end Alembic commands ###
 
 
 def downgrade() -> None:
     # ### commands auto generated by Alembic - please adjust! ###
-    op.drop_table('result')
-    op.drop_table('version')
-    op.drop_table('metric')
-    op.drop_table('dataset')
+    op.drop_table("result")
+    op.drop_table("version")
+    op.drop_table("metric")
+    op.drop_table("dataset")
     # ### end Alembic commands ###
diff --git a/src/insight/alembic/versions/d2198fd60b0e_added_result_run_id.py b/src/insight/alembic/versions/d2198fd60b0e_added_result_run_id.py
@@ -9,25 +9,21 @@
 from alembic import op
 
 # revision identifiers, used by Alembic.
-revision = 'd2198fd60b0e'
-down_revision = '9aca5ae68ff5'
+revision = "d2198fd60b0e"
+down_revision = "9aca5ae68ff5"
 branch_labels = None
 depends_on = None
 
 
 def upgrade() -> None:
     # ### commands auto generated by Alembic - please adjust! ###
-    op.add_column('result', sa.Column('run_id', sa.VARCHAR(length=50), nullable=True, default=None))
-    op.alter_column('version', 'name',
-               existing_type=sa.VARCHAR(length=50),
-               nullable=False)
+    op.add_column("result", sa.Column("run_id", sa.VARCHAR(length=50), nullable=True, default=None))
+    op.alter_column("version", "name", existing_type=sa.VARCHAR(length=50), nullable=False)
     # ### end Alembic commands ###
 
 
 def downgrade() -> None:
     # ### commands auto generated by Alembic - please adjust! ###
-    op.alter_column('version', 'name',
-               existing_type=sa.VARCHAR(length=50),
-               nullable=True)
-    op.drop_column('result', 'run_id')
+    op.alter_column("version", "name", existing_type=sa.VARCHAR(length=50), nullable=True)
+    op.drop_column("result", "run_id")
     # ### end Alembic commands ###
diff --git a/src/insight/check.py b/src/insight/check.py
@@ -1,5 +1,5 @@
+import typing as ty
 from abc import ABC, abstractmethod
-from typing import cast
 
 import numpy as np
 import pandas as pd
@@ -27,13 +27,13 @@ def infer_dtype(self, sr: pd.Series) -> pd.Series:
         n_empty_str = 0 if not pd.api.types.is_string_dtype(col) else sr.eq("").sum()
 
         # Try to convert it to numeric
-        if col.dtype.kind not in ("i", "u", "f") and col.dtype.kind != 'M':
+        if col.dtype.kind not in ("i", "u", "f") and col.dtype.kind != "M":
             col_num = pd.to_numeric(col, errors="coerce")
             if col_num.isna().sum() == n_nans + n_empty_str:
                 col = col_num
 
         # Try to convert it to date
-        if col.dtype.kind == "O" or col.dtype.kind == 'M':
+        if col.dtype.kind == "O" or col.dtype.kind == "M":
             try:
                 col_date = pd.to_datetime(col, errors="coerce")
             except TypeError:
@@ -48,7 +48,7 @@ def infer_dtype(self, sr: pd.Series) -> pd.Series:
         elif out_dtype in ("i", "u", "f", "f8", "i8", "u8"):
             return pd.to_numeric(col, errors="coerce")
 
-        return cast(pd.Series, col.astype(out_dtype, errors="ignore"))
+        return ty.cast(pd.Series, col.astype(out_dtype, errors="ignore"))  # type: ignore
 
     @abstractmethod
     def continuous(self, sr: pd.Series):
@@ -83,9 +83,8 @@ class ColumnCheck(Check):
             Categorical threshold log multiplier.
             Default value is 2.5.
     """
-    def __init__(self,
-                 min_num_unique: int = 10,
-                 ctl_mult: float = 2.5):
+
+    def __init__(self, min_num_unique: int = 10, ctl_mult: float = 2.5):
         self.min_num_unique = min_num_unique
         self.ctl_mult = ctl_mult
 
@@ -95,9 +94,9 @@ def continuous(self, sr: pd.Series) -> bool:
         """
         sr = self.infer_dtype(sr)
         sr_dtype = str(sr.dtype)
-        if len(sr.unique()) >= max(self.min_num_unique,
-                                   self.ctl_mult * np.log(len(sr)))\
-           and sr_dtype in ("float64", "int64"):
+        if len(sr.unique()) >= max(
+            self.min_num_unique, self.ctl_mult * np.log(len(sr))
+        ) and sr_dtype in ("float64", "int64"):
             return True
         return False
 
@@ -106,7 +105,9 @@ def categorical(self, sr: pd.Series) -> bool:
         if pd.api.types.is_categorical_dtype(sr) is True:
             return True
         sr = self.infer_dtype(sr)
-        if sr.dtype.kind == "M":  # TODO: Need to implement ability to deal with dates well in metrics
+        if (
+            sr.dtype.kind == "M"
+        ):  # TODO: Need to implement ability to deal with dates well in metrics
             return False
 
         if not self.continuous(sr):
@@ -119,14 +120,13 @@ def ordinal(self, sr: pd.Series) -> bool:
         Columns which are categorical in nature, but contain numbers/dates/bool
         are ordinal too. E.g. [2, 1, 1, 2, 7, 2, 1, 7]
         """
-        if (pd.api.types.is_categorical_dtype(sr) is True
-            and sr.cat.ordered is True)\
-           or pd.api.types.is_bool_dtype(sr) is True:
+        if (
+            pd.api.types.is_categorical_dtype(sr) is True and sr.cat.ordered is True
+        ) or pd.api.types.is_bool_dtype(sr) is True:
             return True
 
         sr_inferred = self.infer_dtype(sr)
-        if sr_inferred.dtype in ("float64", "int64")\
-           or sr_inferred.dtype.kind in ("M", "m"):
+        if sr_inferred.dtype in ("float64", "int64") or sr_inferred.dtype.kind in ("M", "m"):
             return True
         return False
 
@@ -135,7 +135,6 @@ def affine(self, sr: pd.Series) -> bool:
         Continuous columns along with the columns of type DateTime
         and the Timedelta are affine
         """
-        if self.continuous(sr) is True\
-           or self.infer_dtype(sr).dtype.kind in ("M", "m"):
+        if self.continuous(sr) is True or self.infer_dtype(sr).dtype.kind in ("M", "m"):
             return True
         return False
diff --git a/src/insight/database/schema.py b/src/insight/database/schema.py
@@ -5,6 +5,7 @@
 Base = declarative_base()
 mapped_column = Column
 
+
 class Dataset(Base):
     __tablename__ = "dataset"