Skip to content

Commit

Permalink
Refactor typing stubs (#155)
Browse files Browse the repository at this point in the history
* added black

* bumped mypy to 1.5.1

* updated typing usages

* added py.typed and specified that insight pkg is typed in build-time
  • Loading branch information
marqueewinq authored Nov 9, 2023
1 parent c97ba2a commit 259a874
Show file tree
Hide file tree
Showing 24 changed files with 642 additions and 439 deletions.
15 changes: 9 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,16 @@ repos:
- id: mixed-line-ending
- id: requirements-txt-fixer
- id: trailing-whitespace
- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
- repo: https://github.com/psf/black
rev: 22.3.0
hooks:
- id: black
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.961
rev: v1.5.1
hooks:
- id: mypy
files: src
Expand All @@ -26,8 +34,3 @@ repos:
args: [--install-types, --non-interactive]
# Note that using --install-types is problematic when running in
# parallel, as mutating the pre-commit env at runtime breaks the cache.
- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
...
10 changes: 8 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ dependencies = [
write_to = "src/insight/version.py"

[tool.setuptools.package-data]
insight = ["src/insight/fonts/SourceSansPro-Regular.ttf"]
insight = ["src/insight/fonts/SourceSansPro-Regular.ttf", "py.typed"]

[tool.pytest.ini_options]
junit_suite_name = "unit"
Expand All @@ -102,6 +102,8 @@ dependencies = [

[tool.mypy]
plugins = "numpy.typing.mypy_plugin, sqlalchemy.ext.mypy.plugin"
disallow_any_generics = false
disable_error_code = "type-arg"

[[tool.mypy.overrides]]
module = "pandas.*"
Expand All @@ -120,9 +122,13 @@ dependencies = [
ignore_missing_imports = true

[tool.isort]
profile = "black"
multi_line_output = 3
include_trailing_comma = true
force_grid_wrap = 0
use_parentheses = true
ensure_newline_before_comments = true
line_length = 120
line_length = 100

[tool.black]
line-length = 100
2 changes: 1 addition & 1 deletion src/insight/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from . import metrics, plot
from .check import ColumnCheck

__all__ = ['ColumnCheck', 'plot', 'metrics']
__all__ = ["ColumnCheck", "plot", "metrics"]
21 changes: 11 additions & 10 deletions src/insight/alembic/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,23 +63,24 @@ def run_migrations_online() -> None:
if url is None:
raise ValueError("No sqlalchemy.url specified in config file")

config.set_main_option("sqlalchemy.url", url.format(
POSTGRES_USER=POSTGRES_USER,
POSTGRES_PASSWORD=POSTGRES_PASSWORD,
POSTGRES_HOST=POSTGRES_HOST,
POSTGRES_PORT=POSTGRES_PORT,
POSTGRES_DATABASE=POSTGRES_DATABASE
))
config.set_main_option(
"sqlalchemy.url",
url.format(
POSTGRES_USER=POSTGRES_USER,
POSTGRES_PASSWORD=POSTGRES_PASSWORD,
POSTGRES_HOST=POSTGRES_HOST,
POSTGRES_PORT=POSTGRES_PORT,
POSTGRES_DATABASE=POSTGRES_DATABASE,
),
)
connectable = engine_from_config(
config.get_section(config.config_ini_section) or {},
prefix="sqlalchemy.",
poolclass=pool.NullPool,
)

with connectable.connect() as connection:
context.configure(
connection=connection, target_metadata=target_metadata
)
context.configure(connection=connection, target_metadata=target_metadata)

with context.begin_transaction():
context.run_migrations()
Expand Down
5 changes: 3 additions & 2 deletions src/insight/alembic/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
here = os.path.dirname(os.path.abspath(__file__))

args = sys.argv[1:]
args = args if len(args) > 0 else ["upgrade", "head"] # default
args = args if len(args) > 0 else ["upgrade", "head"] # default
alembic_args = [
'-c', os.path.join(here, 'alembic.ini'),
"-c",
os.path.join(here, "alembic.ini"),
] + args


Expand Down
81 changes: 47 additions & 34 deletions src/insight/alembic/versions/9aca5ae68ff5_add_initial_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,54 +9,67 @@
from alembic import op

# revision identifiers, used by Alembic.
revision = '9aca5ae68ff5'
revision = "9aca5ae68ff5"
down_revision = None
branch_labels = None
depends_on = None


def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.create_table('dataset',
sa.Column('id', sa.INTEGER(), nullable=False),
sa.Column('name', sa.VARCHAR(length=200), nullable=False),
sa.Column('num_rows', sa.INTEGER(), nullable=True),
sa.Column('num_columns', sa.INTEGER(), nullable=True),
sa.Column('created_at', sa.TIMESTAMP(), nullable=True),
sa.PrimaryKeyConstraint('id')
op.create_table(
"dataset",
sa.Column("id", sa.INTEGER(), nullable=False),
sa.Column("name", sa.VARCHAR(length=200), nullable=False),
sa.Column("num_rows", sa.INTEGER(), nullable=True),
sa.Column("num_columns", sa.INTEGER(), nullable=True),
sa.Column("created_at", sa.TIMESTAMP(), nullable=True),
sa.PrimaryKeyConstraint("id"),
)
op.create_table('metric',
sa.Column('id', sa.INTEGER(), nullable=False),
sa.Column('name', sa.VARCHAR(length=100), nullable=False),
sa.Column('category', sa.VARCHAR(length=100), nullable=True),
sa.Column('created_at', sa.TIMESTAMP(), nullable=True),
sa.PrimaryKeyConstraint('id')
op.create_table(
"metric",
sa.Column("id", sa.INTEGER(), nullable=False),
sa.Column("name", sa.VARCHAR(length=100), nullable=False),
sa.Column("category", sa.VARCHAR(length=100), nullable=True),
sa.Column("created_at", sa.TIMESTAMP(), nullable=True),
sa.PrimaryKeyConstraint("id"),
)
op.create_table('version',
sa.Column('id', sa.INTEGER(), nullable=False),
sa.Column('name', sa.VARCHAR(length=50), nullable=True),
sa.Column('created_at', sa.TIMESTAMP(), nullable=True),
sa.PrimaryKeyConstraint('id')
op.create_table(
"version",
sa.Column("id", sa.INTEGER(), nullable=False),
sa.Column("name", sa.VARCHAR(length=50), nullable=True),
sa.Column("created_at", sa.TIMESTAMP(), nullable=True),
sa.PrimaryKeyConstraint("id"),
)
op.create_table('result',
sa.Column('id', sa.INTEGER(), nullable=False),
sa.Column('metric_id', sa.INTEGER(), nullable=True),
sa.Column('dataset_id', sa.INTEGER(), nullable=True),
sa.Column('version_id', sa.INTEGER(), nullable=True),
sa.Column('value', sa.FLOAT(), nullable=True),
sa.Column('created_at', sa.TIMESTAMP(), nullable=True),
sa.ForeignKeyConstraint(['dataset_id'], ['dataset.id'], ),
sa.ForeignKeyConstraint(['metric_id'], ['metric.id'], ),
sa.ForeignKeyConstraint(['version_id'], ['version.id'], ),
sa.PrimaryKeyConstraint('id')
op.create_table(
"result",
sa.Column("id", sa.INTEGER(), nullable=False),
sa.Column("metric_id", sa.INTEGER(), nullable=True),
sa.Column("dataset_id", sa.INTEGER(), nullable=True),
sa.Column("version_id", sa.INTEGER(), nullable=True),
sa.Column("value", sa.FLOAT(), nullable=True),
sa.Column("created_at", sa.TIMESTAMP(), nullable=True),
sa.ForeignKeyConstraint(
["dataset_id"],
["dataset.id"],
),
sa.ForeignKeyConstraint(
["metric_id"],
["metric.id"],
),
sa.ForeignKeyConstraint(
["version_id"],
["version.id"],
),
sa.PrimaryKeyConstraint("id"),
)
# ### end Alembic commands ###


def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.drop_table('result')
op.drop_table('version')
op.drop_table('metric')
op.drop_table('dataset')
op.drop_table("result")
op.drop_table("version")
op.drop_table("metric")
op.drop_table("dataset")
# ### end Alembic commands ###
16 changes: 6 additions & 10 deletions src/insight/alembic/versions/d2198fd60b0e_added_result_run_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,25 +9,21 @@
from alembic import op

# revision identifiers, used by Alembic.
revision = 'd2198fd60b0e'
down_revision = '9aca5ae68ff5'
revision = "d2198fd60b0e"
down_revision = "9aca5ae68ff5"
branch_labels = None
depends_on = None


def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.add_column('result', sa.Column('run_id', sa.VARCHAR(length=50), nullable=True, default=None))
op.alter_column('version', 'name',
existing_type=sa.VARCHAR(length=50),
nullable=False)
op.add_column("result", sa.Column("run_id", sa.VARCHAR(length=50), nullable=True, default=None))
op.alter_column("version", "name", existing_type=sa.VARCHAR(length=50), nullable=False)
# ### end Alembic commands ###


def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.alter_column('version', 'name',
existing_type=sa.VARCHAR(length=50),
nullable=True)
op.drop_column('result', 'run_id')
op.alter_column("version", "name", existing_type=sa.VARCHAR(length=50), nullable=True)
op.drop_column("result", "run_id")
# ### end Alembic commands ###
35 changes: 17 additions & 18 deletions src/insight/check.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import typing as ty
from abc import ABC, abstractmethod
from typing import cast

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -27,13 +27,13 @@ def infer_dtype(self, sr: pd.Series) -> pd.Series:
n_empty_str = 0 if not pd.api.types.is_string_dtype(col) else sr.eq("").sum()

# Try to convert it to numeric
if col.dtype.kind not in ("i", "u", "f") and col.dtype.kind != 'M':
if col.dtype.kind not in ("i", "u", "f") and col.dtype.kind != "M":
col_num = pd.to_numeric(col, errors="coerce")
if col_num.isna().sum() == n_nans + n_empty_str:
col = col_num

# Try to convert it to date
if col.dtype.kind == "O" or col.dtype.kind == 'M':
if col.dtype.kind == "O" or col.dtype.kind == "M":
try:
col_date = pd.to_datetime(col, errors="coerce")
except TypeError:
Expand All @@ -48,7 +48,7 @@ def infer_dtype(self, sr: pd.Series) -> pd.Series:
elif out_dtype in ("i", "u", "f", "f8", "i8", "u8"):
return pd.to_numeric(col, errors="coerce")

return cast(pd.Series, col.astype(out_dtype, errors="ignore"))
return ty.cast(pd.Series, col.astype(out_dtype, errors="ignore")) # type: ignore

@abstractmethod
def continuous(self, sr: pd.Series):
Expand Down Expand Up @@ -83,9 +83,8 @@ class ColumnCheck(Check):
Categorical threshold log multiplier.
Default value is 2.5.
"""
def __init__(self,
min_num_unique: int = 10,
ctl_mult: float = 2.5):

def __init__(self, min_num_unique: int = 10, ctl_mult: float = 2.5):
self.min_num_unique = min_num_unique
self.ctl_mult = ctl_mult

Expand All @@ -95,9 +94,9 @@ def continuous(self, sr: pd.Series) -> bool:
"""
sr = self.infer_dtype(sr)
sr_dtype = str(sr.dtype)
if len(sr.unique()) >= max(self.min_num_unique,
self.ctl_mult * np.log(len(sr)))\
and sr_dtype in ("float64", "int64"):
if len(sr.unique()) >= max(
self.min_num_unique, self.ctl_mult * np.log(len(sr))
) and sr_dtype in ("float64", "int64"):
return True
return False

Expand All @@ -106,7 +105,9 @@ def categorical(self, sr: pd.Series) -> bool:
if pd.api.types.is_categorical_dtype(sr) is True:
return True
sr = self.infer_dtype(sr)
if sr.dtype.kind == "M": # TODO: Need to implement ability to deal with dates well in metrics
if (
sr.dtype.kind == "M"
): # TODO: Need to implement ability to deal with dates well in metrics
return False

if not self.continuous(sr):
Expand All @@ -119,14 +120,13 @@ def ordinal(self, sr: pd.Series) -> bool:
Columns which are categorical in nature, but contain numbers/dates/bool
are ordinal too. E.g. [2, 1, 1, 2, 7, 2, 1, 7]
"""
if (pd.api.types.is_categorical_dtype(sr) is True
and sr.cat.ordered is True)\
or pd.api.types.is_bool_dtype(sr) is True:
if (
pd.api.types.is_categorical_dtype(sr) is True and sr.cat.ordered is True
) or pd.api.types.is_bool_dtype(sr) is True:
return True

sr_inferred = self.infer_dtype(sr)
if sr_inferred.dtype in ("float64", "int64")\
or sr_inferred.dtype.kind in ("M", "m"):
if sr_inferred.dtype in ("float64", "int64") or sr_inferred.dtype.kind in ("M", "m"):
return True
return False

Expand All @@ -135,7 +135,6 @@ def affine(self, sr: pd.Series) -> bool:
Continuous columns, along with columns of type DateTime
and Timedelta, are affine.
"""
if self.continuous(sr) is True\
or self.infer_dtype(sr).dtype.kind in ("M", "m"):
if self.continuous(sr) is True or self.infer_dtype(sr).dtype.kind in ("M", "m"):
return True
return False
1 change: 1 addition & 0 deletions src/insight/database/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
Base = declarative_base()
mapped_column = Column


class Dataset(Base):
__tablename__ = "dataset"

Expand Down
Loading

0 comments on commit 259a874

Please sign in to comment.