Skip to content

Commit

Permalink
Update to frictionless >=5.16
Browse files: browse the repository at this point in the history
  • Loading branch information
amotl committed Oct 20, 2024
1 parent 9bc11c1 commit 167cc20
Show file tree
Hide file tree
Showing 8 changed files with 29 additions and 89 deletions.
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ in progress
- Added support for Python 3.12 and 3.13
- Adjusted SQL DDL for sqlalchemy-cratedb 0.40.0
- Adjusted ddlgenerator wrapper for pandas 2
- Updated to frictionless >=5.16


2023-03-09 0.1.0
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ dependencies = [
"colorama<1",
"crash",
"ddlgenerator<0.2",
"frictionless[excel,json,ods,parquet,sql]<5.19",
"frictionless[excel,json,ods,parquet,sql]>=5.16,<5.19",
"fsspec[gcs,github,http,s3]==2024.9.0",
"json_stream<3",
"line-protocol-parser<2",
Expand Down
6 changes: 3 additions & 3 deletions skeem/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def info():
context_settings={"max_content_width": 120},
)
@click.argument("input", type=str, required=True)
@click.option("--dialect", type=str, required=False, help="Select SQLAlchemy dialect for generating SQL")
@click.option("--dialect", type=str, required=True, help="Select SQLAlchemy dialect for generating SQL")
@click.option("--table-name", type=str, required=False, help="Specify table name used in DDL statement")
@click.option(
"--primary-key",
Expand All @@ -124,10 +124,10 @@ def info():
@click.pass_context
def infer_ddl(
ctx: click.Context,
input: t.Optional[t.Union[Path, str]] = None, # noqa: A002
input: t.Union[Path, str], # noqa: A002
dialect: str,
address: t.Optional[str] = None,
content_type: t.Optional[str] = None,
dialect: t.Optional[str] = None,
table_name: t.Optional[str] = None,
primary_key: t.Optional[str] = None,
backend: t.Optional[str] = "ddlgen",
Expand Down
21 changes: 14 additions & 7 deletions skeem/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
from pathlib import Path

import pandas as pd
from frictionless import Control

from skeem.autopk import infer_pk
from skeem.exception import UnknownContentType
from skeem.frictionless.resource import TableSampleResource
from skeem.model import Resource, SqlResult, SqlTarget
from skeem.settings import FRICTIONLESS_CONTENT_TYPES
from skeem.type import ContentType
Expand Down Expand Up @@ -81,12 +83,14 @@ def _ddl_frictionless(self) -> SqlResult:
warnings.filterwarnings("ignore", category=GuessedAtParserWarning)

import frictionless.formats
import sqlalchemy as sa
from ddlgenerator.ddlgenerator import _dump
from frictionless.formats import ExcelControl, OdsControl

from skeem.ddlgen.ddlgenerator import TablePlus

# Sanity checks.
if not self.target.dialect:
raise ValueError("Inferring the database schema needs an SQLAlchemy dialect")

Check warning on line 92 in skeem/core.py

View check run for this annotation

Codecov / codecov/patch

skeem/core.py#L92

Added line #L92 was not covered by tests

frictionless_args: t.Dict[str, t.Union[str, t.IO]] = {}
if self.resource.path is not None:
frictionless_args["path"] = str(self.resource.path)
Expand All @@ -103,15 +107,15 @@ def _ddl_frictionless(self) -> SqlResult:
raise ValueError("Unable to read any data")

# Define resource controls.
control = None
control: t.Union[Control, None] = None
if self.resource.type is ContentType.ODS:
control = OdsControl(sheet=self.resource.address or 1)
elif self.resource.type is ContentType.XLSX:
control = ExcelControl(sheet=self.resource.address or 1)

# Open resource.
logger.info(f"Opening resource {frictionless_args} with {control}")
resource = frictionless.Resource(**frictionless_args, control=control)
logger.info(f"Opening resource {frictionless_args}. type={self.resource.type}, control={control}")
resource = TableSampleResource(**frictionless_args, control=control) # type: ignore[arg-type]

# When primary key is not given, try to infer it from the data.
# TODO: Make `infer_pk` obtain a `Resource` instance, and/or refactor as method.
Expand All @@ -126,8 +130,7 @@ def _ddl_frictionless(self) -> SqlResult:

# Infer schema.
logger.info("Inferring schema")
engine = sa.create_mock_engine(sa.engine.make_url(f"{self.target.dialect}://"), executor=_dump)
mapper = frictionless.formats.sql.SqlMapper(engine)
mapper = frictionless.formats.sql.SqlMapper(dialect=self.target.dialect)
descriptor = resource.to_descriptor()

# Either `schema` is already present, or it needs to be established by invoking `describe` first.
Expand All @@ -144,6 +147,10 @@ def _ddl_frictionless(self) -> SqlResult:
pk_field.required = True
schema.primary_key = [self.target.primary_key]

# Sanity checks.
if not self.target.table_name:
raise ValueError("Table name must not be empty")

Check warning on line 152 in skeem/core.py

View check run for this annotation

Codecov / codecov/patch

skeem/core.py#L152

Added line #L152 was not covered by tests

# Create SQLAlchemy table from schema.
logger.info("Converging schema to SQLAlchemy")
table = mapper.write_schema(schema, table_name=self.target.table_name, with_metadata=False)
Expand Down
18 changes: 1 addition & 17 deletions skeem/frictionless/monkey.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,12 @@
from .loader_stream import read_byte_stream_create
from .pandas_plugin import create_parser
from .parser_jsonl import read_cell_stream_create
from .parser_xlsx import read_loader
from .resource import ResourcePlus


def activate():
patch_core()
patch_modules()


def patch_core():
"""
Override sample size for all `frictionless.Resource` instances.
"""
import frictionless

frictionless.resource.Resource = ResourcePlus
frictionless.Resource = ResourcePlus


def patch_modules():
"""
Enhance `frictionless` loader and parser modules.
Expand All @@ -30,13 +17,10 @@ def patch_modules():
"""

import frictionless.formats.excel.parsers
import frictionless.formats.json.parsers
import frictionless.formats.pandas.plugin
import frictionless.schemes.aws.loaders.s3
import frictionless.schemes.stream.loader

frictionless.formats.json.parsers.JsonlParser.read_cell_stream_create = read_cell_stream_create
frictionless.formats.JsonlParser.read_cell_stream_create = read_cell_stream_create
frictionless.formats.pandas.plugin.PandasPlugin.create_parser = create_parser
frictionless.schemes.aws.loaders.s3.S3Loader.read_byte_stream_create = s3_read_byte_stream_create
frictionless.schemes.stream.loader.StreamLoader.read_byte_stream_create = read_byte_stream_create
frictionless.formats.excel.parsers.XlsxParser.read_loader = read_loader
4 changes: 2 additions & 2 deletions skeem/frictionless/parser_jsonl.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import frictionless.formats
from frictionless.formats import JsonlParser

read_cell_stream_create_original = frictionless.formats.json.parsers.JsonlParser.read_cell_stream_create
read_cell_stream_create_original = JsonlParser.read_cell_stream_create


def read_cell_stream_create(self):
Expand Down
52 changes: 0 additions & 52 deletions skeem/frictionless/parser_xlsx.py

This file was deleted.

14 changes: 7 additions & 7 deletions skeem/frictionless/resource.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
from frictionless import Resource
from frictionless.resources import TableResource

from skeem.settings import PEEK_LINES


class ResourcePlus(Resource):
class TableSampleResource(TableResource):
"""
Override sample size for frictionless `Resource` instances.
"""

def __init__(self, *args, **kwargs):
from frictionless import Detector
def __attrs_post_init__(self):
if self.detector is None:
from frictionless import Detector

Check warning on line 13 in skeem/frictionless/resource.py

View check run for this annotation

Codecov / codecov/patch

skeem/frictionless/resource.py#L13

Added line #L13 was not covered by tests

if "detector" not in kwargs:
kwargs["detector"] = Detector(sample_size=PEEK_LINES)
super().__init__(*args, **kwargs)
self.detector = Detector(sample_size=PEEK_LINES)

Check warning on line 15 in skeem/frictionless/resource.py

View check run for this annotation

Codecov / codecov/patch

skeem/frictionless/resource.py#L15

Added line #L15 was not covered by tests
super().__attrs_post_init__()

0 comments on commit 167cc20

Please sign in to comment.