From 167cc20d644a849dba168456d681329d10a09e9d Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Sun, 20 Oct 2024 22:27:35 +0200 Subject: [PATCH] Update to frictionless >=5.16 --- CHANGES.rst | 1 + pyproject.toml | 2 +- skeem/cli.py | 6 ++-- skeem/core.py | 21 ++++++++---- skeem/frictionless/monkey.py | 18 +---------- skeem/frictionless/parser_jsonl.py | 4 +-- skeem/frictionless/parser_xlsx.py | 52 ------------------------------ skeem/frictionless/resource.py | 14 ++++---- 8 files changed, 29 insertions(+), 89 deletions(-) delete mode 100644 skeem/frictionless/parser_xlsx.py diff --git a/CHANGES.rst b/CHANGES.rst index c22fb81..e05d9f4 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -8,6 +8,7 @@ in progress - Added support for Python 3.12 and 3.13 - Adjusted SQL DDL for sqlalchemy-cratedb 0.40.0 - Adjusted ddlgenerator wrapper for pandas 2 +- Updated to frictionless >=5.16 2023-03-09 0.1.0 diff --git a/pyproject.toml b/pyproject.toml index 51cd528..0aebd14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,7 +90,7 @@ dependencies = [ "colorama<1", "crash", "ddlgenerator<0.2", - "frictionless[excel,json,ods,parquet,sql]<5.19", + "frictionless[excel,json,ods,parquet,sql]>=5.16,<5.19", "fsspec[gcs,github,http,s3]==2024.9.0", "json_stream<3", "line-protocol-parser<2", diff --git a/skeem/cli.py b/skeem/cli.py index d724927..b13c637 100644 --- a/skeem/cli.py +++ b/skeem/cli.py @@ -102,7 +102,7 @@ def info(): context_settings={"max_content_width": 120}, ) @click.argument("input", type=str, required=True) -@click.option("--dialect", type=str, required=False, help="Select SQLAlchemy dialect for generating SQL") +@click.option("--dialect", type=str, required=True, help="Select SQLAlchemy dialect for generating SQL") @click.option("--table-name", type=str, required=False, help="Specify table name used in DDL statement") @click.option( "--primary-key", @@ -124,10 +124,10 @@ def info(): @click.pass_context def infer_ddl( ctx: click.Context, - input: t.Optional[t.Union[Path, str]] = None, # noqa: A002 + input: t.Union[Path, str], # noqa: A002 + dialect: str, address: t.Optional[str] = None, content_type: t.Optional[str] = None, - dialect: t.Optional[str] = None, table_name: t.Optional[str] = None, primary_key: t.Optional[str] = None, backend: t.Optional[str] = "ddlgen", diff --git a/skeem/core.py b/skeem/core.py index 3be35d0..a3cf249 100644 --- a/skeem/core.py +++ b/skeem/core.py @@ -4,9 +4,11 @@ from pathlib import Path import pandas as pd +from frictionless import Control from skeem.autopk import infer_pk from skeem.exception import UnknownContentType +from skeem.frictionless.resource import TableSampleResource from skeem.model import Resource, SqlResult, SqlTarget from skeem.settings import FRICTIONLESS_CONTENT_TYPES from skeem.type import ContentType @@ -81,12 +83,14 @@ def _ddl_frictionless(self) -> SqlResult: warnings.filterwarnings("ignore", category=GuessedAtParserWarning) import frictionless.formats - import sqlalchemy as sa - from ddlgenerator.ddlgenerator import _dump from frictionless.formats import ExcelControl, OdsControl from skeem.ddlgen.ddlgenerator import TablePlus + # Sanity checks. + if not self.target.dialect: + raise ValueError("Inferring the database schema needs an SQLAlchemy dialect") + frictionless_args: t.Dict[str, t.Union[str, t.IO]] = {} if self.resource.path is not None: frictionless_args["path"] = str(self.resource.path) @@ -103,15 +107,15 @@ def _ddl_frictionless(self) -> SqlResult: raise ValueError("Unable to read any data") # Define resource controls. - control = None + control: t.Union[Control, None] = None if self.resource.type is ContentType.ODS: control = OdsControl(sheet=self.resource.address or 1) elif self.resource.type is ContentType.XLSX: control = ExcelControl(sheet=self.resource.address or 1) # Open resource. - logger.info(f"Opening resource {frictionless_args} with {control}") - resource = frictionless.Resource(**frictionless_args, control=control) + logger.info(f"Opening resource {frictionless_args}. type={self.resource.type}, control={control}") + resource = TableSampleResource(**frictionless_args, control=control) # type: ignore[arg-type] # When primary key is not given, try to infer it from the data. # TODO: Make `infer_pk` obtain a `Resource` instance, and/or refactor as method. @@ -126,8 +130,7 @@ def _ddl_frictionless(self) -> SqlResult: # Infer schema. logger.info("Inferring schema") - engine = sa.create_mock_engine(sa.engine.make_url(f"{self.target.dialect}://"), executor=_dump) - mapper = frictionless.formats.sql.SqlMapper(engine) + mapper = frictionless.formats.sql.SqlMapper(dialect=self.target.dialect) descriptor = resource.to_descriptor() # Either `schema` is already present, or it needs to be established by invoking `describe` first. @@ -144,6 +147,10 @@ def _ddl_frictionless(self) -> SqlResult: pk_field.required = True schema.primary_key = [self.target.primary_key] + # Sanity checks. + if not self.target.table_name: + raise ValueError("Table name must not be empty") + # Create SQLAlchemy table from schema. logger.info("Converging schema to SQLAlchemy") table = mapper.write_schema(schema, table_name=self.target.table_name, with_metadata=False) diff --git a/skeem/frictionless/monkey.py b/skeem/frictionless/monkey.py index 1ec746c..2bf0106 100644 --- a/skeem/frictionless/monkey.py +++ b/skeem/frictionless/monkey.py @@ -2,25 +2,12 @@ from .loader_stream import read_byte_stream_create from .pandas_plugin import create_parser from .parser_jsonl import read_cell_stream_create -from .parser_xlsx import read_loader -from .resource import ResourcePlus def activate(): - patch_core() patch_modules() -def patch_core(): - """ - Override sample size for all `frictionless.Resource` instances. - """ - import frictionless - - frictionless.resource.Resource = ResourcePlus - frictionless.Resource = ResourcePlus - - def patch_modules(): """ Enhance `frictionless` loader and parser modules. @@ -30,13 +17,10 @@ def patch_modules(): """ import frictionless.formats.excel.parsers - import frictionless.formats.json.parsers - import frictionless.formats.pandas.plugin import frictionless.schemes.aws.loaders.s3 import frictionless.schemes.stream.loader - frictionless.formats.json.parsers.JsonlParser.read_cell_stream_create = read_cell_stream_create + frictionless.formats.JsonlParser.read_cell_stream_create = read_cell_stream_create frictionless.formats.pandas.plugin.PandasPlugin.create_parser = create_parser frictionless.schemes.aws.loaders.s3.S3Loader.read_byte_stream_create = s3_read_byte_stream_create frictionless.schemes.stream.loader.StreamLoader.read_byte_stream_create = read_byte_stream_create - frictionless.formats.excel.parsers.XlsxParser.read_loader = read_loader diff --git a/skeem/frictionless/parser_jsonl.py b/skeem/frictionless/parser_jsonl.py index 0470578..0c5f643 100644 --- a/skeem/frictionless/parser_jsonl.py +++ b/skeem/frictionless/parser_jsonl.py @@ -1,6 +1,6 @@ -import frictionless.formats +from frictionless.formats import JsonlParser -read_cell_stream_create_original = frictionless.formats.json.parsers.JsonlParser.read_cell_stream_create +read_cell_stream_create_original = JsonlParser.read_cell_stream_create def read_cell_stream_create(self): diff --git a/skeem/frictionless/parser_xlsx.py b/skeem/frictionless/parser_xlsx.py deleted file mode 100644 index b5d69b5..0000000 --- a/skeem/frictionless/parser_xlsx.py +++ /dev/null @@ -1,52 +0,0 @@ -import atexit -import os -import shutil -import tempfile - -from frictionless.formats.excel.control import ExcelControl -from frictionless.resource.resource import Resource -from frictionless.system import system - - -def read_loader(self): - """ - Patched for Python 3.12. - - https://github.com/frictionlessdata/frictionless-py/issues/1642 - https://github.com/frictionlessdata/frictionless-py/pull/1684 - """ - control = ExcelControl.from_dialect(self.resource.dialect) - loader = system.create_loader(self.resource) - if not loader.remote: - return loader.open() - - # Remote - # Create copy for remote source - # For remote stream we need local copy (will be deleted on close by Python) - # https://docs.python.org/3.5/library/tempfile.html#tempfile.TemporaryFile - if loader.remote: - path = self.resource.normpath - - # Cached - if control.workbook_cache is not None and path in control.workbook_cache: - # TODO: rebase on using resource without system? - resource = Resource(path, scheme="file", format="xlsx") - resource.infer(sample=False) - loader = system.create_loader(resource) - return loader.open() - - with loader as loader: - delete = control.workbook_cache is None - target = tempfile.NamedTemporaryFile(delete=delete) - shutil.copyfileobj(loader.byte_stream, target) - target.seek(0) - if not delete: - control.workbook_cache[path] = target.name # type: ignore - atexit.register(os.remove, target.name) - # TODO: rebase on using resource without system? - resource = Resource(target, scheme="stream", format="xlsx") - resource.infer(sample=False) - loader = system.create_loader(resource) - return loader.open() - - return None diff --git a/skeem/frictionless/resource.py b/skeem/frictionless/resource.py index 95e87e4..5a9239c 100644 --- a/skeem/frictionless/resource.py +++ b/skeem/frictionless/resource.py @@ -1,16 +1,16 @@ -from frictionless import Resource +from frictionless.resources import TableResource from skeem.settings import PEEK_LINES -class ResourcePlus(Resource): +class TableSampleResource(TableResource): """ Override sample size for frictionless `Resource` instances. """ - def __init__(self, *args, **kwargs): - from frictionless import Detector + def __attrs_post_init__(self): + if self.detector is None: + from frictionless import Detector - if "detector" not in kwargs: - kwargs["detector"] = Detector(sample_size=PEEK_LINES) - super().__init__(*args, **kwargs) + self.detector = Detector(sample_size=PEEK_LINES) + super().__attrs_post_init__()