From 0e60e0b3382a166093fbbb140b0f126d47940bcd Mon Sep 17 00:00:00 2001 From: Pierre Camilleri <22995923+pierrecamilleri@users.noreply.github.com> Date: Mon, 16 Sep 2024 10:11:46 +0200 Subject: [PATCH] fix: Pandas parser does fail to parse integer or boolean only dataframes (#1683) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - fixes #1678 Converting the Series returned by `iterrows()` to a dict converts `np.int64` values to Python's native `int` type and fixes the bug (same with booleans). + Adding non-regression tests - I was also concerned with the next lines, especially: `if value is np.nan: value = None` - it was untested, so I added a test. It looks like `to_dict` would not change the behavior of `np.nan` conversion (see side note), so I left this code unchanged. - Primary keys are returned as `int`s or `tuple[int]`, no `np.int64` there - `Timestamp` values are kept unchanged, so the `if isinstance(value, pd.Timestamp):` still applies. ## Side note `np.nan` behavior is quite strange with `df.iterrows()`: in a numeric column, it will be converted to `float("nan")`, whereas in a string column it will be kept as `np.nan`. Calling `to_dict()` on the row Series does not change these types. 
--- .../formats/pandas/__spec__/test_parser.py | 44 ++++++++++++++++--- frictionless/formats/pandas/parser.py | 2 +- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/frictionless/formats/pandas/__spec__/test_parser.py b/frictionless/formats/pandas/__spec__/test_parser.py index cb60d791da..02e526d34b 100644 --- a/frictionless/formats/pandas/__spec__/test_parser.py +++ b/frictionless/formats/pandas/__spec__/test_parser.py @@ -2,6 +2,7 @@ from decimal import Decimal import isodate +import numpy as np import pandas as pd import pytz from dateutil.tz import tzoffset, tzutc @@ -14,13 +15,44 @@ def test_pandas_parser(): - dataframe = pd.DataFrame(data={"id": [1, 2], "name": ["english", "中国人"]}) + test_cases = [ + { + "name": "Integer type only dataframe, cf issue 1678", + "df_data": {"int": [1]}, + "expected_header": ["int"], + "expected_rows": [{"int": 1}], + }, + { + "name": "Boolean type only dataframe, cf issue 1678", + "df_data": {"bool": [True]}, + "expected_header": ["bool"], + "expected_rows": [{"bool": True}], + }, + { + "name": "Mixed types dataframe, chinese characters", + "df_data": {"id": [1, 2], "name": ["english", "中国人"]}, + "expected_header": ["id", "name"], + "expected_rows": [ + {"id": 1, "name": "english"}, + {"id": 2, "name": "中国人"}, + ], + }, + ] + for tc in test_cases: + dataframe = pd.DataFrame(data=tc["df_data"]) + + with TableResource(data=dataframe) as resource: + assert resource.header == tc["expected_header"], tc["name"] + assert resource.read_rows() == tc["expected_rows"], tc["name"] + + +def test_pandas_parser_with_nan(): + dataframe = pd.DataFrame(data={"x": [np.nan]}) + with TableResource(data=dataframe) as resource: - assert resource.header == ["id", "name"] - assert resource.read_rows() == [ - {"id": 1, "name": "english"}, - {"id": 2, "name": "中国人"}, - ] + test_name = 'np.nan converted to Decimal("NaN")' + row = resource.read_rows()[0] + assert row["x"].is_nan(), test_name def 
test_pandas_parser_from_dataframe_with_primary_key_having_datetime(): diff --git a/frictionless/formats/pandas/parser.py b/frictionless/formats/pandas/parser.py index 3823640efd..da98149234 100644 --- a/frictionless/formats/pandas/parser.py +++ b/frictionless/formats/pandas/parser.py @@ -53,7 +53,7 @@ def read_cell_stream_create(self): pk = pk if isinstance(pk, tuple) else [pk] # type: ignore value = pk[schema.primary_key.index(field.name)] # type: ignore else: - value = item[field.name] + value = item.to_dict()[field.name] if value is np.nan: value = None elif isinstance(value, pd.Timestamp):