From 0e60e0b3382a166093fbbb140b0f126d47940bcd Mon Sep 17 00:00:00 2001
From: Pierre Camilleri <22995923+pierrecamilleri@users.noreply.github.com>
Date: Mon, 16 Sep 2024 10:11:46 +0200
Subject: [PATCH] fix: Pandas parser fails to parse integer-only or
 boolean-only dataframes (#1683)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- fixes #1678

Converting the Series returned by `iterrows()` to a dict converts
`np.int64` values to Python's native `int` type, which fixes the bug (the
same applies to booleans).

+ Adding regression tests

- I was also concerned about the lines that follow, especially `if value is
np.nan: value = None`
- That code was untested, so I added a test. It looks like `to_dict` does not
change how `np.nan` is converted (see side note), so I left
this code unchanged.
- Primary keys are returned as `int`s or `tuple[int]`, so there is no
`np.int64` to convert there
- `Timestamp` values are kept unchanged, so the `isinstance(value,
pd.Timestamp)` check still applies.

## Side note

`np.nan` behaves inconsistently with `df.iterrows()`: in a numeric
column it is converted to `float("nan")`, whereas in a string column
it is kept as `np.nan`. Calling `to_dict()` on the row Series does
not change these NaN types.
---
 .../formats/pandas/__spec__/test_parser.py    | 44 ++++++++++++++++---
 frictionless/formats/pandas/parser.py         |  2 +-
 2 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/frictionless/formats/pandas/__spec__/test_parser.py b/frictionless/formats/pandas/__spec__/test_parser.py
index cb60d791da..02e526d34b 100644
--- a/frictionless/formats/pandas/__spec__/test_parser.py
+++ b/frictionless/formats/pandas/__spec__/test_parser.py
@@ -2,6 +2,7 @@
 from decimal import Decimal
 
 import isodate
+import numpy as np
 import pandas as pd
 import pytz
 from dateutil.tz import tzoffset, tzutc
@@ -14,13 +15,44 @@
 
 
 def test_pandas_parser():
-    dataframe = pd.DataFrame(data={"id": [1, 2], "name": ["english", "中国人"]})
+    test_cases = [
+        {
+            "name": "Integer type only dataframe, cf issue 1678",
+            "df_data": {"int": [1]},
+            "expected_header": ["int"],
+            "expected_rows": [{"int": 1}],
+        },
+        {
+            "name": "Boolean type only dataframe, cf issue 1678",
+            "df_data": {"bool": [True]},
+            "expected_header": ["bool"],
+            "expected_rows": [{"bool": True}],
+        },
+        {
+            "name": "Mixed types dataframe, chinese characters",
+            "df_data": {"id": [1, 2], "name": ["english", "中国人"]},
+            "expected_header": ["id", "name"],
+            "expected_rows": [
+                {"id": 1, "name": "english"},
+                {"id": 2, "name": "中国人"},
+            ],
+        },
+    ]
+    for tc in test_cases:
+        dataframe = pd.DataFrame(data=tc["df_data"])
+
+        with TableResource(data=dataframe) as resource:
+            assert resource.header == tc["expected_header"], tc["name"]
+            assert resource.read_rows() == tc["expected_rows"], tc["name"]
+
+
+def test_pandas_parser_with_nan():
+    dataframe = pd.DataFrame(data={"x": [np.nan]})
+
     with TableResource(data=dataframe) as resource:
-        assert resource.header == ["id", "name"]
-        assert resource.read_rows() == [
-            {"id": 1, "name": "english"},
-            {"id": 2, "name": "中国人"},
-        ]
+        test_name = 'np.nan converted to Decimal("NaN")'
+        row = resource.read_rows()[0]
+        assert row["x"].is_nan(), test_name
 
 
 def test_pandas_parser_from_dataframe_with_primary_key_having_datetime():
diff --git a/frictionless/formats/pandas/parser.py b/frictionless/formats/pandas/parser.py
index 3823640efd..da98149234 100644
--- a/frictionless/formats/pandas/parser.py
+++ b/frictionless/formats/pandas/parser.py
@@ -53,7 +53,7 @@ def read_cell_stream_create(self):
                     pk = pk if isinstance(pk, tuple) else [pk]  # type: ignore
                     value = pk[schema.primary_key.index(field.name)]  # type: ignore
                 else:
-                    value = item[field.name]
+                    value = item.to_dict()[field.name]
                 if value is np.nan:
                     value = None
                 elif isinstance(value, pd.Timestamp):