From 824a2738dc0b2cdd8dce2d4256c9dc34bb589e6b Mon Sep 17 00:00:00 2001 From: Hedeer El Showk <144284759+hedeershowk@users.noreply.github.com> Date: Wed, 27 Sep 2023 12:50:45 -0400 Subject: [PATCH] BUG: add pyarrow autogenerated prefix (#55115) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add pyarrow autogenerated prefix * whats new bug fix * test with no head and pyarrow * only test pyarrow * BUG: This fixes #55009 (`raw=True` caused `apply` method of `DataFrame` to ignore passed arguments) (#55089) * fixes #55009 * update documentation * write documentation * add test * change formatting * cite DataDrame directly in docs Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * PR review feedback * Update doc/source/whatsnew/v2.2.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * alphabetical whatsnew --------- Co-authored-by: Martin Šícho Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/io/parsers/arrow_parser_wrapper.py | 6 ++++++ pandas/tests/io/parser/test_header.py | 18 ++++++++++++++++++ 3 files changed, 25 insertions(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 0760840f9950a..445b93705cde5 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -314,6 +314,7 @@ MultiIndex I/O ^^^ - Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) +- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) - Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 765a4ffcd2cb9..35965c90ee7fb 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -130,6 +130,12 @@ def handle_warning(invalid_row): ) } self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"] + # autogenerated column names are prefixed with 'f' in pyarrow.csv + if self.header is None and "include_columns" in self.convert_options: + self.convert_options["include_columns"] = [ + f"f{n}" for n in self.convert_options["include_columns"] + ] + self.read_options = { "autogenerate_column_names": self.header is None, "skip_rows": self.header diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index d72174c40478e..d6eab59074dd6 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -684,3 +684,21 @@ def test_header_delim_whitespace(all_parsers): result = parser.read_csv(StringIO(data), delim_whitespace=True) expected = DataFrame({"a,b": ["1,2", "3,4"]}) tm.assert_frame_equal(result, expected) + + +def test_usecols_no_header_pyarrow(pyarrow_parser_only): + parser = pyarrow_parser_only + data = """ +a,i,x +b,j,y +""" + result = parser.read_csv( + StringIO(data), + header=None, + usecols=[0, 1], + dtype="string[pyarrow]", + dtype_backend="pyarrow", + engine="pyarrow", + ) + expected = DataFrame([["a", "i"], ["b", "j"]], dtype="string[pyarrow]") + tm.assert_frame_equal(result, expected)