diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index eeafc411874..ff00c484501 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -438,7 +438,6 @@ add_library( src/io/text/bgzip_data_chunk_source.cu src/io/text/bgzip_utils.cpp src/io/text/multibyte_split.cu - src/io/utilities/arrow_io_source.cpp src/io/utilities/base64_utilities.cpp src/io/utilities/column_buffer.cpp src/io/utilities/column_buffer_strings.cu diff --git a/cpp/include/cudf/io/arrow_io_source.hpp b/cpp/include/cudf/io/arrow_io_source.hpp deleted file mode 100644 index ed5c839cbb4..00000000000 --- a/cpp/include/cudf/io/arrow_io_source.hpp +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "datasource.hpp" - -#include - -#include -#include - -#include -#include -#include - -namespace CUDF_EXPORT cudf { -namespace io { -/** - * @addtogroup io_datasources - * @{ - * @file - */ - -/** - * @brief Implementation class for reading from an Apache Arrow file. The file - * could be a memory-mapped file or other implementation supported by Arrow. - */ -class arrow_io_source : public datasource { - public: - /** - * @brief Constructs an object from an Apache Arrow Filesystem URI - * - * @param arrow_uri Apache Arrow Filesystem URI - */ - explicit arrow_io_source(std::string const& arrow_uri); - - /** - * @brief Constructs an object from an `arrow` source object. - * - * @param file The `arrow` object from which the data is read - */ - explicit arrow_io_source(std::shared_ptr file) - : arrow_file(std::move(file)) - { - } - - /** - * @brief Returns a buffer with a subset of data from the `arrow` source. - * - * @param offset The offset in bytes from which to read - * @param size The number of bytes to read - * @return A buffer with the read data - */ - std::unique_ptr host_read(size_t offset, size_t size) override; - - /** - * @brief Reads a selected range from the `arrow` source into a preallocated buffer. - * - * @param[in] offset The offset in bytes from which to read - * @param[in] size The number of bytes to read - * @param[out] dst The preallocated buffer to read into - * @return The number of bytes read - */ - size_t host_read(size_t offset, size_t size, uint8_t* dst) override; - /** - * @brief Returns the size of the data in the `arrow` source. - * - * @return The size of the data in the `arrow` source - */ - [[nodiscard]] size_t size() const override; - - private: - std::shared_ptr filesystem; - std::shared_ptr arrow_file; -}; - -/** @} */ // end of group -} // namespace io -} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/utilities/arrow_io_source.cpp b/cpp/src/io/utilities/arrow_io_source.cpp deleted file mode 100644 index 157240b8b08..00000000000 --- a/cpp/src/io/utilities/arrow_io_source.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include - -#include -#include -#include - -namespace cudf::io { - -/** - * @brief Implementation for an owning buffer where `arrow::Buffer` holds the data. - */ -class arrow_io_buffer : public datasource::buffer { - std::shared_ptr arrow_buffer; - - public: - explicit arrow_io_buffer(std::shared_ptr arrow_buffer) - : arrow_buffer(std::move(arrow_buffer)) - { - } - [[nodiscard]] size_t size() const override { return arrow_buffer->size(); } - [[nodiscard]] uint8_t const* data() const override { return arrow_buffer->data(); } -}; - -arrow_io_source::arrow_io_source(std::string const& arrow_uri) -{ - std::string const uri_start_delimiter = "//"; - std::string const uri_end_delimiter = "?"; - - auto const result = arrow::fs::FileSystemFromUri(arrow_uri); - CUDF_EXPECTS(result.ok(), "Failed to generate Arrow Filesystem instance from URI."); - filesystem = result.ValueOrDie(); - - // Parse the path from the URI - auto const start = [&]() { - auto const delim_start = arrow_uri.find(uri_start_delimiter); - return delim_start == std::string::npos ? 0 : delim_start + uri_start_delimiter.size(); - }(); - auto const end = arrow_uri.find(uri_end_delimiter) - start; - auto const path = arrow_uri.substr(start, end); - - auto const in_stream = filesystem->OpenInputFile(path); - CUDF_EXPECTS(in_stream.ok(), "Failed to open Arrow RandomAccessFile"); - arrow_file = in_stream.ValueOrDie(); -} - -std::unique_ptr arrow_io_source::host_read(size_t offset, size_t size) -{ - auto const result = arrow_file->ReadAt(offset, size); - CUDF_EXPECTS(result.ok(), "Cannot read file data"); - return std::make_unique(result.ValueOrDie()); -} - -size_t arrow_io_source::host_read(size_t offset, size_t size, uint8_t* dst) -{ - auto const result = arrow_file->ReadAt(offset, size, dst); - CUDF_EXPECTS(result.ok(), "Cannot read file data"); - return result.ValueOrDie(); -} - -[[nodiscard]] size_t arrow_io_source::size() const -{ - auto const result = arrow_file->GetSize(); - CUDF_EXPECTS(result.ok(), "Cannot get file size"); - return result.ValueOrDie(); -} - -} // namespace cudf::io diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 006b36add0e..ac77a362e1c 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -321,7 +321,6 @@ ConfigureTest( ConfigureTest(JSON_WRITER_TEST io/json/json_writer.cpp) ConfigureTest(JSON_TYPE_CAST_TEST io/json/json_type_cast_test.cu) ConfigureTest(NESTED_JSON_TEST io/json/nested_json_test.cpp io/json/json_tree.cpp) -ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(JSON_QUOTE_NORMALIZATION io/json/json_quote_normalization_test.cpp) ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json/json_whitespace_normalization_test.cu) @@ -334,9 +333,6 @@ target_link_libraries(DATA_CHUNK_SOURCE_TEST PRIVATE ZLIB::ZLIB) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) ConfigureTest(FST_TEST io/fst/fst_test.cu) ConfigureTest(TYPE_INFERENCE_TEST io/type_inference_test.cu) -if(CUDF_ENABLE_ARROW_S3) - target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") -endif() # ################################################################################################## # * sort tests ------------------------------------------------------------------------------------ diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp deleted file mode 100644 index ffdf2c7e00f..00000000000 --- a/cpp/tests/io/arrow_io_source_test.cpp +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -// Global environment for temporary files -auto const temp_env = static_cast( - ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); - -// Base test fixture for tests -struct ArrowIOTest : public cudf::test::BaseFixture {}; - -TEST_F(ArrowIOTest, URIFileSystem) -{ - const std::string file_name = temp_env->get_temp_dir() + "JsonLinesFileTest.json"; - std::ofstream outfile(file_name, std::ofstream::out); - outfile << "{\"a\":11, \"b\":1.1}\n{\"a\":22, \"b\":2.2}"; - outfile.close(); - - std::string file_uri = "file://" + file_name; - auto datasource = std::make_unique(file_uri); - - // Populate the JSON Reader Options - cudf::io::json_reader_options options = - cudf::io::json_reader_options::builder(cudf::io::source_info(datasource.get())).lines(true); - - // Read the JSON file from the LocalFileSystem - cudf::io::table_with_metadata tbl = cudf::io::read_json(options); - - ASSERT_EQ(2, tbl.tbl->num_columns()); - ASSERT_EQ(2, tbl.tbl->num_rows()); -} - -TEST_F(ArrowIOTest, S3FileSystem) -{ - std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2"; - - // Check to see if Arrow was built with support for S3. If not, ensure this - // test throws. If so, validate the S3 file contents. - auto const s3_unsupported = arrow::fs::FileSystemFromUri(s3_uri).status().IsNotImplemented(); - if (s3_unsupported) { - EXPECT_THROW(std::make_unique(s3_uri), cudf::logic_error); - } else { - auto datasource = std::make_unique(s3_uri); - - // Populate the Parquet Reader Options - cudf::io::source_info src(datasource.get()); - std::vector single_column; - single_column.insert(single_column.begin(), "total_bill"); - cudf::io::parquet_reader_options_builder builder(src); - cudf::io::parquet_reader_options options = builder.columns(single_column).build(); - - // Read the Parquet file from S3 - cudf::io::table_with_metadata tbl = cudf::io::read_parquet(options); - - ASSERT_EQ(1, tbl.tbl->num_columns()); // Only single column specified in reader_options - ASSERT_EQ(244, tbl.tbl->num_rows()); // known number of rows from the S3 file - } - -#ifdef ARROW_S3 - if (!s3_unsupported) { - // Verify that we are using Arrow with S3, and call finalize - // https://github.com/apache/arrow/issues/36974 - // This needs to be in a separate conditional to ensure we call - // finalize after all arrow_io_source instances have been deleted. - [[maybe_unused]] auto _ = arrow::fs::EnsureS3Finalized(); - } -#endif -} - -CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index ff433264446..dc14824d834 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -25,8 +25,8 @@ #include #include -#include #include +#include #include #include #include @@ -1197,30 +1197,6 @@ TEST_F(CsvReaderTest, HeaderOnlyFile) EXPECT_EQ(3, view.num_columns()); } -TEST_F(CsvReaderTest, ArrowFileSource) -{ - auto filepath = temp_env->get_temp_dir() + "ArrowFileSource.csv"; - { - std::ofstream outfile(filepath, std::ofstream::out); - outfile << "A\n9\n8\n7\n6\n5\n4\n3\n2\n"; - } - - std::shared_ptr infile; - ASSERT_TRUE(arrow::io::ReadableFile::Open(filepath).Value(&infile).ok()); - - auto arrow_source = cudf::io::arrow_io_source{infile}; - cudf::io::csv_reader_options in_opts = - cudf::io::csv_reader_options::builder(cudf::io::source_info{&arrow_source}) - .dtypes({dtype()}); - auto result = cudf::io::read_csv(in_opts); - - auto const view = result.tbl->view(); - EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(type_id::INT8, view.column(0).type().id()); - - expect_column_data_equal(std::vector{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0)); -} - TEST_F(CsvReaderTest, InvalidFloatingPoint) { auto const filepath = temp_env->get_temp_dir() + "InvalidFloatingPoint.csv"; diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 0a485e26b71..576a698ba31 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -26,7 +26,6 @@ #include #include -#include #include #include #include @@ -958,31 +957,6 @@ TEST_F(JsonReaderTest, NoDataFileValues) EXPECT_EQ(0, view.num_columns()); } -TEST_F(JsonReaderTest, ArrowFileSource) -{ - const std::string fname = temp_env->get_temp_dir() + "ArrowFileSource.csv"; - - std::ofstream outfile(fname, std::ofstream::out); - outfile << "[9]\n[8]\n[7]\n[6]\n[5]\n[4]\n[3]\n[2]\n"; - outfile.close(); - - std::shared_ptr infile; - ASSERT_TRUE(arrow::io::ReadableFile::Open(fname).Value(&infile).ok()); - - auto arrow_source = cudf::io::arrow_io_source{infile}; - cudf::io::json_reader_options in_options = - cudf::io::json_reader_options::builder(cudf::io::source_info{&arrow_source}) - .dtypes({dtype()}) - .lines(true); - - cudf::io::table_with_metadata result = cudf::io::read_json(in_options); - - EXPECT_EQ(result.tbl->num_columns(), 1); - EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT8); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int8_wrapper{{9, 8, 7, 6, 5, 4, 3, 2}}); -} - TEST_P(JsonReaderParamTest, InvalidFloatingPoint) { auto const test_opt = GetParam(); diff --git a/python/pylibcudf/pylibcudf/libcudf/io/arrow_io_source.pxd b/python/pylibcudf/pylibcudf/libcudf/io/arrow_io_source.pxd deleted file mode 100644 index 54a913a9ce3..00000000000 --- a/python/pylibcudf/pylibcudf/libcudf/io/arrow_io_source.pxd +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. - -cimport pylibcudf.libcudf.io.datasource as cudf_io_datasource -from libcpp.memory cimport shared_ptr -from libcpp.string cimport string -from pyarrow.includes.libarrow cimport CRandomAccessFile - - -cdef extern from "cudf/io/arrow_io_source.hpp" \ - namespace "cudf::io" nogil: - - cdef cppclass arrow_io_source(cudf_io_datasource.datasource): - arrow_io_source(const string& arrow_uri) except + - arrow_io_source(shared_ptr[CRandomAccessFile]) except +