Skip to content

Commit

Permalink
Merge branch 'branch-24.06' into set_get_host_memory_resource_with_pa…
Browse files Browse the repository at this point in the history
…rams
  • Loading branch information
vuule authored May 22, 2024
2 parents d1ccbaf + f6cca50 commit ce92c7a
Show file tree
Hide file tree
Showing 30 changed files with 760 additions and 366 deletions.
14 changes: 2 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,8 @@ You can import `cudf` directly and use it like `pandas`:

```python
import cudf
import requests
from io import StringIO

url = "https://github.com/plotly/datasets/raw/master/tips.csv"
content = requests.get(url).content.decode("utf-8")

tips_df = cudf.read_csv(StringIO(content))
tips_df = cudf.read_csv("https://github.com/plotly/datasets/raw/master/tips.csv")
tips_df["tip_percentage"] = tips_df["tip"] / tips_df["total_bill"] * 100

# display average tip by dining party size
Expand All @@ -36,13 +31,8 @@ supported operations and falling back to pandas when needed:
%load_ext cudf.pandas # pandas operations now use the GPU!

import pandas as pd
import requests
from io import StringIO

url = "https://github.com/plotly/datasets/raw/master/tips.csv"
content = requests.get(url).content.decode("utf-8")

tips_df = pd.read_csv(StringIO(content))
tips_df = pd.read_csv("https://github.com/plotly/datasets/raw/master/tips.csv")
tips_df["tip_percentage"] = tips_df["tip"] / tips_df["total_bill"] * 100

# display average tip by dining party size
Expand Down
35 changes: 35 additions & 0 deletions cpp/include/cudf/io/csv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ class csv_reader_options {
char _quotechar = '"';
// Whether a quote inside a value is double-quoted
bool _doublequote = true;
// Whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no effect when
// _doublequote is true
bool _detect_whitespace_around_quotes = false;
// Names of columns to read as datetime
std::vector<std::string> _parse_dates_names;
// Indexes of columns to read as datetime
Expand Down Expand Up @@ -375,6 +378,17 @@ class csv_reader_options {
*/
[[nodiscard]] bool is_enabled_doublequote() const { return _doublequote; }

/**
 * @brief Whether to detect quotes surrounded by spaces e.g. ` "data" `.
 *
 * This flag has no effect when doublequote is enabled (see `is_enabled_doublequote()`).
 *
 * @return `true` if detect_whitespace_around_quotes is enabled
 */
[[nodiscard]] bool is_enabled_detect_whitespace_around_quotes() const
{
  return _detect_whitespace_around_quotes;
}

/**
* @brief Returns names of columns to read as datetime.
*
Expand Down Expand Up @@ -698,6 +712,14 @@ class csv_reader_options {
*/
void enable_doublequote(bool val) { _doublequote = val; }

/**
 * @brief Sets whether to detect quotes surrounded by spaces e.g. ` "data" `.
 *
 * This flag has no effect when doublequote is enabled (see `enable_doublequote()`).
 *
 * @param val Boolean value to enable/disable
 */
void enable_detect_whitespace_around_quotes(bool val) { _detect_whitespace_around_quotes = val; }

/**
* @brief Sets names of columns to read as datetime.
*
Expand Down Expand Up @@ -1126,6 +1148,19 @@ class csv_reader_options_builder {
return *this;
}

/**
 * @brief Sets whether to detect quotes surrounded by spaces e.g. ` "data" `.
 *
 * This flag has no effect when doublequote is enabled (see `doublequote()`).
 *
 * @param val Boolean value to enable/disable
 * @return this for chaining
 */
csv_reader_options_builder& detect_whitespace_around_quotes(bool val)
{
  options._detect_whitespace_around_quotes = val;
  return *this;
}

/**
* @brief Sets names of columns to read as datetime.
*
Expand Down
16 changes: 13 additions & 3 deletions cpp/src/io/csv/csv_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -351,9 +351,19 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim)
if (dtypes[actual_col].id() == cudf::type_id::STRING) {
auto end = next_delimiter;
if (not options.keepquotes) {
if ((*field_start == options.quotechar) && (*(end - 1) == options.quotechar)) {
++field_start;
--end;
if (not options.detect_whitespace_around_quotes) {
if ((*field_start == options.quotechar) && (*(end - 1) == options.quotechar)) {
++field_start;
--end;
}
} else {
// If the string is quoted, whitespace around the quotes gets removed as well
auto const trimmed_field = trim_whitespaces(field_start, end);
if ((*trimmed_field.first == options.quotechar) &&
(*(trimmed_field.second - 1) == options.quotechar)) {
field_start = trimmed_field.first + 1;
end = trimmed_field.second - 1;
}
}
}
auto str_list = static_cast<std::pair<char const*, size_t>*>(columns[actual_col]);
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/io/csv/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -951,8 +951,10 @@ parse_options make_parse_options(csv_reader_options const& reader_opts,
parse_opts.terminator = reader_opts.get_lineterminator();

if (reader_opts.get_quotechar() != '\0' && reader_opts.get_quoting() != quote_style::NONE) {
parse_opts.quotechar = reader_opts.get_quotechar();
parse_opts.keepquotes = false;
parse_opts.quotechar = reader_opts.get_quotechar();
parse_opts.keepquotes = false;
parse_opts.detect_whitespace_around_quotes =
reader_opts.is_enabled_detect_whitespace_around_quotes();
parse_opts.doublequote = reader_opts.is_enabled_doublequote();
} else {
parse_opts.quotechar = '\0';
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/io/utilities/parsing_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ struct parse_options_view {
char thousands;
char comment;
bool keepquotes;
bool detect_whitespace_around_quotes;
bool doublequote;
bool dayfirst;
bool skipblanklines;
Expand All @@ -80,6 +81,7 @@ struct parse_options {
char thousands;
char comment;
bool keepquotes;
bool detect_whitespace_around_quotes;
bool doublequote;
bool dayfirst;
bool skipblanklines;
Expand All @@ -105,6 +107,7 @@ struct parse_options {
thousands,
comment,
keepquotes,
detect_whitespace_around_quotes,
doublequote,
dayfirst,
skipblanklines,
Expand Down
41 changes: 41 additions & 0 deletions cpp/tests/io/csv_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1018,6 +1018,47 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored)
view.column(1));
}

// Checks string parsing with `detect_whitespace_around_quotes` enabled: for quoted fields the
// whitespace outside the quotes is dropped together with the quotes, while unquoted fields and
// whitespace inside the quotes are returned unchanged.
TEST_F(CsvReaderTest, StringsQuotesWhitespace)
{
  std::vector<std::string> names{"line", "verse"};

  // The file name mirrors the test name so this test does not share a temp file with
  // StringsQuotesIgnored.
  auto filepath = temp_env->get_temp_dir() + "StringsQuotesWhitespace.csv";
  {
    std::ofstream outfile(filepath, std::ofstream::out);
    outfile << names[0] << ',' << names[1] << '\n';
    outfile << "A,a" << '\n';         // unquoted no whitespace
    outfile << " B,b" << '\n';        // unquoted leading whitespace
    outfile << "C ,c" << '\n';        // unquoted trailing whitespace
    outfile << " D ,d" << '\n';       // unquoted leading and trailing whitespace
    outfile << "\"E\",e" << '\n';     // quoted no whitespace
    outfile << "\"F\" ,f" << '\n';    // quoted trailing whitespace
    outfile << " \"G\",g" << '\n';    // quoted leading whitespace
    outfile << " \"H\" ,h" << '\n';   // quoted leading and trailing whitespace
    outfile << " \" I \" ,i"
            << '\n';  // quoted leading and trailing whitespace with spaces inside quotes
  }

  cudf::io::csv_reader_options in_opts =
    cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath})
      .names(names)
      .dtypes(std::vector<data_type>{dtype<cudf::string_view>(), dtype<cudf::string_view>()})
      .quoting(cudf::io::quote_style::ALL)
      .doublequote(false)
      .detect_whitespace_around_quotes(true);
  auto result = cudf::io::read_csv(in_opts);

  auto const view = result.tbl->view();
  ASSERT_EQ(2, view.num_columns());
  ASSERT_EQ(type_id::STRING, view.column(0).type().id());
  ASSERT_EQ(type_id::STRING, view.column(1).type().id());

  // Unquoted values keep their surrounding spaces; quoted values lose the quotes and any
  // whitespace outside them, but keep the spaces inside the quotes (" I ").
  expect_column_data_equal(
    std::vector<std::string>{"A", " B", "C ", " D ", "E", "F", "G", "H", " I "},
    view.column(0));
  expect_column_data_equal(std::vector<std::string>{"a", "b", "c", "d", "e", "f", "g", "h", "i"},
                           view.column(1));
}

TEST_F(CsvReaderTest, SkiprowsNrows)
{
auto filepath = temp_env->get_temp_dir() + "SkiprowsNrows.csv";
Expand Down
74 changes: 71 additions & 3 deletions python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,10 @@ def write_parquet(
object force_nullable_schema=False,
header_version="1.0",
use_dictionary=True,
object skip_compression=None,
object column_encoding=None,
object column_type_length=None,
object output_as_binary=None,
):
"""
Cython function to call into libcudf API, see `write_parquet`.
Expand Down Expand Up @@ -458,7 +462,12 @@ def write_parquet(
_set_col_metadata(
table[name]._column,
tbl_meta.column_metadata[i],
force_nullable_schema
force_nullable_schema,
None,
skip_compression,
column_encoding,
column_type_length,
output_as_binary
)

cdef map[string, string] tmp_user_data
Expand Down Expand Up @@ -810,16 +819,62 @@ cdef cudf_io_types.compression_type _get_comp_type(object compression):
raise ValueError("Unsupported `compression` type")


cdef cudf_io_types.column_encoding _get_encoding_type(object encoding):
    """
    Translate a user-supplied encoding name into a ``column_encoding`` value.

    ``None`` selects ``USE_DEFAULT``. Any other value is converted with
    ``str()`` and matched case-insensitively against the supported names;
    an unrecognized name raises ``ValueError``.
    """
    if encoding is None:
        return cudf_io_types.column_encoding.USE_DEFAULT

    requested = str(encoding).upper()

    if requested == "USE_DEFAULT":
        return cudf_io_types.column_encoding.USE_DEFAULT
    if requested == "PLAIN":
        return cudf_io_types.column_encoding.PLAIN
    if requested == "DICTIONARY":
        return cudf_io_types.column_encoding.DICTIONARY
    if requested == "DELTA_BINARY_PACKED":
        return cudf_io_types.column_encoding.DELTA_BINARY_PACKED
    if requested == "DELTA_LENGTH_BYTE_ARRAY":
        return cudf_io_types.column_encoding.DELTA_LENGTH_BYTE_ARRAY
    if requested == "DELTA_BYTE_ARRAY":
        return cudf_io_types.column_encoding.DELTA_BYTE_ARRAY
    if requested == "BYTE_STREAM_SPLIT":
        return cudf_io_types.column_encoding.BYTE_STREAM_SPLIT

    # None of the supported names matched
    raise ValueError("Unsupported `column_encoding` type")


cdef _set_col_metadata(
Column col,
column_in_metadata& col_meta,
bool force_nullable_schema=False,
str path=None,
object skip_compression=None,
object column_encoding=None,
object column_type_length=None,
object output_as_binary=None,
):
need_path = (skip_compression is not None or column_encoding is not None or
column_type_length is not None or output_as_binary is not None)
name = col_meta.get_name().decode('UTF-8') if need_path else None
full_path = path + "." + name if path is not None else name

if force_nullable_schema:
# Only set nullability if `force_nullable_schema`
# is true.
col_meta.set_nullability(True)

if skip_compression is not None and full_path in skip_compression:
col_meta.set_skip_compression(True)

if column_encoding is not None and full_path in column_encoding:
col_meta.set_encoding(_get_encoding_type(column_encoding[full_path]))

if column_type_length is not None and full_path in column_type_length:
col_meta.set_output_as_binary(True)
col_meta.set_type_length(column_type_length[full_path])

if output_as_binary is not None and full_path in output_as_binary:
col_meta.set_output_as_binary(True)

if isinstance(col.dtype, cudf.StructDtype):
for i, (child_col, name) in enumerate(
zip(col.children, list(col.dtype.fields))
Expand All @@ -828,13 +883,26 @@ cdef _set_col_metadata(
_set_col_metadata(
child_col,
col_meta.child(i),
force_nullable_schema
force_nullable_schema,
full_path,
skip_compression,
column_encoding,
column_type_length,
output_as_binary
)
elif isinstance(col.dtype, cudf.ListDtype):
if full_path is not None:
full_path = full_path + ".list"
col_meta.child(1).set_name("element".encode())
_set_col_metadata(
col.children[1],
col_meta.child(1),
force_nullable_schema
force_nullable_schema,
full_path,
skip_compression,
column_encoding,
column_type_length,
output_as_binary
)
elif isinstance(col.dtype, cudf.core.dtypes.DecimalDtype):
col_meta.set_decimal_precision(col.dtype.precision)
3 changes: 3 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ cdef extern from "cudf/io/csv.hpp" \
cudf_io_types.quote_style get_quoting() except +
char get_quotechar() except +
bool is_enabled_doublequote() except +
# Must match the getter declared in cudf/io/csv.hpp:
# csv_reader_options::is_enabled_detect_whitespace_around_quotes()
bool is_enabled_detect_whitespace_around_quotes() except +
vector[string] get_parse_dates_names() except +
vector[int] get_parse_dates_indexes() except +
vector[string] get_parse_hex_names() except +
Expand Down Expand Up @@ -95,6 +96,7 @@ cdef extern from "cudf/io/csv.hpp" \
void set_quoting(cudf_io_types.quote_style style) except +
void set_quotechar(char val) except +
void set_doublequote(bool val) except +
# The C++ member in cudf/io/csv.hpp is named
# enable_detect_whitespace_around_quotes; bind to it through a C name while
# keeping the set_* spelling on the Cython side.
void set_detect_whitespace_around_quotes \
    "enable_detect_whitespace_around_quotes"(bool val) except +
void set_parse_dates(vector[string]) except +
void set_parse_dates(vector[int]) except +
void set_parse_hex(vector[string]) except +
Expand Down Expand Up @@ -163,6 +165,7 @@ cdef extern from "cudf/io/csv.hpp" \
) except +
csv_reader_options_builder& quotechar(char val) except +
csv_reader_options_builder& doublequote(bool val) except +
csv_reader_options_builder& detect_whitespace_around_quotes(bool val) except +
csv_reader_options_builder& parse_dates(vector[string]) except +
csv_reader_options_builder& parse_dates(vector[int]) except +

Expand Down
18 changes: 17 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libc.stdint cimport uint8_t
from libc.stdint cimport int32_t, uint8_t
from libcpp cimport bool
from libcpp.map cimport map
from libcpp.memory cimport shared_ptr, unique_ptr
Expand Down Expand Up @@ -57,6 +57,19 @@ cdef extern from "cudf/io/types.hpp" \
ADAPTIVE = 1,
ALWAYS = 2,

cdef extern from "cudf/io/types.hpp" namespace "cudf::io" nogil:
cpdef enum class column_encoding:
USE_DEFAULT = -1
DICTIONARY = 0
PLAIN = 1
DELTA_BINARY_PACKED = 2
DELTA_LENGTH_BYTE_ARRAY =3
DELTA_BYTE_ARRAY = 4
BYTE_STREAM_SPLIT = 5
DIRECT = 6
DIRECT_V2 = 7
DICTIONARY_V2 = 8

cdef cppclass column_name_info:
string name
vector[column_name_info] children
Expand All @@ -81,6 +94,9 @@ cdef extern from "cudf/io/types.hpp" \
column_in_metadata& set_decimal_precision(uint8_t precision)
column_in_metadata& child(size_type i)
column_in_metadata& set_output_as_binary(bool binary)
column_in_metadata& set_type_length(int32_t type_length)
column_in_metadata& set_skip_compression(bool skip)
column_in_metadata& set_encoding(column_encoding enc)
string get_name()

cdef cppclass table_input_metadata:
Expand Down
Loading

0 comments on commit ce92c7a

Please sign in to comment.