Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DRAFT] refactor to grid stride for data_type_detection kernel #14694

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -658,6 +658,21 @@ set_source_files_properties(
PROPERTIES COMPILE_DEFINITIONS "_FILE_OFFSET_BITS=64"
)

set_source_files_properties(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you've changed anything in csv_gpu.cu that requires CMake alterations. Was there an improperly resolved merge conflict somewhere?

src/io/csv/csv_gpu.cu src/io/csv/durations.cu src/io/csv/reader_impl.cu src/io/csv/writer_impl.cu
)

set_source_files_properties(
src/binaryop/binaryop.cpp
src/jit/cache.cpp
src/rolling/detail/rolling_fixed_window.cu
src/rolling/detail/rolling_variable_window.cu
src/rolling/grouped_rolling.cu
src/rolling/rolling.cu
src/transform/transform.cpp
PROPERTIES COMPILE_DEFINITIONS "_FILE_OFFSET_BITS=64"
)

set_target_properties(
cudf
PROPERTIES BUILD_RPATH "\$ORIGIN"
Expand Down
204 changes: 103 additions & 101 deletions cpp/src/io/csv/csv_gpu.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -179,115 +179,117 @@ __global__ void __launch_bounds__(csvparse_block_dim)

// ThreadIds range per block, so also need the blockId
// This is entry into the fields; threadId is an element within `num_records`
auto const rec_id = grid_1d::global_thread_id();
auto const rec_id_next = rec_id + 1;

// we can have more threads than data, make sure we are not past the end of the data
if (rec_id_next >= row_offsets.size()) { return; }

auto field_start = raw_csv + row_offsets[rec_id];
auto const row_end = raw_csv + row_offsets[rec_id_next];

auto next_field = field_start;
int col = 0;
int actual_col = 0;

// Going through all the columns of a given record
while (col < column_flags.size() && field_start < row_end) {
auto next_delimiter = cudf::io::gpu::seek_field_end(field_start, row_end, opts);

// Checking if this is a column that the user wants --- user can filter columns
if (column_flags[col] & column_parse::inferred) {
// points to last character in the field
auto const field_len = static_cast<size_t>(next_delimiter - field_start);
if (serialized_trie_contains(opts.trie_na, {field_start, field_len})) {
atomicAdd(&d_column_data[actual_col].null_count, 1);
} else if (serialized_trie_contains(opts.trie_true, {field_start, field_len}) ||
serialized_trie_contains(opts.trie_false, {field_start, field_len})) {
atomicAdd(&d_column_data[actual_col].bool_count, 1);
} else if (cudf::io::is_infinity(field_start, next_delimiter)) {
atomicAdd(&d_column_data[actual_col].float_count, 1);
} else {
long count_number = 0;
long count_decimal = 0;
long count_thousands = 0;
long count_slash = 0;
long count_dash = 0;
long count_plus = 0;
long count_colon = 0;
long count_string = 0;
long count_exponent = 0;

// Modify field_start & end to ignore whitespace and quotechars
// This could possibly result in additional empty fields
auto const trimmed_field_range = trim_whitespaces_quotes(field_start, next_delimiter);
auto const trimmed_field_len = trimmed_field_range.second - trimmed_field_range.first;

for (auto cur = trimmed_field_range.first; cur < trimmed_field_range.second; ++cur) {
if (is_digit(*cur)) {
count_number++;
continue;
}
if (*cur == opts.decimal) {
count_decimal++;
continue;
}
if (*cur == opts.thousands) {
count_thousands++;
continue;
}
// Looking for unique characters that will help identify column types.
switch (*cur) {
case '-': count_dash++; break;
case '+': count_plus++; break;
case '/': count_slash++; break;
case ':': count_colon++; break;
case 'e':
case 'E':
if (cur > trimmed_field_range.first && cur < trimmed_field_range.second - 1)
count_exponent++;
break;
default: count_string++; break;

for (auto rec_id = grid_1d::global_thread_id(); rec_id < row_offsets.size();
rec_id += grid_1d::grid_stride()) {
// we can have more threads than data, make sure we are not past the end of the data
auto const rec_id_next = rec_id + 1;

if (rec_id_next >= row_offsets.size()) { return; }

auto field_start = raw_csv + row_offsets[rec_id];
auto const row_end = raw_csv + row_offsets[rec_id_next];

auto next_field = field_start;
int col = 0;
int actual_col = 0;
while (col < column_flags.size() && field_start < row_end) {
auto next_delimiter = cudf::io::gpu::seek_field_end(field_start, row_end, opts);

// Checking if this is a column that the user wants --- user can filter columns
if (column_flags[col] & column_parse::inferred) {
// points to last character in the field
auto const field_len = static_cast<size_t>(next_delimiter - field_start);
if (serialized_trie_contains(opts.trie_na, {field_start, field_len})) {
atomicAdd(&d_column_data[actual_col].null_count, 1);
} else if (serialized_trie_contains(opts.trie_true, {field_start, field_len}) ||
serialized_trie_contains(opts.trie_false, {field_start, field_len})) {
atomicAdd(&d_column_data[actual_col].bool_count, 1);
} else if (cudf::io::is_infinity(field_start, next_delimiter)) {
atomicAdd(&d_column_data[actual_col].float_count, 1);
} else {
long count_number = 0;
long count_decimal = 0;
long count_thousands = 0;
long count_slash = 0;
long count_dash = 0;
long count_plus = 0;
long count_colon = 0;
long count_string = 0;
long count_exponent = 0;

// Modify field_start & end to ignore whitespace and quotechars
// This could possibly result in additional empty fields
auto const trimmed_field_range = trim_whitespaces_quotes(field_start, next_delimiter);
auto const trimmed_field_len = trimmed_field_range.second - trimmed_field_range.first;

for (auto cur = trimmed_field_range.first; cur < trimmed_field_range.second; ++cur) {
if (is_digit(*cur)) {
count_number++;
continue;
}
if (*cur == opts.decimal) {
count_decimal++;
continue;
}
if (*cur == opts.thousands) {
count_thousands++;
continue;
}
// Looking for unique characters that will help identify column types.
switch (*cur) {
case '-': count_dash++; break;
case '+': count_plus++; break;
case '/': count_slash++; break;
case ':': count_colon++; break;
case 'e':
case 'E':
if (cur > trimmed_field_range.first && cur < trimmed_field_range.second - 1)
count_exponent++;
break;
default: count_string++; break;
}
}
}

// Integers have to have the length of the string
// Off by one if they start with a minus sign
auto const int_req_number_cnt =
trimmed_field_len - count_thousands -
((*trimmed_field_range.first == '-' || *trimmed_field_range.first == '+') &&
trimmed_field_len > 1);

if (column_flags[col] & column_parse::as_datetime) {
// PANDAS uses `object` dtype if the date is unparseable
if (is_datetime(count_string, count_decimal, count_colon, count_dash, count_slash)) {
atomicAdd(&d_column_data[actual_col].datetime_count, 1);
// Integers have to have the length of the string
// Off by one if they start with a minus sign
auto const int_req_number_cnt =
trimmed_field_len - count_thousands -
((*trimmed_field_range.first == '-' || *trimmed_field_range.first == '+') &&
trimmed_field_len > 1);

if (column_flags[col] & column_parse::as_datetime) {
// PANDAS uses `object` dtype if the date is unparseable
if (is_datetime(count_string, count_decimal, count_colon, count_dash, count_slash)) {
atomicAdd(&d_column_data[actual_col].datetime_count, 1);
} else {
atomicAdd(&d_column_data[actual_col].string_count, 1);
}
} else if (count_number == int_req_number_cnt) {
auto const is_negative = (*trimmed_field_range.first == '-');
auto const data_begin =
trimmed_field_range.first + (is_negative || (*trimmed_field_range.first == '+'));
cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter(
data_begin, data_begin + count_number, is_negative, d_column_data[actual_col]);
atomicAdd(ptr, 1);
} else if (is_floatingpoint(trimmed_field_len,
count_number,
count_decimal,
count_thousands,
count_dash + count_plus,
count_exponent)) {
atomicAdd(&d_column_data[actual_col].float_count, 1);
} else {
atomicAdd(&d_column_data[actual_col].string_count, 1);
}
} else if (count_number == int_req_number_cnt) {
auto const is_negative = (*trimmed_field_range.first == '-');
auto const data_begin =
trimmed_field_range.first + (is_negative || (*trimmed_field_range.first == '+'));
cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter(
data_begin, data_begin + count_number, is_negative, d_column_data[actual_col]);
atomicAdd(ptr, 1);
} else if (is_floatingpoint(trimmed_field_len,
count_number,
count_decimal,
count_thousands,
count_dash + count_plus,
count_exponent)) {
atomicAdd(&d_column_data[actual_col].float_count, 1);
} else {
atomicAdd(&d_column_data[actual_col].string_count, 1);
}
actual_col++;
}
actual_col++;
next_field = next_delimiter + 1;
field_start = next_field;
col++;
}
next_field = next_delimiter + 1;
field_start = next_field;
col++;
}
}

Expand Down