diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 273e82edf8b..054366f9d56 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -21,6 +21,7 @@ #include "io/utilities/trie.cuh" #include +#include #include #include #include @@ -398,12 +399,31 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) } } +namespace { + +/** + * @brief pack multiple row contexts together + * + * Pack four rowctx32_t values, where each value represents the output row context + * for one of four possible input contexts when parsing a character block. + * Each output state consists of the 2-bit row context state along with a 18-bit row count + * value (row count is assumed to be a local count that fits in 18-bit) + * The four 20-bit values are concatenated to form a 80-bit value, truncated to 64-bit + * since a block starting in a EOF state can only have a zero row count (and the output + * state corresponding to an EOF input state can only be EOF, so only the first 3 output + * states are included as parameters, and the EOF->EOF state transition is hardcoded) + */ +constexpr packed_rowctx_t pack_row_contexts(rowctx32_t ctx0, rowctx32_t ctx1, rowctx32_t ctx2) +{ + return (ctx0) | (static_cast(ctx1) << 20) | (static_cast(ctx2) << 40) | + (static_cast(ROW_CTX_EOF) << 60); +} + /* * @brief Merge two packed row contexts (each corresponding to a block of characters) * and return the packed row context corresponding to the merged character block */ -inline __device__ packed_rowctx_t merge_row_contexts(packed_rowctx_t first_ctx, - packed_rowctx_t second_ctx) +__device__ packed_rowctx_t merge_row_contexts(packed_rowctx_t first_ctx, packed_rowctx_t second_ctx) { uint32_t id0 = get_row_context(first_ctx, ROW_CTX_NONE) & 3; uint32_t id1 = get_row_context(first_ctx, ROW_CTX_QUOTE) & 3; @@ -443,7 +463,7 @@ constexpr __device__ uint32_t make_char_context(uint32_t id0, * The char_ctx value should be created via make_char_context, and its value should * have been evaluated at compile-time. */ -inline __device__ void merge_char_context(uint4& ctx, uint32_t char_ctx, uint32_t pos) +__device__ void merge_char_context(uint4& ctx, uint32_t char_ctx, uint32_t pos) { uint32_t id0 = (ctx.w >> 0) & 3; uint32_t id1 = (ctx.w >> 2) & 3; @@ -460,7 +480,7 @@ inline __device__ void merge_char_context(uint4& ctx, uint32_t char_ctx, uint32_ /* * Convert the context-with-row-bitmaps version to a packed row context */ -inline __device__ packed_rowctx_t pack_rowmaps(uint4 ctx_map) +__device__ packed_rowctx_t pack_rowmaps(uint4 ctx_map) { return pack_row_contexts(make_row_context(__popc(ctx_map.x), (ctx_map.w >> 0) & 3), make_row_context(__popc(ctx_map.y), (ctx_map.w >> 2) & 3), @@ -470,7 +490,7 @@ inline __device__ packed_rowctx_t pack_rowmaps(uint4 ctx_map) /* * Selects the row bitmap corresponding to the given parser state */ -inline __device__ uint32_t select_rowmap(uint4 ctx_map, uint32_t ctxid) +__device__ uint32_t select_rowmap(uint4 ctx_map, uint32_t ctxid) { return (ctxid == ROW_CTX_NONE) ? ctx_map.x : (ctxid == ROW_CTX_QUOTE) ? ctx_map.y @@ -495,7 +515,7 @@ inline __device__ uint32_t select_rowmap(uint4 ctx_map, uint32_t ctxid) * @param t thread id (leaf node id) */ template -inline __device__ void ctx_merge(uint64_t* ctxtree, packed_rowctx_t* ctxb, uint32_t t) +__device__ void ctx_merge(device_span ctxtree, packed_rowctx_t* ctxb, uint32_t t) { uint64_t tmp = shuffle_xor(*ctxb, lanemask); if (!(t & tmask)) { @@ -517,13 +537,15 @@ inline __device__ void ctx_merge(uint64_t* ctxtree, packed_rowctx_t* ctxb, uint3 * @param[in] t thread id (leaf node id) */ template -inline __device__ void ctx_unmerge( - uint32_t base, uint64_t* ctxtree, uint32_t* ctx, uint32_t* brow4, uint32_t t) +__device__ void ctx_unmerge(uint32_t base, + device_span ctxtree, + uint32_t* ctx, + uint32_t* brow4, + uint32_t t) { - rowctx32_t ctxb_left, ctxb_right, ctxb_sum; - ctxb_sum = get_row_context(ctxtree[base], *ctx); - ctxb_left = get_row_context(ctxtree[(base)*2 + 0], *ctx); - ctxb_right = get_row_context(ctxtree[(base)*2 + 1], ctxb_left & 3); + auto const ctxb_sum = get_row_context(ctxtree[base], *ctx); + auto const ctxb_left = get_row_context(ctxtree[(base)*2 + 0], *ctx); + auto const ctxb_right = get_row_context(ctxtree[(base)*2 + 1], ctxb_left & 3); if (t & (rmask)) { *brow4 += (ctxb_sum & ~3) - (ctxb_right & ~3); *ctx = ctxb_left & 3; @@ -550,9 +572,9 @@ inline __device__ void ctx_unmerge( * @param[in] ctxb packed row context for the current character block * @param t thread id (leaf node id) */ -static inline __device__ void rowctx_merge_transform(uint64_t ctxtree[1024], - packed_rowctx_t ctxb, - uint32_t t) +__device__ void rowctx_merge_transform(device_span ctxtree, + packed_rowctx_t ctxb, + uint32_t t) { ctxtree[512 + t] = ctxb; ctx_merge<1, 0x1, 256, 1>(ctxtree, &ctxb, t); @@ -584,8 +606,8 @@ static inline __device__ void rowctx_merge_transform(uint64_t ctxtree[1024], * * @return Final row context and count (row_position*4 + context_id format) */ -static inline __device__ rowctx32_t rowctx_inverse_merge_transform(uint64_t ctxtree[1024], - uint32_t t) +__device__ rowctx32_t rowctx_inverse_merge_transform(device_span ctxtree, + uint32_t t) { uint32_t ctx = ctxtree[0] & 3; // Starting input context rowctx32_t brow4 = 0; // output row in block *4 @@ -602,6 +624,7 @@ static inline __device__ rowctx32_t rowctx_inverse_merge_transform(uint64_t ctxt return brow4 + ctx; } +} // namespace /** * @brief Gather row offsets from CSV character data split into 16KB chunks @@ -653,8 +676,10 @@ CUDF_KERNEL void __launch_bounds__(rowofs_block_dim) using block_reduce = typename cub::BlockReduce; __shared__ union { typename block_reduce::TempStorage bk_storage; - __align__(8) uint64_t ctxtree[rowofs_block_dim * 2]; + packed_rowctx_t ctxtree[rowofs_block_dim * 2]; } temp_storage; + auto const ctxtree_span = + device_span(temp_storage.ctxtree, rowofs_block_dim * 2); char const* end = start + (min(parse_pos + chunk_size, data_size) - start_offset); uint32_t t = threadIdx.x; @@ -723,16 +748,16 @@ CUDF_KERNEL void __launch_bounds__(rowofs_block_dim) // Convert the long-form {rowmap,outctx}[inctx] version into packed version // {rowcount,ouctx}[inctx], then merge the row contexts of the 32-character blocks into // a single 16K-character block context - rowctx_merge_transform(temp_storage.ctxtree, pack_rowmaps(ctx_map), t); + rowctx_merge_transform(ctxtree_span, pack_rowmaps(ctx_map), t); // If this is the second phase, get the block's initial parser state and row counter if (offsets_out.data()) { - if (t == 0) { temp_storage.ctxtree[0] = row_ctx[blockIdx.x]; } + if (t == 0) { ctxtree_span[0] = row_ctx[blockIdx.x]; } __syncthreads(); // Walk back the transform tree with the known initial parser state - rowctx32_t ctx = rowctx_inverse_merge_transform(temp_storage.ctxtree, t); - uint64_t row = (temp_storage.ctxtree[0] >> 2) + (ctx >> 2); + rowctx32_t ctx = rowctx_inverse_merge_transform(ctxtree_span, t); + uint64_t row = (ctxtree_span[0] >> 2) + (ctx >> 2); uint32_t rows_out_of_range = 0; uint32_t rowmap = select_rowmap(ctx_map, ctx & 3); // Output row positions @@ -753,14 +778,14 @@ CUDF_KERNEL void __launch_bounds__(rowofs_block_dim) if (t == 0) { row_ctx[blockIdx.x] = rows_out_of_range; } } else { // Just store the row counts and output contexts - if (t == 0) { row_ctx[blockIdx.x] = temp_storage.ctxtree[1]; } + if (t == 0) { row_ctx[blockIdx.x] = ctxtree_span[1]; } } } -size_t __host__ count_blank_rows(cudf::io::parse_options_view const& opts, - device_span data, - device_span row_offsets, - rmm::cuda_stream_view stream) +size_t count_blank_rows(cudf::io::parse_options_view const& opts, + device_span data, + device_span row_offsets, + rmm::cuda_stream_view stream) { auto const newline = opts.skipblanklines ? opts.terminator : opts.comment; auto const comment = opts.comment != '\0' ? opts.comment : newline; @@ -775,10 +800,10 @@ size_t __host__ count_blank_rows(cudf::io::parse_options_view const& opts, }); } -device_span __host__ remove_blank_rows(cudf::io::parse_options_view const& options, - device_span data, - device_span row_offsets, - rmm::cuda_stream_view stream) +device_span remove_blank_rows(cudf::io::parse_options_view const& options, + device_span data, + device_span row_offsets, + rmm::cuda_stream_view stream) { size_t d_size = data.size(); auto const newline = options.skipblanklines ? options.terminator : options.comment; @@ -804,8 +829,8 @@ cudf::detail::host_vector detect_column_types( rmm::cuda_stream_view stream) { // Calculate actual block count to use based on records count - int const block_size = csvparse_block_dim; - int const grid_size = (row_starts.size() + block_size - 1) / block_size; + auto const block_size = csvparse_block_dim; + auto const grid_size = cudf::util::div_rounding_up_safe(row_starts.size(), block_size); auto d_stats = detail::make_zeroed_device_uvector_async( num_active_columns, stream, cudf::get_current_device_resource_ref()); @@ -829,26 +854,26 @@ void decode_row_column_data(cudf::io::parse_options_view const& options, // Calculate actual block count to use based on records count auto const block_size = csvparse_block_dim; auto const num_rows = row_offsets.size() - 1; - auto const grid_size = (num_rows + block_size - 1) / block_size; + auto const grid_size = cudf::util::div_rounding_up_safe(num_rows, block_size); convert_csv_to_cudf<<>>( options, data, column_flags, row_offsets, dtypes, columns, valids, valid_counts); } -uint32_t __host__ gather_row_offsets(parse_options_view const& options, - uint64_t* row_ctx, - device_span const offsets_out, - device_span const data, - size_t chunk_size, - size_t parse_pos, - size_t start_offset, - size_t data_size, - size_t byte_range_start, - size_t byte_range_end, - size_t skip_rows, - rmm::cuda_stream_view stream) +uint32_t gather_row_offsets(parse_options_view const& options, + uint64_t* row_ctx, + device_span const offsets_out, + device_span const data, + size_t chunk_size, + size_t parse_pos, + size_t start_offset, + size_t data_size, + size_t byte_range_start, + size_t byte_range_end, + size_t skip_rows, + rmm::cuda_stream_view stream) { - uint32_t dim_grid = 1 + (chunk_size / rowofs_block_bytes); + uint32_t dim_grid = cudf::util::div_rounding_up_safe(chunk_size, rowofs_block_bytes); gather_row_offsets_gpu<<>>( row_ctx, diff --git a/cpp/src/io/csv/csv_gpu.hpp b/cpp/src/io/csv/csv_gpu.hpp index aa3d9f6c7b7..c2ac2ba2e67 100644 --- a/cpp/src/io/csv/csv_gpu.hpp +++ b/cpp/src/io/csv/csv_gpu.hpp @@ -74,26 +74,6 @@ inline __host__ __device__ rowctx32_t make_row_context(uint32_t row_count, uint3 return (row_count << 2) + out_ctx; } -/** - * @brief pack multiple row contexts together - * - * Pack four rowctx32_t values, where each value represents the output row context - * for one of four possible input contexts when parsing a character block. - * Each output state consists of the 2-bit row context state along with a 18-bit row count - * value (row count is assumed to be a local count that fits in 18-bit) - * The four 20-bit values are concatenated to form a 80-bit value, truncated to 64-bit - * since a block starting in a EOF state can only have a zero row count (and the output - * state corresponding to an EOF input state can only be EOF, so only the first 3 output - * states are included as parameters, and the EOF->EOF state transition is hardcoded) - */ -constexpr __host__ __device__ packed_rowctx_t pack_row_contexts(rowctx32_t ctx0, - rowctx32_t ctx1, - rowctx32_t ctx2) -{ - return (ctx0) | (static_cast(ctx1) << 20) | (static_cast(ctx2) << 40) | - (static_cast(ROW_CTX_EOF) << 60); -} - /** * @brief Unpack a row context (select one of the 4 contexts in packed form) */ @@ -113,8 +93,7 @@ inline __host__ __device__ rowctx32_t get_row_context(packed_rowctx_t packed_ctx * @param packed_ctx row context of character block * @return total_row_count * 4 + output context id */ -inline __host__ __device__ rowctx64_t select_row_context(rowctx64_t sel_ctx, - packed_rowctx_t packed_ctx) +inline rowctx64_t select_row_context(rowctx64_t sel_ctx, packed_rowctx_t packed_ctx) { auto ctxid = static_cast(sel_ctx & 3); rowctx32_t ctx = get_row_context(packed_ctx, ctxid);