Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-24.06' into 15708
Browse files Browse the repository at this point in the history
  • Loading branch information
galipremsagar committed May 14, 2024
2 parents dffae45 + 0f6ce63 commit e881370
Show file tree
Hide file tree
Showing 78 changed files with 2,161 additions and 919 deletions.
5 changes: 5 additions & 0 deletions .devcontainer/cuda11.8-conda/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
"BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-mambaforge-ubuntu22.04"
}
},
"runArgs": [
"--rm",
"--name",
"${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda"
],
"hostRequirements": {"gpu": "optional"},
"features": {
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
Expand Down
5 changes: 5 additions & 0 deletions .devcontainer/cuda11.8-pip/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
"BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ubuntu22.04"
}
},
"runArgs": [
"--rm",
"--name",
"${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip"
],
"hostRequirements": {"gpu": "optional"},
"features": {
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
Expand Down
5 changes: 5 additions & 0 deletions .devcontainer/cuda12.2-conda/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
"BASE": "rapidsai/devcontainers:24.06-cpp-mambaforge-ubuntu22.04"
}
},
"runArgs": [
"--rm",
"--name",
"${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda"
],
"hostRequirements": {"gpu": "optional"},
"features": {
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
Expand Down
5 changes: 5 additions & 0 deletions .devcontainer/cuda12.2-pip/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
"BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ubuntu22.04"
}
},
"runArgs": [
"--rm",
"--name",
"${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip"
],
"hostRequirements": {"gpu": "optional"},
"features": {
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
Expand Down
3 changes: 3 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ repos:
# project can specify its own first/third-party packages.
args: ["--config-root=python/", "--resolve-all-configs"]
files: python/.*
exclude: |
(?x)^(^python/cudf_polars/.*)
types_or: [python, cython, pyi]
- repo: https://github.com/MarcoGorelli/cython-lint
rev: v0.16.2
Expand All @@ -36,6 +38,7 @@ repos:
"python/cudf/cudf",
"python/custreamz/custreamz",
"python/cudf_kafka/cudf_kafka",
"python/cudf_polars/cudf_polars",
"python/dask_cudf/dask_cudf"]
pass_filenames: false
- repo: https://github.com/nbQA-dev/nbQA
Expand Down
2 changes: 1 addition & 1 deletion ci/configure_cpp_static.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ rapids-dependency-file-generator \
python -m pip install -r "${REQUIREMENTS_FILE}"
pyenv rehash

cmake -S cpp -B build_static -GNinja -DBUILD_SHARED_LIBS=OFF -DBUILD_TESTS=OFF
cmake -S cpp -B build_static -GNinja -DBUILD_SHARED_LIBS=OFF -DCUDF_USE_ARROW_STATIC=ON -DBUILD_TESTS=OFF
1 change: 1 addition & 0 deletions ci/release/update-version.sh
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,5 @@ sed_runner "s/cudf-.*-SNAPSHOT/cudf-${NEXT_FULL_JAVA_TAG}/g" java/ci/README.md
# Rewrite version strings in every devcontainer.json under .devcontainer/.
# File names are passed NUL-delimited (-print0 / read -d '') so paths containing
# whitespace are handled as a single entry.
# NOTE(review): sed_runner is defined outside this excerpt; presumably it applies the
# given sed expression to the file in place — confirm against its definition.
find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do
# Base image tag, e.g. rapidsai/devcontainers:<version>-...
sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}"
# Version of the rapids-build-utils devcontainer feature (uses the PEP 440 form of the tag).
sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
# Version embedded in the container name passed via runArgs. The \$ keeps
# ${localWorkspaceFolderBasename} literal so the shell does not expand it.
sed_runner "s@rapids-\${localWorkspaceFolderBasename}-[0-9.]*@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}"
done
3 changes: 3 additions & 0 deletions ci/run_cudf_examples.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,7 @@ compute-sanitizer --tool memcheck custom_optimized names.csv
compute-sanitizer --tool memcheck custom_prealloc names.csv
compute-sanitizer --tool memcheck custom_with_malloc names.csv

compute-sanitizer --tool memcheck parquet_io
compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE

exit ${EXITCODE}
4 changes: 2 additions & 2 deletions cpp/benchmarks/json/json.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

#include <cudf/column/column_factories.hpp>
#include <cudf/json/json.hpp>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.hpp>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -170,7 +170,7 @@ auto build_json_string_column(int desired_bytes, int num_rows)
auto d_store_order = cudf::column_device_view::create(float_2bool_columns->get_column(2));
json_benchmark_row_builder jb{
desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order};
auto [offsets, chars] = cudf::strings::detail::experimental::make_strings_children(
auto [offsets, chars] = cudf::strings::detail::make_strings_children(
jb, num_rows, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
return cudf::make_strings_column(num_rows, std::move(offsets), chars.release(), 0, {});
}
Expand Down
15 changes: 14 additions & 1 deletion cpp/cmake/thirdparty/get_arrow.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,20 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
"
)
endif()

rapids_cmake_install_lib_dir(lib_dir)
# Only applies when a target named `arrow_static` exists.
if(TARGET arrow_static)
get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES)
# The `arrow_static` library leaks a dependency on the object libraries it was built with.
# We need to remove these from the exported interface, since keeping them around would
# cause duplicate symbols and CMake export errors.
#
# The check below looks for the `arrow_array` and `arrow_compute` names in the link
# interface; only when both are present is the interface rewritten.
if(interface_libs MATCHES "arrow_array" AND interface_libs MATCHES "arrow_compute")
# Switch every $<BUILD_INTERFACE:...> entry to $<BUILD_LOCAL_INTERFACE:...>, which CMake
# expands only for consumers in the same buildsystem, keeping the entries out of exports.
string(REPLACE "BUILD_INTERFACE:" "BUILD_LOCAL_INTERFACE:" interface_libs
"${interface_libs}"
)
set_target_properties(arrow_static PROPERTIES INTERFACE_LINK_LIBRARIES "${interface_libs}")
# NOTE(review): this re-reads the property that was just written, so the value should be
# unchanged; it looks redundant unless `interface_libs` is consumed later — confirm.
get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES)
endif()
endif()
rapids_export(
BUILD Arrow
VERSION ${VERSION}
Expand Down
96 changes: 71 additions & 25 deletions cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# libcudf C++ Developer Guide {#DEVELOPER_GUIDE}
# libcudf C++ Developer Guide

This document serves as a guide for contributors to libcudf C++ code. Developers should also refer
to these additional files for further documentation of libcudf best practices.
Expand Down Expand Up @@ -828,7 +828,7 @@ This iterator returns the validity of the underlying element (`true` or `false`)
The proliferation of data types supported by libcudf can result in long compile times. One area
where compile time was a problem is in types used to store indices, which can be any integer type.
The "Indexalator", or index-normalizing iterator (`include/cudf/detail/indexalator.cuh`), can be
The "indexalator", or index-normalizing iterator (`include/cudf/detail/indexalator.cuh`), can be
used for index types (integers) without requiring a type-specific instance. It can be used for any
iterator interface for reading an array of integer values of type `int8`, `int16`, `int32`,
`int64`, `uint8`, `uint16`, `uint32`, or `uint64`. Reading specific elements always returns a
Expand Down Expand Up @@ -856,6 +856,41 @@ thrust::lower_bound(rmm::exec_policy(stream),
thrust::less<Element>());
```
### Offset-normalizing iterators
Like the [indexalator](#index-normalizing-iterators),
the "offsetalator", or offset-normalizing iterator (`include/cudf/detail/offsetalator.cuh`), can be
used for offset column types (`INT32` or `INT64` only) without requiring a type-specific instance.
This is helpful when reading or building [strings columns](#strings-columns).
The normalized type is `int64` which means an `input_offsetalator` will return `int64` type values
for both `INT32` and `INT64` offsets columns.
Likewise, an `output_offsetalator` can accept `int64` type values to store into either an
`INT32` or `INT64` output offsets column created appropriately.
Use the `cudf::detail::offsetalator_factory` to create an appropriate input or output iterator from an offsets column_view.
Example input iterator usage:
```c++
// convert the sizes to offsets
auto [offsets, char_bytes] = cudf::strings::detail::make_offsets_child_column(
output_sizes.begin(), output_sizes.end(), stream, mr);
auto d_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
// use d_offsets to address the output row bytes
```

Example output iterator usage:

```c++
// create offsets column as either INT32 or INT64 depending on the number of bytes
auto offsets_column = cudf::strings::detail::create_offsets_child_column(total_bytes,
offsets_count,
stream, mr);
auto d_offsets =
cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view());
// write appropriate offset values to d_offsets
```
## Namespaces
### External
Expand Down Expand Up @@ -1241,18 +1276,20 @@ This is related to [Arrow's "Variable-Size List" memory layout](https://arrow.ap
Strings are represented as a column with a data device buffer and a child offsets column.
The parent column's type is `STRING` and its data holds all the characters across all the strings packed together
but its size represents the number of strings in the column, and its null mask represents the
validity of each string. To summarize, the strings column children are:
1. A non-nullable column of [`size_type`](#cudfsize_type) elements that indicates the offset to the beginning of each
string in a dense data buffer of all characters.
but its size represents the number of strings in the column and its null mask represents the
validity of each string.
With this representation, `data[offsets[i]]` is the first character of string `i`, and the
size of string `i` is given by `offsets[i+1] - offsets[i]`. The following image shows an example of
this compound column representation of strings.
The strings column contains a single, non-nullable child column
of offset elements that indicates the byte position offset to the beginning of each
string in the dense data buffer of all characters. With this representation, `data[offsets[i]]` is the
first character of string `i`, and the size of string `i` is given by `offsets[i+1] - offsets[i]`.
The following image shows an example of this compound column representation of strings.
![strings](strings.png)
The type of the offsets column is either `INT32` or `INT64` depending on the number of bytes in the data buffer.
See [`cudf::string_view`](#cudfstrings_column_view-and-cudfstring_view) for more information on processing individual string rows.
## Structs columns
A struct is a nested data type with a set of child columns each representing an individual field
Expand Down Expand Up @@ -1295,7 +1332,7 @@ struct column's layout is as follows. (Note that null masks should be read from
}
```
The last struct row (index 3) is not null, but has a null value in the INT32 field. Also, row 2 of
The last struct row (index 3) is not null, but has a null value in the `INT32` field. Also, row 2 of
the struct column is null, making its corresponding fields also null. Therefore, bit 2 is unset in
the null masks of both struct fields.
Expand Down Expand Up @@ -1351,46 +1388,55 @@ libcudf provides view types for nested column types as well as for the data elem
### cudf::strings_column_view and cudf::string_view
`cudf::strings_column_view` is a view of a strings column, like `cudf::column_view` is a view of
any `cudf::column`. `cudf::string_view` is a view of a single string, and therefore
`cudf::string_view` is the data type of a `cudf::column` of type `STRING` just like `int32_t` is the
data type for a `cudf::column` of type [`size_type`](#cudfsize_type). As its name implies, this is a
read-only object instance that points to device memory inside the strings column. It's lifespan is
the same (or less) as the column it views.
A `cudf::strings_column_view` wraps a strings column and contains a parent
`cudf::column_view` as a view of the strings column and an offsets `cudf::column_view`
which is a child of the parent.
The parent view contains the offset, size, and validity mask for the strings column.
The offsets view is non-nullable with `offset()==0` and its own size.
Since the offset column type can be either `INT32` or `INT64` it is useful to use the
offset normalizing iterators [offsetalator](#offset-normalizing-iterators) to access individual offset values.
A `cudf::string_view` is a view of a single string and therefore
is the data type of a `cudf::column` of type `STRING` just like `int32_t` is the
data type for a `cudf::column` of type `INT32`. As its name implies, this is a
read-only object instance that points to device memory inside the strings column.
Its lifespan is the same (or less) as the column it views.
An individual strings column row and a `cudf::string_view` are each limited to [`size_type`](#cudfsize_type) bytes.
Use the `column_device_view::element` method to access an individual row element. Like any other
column, do not call `element()` on a row that is null.
```c++
cudf::column_device_view d_strings;
cudf::strings_column_view scv;
auto d_strings = cudf::column_device_view::create(scv.parent(), stream);
...
if( d_strings.is_valid(row_index) ) {
string_view d_str = d_strings.element<string_view>(row_index);
...
}
```

A null string is not the same as an empty string. Use the `string_scalar` class if you need an
A null string is not the same as an empty string. Use the `cudf::string_scalar` class if you need an
instance of a class object to represent a null string.

The `string_view` contains comparison operators `<,>,==,<=,>=` that can be used in many cudf
functions like `sort` without string-specific code. The data for a `string_view` instance is
The `cudf::string_view` contains comparison operators `<,>,==,<=,>=` that can be used in many cudf
functions like `sort` without string-specific code. The data for a `cudf::string_view` instance is
required to be [UTF-8](#utf-8) and all operators and methods expect this encoding. Unless documented
otherwise, position and length parameters are specified in characters and not bytes. The class also
includes a `string_view::const_iterator` which can be used to navigate through individual characters
includes a `cudf::string_view::const_iterator` which can be used to navigate through individual characters
within the string.

`cudf::type_dispatcher` dispatches to the `string_view` data type when invoked on a `STRING` column.
`cudf::type_dispatcher` dispatches to the `cudf::string_view` data type when invoked on a `STRING` column.

#### UTF-8

The libcudf strings column only supports UTF-8 encoding for strings data.
[UTF-8](https://en.wikipedia.org/wiki/UTF-8) is a variable-length character encoding wherein each
character can be 1-4 bytes. This means the length of a string is not the same as its size in bytes.
For this reason, it is recommended to use the `string_view` class to access these characters for
For this reason, it is recommended to use the `cudf::string_view` class to access these characters for
most operations.

The `string_view.cuh` header also includes some utility methods for reading and writing
The `cudf/strings/detail/utf8.hpp` header also includes some utility methods for reading and writing
(`to_char_utf8/from_char_utf8`) individual UTF-8 characters to/from byte arrays.

### cudf::lists_column_view and cudf::lists_view
Expand Down
1 change: 1 addition & 0 deletions cpp/examples/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,4 @@ build_example() {
build_example basic
build_example strings
build_example nested_types
build_example parquet_io
25 changes: 25 additions & 0 deletions cpp/examples/parquet_io/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

# Build script for the `parquet_io` example: one executable built from parquet_io.cpp and
# linked against cudf::cudf, installed together with a sample Parquet input file.
cmake_minimum_required(VERSION 3.26.4)

# Shared helper from the parent examples directory.
# NOTE(review): presumably this makes the rapids_cuda_* commands below available — confirm.
include(../set_cuda_architecture.cmake)

# initialize cuda architecture
# These calls are deliberately placed before project(), which enables the CUDA language.
rapids_cuda_init_architectures(parquet_io)
rapids_cuda_set_architectures(RAPIDS)

project(
parquet_io
VERSION 0.0.1
LANGUAGES CXX CUDA
)

# Shared helper from the parent examples directory.
# NOTE(review): presumably this provides the cudf::cudf target linked below — confirm.
include(../fetch_dependencies.cmake)

# Configure your project here
add_executable(parquet_io parquet_io.cpp)
target_link_libraries(parquet_io PRIVATE cudf::cudf)
# Require at least C++17 for this target's own sources.
target_compile_features(parquet_io PRIVATE cxx_std_17)

# Install the binary and the sample input file into the same directory.
install(TARGETS parquet_io DESTINATION bin/examples/libcudf)
install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf)
Binary file added cpp/examples/parquet_io/example.parquet
Binary file not shown.
Loading

0 comments on commit e881370

Please sign in to comment.