diff --git a/cpp/include/cudf/reduction/detail/reduction_operators.cuh b/cpp/include/cudf/reduction/detail/reduction_operators.cuh index 4cf8564ab3a..5694362af8f 100644 --- a/cpp/include/cudf/reduction/detail/reduction_operators.cuh +++ b/cpp/include/cudf/reduction/detail/reduction_operators.cuh @@ -31,17 +31,41 @@ namespace detail { // intermediate data structure to compute `var`, `std` template struct var_std { - ResultType value; /// the value - ResultType value_squared; /// the value of squared - - CUDF_HOST_DEVICE inline var_std(ResultType _value = 0, ResultType _value_squared = 0) - : value(_value), value_squared(_value_squared){}; + // Uses the pairwise approach of Chan, Golub, and LeVeque, + // _Algorithms for computing the sample variance: analysis and + // recommendations_ (1983) + // https://doi.org/10.1080/00031305.1983.10483115 + // Also http://www.cs.yale.edu/publications/techreports/tr222.pdf + // This is a modification of Youngs and Cramer's online approach. + ResultType running_sum; + ResultType running_square_deviations; + size_type count; + + CUDF_HOST_DEVICE inline var_std(ResultType t = 0, ResultType s = 0, size_type n = 0) + : running_sum(t), running_square_deviations(s), count(n){}; using this_t = var_std; CUDF_HOST_DEVICE inline this_t operator+(this_t const& rhs) const { - return this_t((this->value + rhs.value), (this->value_squared + rhs.value_squared)); + // Updates as per equations 1.5a and 1.5b in the paper + // T_{1,m+n} = T_{1,m} + T_{m+1,n+1} + // S_{1,m+n} = S_{1,m} + S_{m+1,n+1} + m/(n(m+n)) * (n/m T_{1,m} - T_{m+1,n+1})**2 + // Here the first m samples are in this, the remaining n samples are in rhs. + auto const m = this->count; + auto const n = rhs.count; + // Avoid division by zero. + if (m == 0) { return rhs; } + if (n == 0) { return *this; } + auto const tm = this->running_sum; + auto const tn = rhs.running_sum; + auto const sm = this->running_square_deviations; + auto const sn = rhs.running_square_deviations; + auto const tmn = tm + tn; + auto const diff = ((static_cast(n) / m) * tm) - tn; + // Computing m/n(m+n) as m/n/(m+n) to avoid integer overflow + auto const smn = sm + sn + ((static_cast(m) / n) / (m + n)) * diff * diff; + return {tmn, smn, m + n}; }; }; @@ -50,10 +74,7 @@ template struct transformer_var_std { using OutputType = var_std; - CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value) - { - return OutputType(value, value * value); - }; + CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value) { return {value, 0, 1}; }; }; // ------------------------------------------------------------------------ @@ -257,12 +278,7 @@ struct variance : public compound_op { cudf::size_type const& count, cudf::size_type const& ddof) { - ResultType mean = input.value / count; - ResultType asum = input.value_squared; - cudf::size_type div = count - ddof; - ResultType var = asum / div - ((mean * mean) * count) / div; - - return var; + return input.running_square_deviations / (count - ddof); }; }; }; diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 83c7b663980..d41d137a2c9 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -87,13 +88,25 @@ void validate_token_stream(device_span d_input, { CUDF_FUNC_RANGE(); if (!options.is_strict_validation()) { return; } + + rmm::device_uvector d_invalid = cudf::detail::make_zeroed_device_uvector_async( + tokens.size(), stream, cudf::get_current_device_resource_ref()); + using token_t = cudf::io::json::token_t; - cudf::detail::optional_trie trie_na = - cudf::detail::create_serialized_trie(options.get_na_values(), stream); - auto trie_na_view = cudf::detail::make_trie_view(trie_na); + auto literals = options.get_na_values(); + literals.emplace_back("null"); // added these too to single trie + literals.emplace_back("true"); + literals.emplace_back("false"); + + cudf::detail::optional_trie trie_literals = + cudf::detail::create_serialized_trie(literals, stream); + cudf::detail::optional_trie trie_nonnumeric = cudf::detail::create_serialized_trie( + {"NaN", "Infinity", "+INF", "+Infinity", "-INF", "-Infinity"}, stream); + auto validate_values = cuda::proclaim_return_type( [data = d_input.data(), - trie_na = trie_na_view, + trie_literals = cudf::detail::make_trie_view(trie_literals), + trie_nonnumeric = cudf::detail::make_trie_view(trie_nonnumeric), allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), allow_nonnumeric = options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, @@ -101,24 +114,15 @@ void validate_token_stream(device_span d_input, // This validates an unquoted value. A value must match https://www.json.org/json-en.html // but the leading and training whitespace should already have been removed, and is not // a string - auto c = data[start]; - auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start}); - if (is_null_literal) { - return true; - } else if ('n' == c) { - return substr_eq(data, start, end, 4, "null"); - } else if ('t' == c) { - return substr_eq(data, start, end, 4, "true"); - } else if ('f' == c) { - return substr_eq(data, start, end, 5, "false"); - } else if (allow_nonnumeric && c == 'N') { - return substr_eq(data, start, end, 3, "NaN"); - } else if (allow_nonnumeric && c == 'I') { - return substr_eq(data, start, end, 8, "Infinity"); - } else if (allow_nonnumeric && c == '+') { - return substr_eq(data, start, end, 4, "+INF") || - substr_eq(data, start, end, 9, "+Infinity"); - } else if ('-' == c || c <= '9' && 'c' >= '0') { + auto const is_literal = serialized_trie_contains(trie_literals, {data + start, end - start}); + if (is_literal) { return true; } + if (allow_nonnumeric) { + auto const is_nonnumeric = + serialized_trie_contains(trie_nonnumeric, {data + start, end - start}); + if (is_nonnumeric) { return true; } + } + auto c = data[start]; + if ('-' == c || c <= '9' && 'c' >= '0') { // number auto num_state = number_state::START; for (auto at = start; at < end; at++) { @@ -140,9 +144,6 @@ void validate_token_stream(device_span d_input, num_state = number_state::LEADING_ZERO; } else if (c >= '1' && c <= '9') { num_state = number_state::WHOLE; - } else if (allow_nonnumeric && 'I' == c) { - return substr_eq(data, start, end, 4, "-INF") || - substr_eq(data, start, end, 9, "-Infinity"); } else { return false; } @@ -273,33 +274,44 @@ void validate_token_stream(device_span d_input, auto num_tokens = tokens.size(); auto count_it = thrust::make_counting_iterator(0); - auto predicate = [tokens = tokens.begin(), - token_indices = token_indices.begin(), - validate_values, - validate_strings] __device__(auto i) -> bool { + auto predicate = cuda::proclaim_return_type([tokens = tokens.begin(), + token_indices = token_indices.begin(), + validate_values, + validate_strings] __device__(auto i) -> bool { if (tokens[i] == token_t::ValueEnd) { return !validate_values(token_indices[i - 1], token_indices[i]); } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) { return !validate_strings(token_indices[i - 1], token_indices[i]); } return false; - }; + }); + + auto conditional_invalidout_it = + cudf::detail::make_tabulate_output_iterator(cuda::proclaim_return_type( + [d_invalid = d_invalid.begin()] __device__(size_type i, bool x) -> void { + if (x) { d_invalid[i] = true; } + })); + thrust::transform(rmm::exec_policy_nosync(stream), + count_it, + count_it + num_tokens, + conditional_invalidout_it, + predicate); using scan_type = write_if::scan_type; auto conditional_write = write_if{tokens.begin(), num_tokens}; auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write); - auto transform_op = cuda::proclaim_return_type( - [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type { - if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; - return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd}; - }); - auto binary_op = cuda::proclaim_return_type( + auto binary_op = cuda::proclaim_return_type( [] __device__(scan_type prev, scan_type curr) -> scan_type { auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first); - return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second); + return {(curr.second ? curr.first : op_result), prev.second | curr.second}; + }); + auto transform_op = cuda::proclaim_return_type( + [d_invalid = d_invalid.begin(), tokens = tokens.begin()] __device__(auto i) -> scan_type { + if (d_invalid[i]) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; + return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd}; }); - thrust::transform_inclusive_scan(rmm::exec_policy(stream), + thrust::transform_inclusive_scan(rmm::exec_policy_nosync(stream), count_it, count_it + num_tokens, conditional_output_it, diff --git a/cpp/src/reductions/compound.cuh b/cpp/src/reductions/compound.cuh index 6bc8b48832f..cd9fade164a 100644 --- a/cpp/src/reductions/compound.cuh +++ b/cpp/src/reductions/compound.cuh @@ -18,13 +18,18 @@ #include #include +#include #include +#include #include #include #include #include +#include +#include + namespace cudf { namespace reduction { namespace compound { @@ -53,9 +58,17 @@ std::unique_ptr compound_reduction(column_view const& col, { auto const valid_count = col.size() - col.null_count(); + // All null input produces all null output + if (valid_count == 0 || + // Only care about ddof for standard deviation and variance right now + valid_count <= ddof && (std::is_same_v || + std::is_same_v)) { + auto result = cudf::make_fixed_width_scalar(output_dtype, stream, mr); + result->set_valid_async(false, stream); + return result; + } // reduction by iterator auto dcol = cudf::column_device_view::create(col, stream); - std::unique_ptr result; Op compound_op{}; if (!cudf::is_dictionary(col.type())) { @@ -63,25 +76,21 @@ std::unique_ptr compound_reduction(column_view const& col, auto it = thrust::make_transform_iterator( dcol->pair_begin(), compound_op.template get_null_replacing_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } else { auto it = thrust::make_transform_iterator( dcol->begin(), compound_op.template get_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } } else { auto it = thrust::make_transform_iterator( cudf::dictionary::detail::make_dictionary_pair_iterator(*dcol, col.has_nulls()), compound_op.template get_null_replacing_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } - - // set scalar is valid - result->set_valid_async(col.null_count() < col.size(), stream); - return result; }; // @brief result type dispatcher for compound reduction (a.k.a. mean, var, std) @@ -137,6 +146,7 @@ struct element_type_dispatcher { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_EXPECTS(ddof >= 0, "ddof must be non-negative", std::domain_error); return cudf::type_dispatcher( output_dtype, result_type_dispatcher(), col, output_dtype, ddof, stream, mr); } diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 1e9e13ded93..bdb98372836 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -33,8 +33,12 @@ #include #include +#include #include +#include +#include +#include #include using aggregation = cudf::aggregation; @@ -765,6 +769,25 @@ TYPED_TEST(MultiStepReductionTest, Mean) expected_value_nulls); } +template +double calc_var(std::vector const& v, int ddof, std::vector const& mask = {}) +{ + auto const values = [&]() { + if (mask.empty()) { return v; } + std::vector masked{}; + thrust::copy_if( + v.begin(), v.end(), mask.begin(), std::back_inserter(masked), [](auto m) { return m; }); + return masked; + }(); + auto const valid_count = values.size(); + double const mean = std::accumulate(values.cbegin(), values.cend(), double{0}) / valid_count; + double const sq_sum_of_differences = + std::accumulate(values.cbegin(), values.cend(), double{0}, [mean](double acc, auto const v) { + return acc + std::pow(v - mean, 2); + }); + return sq_sum_of_differences / (valid_count - ddof); +} + // This test is disabled for only a Debug build because a compiler error // documented in cpp/src/reductions/std.cu and cpp/src/reductions/var.cu #ifdef NDEBUG @@ -777,25 +800,12 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std) std::vector int_values({-3, 2, 1, 0, 5, -3, -2, 28}); std::vector host_bools({true, true, false, true, true, true, false, true}); - auto calc_var = [](std::vector& v, cudf::size_type valid_count, int ddof) { - double mean = std::accumulate(v.begin(), v.end(), double{0}); - mean /= valid_count; - - double sum_of_sq = std::accumulate( - v.begin(), v.end(), double{0}, [](double acc, TypeParam i) { return acc + i * i; }); - - cudf::size_type div = valid_count - ddof; - - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls std::vector v = convert_values(int_values); cudf::test::fixed_width_column_wrapper col(v.begin(), v.end()); auto const ddof = 1; - double var = calc_var(v, v.size(), ddof); + double var = calc_var(v, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -811,23 +821,19 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std) // test with nulls cudf::test::fixed_width_column_wrapper col_nulls = construct_null_column(v, host_bools); - cudf::size_type valid_count = - cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count(); - auto replaced_array = replace_nulls(v, host_bools, T{0}); - - double var_nulls = calc_var(replaced_array, valid_count, ddof); - double std_nulls = std::sqrt(var_nulls); + double var_nulls = calc_var(v, ddof, host_bools); + double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - std_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + var_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + std_nulls); } // ---------------------------------------------------------------------------- @@ -1139,23 +1145,10 @@ TEST_P(ReductionParamTest, DISABLED_std_var) std::vector int_values({-3, 2, 1, 0, 5, -3, -2, 28}); std::vector host_bools({true, true, false, true, true, true, false, true}); - auto calc_var = [ddof](std::vector& v, cudf::size_type valid_count) { - double mean = std::accumulate(v.begin(), v.end(), double{0}); - mean /= valid_count; - - double sum_of_sq = std::accumulate( - v.begin(), v.end(), double{0}, [](double acc, double i) { return acc + i * i; }); - - cudf::size_type div = valid_count - ddof; - - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls cudf::test::fixed_width_column_wrapper col(int_values.begin(), int_values.end()); - double var = calc_var(int_values, int_values.size()); + double var = calc_var(int_values, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -1172,23 +1165,19 @@ TEST_P(ReductionParamTest, DISABLED_std_var) // test with nulls cudf::test::fixed_width_column_wrapper col_nulls = construct_null_column(int_values, host_bools); - cudf::size_type valid_count = - cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count(); - auto replaced_array = replace_nulls(int_values, host_bools, int{0}); - - double var_nulls = calc_var(replaced_array, valid_count); + double var_nulls = calc_var(int_values, ddof, host_bools); double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - std_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + var_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + std_nulls); } //------------------------------------------------------------------- @@ -2471,21 +2460,11 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd) std::vector v = convert_values(int_values); cudf::data_type output_type{cudf::type_to_id()}; - auto calc_var = [](std::vector const& v, cudf::size_type valid_count, cudf::size_type ddof) { - double mean = std::accumulate(v.cbegin(), v.cend(), double{0}); - mean /= valid_count; - double sum_of_sq = std::accumulate( - v.cbegin(), v.cend(), double{0}, [](double acc, TypeParam i) { return acc + i * i; }); - auto const div = valid_count - ddof; - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls cudf::test::dictionary_column_wrapper col(v.begin(), v.end()); cudf::size_type const ddof = 1; - double var = calc_var(v, v.size(), ddof); + double var = calc_var(v, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -2497,15 +2476,13 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd) std::vector validity({true, true, false, true, true, true, false, true}); cudf::test::dictionary_column_wrapper col_nulls(v.begin(), v.end(), validity.begin()); - cudf::size_type const valid_count = std::count(validity.begin(), validity.end(), true); - - double var_nulls = calc_var(replace_nulls(v, validity, T{0}), valid_count, ddof); + double var_nulls = calc_var(v, ddof, validity); double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this->template reduction_test(col_nulls, *var_agg, output_type).first, - var_nulls); - EXPECT_EQ(this->template reduction_test(col_nulls, *std_agg, output_type).first, - std_nulls); + EXPECT_DOUBLE_EQ(this->template reduction_test(col_nulls, *var_agg, output_type).first, + var_nulls); + EXPECT_DOUBLE_EQ(this->template reduction_test(col_nulls, *std_agg, output_type).first, + std_nulls); } TYPED_TEST(DictionaryReductionTest, NthElement) diff --git a/docs/cudf/source/developer_guide/testing.md b/docs/cudf/source/developer_guide/testing.md index f12f809d5db..22cc1b5b8de 100644 --- a/docs/cudf/source/developer_guide/testing.md +++ b/docs/cudf/source/developer_guide/testing.md @@ -7,6 +7,23 @@ specifically the [`pytest-cov`](https://github.com/pytest-dev/pytest-cov) plugin Code coverage reports are uploaded to [Codecov](https://app.codecov.io/gh/rapidsai/cudf). Each PR also indicates whether it increases or decreases test coverage. +### Configuring pytest + +Pytest will accept configuration in [multiple different +files](https://docs.pytest.org/en/stable/reference/customize.html), +with a specified discovery and precedence order. Note in particular +that there is no automatic "include" mechanism, as soon as a matching +configuration file is found, discovery stops. + +For preference, so that all tool configuration lives in the same +place, we use `pyproject.toml`-based configuration. Test configuration +for a given package should live in that package's `pyproject.toml` +file. + +Where tests do not naturally belong to a project, for example the +`cudf.pandas` integration tests and the cuDF benchmarks, use a +`pytest.ini` file as close to the tests as possible. + ## Test organization How tests are organized depends on which of the following two groups they fall into: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst new file mode 100644 index 00000000000..d68199271bd --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst @@ -0,0 +1,6 @@ +=============== +generate_ngrams +=============== + +.. automodule:: pylibcudf.nvtext.generate_ngrams + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst index b5cd5ee42c3..2e03b589c8b 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -5,3 +5,4 @@ nvtext :maxdepth: 1 edit_distance + generate_ngrams diff --git a/java/src/test/java/ai/rapids/cudf/ReductionTest.java b/java/src/test/java/ai/rapids/cudf/ReductionTest.java index 8cc7df1ce7f..6bd6603d71b 100644 --- a/java/src/test/java/ai/rapids/cudf/ReductionTest.java +++ b/java/src/test/java/ai/rapids/cudf/ReductionTest.java @@ -612,13 +612,13 @@ void testWithSetOutputType() { assertEquals(expected, result); } - try (Scalar expected = Scalar.fromFloat(1.666667f); + try (Scalar expected = Scalar.fromFloat(1.6666666f); ColumnVector cv = ColumnVector.fromBytes(new byte[]{1, 2, 3, 4}); Scalar result = cv.variance(DType.FLOAT32)) { assertEquals(expected, result); } - try (Scalar expected = Scalar.fromFloat(1.2909945f); + try (Scalar expected = Scalar.fromFloat(1.2909944f); ColumnVector cv = ColumnVector.fromBytes(new byte[]{1, 2, 3, 4}); Scalar result = cv.standardDeviation(DType.FLOAT32)) { assertEquals(expected, result); diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx index 6591b527eec..7fdf9258b7f 100644 --- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx +++ b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx @@ -2,75 +2,34 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( - generate_character_ngrams as cpp_generate_character_ngrams, - generate_ngrams as cpp_generate_ngrams, - hash_character_ngrams as cpp_hash_character_ngrams, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf import nvtext @acquire_spill_lock() def generate_ngrams(Column strings, int ngrams, object py_separator): - - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef const string_scalar* c_separator = separator\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_ngrams( - c_strings, - c_ngrams, - c_separator[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.generate_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams, + py_separator.device_value.c_value + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def generate_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.generate_character_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def hash_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_hash_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.hash_character_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 3b8dd05c13a..f6ab91f2f01 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -180,9 +180,12 @@ def var( min_count: int = 0, ddof=1, ): - return self._reduce( + result = self._reduce( "var", skipna=skipna, min_count=min_count, ddof=ddof ) + if result is NA: + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + return result def std( self, @@ -190,9 +193,12 @@ def std( min_count: int = 0, ddof=1, ): - return self._reduce( + result = self._reduce( "std", skipna=skipna, min_count=min_count, ddof=ddof ) + if result is NA: + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + return result def median(self, skipna: bool | None = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index acd97c2047c..41ee94b72c8 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2943,7 +2943,7 @@ def corr(self, other, method="pearson", min_periods=None): >>> ser1 = cudf.Series([0.9, 0.13, 0.62]) >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) >>> ser1.corr(ser2, method="pearson") - -0.20454263717316112 + -0.20454263717316126 >>> ser1.corr(ser2, method="spearman") -0.5 """ diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini deleted file mode 100644 index 496a322ff80..00000000000 --- a/python/cudf/cudf/tests/pytest.ini +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. - -[pytest] -markers = - spilling: mark benchmark a good candidate to run with `CUDF_SPILL=ON` -xfail_strict = true -filterwarnings = - error - ignore:::.*xdist.* - ignore:::.*pytest.* - # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow() - ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore - # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+ - ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning - # PerformanceWarning from cupy warming up the JIT cache - ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning - # Ignore numba PEP 456 warning specific to arm machines - ignore:FNV hashing is not implemented in Numba.*:UserWarning -addopts = --tb=native diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 4a2345fc009..976b12a9ab5 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2525,23 +2525,7 @@ def test_dti_asi8(): @pytest.mark.parametrize( "method, kwargs", - [ - ["mean", {}], - pytest.param( - "std", - {}, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/16444" - ), - ), - pytest.param( - "std", - {"ddof": 0}, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/16444" - ), - ), - ], + [["mean", {}], ["std", {}], ["std", {"ddof": 0}]], ) def test_dti_reduction(method, kwargs): pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") diff --git a/python/cudf/cudf_pandas_tests/pytest.ini b/python/cudf/cudf_pandas_tests/pytest.ini new file mode 100644 index 00000000000..46e2448ea24 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/pytest.ini @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +# Note, this config file overrides the default "cudf" test config in +# ../pyproject.toml We do so deliberately because we have different +# treatment of markers and warnings +[pytest] +addopts = --tb=native --strict-config --strict-markers +empty_parameter_set_mark = fail_at_collect +xfail_strict = true diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 2bbed40e34e..a74b7148c00 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import collections +import contextlib import copy import datetime import operator @@ -21,10 +22,15 @@ import pyarrow as pa import pytest from nbconvert.preprocessors import ExecutePreprocessor -from numba import NumbaDeprecationWarning, vectorize +from numba import ( + NumbaDeprecationWarning, + __version__ as numba_version, + vectorize, +) +from packaging import version from pytz import utc -from cudf.core._compat import PANDAS_GE_220 +from cudf.core._compat import PANDAS_GE_210, PANDAS_GE_220, PANDAS_VERSION from cudf.pandas import LOADED, Profiler from cudf.pandas.fast_slow_proxy import ( ProxyFallbackError, @@ -52,8 +58,6 @@ get_calendar, ) -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION - # Accelerated pandas has the real pandas and cudf modules as attributes pd = xpd._fsproxy_slow cudf = xpd._fsproxy_fast @@ -622,10 +626,6 @@ def test_array_function_series_fallback(series): tm.assert_equal(expect, got) -@pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) def test_timedeltaproperties(series): psr, sr = series psr, sr = psr.astype("timedelta64[ns]"), sr.astype("timedelta64[ns]") @@ -685,10 +685,6 @@ def test_maintain_container_subclasses(multiindex): assert isinstance(got, xpd.core.indexes.frozen.FrozenList) -@pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas due to unsupported boxcar window type", -) def test_rolling_win_type(): pdf = pd.DataFrame(range(5)) df = xpd.DataFrame(range(5)) @@ -697,8 +693,14 @@ def test_rolling_win_type(): tm.assert_equal(result, expected) -@pytest.mark.skip( - reason="Requires Numba 0.59 to fix segfaults on ARM. See https://github.com/numba/llvmlite/pull/1009" +@pytest.mark.skipif( + version.parse(numba_version) < version.parse("0.59"), + reason="Requires Numba 0.59 to fix segfaults on ARM. See https://github.com/numba/llvmlite/pull/1009", +) +@pytest.mark.xfail( + version.parse(numba_version) >= version.parse("0.59") + and PANDAS_VERSION < version.parse("2.1"), + reason="numba.generated_jit removed in 0.59, requires pandas >= 2.1", ) def test_rolling_apply_numba_engine(): def weighted_mean(x): @@ -709,7 +711,12 @@ def weighted_mean(x): pdf = pd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]]) df = xpd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]]) - with pytest.warns(NumbaDeprecationWarning): + ctx = ( + contextlib.nullcontext() + if PANDAS_GE_210 + else pytest.warns(NumbaDeprecationWarning) + ) + with ctx: expect = pdf.rolling(2, method="table", min_periods=0).apply( weighted_mean, raw=True, engine="numba" ) @@ -1305,7 +1312,7 @@ def max_times_two(self): @pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION < version.parse("2.1"), reason="DatetimeArray.__floordiv__ missing in pandas-2.0.0", ) def test_floordiv_array_vs_df(): @@ -1580,7 +1587,7 @@ def test_numpy_cupy_flatiter(series): @pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION < version.parse("2.1"), reason="pyarrow_numpy storage type was not supported in pandas-2.0.0", ) def test_arrow_string_arrays(): diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 1b730ffd13c..c0776fd0de6 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -124,6 +124,27 @@ skip = [ "__init__.py", ] +[tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" +filterwarnings = [ + "error", + "ignore:::.*xdist.*", + "ignore:::.*pytest.*", + # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow() + "ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore", + # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+ + "ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning", + # PerformanceWarning from cupy warming up the JIT cache + "ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning", + # Ignore numba PEP 456 warning specific to arm machines + "ignore:FNV hashing is not implemented in Numba.*:UserWarning" +] +markers = [ + "spilling: mark benchmark a good candidate to run with `CUDF_SPILL=ON`" +] +xfail_strict = true + [tool.rapids-build-backend] build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" diff --git a/python/cudf_kafka/cudf_kafka/tests/pytest.ini b/python/cudf_kafka/cudf_kafka/tests/pytest.ini deleted file mode 100644 index 7b0a9f29fb1..00000000000 --- a/python/cudf_kafka/cudf_kafka/tests/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index a1a3ec37842..87e19a2bccf 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -79,9 +79,12 @@ skip = [ ] [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error" ] +xfail_strict = true [tool.scikit-build] build-dir = "build/{wheel_tag}" diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py index 06bb08953f1..3b1eff4a0d0 100644 --- a/python/cudf_polars/cudf_polars/containers/__init__.py +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -5,7 +5,7 @@ from __future__ import annotations -__all__: list[str] = ["DataFrame", "Column", "NamedColumn"] +__all__: list[str] = ["DataFrame", "Column"] -from cudf_polars.containers.column import Column, NamedColumn +from cudf_polars.containers.column import Column from cudf_polars.containers.dataframe import DataFrame diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 3fe3e5557cb..00186098e54 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -15,7 +15,7 @@ import polars as pl -__all__: list[str] = ["Column", "NamedColumn"] +__all__: list[str] = ["Column"] class Column: @@ -26,6 +26,9 @@ class Column: order: plc.types.Order null_order: plc.types.NullOrder is_scalar: bool + # Optional name, only ever set by evaluation of NamedExpr nodes + # The internal evaluation should not care about the name. + name: str | None def __init__( self, @@ -34,14 +37,12 @@ def __init__( is_sorted: plc.types.Sorted = plc.types.Sorted.NO, order: plc.types.Order = plc.types.Order.ASCENDING, null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, + name: str | None = None, ): self.obj = column self.is_scalar = self.obj.size() == 1 - if self.obj.size() <= 1: - is_sorted = plc.types.Sorted.YES - self.is_sorted = is_sorted - self.order = order - self.null_order = null_order + self.name = name + self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order) @functools.cached_property def obj_scalar(self) -> plc.Scalar: @@ -63,9 +64,26 @@ def obj_scalar(self) -> plc.Scalar: ) return plc.copying.get_element(self.obj, 0) + def rename(self, name: str | None, /) -> Self: + """ + Return a shallow copy with a new name. + + Parameters + ---------- + name + New name + + Returns + ------- + Shallow copy of self with new name set. + """ + new = self.copy() + new.name = name + return new + def sorted_like(self, like: Column, /) -> Self: """ - Copy sortedness properties from a column onto self. + Return a shallow copy with sortedness from like. Parameters ---------- @@ -74,20 +92,23 @@ def sorted_like(self, like: Column, /) -> Self: Returns ------- - Self with metadata set. + Shallow copy of self with metadata set. See Also -------- set_sorted, copy_metadata """ - return self.set_sorted( - is_sorted=like.is_sorted, order=like.order, null_order=like.null_order + return type(self)( + self.obj, + name=self.name, + is_sorted=like.is_sorted, + order=like.order, + null_order=like.null_order, ) - # TODO: Return Column once #16272 is fixed. - def astype(self, dtype: plc.DataType) -> plc.Column: + def astype(self, dtype: plc.DataType) -> Column: """ - Return the backing column as the requested dtype. + Cast the column to as the requested dtype. Parameters ---------- @@ -109,8 +130,10 @@ def astype(self, dtype: plc.DataType) -> plc.Column: the current one. """ if self.obj.type() != dtype: - return plc.unary.cast(self.obj, dtype) - return self.obj + return Column(plc.unary.cast(self.obj, dtype), name=self.name).sorted_like( + self + ) + return self def copy_metadata(self, from_: pl.Series, /) -> Self: """ @@ -129,6 +152,7 @@ def copy_metadata(self, from_: pl.Series, /) -> Self: -------- set_sorted, sorted_like """ + self.name = from_.name if len(from_) <= 1: return self ascending = from_.flags["SORTED_ASC"] @@ -192,6 +216,7 @@ def copy(self) -> Self: is_sorted=self.is_sorted, order=self.order, null_order=self.null_order, + name=self.name, ) def mask_nans(self) -> Self: @@ -217,58 +242,3 @@ def nan_count(self) -> int: ) ).as_py() return 0 - - -class NamedColumn(Column): - """A column with a name.""" - - name: str - - def __init__( - self, - column: plc.Column, - name: str, - *, - is_sorted: plc.types.Sorted = plc.types.Sorted.NO, - order: plc.types.Order = plc.types.Order.ASCENDING, - null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, - ) -> None: - super().__init__( - column, is_sorted=is_sorted, order=order, null_order=null_order - ) - self.name = name - - def copy(self, *, new_name: str | None = None) -> Self: - """ - A shallow copy of the column. - - Parameters - ---------- - new_name - Optional new name for the copied column. - - Returns - ------- - New column sharing data with self. - """ - return type(self)( - self.obj, - self.name if new_name is None else new_name, - is_sorted=self.is_sorted, - order=self.order, - null_order=self.null_order, - ) - - def mask_nans(self) -> Self: - """Return a shallow copy of self with nans masked out.""" - # Annoying, the inheritance is not right (can't call the - # super-type mask_nans), but will sort that by refactoring - # later. - if plc.traits.is_floating_point(self.obj.type()): - old_count = self.obj.null_count() - mask, new_count = plc.transform.nans_to_nulls(self.obj) - result = type(self)(self.obj.with_mask(mask, new_count), self.name) - if old_count == new_count: - return result.sorted_like(self) - return result - return self.copy() diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index f3e3862d0cc..2c195f6637c 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -5,43 +5,50 @@ from __future__ import annotations -import itertools from functools import cached_property -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import pyarrow as pa import pylibcudf as plc import polars as pl -from cudf_polars.containers.column import NamedColumn +from cudf_polars.containers import Column from cudf_polars.utils import dtypes if TYPE_CHECKING: - from collections.abc import Mapping, Sequence, Set + from collections.abc import Iterable, Mapping, Sequence, Set from typing_extensions import Self - from cudf_polars.containers import Column - __all__: list[str] = ["DataFrame"] +# Pacify the type checker. DataFrame init asserts that all the columns +# have a string name, so let's narrow the type. +class NamedColumn(Column): + name: str + + class DataFrame: """A representation of a dataframe.""" - columns: list[NamedColumn] + column_map: dict[str, Column] table: plc.Table + columns: list[NamedColumn] - def __init__(self, columns: Sequence[NamedColumn]) -> None: - self.columns = list(columns) - self._column_map = {c.name: c for c in self.columns} - self.table = plc.Table([c.obj for c in columns]) + def __init__(self, columns: Iterable[Column]) -> None: + columns = list(columns) + if any(c.name is None for c in columns): + raise ValueError("All columns must have a name") + self.columns = [cast(NamedColumn, c) for c in columns] + self.column_map = {c.name: c for c in self.columns} + self.table = plc.Table([c.obj for c in self.columns]) def copy(self) -> Self: """Return a shallow copy of self.""" - return type(self)([c.copy() for c in self.columns]) + return type(self)(c.copy() for c in self.columns) def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" @@ -51,42 +58,38 @@ def to_polars(self) -> pl.DataFrame: # https://github.com/pola-rs/polars/issues/11632 # To guarantee we produce correct names, we therefore # serialise with names we control and rename with that map. - name_map = {f"column_{i}": c.name for i, c in enumerate(self.columns)} + name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)} table: pa.Table = plc.interop.to_arrow( self.table, [plc.interop.ColumnMetadata(name=name) for name in name_map], ) df: pl.DataFrame = pl.from_arrow(table) return df.rename(name_map).with_columns( - *( - pl.col(c.name).set_sorted( - descending=c.order == plc.types.Order.DESCENDING - ) - if c.is_sorted - else pl.col(c.name) - for c in self.columns - ) + pl.col(c.name).set_sorted(descending=c.order == plc.types.Order.DESCENDING) + if c.is_sorted + else pl.col(c.name) + for c in self.columns ) @cached_property def column_names_set(self) -> frozenset[str]: """Return the column names as a set.""" - return frozenset(c.name for c in self.columns) + return frozenset(self.column_map) @cached_property def column_names(self) -> list[str]: """Return a list of the column names.""" - return [c.name for c in self.columns] + return list(self.column_map) @cached_property def num_columns(self) -> int: """Number of columns.""" - return len(self.columns) + return len(self.column_map) @cached_property def num_rows(self) -> int: """Number of rows.""" - return 0 if len(self.columns) == 0 else self.table.num_rows() + return self.table.num_rows() if self.column_map else 0 @classmethod def from_polars(cls, df: pl.DataFrame) -> Self: @@ -111,12 +114,8 @@ def from_polars(cls, df: pl.DataFrame) -> Self: # No-op if the schema is unchanged. d_table = plc.interop.from_arrow(table.cast(schema)) return cls( - [ - NamedColumn(column, h_col.name).copy_metadata(h_col) - for column, h_col in zip( - d_table.columns(), df.iter_columns(), strict=True - ) - ] + Column(column).copy_metadata(h_col) + for column, h_col in zip(d_table.columns(), df.iter_columns(), strict=True) ) @classmethod @@ -144,17 +143,14 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: if table.num_columns() != len(names): raise ValueError("Mismatching name and table length.") return cls( - [ - NamedColumn(c, name) - for c, name in zip(table.columns(), names, strict=True) - ] + Column(c, name=name) for c, name in zip(table.columns(), names, strict=True) ) def sorted_like( self, like: DataFrame, /, *, subset: Set[str] | None = None ) -> Self: """ - Copy sortedness from a dataframe onto self. + Return a shallow copy with sortedness copied from like. Parameters ---------- @@ -165,7 +161,7 @@ def sorted_like( Returns ------- - Self with metadata set. + Shallow copy of self with metadata set. Raises ------ @@ -175,13 +171,12 @@ def sorted_like( if like.column_names != self.column_names: raise ValueError("Can only copy from identically named frame") subset = self.column_names_set if subset is None else subset - self.columns = [ + return type(self)( c.sorted_like(other) if c.name in subset else c for c, other in zip(self.columns, like.columns, strict=True) - ] - return self + ) - def with_columns(self, columns: Sequence[NamedColumn]) -> Self: + def with_columns(self, columns: Iterable[Column], *, replace_only=False) -> Self: """ Return a new dataframe with extra columns. @@ -189,6 +184,8 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: ---------- columns Columns to add + replace_only + If true, then only replacements are allowed (matching by name). Returns ------- @@ -196,36 +193,30 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: Notes ----- - If column names overlap, newer names replace older ones. + If column names overlap, newer names replace older ones, and + appear in the same order as the original frame. """ - columns = list( - {c.name: c for c in itertools.chain(self.columns, columns)}.values() - ) - return type(self)(columns) + new = {c.name: c for c in columns} + if replace_only and not self.column_names_set.issuperset(new.keys()): + raise ValueError("Cannot replace with non-existing names") + return type(self)((self.column_map | new).values()) def discard_columns(self, names: Set[str]) -> Self: """Drop columns by name.""" - return type(self)([c for c in self.columns if c.name not in names]) + return type(self)(column for column in self.columns if column.name not in names) def select(self, names: Sequence[str]) -> Self: """Select columns by name returning DataFrame.""" - want = set(names) - if not want.issubset(self.column_names_set): - raise ValueError("Can't select missing names") - return type(self)([self._column_map[name] for name in names]) - - def replace_columns(self, *columns: NamedColumn) -> Self: - """Return a new dataframe with columns replaced by name.""" - new = {c.name: c for c in columns} - if not set(new).issubset(self.column_names_set): - raise ValueError("Cannot replace with non-existing names") - return type(self)([new.get(c.name, c) for c in self.columns]) + try: + return type(self)(self.column_map[name] for name in names) + except KeyError as e: + raise ValueError("Can't select missing names") from e def rename_columns(self, mapping: Mapping[str, str]) -> Self: """Rename some columns.""" - return type(self)([c.copy(new_name=mapping.get(c.name)) for c in self.columns]) + return type(self)(c.rename(mapping.get(c.name, c.name)) for c in self.columns) - def select_columns(self, names: Set[str]) -> list[NamedColumn]: + def select_columns(self, names: Set[str]) -> list[Column]: """Select columns by name.""" return [c for c in self.columns if c.name in names] diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index a418560b31c..f7775ceb238 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -27,7 +27,7 @@ from polars.exceptions import InvalidOperationError from polars.polars import _expr_nodes as pl_expr -from cudf_polars.containers import Column, NamedColumn +from cudf_polars.containers import Column from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: @@ -313,7 +313,7 @@ def evaluate( *, context: ExecutionContext = ExecutionContext.FRAME, mapping: Mapping[Expr, Column] | None = None, - ) -> NamedColumn: + ) -> Column: """ Evaluate this expression given a dataframe for context. @@ -328,20 +328,15 @@ def evaluate( Returns ------- - NamedColumn attaching a name to an evaluated Column + Evaluated Column with name attached. See Also -------- :meth:`Expr.evaluate` for details, this function just adds the name to a column produced from an expression. """ - obj = self.value.evaluate(df, context=context, mapping=mapping) - return NamedColumn( - obj.obj, - self.name, - is_sorted=obj.is_sorted, - order=obj.order, - null_order=obj.null_order, + return self.value.evaluate(df, context=context, mapping=mapping).rename( + self.name ) def collect_agg(self, *, depth: int) -> AggInfo: @@ -428,7 +423,9 @@ def do_evaluate( mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - return df._column_map[self.name] + # Deliberately remove the name here so that we guarantee + # evaluation of the IR produces names. + return df.column_map[self.name].rename(None) def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 1c61075be22..e319c363a23 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -26,7 +26,7 @@ import polars as pl import cudf_polars.dsl.expr as expr -from cudf_polars.containers import DataFrame, NamedColumn +from cudf_polars.containers import Column, DataFrame from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: @@ -57,9 +57,7 @@ ] -def broadcast( - *columns: NamedColumn, target_length: int | None = None -) -> list[NamedColumn]: +def broadcast(*columns: Column, target_length: int | None = None) -> list[Column]: """ Broadcast a sequence of columns to a common length. @@ -112,12 +110,12 @@ def broadcast( return [ column if column.obj.size() != 1 - else NamedColumn( + else Column( plc.Column.from_scalar(column.obj_scalar, nrows), - column.name, is_sorted=plc.types.Sorted.YES, order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.BEFORE, + name=column.name, ) for column in columns ] @@ -385,15 +383,17 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: init = plc.interop.from_arrow( pa.scalar(offset, type=plc.interop.to_arrow(dtype)) ) - index = NamedColumn( + index = Column( plc.filling.sequence(df.num_rows, init, step), - name, is_sorted=plc.types.Sorted.YES, order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.AFTER, + name=name, ) df = DataFrame([index, *df.columns]) - assert all(c.obj.type() == self.schema[c.name] for c in df.columns) + assert all( + c.obj.type() == self.schema[name] for name, c in df.column_map.items() + ) if self.predicate is None: return df else: @@ -588,15 +588,14 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: requests.append(plc.groupby.GroupByRequest(col, [req])) replacements.append(rep) group_keys, raw_tables = grouper.aggregate(requests) - # TODO: names - raw_columns: list[NamedColumn] = [] + raw_columns: list[Column] = [] for i, table in enumerate(raw_tables): (column,) = table.columns() - raw_columns.append(NamedColumn(column, f"tmp{i}")) + raw_columns.append(Column(column, name=f"tmp{i}")) mapping = dict(zip(replacements, raw_columns, strict=True)) result_keys = [ - NamedColumn(gk, k.name) - for gk, k in zip(group_keys.columns(), keys, strict=True) + Column(grouped_key, name=key.name) + for key, grouped_key in zip(keys, group_keys.columns(), strict=True) ] result_subs = DataFrame(raw_columns) results = [ @@ -639,8 +638,8 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) broadcasted = [ - NamedColumn(reordered, b.name) - for reordered, b in zip( + Column(reordered, name=old.name) + for reordered, old in zip( ordered_table.columns(), broadcasted, strict=True ) ] @@ -787,20 +786,20 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: # result, not the gather maps columns = plc.join.cross_join(left.table, right.table).columns() left_cols = [ - NamedColumn(new, old.name).sorted_like(old) + Column(new, name=old.name).sorted_like(old) for new, old in zip( columns[: left.num_columns], left.columns, strict=True ) ] right_cols = [ - NamedColumn( + Column( new, - old.name - if old.name not in left.column_names_set - else f"{old.name}{suffix}", + name=name + if name not in left.column_names_set + else f"{name}{suffix}", ) - for new, old in zip( - columns[left.num_columns :], right.columns, strict=True + for new, name in zip( + columns[left.num_columns :], right.column_names, strict=True ) ] return DataFrame([*left_cols, *right_cols]) @@ -838,18 +837,19 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.copying.gather(right.table, rg, right_policy), right.column_names ) if coalesce and how != "inner": - left = left.replace_columns( - *( - NamedColumn( + left = left.with_columns( + ( + Column( plc.replace.replace_nulls(left_col.obj, right_col.obj), - left_col.name, + name=left_col.name, ) for left_col, right_col in zip( left.select_columns(left_on.column_names_set), right.select_columns(right_on.column_names_set), strict=True, ) - ) + ), + replace_only=True, ) right = right.discard_columns(right_on.column_names_set) if how == "right": @@ -931,9 +931,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: df = self.df.evaluate(cache=cache) if self.subset is None: indices = list(range(df.num_columns)) + keys_sorted = all(c.is_sorted for c in df.column_map.values()) else: indices = [i for i, k in enumerate(df.column_names) if k in self.subset] - keys_sorted = all(df.columns[i].is_sorted for i in indices) + keys_sorted = all(df.column_map[name].is_sorted for name in self.subset) if keys_sorted: table = plc.stream_compaction.unique( df.table, @@ -954,10 +955,11 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.types.NullEquality.EQUAL, plc.types.NanEquality.ALL_EQUAL, ) + # TODO: Is this sortedness setting correct result = DataFrame( [ - NamedColumn(c, old.name).sorted_like(old) - for c, old in zip(table.columns(), df.columns, strict=True) + Column(new, name=old.name).sorted_like(old) + for new, old in zip(table.columns(), df.columns, strict=True) ] ) if keys_sorted or self.stable: @@ -1008,30 +1010,30 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: sort_keys = broadcast( *(k.evaluate(df) for k in self.by), target_length=df.num_rows ) - names = {c.name: i for i, c in enumerate(df.columns)} # TODO: More robust identification here. - keys_in_result = [ - i - for k in sort_keys - if (i := names.get(k.name)) is not None and k.obj is df.columns[i].obj - ] + keys_in_result = { + k.name: i + for i, k in enumerate(sort_keys) + if k.name in df.column_map and k.obj is df.column_map[k.name].obj + } table = self.do_sort( df.table, plc.Table([k.obj for k in sort_keys]), self.order, self.null_order, ) - columns = [ - NamedColumn(c, old.name) - for c, old in zip(table.columns(), df.columns, strict=True) - ] - # If a sort key is in the result table, set the sortedness property - for k, i in enumerate(keys_in_result): - columns[i] = columns[i].set_sorted( - is_sorted=plc.types.Sorted.YES, - order=self.order[k], - null_order=self.null_order[k], - ) + columns: list[Column] = [] + for name, c in zip(df.column_map, table.columns(), strict=True): + column = Column(c, name=name) + # If a sort key is in the result table, set the sortedness property + if name in keys_in_result: + i = keys_in_result[name] + column = column.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=self.order[i], + null_order=self.null_order[i], + ) + columns.append(column) return DataFrame(columns).slice(self.zlice) @@ -1080,7 +1082,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: df = self.df.evaluate(cache=cache) # This can reorder things. columns = broadcast( - *df.select(list(self.schema.keys())).columns, target_length=df.num_rows + *(df.column_map[name] for name in self.schema), target_length=df.num_rows ) return DataFrame(columns) @@ -1125,7 +1127,7 @@ def __post_init__(self) -> None: old, new, _ = self.options # TODO: perhaps polars should validate renaming in the IR? if len(new) != len(set(new)) or ( - set(new) & (set(self.df.schema.keys() - set(old))) + set(new) & (set(self.df.schema.keys()) - set(old)) ): raise NotImplementedError("Duplicate new names in rename.") elif self.name == "unpivot": @@ -1170,7 +1172,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: npiv = len(pivotees) df = self.df.evaluate(cache=cache) index_columns = [ - NamedColumn(col, name) + Column(col, name=name) for col, name in zip( plc.reshape.tile(df.select(indices).table, npiv).columns(), indices, @@ -1191,13 +1193,16 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: df.num_rows, ).columns() value_column = plc.concatenate.concatenate( - [c.astype(self.schema[value_name]) for c in df.select(pivotees).columns] + [ + df.column_map[pivotee].astype(self.schema[value_name]).obj + for pivotee in pivotees + ] ) return DataFrame( [ *index_columns, - NamedColumn(variable_column, variable_name), - NamedColumn(value_column, value_name), + Column(variable_column, name=variable_name), + Column(value_column, name=value_name), ] ) else: @@ -1278,6 +1283,4 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) for df in dfs ] - return DataFrame( - list(itertools.chain.from_iterable(df.columns for df in dfs)), - ) + return DataFrame(itertools.chain.from_iterable(df.columns for df in dfs)) diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index bff44af1468..7837a275f20 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -201,21 +201,21 @@ the logical plan in any case, so is reasonably natural. # Containers Containers should be constructed as relatively lightweight objects -around their pylibcudf counterparts. We have four (in +around their pylibcudf counterparts. We have three (in `cudf_polars/containers/`): 1. `Scalar` (a wrapper around a pylibcudf `Scalar`) 2. `Column` (a wrapper around a pylibcudf `Column`) -3. `NamedColumn` (a `Column` with an additional name) -4. `DataFrame` (a wrapper around a pylibcudf `Table`) +3. `DataFrame` (a wrapper around a pylibcudf `Table`) The interfaces offered by these are somewhat in flux, but broadly -speaking, a `DataFrame` is just a list of `NamedColumn`s which each -hold a `Column` plus a string `name`. `NamedColumn`s are only ever -constructed via `NamedExpr`s, which are the top-level expression node -that lives inside an `IR` node. This means that the expression -evaluator never has to concern itself with column names: columns are -only ever decorated with names when constructing a `DataFrame`. +speaking, a `DataFrame` is just a mapping from string `name`s to +`Column`s, and thus also holds a pylibcudf `Table`. Names are only +attached to `Column`s and hence inserted into `DataFrames` via +`NamedExpr`s, which are the top-level expression nodes that live +inside an `IR` node. This means that the expression evaluator never +has to concern itself with column names: columns are only ever +decorated with names when constructing a `DataFrame`. The columns keep track of metadata (for example, whether or not they are sorted). We could imagine tracking more metadata, like minimum and diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index f55031e0826..5345fad41a2 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -50,6 +50,11 @@ license-files = ["LICENSE"] version = {file = "cudf_polars/VERSION"} [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" +filterwarnings = [ + "error" +] xfail_strict = true [tool.coverage.report] diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py index 19919877f84..1f26ab1af9f 100644 --- a/python/cudf_polars/tests/containers/test_column.py +++ b/python/cudf_polars/tests/containers/test_column.py @@ -3,13 +3,11 @@ from __future__ import annotations -from functools import partial - import pyarrow import pylibcudf as plc import pytest -from cudf_polars.containers import Column, NamedColumn +from cudf_polars.containers import Column def test_non_scalar_access_raises(): @@ -55,11 +53,10 @@ def test_shallow_copy(): @pytest.mark.parametrize("typeid", [plc.TypeId.INT8, plc.TypeId.FLOAT32]) -@pytest.mark.parametrize("constructor", [Column, partial(NamedColumn, name="name")]) -def test_mask_nans(typeid, constructor): +def test_mask_nans(typeid): dtype = plc.DataType(typeid) values = pyarrow.array([0, 0, 0], type=plc.interop.to_arrow(dtype)) - column = constructor(plc.interop.from_arrow(values)) + column = Column(plc.interop.from_arrow(values)) masked = column.mask_nans() assert column.obj.null_count() == masked.obj.null_count() diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py index 39fb44d55a5..5c68fb8f0aa 100644 --- a/python/cudf_polars/tests/containers/test_dataframe.py +++ b/python/cudf_polars/tests/containers/test_dataframe.py @@ -8,18 +8,18 @@ import polars as pl -from cudf_polars.containers import DataFrame, NamedColumn +from cudf_polars.containers import Column, DataFrame from cudf_polars.testing.asserts import assert_gpu_result_equal def test_select_missing_raises(): df = DataFrame( [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) ] ) @@ -30,17 +30,17 @@ def test_select_missing_raises(): def test_replace_missing_raises(): df = DataFrame( [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) ] ) - replacement = df.columns[0].copy(new_name="b") + replacement = df.column_map["a"].copy().rename("b") with pytest.raises(ValueError): - df.replace_columns(replacement) + df.with_columns([replacement], replace_only=True) def test_from_table_wrong_names(): @@ -55,14 +55,23 @@ def test_from_table_wrong_names(): DataFrame.from_table(table, ["a", "b"]) +def test_unnamed_column_raise(): + payload = plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), 0, plc.MaskState.ALL_VALID + ) + + with pytest.raises(ValueError): + DataFrame([Column(payload, name="a"), Column(payload)]) + + def test_sorted_like_raises_mismatching_names(): df = DataFrame( [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) ] ) @@ -72,11 +81,11 @@ def test_sorted_like_raises_mismatching_names(): def test_shallow_copy(): - column = NamedColumn( + column = Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) column.set_sorted( is_sorted=plc.types.Sorted.YES, @@ -85,13 +94,13 @@ def test_shallow_copy(): ) df = DataFrame([column]) copy = df.copy() - copy.columns[0].set_sorted( + copy.column_map["a"].set_sorted( is_sorted=plc.types.Sorted.NO, order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.AFTER, ) - assert df.columns[0].is_sorted == plc.types.Sorted.YES - assert copy.columns[0].is_sorted == plc.types.Sorted.NO + assert df.column_map["a"].is_sorted == plc.types.Sorted.YES + assert copy.column_map["a"].is_sorted == plc.types.Sorted.NO def test_sorted_flags_preserved_empty(): @@ -100,7 +109,7 @@ def test_sorted_flags_preserved_empty(): gf = DataFrame.from_polars(df) - (a,) = gf.columns + a = gf.column_map["a"] assert a.is_sorted == plc.types.Sorted.YES diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py index 76c7648813a..2a37683478b 100644 --- a/python/cudf_polars/tests/expressions/test_sort.py +++ b/python/cudf_polars/tests/expressions/test_sort.py @@ -69,7 +69,7 @@ def test_setsorted(descending, nulls_last, with_nulls): df = translate_ir(q._ldf.visit()).evaluate(cache={}) - (a,) = df.columns + a = df.column_map["a"] assert a.is_sorted == plc.types.Sorted.YES null_order = ( diff --git a/python/cudf_polars/tests/pytest.ini b/python/cudf_polars/tests/pytest.ini deleted file mode 100644 index 7b0a9f29fb1..00000000000 --- a/python/cudf_polars/tests/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native diff --git a/python/cudf_polars/tests/utils/test_broadcast.py b/python/cudf_polars/tests/utils/test_broadcast.py index 35aaef44e1f..e7770bfadac 100644 --- a/python/cudf_polars/tests/utils/test_broadcast.py +++ b/python/cudf_polars/tests/utils/test_broadcast.py @@ -6,34 +6,35 @@ import pylibcudf as plc import pytest -from cudf_polars.containers import NamedColumn +from cudf_polars.containers import Column from cudf_polars.dsl.ir import broadcast @pytest.mark.parametrize("target", [4, None]) def test_broadcast_all_scalar(target): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 1, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] result = broadcast(*columns, target_length=target) expected = 1 if target is None else target + assert [c.name for c in result] == [f"col{i}" for i in range(3)] assert all(column.obj.size() == expected for column in result) def test_invalid_target_length(): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 4, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] @@ -43,11 +44,11 @@ def test_invalid_target_length(): def test_broadcast_mismatching_column_lengths(): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), i + 1, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] @@ -58,16 +59,17 @@ def test_broadcast_mismatching_column_lengths(): @pytest.mark.parametrize("nrows", [0, 5]) def test_broadcast_with_scalars(nrows): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), nrows if i == 0 else 1, plc.MaskState.ALL_VALID, ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] result = broadcast(*columns) + assert [c.name for c in result] == [f"col{i}" for i in range(3)] assert all(column.obj.size() == nrows for column in result) diff --git a/python/custreamz/custreamz/tests/pytest.ini b/python/custreamz/custreamz/tests/pytest.ini deleted file mode 100644 index 7b0a9f29fb1..00000000000 --- a/python/custreamz/custreamz/tests/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native diff --git a/python/custreamz/custreamz/tests/test_dataframes.py b/python/custreamz/custreamz/tests/test_dataframes.py index bae4b051cae..8c0130d2818 100644 --- a/python/custreamz/custreamz/tests/test_dataframes.py +++ b/python/custreamz/custreamz/tests/test_dataframes.py @@ -377,24 +377,16 @@ def test_setitem_overwrites(stream): [ ({}, "sum"), ({}, "mean"), - pytest.param({}, "min"), + ({}, "min"), pytest.param( {}, "median", marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), ), - pytest.param({}, "max"), - pytest.param( - {}, - "var", - marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), - ), - pytest.param({}, "count"), - pytest.param( - {"ddof": 0}, - "std", - marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), - ), + ({}, "max"), + ({}, "var"), + ({}, "count"), + ({"ddof": 0}, "std"), pytest.param( {"quantile": 0.5}, "quantile", diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 85ab0024bb5..af45f49d9b4 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -111,6 +111,8 @@ skip = [ ] [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error", "ignore:unclosed separator.c_obj.get() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_generate_ngrams( + c_strings, + ngrams, + c_separator[0] + ) + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2): + """ + Returns a lists column of ngrams of characters within each string. + + For details, see :cpp:func:`generate_character_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngram : size_type + The ngram number to generate + + Returns + ------- + Column + Lists column of strings + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_generate_character_ngrams( + c_strings, + ngrams, + ) + ) + return Column.from_libcudf(move(c_result)) + +cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2): + """ + Returns a lists column of hash values of the characters in each string + + For details, see :cpp:func:`hash_character_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngram : size_type + The ngram number to generate + + Returns + ------- + Column + Lists column of hash values + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_hash_character_ngrams( + c_strings, + ngrams, + ) + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/pytest.ini b/python/pylibcudf/pylibcudf/tests/pytest.ini deleted file mode 100644 index f572f85ca49..00000000000 --- a/python/pylibcudf/pylibcudf/tests/pytest.ini +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -xfail_strict = true -filterwarnings = - error - ignore:::.*xdist.* - ignore:::.*pytest.* -addopts = --tb=native diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py new file mode 100644 index 00000000000..5cf9874d595 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["ab", "cde", "fgh"] + return pa.array(arr) + + +@pytest.mark.parametrize("ngram", [2, 3]) +@pytest.mark.parametrize("sep", ["_", "**", ","]) +def test_generate_ngrams(input_col, ngram, sep): + result = plc.nvtext.generate_ngrams.generate_ngrams( + plc.interop.from_arrow(input_col), + ngram, + plc.interop.from_arrow(pa.scalar(sep)), + ) + expected = pa.array([f"ab{sep}cde", f"cde{sep}fgh"]) + if ngram == 3: + expected = pa.array([f"ab{sep}cde{sep}fgh"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("ngram", [2, 3]) +def test_generate_character_ngrams(input_col, ngram): + result = plc.nvtext.generate_ngrams.generate_character_ngrams( + plc.interop.from_arrow(input_col), + ngram, + ) + expected = pa.array([["ab"], ["cd", "de"], ["fg", "gh"]]) + if ngram == 3: + expected = pa.array([[], ["cde"], ["fgh"]]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("ngram", [2, 3]) +def test_hash_character_ngrams(input_col, ngram): + result = plc.nvtext.generate_ngrams.hash_character_ngrams( + plc.interop.from_arrow(input_col), + ngram, + ) + pa_result = plc.interop.to_arrow(result) + assert all( + len(got) == max(0, len(s.as_py()) - ngram + 1) + for got, s in zip(pa_result, input_col) + ) + assert pa_result.type == pa.list_( + pa.field("element", pa.uint32(), nullable=False) + ) diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index a8224f54e1c..be65142850f 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -95,6 +95,16 @@ skip = [ "__init__.py", ] +[tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" +filterwarnings = [ + "error", + "ignore:::.*xdist.*", + "ignore:::.*pytest.*" +] +xfail_strict = true + [tool.rapids-build-backend] build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml"