Skip to content

Commit

Permalink
Fix
Browse files Browse the repository at this point in the history
  • Loading branch information
JayjeetAtGithub committed Sep 10, 2024
1 parent bb54057 commit 38ee8c3
Showing 1 changed file with 8 additions and 292 deletions.
300 changes: 8 additions & 292 deletions cpp/benchmarks/ndsh/utilities.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
<<<<<<< HEAD
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
Expand All @@ -21,106 +20,52 @@

#include <rmm/device_uvector.hpp>

// RMM memory resource creation utilities
auto make_cuda();
auto make_async();
auto make_pool();
auto make_managed();
auto make_managed_pool();
auto make_prefetch();
auto make_prefetch_pool();

std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(
std::string const& rmm_mode);

/**
* @brief A class to represent a table with column names attached
*/
class table_with_names {
public:
table_with_names(std::unique_ptr<cudf::table> tbl, std::vector<std::string> col_names)
: tbl(std::move(tbl)), col_names(col_names)
{
}
: tbl(std::move(tbl)), col_names(col_names){};
/**
* @brief Return the table view
*/
[[nodiscard]] cudf::table_view table() const { return tbl->view(); }
[[nodiscard]] cudf::table_view table() const;
/**
* @brief Return the column view for a given column name
*
* @param col_name The name of the column
*/
[[nodiscard]] cudf::column_view column(std::string const& col_name) const
{
return tbl->view().column(col_id(col_name));
}
[[nodiscard]] cudf::column_view column(std::string const& col_name) const;
/**
* @brief Return the column names of the table
*/
[[nodiscard]] std::vector<std::string> column_names() const { return col_names; }
[[nodiscard]] std::vector<std::string> const& column_names() const;
/**
* @brief Translate a column name to a column index
*
* @param col_name The name of the column
*/
[[nodiscard]] cudf::size_type col_id(std::string const& col_name) const
{
CUDF_FUNC_RANGE();
auto it = std::find(col_names.begin(), col_names.end(), col_name);
if (it == col_names.end()) {
std::string err_msg = "Column `" + col_name + "` not found";
throw std::runtime_error(err_msg);
}
return std::distance(col_names.begin(), it);
}
[[nodiscard]] cudf::size_type column_id(std::string const& col_name) const;
/**
* @brief Append a column to the table
*
* @param col The column to append
* @param col_name The name of the appended column
*/
table_with_names& append(std::unique_ptr<cudf::column>& col, std::string const& col_name)
{
CUDF_FUNC_RANGE();
auto cols = tbl->release();
cols.push_back(std::move(col));
tbl = std::make_unique<cudf::table>(std::move(cols));
col_names.push_back(col_name);
return (*this);
}
table_with_names& append(std::unique_ptr<cudf::column>& col, std::string const& col_name);
/**
* @brief Select a subset of columns from the table
*
* @param col_names The names of the columns to select
*/
[[nodiscard]] cudf::table_view select(std::vector<std::string> const& col_names) const
{
CUDF_FUNC_RANGE();
std::vector<cudf::size_type> col_indices;
for (auto const& col_name : col_names) {
col_indices.push_back(col_id(col_name));
}
return tbl->select(col_indices);
}
[[nodiscard]] cudf::table_view select(std::vector<std::string> const& col_names) const;
/**
* @brief Write the table to a parquet file
*
* @param filepath The path to the parquet file
*/
void to_parquet(std::string const& filepath) const
{
CUDF_FUNC_RANGE();
auto const sink_info = cudf::io::sink_info(filepath);
cudf::io::table_metadata metadata;
metadata.schema_info =
std::vector<cudf::io::column_name_info>(col_names.begin(), col_names.end());
auto const table_input_metadata = cudf::io::table_input_metadata{metadata};
auto builder = cudf::io::parquet_writer_options::builder(sink_info, tbl->view());
builder.metadata(table_input_metadata);
auto const options = builder.build();
cudf::io::write_parquet(options);
}
void to_parquet(std::string const& filepath) const;

private:
std::unique_ptr<cudf::table> tbl;
Expand Down Expand Up @@ -280,232 +225,3 @@ void write_to_parquet_device_buffer(std::unique_ptr<cudf::table> const& table,
void generate_parquet_data_sources(double scale_factor,
std::vector<std::string> const& table_names,
std::unordered_map<std::string, parquet_device_buffer>& sources);
=======
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/groupby.hpp>
#include <cudf/io/parquet.hpp>

#include <rmm/device_uvector.hpp>

/**
 * @brief A class to represent a table with column names attached
 *
 * Holds an owned `cudf::table` together with a vector of column names.
 * NOTE(review): the names are presumably expected to be one per table column,
 * in column order; nothing in this header validates that.
 */
class table_with_names {
 public:
  /**
   * @brief Construct from an owned table and its column names
   *
   * Both arguments are taken by value and moved into the members, so callers
   * can hand over temporaries without an extra copy of the name vector.
   *
   * @param tbl The table to take ownership of
   * @param col_names The names to attach to the columns of `tbl`
   */
  table_with_names(std::unique_ptr<cudf::table> tbl, std::vector<std::string> col_names)
    : tbl(std::move(tbl)), col_names(std::move(col_names))
  {
  }
  /**
   * @brief Return the table view
   *
   * @return A `cudf::table_view` of the held table
   */
  [[nodiscard]] cudf::table_view table() const;
  /**
   * @brief Return the column view for a given column name
   *
   * @param col_name The name of the column
   * @return A `cudf::column_view` for the named column
   */
  [[nodiscard]] cudf::column_view column(std::string const& col_name) const;
  /**
   * @brief Return the column names of the table
   *
   * @return A const reference to the vector of column names
   */
  [[nodiscard]] std::vector<std::string> const& column_names() const;
  /**
   * @brief Translate a column name to a column index
   *
   * @param col_name The name of the column
   * @return The index of the column as a `cudf::size_type`
   */
  [[nodiscard]] cudf::size_type column_id(std::string const& col_name) const;
  /**
   * @brief Append a column to the table
   *
   * NOTE(review): `col` is taken by non-const lvalue reference; the out-of-line
   * definition presumably moves from it, leaving the caller's pointer empty -
   * confirm against the implementation.
   *
   * @param col The column to append
   * @param col_name The name of the appended column
   * @return A reference to a `table_with_names` (presumably `*this`, to allow chaining)
   */
  table_with_names& append(std::unique_ptr<cudf::column>& col, std::string const& col_name);
  /**
   * @brief Select a subset of columns from the table
   *
   * @param col_names The names of the columns to select
   * @return A `cudf::table_view` for the selected columns
   */
  [[nodiscard]] cudf::table_view select(std::vector<std::string> const& col_names) const;
  /**
   * @brief Write the table to a parquet file
   *
   * @param filepath The path to the parquet file
   */
  void to_parquet(std::string const& filepath) const;

 private:
  std::unique_ptr<cudf::table> tbl;    ///< The owned table
  std::vector<std::string> col_names;  ///< The names attached to the table's columns
};

/**
* @brief Inner join two tables and gather the result
*
* @param left_input The left input table
* @param right_input The right input table
* @param left_on The columns to join on in the left table, given as
* `cudf::size_type` values (presumably column indices into `left_input`)
* @param right_on The columns to join on in the right table, given as
* `cudf::size_type` values (presumably column indices into `right_input`)
* @param compare_nulls The null equality policy
* @return An owning pointer to the resulting `cudf::table`
*/
[[nodiscard]] std::unique_ptr<cudf::table> join_and_gather(
cudf::table_view const& left_input,
cudf::table_view const& right_input,
std::vector<cudf::size_type> const& left_on,
std::vector<cudf::size_type> const& right_on,
cudf::null_equality compare_nulls);

/**
* @brief Apply an inner join operation to two tables
*
* @param left_input The left input table
* @param right_input The right input table
* @param left_on The columns to join on in the left table, given as strings
* (presumably column names of `left_input`)
* @param right_on The columns to join on in the right table, given as strings
* (presumably column names of `right_input`)
* @param compare_nulls The null equality policy; defaults to
* `cudf::null_equality::EQUAL`
* @return An owning pointer to the resulting `table_with_names`
*/
[[nodiscard]] std::unique_ptr<table_with_names> apply_inner_join(
std::unique_ptr<table_with_names> const& left_input,
std::unique_ptr<table_with_names> const& right_input,
std::vector<std::string> const& left_on,
std::vector<std::string> const& right_on,
cudf::null_equality compare_nulls = cudf::null_equality::EQUAL);

/**
* @brief Apply a filter predicate to a table
*
* @param table The input table
* @param predicate The filter predicate, expressed as a `cudf::ast::operation`
* @return An owning pointer to the resulting `table_with_names`
*/
[[nodiscard]] std::unique_ptr<table_with_names> apply_filter(
std::unique_ptr<table_with_names> const& table, cudf::ast::operation const& predicate);

/**
* @brief Apply a boolean mask to a table
*
* @param table The input table
* @param mask The boolean mask, passed as a `cudf::column`
* @return An owning pointer to the resulting `table_with_names`
*/
[[nodiscard]] std::unique_ptr<table_with_names> apply_mask(
std::unique_ptr<table_with_names> const& table, std::unique_ptr<cudf::column> const& mask);

/**
* @brief Struct representing group by key columns, value columns, and the type of aggregations
* to perform on the value columns
*/
struct groupby_context_t {
// The group by key columns, given as strings (presumably column names).
std::vector<std::string> keys;
// Maps a string (presumably a value column name) to a list of
// (aggregation kind, string) pairs. NOTE(review): the string in each pair looks
// like the name of the resulting output column - confirm against the implementation.
std::unordered_map<std::string, std::vector<std::pair<cudf::aggregation::Kind, std::string>>>
values;
};

/**
* @brief Apply a groupby operation to a table
*
* @param table The input table
* @param ctx The groupby context (keys, value columns and aggregations to perform)
* @return An owning pointer to the resulting `table_with_names`
*/
[[nodiscard]] std::unique_ptr<table_with_names> apply_groupby(
std::unique_ptr<table_with_names> const& table, groupby_context_t const& ctx);

/**
* @brief Apply an order by operation to a table
*
* @param table The input table
* @param sort_keys The sort keys, given as strings (presumably column names of `table`)
* @param sort_key_orders The sort key orders (presumably one `cudf::order` per
* entry of `sort_keys` - TODO confirm)
* @return An owning pointer to the resulting `table_with_names`
*/
[[nodiscard]] std::unique_ptr<table_with_names> apply_orderby(
std::unique_ptr<table_with_names> const& table,
std::vector<std::string> const& sort_keys,
std::vector<cudf::order> const& sort_key_orders);

/**
* @brief Apply a reduction operation to a column
*
* @param column The input column
* @param agg_kind The aggregation kind
* @param col_name The name of the output column
* @return An owning pointer to the resulting `table_with_names`
*/
[[nodiscard]] std::unique_ptr<table_with_names> apply_reduction(
cudf::column_view const& column,
cudf::aggregation::Kind const& agg_kind,
std::string const& col_name);

/**
* @brief Read a parquet file into a table
*
* @param source_info The source of the parquet file
* @param columns The columns to read; defaults to an empty vector
* (presumably meaning "all columns" - TODO confirm)
* @param predicate The filter predicate to pushdown; defaults to `nullptr`
* (presumably meaning "no filter" - TODO confirm)
* @return An owning pointer to the resulting `table_with_names`
*/
[[nodiscard]] std::unique_ptr<table_with_names> read_parquet(
cudf::io::source_info const& source_info,
std::vector<std::string> const& columns = {},
std::unique_ptr<cudf::ast::operation> const& predicate = nullptr);

/**
* @brief Generate the `std::tm` structure from year, month, and day
*
* @param year The year
* @param month The month
* @param day The day
* @return The generated `std::tm` structure
*/
std::tm make_tm(int year, int month, int day);

/**
* @brief Calculate the number of days since the UNIX epoch
*
* @param year The year
* @param month The month
* @param day The day
* @return The number of days since the UNIX epoch, as an `int32_t`
*/
int32_t days_since_epoch(int year, int month, int day);

/**
 * @brief Struct representing a parquet device buffer
 *
 * Wraps an `rmm::device_uvector<std::byte>` and can expose it as a
 * `cudf::io::source_info`.
 */
struct parquet_device_buffer {
  // Byte storage backing the buffer.
  rmm::device_uvector<std::byte> d_buffer;

  // Create the buffer with zero elements on the stream returned by
  // cudf::get_default_stream().
  parquet_device_buffer() : d_buffer(0, cudf::get_default_stream()) {}

  // Build a cudf::io::source_info from the current d_buffer.
  cudf::io::source_info make_source_info()
  {
    return cudf::io::source_info(d_buffer);
  }
};

/**
* @brief Write a `cudf::table` to a parquet device buffer
*
* @param table The `cudf::table` to write
* @param col_names The column names of the table
* @param source The parquet device buffer to write the table to
*/
void write_to_parquet_device_buffer(std::unique_ptr<cudf::table> const& table,
std::vector<std::string> const& col_names,
parquet_device_buffer& source);

/**
* @brief Generate NDS-H tables and write to parquet device buffers
*
* @param scale_factor The scale factor of NDS-H tables to generate
* @param table_names The names of the tables to generate
* @param sources The parquet data sources to populate; a map from a string
* (presumably the table name - TODO confirm) to its `parquet_device_buffer`
*/
void generate_parquet_data_sources(double scale_factor,
std::vector<std::string> const& table_names,
std::unordered_map<std::string, parquet_device_buffer>& sources);
>>>>>>> 728de6c85b (Remove CUDF_FUNC_RANGES and add docstring)

0 comments on commit 38ee8c3

Please sign in to comment.