-
Notifications
You must be signed in to change notification settings - Fork 115
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing
6 changed files
with
593 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
#pragma once
#ifndef FRI_H
#define FRI_H

#include <cuda_runtime.h>

#include "gpu-utils/device_context.cuh"

namespace fri {

  /**
   * @struct FriConfig
   * Configuration for the FRI folding routines below.
   */
  struct FriConfig {
    device_context::DeviceContext ctx;  // Device context; its stream is used for all work.
    bool are_evals_on_device;           // True if the input evaluations already reside in device memory.
    bool are_domain_elements_on_device; // True if the domain elements already reside in device memory.
    bool are_results_on_device;         // True if the folded results are (and stay) in device memory.
    bool is_async;                      // True to run asynchronously on the config's stream.
  };

  /**
   * @brief Folds a layer's evaluation into a degree d/2 evaluation using the provided folding factor alpha.
   *
   * @param eval Pointer to the array of evaluations in the current FRI layer.
   * @param domain_xs Pointer to a subset of line domain values (n / 2 elements, one per folded point).
   * @param alpha The folding factor used in the FRI protocol.
   * @param folded_eval Pointer to the array where the folded evaluations will be stored.
   * @param n The number of evaluations in the original layer (before folding).
   *
   * @tparam S The scalar field type used for domain_xs.
   * @tparam E The evaluation type (also the type of alpha), typically an extension field element type.
   *
   * @note The size of the output array 'folded_eval' should be half of 'n', as folding reduces the number of
   * evaluations by half.
   */
  template <typename S, typename E>
  cudaError_t fold_line(E* eval, S* domain_xs, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg);

  /**
   * @brief Folds a layer of FRI evaluations from a circle into a line.
   *
   * This function performs the folding operation in the FRI (Fast Reed-Solomon IOP of Proximity) protocol,
   * specifically for evaluations on a circle domain. It takes a layer of evaluations on a circle and folds
   * them into a line using the provided folding factor alpha.
   *
   * @param eval Pointer to the array of evaluations in the current FRI layer, representing points on a circle.
   * @param domain_ys Pointer to the array of y-coordinates of the circle points in the domain of the circle that eval
   * represents (n / 2 elements, one per folded point).
   * @param alpha The folding factor used in the FRI protocol.
   * @param folded_eval Pointer to the array where the folded evaluations (now on a line) will be stored.
   * @param n The number of evaluations in the original layer (before folding).
   *
   * @tparam S The scalar field type used for domain_ys.
   * @tparam E The evaluation type (also the type of alpha), typically an extension field element type.
   *
   * @note The size of the output array 'folded_eval' should be half of 'n', as folding reduces the number of
   * evaluations by half.
   * @note This function is specifically designed for folding evaluations from a circular domain to a linear domain.
   */

  template <typename S, typename E>
  cudaError_t fold_circle_into_line(E* eval, S* domain_ys, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg);

} // namespace fri

#endif
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#include "fields/field_config.cuh"
using namespace field_config;

#include "fri.cu"
#include "utils/utils.h"

namespace fri {
  /**
   * Extern "C" version of [fold_line](@ref fold_line) function with the following values of
   * template parameters (where the field is given by `-DFIELD` env variable during build):
   * - `E` is the extension field type used for evaluations and alpha
   * - `S` is the scalar field type used for domain elements
   * @param line_eval Pointer to the array of evaluations on the line
   * @param domain_elements Pointer to the array of domain elements
   * @param alpha The folding factor
   * @param folded_evals Pointer to the array where folded evaluations will be stored
   * @param n The number of evaluations
   * @param cfg The FRI configuration (device context/stream, memory locations, async flag)
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  extern "C" cudaError_t CONCAT_EXPAND(FIELD, fold_line)(
    extension_t* line_eval,
    scalar_t* domain_elements,
    extension_t alpha,
    extension_t* folded_evals,
    uint64_t n,
    FriConfig& cfg)
  {
    return fri::fold_line(line_eval, domain_elements, alpha, folded_evals, n, cfg);
  } // no trailing ';' — function definitions do not take one

  /**
   * Extern "C" version of [fold_circle_into_line](@ref fold_circle_into_line) function with the following values of
   * template parameters (where the field is given by `-DFIELD` env variable during build):
   * - `E` is the extension field type used for evaluations and alpha
   * - `S` is the scalar field type used for domain elements
   * @param circle_evals Pointer to the array of evaluations on the circle
   * @param domain_elements Pointer to the array of domain elements
   * @param alpha The folding factor
   * @param folded_line_evals Pointer to the array where folded evaluations will be stored
   * @param n The number of evaluations
   * @param cfg The FRI configuration (device context/stream, memory locations, async flag)
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  extern "C" cudaError_t CONCAT_EXPAND(FIELD, fold_circle_into_line)(
    extension_t* circle_evals,
    scalar_t* domain_elements,
    extension_t alpha,
    extension_t* folded_line_evals,
    uint64_t n,
    FriConfig& cfg)
  {
    return fri::fold_circle_into_line(circle_evals, domain_elements, alpha, folded_line_evals, n, cfg);
  }
} // namespace fri
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
#include <cuda_runtime.h> | ||
|
||
#include "fri/fri.cuh" | ||
|
||
#include "fields/field.cuh" | ||
#include "gpu-utils/error_handler.cuh" | ||
#include "gpu-utils/device_context.cuh" | ||
|
||
namespace fri { | ||
|
||
namespace { | ||
template <typename S, typename E> | ||
__device__ void ibutterfly(E& v0, E& v1, const S& itwid) | ||
{ | ||
E tmp = v0; | ||
v0 = tmp + v1; | ||
v1 = (tmp - v1) * itwid; | ||
} | ||
|
||
template <typename S, typename E> | ||
__global__ void fold_line_kernel(E* eval, S* domain_xs, E alpha, E* folded_eval, uint64_t n) | ||
{ | ||
unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; | ||
if (idx % 2 == 0 && idx < n) { | ||
E f_x = eval[idx]; // even | ||
E f_x_neg = eval[idx + 1]; // odd | ||
S x_domain = domain_xs[idx / 2]; | ||
ibutterfly(f_x, f_x_neg, S::inverse(x_domain)); | ||
auto folded_eval_idx = idx / 2; | ||
folded_eval[folded_eval_idx] = f_x + alpha * f_x_neg; | ||
} | ||
} | ||
|
||
template <typename S, typename E> | ||
__global__ void fold_circle_into_line_kernel(E* eval, S* domain_ys, E alpha, E alpha_sq, E* folded_eval, uint64_t n) | ||
{ | ||
unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; | ||
if (idx % 2 == 0 && idx < n) { | ||
E f0_px = eval[idx]; | ||
E f1_px = eval[idx + 1]; | ||
ibutterfly(f0_px, f1_px, S::inverse(domain_ys[idx / 2])); | ||
E f_prime = f0_px + alpha * f1_px; | ||
auto folded_eval_idx = idx / 2; | ||
folded_eval[folded_eval_idx] = folded_eval[folded_eval_idx] * alpha_sq + f_prime; | ||
} | ||
} | ||
} // namespace | ||
|
||
template <typename S, typename E> | ||
cudaError_t fold_line(E* eval, S* domain_xs, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg) | ||
{ | ||
CHK_INIT_IF_RETURN(); | ||
|
||
cudaStream_t stream = cfg.ctx.stream; | ||
// Allocate and move line domain evals to device if necessary | ||
E* d_eval; | ||
if (!cfg.are_evals_on_device) { | ||
auto data_size = sizeof(E) * n; | ||
CHK_IF_RETURN(cudaMallocAsync(&d_eval, data_size, stream)); | ||
CHK_IF_RETURN(cudaMemcpyAsync(d_eval, eval, data_size, cudaMemcpyHostToDevice, stream)); | ||
} else { | ||
d_eval = eval; | ||
} | ||
|
||
// Allocate and move domain's elements to device if necessary | ||
S* d_domain_xs; | ||
if (!cfg.are_domain_elements_on_device) { | ||
auto data_size = sizeof(S) * n / 2; | ||
CHK_IF_RETURN(cudaMallocAsync(&d_domain_xs, data_size, stream)); | ||
CHK_IF_RETURN(cudaMemcpyAsync(d_domain_xs, domain_xs, data_size, cudaMemcpyHostToDevice, stream)); | ||
} else { | ||
d_domain_xs = domain_xs; | ||
} | ||
|
||
// Allocate folded_eval if pointer is not a device pointer | ||
E* d_folded_eval; | ||
if (!cfg.are_results_on_device) { | ||
CHK_IF_RETURN(cudaMallocAsync(&d_folded_eval, sizeof(E) * n / 2, stream)); | ||
} else { | ||
d_folded_eval = folded_eval; | ||
} | ||
|
||
uint64_t num_threads = 256; | ||
uint64_t num_blocks = (n / 2 + num_threads - 1) / num_threads; | ||
fold_line_kernel<<<num_blocks, num_threads, 0, stream>>>(d_eval, d_domain_xs, alpha, d_folded_eval, n); | ||
|
||
// Move folded_eval back to host if requested | ||
if (!cfg.are_results_on_device) { | ||
CHK_IF_RETURN(cudaMemcpyAsync(folded_eval, d_folded_eval, sizeof(E) * n / 2, cudaMemcpyDeviceToHost, stream)); | ||
CHK_IF_RETURN(cudaFreeAsync(d_folded_eval, stream)); | ||
} | ||
if (!cfg.are_domain_elements_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_domain_xs, stream)); } | ||
if (!cfg.are_evals_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_eval, stream)); } | ||
|
||
// Sync if stream is default stream | ||
if (stream == 0) CHK_IF_RETURN(cudaStreamSynchronize(stream)); | ||
|
||
return CHK_LAST(); | ||
} | ||
|
||
template <typename S, typename E> | ||
cudaError_t fold_circle_into_line(E* eval, S* domain_ys, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg) | ||
{ | ||
CHK_INIT_IF_RETURN(); | ||
|
||
cudaStream_t stream = cfg.ctx.stream; | ||
// Allocate and move circle domain evals to device if necessary | ||
E* d_eval; | ||
if (!cfg.are_evals_on_device) { | ||
auto data_size = sizeof(E) * n; | ||
CHK_IF_RETURN(cudaMallocAsync(&d_eval, data_size, stream)); | ||
CHK_IF_RETURN(cudaMemcpyAsync(d_eval, eval, data_size, cudaMemcpyHostToDevice, stream)); | ||
} else { | ||
d_eval = eval; | ||
} | ||
|
||
// Allocate and move domain's elements to device if necessary | ||
S* d_domain_ys; | ||
if (!cfg.are_domain_elements_on_device) { | ||
auto data_size = sizeof(S) * n / 2; | ||
CHK_IF_RETURN(cudaMallocAsync(&d_domain_ys, data_size, stream)); | ||
CHK_IF_RETURN(cudaMemcpyAsync(d_domain_ys, domain_ys, data_size, cudaMemcpyHostToDevice, stream)); | ||
} else { | ||
d_domain_ys = domain_ys; | ||
} | ||
|
||
// Allocate folded_evals if pointer is not a device pointer | ||
E* d_folded_eval; | ||
if (!cfg.are_results_on_device) { | ||
CHK_IF_RETURN(cudaMallocAsync(&d_folded_eval, sizeof(E) * n / 2, stream)); | ||
} else { | ||
d_folded_eval = folded_eval; | ||
} | ||
|
||
E alpha_sq = alpha * alpha; | ||
uint64_t num_threads = 256; | ||
uint64_t num_blocks = (n / 2 + num_threads - 1) / num_threads; | ||
fold_circle_into_line_kernel<<<num_blocks, num_threads, 0, stream>>>( | ||
d_eval, d_domain_ys, alpha, alpha_sq, d_folded_eval, n); | ||
|
||
// Move folded_evals back to host if requested | ||
if (!cfg.are_results_on_device) { | ||
CHK_IF_RETURN(cudaMemcpyAsync(folded_eval, d_folded_eval, sizeof(E) * n / 2, cudaMemcpyDeviceToHost, stream)); | ||
CHK_IF_RETURN(cudaFreeAsync(d_folded_eval, stream)); | ||
} | ||
if (!cfg.are_domain_elements_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_domain_ys, stream)); } | ||
if (!cfg.are_evals_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_eval, stream)); } | ||
|
||
// Sync if stream is default stream | ||
if (stream == 0) CHK_IF_RETURN(cudaStreamSynchronize(stream)); | ||
|
||
return CHK_LAST(); | ||
} | ||
} // namespace fri |
Oops, something went wrong.