Add FRI folding for M31 (#618)
This PR adds FRI folding for the M31 field.
jeremyfelder authored Sep 29, 2024
1 parent 98dd414 commit 436f401
Showing 6 changed files with 593 additions and 0 deletions.
icicle/include/fri/fri.cuh (+64 −0)
@@ -0,0 +1,64 @@
#pragma once
#ifndef FRI_H
#define FRI_H

#include <cuda_runtime.h>

#include "gpu-utils/device_context.cuh"

namespace fri {

struct FriConfig {
device_context::DeviceContext ctx;  /**< Device context (device id and stream) the fold runs on. */
bool are_evals_on_device;           /**< True if the input evaluations already reside in device memory. */
bool are_domain_elements_on_device; /**< True if the domain elements already reside in device memory. */
bool are_results_on_device;         /**< True if the folded evaluations should be left in device memory. */
bool is_async;                      /**< True to run the fold without blocking the calling host thread. */
};

/**
* @brief Folds a layer's evaluations into those of the next, half-size FRI layer using the provided folding
* factor alpha.
*
* @param eval Pointer to the array of evaluations in the current FRI layer.
* @param domain_xs Pointer to a subset of line domain values.
* @param alpha The folding factor used in the FRI protocol.
* @param folded_eval Pointer to the array where the folded evaluations will be stored.
* @param n The number of evaluations in the current layer (before folding).
* @param cfg Configuration specifying where the inputs and outputs reside and how the fold is run.
*
* @tparam S The scalar field type used for domain_xs.
* @tparam E The evaluation type, typically the same as the field element type.
*
* @note The output array 'folded_eval' must have room for n/2 elements, since folding halves the number of
* evaluations.
*/
template <typename S, typename E>
cudaError_t fold_line(E* eval, S* domain_xs, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg);

/**
* @brief Folds a layer of FRI evaluations from a circle into a line.
*
* This function performs the folding operation of the FRI (Fast Reed-Solomon IOP of Proximity) protocol
* for evaluations on a circle domain: it takes a layer of evaluations over a circle and folds them onto a
* line using the provided folding factor alpha.
*
* @param eval Pointer to the array of evaluations in the current FRI layer, representing points on a circle.
* @param domain_ys Pointer to the array of y-coordinates of the circle points in the domain that eval
* represents.
* @param alpha The folding factor used in the FRI protocol.
* @param folded_eval Pointer to the array where the folded evaluations (now on a line) will be accumulated.
* @param n The number of evaluations in the current layer (before folding).
* @param cfg Configuration specifying where the inputs and outputs reside and how the fold is run.
*
* @tparam S The scalar field type used for domain_ys.
* @tparam E The evaluation type, typically the same as the field element type.
*
* @note The output array 'folded_eval' must have room for n/2 elements, since folding halves the number of
* evaluations. The kernel accumulates into 'folded_eval', so its prior contents are part of the result.
*/
template <typename S, typename E>
cudaError_t fold_circle_into_line(E* eval, S* domain_ys, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg);

} // namespace fri

#endif
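
For context, here is a minimal host-side sketch of driving fold_line with host-resident buffers. This is illustrative only and not part of the diff: the fold_one_layer wrapper is hypothetical, and it assumes an m31 build (-DFIELD=m31), icicle's device_context::get_default_device_context() helper, and the field_config aliases scalar_t / extension_t.

#include "fri/fri.cuh"
#include "fields/field_config.cuh"
using namespace field_config;

cudaError_t fold_one_layer(extension_t* evals, scalar_t* domain_xs, extension_t alpha, extension_t* folded, uint64_t n)
{
  device_context::DeviceContext ctx = device_context::get_default_device_context();
  fri::FriConfig cfg = {
    ctx,   // default stream on device 0
    false, // are_evals_on_device: evals live in host memory
    false, // are_domain_elements_on_device: domain_xs lives in host memory
    false, // are_results_on_device: copy the folded evals back to the host
    false, // is_async: block until the fold (and copies) complete
  };
  // n evaluations fold into n / 2; 'folded' must have room for n / 2 elements.
  return fri::fold_line(evals, domain_xs, alpha, folded, n, cfg);
}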
icicle/src/fields/CMakeLists.txt (+6 −0)
@@ -4,6 +4,7 @@ endif ()

SET(SUPPORTED_FIELDS_WITHOUT_NTT grumpkin;m31)
SET(SUPPORTED_FIELDS_WITHOUT_POSEIDON2 bls12_381;bls12_377;grumpkin;bw6_761;stark252;m31)
SET(SUPPORTED_FIELDS_WITH_FRI m31)

set(TARGET icicle_field)

@@ -42,6 +43,11 @@ if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT)
list(APPEND FIELD_SOURCE ${POLYNOMIAL_SOURCE_FILES}) # requires NTT
endif()

if (FIELD IN_LIST SUPPORTED_FIELDS_WITH_FRI)
list(APPEND FIELD_SOURCE ${SRC}/fri/extern.cu)
list(APPEND FIELD_SOURCE ${SRC}/fri/fri.cu)
endif()

add_library(${TARGET} STATIC ${FIELD_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_field_${FIELD}")
icicle/src/fri/extern.cu (+55 −0)
@@ -0,0 +1,55 @@
#include "fields/field_config.cuh"
using namespace field_config;

#include "fri.cu"
#include "utils/utils.h"

namespace fri {
/**
* Extern "C" version of the [fold_line](@ref fold_line) function with the following values of its
* template parameters (where the field is given by the `-DFIELD` build flag):
* - `E` is the extension field type used for evaluations and alpha
* - `S` is the scalar field type used for domain elements
* @param line_eval Pointer to the array of evaluations on the line
* @param domain_elements Pointer to the array of domain elements
* @param alpha The folding factor
* @param folded_evals Pointer to the array where folded evaluations will be stored
* @param n The number of evaluations
* @param cfg The folding configuration; if the stream in its device context is not 0, everything runs asynchronously
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, fold_line)(
extension_t* line_eval,
scalar_t* domain_elements,
extension_t alpha,
extension_t* folded_evals,
uint64_t n,
FriConfig& cfg)
{
return fri::fold_line(line_eval, domain_elements, alpha, folded_evals, n, cfg);
}

/**
* Extern "C" version of the [fold_circle_into_line](@ref fold_circle_into_line) function with the following
* values of its template parameters (where the field is given by the `-DFIELD` build flag):
* - `E` is the extension field type used for evaluations and alpha
* - `S` is the scalar field type used for domain elements
* @param circle_evals Pointer to the array of evaluations on the circle
* @param domain_elements Pointer to the array of domain elements
* @param alpha The folding factor
* @param folded_line_evals Pointer to the array where folded evaluations will be accumulated
* @param n The number of evaluations
* @param cfg The folding configuration; if the stream in its device context is not 0, everything runs asynchronously
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, fold_circle_into_line)(
extension_t* circle_evals,
scalar_t* domain_elements,
extension_t alpha,
extension_t* folded_line_evals,
uint64_t n,
FriConfig& cfg)
{
return fri::fold_circle_into_line(circle_evals, domain_elements, alpha, folded_line_evals, n, cfg);
}
} // namespace fri
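
For reference, with -DFIELD=m31 the CONCAT_EXPAND(FIELD, fold_line) macro above resolves to a plain C symbol, which is what a foreign-language binding links against. A sketch of the resulting declaration (type names shown via the field_config aliases):

extern "C" cudaError_t m31_fold_line(
  extension_t* line_eval,
  scalar_t* domain_elements,
  extension_t alpha,
  extension_t* folded_evals,
  uint64_t n,
  fri::FriConfig& cfg);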
icicle/src/fri/fri.cu (+154 −0)
@@ -0,0 +1,154 @@
#include <cuda_runtime.h>

#include "fri/fri.cuh"

#include "fields/field.cuh"
#include "gpu-utils/error_handler.cuh"
#include "gpu-utils/device_context.cuh"

namespace fri {

namespace {
// Inverse butterfly: (v0, v1) <- (v0 + v1, (v0 - v1) * itwid).
// With itwid = x^{-1} this recovers twice the even and odd parts of the pair (f(x), f(-x)).
template <typename S, typename E>
__device__ void ibutterfly(E& v0, E& v1, const S& itwid)
{
E tmp = v0;
v0 = tmp + v1;
v1 = (tmp - v1) * itwid;
}

template <typename S, typename E>
__global__ void fold_line_kernel(E* eval, S* domain_xs, E alpha, E* folded_eval, uint64_t n)
{
// One thread per pair of evaluations (f(x), f(-x)); the launch configuration uses n / 2 threads.
unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n / 2) {
E f_x = eval[2 * idx];         // even
E f_x_neg = eval[2 * idx + 1]; // odd
S x_domain = domain_xs[idx];
ibutterfly(f_x, f_x_neg, S::inverse(x_domain));
folded_eval[idx] = f_x + alpha * f_x_neg;
}
}

template <typename S, typename E>
__global__ void fold_circle_into_line_kernel(E* eval, S* domain_ys, E alpha, E alpha_sq, E* folded_eval, uint64_t n)
{
// One thread per pair of circle evaluations; the launch configuration uses n / 2 threads.
unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n / 2) {
E f0_px = eval[2 * idx];
E f1_px = eval[2 * idx + 1];
ibutterfly(f0_px, f1_px, S::inverse(domain_ys[idx]));
E f_prime = f0_px + alpha * f1_px;
// Accumulate into the current line evaluation: folded <- folded * alpha^2 + f'.
folded_eval[idx] = folded_eval[idx] * alpha_sq + f_prime;
}
}
} // namespace

template <typename S, typename E>
cudaError_t fold_line(E* eval, S* domain_xs, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg)
{
CHK_INIT_IF_RETURN();

cudaStream_t stream = cfg.ctx.stream;
// Allocate and move line domain evals to device if necessary
E* d_eval;
if (!cfg.are_evals_on_device) {
auto data_size = sizeof(E) * n;
CHK_IF_RETURN(cudaMallocAsync(&d_eval, data_size, stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_eval, eval, data_size, cudaMemcpyHostToDevice, stream));
} else {
d_eval = eval;
}

// Allocate and move domain's elements to device if necessary
S* d_domain_xs;
if (!cfg.are_domain_elements_on_device) {
auto data_size = sizeof(S) * n / 2;
CHK_IF_RETURN(cudaMallocAsync(&d_domain_xs, data_size, stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_domain_xs, domain_xs, data_size, cudaMemcpyHostToDevice, stream));
} else {
d_domain_xs = domain_xs;
}

// Allocate folded_eval if the pointer is not a device pointer
E* d_folded_eval;
if (!cfg.are_results_on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_folded_eval, sizeof(E) * n / 2, stream));
} else {
d_folded_eval = folded_eval;
}

uint64_t num_threads = 256;
uint64_t num_blocks = (n / 2 + num_threads - 1) / num_threads;
fold_line_kernel<<<num_blocks, num_threads, 0, stream>>>(d_eval, d_domain_xs, alpha, d_folded_eval, n);

// Move folded_eval back to host if requested
if (!cfg.are_results_on_device) {
CHK_IF_RETURN(cudaMemcpyAsync(folded_eval, d_folded_eval, sizeof(E) * n / 2, cudaMemcpyDeviceToHost, stream));
CHK_IF_RETURN(cudaFreeAsync(d_folded_eval, stream));
}
if (!cfg.are_domain_elements_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_domain_xs, stream)); }
if (!cfg.are_evals_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_eval, stream)); }

// Sync if stream is default stream
if (stream == 0) CHK_IF_RETURN(cudaStreamSynchronize(stream));

return CHK_LAST();
}

template <typename S, typename E>
cudaError_t fold_circle_into_line(E* eval, S* domain_ys, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg)
{
CHK_INIT_IF_RETURN();

cudaStream_t stream = cfg.ctx.stream;
// Allocate and move circle domain evals to device if necessary
E* d_eval;
if (!cfg.are_evals_on_device) {
auto data_size = sizeof(E) * n;
CHK_IF_RETURN(cudaMallocAsync(&d_eval, data_size, stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_eval, eval, data_size, cudaMemcpyHostToDevice, stream));
} else {
d_eval = eval;
}

// Allocate and move domain's elements to device if necessary
S* d_domain_ys;
if (!cfg.are_domain_elements_on_device) {
auto data_size = sizeof(S) * n / 2;
CHK_IF_RETURN(cudaMallocAsync(&d_domain_ys, data_size, stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_domain_ys, domain_ys, data_size, cudaMemcpyHostToDevice, stream));
} else {
d_domain_ys = domain_ys;
}

// Allocate folded_eval if the pointer is not a device pointer. The kernel accumulates into the
// folded evaluations, so the caller's current values must be copied to the device as well.
E* d_folded_eval;
if (!cfg.are_results_on_device) {
auto folded_size = sizeof(E) * n / 2;
CHK_IF_RETURN(cudaMallocAsync(&d_folded_eval, folded_size, stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_folded_eval, folded_eval, folded_size, cudaMemcpyHostToDevice, stream));
} else {
d_folded_eval = folded_eval;
}

E alpha_sq = alpha * alpha;
uint64_t num_threads = 256;
uint64_t num_blocks = (n / 2 + num_threads - 1) / num_threads;
fold_circle_into_line_kernel<<<num_blocks, num_threads, 0, stream>>>(
d_eval, d_domain_ys, alpha, alpha_sq, d_folded_eval, n);

// Move folded_evals back to host if requested
if (!cfg.are_results_on_device) {
CHK_IF_RETURN(cudaMemcpyAsync(folded_eval, d_folded_eval, sizeof(E) * n / 2, cudaMemcpyDeviceToHost, stream));
CHK_IF_RETURN(cudaFreeAsync(d_folded_eval, stream));
}
if (!cfg.are_domain_elements_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_domain_ys, stream)); }
if (!cfg.are_evals_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_eval, stream)); }

// Sync if stream is default stream
if (stream == 0) CHK_IF_RETURN(cudaStreamSynchronize(stream));

return CHK_LAST();
}
} // namespace fri
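
For reference, a sketch of the identity the fold_line kernel implements. Writing a layer polynomial in terms of its even and odd parts,

f(x) = f_0(x^2) + x \, f_1(x^2),

the pair of evaluations (f(x), f(-x)) yields

f(x) + f(-x) = 2 \, f_0(x^2), \qquad \bigl(f(x) - f(-x)\bigr) \, x^{-1} = 2 \, f_1(x^2),

which is exactly ibutterfly with itwid = x^{-1}; the folded evaluation is then

f'(x^2) = f_0(x^2) + \alpha \, f_1(x^2),

up to an overall factor of 2, which is harmless as long as the verifier folds the same way. fold_circle_into_line applies the same butterfly with the circle point's y-coordinate as the twiddle and accumulates each result into the running line evaluation as folded <- folded * alpha^2 + f'.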