Skip to content

Commit

Permalink
Merge pull request #52 from eth-cscs/release/v1.0.6
Browse files Browse the repository at this point in the history
Release v1.0.6
  • Loading branch information
AdhocMan authored Feb 17, 2022
2 parents ae11716 + ceac18c commit aa6653f
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 20 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.11 FATAL_ERROR) # 3.11 to avoid issues with OpenMP + CUDA
project(SpFFT LANGUAGES CXX VERSION 1.0.5)
project(SpFFT LANGUAGES CXX VERSION 1.0.6)
set(SPFFT_SO_VERSION 1)
set(SPFFT_VERSION ${PROJECT_VERSION})

Expand Down
44 changes: 31 additions & 13 deletions src/compression/gpu_kernels/compression_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@
*/
#include <algorithm>
#include <cassert>

#include "gpu_util/gpu_fft_api.hpp"
#include "gpu_util/gpu_kernel_parameter.hpp"
#include "gpu_util/gpu_runtime.hpp"
#include "memory/array_view_utility.hpp"
#include "memory/gpu_array_const_view.hpp"
#include "memory/gpu_array_view.hpp"

Expand Down Expand Up @@ -59,8 +59,10 @@ auto decompress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& ind
const dim3 threadGrid(std::min(
static_cast<int>((indices.size() + threadBlock.x - 1) / threadBlock.x), gpu::GridSizeMedium));
// const dim3 threadGrid(indices.size() < 4 ? 1 : indices.size() / 4);
launch_kernel(decompress_kernel<double>, threadGrid, threadBlock, 0, stream, indices, input,
create_1d_view(output, 0, output.size()));
launch_kernel(decompress_kernel<double>, threadGrid, threadBlock, 0, stream,
GPUArrayConstView1D<int>(indices), input,
GPUArrayView1D<typename gpu::fft::ComplexType<double>::type>(
output.data(), output.size(), output.device_id()));
}

auto decompress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& indices,
Expand All @@ -70,8 +72,10 @@ auto decompress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& ind
const dim3 threadBlock(gpu::BlockSizeMedium);
const dim3 threadGrid(std::min(
static_cast<int>((indices.size() + threadBlock.x - 1) / threadBlock.x), gpu::GridSizeMedium));
launch_kernel(decompress_kernel<float>, threadGrid, threadBlock, 0, stream, indices, input,
create_1d_view(output, 0, output.size()));
launch_kernel(decompress_kernel<float>, threadGrid, threadBlock, 0, stream,
GPUArrayConstView1D<int>(indices), input,
GPUArrayView1D<typename gpu::fft::ComplexType<float>::type>(
output.data(), output.size(), output.device_id()));
}

template <typename T>
Expand Down Expand Up @@ -109,11 +113,17 @@ auto compress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& indic
static_cast<int>((indices.size() + threadBlock.x - 1) / threadBlock.x), gpu::GridSizeMedium));

if (useScaling) {
launch_kernel(compress_kernel_scaled<double>, threadGrid, threadBlock, 0, stream, indices,
create_1d_view(input, 0, input.size()), output, scalingFactor);
launch_kernel(compress_kernel_scaled<double>, threadGrid, threadBlock, 0, stream,
GPUArrayConstView1D<int>(indices),
GPUArrayConstView1D<typename gpu::fft::ComplexType<double>::type>(
input.data(), input.size(), input.device_id()),
output, scalingFactor);
} else {
launch_kernel(compress_kernel<double>, threadGrid, threadBlock, 0, stream, indices,
create_1d_view(input, 0, input.size()), output);
launch_kernel(compress_kernel<double>, threadGrid, threadBlock, 0, stream,
GPUArrayConstView1D<int>(indices),
GPUArrayConstView1D<typename gpu::fft::ComplexType<double>::type>(
input.data(), input.size(), input.device_id()),
output);
}
}

Expand All @@ -123,12 +133,20 @@ auto compress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& indic
const dim3 threadBlock(gpu::BlockSizeMedium);
const dim3 threadGrid(std::min(
static_cast<int>((indices.size() + threadBlock.x - 1) / threadBlock.x), gpu::GridSizeMedium));

if (useScaling) {
launch_kernel(compress_kernel_scaled<float>, threadGrid, threadBlock, 0, stream, indices,
create_1d_view(input, 0, input.size()), output, scalingFactor);
launch_kernel(compress_kernel_scaled<float>, threadGrid, threadBlock, 0, stream,
GPUArrayConstView1D<int>(indices),
GPUArrayConstView1D<typename gpu::fft::ComplexType<float>::type>(
input.data(), input.size(), input.device_id()),
output, scalingFactor);
} else {
launch_kernel(compress_kernel<float>, threadGrid, threadBlock, 0, stream, indices,
create_1d_view(input, 0, input.size()), output);
launch_kernel(compress_kernel<float>, threadGrid, threadBlock, 0, stream,
GPUArrayConstView1D<int>(indices),
GPUArrayConstView1D<typename gpu::fft::ComplexType<float>::type>(
input.data(), input.size(), input.device_id()),
output);
}
}

} // namespace spfft
10 changes: 4 additions & 6 deletions src/fft/fftw_plan_1d.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ struct FFTWPropHash {
assert(std::get<1>(tuple) < (1 << (sizeof(int) * 4 - 1)));
assert(std::get<2>(tuple) < (1 << (sizeof(int) * 4 - 1)));
const int sign = 2 * static_cast<int>(std::get<0>(tuple)) - 1;
return std::hash<int>()((sign * (std::get<1>(tuple)<< (sizeof(int) * 4 - 1)) + std::get<2>(tuple)));
return std::hash<int>()(
sign * ((std::get<1>(tuple) << (sizeof(int) * 4 - 1)) + std::get<2>(tuple) + 1));
}
};

Expand Down Expand Up @@ -81,9 +82,6 @@ class FFTWPlan {
int inembed[] = {n[0]};
int onembed[] = {n[0]};
auto flags = FFTW_ESTIMATE;
if (input != output) {
flags = flags | FFTW_DESTROY_INPUT; // allow input override for out-of-place transform
}

plan_ = FFTW<T>::plan_many_dft(
rank, n, (int)howmany,
Expand All @@ -110,7 +108,7 @@ class FFTWPlan {
int n[] = {(int)size};
int inembed[] = {n[0]};
int onembed[] = {n[0]};
auto flags = FFTW_ESTIMATE | FFTW_DESTROY_INPUT;
auto flags = FFTW_ESTIMATE;
plan_ = FFTW<T>::plan_many_dft_c2r(
rank, n, (int)howmany,
reinterpret_cast<typename FFTW<T>::ComplexType*>(const_cast<ComplexType*>(input)), inembed,
Expand All @@ -134,7 +132,7 @@ class FFTWPlan {
int n[] = {(int)size};
int inembed[] = {n[0]};
int onembed[] = {n[0]};
auto flags = FFTW_ESTIMATE | FFTW_DESTROY_INPUT;
auto flags = FFTW_ESTIMATE;
plan_ = FFTW<T>::plan_many_dft_r2c(rank, n, (int)howmany, const_cast<T*>(input), inembed,
(int)istride, (int)idist,
reinterpret_cast<typename FFTW<T>::ComplexType*>(output),
Expand Down
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ if(SPFFT_BUILD_TESTS)
run_local_tests.cpp
local_tests/test_host_array.cpp
local_tests/test_disjoint.cpp
local_tests/test_fftw_prop_hash.cpp
local_tests/test_local_transform.cpp
)
target_link_libraries(run_local_tests PRIVATE gtest_main gtest_mpi)
Expand Down
21 changes: 21 additions & 0 deletions tests/local_tests/test_fftw_prop_hash.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#include <unordered_set>
#include "gtest/gtest.h"
#include "fft/fftw_plan_1d.hpp"


TEST(FFTWPropHashTest, Unique) {
  // Verify that FFTWPropHash yields a distinct hash value for every
  // (inPlace, inputAlignment, outputAlignment) tuple in the tested domain.
  //
  // NOTE(review): the previous version inserted the tuples themselves into an
  // unordered_set keyed by FFTWPropHash. That cannot detect hash collisions:
  // an unordered container stores all distinct keys (equality is decided by
  // std::tuple::operator==, not by the hash), so set.size() always equals the
  // number of distinct tuples. Collect the hash values instead, so a single
  // colliding pair shrinks the set and fails the expectation.
  std::unordered_set<std::size_t> hashValues;
  spfft::FFTWPropHash hasher;

  // Alignments stay well below the hasher's asserted bound of
  // 1 << (sizeof(int) * 4 - 1).
  const int maxAlignment = 1024;

  for (int inPlace = 0; inPlace < 2; ++inPlace) {
    for (int i = 0; i < maxAlignment; ++i) {
      for (int j = 0; j < maxAlignment; ++j) {
        hashValues.insert(hasher(std::make_tuple(static_cast<bool>(inPlace), i, j)));
      }
    }
  }

  EXPECT_EQ(static_cast<std::size_t>(maxAlignment) * static_cast<std::size_t>(maxAlignment) * 2,
            hashValues.size());
}

0 comments on commit aa6653f

Please sign in to comment.