Skip to content

Commit

Permalink
Merge pull request #52 from eth-cscs/release/v1.0.6
Browse files Browse the repository at this point in the history
Release v1.0.6
  • Loading branch information
AdhocMan authored Feb 17, 2022
2 parents ae11716 + ceac18c commit aa6653f
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 20 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.11 FATAL_ERROR) # 3.11 to avoid issues with OpenMP + CUDA
project(SpFFT LANGUAGES CXX VERSION 1.0.5)
project(SpFFT LANGUAGES CXX VERSION 1.0.6)
set(SPFFT_SO_VERSION 1)
set(SPFFT_VERSION ${PROJECT_VERSION})

Expand Down
44 changes: 31 additions & 13 deletions src/compression/gpu_kernels/compression_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@
*/
#include <algorithm>
#include <cassert>

#include "gpu_util/gpu_fft_api.hpp"
#include "gpu_util/gpu_kernel_parameter.hpp"
#include "gpu_util/gpu_runtime.hpp"
#include "memory/array_view_utility.hpp"
#include "memory/gpu_array_const_view.hpp"
#include "memory/gpu_array_view.hpp"

Expand Down Expand Up @@ -59,8 +59,10 @@ auto decompress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& ind
const dim3 threadGrid(std::min(
static_cast<int>((indices.size() + threadBlock.x - 1) / threadBlock.x), gpu::GridSizeMedium));
// const dim3 threadGrid(indices.size() < 4 ? 1 : indices.size() / 4);
launch_kernel(decompress_kernel<double>, threadGrid, threadBlock, 0, stream, indices, input,
create_1d_view(output, 0, output.size()));
launch_kernel(decompress_kernel<double>, threadGrid, threadBlock, 0, stream,
GPUArrayConstView1D<int>(indices), input,
GPUArrayView1D<typename gpu::fft::ComplexType<double>::type>(
output.data(), output.size(), output.device_id()));
}

auto decompress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& indices,
Expand All @@ -70,8 +72,10 @@ auto decompress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& ind
const dim3 threadBlock(gpu::BlockSizeMedium);
const dim3 threadGrid(std::min(
static_cast<int>((indices.size() + threadBlock.x - 1) / threadBlock.x), gpu::GridSizeMedium));
launch_kernel(decompress_kernel<float>, threadGrid, threadBlock, 0, stream, indices, input,
create_1d_view(output, 0, output.size()));
launch_kernel(decompress_kernel<float>, threadGrid, threadBlock, 0, stream,
GPUArrayConstView1D<int>(indices), input,
GPUArrayView1D<typename gpu::fft::ComplexType<float>::type>(
output.data(), output.size(), output.device_id()));
}

template <typename T>
Expand Down Expand Up @@ -109,11 +113,17 @@ auto compress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& indic
static_cast<int>((indices.size() + threadBlock.x - 1) / threadBlock.x), gpu::GridSizeMedium));

if (useScaling) {
launch_kernel(compress_kernel_scaled<double>, threadGrid, threadBlock, 0, stream, indices,
create_1d_view(input, 0, input.size()), output, scalingFactor);
launch_kernel(compress_kernel_scaled<double>, threadGrid, threadBlock, 0, stream,
GPUArrayConstView1D<int>(indices),
GPUArrayConstView1D<typename gpu::fft::ComplexType<double>::type>(
input.data(), input.size(), input.device_id()),
output, scalingFactor);
} else {
launch_kernel(compress_kernel<double>, threadGrid, threadBlock, 0, stream, indices,
create_1d_view(input, 0, input.size()), output);
launch_kernel(compress_kernel<double>, threadGrid, threadBlock, 0, stream,
GPUArrayConstView1D<int>(indices),
GPUArrayConstView1D<typename gpu::fft::ComplexType<double>::type>(
input.data(), input.size(), input.device_id()),
output);
}
}

Expand All @@ -123,12 +133,20 @@ auto compress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& indic
const dim3 threadBlock(gpu::BlockSizeMedium);
const dim3 threadGrid(std::min(
static_cast<int>((indices.size() + threadBlock.x - 1) / threadBlock.x), gpu::GridSizeMedium));

if (useScaling) {
launch_kernel(compress_kernel_scaled<float>, threadGrid, threadBlock, 0, stream, indices,
create_1d_view(input, 0, input.size()), output, scalingFactor);
launch_kernel(compress_kernel_scaled<float>, threadGrid, threadBlock, 0, stream,
GPUArrayConstView1D<int>(indices),
GPUArrayConstView1D<typename gpu::fft::ComplexType<float>::type>(
input.data(), input.size(), input.device_id()),
output, scalingFactor);
} else {
launch_kernel(compress_kernel<float>, threadGrid, threadBlock, 0, stream, indices,
create_1d_view(input, 0, input.size()), output);
launch_kernel(compress_kernel<float>, threadGrid, threadBlock, 0, stream,
GPUArrayConstView1D<int>(indices),
GPUArrayConstView1D<typename gpu::fft::ComplexType<float>::type>(
input.data(), input.size(), input.device_id()),
output);
}
}

} // namespace spfft
10 changes: 4 additions & 6 deletions src/fft/fftw_plan_1d.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ struct FFTWPropHash {
assert(std::get<1>(tuple) < (1 << (sizeof(int) * 4 - 1)));
assert(std::get<2>(tuple) < (1 << (sizeof(int) * 4 - 1)));
const int sign = 2 * static_cast<int>(std::get<0>(tuple)) - 1;
return std::hash<int>()((sign * (std::get<1>(tuple)<< (sizeof(int) * 4 - 1)) + std::get<2>(tuple)));
return std::hash<int>()(
sign * ((std::get<1>(tuple) << (sizeof(int) * 4 - 1)) + std::get<2>(tuple) + 1));
}
};

Expand Down Expand Up @@ -81,9 +82,6 @@ class FFTWPlan {
int inembed[] = {n[0]};
int onembed[] = {n[0]};
auto flags = FFTW_ESTIMATE;
if (input != output) {
flags = flags | FFTW_DESTROY_INPUT; // allow input override for out-of-place transform
}

plan_ = FFTW<T>::plan_many_dft(
rank, n, (int)howmany,
Expand All @@ -110,7 +108,7 @@ class FFTWPlan {
int n[] = {(int)size};
int inembed[] = {n[0]};
int onembed[] = {n[0]};
auto flags = FFTW_ESTIMATE | FFTW_DESTROY_INPUT;
auto flags = FFTW_ESTIMATE;
plan_ = FFTW<T>::plan_many_dft_c2r(
rank, n, (int)howmany,
reinterpret_cast<typename FFTW<T>::ComplexType*>(const_cast<ComplexType*>(input)), inembed,
Expand All @@ -134,7 +132,7 @@ class FFTWPlan {
int n[] = {(int)size};
int inembed[] = {n[0]};
int onembed[] = {n[0]};
auto flags = FFTW_ESTIMATE | FFTW_DESTROY_INPUT;
auto flags = FFTW_ESTIMATE;
plan_ = FFTW<T>::plan_many_dft_r2c(rank, n, (int)howmany, const_cast<T*>(input), inembed,
(int)istride, (int)idist,
reinterpret_cast<typename FFTW<T>::ComplexType*>(output),
Expand Down
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ if(SPFFT_BUILD_TESTS)
run_local_tests.cpp
local_tests/test_host_array.cpp
local_tests/test_disjoint.cpp
local_tests/test_fftw_prop_hash.cpp
local_tests/test_local_transform.cpp
)
target_link_libraries(run_local_tests PRIVATE gtest_main gtest_mpi)
Expand Down
21 changes: 21 additions & 0 deletions tests/local_tests/test_fftw_prop_hash.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#include <unordered_set>
#include "gtest/gtest.h"
#include "fft/fftw_plan_1d.hpp"


TEST(FFTWPropHashTest, Unique) {
  // Verify that FFTWPropHash yields a distinct hash value for every
  // (inPlace, inputAlignment, outputAlignment) tuple in the tested domain.
  //
  // NOTE(review): the previous version inserted the tuples themselves into an
  // unordered_set keyed by FFTWPropHash. That cannot detect hash collisions:
  // an unordered container stores all distinct keys (equality is decided by
  // std::tuple::operator==, not by the hash), so set.size() always equals the
  // number of distinct tuples. Collect the hash values instead, so a single
  // colliding pair shrinks the set and fails the expectation.
  std::unordered_set<std::size_t> hashValues;
  spfft::FFTWPropHash hasher;

  // Alignments stay well below the hasher's asserted bound of
  // 1 << (sizeof(int) * 4 - 1).
  const int maxAlignment = 1024;

  for (int inPlace = 0; inPlace < 2; ++inPlace) {
    for (int i = 0; i < maxAlignment; ++i) {
      for (int j = 0; j < maxAlignment; ++j) {
        hashValues.insert(hasher(std::make_tuple(static_cast<bool>(inPlace), i, j)));
      }
    }
  }

  EXPECT_EQ(static_cast<std::size_t>(maxAlignment) * static_cast<std::size_t>(maxAlignment) * 2,
            hashValues.size());
}

0 comments on commit aa6653f

Please sign in to comment.