oneapi-src · SergeyKopienko · Sep 18, 2024 · Sep 18, 2024 · Sep 19, 2024 · Sep 19, 2024
diff --git a/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h b/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h
@@ -42,10 +42,9 @@ __pattern_walk1_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For
         oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator>();
     auto __buf = __keep(__first, __last);
 
-    auto __future_obj = oneapi::dpl::__par_backend_hetero::__parallel_for(
-        _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
-        unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf.all_view());
-    return __future_obj;
+    return oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
+                                                             unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f},
+                                                             __n, __buf.all_view());
 }
 
 template <__par_backend_hetero::access_mode __acc_mode1 = __par_backend_hetero::access_mode::read,
@@ -69,7 +68,8 @@ __pattern_walk2_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For
         _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
         unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf1.all_view(), __buf2.all_view());
 
-    return __future.__make_future(__first2 + __n);
+    using _f_type = decltype(__future);
+    return _f_type::__make_future(std::move(__future), __first2 + __n);
 }
 
 template <typename _BackendTag, typename _ExecutionPolicy, typename _ForwardIterator1, typename _ForwardIterator2,
@@ -96,7 +96,8 @@ __pattern_walk3_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For
                                                           unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n,
                                                           __buf1.all_view(), __buf2.all_view(), __buf3.all_view());
 
-    return __future.__make_future(__first3 + __n);
+    using _f_type = decltype(__future);
+    return _f_type::__make_future(std::move(__future), __first3 + __n);
 }
 
 template <typename _BackendTag, typename _ExecutionPolicy, typename _ForwardIterator1, typename _ForwardIterator2,
@@ -201,10 +202,12 @@ __pattern_transform_scan_base_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&
     auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator2>();
     auto __buf2 = __keep2(__result, __result + __n);
 
-    auto __res = oneapi::dpl::__par_backend_hetero::__parallel_transform_scan(
+    auto __future = oneapi::dpl::__par_backend_hetero::__parallel_transform_scan(
         _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __n, __unary_op,
         __init, __binary_op, _Inclusive{});
-    return __res.__make_future(__result + __n);
+
+    using _f_type = decltype(__future);
+    return _f_type::__make_future(std::move(__future), __result + __n);
 }
 
 template <typename _BackendTag, typename _ExecutionPolicy, typename _Iterator1, typename _Iterator2,

diff --git a/include/oneapi/dpl/internal/async_impl/glue_async_impl.h b/include/oneapi/dpl/internal/async_impl/glue_async_impl.h
@@ -46,10 +46,9 @@ transform_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first, _ForwardIt
     const auto __dispatch_tag = oneapi::dpl::__internal::__select_backend(__exec, __first, __result);
 
     wait_for_all(::std::forward<_Events>(__dependencies)...);
-    auto ret_val = oneapi::dpl::__internal::__pattern_walk2_async(
+    return oneapi::dpl::__internal::__pattern_walk2_async(
         __dispatch_tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __result,
         oneapi::dpl::__internal::__transform_functor<_UnaryOperation>{::std::move(__op)});
-    return ret_val;
 }
 
 template <class _ExecutionPolicy, class _ForwardIterator1, class _ForwardIterator2, class _ForwardIterator,
@@ -64,10 +63,9 @@ transform_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardI
     const auto __dispatch_tag = oneapi::dpl::__internal::__select_backend(__exec, __first1, __first2, __result);
 
     wait_for_all(::std::forward<_Events>(__dependencies)...);
-    auto ret_val = oneapi::dpl::__internal::__pattern_walk3_async(
+    return oneapi::dpl::__internal::__pattern_walk3_async(
         __dispatch_tag, ::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __result,
         oneapi::dpl::__internal::__transform_functor<_BinaryOperation>(::std::move(__op)));
-    return ret_val;
 }
 
 // [async.copy]
@@ -80,10 +78,9 @@ copy_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first, _ForwardIterato
     auto __dispatch_tag = oneapi::dpl::__internal::__select_backend(__exec, __first, __result);
 
     wait_for_all(::std::forward<_Events>(__dependencies)...);
-    auto ret_val = oneapi::dpl::__internal::__pattern_walk2_brick_async(
+    return oneapi::dpl::__internal::__pattern_walk2_brick_async(
         __dispatch_tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __result,
         oneapi::dpl::__internal::__brick_copy<decltype(__dispatch_tag), _ExecutionPolicy>{});
-    return ret_val;
 }
 
 // [async.sort]
@@ -127,9 +124,8 @@ for_each_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIter
     const auto __dispatch_tag = oneapi::dpl::__internal::__select_backend(__exec, __first);
 
     wait_for_all(::std::forward<_Events>(__dependencies)...);
-    auto ret_val = oneapi::dpl::__internal::__pattern_walk1_async(
-        __dispatch_tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __f);
-    return ret_val;
+    return oneapi::dpl::__internal::__pattern_walk1_async(__dispatch_tag, ::std::forward<_ExecutionPolicy>(__exec),
+                                                          __first, __last, __f);
 }
 
 // [async.reduce]
@@ -144,10 +140,9 @@ reduce_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterat
     const auto __dispatch_tag = oneapi::dpl::__internal::__select_backend(__exec, __first);
 
     wait_for_all(::std::forward<_Events>(__dependencies)...);
-    auto ret_val = oneapi::dpl::__internal::__pattern_transform_reduce_async(
+    return oneapi::dpl::__internal::__pattern_transform_reduce_async(
         __dispatch_tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __init, __binary_op,
         oneapi::dpl::__internal::__no_op());
-    return ret_val;
 }
 
 template <class _ExecutionPolicy, class _ForwardIt, class... _Events,

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -242,7 +242,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
                 __brick(__idx, __rngs...);
             });
         });
-        return __future(__event);
+        return __future<sycl::event>(std::move(__event));
     }
 };
 
@@ -372,7 +372,8 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
             });
         });
 
-        return __future(__final_event, __result_and_scratch);
+        return __future<sycl::event, __result_and_scratch_storage<_ExecutionPolicy, typename _InitType::__value_type>>{
+            std::move(__final_event), std::move(__result_and_scratch)};
     }
 };
 
@@ -644,7 +645,8 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W
                     }
                 });
         });
-        return __future(__event, __result);
+        return __future<sycl::event, __result_and_scratch_storage<_Policy, _Size>>{std::move(__event),
+                                                                                   std::move(__result)};
     }
 };
 
@@ -700,7 +702,10 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend
                         /* _IsFullGroup= */ ::std::false_type, _Inclusive, _CustomName>>>()(
                     ::std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
                     std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op);
-            return __future(__event, __dummy_result_and_scratch);
+
+            return __future<sycl::event,
+                            __result_and_scratch_storage<_ExecutionPolicy, typename _InitType::__value_type>>{
+                std::move(__event), std::move(__dummy_result_and_scratch)};
         };
         if (__n <= 16)
             return __single_group_scan_f(std::integral_constant<::std::uint16_t, 16>{});
@@ -734,7 +739,8 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend
             __parallel_transform_scan_dynamic_single_group_submitter<_Inclusive::value, _DynamicGroupScanKernel>()(
                 std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
                 std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op, __max_wg_size);
-        return __future(__event, __dummy_result_and_scratch);
+        return __future<sycl::event, __result_and_scratch_storage<_ExecutionPolicy, typename _InitType::__value_type>>{
+            std::move(__event), std::move(__dummy_result_and_scratch)};
     }
 }
 
@@ -1866,7 +1872,7 @@ struct __parallel_partial_sort_submitter<__internal::__optional_kernel_name<_Glo
             });
         }
         // return future and extend lifetime of temporary buffer
-        return __future(__event1);
+        return __future<sycl::event>(std::move(__event1));
     }
 };
 

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h
@@ -75,7 +75,7 @@ struct __parallel_for_fpga_submitter<__internal::__optional_kernel_name<_Name...
                 }
             });
         });
-        return __future(__event);
+        return __future<sycl::event>(std::move(__event));
     }
 };
 

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h
@@ -516,19 +516,19 @@ __parallel_histogram_select_kernel(oneapi::dpl::__internal::__device_backend_tag
     // if bins fit into registers, use register private accumulation
     if (__num_bins <= __max_work_item_private_bins)
     {
-        return __future(
+        return __future<sycl::event>(
             __histogram_general_registers_local_reduction<__iters_per_work_item, __max_work_item_private_bins>(
-                __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size,
-                ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager));
+                __backend_tag, std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size,
+                std::forward<_Range1>(__input), std::forward<_Range2>(__bins), __binhash_manager));
     }
     // if bins fit into SLM, use local atomics
     else if (__num_bins * sizeof(_local_histogram_type) +
                  __binhash_manager.get_required_SLM_elements() * sizeof(_extra_memory_type) <
              __local_mem_size)
     {
-        return __future(__histogram_general_local_atomics<__iters_per_work_item>(
-            __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size,
-            ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager));
+        return __future<sycl::event>(__histogram_general_local_atomics<__iters_per_work_item>(
+            __backend_tag, std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size,
+            std::forward<_Range1>(__input), std::forward<_Range2>(__bins), __binhash_manager));
     }
     else // otherwise, use global atomics (private copies per workgroup)
     {
@@ -537,9 +537,9 @@ __parallel_histogram_select_kernel(oneapi::dpl::__internal::__device_backend_tag
         // suggestion which but global memory limitations may increase this value to be able to fit the workgroup
         // private copies of the histogram bins in global memory.  No unrolling is taken advantage of here because it
         // is a runtime argument.
-        return __future(__histogram_general_private_global_atomics(
-            __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, __iters_per_work_item,
-            __work_group_size, ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager));
+        return __future<sycl::event>(__histogram_general_private_global_atomics(
+            __backend_tag, std::forward<_ExecutionPolicy>(__exec), __init_event, __iters_per_work_item,
+            __work_group_size, std::forward<_Range1>(__input), std::forward<_Range2>(__bins), __binhash_manager));
     }
 }
 

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
@@ -162,7 +162,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N
                                __comp);
             });
         });
-        return __future(__event);
+        return __future<sycl::event>(std::move(__event));
     }
 };
 

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h
@@ -303,7 +303,7 @@ struct __parallel_sort_submitter<_IdType, __internal::__optional_kernel_name<_Le
             });
         }
 
-        return __future(__event1);
+        return __future<sycl::event>(std::move(__event1));
     }
 };
 

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
@@ -866,7 +866,7 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP
         }
     }
 
-    return __future(__event);
+    return __future<sycl::event>(std::move(__event));
 }
 
 } // namespace __par_backend_hetero

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
@@ -154,7 +154,8 @@ struct __parallel_transform_reduce_small_submitter<_Tp, _Commutative, _VecSize,
                 });
         });
 
-        return __future(__reduce_event, __scratch_container);
+        return __future<sycl::event, __result_and_scratch_storage<_ExecutionPolicy, _Tp>>{
+            std::move(__reduce_event), std::move(__scratch_container)};
     }
 }; // struct __parallel_transform_reduce_small_submitter
 
@@ -238,7 +239,7 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative
     auto
     operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, sycl::event& __reduce_event,
                const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op,
-               _InitType __init, const __result_and_scratch_storage<_ExecutionPolicy2, _Tp>& __scratch_container) const
+               _InitType __init, __result_and_scratch_storage<_ExecutionPolicy2, _Tp>& __scratch_container) const
     {
         using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>;
         auto __transform_pattern =
@@ -268,7 +269,8 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative
                 });
         });
 
-        return __future(__reduce_event, __scratch_container);
+        return __future<sycl::event, __result_and_scratch_storage<_ExecutionPolicy2, _Tp>>{
+            std::move(__reduce_event), std::move(__scratch_container)};
     }
 }; // struct __parallel_transform_reduce_work_group_kernel_submitter
 
@@ -300,7 +302,7 @@ __parallel_transform_reduce_mid_impl(oneapi::dpl::__internal::__device_backend_t
     // __n_groups preliminary results from the device kernel.
     return __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative, _VecSize,
                                                                    _ReduceWorkGroupKernel>()(
-        __backend_tag, std::forward<_ExecutionPolicy>(__exec), __reduce_event, __n_groups, __work_group_size,
+        __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::move(__reduce_event), __n_groups, __work_group_size,
         __iters_per_work_item_work_group_kernel, __reduce_op, __init, __scratch_container);
 }
 
@@ -418,7 +420,8 @@ struct __parallel_transform_reduce_impl
             __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __size_per_work_group);
         } while (__n > 1);
 
-        return __future(__reduce_event, __scratch_container);
+        return __future<sycl::event, __result_and_scratch_storage<_ExecutionPolicy, _Tp>>{
+            std::move(__reduce_event), std::move(__scratch_container)};
     }
 }; // struct __parallel_transform_reduce_impl
 

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h
@@ -863,7 +863,8 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_
             __inputs_per_item = __inputs_per_sub_group / __sub_group_size;
         }
     }
-    return __future(__event, __result_and_scratch);
+    return __future<sycl::event, __result_and_scratch_storage<_ExecutionPolicy, typename _InitType::__value_type>>{
+        std::move(__event), std::move(__result_and_scratch)};
 }
 
 } // namespace __par_backend_hetero

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -721,8 +721,19 @@ class __future : private std::tuple<_Args...>
     }
 
   public:
-    __future(_Event __e, _Args... __args) : std::tuple<_Args...>(__args...), __my_event(__e) {}
-    __future(_Event __e, std::tuple<_Args...> __t) : std::tuple<_Args...>(__t), __my_event(__e) {}
+    __future(_Event&& __e) : __my_event(std::move(__e)) {}
+    __future(_Event&& __e, const std::tuple<_Args...>& __data)
+        : std::tuple<_Args...>(__data), __my_event(std::move(__e))
+    {
+    }
+    __future(_Event&& __e, std::tuple<_Args...>&& __data)
+        : std::tuple<_Args...>(std::move(__data)), __my_event(std::move(__e))
+    {
+    }
+
+    __future(const __future&) = delete;
+    __future&
+    operator=(const __future&) = delete;
 
     auto
     event() const
@@ -767,13 +778,13 @@ class __future : private std::tuple<_Args...>
 
     //The internal API. There are cases where the implementation specifies return value  "higher" than SYCL backend,
     //where a future is created.
-    template <typename _T>
-    auto
-    __make_future(_T __t) const
+    template <typename _OtherEvent, typename... _OtherArgs, typename... _AddArgs>
+    static __future<_OtherEvent, _AddArgs..., _OtherArgs...>
+    __make_future(__future<_OtherEvent, _OtherArgs...>&& __f, _AddArgs... __add_args)
     {
-        auto new_val = std::tuple<_T>(__t);
-        auto new_tuple = std::tuple_cat(new_val, (std::tuple<_Args...>)*this);
-        return __future<_Event, _T, _Args...>(__my_event, new_tuple);
+        return __future<_OtherEvent, _AddArgs..., _OtherArgs...>{
+            std::move(__f.__my_event), std::tuple_cat(std::tuple<_AddArgs...>(std::forward<_AddArgs>(__add_args)...),
+                                                      static_cast<std::tuple<_OtherArgs...>&&>(__f))};
     }
 };