Use int64 stride everywhere (#1671)
* use int64 stride everywhere

* fix ext

* fix ext

* more shape + cleanup

* one more

* few more
awni authored Dec 9, 2024
1 parent 35b412c commit 40c62c1
Showing 102 changed files with 1,264 additions and 1,707 deletions.
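
The core of the change is the `Strides` alias in `mlx/array.h` (diff below): strides move from `std::vector<size_t>` to `std::vector<int64_t>`. A minimal standalone sketch (not MLX code) of why signed 64-bit strides make offset arithmetic safer than `size_t`:

```cpp
// Sketch: signed vs. unsigned stride arithmetic. With size_t, any
// subtraction that should go negative silently wraps around.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Row-major strides for a 2x3 array.
  std::vector<int64_t> strides = {3, 1};

  // Stepping an axis backwards needs a negative offset.
  int64_t back_step = -strides[0];
  std::cout << back_step << "\n"; // prints -3

  // The same expression on unsigned strides wraps instead of negating.
  std::vector<size_t> ustrides = {3, 1};
  std::cout << size_t(0) - ustrides[0] << "\n"; // prints 18446744073709551613
}
```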
26 changes: 6 additions & 20 deletions docs/src/dev/extensions.rst
@@ -420,8 +420,8 @@ element in the output.
constant const float& alpha [[buffer(3)]],
constant const float& beta [[buffer(4)]],
constant const int* shape [[buffer(5)]],
-constant const size_t* x_strides [[buffer(6)]],
-constant const size_t* y_strides [[buffer(7)]],
+constant const int64_t* x_strides [[buffer(6)]],
+constant const int64_t* y_strides [[buffer(7)]],
constant const int& ndim [[buffer(8)]],
uint index [[thread_position_in_grid]]) {
// Convert linear indices to offsets in array
@@ -438,24 +438,10 @@ each instantiation a unique host name so we can identify it.

.. code-block:: C++

-#define instantiate_axpby(type_name, type) \
-template [[host_name("axpby_general_" #type_name)]] \
-[[kernel]] void axpby_general<type>( \
-device const type* x [[buffer(0)]], \
-device const type* y [[buffer(1)]], \
-device type* out [[buffer(2)]], \
-constant const float& alpha [[buffer(3)]], \
-constant const float& beta [[buffer(4)]], \
-constant const int* shape [[buffer(5)]], \
-constant const size_t* x_strides [[buffer(6)]], \
-constant const size_t* y_strides [[buffer(7)]], \
-constant const int& ndim [[buffer(8)]], \
-uint index [[thread_position_in_grid]]);
-
-instantiate_axpby(float32, float);
-instantiate_axpby(float16, half);
-instantiate_axpby(bfloat16, bfloat16_t);
-instantiate_axpby(complex64, complex64_t);
+instantiate_kernel("axpby_general_float32", axpby_general, float)
+instantiate_kernel("axpby_general_float16", axpby_general, float16_t)
+instantiate_kernel("axpby_general_bfloat16", axpby_general, bfloat16_t)
+instantiate_kernel("axpby_general_complex64", axpby_general, complex64_t)

The logic to determine the kernel, set the inputs, resolve the grid dimensions,
and dispatch to the GPU are contained in :meth:`Axpby::eval_gpu` as shown
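The kernel above turns a linear thread index into a strided memory offset with `elem_to_loc`. A plain C++ sketch of what such a helper computes (inferred from the call site `elem_to_loc(index, shape, x_strides, ndim)`; MLX's actual Metal implementation may differ in detail):

```cpp
#include <cstdint>

// Decompose a linear index into per-axis coordinates (innermost axis
// first) and accumulate coordinate * stride for each axis. With int64_t
// strides the accumulator is signed, so no unsigned wraparound.
int64_t elem_to_loc_sketch(
    int64_t index,
    const int* shape,
    const int64_t* strides,
    int ndim) {
  int64_t loc = 0;
  for (int i = ndim - 1; i >= 0; --i) {
    loc += (index % shape[i]) * strides[i];
    index /= shape[i];
  }
  return loc;
}
```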
31 changes: 8 additions & 23 deletions examples/extensions/axpby/axpby.metal
@@ -12,8 +12,8 @@ template <typename T>
constant const float& alpha [[buffer(3)]],
constant const float& beta [[buffer(4)]],
constant const int* shape [[buffer(5)]],
-constant const size_t* x_strides [[buffer(6)]],
-constant const size_t* y_strides [[buffer(7)]],
+constant const int64_t* x_strides [[buffer(6)]],
+constant const int64_t* y_strides [[buffer(7)]],
constant const int& ndim [[buffer(8)]],
uint index [[thread_position_in_grid]]) {
auto x_offset = elem_to_loc(index, shape, x_strides, ndim);
@@ -34,29 +34,14 @@ template <typename T>
static_cast<T>(alpha) * x[index] + static_cast<T>(beta) * y[index];
}

-#define instantiate_axpby(type_name, type) \
-template [[host_name("axpby_general_" #type_name)]] [[kernel]] void \
-axpby_general<type>( \
-device const type* x [[buffer(0)]], \
-device const type* y [[buffer(1)]], \
-device type* out [[buffer(2)]], \
-constant const float& alpha [[buffer(3)]], \
-constant const float& beta [[buffer(4)]], \
-constant const int* shape [[buffer(5)]], \
-constant const size_t* x_strides [[buffer(6)]], \
-constant const size_t* y_strides [[buffer(7)]], \
-constant const int& ndim [[buffer(8)]], \
-uint index [[thread_position_in_grid]]); \
-template [[host_name("axpby_contiguous_" #type_name)]] [[kernel]] void \
-axpby_contiguous<type>( \
-device const type* x [[buffer(0)]], \
-device const type* y [[buffer(1)]], \
-device type* out [[buffer(2)]], \
-constant const float& alpha [[buffer(3)]], \
-constant const float& beta [[buffer(4)]], \
-uint index [[thread_position_in_grid]]);
+// clang-format off
+#define instantiate_axpby(type_name, type) \
+instantiate_kernel("axpby_general_" #type_name, axpby_general, type) \
+instantiate_kernel( \
+"axpby_contiguous_" #type_name, axpby_contiguous, type)

instantiate_axpby(float32, float);
instantiate_axpby(float16, half);
instantiate_axpby(bfloat16, bfloat16_t);
instantiate_axpby(complex64, complex64_t);
+// clang-format on
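
The hand-written `[[host_name(...)]]` instantiations deleted above are what the new `instantiate_kernel` shorthand generates. A sketch of the kind of macro this implies (inferred from the before/after; the real definition lives in MLX's Metal kernel headers):

```cpp
// Inferred sketch: decltype(func<...>) lets a single macro instantiate
// any kernel template without restating its full parameter list.
#define instantiate_kernel(name, func, ...) \
  template [[host_name(name)]] [[kernel]]   \
  decltype(func<__VA_ARGS__>) func<__VA_ARGS__>;
```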
2 changes: 1 addition & 1 deletion mlx/array.h
@@ -18,7 +18,7 @@ class Primitive;

using Deleter = std::function<void(allocator::Buffer)>;
using Shape = std::vector<int32_t>;
-using Strides = std::vector<size_t>;
+using Strides = std::vector<int64_t>;

class array {
/* An array is really a node in a graph. It contains a shared ArrayDesc
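With `Shape` as `int32_t` and `Strides` as `int64_t` side by side, shape-to-stride math stays signed end to end. A small standalone example mirroring these aliases (not calling into MLX):

```cpp
#include <cstdint>
#include <vector>

using Shape = std::vector<int32_t>;
using Strides = std::vector<int64_t>;

// Row-major (C-order) strides for a shape, accumulated in int64_t so the
// products cannot overflow 32 bits for large arrays.
Strides make_contiguous_strides(const Shape& shape) {
  Strides strides(shape.size(), 1);
  for (int i = static_cast<int>(shape.size()) - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * shape[i + 1];
  }
  return strides;
}
// e.g. make_contiguous_strides({2, 3, 4}) == {12, 4, 1}
```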
4 changes: 2 additions & 2 deletions mlx/backend/common/arg_reduce.cpp
@@ -13,8 +13,8 @@ template <typename InT, typename OpT>
void arg_reduce(const array& in, array& out, const OpT& op, int axis) {
auto axis_size = in.shape()[axis];
auto axis_stride = in.strides()[axis];
-std::vector<size_t> strides = in.strides();
-std::vector<int> shape = in.shape();
+Strides strides = in.strides();
+Shape shape = in.shape();
strides.erase(strides.begin() + axis);
shape.erase(shape.begin() + axis);
for (uint32_t i = 0; i < out.size(); ++i) {
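The two `erase` calls above drop the reduced axis before walking the output. A worked example of that bookkeeping (hypothetical values, mirroring the code above):

```cpp
#include <cstdint>
#include <vector>

int main() {
  // A row-major (2, 3, 4) input, reduced over axis 1.
  std::vector<int32_t> shape = {2, 3, 4};
  std::vector<int64_t> strides = {12, 4, 1};
  int axis = 1;

  auto axis_size = shape[axis];     // 3: elements the op scans per output
  auto axis_stride = strides[axis]; // 4: step between those elements

  shape.erase(shape.begin() + axis);     // {2, 4}
  strides.erase(strides.begin() + axis); // {12, 1}

  // The remaining shape/strides enumerate the 8 output elements; each
  // maps to the start of one length-3 line walked with axis_stride.
  (void)axis_size;
  (void)axis_stride;
}
```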
28 changes: 14 additions & 14 deletions mlx/backend/common/binary.h
@@ -178,10 +178,10 @@ void binary_op_dims(
const T* b,
U* out,
Op op,
-const std::vector<int>& shape,
-const std::vector<size_t>& a_strides,
-const std::vector<size_t>& b_strides,
-const std::vector<size_t>& out_strides,
+const Shape& shape,
+const Strides& a_strides,
+const Strides& b_strides,
+const Strides& out_strides,
int axis) {
auto stride_a = a_strides[axis];
auto stride_b = b_strides[axis];
@@ -212,10 +212,10 @@ void binary_op_dispatch_dims(
array& out,
Op op,
int dim,
-const std::vector<int>& shape,
-const std::vector<size_t>& a_strides,
-const std::vector<size_t>& b_strides,
-const std::vector<size_t>& out_strides) {
+const Shape& shape,
+const Strides& a_strides,
+const Strides& b_strides,
+const Strides& out_strides) {
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* out_ptr = out.data<U>();
@@ -258,10 +258,10 @@ void binary_op_dispatch_dims(
return;
}

-ContiguousIterator<size_t> a_it(shape, a_strides, dim - 3);
-ContiguousIterator<size_t> b_it(shape, b_strides, dim - 3);
-size_t stride = out_strides[dim - 4];
-for (size_t elem = 0; elem < a.size(); elem += stride) {
+ContiguousIterator a_it(shape, a_strides, dim - 3);
+ContiguousIterator b_it(shape, b_strides, dim - 3);
+auto stride = out_strides[dim - 4];
+for (int64_t elem = 0; elem < a.size(); elem += stride) {
binary_op_dims<T, U, Op, 3, Strided>(
a_ptr + a_it.loc,
b_ptr + b_it.loc,
@@ -327,7 +327,7 @@ void binary_op(
const auto& strides = new_strides[2];

// Get the left-most dim such that the array is row contiguous after
-auto leftmost_rc_dim = [&strides](const std::vector<size_t>& arr_strides) {
+auto leftmost_rc_dim = [&strides](const auto& arr_strides) {
int d = arr_strides.size() - 1;
for (; d >= 0 && arr_strides[d] == strides[d]; d--) {
}
@@ -337,7 +337,7 @@ void binary_op(
auto b_rc_dim = leftmost_rc_dim(b_strides);

// Get the left-most dim such that the array is a broadcasted "scalar" after
-auto leftmost_s_dim = [](const std::vector<size_t>& arr_strides) {
+auto leftmost_s_dim = [](const auto& arr_strides) {
int d = arr_strides.size() - 1;
for (; d >= 0 && arr_strides[d] == 0; d--) {
}
14 changes: 7 additions & 7 deletions mlx/backend/common/binary_two.h
@@ -16,10 +16,10 @@ void binary_op_dims(
U* out_a,
U* out_b,
Op op,
-const std::vector<int>& shape,
-const std::vector<size_t>& a_strides,
-const std::vector<size_t>& b_strides,
-const std::vector<size_t>& out_strides,
+const Shape& shape,
+const Strides& a_strides,
+const Strides& b_strides,
+const Strides& out_strides,
int axis) {
auto stride_a = a_strides[axis];
auto stride_b = b_strides[axis];
@@ -96,9 +96,9 @@ void binary_op_dispatch_dims(
return;
}

-ContiguousIterator<size_t> a_it(shape, a_strides, ndim - 2);
-ContiguousIterator<size_t> b_it(shape, b_strides, ndim - 2);
-size_t stride = out_strides[ndim - 3];
+ContiguousIterator a_it(shape, a_strides, ndim - 2);
+ContiguousIterator b_it(shape, b_strides, ndim - 2);
+auto stride = out_strides[ndim - 3];
for (size_t elem = 0; elem < a.size(); elem += stride) {
binary_op_dims<T, U, Op, 2>(
a_ptr + a_it.loc,
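`ContiguousIterator` used to be templated on its stride type; with `Strides` fixed to `int64_t` the template parameter disappears. A rough sketch of the iterator's job, inferred from the call sites above (a `(shape, strides, dims)` constructor plus a flat `loc` offset); this is not MLX's implementation:

```cpp
#include <cstdint>
#include <vector>

// Sketch: odometer over the outer `dims` axes of a strided array,
// exposing the flat offset (loc) of the current inner-contiguous block.
struct ContiguousIteratorSketch {
  int64_t loc = 0;
  std::vector<int32_t> shape_;
  std::vector<int64_t> strides_, pos_;

  ContiguousIteratorSketch(
      const std::vector<int32_t>& shape,
      const std::vector<int64_t>& strides,
      int dims)
      : shape_(shape.begin(), shape.begin() + dims),
        strides_(strides.begin(), strides.begin() + dims),
        pos_(dims, 0) {}

  void step() {
    for (int i = static_cast<int>(pos_.size()) - 1; i >= 0; --i) {
      loc += strides_[i];
      if (++pos_[i] < shape_[i]) {
        return;
      }
      // Carry: rewind this axis, move on to the next outer one.
      loc -= int64_t(shape_[i]) * strides_[i];
      pos_[i] = 0;
    }
  }
};
```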
29 changes: 8 additions & 21 deletions mlx/backend/common/common.cpp
@@ -49,7 +49,7 @@ void Broadcast::eval(const std::vector<array>& inputs, array& out) {
out.set_data(nullptr);
return;
}
-std::vector<size_t> strides(out.ndim(), 0);
+Strides strides(out.ndim(), 0);
int diff = out.ndim() - in.ndim();
for (int i = in.ndim() - 1; i >= 0; --i) {
strides[i + diff] = (in.shape()[i] == 1) ? 0 : in.strides()[i];
@@ -141,7 +141,7 @@ void NumberOfElements::eval(const std::vector<array>& inputs, array& out) {
}
}

-std::pair<bool, std::vector<size_t>> Reshape::prepare_reshape(
+std::pair<bool, Strides> Reshape::prepare_reshape(
const array& in,
const array& out) {
// Special case for empty arrays or row contiguous arrays
@@ -151,16 +151,15 @@ std::pair<bool, std::vector<size_t>> Reshape::prepare_reshape(

// Special case for scalars
if (in.ndim() == 0) {
-std::vector<size_t> out_strides(out.ndim(), 0);
-return {false, out_strides};
+return {false, Strides(out.ndim(), 0)};
}

// Firstly let's collapse all the contiguous dimensions of the input
auto [shape, strides] = collapse_contiguous_dims(in);

// If shapes fit exactly in the contiguous dims then no copy is necessary so
// let's check.
-std::vector<size_t> out_strides;
+Strides out_strides;
bool copy_necessary = false;
int j = 0;
for (int i = 0; i < out.ndim(); i++) {
@@ -183,7 +182,7 @@ std::pair<bool, std::vector<size_t>> Reshape::prepare_reshape(

void Reshape::shared_buffer_reshape(
const array& in,
-const std::vector<size_t>& out_strides,
+const Strides& out_strides,
array& out) {
auto flags = in.flags();
if (flags.row_contiguous) {
@@ -249,26 +248,14 @@ void Split::eval(
}
}

-std::tuple<int64_t, std::vector<int64_t>> SliceUpdate::prepare_slice(
-const array& in) {
-int64_t data_offset = 0;
-std::vector<int64_t> inp_strides(in.ndim(), 0);
-for (int i = 0; i < in.ndim(); ++i) {
-data_offset += start_indices_[i] * in.strides()[i];
-inp_strides[i] = in.strides()[i] * strides_[i];
-}
-
-return std::make_tuple(data_offset, inp_strides);
-}
-
void StopGradient::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
move_or_copy(inputs[0], out);
}

void Transpose::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
-std::vector<size_t> out_strides(out.ndim());
+Strides out_strides(out.ndim());
auto& in = inputs[0];
for (int ax = 0; ax < axes_.size(); ++ax) {
out_strides[ax] = in.strides()[axes_[ax]];
@@ -285,8 +272,8 @@ void Transpose::eval(const std::vector<array>& inputs, array& out) {
// true, they stay true)
auto flags = in.flags();
if (flags.contiguous && in.data_size() == in.size()) {
-size_t f_stride = 1;
-size_t b_stride = 1;
+int64_t f_stride = 1;
+int64_t b_stride = 1;
flags.col_contiguous = true;
flags.row_contiguous = true;
for (int i = 0, ri = out.ndim() - 1; i < out.ndim(); ++i, --ri) {
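`Transpose::eval` above never touches data; it only permutes strides (`out_strides[ax] = in.strides()[axes_[ax]]`). The same idea as a standalone helper:

```cpp
#include <cstdint>
#include <vector>

// Transposing a view = reordering which stride belongs to which axis.
std::vector<int64_t> transpose_strides(
    const std::vector<int64_t>& in_strides,
    const std::vector<int>& axes) {
  std::vector<int64_t> out_strides(axes.size());
  for (size_t ax = 0; ax < axes.size(); ++ax) {
    out_strides[ax] = in_strides[axes[ax]];
  }
  return out_strides;
}
// e.g. a row-major (2, 3, 4) array has strides {12, 4, 1}; axes {2, 0, 1}
// gives strides {1, 12, 4} over the same buffer.
```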
2 changes: 1 addition & 1 deletion mlx/backend/common/compiled.cpp
@@ -165,7 +165,7 @@ void compiled_allocate_outputs(
bool move_buffers /* = false */) {
if (contiguous) {
int o = 0;
-std::vector<size_t> strides;
+Strides strides;
size_t data_size;
array::Flags flags;
for (int i = 0; i < inputs.size() && o < outputs.size(); ++i) {
14 changes: 7 additions & 7 deletions mlx/backend/common/conv.cpp
@@ -746,9 +746,9 @@ void explicit_gemm_conv_1D_cpu(
copy_inplace(in, in_padded_slice, CopyType::GeneralGeneral);

// Make strided view
-std::vector<int> strided_shape = {N, oH, wH, C};
+Shape strided_shape = {N, oH, wH, C};

-std::vector<size_t> strided_strides = {
+Strides strided_strides = {
in_padded.strides()[0],
in_padded.strides()[1] * wt_strides[0],
in_padded.strides()[1],
@@ -865,9 +865,9 @@ void explicit_gemm_conv_2D_cpu(
copy_inplace(in, in_padded_slice, CopyType::GeneralGeneral);

// Make strided view
-std::vector<int> strided_shape = {N, oH, oW, wH, wW, C};
+Shape strided_shape = {N, oH, oW, wH, wW, C};

-std::vector<size_t> strided_strides = {
+Strides strided_strides = {
in_padded.strides()[0],
in_padded.strides()[1] * wt_strides[0],
in_padded.strides()[2] * wt_strides[1],
@@ -974,7 +974,7 @@ void explicit_gemm_conv_ND_cpu(
copy_inplace(in, in_padded_slice, CopyType::GeneralGeneral);

// Make strided view
-std::vector<int> strided_shape(oDim.size() + wDim.size() + 2);
+Shape strided_shape(oDim.size() + wDim.size() + 2);
strided_shape.front() = N;
for (size_t i = 0; i < oDim.size(); i++) {
strided_shape[i + 1] = oDim[i];
@@ -984,7 +984,7 @@ void explicit_gemm_conv_ND_cpu(
}
strided_shape.back() = C;

-std::vector<size_t> strided_strides(in.shape().size() * 2 - 2);
+Strides strided_strides(in.shape().size() * 2 - 2);
strided_strides[0] = in_padded.strides()[0];
for (size_t i = 0; i < wt_strides.size(); i++) {
strided_strides[i + 1] = in_padded.strides()[i + 1] * wt_strides[i];
@@ -1000,7 +1000,7 @@ void explicit_gemm_conv_ND_cpu(
in_padded, strided_strides, flags, in_strided_view.size(), 0);

// Materialize strided view
-std::vector<int> strided_reshape = {N, C};
+Shape strided_reshape = {N, C};
for (const auto& o : oDim) {
strided_reshape[0] *= o;
}
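The conv hunks above build im2col-style strided views: a `(N, oH, wH, C)` window view aliases the padded input, so overlapping convolution windows are expressed with strides instead of copies. A small sketch of the 1D stride computation shown above (names hypothetical; `stride` is the convolution stride `wt_strides[0]`):

```cpp
#include <cstdint>
#include <vector>

using Strides = std::vector<int64_t>;

// Window-view strides for a padded (N, H, C) input, mirroring the
// strided_strides computation in explicit_gemm_conv_1D_cpu above.
Strides conv1d_window_strides(const Strides& in_strides, int64_t stride) {
  return Strides{
      in_strides[0],          // N: next batch element
      in_strides[1] * stride, // oH: hop `stride` input rows per output
      in_strides[1],          // wH: walk within one window
      in_strides[2],          // C: channels
  };
}
```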
(93 more changed files not shown)
