diff --git a/tests/ttnn/unit_tests/gtests/CMakeLists.txt b/tests/ttnn/unit_tests/gtests/CMakeLists.txt
index ef4c1177c7e6..d884c453334f 100644
--- a/tests/ttnn/unit_tests/gtests/CMakeLists.txt
+++ b/tests/ttnn/unit_tests/gtests/CMakeLists.txt
@@ -26,8 +26,10 @@ set(TTNN_TENSOR_UNIT_TESTS_SRC
     ${CMAKE_CURRENT_SOURCE_DIR}/tensor/test_create_tensor_multi_device.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/tensor/test_create_tensor_with_layout.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/tensor/test_distributed_tensor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/tensor/test_partition.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/tensor/test_shape_base.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/tensor/test_sharding_with_alignment.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/tensor/test_vector_conversion.cpp
 )

 add_executable(unit_tests_ttnn ${TTNN_UNIT_TESTS_SRC})
diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_distributed_tensor.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_distributed_tensor.cpp
index a66c1b737497..e60ee4912e3e 100644
--- a/tests/ttnn/unit_tests/gtests/tensor/test_distributed_tensor.cpp
+++ b/tests/ttnn/unit_tests/gtests/tensor/test_distributed_tensor.cpp
@@ -4,12 +4,10 @@

 #include

-#include "common/bfloat16.hpp"
 #include "ttnn/distributed/api.hpp"
 #include "ttnn/operations/functions.hpp"
 #include "ttnn/tensor/xtensor/conversion_utils.hpp"
 #include "ttnn_test_fixtures.hpp"
-#include
 #include
 #include

@@ -20,7 +18,8 @@ using ::ttnn::experimental::xtensor::from_vector;
 using TensorDistributionTest = T3kMultiDeviceFixture;

 TEST_F(TensorDistributionTest, Replication) {
-    Tensor input_tensor = from_vector(std::vector{42.F, 13.F, -99.F}, ttnn::Shape{1, 1, 1, 3});
+    Tensor input_tensor =
+        from_vector(std::vector{42.F, 13.F, -99.F}, ttnn::SimpleShape{1, 1, 1, 3}, DataType::FLOAT32);

     auto mapper = api::replicate_tensor_to_mesh_mapper(*mesh_device_);
     Tensor replicated_tensor = api::distribute_tensor(input_tensor, *mesh_device_, *mapper);
@@ -28,13 +27,14 @@
     std::vector device_tensors = api::get_device_tensors(replicated_tensor);
     EXPECT_EQ(device_tensors.size(), mesh_device_->num_devices());
     for (const auto& device_tensor : device_tensors) {
-        EXPECT_TRUE(ttnn::allclose(device_tensor.cpu(), input_tensor));
+        EXPECT_TRUE(ttnn::allclose(device_tensor.cpu(), input_tensor));
     }
 }

 TEST_F(TensorDistributionTest, Shard1DInvalidDim) {
     const int num_devices = mesh_device_->num_devices();
-    Tensor input_tensor = from_vector(std::vector(num_devices, 0), ttnn::Shape{1, 1, 1, num_devices});
+    Tensor input_tensor =
+        from_vector(std::vector(num_devices, 0), ttnn::SimpleShape{1, 1, 1, num_devices}, DataType::FLOAT32);

     EXPECT_ANY_THROW({
         auto mapper = api::shard_tensor_to_mesh_mapper(*mesh_device_, -1);
@@ -50,7 +50,8 @@ TEST_F(TensorDistributionTest, Shard1DInvalidDim) {
 TEST_F(TensorDistributionTest, Shard1DTooFewShards) {
     const int num_devices = mesh_device_->num_devices();
     ASSERT_LT(3, num_devices);
-    Tensor input_tensor = from_vector(std::vector{42.F, 13.F, -99.F}, ttnn::Shape{1, 1, 1, 3});
+    Tensor input_tensor =
+        from_vector(std::vector{42.F, 13.F, -99.F}, ttnn::SimpleShape{1, 1, 1, 3}, DataType::FLOAT32);

     EXPECT_ANY_THROW({
         auto mapper = api::shard_tensor_to_mesh_mapper(*mesh_device_, 3);
@@ -64,7 +65,7 @@ TEST_F(TensorDistributionTest, Shard1D) {
     for (int i = 0; i < num_devices; i++) {
         test_data.insert(test_data.end(), {i * 1.F, i * 2.F, i * 3.F});
     }
-    Tensor input_tensor = from_vector(test_data, ttnn::Shape{1, num_devices, 3, 1});
+    Tensor input_tensor = from_vector(test_data, ttnn::SimpleShape{1, num_devices, 3, 1}, DataType::FLOAT32);

     auto mapper = api::shard_tensor_to_mesh_mapper(*mesh_device_, 1);
     Tensor sharded_tensor = api::distribute_tensor(input_tensor, *mesh_device_, *mapper);
@@ -72,15 +73,16 @@ TEST_F(TensorDistributionTest, Shard1D) {
     std::vector device_tensors = api::get_device_tensors(sharded_tensor);
     EXPECT_EQ(device_tensors.size(), mesh_device_->num_devices());
     for (int i = 0; i < device_tensors.size(); i++) {
-        auto expected = from_vector(std::vector{i * 1.F, i * 2.F, i * 3.F}, ttnn::Shape{1, 1, 3, 1});
-        EXPECT_TRUE(ttnn::allclose(device_tensors[i].cpu(), expected));
+        auto expected = from_vector(
+            std::vector{i * 1.F, i * 2.F, i * 3.F}, ttnn::SimpleShape{1, 1, 3, 1}, DataType::FLOAT32);
+        EXPECT_TRUE(ttnn::allclose(device_tensors[i].cpu(), expected));
     }

     auto composer = api::concat_mesh_to_tensor_composer(/*dim=*/0);
     Tensor concatenated_tensor = api::aggregate_tensor(sharded_tensor, *composer);

-    Tensor expected_tensor = from_vector(test_data, ttnn::Shape{num_devices, 1, 3, 1});
-    EXPECT_TRUE(ttnn::allclose(concatenated_tensor, expected_tensor));
+    Tensor expected_tensor = from_vector(test_data, ttnn::SimpleShape{num_devices, 1, 3, 1}, DataType::FLOAT32);
+    EXPECT_TRUE(ttnn::allclose(concatenated_tensor, expected_tensor));
 }

 TEST_F(TensorDistributionTest, Shard2DInvalidMeshShape) {
@@ -110,7 +112,7 @@ TEST_F(TensorDistributionTest, Shard2DReplicateDim) {
     const int num_devices = num_rows * num_cols;

     std::vector test_data = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0};
-    Tensor input_tensor = from_vector(test_data, ttnn::Shape{1, num_rows, num_cols, 1});
+    Tensor input_tensor = from_vector(test_data, ttnn::SimpleShape{1, num_rows, num_cols, 1}, DataType::FLOAT32);
     input_tensor.print();

     auto mapper = api::shard_tensor_2d_to_mesh_mapper(
@@ -127,12 +129,14 @@

     int i = 0;
     for (; i < 4; i++) {
-        auto expected = from_vector(std::vector{0.0, 1.0, 2.0, 3.0}, ttnn::Shape{1, 1, 4, 1});
-        EXPECT_TRUE(ttnn::allclose(device_tensors[i].cpu(), expected));
+        auto expected =
+            from_vector(std::vector{0.0, 1.0, 2.0, 3.0}, ttnn::SimpleShape{1, 1, 4, 1}, DataType::FLOAT32);
+        EXPECT_TRUE(ttnn::allclose(device_tensors[i].cpu(), expected));
     }
     for (; i < device_tensors.size(); i++) {
-        auto expected = from_vector(std::vector{4.0, 5.0, 6.0, 7.0}, ttnn::Shape{1, 1, 4, 1});
-        EXPECT_TRUE(ttnn::allclose(device_tensors[i].cpu(), expected));
+        auto expected =
+            from_vector(std::vector{4.0, 5.0, 6.0, 7.0}, ttnn::SimpleShape{1, 1, 4, 1}, DataType::FLOAT32);
+        EXPECT_TRUE(ttnn::allclose(device_tensors[i].cpu(), expected));
     }
 }

@@ -146,7 +150,7 @@ TEST_F(TensorDistributionTest, Shard2D) {
     for (int i = 0; i < num_devices; i++) {
         test_data.insert(test_data.end(), {i * 1.F, i * 2.F, i * 3.F});
     }
-    Tensor input_tensor = from_vector(test_data, ttnn::Shape{1, num_rows, num_cols, 3});
+    Tensor input_tensor = from_vector(test_data, ttnn::SimpleShape{1, num_rows, num_cols, 3}, DataType::FLOAT32);

     auto mapper = api::shard_tensor_2d_to_mesh_mapper(
         *mesh_device_,
@@ -160,8 +164,9 @@ TEST_F(TensorDistributionTest, Shard2D) {
     std::vector device_tensors = api::get_device_tensors(sharded_tensor);
     EXPECT_EQ(device_tensors.size(), mesh_device_->num_devices());
     for (int i = 0; i < device_tensors.size(); i++) {
-        auto expected = from_vector(std::vector{i * 1.F, i * 2.F, i * 3.F}, ttnn::Shape{1, 1, 1, 3});
-        EXPECT_TRUE(ttnn::allclose(device_tensors[i].cpu(), expected));
+        auto expected = from_vector(
+            std::vector{i * 1.F, i * 2.F, i * 3.F}, ttnn::SimpleShape{1, 1, 1, 3}, DataType::FLOAT32);
+        EXPECT_TRUE(ttnn::allclose(device_tensors[i].cpu(), expected));
     }

     auto composer = api::concat_mesh_2d_to_tensor_composer(
@@ -172,8 +177,8 @@ TEST_F(TensorDistributionTest, Shard2D) {
         });
     Tensor concatenated_tensor = api::aggregate_tensor(sharded_tensor, *composer);

-    Tensor expected_tensor = from_vector(test_data, ttnn::Shape{num_rows, 1, num_cols, 3});
-    EXPECT_TRUE(ttnn::allclose(concatenated_tensor, expected_tensor));
+    Tensor expected_tensor = from_vector(test_data, ttnn::SimpleShape{num_rows, 1, num_cols, 3}, DataType::FLOAT32);
+    EXPECT_TRUE(ttnn::allclose(concatenated_tensor, expected_tensor));
 }

 } // namespace ttnn::distributed::test
diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_partition.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_partition.cpp
new file mode 100644
index 000000000000..0d8a6883a76b
--- /dev/null
+++ b/tests/ttnn/unit_tests/gtests/tensor/test_partition.cpp
@@ -0,0 +1,21 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include
+
+#include "ttnn/tensor/tensor.hpp"
+#include "ttnn/tensor/xtensor/conversion_utils.hpp"
+#include "ttnn/tensor/xtensor/partition.hpp"
+#include "ttnn/tensor/xtensor/xtensor_all_includes.hpp"
+
+namespace ttnn {
+namespace {
+
+using ::tt::tt_metal::Tensor;
+using ::ttnn::experimental::xtensor::chunk;
+using ::ttnn::experimental::xtensor::concatenate;
+using ::ttnn::experimental::xtensor::from_vector;
+
+} // namespace
+} // namespace ttnn
diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_vector_conversion.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_vector_conversion.cpp
new file mode 100644
index 000000000000..8a72b8e9b360
--- /dev/null
+++ b/tests/ttnn/unit_tests/gtests/tensor/test_vector_conversion.cpp
@@ -0,0 +1,99 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include
+#include
+#include
+#include
+
+#include "ttnn/tensor/tensor.hpp"
+#include "ttnn/tensor/xtensor/conversion_utils.hpp"
+
+namespace ttnn {
+namespace {
+
+using ::testing::Eq;
+using ::testing::Pointwise;
+using ::tt::tt_metal::Tensor;
+using ::ttnn::experimental::xtensor::from_vector;
+using ::ttnn::experimental::xtensor::to_vector;
+
+const std::vector& GetShapesForTest() {
+    static auto* shapes = new std::vector{
+        ttnn::SimpleShape{1, 1, 1, 1},
+        ttnn::SimpleShape{1, 1, 1, 10},
+        ttnn::SimpleShape{1, 32, 32, 16},
+        ttnn::SimpleShape{1, 40, 3, 128},
+        ttnn::SimpleShape{2, 2},
+        ttnn::SimpleShape{1, 1, 1, 1, 10},
+    };
+    return *shapes;
+}
+
+template
+std::vector Arange(int64_t start, int64_t end, int64_t step) {
+    std::vector result;
+    for (int64_t i = start; i < end; i += step) {
+        if constexpr (std::is_same_v) {
+            result.push_back(T(static_cast(i)));
+        } else {
+            result.push_back(static_cast(i));
+        }
+    }
+    return result;
+}
+
+template
+class VectorConversionTest : public ::testing::Test {};
+
+using TestTypes = ::testing::Types;
+TYPED_TEST_SUITE(VectorConversionTest, TestTypes);
+
+TYPED_TEST(VectorConversionTest, Basic) {
+    for (const auto& shape : GetShapesForTest()) {
+        auto input = Arange(0, static_cast(shape.volume()), 1);
+        auto output = to_vector(from_vector(input, shape, convert_to_data_type()));
+        EXPECT_THAT(output, Pointwise(Eq(), input)) << "for shape: " << shape;
+    }
+}
+
+TYPED_TEST(VectorConversionTest, InvalidSize) {
+    ttnn::SimpleShape shape{32, 32};
+    auto input = Arange(0, 42, 1);
+
+    ASSERT_NE(input.size(), shape.volume());
+    EXPECT_ANY_THROW(from_vector(input, shape, convert_to_data_type()));
+}
+
+TYPED_TEST(VectorConversionTest, InvalidDtype) {
+    ttnn::SimpleShape shape{32, 32};
+    auto input = Arange(0, 42, 1);
+
+    ASSERT_NE(input.size(), shape.volume());
+    EXPECT_ANY_THROW(from_vector(
+        input,
+        shape,
+        // Use INT32 for verification, except for when the actual type is int32_t.
+        (std::is_same_v ? DataType::FLOAT32 : DataType::INT32)));
+}
+
+TEST(FloatVectorConversionTest, Bfloat16Representation) {
+    for (const auto& shape : GetShapesForTest()) {
+        auto input_bf16 = Arange(0, static_cast(shape.volume()), 1);
+        std::vector input_ft;
+        input_ft.reserve(input_bf16.size());
+        std::transform(input_bf16.begin(), input_bf16.end(), std::back_inserter(input_ft), [](bfloat16 bf) {
+            return bf.to_float();
+        });
+
+        auto output_bf16 = to_vector(from_vector(input_ft, shape, DataType::BFLOAT16));
+        EXPECT_THAT(output_bf16, Pointwise(Eq(), input_bf16)) << "for shape: " << shape;
+
+        auto output_ft = to_vector(from_vector(input_bf16, shape, DataType::BFLOAT16));
+        EXPECT_THAT(output_ft, Pointwise(Eq(), input_ft)) << "for shape: " << shape;
+    }
+}
+
+} // namespace
+} // namespace ttnn
diff --git a/ttnn/cpp/ttnn/distributed/distributed_tensor.cpp b/ttnn/cpp/ttnn/distributed/distributed_tensor.cpp
index 15c373e814d5..a2c64b7d99ae 100644
--- a/ttnn/cpp/ttnn/distributed/distributed_tensor.cpp
+++ b/ttnn/cpp/ttnn/distributed/distributed_tensor.cpp
@@ -16,35 +16,33 @@ namespace {

 class ReplicateTensorToMesh : public TensorToMesh {
 public:
-    ReplicateTensorToMesh(MeshDevice& mesh_device) : mesh_device_(mesh_device) {}
+    ReplicateTensorToMesh(int num_devices) : num_devices_(num_devices) {}

     std::vector map(const Tensor& tensor) override {
         std::vector tensors;
-        tensors.reserve(mesh_device_.num_devices());
-        std::fill_n(std::back_inserter(tensors), mesh_device_.num_devices(), tensor);
+        tensors.reserve(num_devices_);
+        std::fill_n(std::back_inserter(tensors), num_devices_, tensor);
         return tensors;
     }

-    DistributedTensorConfig config() const override {
-        return DistributedTensorConfig{ReplicateTensor{mesh_device_.num_devices()}};
-    }
+    DistributedTensorConfig config() const override { return DistributedTensorConfig{ReplicateTensor{num_devices_}}; }

 private:
-    MeshDevice& mesh_device_;
+    int num_devices_ = -1;
 };

 class ShardTensorToMesh : public TensorToMesh {
 public:
-    ShardTensorToMesh(MeshDevice& mesh_device, int dim) : mesh_device_(mesh_device), shard_dim_(dim) {}
+    ShardTensorToMesh(int num_devices, int dim) : num_devices_(num_devices), shard_dim_(dim) {}

     std::vector map(const Tensor& tensor) override {
-        return experimental::xtensor::chunk(tensor, mesh_device_.num_devices(), shard_dim_);
+        return experimental::xtensor::chunk(tensor, num_devices_, shard_dim_);
     }

     DistributedTensorConfig config() const override { return DistributedTensorConfig{ShardTensor{shard_dim_}}; }

 private:
-    MeshDevice& mesh_device_;
+    int num_devices_ = -1;
     int shard_dim_ = -1;
 };

@@ -144,11 +142,11 @@ class ConcatMesh2dToTensor : public MeshToTensor {
 } // namespace

 std::unique_ptr replicate_tensor_to_mesh_mapper(MeshDevice& mesh_device) {
-    return std::make_unique(mesh_device);
+    return std::make_unique(mesh_device.num_devices());
 }

 std::unique_ptr shard_tensor_to_mesh_mapper(MeshDevice& mesh_device, int dim) {
-    return std::make_unique(mesh_device, dim);
+    return std::make_unique(mesh_device.num_devices(), dim);
 }

 std::unique_ptr shard_tensor_2d_to_mesh_mapper(
diff --git a/ttnn/cpp/ttnn/tensor/xtensor/conversion_utils.cpp b/ttnn/cpp/ttnn/tensor/xtensor/conversion_utils.cpp
index 3b9906cd4edd..4e4d829df324 100644
--- a/ttnn/cpp/ttnn/tensor/xtensor/conversion_utils.cpp
+++ b/ttnn/cpp/ttnn/tensor/xtensor/conversion_utils.cpp
@@ -12,50 +12,21 @@ namespace ttnn::experimental::xtensor {

 namespace {

+using ::tt::tt_metal::DataType;
 using ::tt::tt_metal::Tensor;

-// copypaste from deprecated tensor pybinds ttnn
-tt::tt_metal::OwnedBuffer create_owned_buffer(const std::vector& data, DataType data_type) {
-    using ::tt::tt_metal::owned_buffer::create;
-
-    switch (data_type) {
-        case DataType::BFLOAT8_B: {
-            auto uint32_vector = pack_fp32_vec_as_bfp8_tiles(data, /*row_major_input=*/false, /*is_exp_a=*/false);
-            return create(std::move(uint32_vector));
-        }
-        case DataType::BFLOAT4_B: {
-            auto uint32_vector = pack_fp32_vec_as_bfp4_tiles(data, /*row_major_input=*/false, /*is_exp_a=*/false);
-            return create(std::move(uint32_vector));
-        }
-        case DataType::FLOAT32: {
-            auto data_copy = data;
-            return create(std::move(data_copy));
-        }
-        case DataType::BFLOAT16: {
-            std::vector bfloat16_data(data.size());
-            std::transform(std::begin(data), std::end(data), std::begin(bfloat16_data), [](float value) {
-                return bfloat16(value);
-            });
-            return create(std::move(bfloat16_data));
-        }
-        default: {
-            TT_THROW("Cannot create a host buffer for data type: {}", data_type);
-        }
-    }
-}
-
 template
 Tensor create_owned_tensor(
-    std::vector data, const ttnn::Shape& shape, tt::tt_metal::DataType data_type, tt::tt_metal::Layout layout) {
-    auto buffer = tt::tt_metal::owned_buffer::create(std::move(data));
+    tt::stl::Span data, const ttnn::SimpleShape& shape, DataType data_type, tt::tt_metal::Layout layout) {
+    auto buffer = tt::tt_metal::owned_buffer::create(std::vector(data.begin(), data.end()));
     auto storage = OwnedStorage{std::move(buffer)};
     return Tensor{std::move(storage), shape, data_type, layout};
 }

 // TODO: optimize precomputing multipliers
-template
+template
 std::vector untile_tensor_to_vec(const Tensor& cpu_tensor) {
-    auto tiled_buffer = tt::tt_metal::host_buffer::get_as(cpu_tensor);
+    auto tiled_buffer = tt::tt_metal::host_buffer::get_as(cpu_tensor);
     auto untiled_shape = cpu_tensor.get_logical_shape();
     auto tiled_shape = cpu_tensor.get_padded_shape();
@@ -79,7 +50,7 @@ std::vector untile_tensor_to_vec(const Tensor& cpu_tensor) {
     for (size_t idx = 0; idx < total_size; ++idx) {
         uint32_t untiled_index = compute_flat_index(indices, untiled_shape);
         uint32_t tiled_index = compute_flat_index(indices, tiled_shape);
-        if constexpr (std::is_same_v) {
+        if constexpr (std::is_same_v) {
             untiled_data[untiled_index] = tiled_buffer[tiled_index].to_float();
         } else {
             untiled_data[untiled_index] = tiled_buffer[tiled_index];
@@ -99,41 +70,51 @@ std::vector untile_tensor_to_vec(const Tensor& cpu_tensor) {
 } // namespace

 template <>
-Tensor from_vector(const std::vector& buffer, const ttnn::Shape& shape) {
-    const DataType data_type = DataType::BFLOAT16;
-    auto logical_shape = shape.logical_shape();
-    size_t volume = logical_shape.volume();
+Tensor from_span(tt::stl::Span buffer, const ttnn::SimpleShape& shape, DataType dtype) {
+    size_t volume = shape.volume();
     TT_FATAL(
         buffer.size() == volume, "Current buffer size is {} different from shape volume {}", buffer.size(), volume);
-    auto owned_buffer = create_owned_buffer(buffer, data_type);
-    return Tensor(OwnedStorage{owned_buffer}, logical_shape, data_type, Layout::ROW_MAJOR);
+    if (dtype == DataType::FLOAT32) {
+        return create_owned_tensor(buffer, shape, dtype, Layout::ROW_MAJOR);
+    } else if (dtype == DataType::BFLOAT16) {
+        std::vector bfloat16_data;
+        bfloat16_data.reserve(buffer.size());
+        std::transform(std::begin(buffer), std::end(buffer), std::back_inserter(bfloat16_data), [](float value) {
+            return bfloat16(value);
+        });
+        return create_owned_tensor(
+            tt::stl::Span(bfloat16_data.data(), bfloat16_data.size()), shape, dtype, Layout::ROW_MAJOR);
+    } else {
+        // TODO: support bf8 and bf4
+        TT_THROW("Unsupported data type for from_span: {}", dtype);
+    }
 }

-// Workaround implementation due to issue with tilize for float32
-// it is expected that tilize will be fixed in the after next tt-metal main update
 template <>
-Tensor from_vector(const std::vector& buffer, const ttnn::Shape& shape) {
-    auto tensor = from_vector(buffer, shape);
-    return ttnn::typecast(tensor, DataType::FLOAT32);
+Tensor from_span(tt::stl::Span buffer, const ttnn::SimpleShape& shape, DataType dtype) {
+    size_t volume = shape.volume();
+    TT_FATAL(
+        buffer.size() == volume, "Current buffer size is {} different from shape volume {}", buffer.size(), volume);
+    TT_FATAL(dtype == DataType::BFLOAT16, "Unsupported data type for from_span: {}", dtype);
+    return create_owned_tensor(buffer, shape, dtype, Layout::ROW_MAJOR);
 }

 template <>
-Tensor from_vector(const std::vector& buffer, const ttnn::Shape& shape) {
-    MemoryConfig output_mem_config{};
-    auto logical_shape = shape.logical_shape();
-    auto volume = logical_shape.volume();
+Tensor from_span(tt::stl::Span buffer, const ttnn::SimpleShape& shape, DataType dtype) {
+    size_t volume = shape.volume();
     TT_FATAL(
         buffer.size() == volume, "Current buffer size is {} different from shape volume {}", buffer.size(), volume);
-    return create_owned_tensor(buffer, logical_shape, DataType::UINT32, Layout::ROW_MAJOR);
+    TT_FATAL(dtype == DataType::UINT32, "Unsupported data type for from_span: {}", dtype);
+    return create_owned_tensor(buffer, shape, DataType::UINT32, Layout::ROW_MAJOR);
 }

 template <>
-Tensor from_vector(const std::vector& buffer, const ttnn::Shape& shape) {
-    auto logical_shape = shape.logical_shape();
-    auto volume = logical_shape.volume();
+Tensor from_span(tt::stl::Span buffer, const ttnn::SimpleShape& shape, DataType dtype) {
+    size_t volume = shape.volume();
     TT_FATAL(
         buffer.size() == volume, "Current buffer size is {} different from shape volume {}", buffer.size(), volume);
-    return create_owned_tensor(buffer, logical_shape, DataType::INT32, Layout::ROW_MAJOR);
+    TT_FATAL(dtype == DataType::INT32, "Unsupported data type for from_span: {}", dtype);
+    return create_owned_tensor(buffer, shape, DataType::INT32, Layout::ROW_MAJOR);
 }

 template <>
@@ -144,19 +125,38 @@ std::vector to_vector(const Tensor& tensor) {
     } else if (cpu_tensor.get_dtype() == DataType::FLOAT32) {
         return untile_tensor_to_vec(cpu_tensor);
     } else {
+        // TODO: support bf4, bf8.
         TT_THROW("Cannot convert tensor to vector for data type: {}", cpu_tensor.get_dtype());
     }
 }

+template <>
+std::vector to_vector(const Tensor& tensor) {
+    auto cpu_tensor = tensor.cpu().to(Layout::ROW_MAJOR);
+    TT_FATAL(
+        cpu_tensor.get_dtype() == DataType::BFLOAT16,
+        "Unsupported data type for to_vector: {}",
+        cpu_tensor.get_dtype());
+    return untile_tensor_to_vec(cpu_tensor);
+}
+
 template <>
 std::vector to_vector(const Tensor& tensor) {
     auto cpu_tensor = tensor.cpu().to(Layout::ROW_MAJOR);
+    TT_FATAL(
+        cpu_tensor.get_dtype() == DataType::UINT32,
+        "Unsupported data type for to_vector: {}",
+        cpu_tensor.get_dtype());
     return untile_tensor_to_vec(cpu_tensor);
 }

 template <>
 std::vector to_vector(const Tensor& tensor) {
     auto cpu_tensor = tensor.cpu().to(Layout::ROW_MAJOR);
+    TT_FATAL(
+        cpu_tensor.get_dtype() == DataType::INT32,
+        "Unsupported data type for to_vector: {}",
+        cpu_tensor.get_dtype());
     return untile_tensor_to_vec(cpu_tensor);
 }
diff --git a/ttnn/cpp/ttnn/tensor/xtensor/conversion_utils.hpp b/ttnn/cpp/ttnn/tensor/xtensor/conversion_utils.hpp
index fe84efd50bbb..446e1626314c 100644
--- a/ttnn/cpp/ttnn/tensor/xtensor/conversion_utils.hpp
+++ b/ttnn/cpp/ttnn/tensor/xtensor/conversion_utils.hpp
@@ -11,43 +11,65 @@ namespace ttnn::experimental::xtensor {

 template
-ttnn::Shape get_shape_from_xarray(const E& xarr) {
+ttnn::SimpleShape get_shape_from_xarray(const E& xarr) {
     ttnn::SmallVector shape_dims;
     for (size_t i = 0; i < xarr.shape().size(); ++i) {
         shape_dims.push_back(xarr.shape()[i]);
     }
-    return ttnn::Shape(shape_dims);
+    return ttnn::SimpleShape(shape_dims);
 }

-template
-tt::tt_metal::Tensor from_vector(const std::vector& buffer, const ttnn::Shape& shape);
+template
+tt::tt_metal::Tensor from_span(
+    tt::stl::Span buffer, const ttnn::SimpleShape& shape, tt::tt_metal::DataType dtype);

-template
+template
+tt::tt_metal::Tensor from_vector(
+    const std::vector& buffer, const ttnn::SimpleShape& shape, tt::tt_metal::DataType dtype) {
+    return from_span(tt::stl::Span(buffer.data(), buffer.size()), shape, dtype);
+}
+
+template
 std::vector to_vector(const tt::tt_metal::Tensor& tensor);

-template
+template
+xt::xarray tt_span_to_xtensor(tt::stl::Span vec, const ttnn::SimpleShape& shape) {
+    std::vector shape_vec(shape.cbegin(), shape.cend());
+    return xt::adapt(vec.data(), vec.size(), xt::no_ownership(), shape_vec);
+}
+
+// TODO: make the usage of std::span / tt::stl::Span consistent.
+template
 xt::xarray span_to_xtensor(std::span vec, const ttnn::SimpleShape& shape) {
     std::vector shape_vec(shape.cbegin(), shape.cend());
     return xt::adapt(vec.data(), vec.size(), xt::no_ownership(), shape_vec);
 }
-template
+
+template
+auto xtensor_to_tt_span(const xt::xarray& xtensor) {
+    auto adaptor = xt::adapt(xtensor.data(), xtensor.size(), xt::no_ownership());
+    return tt::stl::Span(adaptor.data(), adaptor.size());
+}
+
+// TODO: make the usage of std::span / tt::stl::Span consistent.
+template
 auto xtensor_to_span(const xt::xarray& xtensor) {
     auto adaptor = xt::adapt(xtensor.data(), xtensor.size(), xt::no_ownership());
     return std::span(adaptor.data(), adaptor.size());
 }

-template
-tt::tt_metal::Tensor from_xtensor(const xt::xarray& buffer) {
+template
+tt::tt_metal::Tensor from_xtensor(const xt::xarray& buffer, tt::tt_metal::DataType dtype) {
     auto shape = get_shape_from_xarray(buffer);
-    auto buffer_view = xtensor_to_span(buffer);
-    return from_vector(std::vector(buffer_view.begin(), buffer_view.end()), shape);
+    auto buffer_view = xtensor_to_tt_span(buffer);
+    return from_span(buffer_view, shape, dtype);
 }

-template
+template
 xt::xarray to_xtensor(const tt::tt_metal::Tensor& tensor) {
     auto vec = to_vector(tensor);
     auto shape = tensor.get_shape().logical_shape();
-    return span_to_xtensor(std::span(vec.data(), vec.size()), shape);
+    return tt_span_to_xtensor(tt::stl::Span(vec.data(), vec.size()), shape);
 }

 } // namespace ttnn::experimental::xtensor
diff --git a/ttnn/cpp/ttnn/tensor/xtensor/partition.cpp b/ttnn/cpp/ttnn/tensor/xtensor/partition.cpp
index 161ee993373e..52b52c6671e0 100644
--- a/ttnn/cpp/ttnn/tensor/xtensor/partition.cpp
+++ b/ttnn/cpp/ttnn/tensor/xtensor/partition.cpp
@@ -82,6 +82,7 @@ std::vector> chunk(const xt::xarray& xtensor, int num_chunks, i

         auto chunk_view = xt::strided_view(xtensor, indices);

+        // TODO: optimize away this copy.
         // Construct xarray from the view
         // This forces a copy of that slice into a new xarray
         chunks.push_back(xt::xarray(chunk_view));
@@ -119,25 +120,25 @@ template xt::xarray concatenate(const std::vector>&

 namespace adaptor {
 namespace {

-template
-Tensor concatenate_impl(const std::vector& tensors, int dim) {
-    std::vector> xtensors;
+template
+Tensor concatenate_impl(const std::vector& tensors, DataType dtype, int dim) {
+    std::vector> xtensors;
     for (const auto& tensor : tensors) {
-        xtensors.push_back(to_xtensor(tensor));
+        xtensors.push_back(to_xtensor(tensor));
     }
-    xt::xarray result = concatenate(xtensors, dim);
-    return from_xtensor(result);
+    xt::xarray result = concatenate(xtensors, dim);
+    return from_xtensor(result, dtype);
 }

-template
-std::vector chunk_impl(const Tensor& tensor, int num_chunks, int dim) {
-    xt::xarray xtensor = to_xtensor(tensor);
-    auto xtensor_chunks = chunk(xtensor, num_chunks, dim);
+template
+std::vector chunk_impl(const Tensor& tensor, DataType dtype, int num_chunks, int dim) {
+    xt::xarray xtensor = to_xtensor(tensor);
+    auto xtensor_chunks = chunk(xtensor, num_chunks, dim);
     std::vector tensors;
     tensors.reserve(xtensor_chunks.size());
     for (const auto& c : xtensor_chunks) {
-        tensors.push_back(from_xtensor(c));
+        tensors.push_back(from_xtensor(c, dtype));
     }
     return tensors;
 }
@@ -147,20 +148,21 @@ std::vector chunk_impl(const Tensor& tensor, int num_chunks, int dim) {

 std::vector chunk(const Tensor& tensor, int num_chunks, int dim) {
     switch (tensor.dtype()) {
-        case DataType::BFLOAT16: return adaptor::chunk_impl(tensor, num_chunks, dim);
-        case DataType::FLOAT32: return adaptor::chunk_impl(tensor, num_chunks, dim);
-        case DataType::INT32: return adaptor::chunk_impl(tensor, num_chunks, dim);
-        case DataType::UINT32: return adaptor::chunk_impl(tensor, num_chunks, dim);
+        case DataType::BFLOAT16: return adaptor::chunk_impl(tensor, DataType::BFLOAT16, num_chunks, dim);
+        case DataType::FLOAT32: return adaptor::chunk_impl(tensor, DataType::FLOAT32, num_chunks, dim);
+        case DataType::INT32: return adaptor::chunk_impl(tensor, DataType::INT32, num_chunks, dim);
+        case DataType::UINT32: return adaptor::chunk_impl(tensor, DataType::UINT32, num_chunks, dim);
         default: TT_THROW("Unsupported data type: {}", tensor.dtype());
     }
 }

 Tensor concatenate(const std::vector& tensors, int dim) {
+    TT_FATAL(tensors.size() > 0, "Cannot concatenate an empty list of tensors");
     switch (tensors.front().dtype()) {
-        case DataType::BFLOAT16: return adaptor::concatenate_impl(tensors, dim);
-        case DataType::FLOAT32: return adaptor::concatenate_impl(tensors, dim);
-        case DataType::INT32: return adaptor::concatenate_impl(tensors, dim);
-        case DataType::UINT32: return adaptor::concatenate_impl(tensors, dim);
+        case DataType::BFLOAT16: return adaptor::concatenate_impl(tensors, DataType::BFLOAT16, dim);
+        case DataType::FLOAT32: return adaptor::concatenate_impl(tensors, DataType::FLOAT32, dim);
+        case DataType::INT32: return adaptor::concatenate_impl(tensors, DataType::INT32, dim);
+        case DataType::UINT32: return adaptor::concatenate_impl(tensors, DataType::UINT32, dim);
         default: TT_THROW("Unsupported data type: {}", tensors.front().dtype());
     }
 }