From d4fd805dda7c60a2c09983a9cd5aa1b04d9477d1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Feb 2026 09:57:04 -0800 Subject: [PATCH 01/38] Rename folder dpctl to dpctl_ext --- .../tensor/libtensor/include/kernels/alignment.hpp | 0 .../tensor/libtensor/include/kernels/dpctl_tensor_types.hpp | 0 .../libtensor/include/kernels/elementwise_functions/common.hpp | 0 .../include/kernels/elementwise_functions/common_detail.hpp | 0 .../include/kernels/elementwise_functions/logaddexp.hpp | 0 .../libtensor/include/kernels/elementwise_functions/maximum.hpp | 0 .../libtensor/include/kernels/elementwise_functions/minimum.hpp | 0 .../include/kernels/elementwise_functions/sycl_complex.hpp | 0 .../include/kernels/elementwise_functions/vec_size_util.hpp | 0 .../tensor/libtensor/include/utils/indexing_utils.hpp | 0 .../tensor/libtensor/include/utils/math_utils.hpp | 0 .../tensor/libtensor/include/utils/memory_overlap.hpp | 0 .../tensor/libtensor/include/utils/offset_utils.hpp | 0 .../tensor/libtensor/include/utils/output_validation.hpp | 0 .../tensor/libtensor/include/utils/strided_iters.hpp | 0 .../tensor/libtensor/include/utils/sycl_alloc_utils.hpp | 0 .../tensor/libtensor/include/utils/sycl_utils.hpp | 0 .../tensor/libtensor/include/utils/type_dispatch.hpp | 0 .../tensor/libtensor/include/utils/type_dispatch_building.hpp | 0 .../tensor/libtensor/include/utils/type_utils.hpp | 0 dpnp/backend/extensions/blas/CMakeLists.txt | 2 +- dpnp/backend/extensions/fft/CMakeLists.txt | 2 +- dpnp/backend/extensions/indexing/CMakeLists.txt | 2 +- dpnp/backend/extensions/lapack/CMakeLists.txt | 2 +- dpnp/backend/extensions/statistics/CMakeLists.txt | 2 +- dpnp/backend/extensions/ufunc/CMakeLists.txt | 2 +- dpnp/backend/extensions/vm/CMakeLists.txt | 2 +- dpnp/backend/extensions/window/CMakeLists.txt | 2 +- 28 files changed, 8 insertions(+), 8 deletions(-) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/alignment.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/common.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/indexing_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/math_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/memory_overlap.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/offset_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/output_validation.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/strided_iters.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/sycl_alloc_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/sycl_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/type_dispatch.hpp (100%) rename {dpctl 
=> dpctl_ext}/tensor/libtensor/include/utils/type_dispatch_building.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/type_utils.hpp (100%) diff --git a/dpctl/tensor/libtensor/include/kernels/alignment.hpp b/dpctl_ext/tensor/libtensor/include/kernels/alignment.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/alignment.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/alignment.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp b/dpctl_ext/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp diff --git a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/indexing_utils.hpp similarity index 100% rename from 
dpctl/tensor/libtensor/include/utils/indexing_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/indexing_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/math_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/math_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/math_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/math_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp b/dpctl_ext/tensor/libtensor/include/utils/memory_overlap.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/memory_overlap.hpp rename to dpctl_ext/tensor/libtensor/include/utils/memory_overlap.hpp diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/offset_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/offset_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/offset_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/output_validation.hpp b/dpctl_ext/tensor/libtensor/include/utils/output_validation.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/output_validation.hpp rename to dpctl_ext/tensor/libtensor/include/utils/output_validation.hpp diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl_ext/tensor/libtensor/include/utils/strided_iters.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/strided_iters.hpp rename to dpctl_ext/tensor/libtensor/include/utils/strided_iters.hpp diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/sycl_alloc_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/sycl_alloc_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/sycl_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/type_dispatch.hpp rename to dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp rename to dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/type_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/type_utils.hpp diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt index 0015eda84843..cbc3e31d923b 100644 --- a/dpnp/backend/extensions/blas/CMakeLists.txt +++ b/dpnp/backend/extensions/blas/CMakeLists.txt @@ -68,7 +68,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) 
target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/fft/CMakeLists.txt b/dpnp/backend/extensions/fft/CMakeLists.txt index 0569ecc8bca4..edc7bff7dce4 100644 --- a/dpnp/backend/extensions/fft/CMakeLists.txt +++ b/dpnp/backend/extensions/fft/CMakeLists.txt @@ -61,7 +61,7 @@ target_include_directories( ${python_module_name} PRIVATE ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt index c0de75ae3146..39f68ffba846 100644 --- a/dpnp/backend/extensions/indexing/CMakeLists.txt +++ b/dpnp/backend/extensions/indexing/CMakeLists.txt @@ -65,7 +65,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index 76b25c3a6d10..59499a3b28f8 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -86,7 +86,7 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt index e04279b75e49..8544e816e8d6 100644 --- a/dpnp/backend/extensions/statistics/CMakeLists.txt +++ b/dpnp/backend/extensions/statistics/CMakeLists.txt @@ -70,7 +70,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/ufunc/CMakeLists.txt b/dpnp/backend/extensions/ufunc/CMakeLists.txt index 55a750f8423f..293cef0ab326 100644 --- a/dpnp/backend/extensions/ufunc/CMakeLists.txt +++ b/dpnp/backend/extensions/ufunc/CMakeLists.txt @@ -88,7 +88,7 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt index 32d6a6765a00..551c43842af2 100644 --- a/dpnp/backend/extensions/vm/CMakeLists.txt +++ b/dpnp/backend/extensions/vm/CMakeLists.txt @@ -110,7 +110,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) 
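The include-directory switch above is what lets every dpnp extension keep its relative #include paths unchanged after the folder rename. Below is a minimal sketch, not part of this patch, of how an extension source resolves the relocated headers once ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include is on the include path; the demo function-pointer type and table name are illustrative, and only the td_ns::num_types constant is assumed from the existing type_dispatch headers.

    // Illustrative sketch: headers are still included by the same relative path,
    // now resolved through dpctl_ext/tensor/libtensor/include.
    #include "utils/type_dispatch.hpp"

    namespace td_ns = dpctl::tensor::type_dispatch;

    // A (src typenum, dst typenum) function-pointer table, the dispatch pattern
    // the copy-and-cast kernels later in this series plug into.
    typedef void (*demo_fn_ptr_t)(void);
    static demo_fn_ptr_t demo_dispatch_table[td_ns::num_types][td_ns::num_types];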
diff --git a/dpnp/backend/extensions/window/CMakeLists.txt b/dpnp/backend/extensions/window/CMakeLists.txt index 6fe04e334f42..01274317782d 100644 --- a/dpnp/backend/extensions/window/CMakeLists.txt +++ b/dpnp/backend/extensions/window/CMakeLists.txt @@ -66,7 +66,7 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) From c040713d50cd10c628990b628cb74b0a5029f99b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:04:36 -0800 Subject: [PATCH 02/38] Add simplify_iteration_space implementation to libtensor --- .../source/simplify_iteration_space.cpp | 544 ++++++++++++++++++ .../source/simplify_iteration_space.hpp | 130 +++++ 2 files changed, 674 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp new file mode 100644 index 000000000000..2526f022e0ac --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp @@ -0,0 +1,544 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "simplify_iteration_space.hpp" +#include "utils/strided_iters.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace py = pybind11; + +void simplify_iteration_space_1(int &nd, + const py::ssize_t *const &shape, + std::vector const &strides, + // output + std::vector &simplified_shape, + std::vector &simplified_strides, + py::ssize_t &offset) +{ + using dpctl::tensor::strides::simplify_iteration_stride; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + + simplified_strides.reserve(nd); + simplified_strides.insert(std::end(simplified_strides), + std::begin(strides), std::end(strides)); + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + int contracted_nd = simplify_iteration_stride( + nd, simplified_shape.data(), simplified_strides.data(), + offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + + simplified_strides.reserve(nd); + simplified_strides.push_back((strides[0] >= 0) ? strides[0] + : -strides[0]); + if ((strides[0] < 0) && (shape[0] > 1)) { + offset += (shape[0] - 1) * strides[0]; + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space(int &nd, + const py::ssize_t *const &shape, + std::vector const &src_strides, + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_two_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::begin(simplified_shape), shape, + shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src_strides.reserve(nd); + simplified_src_strides.insert(std::end(simplified_src_strides), + std::begin(src_strides), + std::end(src_strides)); + assert(simplified_src_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_two_strides( + nd, simplified_shape.data(), simplified_src_strides.data(), + simplified_dst_strides.data(), + src_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src_offset = 0; + dst_offset = 0; + // Populate 
vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if (src_strides[0] < 0 && dst_strides[0] < 0) { + simplified_src_strides.push_back(-src_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + src_offset += (shape[0] - 1) * src_strides[0]; + dst_offset += (shape[0] - 1) * dst_strides[0]; + } + } + else { + simplified_src_strides.push_back(src_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space_3( + int &nd, + const py::ssize_t *const &shape, + // src1 + std::vector const &src1_strides, + // src2 + std::vector const &src2_strides, + // dst + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_three_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src1_strides.insert(std::end(simplified_src1_strides), + std::begin(src1_strides), + std::end(src1_strides)); + assert(simplified_src1_strides.size() == static_cast(nd)); + + simplified_src2_strides.reserve(nd); + simplified_src2_strides.insert(std::end(simplified_src2_strides), + std::begin(src2_strides), + std::end(src2_strides)); + assert(simplified_src2_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_three_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); + simplified_src2_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src1_offset = 0; + src2_offset = 0; + dst_offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if ((src1_strides[0] < 0) && (src2_strides[0] < 0) && + (dst_strides[0] < 0)) { + simplified_src1_strides.push_back(-src1_strides[0]); + simplified_src2_strides.push_back(-src2_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + src1_offset += src1_strides[0] * (shape[0] - 1); + src2_offset += src2_strides[0] * (shape[0] - 1); + dst_offset += dst_strides[0] * (shape[0] - 1); + } + } + else { + 
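+            // not all three strides are negative: keep strides as-is; offsets remain 0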
simplified_src1_strides.push_back(src1_strides[0]); + simplified_src2_strides.push_back(src2_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space_4( + int &nd, + const py::ssize_t *const &shape, + // src1 + std::vector const &src1_strides, + // src2 + std::vector const &src2_strides, + // src3 + std::vector const &src3_strides, + // dst + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_src3_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &src3_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_four_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src1_strides.insert(std::end(simplified_src1_strides), + std::begin(src1_strides), + std::end(src1_strides)); + assert(simplified_src1_strides.size() == static_cast(nd)); + + simplified_src2_strides.reserve(nd); + simplified_src2_strides.insert(std::end(simplified_src2_strides), + std::begin(src2_strides), + std::end(src2_strides)); + assert(simplified_src2_strides.size() == static_cast(nd)); + + simplified_src3_strides.reserve(nd); + simplified_src3_strides.insert(std::end(simplified_src3_strides), + std::begin(src3_strides), + std::end(src3_strides)); + assert(simplified_src3_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_four_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_src3_strides.data(), + simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + src3_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); + simplified_src2_strides.resize(contracted_nd); + simplified_src3_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src1_offset = 0; + src2_offset = 0; + src3_offset = 0; + dst_offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_src3_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if ((src1_strides[0] < 0) && (src2_strides[0] < 0) && + (src3_strides[0] < 0) && (dst_strides[0] < 0)) + { + simplified_src1_strides.push_back(-src1_strides[0]); + simplified_src2_strides.push_back(-src2_strides[0]); + simplified_src3_strides.push_back(-src3_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + 
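+                // strides were negated above; move each offset to the element
+                // that was last along the axis, so the same range is traversed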
src1_offset += src1_strides[0] * (shape[0] - 1); + src2_offset += src2_strides[0] * (shape[0] - 1); + src3_offset += src3_strides[0] * (shape[0] - 1); + dst_offset += dst_strides[0] * (shape[0] - 1); + } + } + else { + simplified_src1_strides.push_back(src1_strides[0]); + simplified_src2_strides.push_back(src2_strides[0]); + simplified_src3_strides.push_back(src3_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_src3_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void compact_iteration_space(int &nd, + const py::ssize_t *const &shape, + std::vector const &strides, + // output + std::vector &compact_shape, + std::vector &compact_strides) +{ + using dpctl::tensor::strides::compact_iteration; + if (nd > 1) { + // Compact iteration space to reduce dimensionality + // and improve access pattern + compact_shape.reserve(nd); + compact_shape.insert(std::begin(compact_shape), shape, shape + nd); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.insert(std::end(compact_strides), std::begin(strides), + std::end(strides)); + assert(compact_strides.size() == static_cast(nd)); + + int contracted_nd = + compact_iteration(nd, compact_shape.data(), compact_strides.data()); + compact_shape.resize(contracted_nd); + compact_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + compact_shape.reserve(nd); + compact_shape.push_back(shape[0]); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.push_back(strides[0]); + assert(compact_strides.size() == static_cast(nd)); + } +} + +/* @brief Split shape/strides into dir1 (complementary to axis_start <= i < + * axis_end) and dir2 (along given set of axes) + */ +void split_iteration_space(const std::vector &shape_vec, + const std::vector &strides_vec, + int axis_start, + int axis_end, + std::vector &dir1_shape_vec, + std::vector &dir2_shape_vec, + std::vector &dir1_strides_vec, + std::vector &dir2_strides_vec) +{ + int nd = static_cast(shape_vec.size()); + int dir2_sz = axis_end - axis_start; + int dir1_sz = nd - dir2_sz; + + assert(dir1_sz > 0); + assert(dir2_sz > 0); + + dir1_shape_vec.resize(dir1_sz); + dir2_shape_vec.resize(dir2_sz); + + std::copy(shape_vec.begin(), shape_vec.begin() + axis_start, + dir1_shape_vec.begin()); + std::copy(shape_vec.begin() + axis_end, shape_vec.end(), + dir1_shape_vec.begin() + axis_start); + + std::copy(shape_vec.begin() + axis_start, shape_vec.begin() + axis_end, + dir2_shape_vec.begin()); + + dir1_strides_vec.resize(dir1_sz); + dir2_strides_vec.resize(dir2_sz); + + std::copy(strides_vec.begin(), strides_vec.begin() + axis_start, + dir1_strides_vec.begin()); + std::copy(strides_vec.begin() + axis_end, strides_vec.end(), + dir1_strides_vec.begin() + axis_start); + + std::copy(strides_vec.begin() + axis_start, strides_vec.begin() + axis_end, + dir2_strides_vec.begin()); + + return; +} + +py::ssize_t _ravel_multi_index_c(std::vector const &mi, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + if (nd != mi.size()) { + throw py::value_error( + "Multi-index and shape vectors must have the same length."); + } + + py::ssize_t flat_index = 0; + py::ssize_t s = 1; + for (std::size_t i = 0; i < nd; ++i) { + flat_index += mi.at(nd - 1 - i) * s; + s *= 
shape.at(nd - 1 - i); + } + + return flat_index; +} + +py::ssize_t _ravel_multi_index_f(std::vector const &mi, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + if (nd != mi.size()) { + throw py::value_error( + "Multi-index and shape vectors must have the same length."); + } + + py::ssize_t flat_index = 0; + py::ssize_t s = 1; + for (std::size_t i = 0; i < nd; ++i) { + flat_index += mi.at(i) * s; + s *= shape.at(i); + } + + return flat_index; +} + +std::vector _unravel_index_c(py::ssize_t flat_index, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + std::vector mi; + mi.resize(nd); + + py::ssize_t i_ = flat_index; + for (std::size_t dim = 0; dim + 1 < nd; ++dim) { + const py::ssize_t si = shape[nd - 1 - dim]; + const py::ssize_t q = i_ / si; + const py::ssize_t r = (i_ - q * si); + mi[nd - 1 - dim] = r; + i_ = q; + } + if (nd) { + mi[0] = i_; + } + return mi; +} + +std::vector _unravel_index_f(py::ssize_t flat_index, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + std::vector mi; + mi.resize(nd); + + py::ssize_t i_ = flat_index; + for (std::size_t dim = 0; dim + 1 < nd; ++dim) { + const py::ssize_t si = shape[dim]; + const py::ssize_t q = i_ / si; + const py::ssize_t r = (i_ - q * si); + mi[dim] = r; + i_ = q; + } + if (nd) { + mi[nd - 1] = i_; + } + return mi; +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp new file mode 100644 index 000000000000..d3448ee1f5fd --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp @@ -0,0 +1,130 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace py = pybind11; + +void simplify_iteration_space_1(int &, + const py::ssize_t *const &, + std::vector const &, + std::vector &, + std::vector &, + py::ssize_t &); + +void simplify_iteration_space(int &, + const py::ssize_t *const &, + std::vector const &, + std::vector const &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &); + +void simplify_iteration_space_3(int &, + const py::ssize_t *const &, + // src1 + std::vector const &, + // src2 + std::vector const &, + // dst + std::vector const &, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + +void simplify_iteration_space_4(int &, + const py::ssize_t *const &, + // src1 + std::vector const &, + // src2 + std::vector const &, + // src3 + std::vector const &, + // dst + std::vector const &, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + +void compact_iteration_space(int &, + const py::ssize_t *const &, + std::vector const &, + // output + std::vector &, + std::vector &); + +void split_iteration_space(const std::vector &, + const std::vector &, + int, + int, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &); + +py::ssize_t _ravel_multi_index_c(std::vector const &, + std::vector const &); +py::ssize_t _ravel_multi_index_f(std::vector const &, + std::vector const &); +std::vector _unravel_index_c(py::ssize_t, + std::vector const &); +std::vector _unravel_index_f(py::ssize_t, + std::vector const &); +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From 14b466facfe6b23f92113ccc2dbb224e2727bf3c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:14:43 -0800 Subject: [PATCH 03/38] Extend codespell ignore list for libtensor --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index cdf592535d11..67fb75cb5f54 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314'] [tool.codespell] builtin = "clear,rare,informal,names" check-filenames = true -ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT" +ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT" quiet-level = 3 [tool.coverage.report] From dcc421bc61c36549d3e6865927f495abab15d078 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:15:09 -0800 Subject: [PATCH 04/38] Add copy_and_cast kernels to libtensor --- .../include/kernels/copy_and_cast.hpp | 1288 +++++++++++++++++ .../include/kernels/copy_as_contiguous.hpp | 655 +++++++++ .../libtensor/source/copy_as_contig.cpp | 758 ++++++++++ .../libtensor/source/copy_as_contig.hpp | 61 + 4 files changed, 2762 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp create mode 100644 
dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp new file mode 100644 index 000000000000..a07d311a7fcb --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -0,0 +1,1288 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor copying and value casting. 
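+/// (generic strided, contiguous, from-host, and copy-for-reshape variants)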
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copy_and_cast +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class copy_cast_generic_kernel; + +template +class copy_cast_contig_kernel; + +template +class copy_cast_from_host_kernel; + +template +class copy_cast_from_host_contig_kernel; + +template +class Caster +{ +public: + Caster() = default; + dstTy operator()(const srcTy &src) const + { + using dpctl::tensor::type_utils::convert_impl; + return convert_impl(src); + } +}; + +template +class GenericCopyFunctor +{ +private: + const srcT *src_ = nullptr; + dstT *dst_ = nullptr; + IndexerT indexer_; + +public: + GenericCopyFunctor(const srcT *src_p, dstT *dst_p, const IndexerT &indexer) + : src_(src_p), dst_(dst_p), indexer_(indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const auto &offsets = indexer_(static_cast(wiid.get(0))); + const ssize_t &src_offset = offsets.get_first_offset(); + const ssize_t &dst_offset = offsets.get_second_offset(); + + static constexpr CastFnT fn{}; + dst_[dst_offset] = fn(src_[src_offset]); + } +}; + +/*! + @defgroup CopyAndCastKernels + */ + +/*! + * @brief Function pointer type for generic array cast and copying function. + */ +typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +/*! + * @brief Generic function to copy `nelems` elements from `src` usm_ndarray to + `dst` usm_ndarray while casting from `srcTy` to `dstTy`. + + Both arrays have array dimensionality specified via argument `nd`. The + `shape_and_strides` is kernel accessible USM array of length `3*nd`, where the + first `nd` elements encode common shape, second `nd` elements contain strides + of `src` array, and the trailing `nd` elements contain strides of `dst` array. + `src_p` and `dst_p` represent pointers into respective arrays, but the start of + iteration begins at offset of `src_offset` elements for `src` array and at + offset `dst_offset` elements for `dst` array. Kernel is submitted to sycl queue + `q` with events `depends` and `additional_depends` as dependencies. + + @param q Sycl queue to which the kernel is submitted. + @param nelems Number of elements to cast and copy. + @param nd Array dimensionality, i.e. number of indices needed to + identify an element of each array. + @param shape_and_strides Kernel accessible USM pointer to packed shape and + strides. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of + elements of source array from `src_p`. 
+ @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of + elements of destination array from `dst_p`. + @param depends List of events to wait for before starting computations, if + any. + @param additional_depends Additional list of events to wait for before + starting computations, if any. + + @return Event to wait on to ensure that computation completes. + @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_generic_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + const TwoOffsets_StridedIndexer indexer{nd, src_offset, dst_offset, + shape_and_strides}; + const srcTy *src_tp = reinterpret_cast(src_p); + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyFunctor, + TwoOffsets_StridedIndexer>(src_tp, dst_tp, + indexer)); + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get generic function pointer of type `fnT` for given source + * data type `S` and destination data type `D`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCastGenericFactory +{ + fnT get() + { + fnT f = copy_and_cast_generic_impl; + return f; + } +}; + +// Specialization of copy_and_cast for contiguous arrays + +template +class ContigCopyFunctor +{ +private: + std::size_t nelems; + const srcT *src_p = nullptr; + dstT *dst_p = nullptr; + +public: + ContigCopyFunctor(const std::size_t nelems_, + const srcT *src_p_, + dstT *dst_p_) + : nelems(nelems_), src_p(src_p_), dst_p(dst_p_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr CastFnT fn{}; + + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + + using dpctl::tensor::type_utils::is_complex_v; + if constexpr (!enable_sg_loadstore || is_complex_v || + is_complex_v) { + std::uint16_t sgSize = ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + + // start = (gid / sgSize) * elems_per_sg + (gid % sgSize) + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + dst_p[offset] = fn(src_p[offset]); + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems) { + sycl::vec dst_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto src_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&src_p[offset]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[offset]); + + const sycl::vec src_vec = + 
sub_group_load(sg, src_multi_ptr); +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; k++) { + dst_vec[k] = fn(src_vec[k]); + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t start = base + sg.get_local_id()[0]; + for (std::size_t k = start; k < nelems; k += sgSize) { + dst_p[k] = fn(src_p[k]); + } + } + } + } +}; + +/*! + * @brief Function pointer type for contiguous array cast and copy function. + */ +typedef sycl::event (*copy_and_cast_contig_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const std::vector &); + +/*! + * @brief Function to copy `nelems` elements from contiguous `src` usm_ndarray + to contiguous `dst` usm_ndarray while casting from `srcTy` to `dstTy`. + + Both arrays have the same number of elements `nelems`. + `src_cp` and `dst_cp` represent char pointers to the start of respective + arrays. Kernel is submitted to sycl queue `q` with events `depends` as + dependencies. + + @param q Sycl queue to which the kernel is submitted. + @param nelems Number of elements to cast and copy. + @param src_p Kernel accessible USM pointer for the source array + @param dst_p Kernel accessible USM pointer for the destination array + @param depends List of events to wait for before starting computations, if + any. + + @return Event to wait on to ensure that computation completes. + @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_contig_impl(sycl::queue &q, + std::size_t nelems, + const char *src_cp, + char *dst_cp, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const srcTy *src_tp = reinterpret_cast(src_cp); + dstTy *dst_tp = reinterpret_cast(dst_cp); + + std::size_t lws = 64; + static constexpr std::uint32_t vec_sz = 4; + static constexpr std::uint32_t n_vecs = 2; + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + if (is_aligned(src_cp) && + is_aligned(dst_cp)) + { + static constexpr bool enable_sg_loadstore = true; + using KernelName = + copy_cast_contig_kernel; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigCopyFunctor, vec_sz, + n_vecs, enable_sg_loadstore>(nelems, src_tp, + dst_tp)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using InnerKernelName = + copy_cast_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigCopyFunctor, vec_sz, + n_vecs, disable_sg_loadstore>(nelems, src_tp, + dst_tp)); + } + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get specialized function pointer for casting and copying + * contiguous arrays. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCastContigFactory +{ + fnT get() + { + fnT f = copy_and_cast_contig_impl; + return f; + } +}; + +// Specialization of copy_and_cast for 1D arrays + +/*! + * @brief Factory to get function pointer for casting and copying 1D arrays. + * @ingroup CopyAndCastKernels + */ +typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const std::array &, + const std::array &, + const std::array &, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +/*! 
+ * @brief Factory to get function pointer for casting and copying 2D arrays. + * @ingroup CopyAndCastKernels + */ +typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const std::array &, + const std::array &, + const std::array &, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +/*! + * @brief Specialized for given array dimension function to copy `nelems` + elements from `src` usm_ndarray to `dst` usm_ndarray while casting from `srcTy` + to `dstTy`. + + Both arrays have array dimensionality known at compile time and specified in + template parameters `nd`. Arrays' shape and strides are provided as + `std::array`. `src_p` and `dst_p` represent pointers into respective arrays, + but the start of iteration begins at offset of `src_offset` elements for `src` + array and at offset `dst_offset` elements for `dst` array. Kernel is submitted + to sycl queue `q` with events `depends` as dependencies. + + @param q The queue where the routine should be executed. + @param nelems Number of elements to cast and copy. + @param shape Common shape of the arrays. + @param src_strides Strides of the source array. + @param dst_strides Strides of the destination array. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of elements + of the source array from `src_p`. + @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of elements + of the destination array from `src_p`. + @param depends List of events to wait for before starting computations, if + any. + + @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_nd_specialized_impl( + sycl::queue &q, + std::size_t nelems, + const std::array &shape, + const std::array &src_strides, + const std::array &dst_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + using IndexerT = TwoOffsets_FixedDimStridedIndexer; + const IndexerT indexer{shape, src_strides, dst_strides, src_offset, + dst_offset}; + const srcTy *src_tp = reinterpret_cast(src_p); + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.depends_on(depends); + cgh.parallel_for< + class copy_cast_generic_kernel>( + sycl::range<1>(nelems), + GenericCopyFunctor, IndexerT>( + src_tp, dst_tp, indexer)); + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get 1D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCast1DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +/*! + * @brief Factory to get 2D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. 
+ * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCast2DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +// ====================== Copying from host to USM + +template +class GenericCopyFromHostFunctor +{ +private: + AccessorT src_acc_; + dstTy *dst_ = nullptr; + IndexerT indexer_; + +public: + GenericCopyFromHostFunctor(const AccessorT &src_acc, + dstTy *dst_p, + const IndexerT &indexer) + : src_acc_(src_acc), dst_(dst_p), indexer_(indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const auto &offsets = indexer_(static_cast(wiid.get(0))); + const ssize_t &src_offset = offsets.get_first_offset(); + const ssize_t &dst_offset = offsets.get_second_offset(); + + CastFnT fn{}; + dst_[dst_offset] = fn(src_acc_[src_offset]); + } +}; + +typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + ssize_t, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +/*! + * @brief Function to copy from NumPy's ndarray with elements of type `srcTy` + * into usm_ndarray with elements of type `srcTy`. + * + * Function to cast and copy elements from numpy.ndarray specified by typeless + * `host_src_p` and the `src_offset` given in the number of array elements. + * Arrays' metadata are given in packed USM vector of length `3*nd` whose first + * `nd` elements contain arrays' shape, next `nd` elements specify source + * strides in elements (not bytes), and trailing `nd` elements specify + * destination array strides. Kernel dependencies are given by two vectors of + * events: `depends` and `additional_depends`. The function execution is + * complete at the return. + * + * @param q The queue where the routine should be executed. + * @param nelems Number of elements to cast and copy. + * @param nd The dimensionality of arrays + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides. + * @param host_src_p Host (not USM allocated) pointer associated with the + * source array. + * @param src_offset Offset to the beginning of iteration in number of elements + * of the source array from `host_src_p`. + * @param src_min_nelem_offset Smallest value of offset relative to + * `host_src_p` in number of elements attained while iterating over elements of + * the source array. + * @param src_max_nelem_offset Largest value of offset relative to `host_src_p` + * in number of elements attained while iterating over elements of the source + * array. + * @param dst_p USM pointer associated with the destination array. + * @param dst_offset Offset to the beginning of iteration in number of elements + * of the destination array from `dst_p`. + * @param depends List of events to wait for before starting computations, if + * any. + * @param additional_depends List of additional events to wait for before + * starting computations, if any. 
+ * + * @ingroup CopyAndCastKernels + */ +template +void copy_and_cast_from_host_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *host_src_p, + ssize_t src_offset, + ssize_t src_min_nelem_offset, + ssize_t src_max_nelem_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1; + + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::buffer npy_buf( + reinterpret_cast(host_src_p) + src_min_nelem_offset, + sycl::range<1>(nelems_range), {sycl::property::buffer::use_host_ptr{}}); + + sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only); + + const TwoOffsets_StridedIndexer indexer{ + nd, src_offset - src_min_nelem_offset, dst_offset, + const_cast(shape_and_strides)}; + + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyFromHostFunctor, + TwoOffsets_StridedIndexer>( + npy_acc, dst_tp, indexer)); + }); + + // perform explicit synchronization. Implicit synchronization would be + // performed by sycl::buffer destructor. + copy_and_cast_from_host_ev.wait(); + + return; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given NumPy array + * source data type `S` and destination data type `D`. + * @defgroup CopyAndCastKernels + */ +template +struct CopyAndCastFromHostFactory +{ + fnT get() + { + fnT f = copy_and_cast_from_host_impl; + return f; + } +}; + +typedef void (*copy_and_cast_from_host_contig_blocking_fn_ptr_t)( + sycl::queue &, + std::size_t, /* nelems */ + const char *, /* src_pointer */ + ssize_t, /* src_offset */ + char *, /* dst_pointer */ + ssize_t, /* dst_offset */ + const std::vector &); + +/*! + * @brief Function to copy from NumPy's ndarray with elements of type `srcTy` + * into usm_ndarray with elements of type `srcTy` for contiguous arrays. + * + * Function to cast and copy elements from numpy.ndarray specified by typeless + * `host_src_p` and the `src_offset` given in the number of array elements. + * Kernel dependencies are given by two vectors of + * events: `depends` and `additional_depends`. The function execution is + * complete at the return. + * + * @param q The queue where the routine should be executed. + * @param nelems Number of elements to cast and copy. + * @param src_stride The stride of source array in elements + * @param dst_stride The stride of destimation array in elements + * @param host_src_p Host (not USM allocated) pointer associated with the + * source array. + * @param src_offset Offset to the beginning of iteration in number of elements + * of the source array from `host_src_p`. + * @param dst_p USM pointer associated with the destination array. + * @param dst_offset Offset to the beginning of iteration in number of elements + * of the destination array from `dst_p`. + * @param depends List of events to wait for before starting computations, if + * any. 
+ *
+ * @ingroup CopyAndCastKernels
+ */
+template <typename dstTy, typename srcTy>
+void copy_and_cast_from_host_contig_impl(
+    sycl::queue &q,
+    std::size_t nelems,
+    const char *host_src_p,
+    ssize_t src_offset,
+    char *dst_p,
+    ssize_t dst_offset,
+    const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
+    dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
+
+    sycl::buffer<srcTy, 1> npy_buf(
+        reinterpret_cast<const srcTy *>(host_src_p) + src_offset,
+        sycl::range<1>(nelems), {sycl::property::buffer::use_host_ptr{}});
+
+    sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only);
+
+        using IndexerT = TwoOffsets_CombinedIndexer<NoOpIndexer, NoOpIndexer>;
+        static constexpr NoOpIndexer src_indexer{};
+        static constexpr NoOpIndexer dst_indexer{};
+        static constexpr IndexerT indexer{src_indexer, dst_indexer};
+
+        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p) + dst_offset;
+
+        cgh.parallel_for<
+            copy_cast_from_host_contig_kernel<srcTy, dstTy>>(
+            sycl::range<1>(nelems),
+            GenericCopyFromHostFunctor<decltype(npy_acc), dstTy,
+                                       Caster<srcTy, dstTy>, IndexerT>(
+                npy_acc, dst_tp, indexer));
+    });
+
+    // perform explicit synchronization. Implicit synchronization would be
+    // performed by sycl::buffer destructor.
+    copy_and_cast_from_host_ev.wait();
+
+    return;
+}
+
+/*!
+ * @brief Factory to get function pointer of type `fnT` for given NumPy array
+ * source data type `S` and destination data type `D`.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename D, typename S>
+struct CopyAndCastFromHostContigFactory
+{
+    fnT get()
+    {
+        fnT f = copy_and_cast_from_host_contig_impl<D, S>;
+        return f;
+    }
+};
+
+// =============== Copying for reshape ================== //
+
+template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
+class copy_for_reshape_generic_kernel;
+
+template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
+class GenericCopyForReshapeFunctor
+{
+private:
+    const Ty *src_p = nullptr;
+    Ty *dst_p = nullptr;
+    SrcIndexerT src_indexer_;
+    DstIndexerT dst_indexer_;
+
+public:
+    GenericCopyForReshapeFunctor(const char *src_ptr,
+                                 char *dst_ptr,
+                                 const SrcIndexerT &src_indexer,
+                                 const DstIndexerT &dst_indexer)
+        : src_p(reinterpret_cast<const Ty *>(src_ptr)),
+          dst_p(reinterpret_cast<Ty *>(dst_ptr)), src_indexer_(src_indexer),
+          dst_indexer_(dst_indexer)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        const ssize_t src_offset = src_indexer_(wiid.get(0));
+        const ssize_t dst_offset = dst_indexer_(wiid.get(0));
+
+        dst_p[dst_offset] = src_p[src_offset];
+    }
+};
+
+// define function type
+typedef sycl::event (*copy_for_reshape_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,     // num_elements
+    int,             // src_nd
+    int,             // dst_nd
+    const ssize_t *, // packed shapes and strides
+    const char *,    // src_data_ptr
+    char *,          // dst_data_ptr
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to copy content of array while reshaping.
+ *
+ * Submits a kernel to perform a copy `dst[unravel_index(i,
+ * dst.shape)] = src[unravel_index(i, src.shape)]`.
+ *
+ * @param q The execution queue where kernel is submitted.
+ * @param nelems The number of elements to copy
+ * @param src_nd Array dimension of the source array
+ * @param dst_nd Array dimension of the destination array
+ * @param packed_shapes_and_strides Kernel accessible USM array of size
+ * `2*src_nd + 2*dst_nd` with content `[src_shape, src_strides, dst_shape,
+ * dst_strides]`.
+ * @param src_p Typeless USM pointer to the buffer of the source array
+ * @param dst_p Typeless USM pointer to the buffer of the destination array
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ * + * @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event + copy_for_reshape_generic_impl(sycl::queue &q, + std::size_t nelems, + int src_nd, + int dst_nd, + const ssize_t *packed_shapes_and_strides, + const char *src_p, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides: + // USM array of size 2*(src_nd + dst_nd) + // [ src_shape; src_strides; dst_shape; dst_strides ] + + const ssize_t *src_shape_and_strides = + const_cast(packed_shapes_and_strides); + + const ssize_t *dst_shape_and_strides = const_cast( + packed_shapes_and_strides + (2 * src_nd)); + + const StridedIndexer src_indexer{src_nd, 0, src_shape_and_strides}; + const StridedIndexer dst_indexer{dst_nd, 0, dst_shape_and_strides}; + + using KernelName = + copy_for_reshape_generic_kernel; + + cgh.parallel_for( + sycl::range<1>(nelems), + GenericCopyForReshapeFunctor( + src_p, dst_p, src_indexer, dst_indexer)); + }); + + return copy_for_reshape_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForReshapeGenericFactory +{ + fnT get() + { + fnT f = copy_for_reshape_generic_impl; + return f; + } +}; + +// ================== Copying for roll ================== // + +/*! @brief Functor to cyclically roll global_id to the left */ +struct LeftRolled1DTransformer +{ + LeftRolled1DTransformer(std::size_t offset, std::size_t size) + : offset_(offset), size_(size) + { + } + + std::size_t operator()(std::size_t gid) const + { + const std::size_t shifted_gid = + ((gid < offset_) ? gid + size_ - offset_ : gid - offset_); + return shifted_gid; + } + +private: + std::size_t offset_ = 0; + std::size_t size_ = 1; +}; + +/*! @brief Indexer functor to compose indexer and transformer */ +template +struct CompositionIndexer +{ + CompositionIndexer(IndexerT f, TransformerT t) : f_(f), t_(t) {} + + auto operator()(std::size_t gid) const + { + return f_(t_(gid)); + } + +private: + IndexerT f_; + TransformerT t_; +}; + +/*! 
@brief Indexer functor to find offset for nd-shifted indices lifted from + * iteration id */ +struct RolledNDIndexer +{ + RolledNDIndexer(int nd, + const ssize_t *shape, + const ssize_t *strides, + const ssize_t *ndshifts, + ssize_t starting_offset) + : nd_(nd), shape_(shape), strides_(strides), ndshifts_(ndshifts), + starting_offset_(starting_offset) + { + } + + ssize_t operator()(std::size_t gid) const + { + return compute_offset(gid); + } + +private: + int nd_ = -1; + const ssize_t *shape_ = nullptr; + const ssize_t *strides_ = nullptr; + const ssize_t *ndshifts_ = nullptr; + ssize_t starting_offset_ = 0; + + ssize_t compute_offset(ssize_t gid) const + { + using dpctl::tensor::strides::CIndexer_vector; + + CIndexer_vector _ind(nd_); + ssize_t relative_offset_(0); + _ind.get_left_rolled_displacement( + gid, + shape_, // shape ptr + strides_, // strides ptr + ndshifts_, // shifts ptr + relative_offset_); + return starting_offset_ + relative_offset_; + } +}; + +template +class copy_for_roll_strided_kernel; + +template +class StridedCopyForRollFunctor +{ +private: + const Ty *src_p = nullptr; + Ty *dst_p = nullptr; + SrcIndexerT src_indexer_; + DstIndexerT dst_indexer_; + +public: + StridedCopyForRollFunctor(const Ty *src_ptr, + Ty *dst_ptr, + const SrcIndexerT &src_indexer, + const DstIndexerT &dst_indexer) + : src_p(src_ptr), dst_p(dst_ptr), src_indexer_(src_indexer), + dst_indexer_(dst_indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const std::size_t gid = wiid.get(0); + + const ssize_t src_offset = src_indexer_(gid); + const ssize_t dst_offset = dst_indexer_(gid); + + dst_p[dst_offset] = src_p[src_offset]; + } +}; + +// define function type +typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)( + sycl::queue &, + std::size_t, // shift + std::size_t, // num_elements + int, // common_nd + const ssize_t *, // packed shapes and strides + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset + const std::vector &); + +/*! + * @brief Function to copy content of array with a shift. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_undex(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing, must be non-negative. + * @param nelems The number of elements to copy + * @param nd Array dimensionality of the destination and source arrays + * @param packed_shapes_and_strides Kernel accessible USM array + * of size `3*nd` with content `[common_shape, src_strides, dst_strides]`. + * @param src_p Typeless USM pointer to the buffer of the source array + * @param src_offset Displacement of first element of src relative src_p in + * elements + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param dst_offset Displacement of first element of dst relative dst_p in + * elements + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_for_roll_strided_impl(sycl::queue &q, + std::size_t shift, + std::size_t nelems, + int nd, + const ssize_t *packed_shapes_and_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides: + // USM array of size 3 * nd + // [ common_shape; src_strides; dst_strides ] + + const StridedIndexer src_indexer{nd, src_offset, + packed_shapes_and_strides}; + const LeftRolled1DTransformer left_roll_transformer{shift, nelems}; + + using CompositeIndexerT = + CompositionIndexer; + + const CompositeIndexerT rolled_src_indexer(src_indexer, + left_roll_transformer); + + UnpackedStridedIndexer dst_indexer{nd, dst_offset, + packed_shapes_and_strides, + packed_shapes_and_strides + 2 * nd}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + src_tp, dst_tp, rolled_src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +// define function type +typedef sycl::event (*copy_for_roll_contig_fn_ptr_t)( + sycl::queue &, + std::size_t, // shift + std::size_t, // num_elements + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset + const std::vector &); + +template +class copy_for_roll_contig_kernel; + +/*! + * @brief Function to copy content of array with a shift. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_undex(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing, must be non-negative. + * @param nelems The number of elements to copy + * @param src_p Typeless USM pointer to the buffer of the source array + * @param src_offset Displacement of the start of array src relative src_p in + * elements + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param dst_offset Displacement of the start of array dst relative dst_p in + * elements + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_for_roll_contig_impl(sycl::queue &q, + std::size_t shift, + std::size_t nelems, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + static constexpr NoOpIndexer src_indexer{}; + const LeftRolled1DTransformer roller{shift, nelems}; + + const CompositionIndexer + left_rolled_src_indexer{src_indexer, roller}; + static constexpr NoOpIndexer dst_indexer{}; + + using KernelName = copy_for_roll_contig_kernel; + + const Ty *src_tp = reinterpret_cast(src_p) + src_offset; + Ty *dst_tp = reinterpret_cast(dst_p) + dst_offset; + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor< + Ty, CompositionIndexer, + NoOpIndexer>(src_tp, dst_tp, left_rolled_src_indexer, + dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! 
+ * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollStridedFactory +{ + fnT get() + { + fnT f = copy_for_roll_strided_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollContigFactory +{ + fnT get() + { + fnT f = copy_for_roll_contig_impl; + return f; + } +}; + +template +class copy_for_roll_ndshift_strided_kernel; + +// define function type +typedef sycl::event (*copy_for_roll_ndshift_strided_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + int, // common_nd + const ssize_t *, // packed shape, strides, shifts + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset + const std::vector &); + +template +sycl::event copy_for_roll_ndshift_strided_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *packed_shapes_and_strides_and_shifts, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides_and_shifts: + // USM array of size 4 * nd + // [ common_shape; src_strides; dst_strides; shifts ] + + const ssize_t *shape_ptr = packed_shapes_and_strides_and_shifts; + const ssize_t *src_strides_ptr = + packed_shapes_and_strides_and_shifts + nd; + const ssize_t *dst_strides_ptr = + packed_shapes_and_strides_and_shifts + 2 * nd; + const ssize_t *shifts_ptr = + packed_shapes_and_strides_and_shifts + 3 * nd; + + const RolledNDIndexer src_indexer{nd, shape_ptr, src_strides_ptr, + shifts_ptr, src_offset}; + + const UnpackedStridedIndexer dst_indexer{nd, dst_offset, shape_ptr, + dst_strides_ptr}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + src_tp, dst_tp, src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollNDShiftFactory +{ + fnT get() + { + fnT f = copy_for_roll_ndshift_strided_impl; + return f; + } +}; + +} // namespace copy_and_cast +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp new file mode 100644 index 000000000000..b4f367448758 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -0,0 +1,655 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor copying and value casting. +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copy_as_contig +{ + +using dpctl::tensor::ssize_t; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class CopyAsCContigFunctor +{ +private: + std::size_t nelems; + const T *src_p = nullptr; + T *dst_p = nullptr; + IndexerT src_indexer; + +public: + CopyAsCContigFunctor(std::size_t n, + const T *src_, + T *dst_, + const IndexerT &src_indexer_) + : nelems(n), src_p(src_), dst_p(dst_), src_indexer(src_indexer_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static_assert(vec_sz > 0); + static_assert(n_vecs > 0); + + static constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs; + + using dpctl::tensor::type_utils::is_complex; + if constexpr (!enable_sg_loadstore || is_complex::value) { + const std::uint16_t sgSize = + ndit.get_sub_group().get_max_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + + // start = (gid / sgSize) * sgSize * elems_per_wi + (gid % sgSize) + // gid % sgSize == gid - (gid / sgSize) * sgSize + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + elems_per_sg); + + for (std::size_t offset = start; offset < end; offset += sgSize) { + auto src_offset = src_indexer(offset); + dst_p[offset] = src_p[src_offset]; + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + const std::uint16_t elems_per_sg = elems_per_wi * sgSize; + + if (base + elems_per_sg < nelems) { +#pragma unroll 
+ for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + // it == vec_id * vec_sz, for 0 <= vec_id < n_vecs + const std::size_t block_start_id = base + it * sgSize; + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[block_start_id]); + + const std::size_t elem_id0 = + block_start_id + sg.get_local_id(); + sycl::vec dst_vec; +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; ++k) { + const std::size_t elem_id = elem_id0 + k * sgSize; + const ssize_t src_offset = src_indexer(elem_id); + dst_vec[k] = src_p[src_offset]; + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + const std::size_t k0 = base + lane_id; + for (std::size_t k = k0; k < nelems; k += sgSize) { + const ssize_t src_offset = src_indexer(k); + dst_p[k] = src_p[src_offset]; + } + } + } + } +}; + +template +sycl::event submit_c_contiguous_copy(sycl::queue &exec_q, + std::size_t nelems, + const T *src, + T *dst, + const IndexerT &src_indexer, + const std::vector &depends) +{ + static_assert(vec_sz > 0); + static_assert(n_vecs > 0); + + static constexpr std::size_t preferred_lws = 256; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + const std::size_t lws = + ((preferred_lws + max_sg_size - 1) / max_sg_size) * max_sg_size; + + static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; + + const std::size_t nelems_per_group = nelems_per_wi * lws; + const std::size_t n_groups = + (nelems + nelems_per_group - 1) / (nelems_per_group); + + sycl::event copy_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.use_kernel_bundle(kb); + + const sycl::range<1> gRange{n_groups * lws}; + const sycl::range<1> lRange{lws}; + + cgh.parallel_for( + sycl::nd_range<1>(gRange, lRange), + CopyAsCContigFunctor( + nelems, src, dst, src_indexer)); + }); + return copy_ev; +} + +template +class as_contig_krn; + +template +sycl::event + as_c_contiguous_array_generic_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *src_p, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + using IndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const IndexerT src_indexer(nd, ssize_t(0), shape_and_strides); + + static constexpr std::uint8_t vec_sz = 4u; + static constexpr std::uint8_t n_vecs = 2u; + + using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; + using dpctl::tensor::kernels::alignment_utils::is_aligned; + using dpctl::tensor::kernels::alignment_utils::required_alignment; + + sycl::event copy_ev; + if (is_aligned(dst_p)) { + static constexpr bool enable_sg_load = true; + using KernelName = + as_contig_krn; + copy_ev = submit_c_contiguous_copy( + exec_q, nelems, src_tp, dst_tp, src_indexer, depends); + } + else { + static constexpr bool disable_sg_load = false; + using InnerKernelName = + as_contig_krn; + using KernelName = disabled_sg_loadstore_wrapper_krn; + copy_ev = submit_c_contiguous_copy( + exec_q, nelems, src_tp, 
dst_tp, src_indexer, depends); + } + + return copy_ev; +} + +typedef sycl::event (*as_c_contiguous_array_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + char *, + const std::vector &); + +template +struct AsCContigFactory +{ + fnT get() + { + return as_c_contiguous_array_generic_impl; + } +}; + +template +class as_contig_batch_of_square_matrices_krn; + +namespace detail +{ +/*! @brief batch of matrices (n, n), source strides (1, src_ld), destination + strides (dst_ld, 1) src and destination arrays must be disjoint memory blocks + to avoid race condition + */ +template +sycl::event as_c_contiguous_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + const BatchIndexerT &batch_two_offsets_indexer, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + static constexpr std::uint16_t private_tile_size = 4; + static constexpr std::uint16_t n_lines = 2; + static constexpr std::uint16_t block_size = + n_lines * private_tile_size * private_tile_size; + + static constexpr std::uint16_t lws0 = block_size; + static constexpr std::uint16_t lws1 = n_lines; + static constexpr std::uint16_t nelems_per_wi = (block_size / lws1); + + static_assert(nelems_per_wi * lws1 == block_size); + static_assert(nelems_per_wi == private_tile_size * private_tile_size); + + static constexpr std::uint32_t lws = lws0 * lws1; + + const std::size_t n_tiles = (n + block_size - 1) / block_size; + + const ssize_t src_stride = src_ld; + const ssize_t dst_stride = dst_ld; + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{batch_nelems * n_tiles * n_tiles * lws}; + + sycl::nd_range<1> ndRange{gRange, lRange}; + + using KernelName = + as_contig_batch_of_square_matrices_krn; + + sycl::event e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::local_accessor local_block(block_size * block_size, cgh); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> nd_it) { + // 1. 
Read block from source array into SLM + const std::uint32_t lid_lin = nd_it.get_local_linear_id(); + const std::size_t gr_id_lin = nd_it.get_group_linear_id(); + + const std::size_t batch_id = gr_id_lin / (n_tiles * n_tiles); + const std::size_t rem = gr_id_lin - batch_id * (n_tiles * n_tiles); + + const auto &batch_two_offsets = batch_two_offsets_indexer(batch_id); + const auto &src_batch_offset = batch_two_offsets.get_first_offset(); + const auto &dst_batch_offset = + batch_two_offsets.get_second_offset(); + + // Block id + /* 0 <= src_gr_i1 < n_groups_n1 */ + const std::size_t src_tile_i1 = rem / n_tiles; + /* 0 <= src_gr_i0 < n_groups_n0 */ + const std::size_t src_tile_i0 = rem - src_tile_i1 * n_tiles; + + // ID of element within the block + /* 0 <= src_i1 < lws1 */ + const std::uint32_t src_i1 = lid_lin / lws0; + /* 0 <= src_i0 < lws0 */ + const std::uint32_t src_i0 = lid_lin - src_i1 * lws0; + + // Matrix element ID + const std::size_t src_tile_start0 = src_tile_i0 * block_size; + const std::size_t src_tile_start1 = src_tile_i1 * block_size; + const std::size_t src_gid0 = (src_tile_start0 + src_i0); + const std::size_t src_gid1 = (src_tile_start1 + src_i1); + + // src_offset = src_gid0 * 1 + (src_gid1 + pr_id * lws1) * + // src_stride + const std::size_t src_offset0 = + src_batch_offset + src_gid0 * 1 + src_gid1 * src_stride; + const std::size_t pr_step_src = lws1 * src_stride; + + const std::uint32_t local_offset0 = src_i0 + src_i1 * block_size; + const std::uint32_t pr_step_local = lws1 * block_size; + + for (std::uint32_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + local_block[local_offset0 + pr_step_local * pr_id] = + (src_gid0 < n && src_gid1 + pr_id * lws1 < n) + ? src_tp[src_offset0 + pr_step_src * pr_id] + : T(0); + } + + const std::uint32_t local_dim0 = static_cast( + std::min(src_tile_start0 + block_size, n) - + src_tile_start0); + const std::uint32_t local_dim1 = static_cast( + std::min(src_tile_start1 + block_size, n) - + src_tile_start1); + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 2. 
Permute the block matrix in SLM using two private arrays + std::array private_block_01 = {T(0)}; + std::array private_block_10 = {T(0)}; + + // 0 <= lid_lin < lws0 * lws1 == + // (block_size * block_size / nelems_per_wi) == + // (block_size/private_tile_size)**2 + static constexpr std::uint16_t n_private_tiles_per_axis = + block_size / private_tile_size; + const std::uint16_t local_tile_id0 = + lid_lin / n_private_tiles_per_axis; + const std::uint16_t local_tile_id1 = + lid_lin - local_tile_id0 * n_private_tiles_per_axis; + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) { + for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) { + const std::uint16_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint16_t t1_offset = + local_tile_id1 * private_tile_size; + + const std::uint16_t pr_offset = + pr_i1 * private_tile_size + pr_i0; + const std::uint16_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // read (local_tile_id0, local_tile_id1) + const std::uint16_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + private_block_01[pr_offset] = + local_block[local_01_offset]; + + // read (local_tile_id1, local_tile_id0) + const std::uint16_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + private_block_10[pr_offset] = + local_block[local_10_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) { + for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) { + const std::uint16_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint16_t t1_offset = + local_tile_id1 * private_tile_size; + const std::uint16_t pr_offset = + pr_i0 * private_tile_size + pr_i1; + + const std::uint16_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // write back permuted private blocks + const std::uint32_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + local_block[local_01_offset] = + private_block_10[pr_offset]; + + const std::uint16_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + local_block[local_10_offset] = + private_block_01[pr_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 3. 
Write out permuted SLM to destination array + + const std::size_t dst_tile_start0 = src_tile_start0; + const std::size_t dst_tile_start1 = src_tile_start1; + + if (local_dim0 == block_size && local_dim1 == block_size) { + const std::uint16_t dst_i0 = src_i1; + const std::uint16_t dst_i1 = src_i0; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset0 = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::size_t pr_step_dst = lws1 * dst_stride; + + const std::uint16_t _local_offset0 = + dst_i0 * block_size + dst_i1; + const std::uint16_t _pr_step_local = lws1 * block_size; + + for (std::uint16_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + if ((dst_gid1 < n) && ((dst_gid0 + pr_id * lws1) < n)) { + dst_tp[dst_offset0 + pr_step_dst * pr_id] = + local_block[_local_offset0 + + _pr_step_local * pr_id]; + } + } + } + else { + // map local_linear_id into (local_dim0, local_dim1) + for (std::uint16_t el_id = lid_lin; + el_id < local_dim0 * local_dim1; el_id += lws0 * lws1) + { + + // 0 <= local_i0 < local_dim0 + const std::uint16_t loc_i0 = el_id / local_dim1; + // 0 <= local_i1 < local_dim1 + const std::uint16_t loc_i1 = el_id - loc_i0 * local_dim1; + + const std::uint16_t dst_i0 = loc_i0; + const std::uint16_t dst_i1 = loc_i1; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::uint16_t local_offset = + loc_i0 * block_size + loc_i1; + + if ((dst_gid1 < n) && (dst_gid0 < n)) { + dst_tp[dst_offset] = local_block[local_offset]; + } + } + } + }); + }); + + return e; +} + +} // end of namespace detail + +template +sycl::event as_c_contiguous_1d_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + ssize_t src_batch_step, + ssize_t dst_batch_step, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = + TwoOffsets_CombinedIndexer; + + const auto &src_batch_indexer = + Strided1DIndexer(batch_nelems, src_batch_step); + const auto &dst_batch_indexer = + Strided1DIndexer(batch_nelems, dst_batch_step); + + const BatchIndexerT batch_two_indexer{src_batch_indexer, dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_indexer, n, src_p, src_ld, dst_p, + dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + std::size_t, /* number of batch elements */ + ssize_t, /* distance between batches in source array */ + ssize_t, /* distance between batches in destination array */ + std::size_t, /* size of square matrices in the batch */ + const char *, + ssize_t, /* untyped pointer to F-contig source array, and matrix leading + dimension */ + char *, + ssize_t, /* untyped pointer to C-contig destination array, and matrix + leading dimension */ + const std::vector &); + +template +struct AsCContig1DBatchOfSquareMatricesFactory +{ + fnT get() + { + return as_c_contiguous_1d_batch_of_square_matrices_impl; + } +}; + +template +sycl::event as_c_contiguous_nd_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + 
int batch_nd, + const ssize_t *src_batch_shape_strides, + const ssize_t dst_batch_step, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using SrcIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using DstIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = TwoOffsets_CombinedIndexer; + + static constexpr ssize_t zero_offset{0}; + + const SrcIndexerT src_batch_indexer{batch_nd, zero_offset, + src_batch_shape_strides}; + const DstIndexerT dst_batch_indexer{/* size */ batch_nelems, + /* step */ dst_batch_step}; + + const BatchIndexerT batch_two_offsets_indexer{src_batch_indexer, + dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_offsets_indexer, n, src_p, src_ld, + dst_p, dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + std::size_t, /* number of matrices in the batch */ + int, + const ssize_t *, /* dimensionality, and packed [shape, src_strides] + describing iteration over batch in source array */ + ssize_t, /* distance between batches in destination array */ + std::size_t, /* matrix size */ + const char *, + ssize_t, /* untyped pointer to source array of F-contig matrices, and + leading dimension of the matrix */ + char *, + ssize_t, /* untyped pointer to destination array of F-contig matrices, and + leading dimension of the matrix */ + const std::vector &); + +template +struct AsCContigNDBatchOfSquareMatricesFactory +{ + fnT get() + { + return as_c_contiguous_nd_batch_of_square_matrices_impl; + } +}; + +} // namespace copy_as_contig +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp new file mode 100644 index 000000000000..53b39ff5874c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp @@ -0,0 +1,758 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/copy_as_contiguous.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_as_contig.hpp" +#include "simplify_iteration_space.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_array_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +static as_c_contiguous_array_impl_fn_ptr_t + as_c_contig_array_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_1d_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +void init_copy_as_contig_dispatch_vectors(void) +{ + + using dpctl::tensor::kernels::copy_as_contig:: + AsCContig1DBatchOfSquareMatricesFactory; + using dpctl::tensor::kernels::copy_as_contig::AsCContigFactory; + using dpctl::tensor::kernels::copy_as_contig:: + AsCContigNDBatchOfSquareMatricesFactory; + using td_ns::DispatchVectorBuilder; + + // Generic to c-contig + DispatchVectorBuilder + dtv_as_c_contig_array; + + dtv_as_c_contig_array.populate_dispatch_vector( + as_c_contig_array_dispatch_vector); + + // 1D batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t, + AsCContig1DBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_1d_batch_of_square_matrices; + + dtv_as_c_contig_1d_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_1d_batch_of_square_matrices_dispatch_vector); + + // ND batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t, + AsCContigNDBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_nd_batch_of_square_matrices; + + dtv_as_c_contig_nd_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_nd_batch_of_square_matrices_dispatch_vector); +} + +namespace +{ + +template +std::size_t get_nelems(const std::vector &shape) +{ 
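+    // Total number of elements is the product of all extents in `shape`;
+    // for an empty shape std::accumulate returns the initial value of 1,
+    // which matches the size of a zero-dimensional array.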
+ auto mult_fn = [](std::size_t prod, const dimT &term) -> std::size_t { + return prod * static_cast(term); + }; + + static constexpr std::size_t unit{1}; + + const std::size_t nelems = + std::accumulate(std::begin(shape), std::end(shape), unit, mult_fn); + return nelems; +} + +} // end of anonymous namespace + +std::pair + py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair + py_as_c_contig(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. + */ + const int src_nd = src.get_ndim(); + const int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.back(); + if (n == dst_shape_vec[src_nd - 2]) { + static constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[src_nd - 2] == unit_stride) { + return py_as_c_contig_f2c(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + auto ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = 
shape_stride_owner.get(); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair + py_as_f_contig(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is F-contiguous. + */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_f_contiguous()) { + throw py::value_error("Destination array must be F-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.front(); + if (n == dst_shape_vec[1]) { + static constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[1] == unit_stride) { + return py_as_f_contig_c2f(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == 
src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + auto ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = shape_stride_owner.get(); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. + */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same."); + } + if (src_nd < 2) { + throw py::value_error("Arrays must have 2 or more axes"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + std::size_t nelems{1}; + bool equal_shapes = true; + + for (int i = 0; equal_shapes && (i < src_nd); ++i) { + auto sh_i = src_shape_vec[i]; + equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]); + nelems *= static_cast(sh_i); + } + + if (!equal_shapes) { + throw py::value_error("Shapes must be equal"); + } + + const auto n = src_shape_vec.back(); + if (src_shape_vec[src_nd - 2] != n) { + throw py::value_error("Matrices must be square"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_strides_vec[src_nd - 2] != py::ssize_t(1)) { + throw py::value_error("Unexpected destination array layout"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + const auto &dst_strides_vec = dst.get_strides_vector(); + + const std::size_t 
batch_nelems = + (src_nd == 2) ? std::size_t(1) : (nelems / (n * n)); + const py::ssize_t dst_batch_step = + (src_nd == 2) ? py::ssize_t(0) : dst_strides_vec[src_nd - 3]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec), + std::end(src_shape_vec) - 2); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec), + std::end(src_strides_vec) - 2); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec), + std::end(dst_strides_vec) - 2); + } + + // simplify batch iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) + { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + [src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], all_depends); + + // async free of shape_strides temporary + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {ascontig_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, 
dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is F-contiguous. + */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same."); + } + if (src_nd < 2) { + throw py::value_error("Arrays must have 2 or more axes"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_f_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + std::size_t nelems{1}; + bool equal_shapes = true; + + for (int i = 0; equal_shapes && (i < src_nd); ++i) { + auto sh_i = src_shape_vec[i]; + equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]); + nelems *= static_cast(sh_i); + } + + if (!equal_shapes) { + throw py::value_error("Shapes must be equal"); + } + + const auto n = dst_shape_vec.front(); + if (dst_shape_vec[1] != n) { + throw py::value_error("Matrices must be square"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_strides_vec[1] != py::ssize_t(1)) { + throw py::value_error("Unexpected destination array layout"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + const auto &dst_strides_vec = dst.get_strides_vector(); + + const std::size_t batch_nelems = + (src_nd == 2) ? std::size_t(1) : (nelems / (n * n)); + const py::ssize_t dst_batch_step = + (src_nd == 2) ? 
py::ssize_t(0) : dst_strides_vec[2]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec) + 2, + std::end(src_shape_vec)); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec) + 2, + std::end(src_strides_vec)); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec) + 2, + std::end(dst_strides_vec)); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) + { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + [src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], all_depends); + + // async free of shape_strides + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {ascontig_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + 
ascontig_ev); +} + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp new file mode 100644 index 000000000000..2de67098b7fa --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp @@ -0,0 +1,61 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#pragma once + +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +std::pair + py_as_c_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +std::pair + py_as_f_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +void init_copy_as_contig_dispatch_vectors(void); + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl From 5a9c14cd5ac07cf0a79da70e67b1cd9c28f063c6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:16:36 -0800 Subject: [PATCH 05/38] Add copy_usm_ndarray_into_usm_ndarray implementation --- .../source/copy_and_cast_usm_to_usm.cpp | 310 ++++++++++++++++++ .../source/copy_and_cast_usm_to_usm.hpp | 60 ++++ 2 files changed, 370 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp new file mode 100644 index 000000000000..0458aa75ac32 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -0,0 +1,310 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include +#include + +#include "kernels/copy_and_cast.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "copy_as_contig.hpp" +#include "simplify_iteration_space.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_1d_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_generic_fn_ptr_t; + +static copy_and_cast_generic_fn_ptr_t + copy_and_cast_generic_dispatch_table[td_ns::num_types][td_ns::num_types]; +static copy_and_cast_1d_fn_ptr_t + copy_and_cast_1d_dispatch_table[td_ns::num_types][td_ns::num_types]; +static copy_and_cast_contig_fn_ptr_t + copy_and_cast_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +namespace py = pybind11; + +using dpctl::utils::keep_args_alive; + +std::pair copy_usm_ndarray_into_usm_ndarray( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}) +{ + // array dimensions must be the same + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Array dimensions are not the same."); + } + + // shapes must be the same + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + std::size_t src_nelems(1); + + for (int i = 0; shapes_equal && (i < src_nd); ++i) { + src_nelems *= static_cast(src_shape[i]); + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + if (src_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + // check that arrays do not overlap, and concurrent copying is safe. 
+ auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + // TODO: could use a temporary, but this is done by the caller + throw py::value_error("Arrays index overlapping segments of memory"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + // check for applicability of special cases: + // (both C-contiguous || both F-contiguous) + bool both_c_contig = (is_src_c_contig && is_dst_c_contig); + bool both_f_contig = (is_src_f_contig && is_dst_f_contig); + if (both_c_contig || both_f_contig) { + + sycl::event copy_ev; + if (src_type_id == dst_type_id) { + + int src_elem_size = src.get_elemsize(); + + copy_ev = exec_q.memcpy(static_cast(dst_data), + static_cast(src_data), + src_nelems * src_elem_size, depends); + } + else { + auto contig_fn = + copy_and_cast_contig_dispatch_table[dst_type_id][src_type_id]; + copy_ev = + contig_fn(exec_q, src_nelems, src_data, dst_data, depends); + } + // make sure src and dst are not GC-ed before copy_ev is complete + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + if ((src_type_id == dst_type_id) && (src_nd > 1)) { + if (is_dst_c_contig) { + return py_as_c_contig(src, dst, exec_q, depends); + } + else if (is_dst_f_contig) { + return py_as_f_contig(src, dst, exec_q, depends); + } + } + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, shape, src_strides, dst_strides, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (nd < 2) { + if (nd == 1) { + std::array shape_arr = {simplified_shape[0]}; + std::array src_strides_arr = { + simplified_src_strides[0]}; + std::array dst_strides_arr = { + simplified_dst_strides[0]}; + + sycl::event copy_and_cast_1d_event; + if ((src_strides_arr[0] == 1) && (dst_strides_arr[0] == 1) && + (src_offset == 0) && (dst_offset == 0)) + { + auto contig_fn = + copy_and_cast_contig_dispatch_table[dst_type_id] + [src_type_id]; + copy_and_cast_1d_event = + contig_fn(exec_q, src_nelems, src_data, dst_data, depends); + } + else { + auto fn = + copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + copy_and_cast_1d_event = + fn(exec_q, src_nelems, shape_arr, src_strides_arr, + dst_strides_arr, src_data, src_offset, dst_data, + dst_offset, depends); + } + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_1d_event}), + copy_and_cast_1d_event); + } + else if (nd == 0) { // case of a scalar + assert(src_nelems == 1); + std::array shape_arr = {1}; + std::array src_strides_arr = {1}; + std::array dst_strides_arr = {1}; + + auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + + sycl::event copy_and_cast_0d_event = fn( + exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, + src_data, src_offset, dst_data, dst_offset, depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_0d_event}), + copy_and_cast_0d_event); + } + } + + // Generic implementation + auto 
copy_and_cast_fn = + copy_and_cast_generic_dispatch_table[dst_type_id][src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + const sycl::event ©_and_cast_generic_ev = copy_and_cast_fn( + exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data, + dst_offset, depends, {copy_shape_ev}); + + // async free of shape_strides temporary + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_and_cast_generic_ev}, shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_and_cast_generic_ev); +} + +void init_copy_and_cast_usm_to_usm_dispatch_tables(void) +{ + using namespace td_ns; + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastContigFactory; + DispatchTableBuilder + dtb_contig; + dtb_contig.populate_dispatch_table(copy_and_cast_contig_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastGenericFactory; + DispatchTableBuilder + dtb_generic; + dtb_generic.populate_dispatch_table(copy_and_cast_generic_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCast1DFactory; + DispatchTableBuilder + dtb_1d; + dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp new file mode 100644 index 000000000000..d2a2dcaf7b85 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp @@ -0,0 +1,60 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair copy_usm_ndarray_into_usm_ndarray( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_and_cast_usm_to_usm_dispatch_tables(); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From 4f6334054fc08df7c2c2f7657bc5f4569ee4363a Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:18:36 -0800 Subject: [PATCH 06/38] Add pybind11 bindings for dpctl_ext.tensor._tensor_impl --- .../tensor/libtensor/source/tensor_ctors.cpp | 502 ++++++++++++++++++ 1 file changed, 502 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp new file mode 100644 index 000000000000..b41b5c9ce423 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -0,0 +1,502 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" + +// #include "accumulators.hpp" +// #include "boolean_advanced_indexing.hpp" +// #include "clip.hpp" +#include "copy_and_cast_usm_to_usm.hpp" +#include "copy_as_contig.hpp" +// #include "copy_for_reshape.hpp" +// #include "copy_for_roll.hpp" +// #include "copy_numpy_ndarray_into_usm_ndarray.hpp" +// #include "device_support_queries.hpp" +// #include "eye_ctor.hpp" +// #include "full_ctor.hpp" +// #include "integer_advanced_indexing.hpp" +#include "kernels/dpctl_tensor_types.hpp" +// #include "linear_sequences.hpp" +// #include "repeat.hpp" +#include "simplify_iteration_space.hpp" +// #include "triul_ctor.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/strided_iters.hpp" +// #include "where.hpp" +// #include "zeros_ctor.hpp" + +namespace py = pybind11; + +static_assert(std::is_same_v); + +namespace +{ + +using dpctl::tensor::c_contiguous_strides; +using dpctl::tensor::f_contiguous_strides; + +using dpctl::tensor::overlap::MemoryOverlap; +using dpctl::tensor::overlap::SameLogicalTensors; + +using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray; +using dpctl::tensor::py_internal::py_as_c_contig; +using dpctl::tensor::py_internal::py_as_f_contig; + +/* =========================== Copy for reshape ============================= */ + +// using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; + +/* =========================== Copy for roll ============================= */ + +// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; +// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; + +/* ============= Copy from numpy.ndarray to usm_ndarray ==================== */ + +// using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; + +/* ============= linear-sequence ==================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; + +/* ================ Full ================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_full; + +/* ================ Zeros ================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_zeros; + +/* ============== Advanced Indexing ============= */ +// using dpctl::tensor::py_internal::usm_ndarray_put; +// using dpctl::tensor::py_internal::usm_ndarray_take; + +// using dpctl::tensor::py_internal::py_extract; +// using dpctl::tensor::py_internal::py_mask_positions; +// using dpctl::tensor::py_internal::py_nonzero; +// using dpctl::tensor::py_internal::py_place; + +/* ================= 
Repeat ====================*/ +// using dpctl::tensor::py_internal::py_cumsum_1d; +// using dpctl::tensor::py_internal::py_repeat_by_scalar; +// using dpctl::tensor::py_internal::py_repeat_by_sequence; + +/* ================ Eye ================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_eye; + +/* =========================== Tril and triu ============================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_triul; + +/* =========================== Where ============================== */ + +// using dpctl::tensor::py_internal::py_where; + +/* =========================== Clip ============================== */ +// using dpctl::tensor::py_internal::py_clip; + +// populate dispatch tables +void init_dispatch_tables(void) +{ + using namespace dpctl::tensor::py_internal; + + init_copy_and_cast_usm_to_usm_dispatch_tables(); + // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); + // init_advanced_indexing_dispatch_tables(); + // init_where_dispatch_tables(); + return; +} + +// populate dispatch vectors +void init_dispatch_vectors(void) +{ + using namespace dpctl::tensor::py_internal; + + init_copy_as_contig_dispatch_vectors(); + // init_copy_for_reshape_dispatch_vectors(); + // init_copy_for_roll_dispatch_vectors(); + // init_linear_sequences_dispatch_vectors(); + // init_full_ctor_dispatch_vectors(); + // init_zeros_ctor_dispatch_vectors(); + // init_eye_ctor_dispatch_vectors(); + // init_triul_ctor_dispatch_vectors(); + + // populate_masked_extract_dispatch_vectors(); + // populate_masked_place_dispatch_vectors(); + + // populate_mask_positions_dispatch_vectors(); + + // populate_cumsum_1d_dispatch_vectors(); + // init_repeat_dispatch_vectors(); + + // init_clip_dispatch_vectors(); + + return; +} + +} // namespace + +PYBIND11_MODULE(_tensor_impl, m) +{ + init_dispatch_tables(); + init_dispatch_vectors(); + + using dpctl::tensor::strides::contract_iter; + m.def( + "_contract_iter", &contract_iter, + "Simplifies iteration of array of given shape & stride. Returns " + "a triple: shape, stride and offset for the new iterator of possible " + "smaller dimension, which traverses the same elements as the original " + "iterator, possibly in a different order."); + + m.def("_copy_usm_ndarray_into_usm_ndarray", + ©_usm_ndarray_into_usm_ndarray, + "Copies from usm_ndarray `src` into usm_ndarray `dst` of the same " + "shape. " + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_as_c_contig", &py_as_c_contig, + "Copies from usm_ndarray `src` into C-contiguous usm_ndarray " + "`dst` of the same shape and the same data type. " + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_as_f_contig", &py_as_f_contig, + "Copies from usm_ndarray `src` into F-contiguous usm_ndarray " + "`dst` of the same shape and the same data type. " + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + using dpctl::tensor::strides::contract_iter2; + m.def( + "_contract_iter2", &contract_iter2, + "Simplifies iteration over elements of pair of arrays of given shape " + "with strides stride1 and stride2. 
Returns " + "a 5-tuple: shape, stride and offset for the new iterator of possible " + "smaller dimension for each array, which traverses the same elements " + "as the original " + "iterator, possibly in a different order."); + + using dpctl::tensor::strides::contract_iter3; + m.def( + "_contract_iter3", &contract_iter3, + "Simplifies iteration over elements of 3-tuple of arrays of given " + "shape " + "with strides stride1, stride2, and stride3. Returns " + "a 7-tuple: shape, stride and offset for the new iterator of possible " + "smaller dimension for each array, which traverses the same elements " + "as the original " + "iterator, possibly in a different order."); + + using dpctl::tensor::strides::contract_iter4; + m.def( + "_contract_iter4", &contract_iter4, + "Simplifies iteration over elements of 4-tuple of arrays of given " + "shape " + "with strides stride1, stride2, stride3, and stride4. Returns " + "a 9-tuple: shape, stride and offset for the new iterator of possible " + "smaller dimension for each array, which traverses the same elements " + "as the original " + "iterator, possibly in a different order."); + + static constexpr char orderC = 'C'; + m.def( + "_ravel_multi_index", + [](const std::vector &mi, + const std::vector &shape, char order = 'C') { + if (order == orderC) { + return dpctl::tensor::py_internal::_ravel_multi_index_c(mi, + shape); + } + else { + return dpctl::tensor::py_internal::_ravel_multi_index_f(mi, + shape); + } + }, + ""); + + m.def( + "_unravel_index", + [](py::ssize_t flat_index, const std::vector &shape, + char order = 'C') { + if (order == orderC) { + return dpctl::tensor::py_internal::_unravel_index_c(flat_index, + shape); + } + else { + return dpctl::tensor::py_internal::_unravel_index_f(flat_index, + shape); + } + }, + ""); + + // m.def("_copy_usm_ndarray_for_reshape", ©_usm_ndarray_for_reshape, + // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same + // " "number of elements using underlying 'C'-contiguous order for + // flat " "traversal. " "Returns a tuple of events: (ht_event, + // comp_event)", py::arg("src"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_copy_usm_ndarray_for_roll_1d", ©_usm_ndarray_for_roll_1d, + // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same + // " "shapes using underlying 'C'-contiguous order for flat " + // "traversal with shift. " + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("src"), py::arg("dst"), py::arg("shift"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_copy_usm_ndarray_for_roll_nd", ©_usm_ndarray_for_roll_nd, + // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same + // " "shapes using underlying 'C'-contiguous order for " "traversal + // with shifts along each axis. " "Returns a tuple of events: + // (ht_event, comp_event)", py::arg("src"), py::arg("dst"), + // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = + // py::list()); + + // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, + // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + // "specified by " + // "starting point `start` and step `dt`. 
" + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("start"), py::arg("dt"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, + // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + // "specified by " + // "starting point `start` and end point `end`. " + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("start"), py::arg("end"), py::arg("dst"), + // py::arg("include_endpoint"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_copy_numpy_ndarray_into_usm_ndarray", + // ©_numpy_ndarray_into_usm_ndarray, + // "Copy from numpy array `src` into usm_ndarray `dst` + // synchronously.", py::arg("src"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, + // "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_full_usm_ndarray", &usm_ndarray_full, + // "Populate usm_ndarray `dst` with given fill_value.", + // py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_take", &usm_ndarray_take, + // "Takes elements at usm_ndarray indices `ind` and axes starting " + // "at axis `axis_start` from array `src` and copies them " + // "into usm_ndarray `dst` synchronously." + // "Returns a tuple of events: (hev, ev)", + // py::arg("src"), py::arg("ind"), py::arg("dst"), + // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_put", &usm_ndarray_put, + // "Puts elements at usm_ndarray indices `ind` and axes starting " + // "at axis `axis_start` into array `dst` from " + // "usm_ndarray `val` synchronously." + // "Returns a tuple of events: (hev, ev)", + // py::arg("dst"), py::arg("ind"), py::arg("val"), + // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_eye", &usm_ndarray_eye, + // "Fills input 2D contiguous usm_ndarray `dst` with " + // "zeros outside of the diagonal " + // "specified by " + // "the diagonal index `k` " + // "which is filled with ones." 
+ // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("k"), py::arg("dst"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("default_device_fp_type", + // dpctl::tensor::py_internal::default_device_fp_type, + // "Gives default floating point type supported by device.", + // py::arg("dev")); + + // m.def("default_device_int_type", + // dpctl::tensor::py_internal::default_device_int_type, + // "Gives default signed integer type supported by device.", + // py::arg("dev")); + + // m.def("default_device_uint_type", + // dpctl::tensor::py_internal::default_device_uint_type, + // "Gives default unsigned integer type supported by device.", + // py::arg("dev")); + + // m.def("default_device_bool_type", + // dpctl::tensor::py_internal::default_device_bool_type, + // "Gives default boolean type supported by device.", py::arg("dev")); + + // m.def("default_device_complex_type", + // dpctl::tensor::py_internal::default_device_complex_type, + // "Gives default complex floating point type supported by device.", + // py::arg("dev")); + + // m.def("default_device_index_type", + // dpctl::tensor::py_internal::default_device_index_type, + // "Gives default index type supported by device.", py::arg("dev")); + + // auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + // sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends); + // }; + // m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"), + // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // auto triu_fn = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + // sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends); + // }; + // m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), + // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), + // py::arg("cumsum"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), + // py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto overlap = [](const dpctl::tensor::usm_ndarray &x1, + const dpctl::tensor::usm_ndarray &x2) -> bool { + auto const &overlap = MemoryOverlap(); + return overlap(x1, x2); + }; + m.def("_array_overlap", overlap, + "Determines if the memory regions indexed by each array overlap", + py::arg("array1"), py::arg("array2")); + + // auto same_logical_tensors = + // [](const dpctl::tensor::usm_ndarray &x1, + // const dpctl::tensor::usm_ndarray &x2) -> bool { + // auto const &same_logical_tensors = SameLogicalTensors(); + // return same_logical_tensors(x1, x2); + // }; + // m.def("_same_logical_tensors", same_logical_tensors, + // "Determines if the memory regions indexed by each array are the + // same", py::arg("array1"), py::arg("array2")); + + // m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), + // py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), + // 
py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"), + // py::arg("mask_shape"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"), + // py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, + // const dpctl::tensor::usm_ndarray &reps, + // const dpctl::tensor::usm_ndarray &cumsum, + // std::optional axis, sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // if (axis) { + // return py_repeat_by_sequence(src, dst, reps, cumsum, + // axis.value(), + // exec_q, depends); + // } + // else { + // return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q, + // depends); + // } + // }; + // m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"), + // py::arg("dst"), py::arg("reps"), py::arg("cumsum"), + // py::arg("axis"), py::arg("sycl_queue"), py::arg("depends") = + // py::list()); + + // auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, + // const py::ssize_t reps, std::optional axis, + // sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // if (axis) { + // return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q, + // depends); + // } + // else { + // return py_repeat_by_scalar(src, dst, reps, exec_q, depends); + // } + // }; + // m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), py::arg("dst"), + // py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_clip", &py_clip, + // "Clamps elements of array `x` to the range " + // "[`min`, `max] and writes the result to the " + // "array `dst` for each element of `x`, `min`, and `max`." + // "Returns a tuple of events: (hev, ev)", + // py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); +} From 634579c5f0d64d44805d0a020cb4ca5ae1d5e774 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:24:11 -0800 Subject: [PATCH 07/38] Add CMake build files for dpctl_ext --- dpctl_ext/CMakeLists.txt | 205 ++++++++++++++++++++++++++++++++ dpctl_ext/tensor/CMakeLists.txt | 175 +++++++++++++++++++++++++++ 2 files changed, 380 insertions(+) create mode 100644 dpctl_ext/CMakeLists.txt create mode 100644 dpctl_ext/tensor/CMakeLists.txt diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt new file mode 100644 index 000000000000..bb33a4f57332 --- /dev/null +++ b/dpctl_ext/CMakeLists.txt @@ -0,0 +1,205 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +find_package(Python REQUIRED COMPONENTS NumPy) + +# -t is to only Cythonize sources with timestamps newer than existing CXX files (if present) +# -w is to set working directory (and correctly set __pyx_f[] array of filenames) +set(CYTHON_FLAGS "-t -w \"${CMAKE_SOURCE_DIR}\"") +find_package(Cython REQUIRED) + +if(WIN32) + string( + CONCAT WARNING_FLAGS + "-Wall " + "-Wextra " + "-Winit-self " + "-Wunused-function " + "-Wuninitialized " + "-Wmissing-declarations " + "-Wstrict-prototypes " + "-Wno-unused-parameter " + ) + string(CONCAT SDL_FLAGS "/GS " "/DynamicBase ") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS}") + set(CMAKE_C_FLAGS_DEBUG + "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_CXX_FLAGS_DEBUG + "${CMAKE_CXX_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_C_FLAGS_COVERAGE + "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O1 -g1 -DDEBUG" + ) + set(CMAKE_CXX_FLAGS_COVERAGE + "${CMAKE_CXX_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O1 -g1 -DDEBUG" + ) + set(CMAKE_MODULE_LINKER_FLAGS_COVERAGE "${CMAKE_MODULE_LINKER_FLAGS_DEBUG}") + set(DPCTL_LDFLAGS "/NXCompat;/DynamicBase") + mark_as_advanced( + CMAKE_CXX_FLAGS_COVERAGE + CMAKE_C_FLAGS_COVERAGE + CMAKE_MODULE_LINKER_FLAGS_COVERAGE + ) +elseif(UNIX) + string( + CONCAT WARNING_FLAGS + "-Wall " + "-Wextra " + "-Winit-self " + "-Wunused-function " + "-Wuninitialized " + "-Wmissing-declarations " + "-Wstrict-prototypes " + "-Wno-unused-parameter " + "-fdiagnostics-color=auto " + ) + string( + CONCAT SDL_FLAGS + "-fstack-protector " + "-fstack-protector-all " + "-fpic " + "-fPIC " + "-D_FORTIFY_SOURCE=2 " + "-Wformat " + "-Wformat-security " + # "-fno-strict-overflow " # no-strict-overflow is implied by -fwrapv + "-fno-delete-null-pointer-checks " + "-fwrapv " + ) + string(CONCAT CFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}") + string(CONCAT CXXFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 ${CFLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 ${CXXFLAGS}") + set(CMAKE_C_FLAGS_DEBUG + "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O0 -g -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_CXX_FLAGS_DEBUG + "${CMAKE_CXX_FLAGS_DEBUG} ${CXXFLAGS} -O0 -g -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + 
set(CMAKE_C_FLAGS_COVERAGE "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O1 -g1 -DDEBUG") + set(CMAKE_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG} ${CXXFLAGS} -O1 -g1 -DDEBUG") + set(CMAKE_MODULE_LINKER_FLAGS_COVERAGE "${CMAKE_MODULE_LINKER_FLAGS_DEBUG}") + set(DPCTL_LDFLAGS "-z,noexecstack,-z,relro,-z,now") + mark_as_advanced( + CMAKE_CXX_FLAGS_COVERAGE + CMAKE_C_FLAGS_COVERAGE + CMAKE_MODULE_LINKER_FLAGS_COVERAGE + ) +else() + message(FATAL_ERROR "Unsupported system.") +endif() + +# at build time create include/ directory and copy header files over +set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) + +set(CMAKE_INSTALL_RPATH "$ORIGIN") + +function(build_dpctl_ext _trgt _src _dest) + set(options SYCL) + cmake_parse_arguments(BUILD_DPCTL_EXT "${options}" "RELATIVE_PATH" "" ${ARGN}) + add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src) + set(_cythonize_trgt "${_trgt}_cythonize_pyx") + python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src}) + if(BUILD_DPCTL_EXT_SYCL) + add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src}) + target_compile_options(${_trgt} PRIVATE -fno-sycl-id-queries-fit-in-int) + target_link_options(${_trgt} PRIVATE -fsycl-device-code-split=per_kernel) + if(DPCTL_OFFLOAD_COMPRESS) + target_link_options(${_trgt} PRIVATE --offload-compress) + endif() + if(_dpctl_sycl_targets) + # make fat binary + target_compile_options( + ${_trgt} + PRIVATE ${_dpctl_sycl_target_compile_options} + ) + target_link_options(${_trgt} PRIVATE ${_dpctl_sycl_target_link_options}) + endif() + endif() + target_link_libraries(${_trgt} PRIVATE Python::NumPy) + if(DPCTL_GENERATE_COVERAGE) + target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1) + if(BUILD_DPCTL_EXT_SYCL) + target_compile_options(${_trgt} PRIVATE -fno-sycl-use-footer) + endif() + endif() + target_link_libraries(${_trgt} PRIVATE DPCTLSyclInterface) + set(_linker_options "LINKER:${DPCTL_LDFLAGS}") + target_link_options(${_trgt} PRIVATE ${_linker_options}) + get_filename_component(_name_wle ${_generated_src} NAME_WLE) + get_filename_component(_generated_src_dir ${_generated_src} DIRECTORY) + set(_generated_public_h "${_generated_src_dir}/${_name_wle}.h") + set(_generated_api_h "${_generated_src_dir}/${_name_wle}_api.h") + + # TODO: create separate folder inside build folder that contains only + # headers related to this target and appropriate folder structure to + # eliminate shadow dependencies + get_filename_component(_generated_src_dir_dir ${_generated_src_dir} DIRECTORY) + # TODO: do not set directory if we did not generate header + target_include_directories(${_trgt} INTERFACE ${_generated_src_dir_dir}) + set(_rpath_value "$ORIGIN") + if(BUILD_DPCTL_EXT_RELATIVE_PATH) + set(_rpath_value "${_rpath_value}/${BUILD_DPCTL_EXT_RELATIVE_PATH}") + endif() + if(DPCTL_WITH_REDIST) + set(_rpath_value "${_rpath_value}:${_rpath_value}/../../..") + endif() + set_target_properties(${_trgt} PROPERTIES INSTALL_RPATH ${_rpath_value}) + + install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest}) + install( + FILES ${_generated_api_h} + # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` + DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} + OPTIONAL + ) + install( + FILES ${_generated_public_h} + # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` + DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} + OPTIONAL + ) + if(DPCTL_GENERATE_COVERAGE) + get_filename_component(_original_src_dir ${_src} DIRECTORY) + file(RELATIVE_PATH _rel_dir 
${CMAKE_SOURCE_DIR} ${_original_src_dir}) + install(FILES ${_generated_src} DESTINATION ${CMAKE_INSTALL_PREFIX}/${_rel_dir}) + endif() + + # Create target with headers only, because python is managing all the + # library imports at runtime + set(_trgt_headers ${_trgt}_headers) + add_library(${_trgt_headers} INTERFACE) + add_dependencies(${_trgt_headers} ${_trgt}) + get_target_property(_trgt_headers_dir ${_trgt} INTERFACE_INCLUDE_DIRECTORIES) + target_include_directories(${_trgt_headers} INTERFACE ${_trgt_headers_dir}) +endfunction() + +add_subdirectory(tensor) diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt new file mode 100644 index 000000000000..ed8294b76615 --- /dev/null +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -0,0 +1,175 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +if(WIN32) + if(${CMAKE_VERSION} VERSION_LESS "3.23") + # this is a work-around for target_link_options inserting option after -link option, cause + # linker to ignore it. 
+ set(CMAKE_CXX_LINK_FLAGS + "${CMAKE_CXX_LINK_FLAGS} -fsycl-device-code-split=per_kernel" + ) + endif() +endif() + +set(_static_lib_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp +) +set(_tensor_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp +) + +set(_static_lib_trgt simplify_iteration_space) + +add_library(${_static_lib_trgt} STATIC ${_static_lib_sources}) +target_include_directories( + ${_static_lib_trgt} + PRIVATE + ${Python_INCLUDE_DIRS} + ${DPCTL_INCLUDE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include +) +target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers ${Python_LIBRARIES}) +set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON) + +set(_py_trgts) + +set(python_module_name _tensor_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + +set(_clang_prefix "") +if(WIN32) + set(_clang_prefix "/clang:") +endif() + +set(_no_fast_math_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp +) +list( + APPEND _no_fast_math_sources + # ${_elementwise_sources} + # ${_reduction_sources} + # ${_sorting_sources} + # ${_linalg_sources} + # ${_accumulator_sources} +) + +foreach(_src_fn ${_no_fast_math_sources}) + get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) + set(_combined_options_prop ${_cmpl_options_prop} "${_clang_prefix}-fno-fast-math") + set_source_files_properties( + ${_src_fn} + PROPERTIES COMPILE_OPTIONS "${_combined_options_prop}" + ) +endforeach() + +set(_compiler_definitions "") + +set(_linker_options "LINKER:${DPCTL_LDFLAGS}") +foreach(python_module_name ${_py_trgts}) + target_compile_options( + ${python_module_name} + PRIVATE -fno-sycl-id-queries-fit-in-int + ) + target_link_options( + ${python_module_name} + PRIVATE -fsycl-device-code-split=per_kernel + ) + if(DPCTL_OFFLOAD_COMPRESS) + 
target_link_options(${python_module_name} PRIVATE --offload-compress) + endif() + + target_include_directories( + ${python_module_name} + PRIVATE + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${Dpctl_INCLUDE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ + ) + target_link_options(${python_module_name} PRIVATE ${_linker_options}) + if(DPCTL_GENERATE_COVERAGE) + if(DPCTL_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS) + target_compile_options( + ${python_module_name} + PRIVATE -fprofile-instr-generate -fcoverage-mapping + ) + endif() + target_link_options( + ${python_module_name} + PRIVATE -fprofile-instr-generate -fcoverage-mapping + ) + endif() + if(_dpctl_sycl_targets) + # make fat binary + target_compile_options( + ${python_module_name} + PRIVATE ${_dpctl_sycl_target_compile_options} + ) + target_link_options( + ${python_module_name} + PRIVATE ${_dpctl_sycl_target_link_options} + ) + endif() + # TODO: update source so they reference individual libraries instead of + # dpctl4pybind11.hpp. It will allow to simplify dependency tree + # NOTE: dpctl C-API is resolved at runtime via Python + # target_link_libraries(${python_module_name} PRIVATE DpctlCAPI) + if(DPCTL_WITH_REDIST) + set_target_properties( + ${python_module_name} + PROPERTIES INSTALL_RPATH "$ORIGIN/../../../.." + ) + endif() + # TODO: revert to `DESTINATION "dpctl/tensor"` + install(TARGETS ${python_module_name} DESTINATION "dpctl_ext/tensor") +endforeach() From 79d40f235d10d1b9d514d9db07939d0bb447086c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:31:12 -0800 Subject: [PATCH 08/38] Add empty __init__ to dpctl_ext/ --- dpctl_ext/__init__.py | 27 +++++++++++++++++++++++++++ dpctl_ext/tensor/__init__.py | 27 +++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 dpctl_ext/__init__.py create mode 100644 dpctl_ext/tensor/__init__.py diff --git a/dpctl_ext/__init__.py b/dpctl_ext/__init__.py new file mode 100644 index 000000000000..a71324cb88d8 --- /dev/null +++ b/dpctl_ext/__init__.py @@ -0,0 +1,27 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py new file mode 100644 index 000000000000..a71324cb88d8 --- /dev/null +++ b/dpctl_ext/tensor/__init__.py @@ -0,0 +1,27 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** From 7949c17c3586a4ad0222c6abbf3a616202834c68 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:53:03 -0800 Subject: [PATCH 09/38] Enable _same_logical_tensors in _tensor_impl --- .../tensor/libtensor/source/tensor_ctors.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index b41b5c9ce423..ca3b7bd49116 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -430,15 +430,15 @@ PYBIND11_MODULE(_tensor_impl, m) "Determines if the memory regions indexed by each array overlap", py::arg("array1"), py::arg("array2")); - // auto same_logical_tensors = - // [](const dpctl::tensor::usm_ndarray &x1, - // const dpctl::tensor::usm_ndarray &x2) -> bool { - // auto const &same_logical_tensors = SameLogicalTensors(); - // return same_logical_tensors(x1, x2); - // }; - // m.def("_same_logical_tensors", same_logical_tensors, - // "Determines if the memory regions indexed by each array are the - // same", py::arg("array1"), py::arg("array2")); + auto same_logical_tensors = + [](const dpctl::tensor::usm_ndarray &x1, + const dpctl::tensor::usm_ndarray &x2) -> bool { + auto const &same_logical_tensors = SameLogicalTensors(); + return same_logical_tensors(x1, x2); + }; + m.def("_same_logical_tensors", same_logical_tensors, + "Determines if the memory regions indexed by each array are the same", + py::arg("array1"), py::arg("array2")); // m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), // py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), From 29d6c029190714cab8a460c02f32130c7ea59cc6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 05:14:28 -0800 Subject: [PATCH 10/38] Add device_support_queries to enable default device types --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../source/device_support_queries.cpp | 184 ++++++++++++++++++ .../source/device_support_queries.hpp | 58 ++++++ .../tensor/libtensor/source/tensor_ctors.cpp | 56 +++--- 4 files changed, 271 insertions(+), 29 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/device_support_queries.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/device_support_queries.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ed8294b76615..ee8da2e49506 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -56,7 +56,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp new file mode 100644 index 000000000000..51eb7dba1b6c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp @@ -0,0 +1,184 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace +{ + +std::string _default_device_fp_type(const sycl::device &d) +{ + if (d.has(sycl::aspect::fp64)) { + return "f8"; + } + else { + return "f4"; + } +} + +int get_numpy_major_version() +{ + namespace py = pybind11; + + py::module_ numpy = py::module_::import("numpy"); + py::str version_string = numpy.attr("__version__"); + py::module_ numpy_lib = py::module_::import("numpy.lib"); + + py::object numpy_version = numpy_lib.attr("NumpyVersion")(version_string); + int major_version = numpy_version.attr("major").cast(); + + return major_version; +} + +std::string _default_device_int_type(const sycl::device &) +{ + const int np_ver = get_numpy_major_version(); + + if (np_ver >= 2) { + return "i8"; + } + else { + // code for numpy.dtype('long') to be consistent + // with NumPy's default integer type across + // platforms. + return "l"; + } +} + +std::string _default_device_uint_type(const sycl::device &) +{ + const int np_ver = get_numpy_major_version(); + + if (np_ver >= 2) { + return "u8"; + } + else { + // code for numpy.dtype('long') to be consistent + // with NumPy's default integer type across + // platforms. 
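+        // "L" is the array-interface typestr for numpy.dtype('ulong'), the unsigned counterpart of the "l" returned by _default_device_int_type above.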
+ return "L"; + } +} + +std::string _default_device_complex_type(const sycl::device &d) +{ + if (d.has(sycl::aspect::fp64)) { + return "c16"; + } + else { + return "c8"; + } +} + +std::string _default_device_bool_type(const sycl::device &) +{ + return "b1"; +} + +std::string _default_device_index_type(const sycl::device &) +{ + return "i8"; +} + +sycl::device _extract_device(const py::object &arg) +{ + auto const &api = dpctl::detail::dpctl_capi::get(); + + PyObject *source = arg.ptr(); + if (api.PySyclQueue_Check_(source)) { + const sycl::queue &q = py::cast(arg); + return q.get_device(); + } + else if (api.PySyclDevice_Check_(source)) { + return py::cast(arg); + } + else { + throw py::type_error( + "Expected type `dpctl.SyclQueue` or `dpctl.SyclDevice`."); + } +} + +} // namespace + +std::string default_device_fp_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_fp_type(d); +} + +std::string default_device_int_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_int_type(d); +} + +std::string default_device_uint_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_uint_type(d); +} + +std::string default_device_bool_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_bool_type(d); +} + +std::string default_device_complex_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_complex_type(d); +} + +std::string default_device_index_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_index_type(d); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp new file mode 100644 index 000000000000..6ea01dcd49d7 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp @@ -0,0 +1,58 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::string default_device_fp_type(const py::object &); +extern std::string default_device_int_type(const py::object &); +extern std::string default_device_uint_type(const py::object &); +extern std::string default_device_bool_type(const py::object &); +extern std::string default_device_complex_type(const py::object &); +extern std::string default_device_index_type(const py::object &); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index ca3b7bd49116..911d75ebd925 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -52,7 +52,7 @@ // #include "copy_for_reshape.hpp" // #include "copy_for_roll.hpp" // #include "copy_numpy_ndarray_into_usm_ndarray.hpp" -// #include "device_support_queries.hpp" +#include "device_support_queries.hpp" // #include "eye_ctor.hpp" // #include "full_ctor.hpp" // #include "integer_advanced_indexing.hpp" @@ -360,33 +360,33 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("k"), py::arg("dst"), py::arg("sycl_queue"), // py::arg("depends") = py::list()); - // m.def("default_device_fp_type", - // dpctl::tensor::py_internal::default_device_fp_type, - // "Gives default floating point type supported by device.", - // py::arg("dev")); - - // m.def("default_device_int_type", - // dpctl::tensor::py_internal::default_device_int_type, - // "Gives default signed integer type supported by device.", - // py::arg("dev")); - - // m.def("default_device_uint_type", - // dpctl::tensor::py_internal::default_device_uint_type, - // "Gives default unsigned integer type supported by device.", - // py::arg("dev")); - - // m.def("default_device_bool_type", - // dpctl::tensor::py_internal::default_device_bool_type, - // "Gives default boolean type supported by device.", py::arg("dev")); - - // m.def("default_device_complex_type", - // dpctl::tensor::py_internal::default_device_complex_type, - // "Gives default complex floating point type supported by device.", - // py::arg("dev")); - - // m.def("default_device_index_type", - // dpctl::tensor::py_internal::default_device_index_type, - // "Gives default index type supported by device.", py::arg("dev")); + m.def("default_device_fp_type", + dpctl::tensor::py_internal::default_device_fp_type, + "Gives default floating point type supported by device.", + py::arg("dev")); + + m.def("default_device_int_type", + 
dpctl::tensor::py_internal::default_device_int_type, + "Gives default signed integer type supported by device.", + py::arg("dev")); + + m.def("default_device_uint_type", + dpctl::tensor::py_internal::default_device_uint_type, + "Gives default unsigned integer type supported by device.", + py::arg("dev")); + + m.def("default_device_bool_type", + dpctl::tensor::py_internal::default_device_bool_type, + "Gives default boolean type supported by device.", py::arg("dev")); + + m.def("default_device_complex_type", + dpctl::tensor::py_internal::default_device_complex_type, + "Gives default complex floating point type supported by device.", + py::arg("dev")); + + m.def("default_device_index_type", + dpctl::tensor::py_internal::default_device_index_type, + "Gives default index type supported by device.", py::arg("dev")); // auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, From 936e7198e2014330b34c5918a63230ea699e063e Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 05:52:17 -0800 Subject: [PATCH 11/38] Enable building and packaging of dpctl_ext --- CMakeLists.txt | 1 + setup.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 386b17b44294..d2ee5e84c0c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -336,3 +336,4 @@ if(DEFINED SKBUILD) endif() add_subdirectory(dpnp) +add_subdirectory(dpctl_ext) diff --git a/setup.py b/setup.py index cc21221299c4..a0c54b066dcf 100644 --- a/setup.py +++ b/setup.py @@ -44,6 +44,9 @@ "dpnp.scipy", "dpnp.scipy.linalg", "dpnp.scipy.special", + # dpctl_ext + "dpctl_ext", + "dpctl_ext.tensor", ], package_data={ "dpnp": [ From cd85f1e333bcad154272946f71c127b9ea9a916b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 06:14:39 -0800 Subject: [PATCH 12/38] Use _tensor_impl from dpctl_ext.tensor in dpnp --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 2 +- dpnp/dpnp_iface.py | 2 +- dpnp/dpnp_iface_searching.py | 2 +- dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 2 +- dpnp/scipy/linalg/_utils.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 57bf50422fa0..b63bf61f8dad 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -31,7 +31,6 @@ import dpctl.tensor as dpt import dpctl.tensor._copy_utils as dtc -import dpctl.tensor._tensor_impl as dti import dpctl.tensor._type_utils as dtu import dpctl.utils as dpu import numpy @@ -45,6 +44,7 @@ _validate_dtype, ) +import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi from dpnp.dpnp_array import dpnp_array diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index fba1a215756a..832446c826ba 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -45,11 +45,11 @@ import dpctl import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._device import normalize_queue_device +import dpctl_ext.tensor._tensor_impl as ti import dpnp from .dpnp_array import dpnp_array diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 6eefe010b699..fdbd317d31dd 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -40,8 +40,8 @@ """ import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as dti +import dpctl_ext.tensor._tensor_impl as dti import dpnp from 
.dpnp_array import dpnp_array diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 30be5d1ff5cb..4d8e3cdfbd0d 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -28,7 +28,6 @@ import dpctl import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( @@ -38,6 +37,7 @@ ) from dpctl.utils import ExecutionPlacementError +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.blas._blas_impl as bi from dpnp.dpnp_array import dpnp_array diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py index 282c645d1095..8eb9187236bf 100644 --- a/dpnp/scipy/linalg/_utils.py +++ b/dpnp/scipy/linalg/_utils.py @@ -42,9 +42,9 @@ from warnings import warn -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li from dpnp.dpnp_utils import get_usm_allocations From 0c6780a8f8b45e87263fbf316bc17aac5ed91dc1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 09:56:50 -0800 Subject: [PATCH 13/38] Move put() and take() to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- dpctl_ext/tensor/__init__.py | 11 + dpctl_ext/tensor/_indexing_functions.py | 329 +++++++ dpctl_ext/tensor/_numpy_helper.py | 45 + .../kernels/integer_advanced_indexing.hpp | 427 +++++++++ .../source/integer_advanced_indexing.cpp | 819 ++++++++++++++++++ .../source/integer_advanced_indexing.hpp | 73 ++ .../tensor/libtensor/source/tensor_ctors.cpp | 42 +- 8 files changed, 1726 insertions(+), 22 deletions(-) create mode 100644 dpctl_ext/tensor/_indexing_functions.py create mode 100644 dpctl_ext/tensor/_numpy_helper.py create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ee8da2e49506..ae8b72d71873 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -49,7 +49,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index a71324cb88d8..35453dbf9a46 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -25,3 +25,14 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** + + +from dpctl_ext.tensor._indexing_functions import ( + put, + take, +) + +__all__ = [ + "put", + "take", +] diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py new file mode 100644 index 000000000000..106df09cf97e --- /dev/null +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -0,0 +1,329 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import operator + +import dpctl +import dpctl.tensor as dpt +import dpctl.utils + +import dpctl_ext.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_index + + +def _get_indexing_mode(name): + modes = {"wrap": 0, "clip": 1} + try: + return modes[name] + except KeyError: + raise ValueError( + "`mode` must be `wrap` or `clip`." "Got `{}`.".format(name) + ) + + +def put(x, indices, vals, /, *, axis=None, mode="wrap"): + """put(x, indices, vals, axis=None, mode="wrap") + + Puts values into an array along a given axis at given indices. + + Args: + x (usm_ndarray): + The array the values will be put into. + indices (usm_ndarray): + One-dimensional array of indices. + vals (usm_ndarray): + Array of values to be put into ``x``. + Must be broadcastable to the result shape + ``x.shape[:axis] + indices.shape + x.shape[axis+1:]``. + axis (int, optional): + The axis along which the values will be placed. + If ``x`` is one-dimensional, this argument is optional. + Default: ``None``. + mode (str, optional): + How out-of-bounds indices will be handled. Possible values + are: + + - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps + negative indices. + - ``"clip"``: clips indices to (``0 <= i < n``). + + Default: ``"wrap"``. + + .. 
note:: + + If input array ``indices`` contains duplicates, a race condition + occurs, and the value written into corresponding positions in ``x`` + may vary from run to run. Preserving sequential semantics in handing + the duplicates to achieve deterministic behavior requires additional + work, e.g. + + :Example: + + .. code-block:: python + + from dpctl import tensor as dpt + + def put_vec_duplicates(vec, ind, vals): + "Put values into vec, handling possible duplicates in ind" + assert vec.ndim, ind.ndim, vals.ndim == 1, 1, 1 + + # find positions of last occurrences of each + # unique index + ind_flipped = dpt.flip(ind) + ind_uniq = dpt.unique_all(ind_flipped).indices + has_dups = len(ind) != len(ind_uniq) + + if has_dups: + ind_uniq = dpt.subtract(vec.size - 1, ind_uniq) + ind = dpt.take(ind, ind_uniq) + vals = dpt.take(vals, ind_uniq) + + dpt.put(vec, ind, vals) + + n = 512 + ind = dpt.concat((dpt.arange(n), dpt.arange(n, -1, step=-1))) + x = dpt.zeros(ind.size, dtype="int32") + vals = dpt.arange(ind.size, dtype=x.dtype) + + # Values corresponding to last positions of + # duplicate indices are written into the vector x + put_vec_duplicates(x, ind, vals) + + parts = (vals[-1:-n-2:-1], dpt.zeros(n, dtype=x.dtype)) + expected = dpt.concat(parts) + assert dpt.all(x == expected) + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(indices) + ) + ) + if isinstance(vals, dpt.usm_ndarray): + queues_ = [x.sycl_queue, indices.sycl_queue, vals.sycl_queue] + usm_types_ = [x.usm_type, indices.usm_type, vals.usm_type] + else: + queues_ = [x.sycl_queue, indices.sycl_queue] + usm_types_ = [x.usm_type, indices.usm_type] + if indices.ndim != 1: + raise ValueError( + "`indices` expected a 1D array, got `{}`".format(indices.ndim) + ) + if indices.dtype.kind not in "ui": + raise IndexError( + "`indices` expected integer data type, got `{}`".format( + indices.dtype + ) + ) + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + vals_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + + mode = _get_indexing_mode(mode) + + x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format( + x_ndim + ) + ) + axis = 0 + + if x_ndim > 0: + axis = normalize_axis_index(operator.index(axis), x_ndim) + x_sh = x.shape + if x_sh[axis] == 0 and indices.size != 0: + raise IndexError("cannot take non-empty indices from an empty axis") + val_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :] + else: + if axis != 0: + raise ValueError("`axis` must be 0 for an array of dimension 0.") + val_shape = indices.shape + + if not isinstance(vals, dpt.usm_ndarray): + vals = dpt.asarray( + vals, dtype=x.dtype, usm_type=vals_usm_type, sycl_queue=exec_q + ) + # choose to throw here for consistency with `place` + if vals.size == 0: + raise ValueError( + "cannot put into non-empty indices along an empty axis" + ) + if vals.dtype == x.dtype: + rhs = vals + else: + rhs = dpt.astype(vals, x.dtype) + rhs = dpt.broadcast_to(rhs, val_shape) + + _manager = dpctl.utils.SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + hev, put_ev = ti._put( + x, (indices,), rhs, axis, mode, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(hev, put_ev) + + +def 
take(x, indices, /, *, axis=None, out=None, mode="wrap"): + """take(x, indices, axis=None, out=None, mode="wrap") + + Takes elements from an array along a given axis at given indices. + + Args: + x (usm_ndarray): + The array that elements will be taken from. + indices (usm_ndarray): + One-dimensional array of indices. + axis (int, optional): + The axis along which the values will be selected. + If ``x`` is one-dimensional, this argument is optional. + Default: ``None``. + out (Optional[usm_ndarray]): + Output array to populate. Array must have the correct + shape and the expected data type. + mode (str, optional): + How out-of-bounds indices will be handled. Possible values + are: + + - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps + negative indices. + - ``"clip"``: clips indices to (``0 <= i < n``). + + Default: ``"wrap"``. + + Returns: + usm_ndarray: + Array with shape + ``x.shape[:axis] + indices.shape + x.shape[axis + 1:]`` + filled with elements from ``x``. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(indices) + ) + ) + if indices.dtype.kind not in "ui": + raise IndexError( + "`indices` expected integer data type, got `{}`".format( + indices.dtype + ) + ) + if indices.ndim != 1: + raise ValueError( + "`indices` expected a 1D array, got `{}`".format(indices.ndim) + ) + exec_q = dpctl.utils.get_execution_queue([x.sycl_queue, indices.sycl_queue]) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + res_usm_type = dpctl.utils.get_coerced_usm_type( + [x.usm_type, indices.usm_type] + ) + + mode = _get_indexing_mode(mode) + + x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format( + x_ndim + ) + ) + axis = 0 + + if x_ndim > 0: + axis = normalize_axis_index(operator.index(axis), x_ndim) + x_sh = x.shape + if x_sh[axis] == 0 and indices.size != 0: + raise IndexError("cannot take non-empty indices from an empty axis") + res_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :] + else: + if axis != 0: + raise ValueError("`axis` must be 0 for an array of dimension 0.") + res_shape = indices.shape + + dt = x.dtype + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {res_shape}, got {out.shape}" + ) + if dt != out.dtype: + raise ValueError( + f"Output array of type {dt} is needed, got {out.dtype}" + ) + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpctl.utils.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if ti._array_overlap(x, out): + out = dpt.empty_like(out) + else: + out = dpt.empty( + res_shape, dtype=dt, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = dpctl.utils.SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + hev, take_ev = ti._take( + x, (indices,), out, axis, mode, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(hev, take_ev) + + if not (orig_out is None or out is orig_out): + # Copy the out data from temporary buffer to original memory + ht_e_cpy, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[take_ev] + ) + _manager.add_event_pair(ht_e_cpy, cpy_ev) + out = orig_out + + return out diff --git a/dpctl_ext/tensor/_numpy_helper.py b/dpctl_ext/tensor/_numpy_helper.py new file mode 100644 index 000000000000..4ad735823cb3 --- /dev/null +++ b/dpctl_ext/tensor/_numpy_helper.py @@ -0,0 +1,45 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + + +import numpy as np + +_npver = np.lib.NumpyVersion(np.__version__) + +if _npver < "1.25.0": # pragma: no cover + from numpy import AxisError +else: + from numpy.exceptions import AxisError + +if _npver >= "2.0.0": + from numpy._core.numeric import normalize_axis_index, normalize_axis_tuple +else: # pragma: no cover + from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple + + +__all__ = ["AxisError", "normalize_axis_index", "normalize_axis_tuple"] diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp new file mode 100644 index 000000000000..1b2c79d2e2a5 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -0,0 +1,427 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for advanced tensor index operations. 
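+/// The TakeFunctor and PutFunctor kernels gather and scatter elements along the indexed axes; the Wrap/Clip factory structs below select how out-of-range indices are projected for the type-dispatch tables.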
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/indexing_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace indexing +{ + +using dpctl::tensor::ssize_t; + +template +class TakeFunctor +{ +private: + const char *src_ = nullptr; + char *dst_ = nullptr; + char **ind_ = nullptr; + int k_ = 0; + std::size_t ind_nelems_ = 0; + const ssize_t *axes_shape_and_strides_ = nullptr; + OrthogIndexer orthog_strider; + IndicesIndexer ind_strider; + AxesIndexer axes_strider; + +public: + TakeFunctor(const char *src_cp, + char *dst_cp, + char **ind_cp, + int k, + std::size_t ind_nelems, + const ssize_t *axes_shape_and_strides, + const OrthogIndexer &orthog_strider_, + const IndicesIndexer &ind_strider_, + const AxesIndexer &axes_strider_) + : src_(src_cp), dst_(dst_cp), ind_(ind_cp), k_(k), + ind_nelems_(ind_nelems), + axes_shape_and_strides_(axes_shape_and_strides), + orthog_strider(orthog_strider_), ind_strider(ind_strider_), + axes_strider(axes_strider_) + { + } + + void operator()(sycl::id<1> id) const + { + const T *src = reinterpret_cast(src_); + T *dst = reinterpret_cast(dst_); + + ssize_t i_orthog = id / ind_nelems_; + ssize_t i_along = id - (i_orthog * ind_nelems_); + + auto orthog_offsets = orthog_strider(i_orthog); + + ssize_t src_offset = orthog_offsets.get_first_offset(); + ssize_t dst_offset = orthog_offsets.get_second_offset(); + + static constexpr ProjectorT proj{}; + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + + ssize_t ind_offset = ind_strider(i_along, axis_idx); + // proj produces an index in the range of the given axis + ssize_t projected_idx = + proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]); + src_offset += + projected_idx * axes_shape_and_strides_[k_ + axis_idx]; + } + + dst_offset += axes_strider(i_along); + + dst[dst_offset] = src[src_offset]; + } +}; + +template +class take_kernel; + +typedef sycl::event (*take_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + int, + int, + int, + const ssize_t *, + const ssize_t *, + const ssize_t *, + const char *, + char *, + char **, + ssize_t, + ssize_t, + const ssize_t *, + const std::vector &); + +template +sycl::event take_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t ind_nelems, + int nd, + int ind_nd, + int k, + const ssize_t *orthog_shape_and_strides, + const ssize_t *axes_shape_and_strides, + const ssize_t *ind_shape_and_strides, + const char *src_p, + char *dst_p, + char **ind_p, + ssize_t src_offset, + ssize_t dst_offset, + const ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event take_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using OrthogIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + const OrthogIndexerT orthog_indexer{nd, src_offset, dst_offset, + orthog_shape_and_strides}; + + using NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset; + const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets, + ind_shape_and_strides}; + + using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const AxesIndexerT axes_indexer{ind_nd, 0, + axes_shape_and_strides + (2 * k)}; + + using KernelName = + take_kernel; + + const 
std::size_t gws = orthog_nelems * ind_nelems; + + cgh.parallel_for( + sycl::range<1>(gws), + TakeFunctor( + src_p, dst_p, ind_p, k, ind_nelems, axes_shape_and_strides, + orthog_indexer, indices_indexer, axes_indexer)); + }); + + return take_ev; +} + +template +class PutFunctor +{ +private: + char *dst_ = nullptr; + const char *val_ = nullptr; + char **ind_ = nullptr; + int k_ = 0; + std::size_t ind_nelems_ = 0; + const ssize_t *axes_shape_and_strides_ = nullptr; + OrthogIndexer orthog_strider; + IndicesIndexer ind_strider; + AxesIndexer axes_strider; + +public: + PutFunctor(char *dst_cp, + const char *val_cp, + char **ind_cp, + int k, + std::size_t ind_nelems, + const ssize_t *axes_shape_and_strides, + const OrthogIndexer &orthog_strider_, + const IndicesIndexer &ind_strider_, + const AxesIndexer &axes_strider_) + : dst_(dst_cp), val_(val_cp), ind_(ind_cp), k_(k), + ind_nelems_(ind_nelems), + axes_shape_and_strides_(axes_shape_and_strides), + orthog_strider(orthog_strider_), ind_strider(ind_strider_), + axes_strider(axes_strider_) + { + } + + void operator()(sycl::id<1> id) const + { + T *dst = reinterpret_cast(dst_); + const T *val = reinterpret_cast(val_); + + ssize_t i_orthog = id / ind_nelems_; + ssize_t i_along = id - (i_orthog * ind_nelems_); + + auto orthog_offsets = orthog_strider(i_orthog); + + ssize_t dst_offset = orthog_offsets.get_first_offset(); + ssize_t val_offset = orthog_offsets.get_second_offset(); + + static constexpr ProjectorT proj{}; + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + + ssize_t ind_offset = ind_strider(i_along, axis_idx); + + // proj produces an index in the range of the given axis + ssize_t projected_idx = + proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]); + dst_offset += + projected_idx * axes_shape_and_strides_[k_ + axis_idx]; + } + + val_offset += axes_strider(i_along); + + dst[dst_offset] = val[val_offset]; + } +}; + +template +class put_kernel; + +typedef sycl::event (*put_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + int, + int, + int, + const ssize_t *, + const ssize_t *, + const ssize_t *, + char *, + const char *, + char **, + ssize_t, + ssize_t, + const ssize_t *, + const std::vector &); + +template +sycl::event put_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t ind_nelems, + int nd, + int ind_nd, + int k, + const ssize_t *orthog_shape_and_strides, + const ssize_t *axes_shape_and_strides, + const ssize_t *ind_shape_and_strides, + char *dst_p, + const char *val_p, + char **ind_p, + ssize_t dst_offset, + ssize_t val_offset, + const ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event put_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using OrthogIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + const OrthogIndexerT orthog_indexer{nd, dst_offset, val_offset, + orthog_shape_and_strides}; + + using NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset; + const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets, + ind_shape_and_strides}; + + using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const AxesIndexerT axes_indexer{ind_nd, 0, + axes_shape_and_strides + (2 * k)}; + + using KernelName = + put_kernel; + + const std::size_t gws = orthog_nelems * ind_nelems; + + cgh.parallel_for( + sycl::range<1>(gws), + PutFunctor( + dst_p, val_p, ind_p, k, ind_nelems, axes_shape_and_strides, + orthog_indexer, 
indices_indexer, axes_indexer)); + }); + + return put_ev; +} + +template +struct TakeWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::WrapIndex; + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct TakeClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::ClipIndex; + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct PutWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::WrapIndex; + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct PutClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::ClipIndex; + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +} // namespace indexing +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp new file mode 100644 index 000000000000..244acfe3955f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -0,0 +1,819 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines implementation functions of dpctl.tensor.take and +/// dpctl.tensor.put +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "kernels/integer_advanced_indexing.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "integer_advanced_indexing.hpp" + +#define INDEXING_MODES 2 +#define WRAP_MODE 0 +#define CLIP_MODE 1 + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::indexing::put_fn_ptr_t; +using dpctl::tensor::kernels::indexing::take_fn_ptr_t; + +static take_fn_ptr_t take_dispatch_table[INDEXING_MODES][td_ns::num_types] + [td_ns::num_types]; + +static put_fn_ptr_t put_dispatch_table[INDEXING_MODES][td_ns::num_types] + [td_ns::num_types]; + +namespace py = pybind11; + +using dpctl::utils::keep_args_alive; + +std::vector + _populate_kernel_params(sycl::queue &exec_q, + std::vector &host_task_events, + char **device_ind_ptrs, + py::ssize_t *device_ind_sh_st, + py::ssize_t *device_ind_offsets, + py::ssize_t *device_orthog_sh_st, + py::ssize_t *device_along_sh_st, + const py::ssize_t *inp_shape, + const py::ssize_t *arr_shape, + std::vector &inp_strides, + std::vector &arr_strides, + std::vector &ind_sh_sts, + std::vector &ind_ptrs, + std::vector &ind_offsets, + int axis_start, + int k, + int ind_nd, + int inp_nd, + int orthog_sh_elems, + int ind_sh_elems) +{ + + using usm_host_allocator_T = + dpctl::tensor::alloc_utils::usm_host_allocator; + using ptrT = std::vector; + + usm_host_allocator_T ptr_allocator(exec_q); + std::shared_ptr host_ind_ptrs_shp = + std::make_shared(k, ptr_allocator); + + using usm_host_allocatorT = + dpctl::tensor::alloc_utils::usm_host_allocator; + using shT = std::vector; + + usm_host_allocatorT sz_allocator(exec_q); + std::shared_ptr host_ind_sh_st_shp = + std::make_shared(ind_sh_elems * (k + 1), sz_allocator); + + std::shared_ptr host_ind_offsets_shp = + std::make_shared(k, sz_allocator); + + std::shared_ptr host_orthog_sh_st_shp = + std::make_shared(3 * orthog_sh_elems, sz_allocator); + + std::shared_ptr host_along_sh_st_shp = + std::make_shared(2 * (k + ind_sh_elems), sz_allocator); + + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_sh_st_shp->begin()); + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); + + const sycl::event &device_ind_ptrs_copy_ev = exec_q.copy( + host_ind_ptrs_shp->data(), device_ind_ptrs, host_ind_ptrs_shp->size()); + + const sycl::event &device_ind_sh_st_copy_ev = + exec_q.copy(host_ind_sh_st_shp->data(), device_ind_sh_st, + host_ind_sh_st_shp->size()); + + const sycl::event &device_ind_offsets_copy_ev = exec_q.copy( + host_ind_offsets_shp->data(), device_ind_offsets, + host_ind_offsets_shp->size()); + + int orthog_nd = inp_nd - k; + + if (orthog_nd > 0) { + if (axis_start > 0) { + std::copy(inp_shape, inp_shape + axis_start, + host_orthog_sh_st_shp->begin()); + 
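+            // host_orthog_sh_st_shp is packed as three consecutive blocks of length orthog_sh_elems: the orthogonal shape, the input array strides, and the strides of the output/values array.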
std::copy(inp_strides.begin(), inp_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + orthog_sh_elems); + std::copy(arr_strides.begin(), arr_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems); + } + if (inp_nd > (axis_start + k)) { + std::copy(inp_shape + axis_start + k, inp_shape + inp_nd, + host_orthog_sh_st_shp->begin() + axis_start); + std::copy(inp_strides.begin() + axis_start + k, inp_strides.end(), + host_orthog_sh_st_shp->begin() + orthog_sh_elems + + axis_start); + + std::copy(arr_strides.begin() + axis_start + ind_nd, + arr_strides.end(), + host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems + + axis_start); + } + } + + if (inp_nd > 0) { + std::copy(inp_shape + axis_start, inp_shape + axis_start + k, + host_along_sh_st_shp->begin()); + + std::copy(inp_strides.begin() + axis_start, + inp_strides.begin() + axis_start + k, + host_along_sh_st_shp->begin() + k); + } + + if (ind_nd > 0) { + std::copy(arr_shape + axis_start, arr_shape + axis_start + ind_nd, + host_along_sh_st_shp->begin() + 2 * k); + std::copy(arr_strides.begin() + axis_start, + arr_strides.begin() + axis_start + ind_nd, + host_along_sh_st_shp->begin() + 2 * k + ind_nd); + } + + const sycl::event &device_orthog_sh_st_copy_ev = exec_q.copy( + host_orthog_sh_st_shp->data(), device_orthog_sh_st, + host_orthog_sh_st_shp->size()); + + const sycl::event &device_along_sh_st_copy_ev = exec_q.copy( + host_along_sh_st_shp->data(), device_along_sh_st, + host_along_sh_st_shp->size()); + + const sycl::event &shared_ptr_cleanup_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on({device_along_sh_st_copy_ev, + device_orthog_sh_st_copy_ev, + device_ind_offsets_copy_ev, + device_ind_sh_st_copy_ev, device_ind_ptrs_copy_ev}); + cgh.host_task( + [host_ind_offsets_shp = std::move(host_ind_offsets_shp), + host_ind_sh_st_shp = std::move(host_ind_sh_st_shp), + host_ind_ptrs_shp = std::move(host_ind_ptrs_shp), + host_orthog_sh_st_shp = std::move(host_orthog_sh_st_shp), + host_along_sh_st_shp = std::move(host_along_sh_st_shp)] {}); + }); + host_task_events.push_back(shared_ptr_cleanup_ev); + + std::vector sh_st_pack_deps{ + device_ind_ptrs_copy_ev, device_ind_sh_st_copy_ev, + device_ind_offsets_copy_ev, device_orthog_sh_st_copy_ev, + device_along_sh_st_copy_ev}; + return sh_st_pack_deps; +} + +/* Utility to parse python object py_ind into vector of `usm_ndarray`s */ +std::vector parse_py_ind(const sycl::queue &q, + const py::object &py_ind) +{ + std::size_t ind_count = py::len(py_ind); + std::vector res; + res.reserve(ind_count); + + bool nd_is_known = false; + int nd = -1; + for (std::size_t i = 0; i < ind_count; ++i) { + py::object el_i = py_ind[py::cast(i)]; + dpctl::tensor::usm_ndarray arr_i = + py::cast(el_i); + if (!dpctl::utils::queues_are_compatible(q, {arr_i})) { + throw py::value_error("Index allocation queue is not compatible " + "with execution queue"); + } + if (nd_is_known) { + if (nd != arr_i.get_ndim()) { + throw py::value_error( + "Indices must have the same number of dimensions."); + } + } + else { + nd_is_known = true; + nd = arr_i.get_ndim(); + } + res.push_back(arr_i); + } + + return res; +} + +std::pair + usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, + const py::object &py_ind, + const dpctl::tensor::usm_ndarray &dst, + int axis_start, + std::uint8_t mode, + sycl::queue &exec_q, + const std::vector &depends) +{ + std::vector ind = parse_py_ind(exec_q, py_ind); + + int k = ind.size(); + + if (k == 0) { + throw py::value_error("List of indices is 
empty."); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; + + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = std::max(src_nd, 1); + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of range for array of dimension " + + std::to_string(src_nd)); + } + if (src_nd == 0) { + if (dst_nd != ind_nd) { + throw py::value_error( + "Destination is not of appropriate dimension for take kernel."); + } + } + else { + if (dst_nd != (src_nd - k + ind_nd)) { + throw py::value_error( + "Destination is not of appropriate dimension for take kernel."); + } + } + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool orthog_shapes_equal(true); + std::size_t orthog_nelems(1); + for (int i = 0; i < (src_nd - k); ++i) { + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? i : i + ind_nd; + + orthog_nelems *= static_cast(src_shape[idx1]); + orthog_shapes_equal = + orthog_shapes_equal && (src_shape[idx1] == dst_shape[idx2]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Array memory overlap."); + } + + py::ssize_t src_offset = py::ssize_t(0); + py::ssize_t dst_offset = py::ssize_t(0); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::type_error("Array data types are not the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + std::size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == dst_shape[axis_start + i])) { + throw py::value_error( + "Indices shape does not match shape of axis in destination."); + } + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, orthog_nelems * ind_nelems); + + int ind_sh_elems = std::max(ind_nd, 1); + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + + std::vector ind_offsets; + ind_offsets.reserve(k); + + std::vector ind_sh_sts((k + 1) * ind_sh_elems, 0); + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_nd, ind_sh_sts.begin()); + } + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() 
== ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + if (overlap(dst, ind_)) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + char *ind_data = ind_.get_data(); + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); + } + + ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + if (ind_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = packed_ind_ptrs_owner.get(); + + // rearrange to past where indices shapes are checked + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); + py::ssize_t *packed_ind_shapes_strides = + packed_ind_shapes_strides_owner.get(); + + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); + + int orthog_sh_elems = std::max(src_nd - k, 1); + + // packed_shapes_strides = [src_shape[:axis] + src_shape[axis+k:], + // src_strides[:axis] + src_strides[axis+k:], + // dst_strides[:axis] + + // dst_strides[axis+ind.ndim:]] + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); + + // packed_axes_shapes_strides = [src_shape[axis:axis+k], + // src_strides[axis:axis+k], + // dst_shape[axis:axis+ind.ndim], + // dst_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); + py::ssize_t *packed_axes_shapes_strides = + packed_axes_shapes_strides_owner.get(); + + auto src_strides = src.get_strides_vector(); + auto dst_strides = dst.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + src_shape, dst_shape, src_strides, dst_strides, ind_sh_sts, ind_ptrs, + ind_offsets, axis_start, k, ind_nd, src_nd, orthog_sh_elems, + ind_sh_elems); + + std::vector all_deps; + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + auto fn = take_dispatch_table[mode][src_type_id][ind_type_id]; + + if (fn == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + sycl::event 
take_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs, + src_offset, dst_offset, packed_ind_offsets, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {take_generic_ev}, packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {src, py_ind, dst}, host_task_events); + + return std::make_pair(arg_cleanup_ev, take_generic_ev); +} + +std::pair + usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, + const py::object &py_ind, + const dpctl::tensor::usm_ndarray &val, + int axis_start, + std::uint8_t mode, + sycl::queue &exec_q, + const std::vector &depends) +{ + std::vector ind = parse_py_ind(exec_q, py_ind); + int k = ind.size(); + + if (k == 0) { + // no indices to write to + throw py::value_error("List of indices is empty."); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; + + int dst_nd = dst.get_ndim(); + int val_nd = val.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = std::max(dst_nd, 1); + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of range for array of dimension " + + std::to_string(dst_nd)); + } + if (dst_nd == 0) { + if (val_nd != ind_nd) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + else { + if (val_nd != (dst_nd - k + ind_nd)) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + + std::size_t dst_nelems = dst.get_size(); + + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *val_shape = val.get_shape_raw(); + + bool orthog_shapes_equal(true); + std::size_t orthog_nelems(1); + for (int i = 0; i < (dst_nd - k); ++i) { + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? 
i : i + ind_nd; + + orthog_nelems *= static_cast(dst_shape[idx1]); + orthog_shapes_equal = + orthog_shapes_equal && (dst_shape[idx1] == val_shape[idx2]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + char *dst_data = dst.get_data(); + char *val_data = val.get_data(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst, val})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(val, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + py::ssize_t dst_offset = py::ssize_t(0); + py::ssize_t val_offset = py::ssize_t(0); + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + int dst_typenum = dst.get_typenum(); + int val_typenum = val.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + int val_type_id = array_types.typenum_to_lookup_id(val_typenum); + + if (dst_type_id != val_type_id) { + throw py::type_error("Array data types are not the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + std::size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == val_shape[axis_start + i])) { + throw py::value_error( + "Indices shapes does not match shape of axis in vals."); + } + } + + auto ind_sh_elems = std::max(ind_nd, 1); + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + std::vector ind_offsets; + ind_offsets.reserve(k); + std::vector ind_sh_sts((k + 1) * ind_sh_elems, py::ssize_t(0)); + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_sh_elems, ind_sh_sts.begin()); + } + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() == ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + if (overlap(ind_, dst)) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + char *ind_data = ind_.get_data(); + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); + } + + ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + if (ind_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = 
packed_ind_ptrs_owner.get(); + + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); + py::ssize_t *packed_ind_shapes_strides = + packed_ind_shapes_strides_owner.get(); + + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); + + int orthog_sh_elems = std::max(dst_nd - k, 1); + + // packed_shapes_strides = [dst_shape[:axis] + dst_shape[axis+k:], + // dst_strides[:axis] + dst_strides[axis+k:], + // val_strides[:axis] + + // val_strides[axis+ind.ndim:]] + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); + + // packed_axes_shapes_strides = [dst_shape[axis:axis+k], + // dst_strides[axis:axis+k], + // val_shape[axis:axis+ind.ndim], + // val_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); + py::ssize_t *packed_axes_shapes_strides = + packed_axes_shapes_strides_owner.get(); + + auto dst_strides = dst.get_strides_vector(); + auto val_strides = val.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + dst_shape, val_shape, dst_strides, val_strides, ind_sh_sts, ind_ptrs, + ind_offsets, axis_start, k, ind_nd, dst_nd, orthog_sh_elems, + ind_sh_elems); + + std::vector all_deps; + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + auto fn = put_dispatch_table[mode][dst_type_id][ind_type_id]; + + if (fn == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + sycl::event put_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, dst_data, val_data, packed_ind_ptrs, + dst_offset, val_offset, packed_ind_offsets, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {put_generic_ev}, packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {dst, py_ind, val}, host_task_events); + + return std::make_pair(arg_cleanup_ev, put_generic_ev); +} + +void init_advanced_indexing_dispatch_tables(void) +{ + using namespace td_ns; + + using dpctl::tensor::kernels::indexing::TakeClipFactory; + DispatchTableBuilder + dtb_takeclip; + dtb_takeclip.populate_dispatch_table(take_dispatch_table[CLIP_MODE]); + + using dpctl::tensor::kernels::indexing::TakeWrapFactory; + DispatchTableBuilder + dtb_takewrap; + dtb_takewrap.populate_dispatch_table(take_dispatch_table[WRAP_MODE]); + + using 
dpctl::tensor::kernels::indexing::PutClipFactory; + DispatchTableBuilder dtb_putclip; + dtb_putclip.populate_dispatch_table(put_dispatch_table[CLIP_MODE]); + + using dpctl::tensor::kernels::indexing::PutWrapFactory; + DispatchTableBuilder dtb_putwrap; + dtb_putwrap.populate_dispatch_table(put_dispatch_table[WRAP_MODE]); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp new file mode 100644 index 000000000000..57f0ddda132c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -0,0 +1,73 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
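For context, a minimal Python sketch of driving the `_take`/`_put` bindings registered below in tensor_ctors.cpp; the array contents, shapes, and queue here are illustrative assumptions, while the keyword names and the mode encoding (WRAP_MODE=0, CLIP_MODE=1) come from the sources above:

    # Illustrative only: exercises the low-level bindings once dpctl_ext is built.
    import dpctl
    import dpctl.tensor as dpt
    import dpctl_ext.tensor._tensor_impl as ti_ext

    q = dpctl.SyclQueue()
    src = dpt.arange(10, dtype="i4", sycl_queue=q)
    ind = dpt.asarray([0, 3, 7], dtype="i8", sycl_queue=q)
    dst = dpt.empty(3, dtype="i4", sycl_queue=q)

    # mode=0 -> WRAP_MODE, mode=1 -> CLIP_MODE; indices are passed as a tuple
    # of usm_ndarrays, matching what parse_py_ind() above expects.
    hev, take_ev = ti_ext._take(
        src=src, ind=(ind,), dst=dst, axis_start=0, mode=0, sycl_queue=q
    )
    hev.wait()  # host event keeps arguments and temporaries alive until kernels finish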
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares Python API for implementation functions of +/// dpctl.tensor.take and dpctl.tensor.put +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_take(const dpctl::tensor::usm_ndarray &, + const py::object &, + const dpctl::tensor::usm_ndarray &, + int, + std::uint8_t, + sycl::queue &, + const std::vector & = {}); + +extern std::pair + usm_ndarray_put(const dpctl::tensor::usm_ndarray &, + const py::object &, + const dpctl::tensor::usm_ndarray &, + int, + std::uint8_t, + sycl::queue &, + const std::vector & = {}); + +extern void init_advanced_indexing_dispatch_tables(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 911d75ebd925..c18761031fd0 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -55,7 +55,7 @@ #include "device_support_queries.hpp" // #include "eye_ctor.hpp" // #include "full_ctor.hpp" -// #include "integer_advanced_indexing.hpp" +#include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" // #include "linear_sequences.hpp" // #include "repeat.hpp" @@ -110,8 +110,8 @@ using dpctl::tensor::py_internal::py_as_f_contig; // using dpctl::tensor::py_internal::usm_ndarray_zeros; /* ============== Advanced Indexing ============= */ -// using dpctl::tensor::py_internal::usm_ndarray_put; -// using dpctl::tensor::py_internal::usm_ndarray_take; +using dpctl::tensor::py_internal::usm_ndarray_put; +using dpctl::tensor::py_internal::usm_ndarray_take; // using dpctl::tensor::py_internal::py_extract; // using dpctl::tensor::py_internal::py_mask_positions; @@ -145,7 +145,7 @@ void init_dispatch_tables(void) init_copy_and_cast_usm_to_usm_dispatch_tables(); // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); - // init_advanced_indexing_dispatch_tables(); + init_advanced_indexing_dispatch_tables(); // init_where_dispatch_tables(); return; } @@ -332,23 +332,23 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), // py::arg("depends") = py::list()); - // m.def("_take", &usm_ndarray_take, - // "Takes elements at usm_ndarray indices `ind` and axes starting " - // "at axis `axis_start` from array `src` and copies them " - // "into usm_ndarray `dst` synchronously." - // "Returns a tuple of events: (hev, ev)", - // py::arg("src"), py::arg("ind"), py::arg("dst"), - // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); - - // m.def("_put", &usm_ndarray_put, - // "Puts elements at usm_ndarray indices `ind` and axes starting " - // "at axis `axis_start` into array `dst` from " - // "usm_ndarray `val` synchronously." 
- // "Returns a tuple of events: (hev, ev)", - // py::arg("dst"), py::arg("ind"), py::arg("val"), - // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_take", &usm_ndarray_take, + "Takes elements at usm_ndarray indices `ind` and axes starting " + "at axis `axis_start` from array `src` and copies them " + "into usm_ndarray `dst` synchronously." + "Returns a tuple of events: (hev, ev)", + py::arg("src"), py::arg("ind"), py::arg("dst"), py::arg("axis_start"), + py::arg("mode"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_put", &usm_ndarray_put, + "Puts elements at usm_ndarray indices `ind` and axes starting " + "at axis `axis_start` into array `dst` from " + "usm_ndarray `val` synchronously." + "Returns a tuple of events: (hev, ev)", + py::arg("dst"), py::arg("ind"), py::arg("val"), py::arg("axis_start"), + py::arg("mode"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("_eye", &usm_ndarray_eye, // "Fills input 2D contiguous usm_ndarray `dst` with " From 87e5482f2faf3bff2549b48c999bbab516fce168 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 09:59:18 -0800 Subject: [PATCH 14/38] Use put/take from dpctl_ext.tensor in dpnp --- dpnp/dpnp_iface_indexing.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 6e7ab778299b..6421f39fd4e4 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -52,6 +52,8 @@ from dpctl.tensor._indexing_functions import _get_indexing_mode from dpctl.tensor._numpy_helper import normalize_axis_index +import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._tensor_impl as ti_ext import dpnp # pylint: disable=no-name-in-module @@ -295,7 +297,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): "Input and output allocation queues are not compatible" ) - if ti._array_overlap(x, out): + if ti_ext._array_overlap(x, out): # Allocate a temporary buffer to avoid memory overlapping. 
out = dpt.empty_like(out) else: @@ -304,7 +306,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): _manager = dpu.SequentialOrderManager[q] dep_evs = _manager.submitted_events - h_ev, take_ev = ti._take( + h_ev, take_ev = ti_ext._take( src=x, ind=(inds,), dst=out, @@ -813,7 +815,7 @@ def extract(condition, a): usm_a = dpt.reshape(usm_a, -1) usm_cond = dpt.reshape(usm_cond, -1) - usm_res = dpt.take(usm_a, dpt.nonzero(usm_cond)[0]) + usm_res = dpt_ext.take(usm_a, dpt.nonzero(usm_cond)[0]) else: if usm_cond.shape != usm_a.shape: usm_a = dpt.reshape(usm_a, -1) @@ -1713,7 +1715,7 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"): if axis is None and usm_a.ndim > 1: usm_a = dpt.reshape(usm_a, -1) - dpt.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode) + dpt_ext.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode) if in_usm_a._pointer != usm_a._pointer: # pylint: disable=protected-access in_usm_a[:] = dpt.reshape(usm_a, in_usm_a.shape, copy=False) From b537f30115be31858782e6a7ace1fc52f54c5f9d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 10:33:51 -0800 Subject: [PATCH 15/38] Move full() to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_ctors.py | 169 ++++++++++ .../include/kernels/constructors.hpp | 171 ++++++++++ .../tensor/libtensor/source/full_ctor.cpp | 315 ++++++++++++++++++ .../tensor/libtensor/source/full_ctor.hpp | 60 ++++ .../tensor/libtensor/source/tensor_ctors.cpp | 14 +- 7 files changed, 727 insertions(+), 8 deletions(-) create mode 100644 dpctl_ext/tensor/_ctors.py create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/full_ctor.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/full_ctor.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ae8b72d71873..0c52d766afbf 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -52,7 +52,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 35453dbf9a46..9f4c27608a99 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -27,12 +27,16 @@ # ***************************************************************************** +from dpctl_ext.tensor._ctors import ( + full, +) from dpctl_ext.tensor._indexing_functions import ( put, take, ) __all__ = [ + "full", "put", "take", ] diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py new file mode 100644 index 000000000000..5caa07099c56 --- /dev/null +++ b/dpctl_ext/tensor/_ctors.py @@ -0,0 +1,169 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from numbers import Number + +import dpctl +import dpctl.tensor as dpt +import dpctl.utils +import numpy as np +from dpctl.tensor._data_types import _get_dtype +from dpctl.tensor._device import normalize_queue_device + +import dpctl_ext.tensor._tensor_impl as ti + + +def _cast_fill_val(fill_val, dt): + """ + Casts the Python scalar `fill_val` to another Python type coercible to the + requested data type `dt`, if necessary. + """ + val_type = type(fill_val) + if val_type in [float, complex] and np.issubdtype(dt, np.integer): + return int(fill_val.real) + elif val_type is complex and np.issubdtype(dt, np.floating): + return fill_val.real + elif val_type is int and np.issubdtype(dt, np.integer): + return _to_scalar(fill_val, dt) + else: + return fill_val + + +def _to_scalar(obj, sc_ty): + """A way to convert object to NumPy scalar type. + Raises OverflowError if obj can not be represented + using the requested scalar type. + """ + zd_arr = np.asarray(obj, dtype=sc_ty) + return zd_arr[()] + + +def _validate_fill_value(fill_val): + """Validates that `fill_val` is a numeric or boolean scalar.""" + # TODO: verify if `np.True_` and `np.False_` should be instances of + # Number in NumPy, like other NumPy scalars and like Python bools + # check for `np.bool_` separately as NumPy<2 has no `np.bool` + if not isinstance(fill_val, Number) and not isinstance(fill_val, np.bool_): + raise TypeError( + f"array cannot be filled with scalar of type {type(fill_val)}" + ) + + +def full( + shape, + fill_value, + *, + dtype=None, + order="C", + device=None, + usm_type=None, + sycl_queue=None, +): + """ + Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified + shape and filled with `fill_value`. + + Args: + shape (tuple): + Dimensions of the array to be created. + fill_value (int,float,complex,usm_ndarray): + fill value + dtype (optional): data type of the array. 
Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: ``None`` + order ("C", or "F"): + memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + New array initialized with given value. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + dpctl.utils.validate_usm_type(usm_type, allow_none=True) + + if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)): + if ( + isinstance(fill_value, dpt.usm_ndarray) + and sycl_queue is None + and device is None + ): + sycl_queue = fill_value.sycl_queue + else: + sycl_queue = normalize_queue_device( + sycl_queue=sycl_queue, device=device + ) + X = dpt.asarray( + fill_value, + dtype=dtype, + order=order, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + return dpt.copy(dpt.broadcast_to(X, shape), order=order) + else: + _validate_fill_value(fill_value) + + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + usm_type = usm_type if usm_type is not None else "device" + dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value)) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + fill_value = _cast_fill_val(fill_value, dtype) + + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp new file mode 100644 index 000000000000..dfd1b889aafe --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -0,0 +1,171 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
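A short usage sketch of the `full` constructor added in dpctl_ext/tensor/_ctors.py; the shapes and fill values are illustrative, and the keyword arguments follow the docstring above:

    # Illustrative only.
    import dpctl
    import dpctl_ext.tensor as dpt_ext

    q = dpctl.SyclQueue()

    # Scalar fill value: allocates the result and submits _full_usm_ndarray.
    a = dpt_ext.full((2, 3), 7, dtype="i4", sycl_queue=q)

    # Sequence fill value: takes the asarray + broadcast_to + copy path instead.
    b = dpt_ext.full((2, 3), [1, 2, 3], sycl_queue=q)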
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor constructors. +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/offset_utils.hpp" +#include "utils/strided_iters.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace constructors +{ + +using dpctl::tensor::ssize_t; + +/*! + @defgroup CtorKernels + */ + +template +class full_strided_kernel; + +using namespace dpctl::tensor::offset_utils; + +/* ================ Full ================== */ + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param fill_v Value to fill the array with + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue &q, + std::size_t nelems, + dstTy fill_v, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + dstTy *p = reinterpret_cast(dst_p); + cgh.fill(p, fill_v, nelems); + }); + + return fill_ev; +} + +template +class FullStridedFunctor +{ +private: + Ty *p = nullptr; + Ty fill_v; + IndexerT indexer; + +public: + FullStridedFunctor(Ty *p_, const Ty &fill_v_, const IndexerT &indexer_) + : p(p_), fill_v(fill_v_), indexer(indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + auto offset = indexer(id.get(0)); + p[offset] = fill_v; + } +}; + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. 
+ * @param nd Array dimensionality + * @param nelems Length of the sequence + * @param shape_strides Kernel accessible USM pointer to packed shape and + * strides of array. + * @param fill_v Value to fill the array with + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_strided_impl(sycl::queue &q, + int nd, + std::size_t nelems, + const ssize_t *shape_strides, + dstTy fill_v, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + dstTy *dst_tp = reinterpret_cast(dst_p); + + using dpctl::tensor::offset_utils::StridedIndexer; + const StridedIndexer strided_indexer(nd, 0, shape_strides); + + sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = full_strided_kernel; + using Impl = FullStridedFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(dst_tp, fill_v, strided_indexer)); + }); + + return fill_ev; +} + +} // namespace constructors +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp new file mode 100644 index 000000000000..e1f61be4a12a --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -0,0 +1,315 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
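The strided path above writes `fill_v` through a `StridedIndexer`, which maps each flat work-item id to a memory offset computed from the packed shape and strides. A plain-Python approximation of that mapping (illustrative only; the real indexer lives in utils/offset_utils.hpp):

    # Illustrative only: how a flat id maps to a strided element offset.
    def strided_offset(flat_id, shape, strides):
        # Unravel flat_id over `shape` in C order and dot with `strides`
        # (strides expressed in elements, as in the packed shape_strides).
        offset = 0
        for extent, stride in zip(reversed(shape), reversed(strides)):
            offset += (flat_id % extent) * stride
            flat_id //= extent
        return offset

    # Example: a 2x3 F-contiguous array has element strides (1, 2).
    offsets = [strided_offset(i, (2, 3), (1, 2)) for i in range(6)]
    assert offsets == [0, 2, 4, 1, 3, 5]  # each element is visited exactly once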
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "full_ctor.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue &, + std::size_t, + const py::object &, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v = py::cast(py_value); + + sycl::event fill_ev; + + if constexpr (sizeof(dstTy) == sizeof(char)) { + const auto memset_val = sycl::bit_cast(fill_v); + fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + } + else { + bool is_zero = false; + if constexpr (sizeof(dstTy) == 1) { + is_zero = (std::uint8_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 2) { + is_zero = + (std::uint16_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 4) { + is_zero = + (std::uint32_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 8) { + is_zero = + (std::uint64_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 16) { + struct UInt128 + { + + constexpr UInt128() : v1{}, v2{} {} + UInt128(const UInt128 &) = default; + + operator bool() const + { + return bool(!v1) && bool(!v2); + } + + std::uint64_t v1; + std::uint64_t v2; + }; + is_zero = static_cast(sycl::bit_cast(fill_v)); + } + + if (is_zero) { + static constexpr int memset_val = 0; + fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + } + else { + using dpctl::tensor::kernels::constructors::full_contig_impl; + + fill_ev = + full_contig_impl(exec_q, nelems, fill_v, dst_p, depends); + } + } + + return fill_ev; +} + +template +struct FullContigFactory +{ + fnT get() + { + fnT f = full_contig_impl; + return f; + } +}; + +typedef sycl::event (*full_strided_fn_ptr_t)(sycl::queue &, + int, + std::size_t, + py::ssize_t *, + const py::object &, + char *, + const std::vector &); + +/*! 
+ * @brief Function to submit kernel to fill given strided memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nd Array dimensionality + * @param nelems Length of the sequence + * @param shape_strides Kernel accessible USM pointer to packed shape and + * strides of array. + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_strided_impl(sycl::queue &exec_q, + int nd, + std::size_t nelems, + py::ssize_t *shape_strides, + const py::object &py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v = py::cast(py_value); + + using dpctl::tensor::kernels::constructors::full_strided_impl; + sycl::event fill_ev = full_strided_impl( + exec_q, nd, nelems, shape_strides, fill_v, dst_p, depends); + + return fill_ev; +} + +template +struct FullStridedFactory +{ + fnT get() + { + fnT f = full_strided_impl; + return f; + } +}; + +static full_contig_fn_ptr_t full_contig_dispatch_vector[td_ns::num_types]; +static full_strided_fn_ptr_t full_strided_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_full(const py::object &py_value, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // py_value should be coercible into data type of dst + + py::ssize_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + char *dst_data = dst.get_data(); + + if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) { + auto fn = full_contig_dispatch_vector[dst_typeid]; + + sycl::event full_contig_event = + fn(exec_q, static_cast(dst_nelems), py_value, dst_data, + depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {full_contig_event}), + full_contig_event); + } + else { + int nd = dst.get_ndim(); + auto const &dst_shape = dst.get_shape_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + auto fn = full_strided_dispatch_vector[dst_typeid]; + + std::vector host_task_events; + host_task_events.reserve(2); + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, dst_shape, dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + py::ssize_t *shape_strides = shape_strides_owner.get(); + + const sycl::event &full_strided_ev = + fn(exec_q, nd, dst_nelems, shape_strides, py_value, dst_data, + {copy_shape_ev}); + + // free shape_strides + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {full_strided_ev}, shape_strides_owner); + 
host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {dst}, host_task_events), + full_strided_ev); + } +} + +void init_full_ctor_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(full_contig_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(full_strided_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp new file mode 100644 index 000000000000..d664b2013506 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp @@ -0,0 +1,60 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
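The contiguous fill in full_ctor.cpp above short-circuits to a memset whenever the bit pattern of the cast fill value is all zeros (the `is_zero` checks built on `sycl::bit_cast`). A small NumPy illustration of which values qualify (illustrative only):

    # Illustrative only: the all-zero-bit-pattern test behind the memset fast path.
    import numpy as np

    def has_all_zero_bits(value, dtype):
        return not any(np.asarray(value, dtype=dtype).tobytes())

    print(has_all_zero_bits(0.0, np.float64))    # True  -> memset(0)
    print(has_all_zero_bits(-0.0, np.float64))   # False -> sign bit set, fill kernel
    print(has_all_zero_bits(0, np.int32))        # True  -> memset(0)
    print(has_all_zero_bits(0j, np.complex128))  # True  -> memset(0)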
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_full(const py::object &py_value, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_full_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index c18761031fd0..c72c0b49622a 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -54,7 +54,7 @@ // #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" // #include "eye_ctor.hpp" -// #include "full_ctor.hpp" +#include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" // #include "linear_sequences.hpp" @@ -103,7 +103,7 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* ================ Full ================== */ -// using dpctl::tensor::py_internal::usm_ndarray_full; +using dpctl::tensor::py_internal::usm_ndarray_full; /* ================ Zeros ================== */ @@ -159,7 +159,7 @@ void init_dispatch_vectors(void) // init_copy_for_reshape_dispatch_vectors(); // init_copy_for_roll_dispatch_vectors(); // init_linear_sequences_dispatch_vectors(); - // init_full_ctor_dispatch_vectors(); + init_full_ctor_dispatch_vectors(); // init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); // init_triul_ctor_dispatch_vectors(); @@ -327,10 +327,10 @@ PYBIND11_MODULE(_tensor_impl, m) // "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), // py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_full_usm_ndarray", &usm_ndarray_full, - // "Populate usm_ndarray `dst` with given fill_value.", - // py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_full_usm_ndarray", &usm_ndarray_full, + "Populate usm_ndarray `dst` with given fill_value.", + py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); m.def("_take", &usm_ndarray_take, "Takes elements at usm_ndarray indices `ind` and axes starting " From d50f263f089dfd52edb4daa15edd3f86807965e5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:06:00 -0800 Subject: [PATCH 16/38] Use full and _full_usm_ndarray from dpctl_ext in dpnp --- dpnp/dpnp_algo/dpnp_fill.py | 6 ++++-- dpnp/dpnp_container.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index 112ea3af0fdb..f7e6f0f608b1 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -32,12 +32,14 @@ import dpctl.utils as dpu from dpctl.tensor._ctors import _cast_fill_val from dpctl.tensor._tensor_impl import ( - _copy_usm_ndarray_into_usm_ndarray, - _full_usm_ndarray, _zeros_usm_ndarray, ) import dpnp +from dpctl_ext.tensor._tensor_impl import ( + _copy_usm_ndarray_into_usm_ndarray, + 
_full_usm_ndarray, +) def dpnp_fill(arr, val): diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index 4975db17c717..b13bf96cda28 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -38,6 +38,7 @@ import dpctl.tensor as dpt import dpctl.utils as dpu +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_array import dpnp_array @@ -228,7 +229,7 @@ def full( fill_value = fill_value.get_array() """Creates `dpnp_array` having a specified shape, filled with fill_value.""" - array_obj = dpt.full( + array_obj = dpt_ext.full( shape, fill_value, dtype=dtype, From f189dc540477ceadf35dcb127325056c5e0c406b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:22:55 -0800 Subject: [PATCH 17/38] Update .gitignore to ignore .so files in dpctl_ext --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 5d2725d3186f..4ae07ccbbdb9 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,5 @@ dpnp/**/*.cpython*.so dpnp/**/*.pyd *~ core + +dpctl_ext/**/*.cpython*.so From f9a181721784c843907c16e2e1d5569c487cf9e3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:23:51 -0800 Subject: [PATCH 18/38] Move _zeros_usm_ndarray to dpctl_ext --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../tensor/libtensor/source/tensor_ctors.cpp | 12 +- .../tensor/libtensor/source/zeros_ctor.cpp | 168 ++++++++++++++++++ .../tensor/libtensor/source/zeros_ctor.hpp | 59 ++++++ 4 files changed, 234 insertions(+), 7 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 0c52d766afbf..cb468b9a226d 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -53,7 +53,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index c72c0b49622a..b55439162f90 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -64,7 +64,7 @@ #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" // #include "where.hpp" -// #include "zeros_ctor.hpp" +#include "zeros_ctor.hpp" namespace py = pybind11; @@ -107,7 +107,7 @@ using dpctl::tensor::py_internal::usm_ndarray_full; /* ================ Zeros ================== */ -// using dpctl::tensor::py_internal::usm_ndarray_zeros; +using dpctl::tensor::py_internal::usm_ndarray_zeros; /* ============== Advanced Indexing ============= */ using dpctl::tensor::py_internal::usm_ndarray_put; @@ -160,7 +160,7 @@ void init_dispatch_vectors(void) // init_copy_for_roll_dispatch_vectors(); // init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); - // init_zeros_ctor_dispatch_vectors(); + init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); // init_triul_ctor_dispatch_vectors(); @@ 
-323,9 +323,9 @@ PYBIND11_MODULE(_tensor_impl, m) // synchronously.", py::arg("src"), py::arg("dst"), // py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, - // "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, + "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); m.def("_full_usm_ndarray", &usm_ndarray_full, "Populate usm_ndarray `dst` with given fill_value.", diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp new file mode 100644 index 000000000000..4558743b3c22 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -0,0 +1,168 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "zeros_ctor.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +typedef sycl::event (*zeros_contig_fn_ptr_t)(sycl::queue &, + std::size_t, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with zeros. 
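+ * The fill is implemented as a single handler::memset submission over the
+ * destination allocation, which is why only contiguous layouts are handled
+ * here.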
+ * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event zeros_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + char *dst_p, + const std::vector &depends) +{ + + static constexpr int memset_val(0); + sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + + return fill_ev; +} + +template +struct ZerosContigFactory +{ + fnT get() + { + fnT f = zeros_contig_impl; + return f; + } +}; + +static zeros_contig_fn_ptr_t zeros_contig_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + py::ssize_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + char *dst_data = dst.get_data(); + + if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) { + auto fn = zeros_contig_dispatch_vector[dst_typeid]; + + sycl::event zeros_contig_event = + fn(exec_q, static_cast(dst_nelems), dst_data, depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {zeros_contig_event}), + zeros_contig_event); + } + else { + throw std::runtime_error( + "Only population of contiguous usm_ndarray objects is supported."); + } +} + +void init_zeros_ctor_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(zeros_contig_dispatch_vector); + + return; +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp new file mode 100644 index 000000000000..51270a3443cc --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp @@ -0,0 +1,59 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_zeros_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From 4b8505acf111ec2636afa0d2a9a25cf8677e02c7 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:25:05 -0800 Subject: [PATCH 19/38] Use _zeros_usm_ndarray from dpctl_ext in dpnp_fill.py --- dpnp/dpnp_algo/dpnp_fill.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index f7e6f0f608b1..0d6640c3b8b5 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -31,14 +31,12 @@ import dpctl.tensor as dpt import dpctl.utils as dpu from dpctl.tensor._ctors import _cast_fill_val -from dpctl.tensor._tensor_impl import ( - _zeros_usm_ndarray, -) import dpnp from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, _full_usm_ndarray, + _zeros_usm_ndarray, ) From 61106b2e208d7f331bebc3335a49bc23212510c1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:39:35 -0800 Subject: [PATCH 20/38] Move linear-sequence implementations to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/constructors.hpp | 178 ++++++++++ .../libtensor/source/linear_sequences.cpp | 312 ++++++++++++++++++ .../libtensor/source/linear_sequences.hpp | 69 ++++ .../tensor/libtensor/source/tensor_ctors.cpp | 38 +-- 5 files changed, 579 insertions(+), 20 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index cb468b9a226d..af0e2a7aa49f 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -48,7 +48,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp - # 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index dfd1b889aafe..20775b071ea8 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -58,11 +58,189 @@ using dpctl::tensor::ssize_t; @defgroup CtorKernels */ +template +class linear_sequence_step_kernel; +template +class linear_sequence_affine_kernel; template class full_strided_kernel; +// template class eye_kernel; using namespace dpctl::tensor::offset_utils; +template +class LinearSequenceStepFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty step_v; + +public: + LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) + : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + p[i] = Ty{start_v.real() + i * step_v.real(), + start_v.imag() + i * step_v.imag()}; + } + else { + p[i] = start_v + i * step_v; + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting value and + * increment. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start_v Typed starting value of the sequence + * @param step_v Typed increment of the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty step_v, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>{nelems}, + LinearSequenceStepFunctor(array_data, start_v, step_v)); + }); + + return lin_space_step_event; +} + +// Constructor to populate tensor with linear sequence defined by +// start and and data + +template +class LinearSequenceAffineFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty end_v; + std::size_t n; + +public: + LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, std::size_t den) + : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), + n((den == 0) ? 
1 : den) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + wTy wc = wTy(i) / n; + wTy w = wTy(n - i) / n; + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using reT = typename Ty::value_type; + auto _w = static_cast(w); + auto _wc = static_cast(wc); + auto re_comb = sycl::fma(start_v.real(), _w, reT(0)); + re_comb = + sycl::fma(end_v.real(), _wc, + re_comb); // start_v.real() * _w + end_v.real() * _wc; + auto im_comb = + sycl::fma(start_v.imag(), _w, + reT(0)); // start_v.imag() * _w + end_v.imag() * _wc; + im_comb = sycl::fma(end_v.imag(), _wc, im_comb); + Ty affine_comb = Ty{re_comb, im_comb}; + p[i] = affine_comb; + } + else if constexpr (std::is_floating_point::value) { + Ty _w = static_cast(w); + Ty _wc = static_cast(wc); + auto affine_comb = + sycl::fma(start_v, _w, Ty(0)); // start_v * w + end_v * wc; + affine_comb = sycl::fma(end_v, _wc, affine_comb); + p[i] = affine_comb; + } + else { + using dpctl::tensor::type_utils::convert_impl; + auto affine_comb = start_v * w + end_v * wc; + p[i] = convert_impl(affine_comb); + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting and end values. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence. + * @param start_v Stating value of the sequence. + * @param end_v End-value of the sequence. + * @param include_endpoint Whether the end-value is included in the sequence. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty end_v, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const bool device_supports_doubles = + exec_q.get_device().has(sycl::aspect::fp64); + const std::size_t den = (include_endpoint) ? nelems - 1 : nelems; + + sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + if (device_supports_doubles) { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + else { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + }); + + return lin_space_affine_event; +} + /* ================ Full ================== */ /*! diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp new file mode 100644 index 000000000000..02c4a8ad0fa1 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp @@ -0,0 +1,312 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpnp4pybind11.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "linear_sequences.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +// Constructor to populate tensor with linear sequence defined by +// start and step data + +typedef sycl::event (*lin_space_step_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &step, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting value and increment + * given as Python objects. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start Starting value of the sequence as Python object. Must be + * convertible to array element data type `Ty`. + * @param step Increment of the sequence as Python object. Must be convertible + * to array element data type `Ty`. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
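+ * The Python objects are converted to the element type `Ty` with py::cast
+ * before the typed kernel from kernels/constructors.hpp is submitted.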
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &step, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty step_v = py::cast(step); + + using dpctl::tensor::kernels::constructors::lin_space_step_impl; + + auto lin_space_step_event = lin_space_step_impl( + exec_q, nelems, start_v, step_v, array_data, depends); + + return lin_space_step_event; +} + +typedef sycl::event (*lin_space_affine_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &end, + bool include_endpoint, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting and end values given + * as Python objects. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param start Stating value of the sequence as Python object. Must be + * convertible to array data element type `Ty`. + * @param end End-value of the sequence as Python object. Must be convertible + * to array data element type `Ty`. + * @param include_endpoint Whether the end-value is included in the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &end, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty end_v = py::cast(end); + + using dpctl::tensor::kernels::constructors::lin_space_affine_impl; + + auto lin_space_affine_event = lin_space_affine_impl( + exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); + + return lin_space_affine_event; +} + +using dpctl::utils::keep_args_alive; + +static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types]; + +static lin_space_affine_fn_ptr_t + lin_space_affine_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_linear_sequence_step(const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_step_event; + + auto fn = lin_space_step_dispatch_vector[dst_typeid]; 
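+    // fn is the implementation registered for the destination type id; it
+    // casts `start` and `dt` to the element type and submits the fill kernel.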
+ + linspace_step_event = + fn(exec_q, static_cast(len), start, dt, dst_data, depends); + + return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), + linspace_step_event); +} + +std::pair + usm_ndarray_linear_sequence_affine(const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue context is not the same as allocation context"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_affine_event; + + auto fn = lin_space_affine_dispatch_vector[dst_typeid]; + + linspace_affine_event = fn(exec_q, static_cast(len), start, + end, include_endpoint, dst_data, depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {linspace_affine_event}), + linspace_affine_event); +} + +/*! + * @brief Factor to get function pointer of type `fnT` for array with elements + * of type `Ty`. + * @defgroup CtorKernels + */ +template +struct LinSpaceStepFactory +{ + fnT get() + { + fnT f = lin_space_step_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for array data type + * `Ty`. + */ +template +struct LinSpaceAffineFactory +{ + fnT get() + { + fnT f = lin_space_affine_impl; + return f; + } +}; + +void init_linear_sequences_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp new file mode 100644 index 000000000000..321cd2f23efe --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp @@ -0,0 +1,69 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair usm_ndarray_linear_sequence_step( + const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern std::pair usm_ndarray_linear_sequence_affine( + const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_linear_sequences_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index b55439162f90..dd660c497f9a 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -57,7 +57,7 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" -// #include "linear_sequences.hpp" +#include "linear_sequences.hpp" // #include "repeat.hpp" #include "simplify_iteration_space.hpp" // #include "triul_ctor.hpp" @@ -98,8 +98,8 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* ============= linear-sequence ==================== */ -// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; -// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; /* ================ Full ================== */ @@ -158,7 +158,7 @@ void init_dispatch_vectors(void) init_copy_as_contig_dispatch_vectors(); // init_copy_for_reshape_dispatch_vectors(); // init_copy_for_roll_dispatch_vectors(); - // init_linear_sequences_dispatch_vectors(); + init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); @@ -300,22 +300,22 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("shifts"), 
py::arg("sycl_queue"), py::arg("depends") = // py::list()); - // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, - // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - // "specified by " - // "starting point `start` and step `dt`. " - // "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("start"), py::arg("dt"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_linspace_step", &usm_ndarray_linear_sequence_step, + "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + "specified by " + "starting point `start` and step `dt`. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("start"), py::arg("dt"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, - // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - // "specified by " - // "starting point `start` and end point `end`. " - // "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("start"), py::arg("end"), py::arg("dst"), - // py::arg("include_endpoint"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, + "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + "specified by " + "starting point `start` and end point `end`. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("start"), py::arg("end"), py::arg("dst"), + py::arg("include_endpoint"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("_copy_numpy_ndarray_into_usm_ndarray", // ©_numpy_ndarray_into_usm_ndarray, From a030579be8525d6f23674d5c9a4a171ab842f500 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:40:33 -0800 Subject: [PATCH 21/38] Use _tensor_impl from dpctl_ext in dpnp_utils_fft.py --- dpnp/fft/dpnp_utils_fft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index 4e2b7aaaf842..c692774a424f 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -42,7 +42,6 @@ from collections.abc import Sequence import dpctl -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( @@ -51,6 +50,7 @@ ) from dpctl.utils import ExecutionPlacementError +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.fft._fft_impl as fi From a1d6fa39ba8607b191177d6acb0ca2f3cf8f49fc Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 03:03:08 -0800 Subject: [PATCH 22/38] Move tril()/triu() to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_ctors.py | 157 +++++++++++ .../include/kernels/constructors.hpp | 138 ++++++++++ .../tensor/libtensor/source/tensor_ctors.cpp | 46 ++-- .../tensor/libtensor/source/triul_ctor.cpp | 253 ++++++++++++++++++ .../tensor/libtensor/source/triul_ctor.hpp | 62 +++++ 7 files changed, 638 insertions(+), 24 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/triul_ctor.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/triul_ctor.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index af0e2a7aa49f..1375c8316754 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -54,7 +54,7 @@ set(_tensor_impl_sources # 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 9f4c27608a99..3c6939eff7a0 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -29,6 +29,8 @@ from dpctl_ext.tensor._ctors import ( full, + tril, + triu, ) from dpctl_ext.tensor._indexing_functions import ( put, @@ -39,4 +41,6 @@ "full", "put", "take", + "tril", + "triu", ] diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 5caa07099c56..a0e7b28e66ff 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -26,6 +26,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** +import operator from numbers import Number import dpctl @@ -167,3 +168,159 @@ def full( hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) _manager.add_event_pair(hev, full_ev) return res + + +def tril(x, /, *, k=0): + """ + Returns the lower triangular part of a matrix (or a stack of matrices) + ``x``. + + The lower triangular part of the matrix is defined as the elements on and + below the specified diagonal ``k``. + + Args: + x (usm_ndarray): + Input array + k (int, optional): + Specifies the diagonal above which to set + elements to zero. If ``k = 0``, the diagonal is the main diagonal. + If ``k < 0``, the diagonal is below the main diagonal. + If ``k > 0``, the diagonal is above the main diagonal. + Default: ``0`` + + Returns: + usm_ndarray: + A lower-triangular array or a stack of lower-triangular arrays. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected argument of type dpctl.tensor.usm_ndarray, " + f"got {type(x)}." + ) + + k = operator.index(k) + + order = "F" if (x.flags.f_contiguous) else "C" + + shape = x.shape + nd = x.ndim + if nd < 2: + raise ValueError("Array dimensions less than 2.") + + q = x.sycl_queue + if k >= shape[nd - 1] - 1: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + elif k < -shape[nd - 2]: + res = dpt.zeros( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + else: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, tril_ev = ti._tril( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, tril_ev) + + return res + + +def triu(x, /, *, k=0): + """ + Returns the upper triangular part of a matrix (or a stack of matrices) + ``x``. + + The upper triangular part of the matrix is defined as the elements on and + above the specified diagonal ``k``. 
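+    Elements strictly below this diagonal are set to zero in the result.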
+ + Args: + x (usm_ndarray): + Input array + k (int, optional): + Specifies the diagonal below which to set + elements to zero. If ``k = 0``, the diagonal is the main diagonal. + If ``k < 0``, the diagonal is below the main diagonal. + If ``k > 0``, the diagonal is above the main diagonal. + Default: ``0`` + + Returns: + usm_ndarray: + An upper-triangular array or a stack of upper-triangular arrays. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected argument of type dpctl.tensor.usm_ndarray, " + f"got {type(x)}." + ) + + k = operator.index(k) + + order = "F" if (x.flags.f_contiguous) else "C" + + shape = x.shape + nd = x.ndim + if nd < 2: + raise ValueError("Array dimensions less than 2.") + + q = x.sycl_queue + if k > shape[nd - 1]: + res = dpt.zeros( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + elif k <= -shape[nd - 2] + 1: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + else: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, triu_ev = ti._triu( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, triu_ev) + + return res diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 20775b071ea8..8d53655b2754 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -343,6 +343,144 @@ sycl::event full_strided_impl(sycl::queue &q, return fill_ev; } +/* =========================== Tril and triu ============================== */ + +// define function type +typedef sycl::event (*tri_fn_ptr_t)(sycl::queue &, + ssize_t, // inner_range //ssize_t + ssize_t, // outer_range + char *, // src_data_ptr + char *, // dst_data_ptr + ssize_t, // nd + ssize_t *, // shape_and_strides + ssize_t, // k + const std::vector &, + const std::vector &); + +/*! + * @brief Function to copy triangular matrices from source stack to destination + * stack. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param inner_range Number of elements in each matrix. + * @param outer_range Number of matrices to copy. + * @param src_p Kernel accessible USM pointer for the source array. + * @param dst_p Kernel accessible USM pointer for the destination array. + * @param nd The array dimensionality of source and destination arrays. + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides of arrays. + * @param k Position of the diagonal above/below which to copy filling the rest + * with zero elements. + * @param depends List of events to wait for before starting computations, if + * any. + * @param additional_depends List of additional events to wait for before + * starting computations, if any. + * + * @return Event to wait on to ensure that computation completes. 
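+ * The stack of matrices is traversed as one flat 1D range of
+ * inner_range * outer_range work items; each item either copies the source
+ * element or writes Ty(0), depending on its position relative to diagonal k.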
+ * @defgroup CtorKernels + */ +template +class tri_kernel; +template +sycl::event tri_impl(sycl::queue &exec_q, + ssize_t inner_range, + ssize_t outer_range, + char *src_p, + char *dst_p, + ssize_t nd, + ssize_t *shape_and_strides, + ssize_t k, + const std::vector &depends, + const std::vector &additional_depends) +{ + static constexpr int d2 = 2; + ssize_t src_s = nd; + ssize_t dst_s = 2 * nd; + ssize_t nd_1 = nd - 1; + ssize_t nd_2 = nd - 2; + Ty *src = reinterpret_cast(src_p); + Ty *dst = reinterpret_cast(dst_p); + + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + sycl::event tri_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + cgh.parallel_for>( + sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) { + ssize_t outer_gid = idx[0] / inner_range; + ssize_t inner_gid = idx[0] - inner_range * outer_gid; + + ssize_t src_inner_offset = 0, dst_inner_offset = 0; + bool to_copy{false}; + + { + using dpctl::tensor::strides::CIndexer_array; + CIndexer_array indexer_i( + {shape_and_strides[nd_2], shape_and_strides[nd_1]}); + indexer_i.set(inner_gid); + const std::array &inner = indexer_i.get(); + src_inner_offset = + inner[0] * shape_and_strides[src_s + nd_2] + + inner[1] * shape_and_strides[src_s + nd_1]; + dst_inner_offset = + inner[0] * shape_and_strides[dst_s + nd_2] + + inner[1] * shape_and_strides[dst_s + nd_1]; + + if constexpr (upper) + to_copy = (inner[0] + k >= inner[1]); + else + to_copy = (inner[0] + k <= inner[1]); + } + + ssize_t src_offset = 0; + ssize_t dst_offset = 0; + { + using dpctl::tensor::strides::CIndexer_vector; + CIndexer_vector outer(nd - d2); + outer.get_displacement( + outer_gid, shape_and_strides, shape_and_strides + src_s, + shape_and_strides + dst_s, src_offset, dst_offset); + } + + src_offset += src_inner_offset; + dst_offset += dst_inner_offset; + + dst[dst_offset] = (to_copy) ? src[src_offset] : Ty(0); + }); + }); + return tri_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ +template +struct TrilGenericFactory +{ + fnT get() + { + fnT f = tri_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. 
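+ * Selects the tri_impl instantiation that performs the upper-triangular
+ * (triu) copy.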
+ * @ingroup CtorKernels + */ +template +struct TriuGenericFactory +{ + fnT get() + { + fnT f = tri_impl; + return f; + } +}; + } // namespace constructors } // namespace kernels } // namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index dd660c497f9a..f2afce105f7f 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -60,7 +60,7 @@ #include "linear_sequences.hpp" // #include "repeat.hpp" #include "simplify_iteration_space.hpp" -// #include "triul_ctor.hpp" +#include "triul_ctor.hpp" #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" // #include "where.hpp" @@ -129,7 +129,7 @@ using dpctl::tensor::py_internal::usm_ndarray_take; /* =========================== Tril and triu ============================== */ -// using dpctl::tensor::py_internal::usm_ndarray_triul; +using dpctl::tensor::py_internal::usm_ndarray_triul; /* =========================== Where ============================== */ @@ -162,7 +162,7 @@ void init_dispatch_vectors(void) init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); - // init_triul_ctor_dispatch_vectors(); + init_triul_ctor_dispatch_vectors(); // populate_masked_extract_dispatch_vectors(); // populate_masked_place_dispatch_vectors(); @@ -388,27 +388,27 @@ PYBIND11_MODULE(_tensor_impl, m) dpctl::tensor::py_internal::default_device_index_type, "Gives default index type supported by device.", py::arg("dev")); - // auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, - // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, - // sycl::queue &exec_q, - // const std::vector depends) - // -> std::pair { - // return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends); - // }; - // m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"), - // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + sycl::queue &exec_q, + const std::vector depends) + -> std::pair { + return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends); + }; + m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"), + py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + py::arg("depends") = py::list()); - // auto triu_fn = [](const dpctl::tensor::usm_ndarray &src, - // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, - // sycl::queue &exec_q, - // const std::vector depends) - // -> std::pair { - // return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends); - // }; - // m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), - // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + auto triu_fn = [](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + sycl::queue &exec_q, + const std::vector depends) + -> std::pair { + return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends); + }; + m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), + py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), // py::arg("cumsum"), py::arg("sycl_queue"), diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp new file mode 100644 index 
000000000000..0890dfdb4766 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp @@ -0,0 +1,253 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include // for std::copy +#include // for std::size_t +#include // for std::make_shared +#include // for std::runtime_error +#include // for std::pair, std::move +#include // for std::vector, std::begin, std::end + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/constructors.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +using dpctl::tensor::kernels::constructors::tri_fn_ptr_t; + +static tri_fn_ptr_t tril_generic_dispatch_vector[td_ns::num_types]; +static tri_fn_ptr_t triu_generic_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_triul(sycl::queue &exec_q, + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + char part, + py::ssize_t k = 0, + const std::vector &depends = {}) +{ + // array dimensions must be the same + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("Array dimensions are not the same."); + } + + if (src_nd < 2) { + throw py::value_error("Array dimensions less than 2."); + } + + // shapes must be the same + 
const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + std::size_t src_nelems(1); + + for (int i = 0; shapes_equal && i < src_nd; ++i) { + src_nelems *= static_cast(src_shape[i]); + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + if (src_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + // TODO: could use a temporary, but this is done by the caller + throw py::value_error("Arrays index overlapping segments of memory"); + } + + auto array_types = td_ns::usm_ndarray_types(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (dst_typeid != src_typeid) { + throw py::value_error("Array dtype are not the same."); + } + + // check same queues + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue context is not the same as allocation contexts"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto src_strides = src.get_strides_vector(); + auto dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd - 2; + const py::ssize_t *shape = src_shape; + + const shT iter_src_strides(std::begin(src_strides), + std::begin(src_strides) + nd); + const shT iter_dst_strides(std::begin(dst_strides), + std::begin(dst_strides) + nd); + + simplify_iteration_space(nd, shape, iter_src_strides, iter_dst_strides, + // output + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + if (src_offset != 0 || dst_offset != 0) { + throw py::value_error("Reversed slice for dst is not supported"); + } + + nd += 2; + + using usm_host_allocatorT = + dpctl::tensor::alloc_utils::usm_host_allocator; + using usmshT = std::vector; + + usm_host_allocatorT allocator(exec_q); + auto shp_host_shape_and_strides = + std::make_shared(3 * nd, allocator); + + std::copy(simplified_shape.begin(), simplified_shape.end(), + shp_host_shape_and_strides->begin()); + (*shp_host_shape_and_strides)[nd - 2] = src_shape[src_nd - 2]; + (*shp_host_shape_and_strides)[nd - 1] = src_shape[src_nd - 1]; + + std::copy(simplified_src_strides.begin(), simplified_src_strides.end(), + shp_host_shape_and_strides->begin() + nd); + (*shp_host_shape_and_strides)[2 * nd - 2] = src_strides[src_nd - 2]; + (*shp_host_shape_and_strides)[2 * nd - 1] = src_strides[src_nd - 1]; + + std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(), + shp_host_shape_and_strides->begin() + 2 * nd); + (*shp_host_shape_and_strides)[3 * nd - 2] = dst_strides[src_nd - 2]; + (*shp_host_shape_and_strides)[3 * nd - 1] = dst_strides[src_nd - 1]; + + auto dev_shape_and_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(3 * nd, + exec_q); + py::ssize_t *dev_shape_and_strides = dev_shape_and_strides_owner.get(); + + const sycl::event 
©_shape_and_strides = exec_q.copy( + shp_host_shape_and_strides->data(), dev_shape_and_strides, 3 * nd); + + py::ssize_t inner_range = src_shape[src_nd - 1] * src_shape[src_nd - 2]; + py::ssize_t outer_range = src_nelems / inner_range; + + sycl::event tri_ev; + if (part == 'l') { + auto fn = tril_generic_dispatch_vector[src_typeid]; + tri_ev = + fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, + dev_shape_and_strides, k, depends, {copy_shape_and_strides}); + } + else { + auto fn = triu_generic_dispatch_vector[src_typeid]; + tri_ev = + fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, + dev_shape_and_strides, k, depends, {copy_shape_and_strides}); + } + + const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(tri_ev); + const auto &ctx = exec_q.get_context(); + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + cgh.host_task( + [shp_host_shape_and_strides = std::move(shp_host_shape_and_strides), + dev_shape_and_strides, ctx]() { + // capture of shp_host_shape_and_strides ensure the underlying + // vector exists for the entire execution of copying kernel + sycl_free_noexcept(dev_shape_and_strides, ctx); + }); + }); + // since host_task now owns USM allocation, release ownership by smart + // pointer + dev_shape_and_strides_owner.release(); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {temporaries_cleanup_ev}), tri_ev); +} + +void init_triul_ctor_dispatch_vectors(void) +{ + + using namespace td_ns; + using dpctl::tensor::kernels::constructors::TrilGenericFactory; + using dpctl::tensor::kernels::constructors::TriuGenericFactory; + + DispatchVectorBuilder dvb1; + dvb1.populate_dispatch_vector(tril_generic_dispatch_vector); + + DispatchVectorBuilder dvb2; + dvb2.populate_dispatch_vector(triu_generic_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp new file mode 100644 index 000000000000..08889df6227f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp @@ -0,0 +1,62 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_triul(sycl::queue &exec_q, + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + char part, + py::ssize_t k = 0, + const std::vector &depends = {}); + +extern void init_triul_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From f1d6e5650910eec6f330b2de902a93a1ae95df5f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 03:05:03 -0800 Subject: [PATCH 23/38] Use tril/triu/_tril from dpctl_ext.tensor in dpnp --- dpnp/dpnp_container.py | 4 ++-- dpnp/linalg/dpnp_utils_linalg.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index b13bf96cda28..c8e28529cd57 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -270,13 +270,13 @@ def ones( def tril(x1, /, *, k=0): """Creates `dpnp_array` as lower triangular part of an input array.""" - array_obj = dpt.tril(dpnp.get_usm_ndarray(x1), k=k) + array_obj = dpt_ext.tril(dpnp.get_usm_ndarray(x1), k=k) return dpnp_array._create_from_usm_ndarray(array_obj) def triu(x1, /, *, k=0): """Creates `dpnp_array` as upper triangular part of an input array.""" - array_obj = dpt.triu(dpnp.get_usm_ndarray(x1), k=k) + array_obj = dpt_ext.triu(dpnp.get_usm_ndarray(x1), k=k) return dpnp_array._create_from_usm_ndarray(array_obj) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 196cd2ae9da5..5fb1c099dde2 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -42,12 +42,12 @@ from typing import NamedTuple -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import normalize_axis_index from numpy import prod +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li from dpnp.dpnp_utils import get_usm_allocations From 668079060d9ece02fbb6887c2313edca9e6ecbef Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 9 Feb 2026 02:47:35 -0800 Subject: [PATCH 24/38] Disable pylint no-name-in-module for dpctl_ext --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 1 + dpnp/dpnp_iface.py | 3 +-- dpnp/dpnp_iface_searching.py | 1 + dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index b63bf61f8dad..d8235b84e2d0 
100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -44,6 +44,7 @@ _validate_dtype, ) +# pylint: disable=no-name-in-module import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 832446c826ba..6220c61db6d9 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -40,6 +40,7 @@ """ # pylint: disable=protected-access +# pylint: disable=no-name-in-module import os @@ -53,8 +54,6 @@ import dpnp from .dpnp_array import dpnp_array - -# pylint: disable=no-name-in-module from .dpnp_utils import ( dpnp_descriptor, map_dtype_to_device, diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index fdbd317d31dd..74fbc9b37d13 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -41,6 +41,7 @@ import dpctl.tensor as dpt +# pylint: disable=no-name-in-module import dpctl_ext.tensor._tensor_impl as dti import dpnp diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 4d8e3cdfbd0d..2de2bc15372c 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -37,6 +37,7 @@ ) from dpctl.utils import ExecutionPlacementError +# pylint: disable=no-name-in-module import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.blas._blas_impl as bi From 263b7175f4aab799cd4fa100602011e8e23d046b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 04:31:01 -0800 Subject: [PATCH 25/38] Add TODO comments --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 2 ++ dpnp/dpnp_iface.py | 2 ++ dpnp/dpnp_iface_searching.py | 2 ++ dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 2 ++ dpnp/scipy/linalg/_utils.py | 2 ++ setup.py | 2 +- 6 files changed, 11 insertions(+), 1 deletion(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index d8235b84e2d0..88abcee5035c 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -45,6 +45,8 @@ ) # pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 6220c61db6d9..50b474014666 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -50,6 +50,8 @@ import numpy from dpctl.tensor._device import normalize_queue_device +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 74fbc9b37d13..16ab633d506b 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -42,6 +42,8 @@ import dpctl.tensor as dpt # pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as dti import dpnp diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 2de2bc15372c..3dfd3c23ee7f 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -38,6 +38,8 @@ from dpctl.utils import ExecutionPlacementError # pylint: disable=no-name-in-module +# TODO: revert to `import 
dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.blas._blas_impl as bi diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py index 8eb9187236bf..ce832d8f4529 100644 --- a/dpnp/scipy/linalg/_utils.py +++ b/dpnp/scipy/linalg/_utils.py @@ -44,6 +44,8 @@ import dpctl.utils as dpu +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li diff --git a/setup.py b/setup.py index a0c54b066dcf..7ffef3bed9d8 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ "dpnp.scipy", "dpnp.scipy.linalg", "dpnp.scipy.special", - # dpctl_ext + # TODO: replace with dpctl; dpctl.tensor "dpctl_ext", "dpctl_ext.tensor", ], From 4130c1b80aa108ca127040a6c4ea15bcaa86173f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 04:53:39 -0800 Subject: [PATCH 26/38] Use default_device_complex_type from dpctl_ext on test_array_api_info.py --- dpnp/tests/test_array_api_info.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dpnp/tests/test_array_api_info.py b/dpnp/tests/test_array_api_info.py index b310192ffc59..32730c8724dc 100644 --- a/dpnp/tests/test_array_api_info.py +++ b/dpnp/tests/test_array_api_info.py @@ -1,9 +1,11 @@ -import numpy import pytest from dpctl import SyclDeviceCreationError, get_devices, select_default_device -from dpctl.tensor._tensor_impl import default_device_complex_type import dpnp + +# TODO: revert to `from dpctl.tensor....` +# when dpnp fully migrates dpctl/tensor +from dpctl_ext.tensor._tensor_impl import default_device_complex_type from dpnp.tests.helper import ( has_support_aspect64, is_win_platform, From 17ca9ab52368f3bbdbfbdf6410b82823c98c53c0 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 06:59:55 -0800 Subject: [PATCH 27/38] Remove unused build_dpctl_ext function --- dpctl_ext/CMakeLists.txt | 80 ---------------------------------------- 1 file changed, 80 deletions(-) diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt index bb33a4f57332..cdb007a2d230 100644 --- a/dpctl_ext/CMakeLists.txt +++ b/dpctl_ext/CMakeLists.txt @@ -122,84 +122,4 @@ set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) set(CMAKE_INSTALL_RPATH "$ORIGIN") -function(build_dpctl_ext _trgt _src _dest) - set(options SYCL) - cmake_parse_arguments(BUILD_DPCTL_EXT "${options}" "RELATIVE_PATH" "" ${ARGN}) - add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src) - set(_cythonize_trgt "${_trgt}_cythonize_pyx") - python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src}) - if(BUILD_DPCTL_EXT_SYCL) - add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src}) - target_compile_options(${_trgt} PRIVATE -fno-sycl-id-queries-fit-in-int) - target_link_options(${_trgt} PRIVATE -fsycl-device-code-split=per_kernel) - if(DPCTL_OFFLOAD_COMPRESS) - target_link_options(${_trgt} PRIVATE --offload-compress) - endif() - if(_dpctl_sycl_targets) - # make fat binary - target_compile_options( - ${_trgt} - PRIVATE ${_dpctl_sycl_target_compile_options} - ) - target_link_options(${_trgt} PRIVATE ${_dpctl_sycl_target_link_options}) - endif() - endif() - target_link_libraries(${_trgt} PRIVATE Python::NumPy) - if(DPCTL_GENERATE_COVERAGE) - target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1) - if(BUILD_DPCTL_EXT_SYCL) - target_compile_options(${_trgt} PRIVATE 
-fno-sycl-use-footer) - endif() - endif() - target_link_libraries(${_trgt} PRIVATE DPCTLSyclInterface) - set(_linker_options "LINKER:${DPCTL_LDFLAGS}") - target_link_options(${_trgt} PRIVATE ${_linker_options}) - get_filename_component(_name_wle ${_generated_src} NAME_WLE) - get_filename_component(_generated_src_dir ${_generated_src} DIRECTORY) - set(_generated_public_h "${_generated_src_dir}/${_name_wle}.h") - set(_generated_api_h "${_generated_src_dir}/${_name_wle}_api.h") - - # TODO: create separate folder inside build folder that contains only - # headers related to this target and appropriate folder structure to - # eliminate shadow dependencies - get_filename_component(_generated_src_dir_dir ${_generated_src_dir} DIRECTORY) - # TODO: do not set directory if we did not generate header - target_include_directories(${_trgt} INTERFACE ${_generated_src_dir_dir}) - set(_rpath_value "$ORIGIN") - if(BUILD_DPCTL_EXT_RELATIVE_PATH) - set(_rpath_value "${_rpath_value}/${BUILD_DPCTL_EXT_RELATIVE_PATH}") - endif() - if(DPCTL_WITH_REDIST) - set(_rpath_value "${_rpath_value}:${_rpath_value}/../../..") - endif() - set_target_properties(${_trgt} PROPERTIES INSTALL_RPATH ${_rpath_value}) - - install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest}) - install( - FILES ${_generated_api_h} - # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` - DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} - OPTIONAL - ) - install( - FILES ${_generated_public_h} - # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` - DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} - OPTIONAL - ) - if(DPCTL_GENERATE_COVERAGE) - get_filename_component(_original_src_dir ${_src} DIRECTORY) - file(RELATIVE_PATH _rel_dir ${CMAKE_SOURCE_DIR} ${_original_src_dir}) - install(FILES ${_generated_src} DESTINATION ${CMAKE_INSTALL_PREFIX}/${_rel_dir}) - endif() - - # Create target with headers only, because python is managing all the - # library imports at runtime - set(_trgt_headers ${_trgt}_headers) - add_library(${_trgt_headers} INTERFACE) - add_dependencies(${_trgt_headers} ${_trgt}) - get_target_property(_trgt_headers_dir ${_trgt} INTERFACE_INCLUDE_DIRECTORIES) - target_include_directories(${_trgt_headers} INTERFACE ${_trgt_headers_dir}) -endfunction() - add_subdirectory(tensor) From 79cb2a45f28f5099701c0728a6def5c8961c5279 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 07:51:45 -0800 Subject: [PATCH 28/38] Apply remarks for CMake files --- dpctl_ext/CMakeLists.txt | 10 ++------- dpctl_ext/tensor/CMakeLists.txt | 38 ++++++++++++++++++--------------- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt index cdb007a2d230..e58693091422 100644 --- a/dpctl_ext/CMakeLists.txt +++ b/dpctl_ext/CMakeLists.txt @@ -27,13 +27,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** -find_package(Python REQUIRED COMPONENTS NumPy) - -# -t is to only Cythonize sources with timestamps newer than existing CXX files (if present) -# -w is to set working directory (and correctly set __pyx_f[] array of filenames) -set(CYTHON_FLAGS "-t -w \"${CMAKE_SOURCE_DIR}\"") -find_package(Cython REQUIRED) - +# TODO: rework this logic to remove current duplication if(WIN32) string( CONCAT WARNING_FLAGS @@ -118,7 +112,7 @@ else() endif() # at build time create include/ directory and copy header files over -set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) +# set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) set(CMAKE_INSTALL_RPATH "$ORIGIN") diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ee8da2e49506..28e7a4cb55f4 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -27,8 +27,10 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** +find_package(Python COMPONENTS Development) + if(WIN32) - if(${CMAKE_VERSION} VERSION_LESS "3.23") + if(${CMAKE_VERSION} VERSION_LESS "3.27") # this is a work-around for target_link_options inserting option after -link option, cause # linker to ignore it. set(CMAKE_CXX_LINK_FLAGS @@ -37,6 +39,7 @@ if(WIN32) endif() endif() +# TODO: reuse this library for dpnp ufunc extension build set(_static_lib_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp ) @@ -67,11 +70,11 @@ add_library(${_static_lib_trgt} STATIC ${_static_lib_sources}) target_include_directories( ${_static_lib_trgt} PRIVATE - ${Python_INCLUDE_DIRS} - ${DPCTL_INCLUDE_DIR} + # ${Python_INCLUDE_DIRS} + # ${Dpctl_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include ) -target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers ${Python_LIBRARIES}) +target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Python) set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON) set(_py_trgts) @@ -94,14 +97,14 @@ set(_no_fast_math_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ) -list( - APPEND _no_fast_math_sources - # ${_elementwise_sources} - # ${_reduction_sources} - # ${_sorting_sources} - # ${_linalg_sources} - # ${_accumulator_sources} -) +#list( +#APPEND _no_fast_math_sources +# ${_elementwise_sources} +# ${_reduction_sources} +# ${_sorting_sources} +# ${_linalg_sources} +# ${_accumulator_sources} +#) foreach(_src_fn ${_no_fast_math_sources}) get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) @@ -114,7 +117,7 @@ endforeach() set(_compiler_definitions "") -set(_linker_options "LINKER:${DPCTL_LDFLAGS}") +set(_linker_options "LINKER:${DPNP_LDFLAGS}") foreach(python_module_name ${_py_trgts}) target_compile_options( ${python_module_name} @@ -124,6 +127,7 @@ foreach(python_module_name ${_py_trgts}) ${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel ) + # TODO: expand DPCTL_OFFLOAD_COMPRESS to the whole dpnp level if(DPCTL_OFFLOAD_COMPRESS) target_link_options(${python_module_name} PRIVATE --offload-compress) endif() @@ -149,22 +153,22 @@ foreach(python_module_name ${_py_trgts}) PRIVATE -fprofile-instr-generate -fcoverage-mapping ) endif() - if(_dpctl_sycl_targets) + if(_dpnp_sycl_targets) # make fat binary target_compile_options( ${python_module_name} - PRIVATE 
${_dpctl_sycl_target_compile_options} + PRIVATE ${_dpnp_sycl_target_compile_options} ) target_link_options( ${python_module_name} - PRIVATE ${_dpctl_sycl_target_link_options} + PRIVATE ${_dpnp_sycl_target_link_options} ) endif() # TODO: update source so they reference individual libraries instead of # dpctl4pybind11.hpp. It will allow to simplify dependency tree # NOTE: dpctl C-API is resolved at runtime via Python # target_link_libraries(${python_module_name} PRIVATE DpctlCAPI) - if(DPCTL_WITH_REDIST) + if(DPNP_WITH_REDIST) set_target_properties( ${python_module_name} PROPERTIES INSTALL_RPATH "$ORIGIN/../../../.." From 4bf080edc0e5d277441fe39b31733571fbad0de3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 08:30:03 -0800 Subject: [PATCH 29/38] Apply remarks for c++ files --- .../include/kernels/copy_and_cast.hpp | 18 ++++----------- .../include/kernels/copy_as_contiguous.hpp | 19 ++++----------- .../source/copy_and_cast_usm_to_usm.cpp | 23 ++++--------------- .../source/copy_and_cast_usm_to_usm.hpp | 11 ++------- .../libtensor/source/copy_as_contig.cpp | 14 ++++------- .../libtensor/source/copy_as_contig.hpp | 11 ++------- .../source/device_support_queries.cpp | 13 ++++------- .../source/device_support_queries.hpp | 12 ++-------- .../source/simplify_iteration_space.cpp | 12 ++++------ .../source/simplify_iteration_space.hpp | 11 +++------ .../tensor/libtensor/source/tensor_ctors.cpp | 10 ++++---- 11 files changed, 43 insertions(+), 111 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp index a07d311a7fcb..d6001a11e471 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -33,11 +33,12 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include +#include #include #include #include -#include +#include #include "dpctl_tensor_types.hpp" #include "kernels/alignment.hpp" @@ -45,13 +46,7 @@ #include "utils/sycl_utils.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace copy_and_cast +namespace dpctl::tensor::kernels::copy_and_cast { using dpctl::tensor::ssize_t; @@ -1282,7 +1277,4 @@ struct CopyForRollNDShiftFactory } }; -} // namespace copy_and_cast -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::kernels::copy_and_cast diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp index b4f367448758..37126a22dc64 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -33,11 +33,12 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include +#include #include #include #include -#include +#include #include "dpctl_tensor_types.hpp" #include "kernels/alignment.hpp" @@ -45,13 +46,7 @@ #include "utils/sycl_utils.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace copy_as_contig +namespace dpctl::tensor::kernels::copy_as_contig { using dpctl::tensor::ssize_t; @@ -648,8 +643,4 @@ struct AsCContigNDBatchOfSquareMatricesFactory return as_c_contiguous_nd_batch_of_square_matrices_impl; } }; - -} // namespace copy_as_contig -} // 
namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::kernels::copy_as_contig diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp index 0458aa75ac32..3d20be02f885 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -32,21 +32,15 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -#include -#include +#include #include -#include -#include #include -#include -#include +#include #include +#include #include "dpnp4pybind11.hpp" -#include -#include #include -#include #include "kernels/copy_and_cast.hpp" #include "utils/memory_overlap.hpp" @@ -54,16 +48,11 @@ #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" #include "copy_as_contig.hpp" #include "simplify_iteration_space.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace td_ns = dpctl::tensor::type_dispatch; @@ -305,6 +294,4 @@ void init_copy_and_cast_usm_to_usm_dispatch_tables(void) dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp index d2a2dcaf7b85..d2e07b08d38f 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp @@ -38,13 +38,8 @@ #include #include "dpnp4pybind11.hpp" -#include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair copy_usm_ndarray_into_usm_ndarray( @@ -55,6 +50,4 @@ extern std::pair copy_usm_ndarray_into_usm_ndarray( extern void init_copy_and_cast_usm_to_usm_dispatch_tables(); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp index 53b39ff5874c..7105202fe2ff 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp @@ -32,10 +32,11 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -#include #include +#include #include #include +#include #include #include @@ -54,13 +55,10 @@ #include "copy_as_contig.hpp" #include "simplify_iteration_space.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { +namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::kernels::copy_as_contig:: @@ -753,6 +751,4 @@ std::pair ascontig_ev); } -} // end of namespace py_internal -} // end of namespace tensor -} // end of namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp index 2de67098b7fa..bfe3159c8813 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp +++ 
b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp @@ -32,14 +32,9 @@ #include #include "dpnp4pybind11.hpp" -#include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { std::pair @@ -56,6 +51,4 @@ std::pair void init_copy_as_contig_dispatch_vectors(void); -} // end of namespace py_internal -} // end of namespace tensor -} // end of namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp index 51eb7dba1b6c..97a8ba83831e 100644 --- a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp @@ -39,13 +39,11 @@ #include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { +namespace py = pybind11; + namespace { @@ -61,7 +59,6 @@ std::string _default_device_fp_type(const sycl::device &d) int get_numpy_major_version() { - namespace py = pybind11; py::module_ numpy = py::module_::import("numpy"); py::str version_string = numpy.attr("__version__"); @@ -179,6 +176,4 @@ std::string default_device_index_type(const py::object &arg) return _default_device_index_type(d); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp index 6ea01dcd49d7..adde7aefe3dd 100644 --- a/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp @@ -36,14 +36,8 @@ #include "dpnp4pybind11.hpp" #include -#include -#include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::string default_device_fp_type(const py::object &); @@ -53,6 +47,4 @@ extern std::string default_device_bool_type(const py::object &); extern std::string default_device_complex_type(const py::object &); extern std::string default_device_index_type(const py::object &); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp index 2526f022e0ac..e3cff701ed50 100644 --- a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp @@ -34,15 +34,13 @@ #include "simplify_iteration_space.hpp" #include "utils/strided_iters.hpp" +#include #include +#include #include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace py = pybind11; @@ -539,6 +537,4 @@ std::vector _unravel_index_f(py::ssize_t flat_index, return mi; } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp index d3448ee1f5fd..acbc833157d1 100644 --- a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp @@ -36,11 +36,7 @@ #include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace 
dpctl::tensor::py_internal { namespace py = pybind11; @@ -125,6 +121,5 @@ std::vector _unravel_index_c(py::ssize_t, std::vector const &); std::vector _unravel_index_f(py::ssize_t, std::vector const &); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 911d75ebd925..be69ee1a8c7e 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -32,15 +32,17 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -#include -#include -#include +// #include +// #include +// #include #include #include #include -#include +// #include +#include #include #include +#include #include "dpnp4pybind11.hpp" From cfa6cd69735591e79ca3437cc05c326ce115ffc9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 11:31:42 -0800 Subject: [PATCH 30/38] Remove linear-sequence implementations --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/constructors.hpp | 177 ---------- .../libtensor/source/linear_sequences.cpp | 312 ------------------ .../libtensor/source/linear_sequences.hpp | 69 ---- .../tensor/libtensor/source/tensor_ctors.cpp | 38 +-- 5 files changed, 19 insertions(+), 579 deletions(-) delete mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.cpp delete mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 1375c8316754..baf8ef5ce5f6 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -48,7 +48,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 8d53655b2754..f43614e13766 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -58,189 +58,12 @@ using dpctl::tensor::ssize_t; @defgroup CtorKernels */ -template -class linear_sequence_step_kernel; -template -class linear_sequence_affine_kernel; template class full_strided_kernel; // template class eye_kernel; using namespace dpctl::tensor::offset_utils; -template -class LinearSequenceStepFunctor -{ -private: - Ty *p = nullptr; - Ty start_v; - Ty step_v; - -public: - LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) - : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) - { - } - - void operator()(sycl::id<1> wiid) const - { - auto i = wiid.get(0); - using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { - p[i] = Ty{start_v.real() + i * step_v.real(), - start_v.imag() + i * step_v.imag()}; - } - else { - p[i] = start_v + i * step_v; - } - } -}; - -/*! 
- * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by typed starting value and - * increment. - * - * @param q Sycl queue to which the kernel is submitted - * @param nelems Length of the sequence - * @param start_v Typed starting value of the sequence - * @param step_v Typed increment of the sequence - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. - * @defgroup CtorKernels - */ -template -sycl::event lin_space_step_impl(sycl::queue &exec_q, - std::size_t nelems, - Ty start_v, - Ty step_v, - char *array_data, - const std::vector &depends) -{ - dpctl::tensor::type_utils::validate_type_for_device(exec_q); - sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>{nelems}, - LinearSequenceStepFunctor(array_data, start_v, step_v)); - }); - - return lin_space_step_event; -} - -// Constructor to populate tensor with linear sequence defined by -// start and and data - -template -class LinearSequenceAffineFunctor -{ -private: - Ty *p = nullptr; - Ty start_v; - Ty end_v; - std::size_t n; - -public: - LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, std::size_t den) - : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), - n((den == 0) ? 1 : den) - { - } - - void operator()(sycl::id<1> wiid) const - { - auto i = wiid.get(0); - wTy wc = wTy(i) / n; - wTy w = wTy(n - i) / n; - using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { - using reT = typename Ty::value_type; - auto _w = static_cast(w); - auto _wc = static_cast(wc); - auto re_comb = sycl::fma(start_v.real(), _w, reT(0)); - re_comb = - sycl::fma(end_v.real(), _wc, - re_comb); // start_v.real() * _w + end_v.real() * _wc; - auto im_comb = - sycl::fma(start_v.imag(), _w, - reT(0)); // start_v.imag() * _w + end_v.imag() * _wc; - im_comb = sycl::fma(end_v.imag(), _wc, im_comb); - Ty affine_comb = Ty{re_comb, im_comb}; - p[i] = affine_comb; - } - else if constexpr (std::is_floating_point::value) { - Ty _w = static_cast(w); - Ty _wc = static_cast(wc); - auto affine_comb = - sycl::fma(start_v, _w, Ty(0)); // start_v * w + end_v * wc; - affine_comb = sycl::fma(end_v, _wc, affine_comb); - p[i] = affine_comb; - } - else { - using dpctl::tensor::type_utils::convert_impl; - auto affine_comb = start_v * w + end_v * wc; - p[i] = convert_impl(affine_comb); - } - } -}; - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by typed starting and end values. - * - * @param exec_q Sycl queue to which kernel is submitted for execution. - * @param nelems Length of the sequence. - * @param start_v Stating value of the sequence. - * @param end_v End-value of the sequence. - * @param include_endpoint Whether the end-value is included in the sequence. - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. 
- * @defgroup CtorKernels - */ -template -sycl::event lin_space_affine_impl(sycl::queue &exec_q, - std::size_t nelems, - Ty start_v, - Ty end_v, - bool include_endpoint, - char *array_data, - const std::vector &depends) -{ - dpctl::tensor::type_utils::validate_type_for_device(exec_q); - - const bool device_supports_doubles = - exec_q.get_device().has(sycl::aspect::fp64); - const std::size_t den = (include_endpoint) ? nelems - 1 : nelems; - - sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - if (device_supports_doubles) { - using KernelName = linear_sequence_affine_kernel; - using Impl = LinearSequenceAffineFunctor; - - cgh.parallel_for(sycl::range<1>{nelems}, - Impl(array_data, start_v, end_v, den)); - } - else { - using KernelName = linear_sequence_affine_kernel; - using Impl = LinearSequenceAffineFunctor; - - cgh.parallel_for(sycl::range<1>{nelems}, - Impl(array_data, start_v, end_v, den)); - } - }); - - return lin_space_affine_event; -} - /* ================ Full ================== */ /*! diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp deleted file mode 100644 index 02c4a8ad0fa1..000000000000 --- a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp +++ /dev/null @@ -1,312 +0,0 @@ -//***************************************************************************** -// Copyright (c) 2026, Intel Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// - Neither the name of the copyright holder nor the names of its contributors -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -// THE POSSIBILITY OF SUCH DAMAGE. 
-//***************************************************************************** -// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#include "dpnp4pybind11.hpp" -#include -#include -#include -#include -#include -#include -#include - -#include "kernels/constructors.hpp" -#include "utils/output_validation.hpp" -#include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" - -#include "linear_sequences.hpp" - -namespace py = pybind11; -namespace td_ns = dpctl::tensor::type_dispatch; - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -// Constructor to populate tensor with linear sequence defined by -// start and step data - -typedef sycl::event (*lin_space_step_fn_ptr_t)( - sycl::queue &, - std::size_t, // num_elements - const py::object &start, - const py::object &step, - char *, // dst_data_ptr - const std::vector &); - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by starting value and increment - * given as Python objects. - * - * @param q Sycl queue to which the kernel is submitted - * @param nelems Length of the sequence - * @param start Starting value of the sequence as Python object. Must be - * convertible to array element data type `Ty`. - * @param step Increment of the sequence as Python object. Must be convertible - * to array element data type `Ty`. - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. - * @defgroup CtorKernels - */ -template -sycl::event lin_space_step_impl(sycl::queue &exec_q, - std::size_t nelems, - const py::object &start, - const py::object &step, - char *array_data, - const std::vector &depends) -{ - Ty start_v = py::cast(start); - Ty step_v = py::cast(step); - - using dpctl::tensor::kernels::constructors::lin_space_step_impl; - - auto lin_space_step_event = lin_space_step_impl( - exec_q, nelems, start_v, step_v, array_data, depends); - - return lin_space_step_event; -} - -typedef sycl::event (*lin_space_affine_fn_ptr_t)( - sycl::queue &, - std::size_t, // num_elements - const py::object &start, - const py::object &end, - bool include_endpoint, - char *, // dst_data_ptr - const std::vector &); - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by starting and end values given - * as Python objects. - * - * @param exec_q Sycl queue to which kernel is submitted for execution. - * @param nelems Length of the sequence - * @param start Stating value of the sequence as Python object. Must be - * convertible to array data element type `Ty`. - * @param end End-value of the sequence as Python object. Must be convertible - * to array data element type `Ty`. - * @param include_endpoint Whether the end-value is included in the sequence - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. 
- * @defgroup CtorKernels - */ -template -sycl::event lin_space_affine_impl(sycl::queue &exec_q, - std::size_t nelems, - const py::object &start, - const py::object &end, - bool include_endpoint, - char *array_data, - const std::vector &depends) -{ - Ty start_v = py::cast(start); - Ty end_v = py::cast(end); - - using dpctl::tensor::kernels::constructors::lin_space_affine_impl; - - auto lin_space_affine_event = lin_space_affine_impl( - exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); - - return lin_space_affine_event; -} - -using dpctl::utils::keep_args_alive; - -static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types]; - -static lin_space_affine_fn_ptr_t - lin_space_affine_dispatch_vector[td_ns::num_types]; - -std::pair - usm_ndarray_linear_sequence_step(const py::object &start, - const py::object &dt, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends) -{ - // dst must be 1D and C-contiguous - // start, end should be coercible into data type of dst - - if (dst.get_ndim() != 1) { - throw py::value_error( - "usm_ndarray_linspace: Expecting 1D array to populate"); - } - - if (!dst.is_c_contiguous()) { - throw py::value_error( - "usm_ndarray_linspace: Non-contiguous arrays are not supported"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { - throw py::value_error( - "Execution queue is not compatible with the allocation queue"); - } - - dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); - - auto array_types = td_ns::usm_ndarray_types(); - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - py::ssize_t len = dst.get_shape(0); - if (len == 0) { - // nothing to do - return std::make_pair(sycl::event{}, sycl::event{}); - } - - char *dst_data = dst.get_data(); - sycl::event linspace_step_event; - - auto fn = lin_space_step_dispatch_vector[dst_typeid]; - - linspace_step_event = - fn(exec_q, static_cast(len), start, dt, dst_data, depends); - - return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), - linspace_step_event); -} - -std::pair - usm_ndarray_linear_sequence_affine(const py::object &start, - const py::object &end, - const dpctl::tensor::usm_ndarray &dst, - bool include_endpoint, - sycl::queue &exec_q, - const std::vector &depends) -{ - // dst must be 1D and C-contiguous - // start, end should be coercible into data type of dst - - if (dst.get_ndim() != 1) { - throw py::value_error( - "usm_ndarray_linspace: Expecting 1D array to populate"); - } - - if (!dst.is_c_contiguous()) { - throw py::value_error( - "usm_ndarray_linspace: Non-contiguous arrays are not supported"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { - throw py::value_error( - "Execution queue context is not the same as allocation context"); - } - - dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); - - auto array_types = td_ns::usm_ndarray_types(); - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - py::ssize_t len = dst.get_shape(0); - if (len == 0) { - // nothing to do - return std::make_pair(sycl::event{}, sycl::event{}); - } - - char *dst_data = dst.get_data(); - sycl::event linspace_affine_event; - - auto fn = lin_space_affine_dispatch_vector[dst_typeid]; - - linspace_affine_event = fn(exec_q, static_cast(len), start, - end, include_endpoint, dst_data, depends); - - return std::make_pair( - keep_args_alive(exec_q, {dst}, 
{linspace_affine_event}), - linspace_affine_event); -} - -/*! - * @brief Factor to get function pointer of type `fnT` for array with elements - * of type `Ty`. - * @defgroup CtorKernels - */ -template -struct LinSpaceStepFactory -{ - fnT get() - { - fnT f = lin_space_step_impl; - return f; - } -}; - -/*! - * @brief Factory to get function pointer of type `fnT` for array data type - * `Ty`. - */ -template -struct LinSpaceAffineFactory -{ - fnT get() - { - fnT f = lin_space_affine_impl; - return f; - } -}; - -void init_linear_sequences_dispatch_vectors(void) -{ - using namespace td_ns; - - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); - - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); -} - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp deleted file mode 100644 index 321cd2f23efe..000000000000 --- a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp +++ /dev/null @@ -1,69 +0,0 @@ -//***************************************************************************** -// Copyright (c) 2026, Intel Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// - Neither the name of the copyright holder nor the names of its contributors -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -// THE POSSIBILITY OF SUCH DAMAGE. 
-//***************************************************************************** -// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#pragma once -#include -#include -#include - -#include "dpnp4pybind11.hpp" -#include - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -extern std::pair usm_ndarray_linear_sequence_step( - const py::object &start, - const py::object &dt, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = {}); - -extern std::pair usm_ndarray_linear_sequence_affine( - const py::object &start, - const py::object &end, - const dpctl::tensor::usm_ndarray &dst, - bool include_endpoint, - sycl::queue &exec_q, - const std::vector &depends = {}); - -extern void init_linear_sequences_dispatch_vectors(void); - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index f2afce105f7f..7e4253c0cbb6 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -57,7 +57,7 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" -#include "linear_sequences.hpp" +// #include "linear_sequences.hpp" // #include "repeat.hpp" #include "simplify_iteration_space.hpp" #include "triul_ctor.hpp" @@ -98,8 +98,8 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* ============= linear-sequence ==================== */ -using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; -using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; /* ================ Full ================== */ @@ -158,7 +158,7 @@ void init_dispatch_vectors(void) init_copy_as_contig_dispatch_vectors(); // init_copy_for_reshape_dispatch_vectors(); // init_copy_for_roll_dispatch_vectors(); - init_linear_sequences_dispatch_vectors(); + // init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); @@ -300,22 +300,20 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = // py::list()); - m.def("_linspace_step", &usm_ndarray_linear_sequence_step, - "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - "specified by " - "starting point `start` and step `dt`. " - "Returns a tuple of events: (ht_event, comp_event)", - py::arg("start"), py::arg("dt"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, - "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - "specified by " - "starting point `start` and end point `end`. 
" - "Returns a tuple of events: (ht_event, comp_event)", - py::arg("start"), py::arg("end"), py::arg("dst"), - py::arg("include_endpoint"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); + // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, + // "Fills input 1D contiguous usm_ndarray `dst` with linear + // sequence " "specified by " "starting point `start` and step + // `dt`. " "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("start"), py::arg("dt"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, + // "Fills input 1D contiguous usm_ndarray `dst` with linear + // sequence " "specified by " "starting point `start` and end + // point `end`. " "Returns a tuple of events: (ht_event, + // comp_event)", py::arg("start"), py::arg("end"), py::arg("dst"), + // py::arg("include_endpoint"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); // m.def("_copy_numpy_ndarray_into_usm_ndarray", // ©_numpy_ndarray_into_usm_ndarray, From 087a2ecbfff6262224ff115c9948202ecf45e6ba Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 11:58:15 -0800 Subject: [PATCH 31/38] Use _tensor_impl from dpctl_ext in dpnp --- dpnp/dpnp_algo/dpnp_fill.py | 3 +++ dpnp/dpnp_iface.py | 1 + dpnp/dpnp_iface_indexing.py | 11 +++++++---- dpnp/fft/dpnp_utils_fft.py | 14 +++++++++++--- dpnp/linalg/dpnp_utils_linalg.py | 3 +++ dpnp/scipy/linalg/_utils.py | 1 + 6 files changed, 26 insertions(+), 7 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index 0d6640c3b8b5..4137a2794747 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -33,6 +33,9 @@ from dpctl.tensor._ctors import _cast_fill_val import dpnp + +# TODO: revert to `from dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, _full_usm_ndarray, diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 50b474014666..533bdc36c617 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -50,6 +50,7 @@ import numpy from dpctl.tensor._device import normalize_queue_device +# pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 6421f39fd4e4..a01a036e16cc 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -45,7 +45,6 @@ from collections.abc import Iterable import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._copy_utils import _nonzero_impl @@ -53,7 +52,11 @@ from dpctl.tensor._numpy_helper import normalize_axis_index import dpctl_ext.tensor as dpt_ext -import dpctl_ext.tensor._tensor_impl as ti_ext + +# pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti import dpnp # pylint: disable=no-name-in-module @@ -297,7 +300,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): "Input and output allocation queues are not compatible" ) - if ti_ext._array_overlap(x, out): + if ti._array_overlap(x, out): # Allocate a temporary buffer to avoid memory overlapping. 
out = dpt.empty_like(out) else: @@ -306,7 +309,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): _manager = dpu.SequentialOrderManager[q] dep_evs = _manager.submitted_events - h_ev, take_ev = ti_ext._take( + h_ev, take_ev = ti._take( src=x, ind=(inds,), dst=out, diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index c692774a424f..60f89a933284 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -42,6 +42,11 @@ from collections.abc import Sequence import dpctl + +# pylint: disable=no-name-in-module +# TODO: remove it when ti.__linspace_step +# is migrated to dpctl_ext/tensor +import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( @@ -50,7 +55,10 @@ ) from dpctl.utils import ExecutionPlacementError -import dpctl_ext.tensor._tensor_impl as ti +# pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti_ext import dpnp import dpnp.backend.extensions.fft._fft_impl as fi @@ -196,7 +204,7 @@ def _compute_result(dsc, a, out, forward, c2c, out_strides): if ( out is not None and out.strides == tuple(out_strides) - and not ti._array_overlap(a_usm, dpnp.get_usm_ndarray(out)) + and not ti_ext._array_overlap(a_usm, dpnp.get_usm_ndarray(out)) ): res_usm = dpnp.get_usm_ndarray(out) result = out @@ -524,7 +532,7 @@ def _truncate_or_pad(a, shape, axes): ) _manager = dpu.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events - ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_ev, copy_ev = ti_ext._copy_usm_ndarray_into_usm_ndarray( src=dpnp.get_usm_ndarray(a), dst=z.get_array()[tuple(index)], sycl_queue=exec_q, diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 5fb1c099dde2..171ac38a141c 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -47,6 +47,9 @@ from dpctl.tensor._numpy_helper import normalize_axis_index from numpy import prod +# pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py index ce832d8f4529..665a4e1595ad 100644 --- a/dpnp/scipy/linalg/_utils.py +++ b/dpnp/scipy/linalg/_utils.py @@ -44,6 +44,7 @@ import dpctl.utils as dpu +# pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti From f4492fbc8048d2fcc598a089715b85ed6504f02d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 12:28:16 -0800 Subject: [PATCH 32/38] Add missing include --- .../tensor/libtensor/include/kernels/constructors.hpp | 3 ++- .../include/kernels/integer_advanced_indexing.hpp | 4 +--- dpctl_ext/tensor/libtensor/source/full_ctor.cpp | 8 ++++---- dpctl_ext/tensor/libtensor/source/full_ctor.hpp | 5 ++++- .../libtensor/source/integer_advanced_indexing.cpp | 10 ++++++---- .../libtensor/source/integer_advanced_indexing.hpp | 6 +++++- dpctl_ext/tensor/libtensor/source/triul_ctor.cpp | 3 +-- dpctl_ext/tensor/libtensor/source/triul_ctor.hpp | 2 ++ dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp | 7 ++----- dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp | 3 ++- 10 files changed, 29 insertions(+), 22 deletions(-) diff --git 
a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index f43614e13766..3bc4a1d16271 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -33,8 +33,9 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include #include +#include #include diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index 1b2c79d2e2a5..d0ec5227731c 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -33,12 +33,10 @@ //===----------------------------------------------------------------------===// #pragma once -#include -#include #include -#include #include #include +#include #include "dpctl_tensor_types.hpp" #include "utils/indexing_utils.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp index e1f61be4a12a..279bb9f470bc 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -32,15 +32,15 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// -#include #include -#include -#include +#include +#include #include #include +#include + #include "dpnp4pybind11.hpp" -#include #include #include "kernels/constructors.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp index d664b2013506..43b30fc8341c 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp @@ -33,13 +33,16 @@ //===--------------------------------------------------------------------===// #pragma once -#include #include #include +#include + #include "dpnp4pybind11.hpp" #include +namespace py = pybind11; + namespace dpctl { namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp index 244acfe3955f..ed72096bff8f 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -34,21 +34,23 @@ //===----------------------------------------------------------------------===// #include -#include #include #include +#include +#include #include -#include +#include #include +#include + +#include #include "dpnp4pybind11.hpp" -#include #include #include #include "kernels/integer_advanced_indexing.hpp" #include "utils/memory_overlap.hpp" -#include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp index 57f0ddda132c..5dfbd2f04d93 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -34,13 +34,17 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include #include #include +#include + #include "dpnp4pybind11.hpp" #include +namespace py = 
pybind11; + namespace dpctl { namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp index 0890dfdb4766..f0f592c52938 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp @@ -34,8 +34,8 @@ #include // for std::copy #include // for std::size_t +#include // for std::begin, std::end #include // for std::make_shared -#include // for std::runtime_error #include // for std::pair, std::move #include // for std::vector, std::begin, std::end @@ -47,7 +47,6 @@ #include "kernels/constructors.hpp" #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" -#include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp index 08889df6227f..c61d95eef7ec 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp @@ -40,6 +40,8 @@ #include "dpnp4pybind11.hpp" #include +namespace py = pybind11; + namespace dpctl { namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp index 4558743b3c22..d7370f55e8cb 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -32,21 +32,18 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// -#include #include #include -#include #include #include +#include + #include "dpnp4pybind11.hpp" -#include #include -#include "kernels/constructors.hpp" #include "utils/output_validation.hpp" #include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" #include "zeros_ctor.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp index 51270a3443cc..ec3bce994ef6 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp @@ -33,10 +33,11 @@ //===--------------------------------------------------------------------===// #pragma once -#include #include #include +#include + #include "dpnp4pybind11.hpp" #include From b367c9fd3b4b538e132afb5838584137a6f8a25c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 12:36:24 -0800 Subject: [PATCH 33/38] Use nested namespace syntax --- .../libtensor/include/kernels/constructors.hpp | 13 ++----------- .../include/kernels/integer_advanced_indexing.hpp | 13 ++----------- dpctl_ext/tensor/libtensor/source/full_ctor.cpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/full_ctor.hpp | 10 ++-------- .../libtensor/source/integer_advanced_indexing.cpp | 10 ++-------- .../libtensor/source/integer_advanced_indexing.hpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/triul_ctor.cpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/triul_ctor.hpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp | 10 ++-------- 10 files changed, 20 insertions(+), 86 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 3bc4a1d16271..47726319b3e1 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp 
+++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -44,13 +44,7 @@ #include "utils/strided_iters.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace constructors +namespace dpctl::tensor::kernels::constructors { using dpctl::tensor::ssize_t; @@ -305,7 +299,4 @@ struct TriuGenericFactory } }; -} // namespace constructors -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::kernels::constructors diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index d0ec5227731c..7351502dbc11 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -43,13 +43,7 @@ #include "utils/offset_utils.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace indexing +namespace dpctl::tensor::kernels::indexing { using dpctl::tensor::ssize_t; @@ -419,7 +413,4 @@ struct PutClipFactory } }; -} // namespace indexing -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::kernels::indexing diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp index 279bb9f470bc..ca4a17f28f77 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -53,11 +53,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { using dpctl::utils::keep_args_alive; @@ -310,6 +306,4 @@ void init_full_ctor_dispatch_vectors(void) dvb2.populate_dispatch_vector(full_strided_dispatch_vector); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp index 43b30fc8341c..18c15de87a40 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp @@ -43,11 +43,7 @@ namespace py = pybind11; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair @@ -58,6 +54,4 @@ extern std::pair extern void init_full_ctor_dispatch_vectors(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp index ed72096bff8f..77322381d517 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -62,11 +62,7 @@ #define WRAP_MODE 0 #define CLIP_MODE 1 -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace td_ns = dpctl::tensor::type_dispatch; @@ -816,6 +812,4 @@ void init_advanced_indexing_dispatch_tables(void) dtb_putwrap.populate_dispatch_table(put_dispatch_table[WRAP_MODE]); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp 
b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp index 5dfbd2f04d93..bc0136288e1c 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -45,11 +45,7 @@ namespace py = pybind11; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair @@ -72,6 +68,4 @@ extern std::pair extern void init_advanced_indexing_dispatch_tables(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp index f0f592c52938..13e909196460 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp @@ -54,11 +54,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { using dpctl::utils::keep_args_alive; @@ -247,6 +243,4 @@ void init_triul_ctor_dispatch_vectors(void) dvb2.populate_dispatch_vector(triu_generic_dispatch_vector); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp index c61d95eef7ec..47cc4ce8892d 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp @@ -42,11 +42,7 @@ namespace py = pybind11; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair @@ -59,6 +55,4 @@ extern std::pair extern void init_triul_ctor_dispatch_vectors(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp index d7370f55e8cb..b9a2e01bea4a 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -50,11 +50,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { using dpctl::utils::keep_args_alive; @@ -160,6 +156,4 @@ void init_zeros_ctor_dispatch_vectors(void) return; } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp index ec3bce994ef6..51a1903a0f36 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp @@ -41,11 +41,7 @@ #include "dpnp4pybind11.hpp" #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair @@ -55,6 +51,4 @@ extern std::pair extern void init_zeros_ctor_dispatch_vectors(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal From 3113716a13a131dc44f819140489176be5ff7cba Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 02:50:47 -0800 Subject: [PATCH 34/38] Add missing include complex --- 
dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp | 1 + .../libtensor/include/kernels/integer_advanced_indexing.hpp | 4 +++- dpctl_ext/tensor/libtensor/source/full_ctor.cpp | 2 ++ .../tensor/libtensor/source/integer_advanced_indexing.cpp | 2 ++ dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp | 2 ++ 5 files changed, 10 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 47726319b3e1..22189ee3129c 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -34,6 +34,7 @@ #pragma once #include +#include #include #include diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index 7351502dbc11..7be2b3ea8591 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -33,11 +33,13 @@ //===----------------------------------------------------------------------===// #pragma once +#include #include -#include #include #include +#include + #include "dpctl_tensor_types.hpp" #include "utils/indexing_utils.hpp" #include "utils/offset_utils.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp index ca4a17f28f77..aef57836666e 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -32,6 +32,7 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// +#include #include #include #include @@ -41,6 +42,7 @@ #include #include "dpnp4pybind11.hpp" +#include #include #include "kernels/constructors.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp index 77322381d517..925cc2e895ed 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -34,6 +34,7 @@ //===----------------------------------------------------------------------===// #include +#include #include #include #include @@ -46,6 +47,7 @@ #include #include "dpnp4pybind11.hpp" +#include #include #include diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp index b9a2e01bea4a..2eb05e49f382 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -32,6 +32,7 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// +#include #include #include #include @@ -40,6 +41,7 @@ #include #include "dpnp4pybind11.hpp" +#include #include #include "utils/output_validation.hpp" From 978afee9115d8feaebe72c80ce3e827e13c66770 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 03:13:50 -0800 Subject: [PATCH 35/38] Add missing memory and queue checks --- .../libtensor/source/copy_as_contig.cpp | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp index 7105202fe2ff..bbee24c95d4d 100644 --- 
a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp @@ -189,6 +189,12 @@ std::pair "Execution queue is not compatible with allocation queues"); } + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + const auto &src_strides_vec = src.get_strides_vector(); if (src_nd >= 2) { @@ -314,6 +320,12 @@ std::pair "Execution queue is not compatible with allocation queues"); } + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + const auto &src_strides_vec = src.get_strides_vector(); if (src_nd >= 2) { @@ -459,6 +471,12 @@ std::pair "Execution queue is not compatible with allocation queues"); } + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + if (nelems == 0) { // nothing to do return std::make_pair(sycl::event(), sycl::event()); @@ -624,6 +642,20 @@ std::pair throw py::value_error("Unexpected destination array layout"); } + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // check that arrays do not overlap, and concurrent copying is safe. 
+ auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + int src_typenum = src.get_typenum(); int dst_typenum = dst.get_typenum(); From 19e93b99c7c2c238f1b697dfefe5b70525370819 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 18 Feb 2026 04:34:09 -0800 Subject: [PATCH 36/38] Update .gitignore to ignore .so files in dpctl_ext --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 5d2725d3186f..0cfebe53f623 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,7 @@ dpnp/**/*.cpython*.so dpnp/**/*.pyd *~ core + +# TODO: revert to `dpctl/` +# when dpnp fully migrates dpctl/tensor +dpctl_ext/**/*.cpython*.so From b111e49b784168180c835569d5dbe97958521f16 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 18 Feb 2026 04:35:23 -0800 Subject: [PATCH 37/38] Remove unused includes in tensor_ctors.cpp --- dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index be69ee1a8c7e..54d6adbc8f6e 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -32,18 +32,15 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -// #include -// #include -// #include -#include -#include -#include -// #include #include #include #include #include +#include +#include +#include + #include "dpnp4pybind11.hpp" // #include "accumulators.hpp" From c082224e07df5e4d4960112ef5ec4e5faef2a452 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 18 Feb 2026 05:40:59 -0800 Subject: [PATCH 38/38] Use Python::Module for dpctl_ext static lib to avoid libpython dependency --- dpctl_ext/tensor/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 28e7a4cb55f4..ed69b4f10cba 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -27,7 +27,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** -find_package(Python COMPONENTS Development) +find_package(Python COMPONENTS Development.Module) if(WIN32) if(${CMAKE_VERSION} VERSION_LESS "3.27") @@ -74,7 +74,7 @@ target_include_directories( # ${Dpctl_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include ) -target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Python) +target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Module) set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON) set(_py_trgts)
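For context on the final change above: CMake's `Development.Module` component exposes the `Python::Module` target, which supplies the Python headers (and, on Windows, the import library) without linking the full `libpython`, so a static helper library that is only ever folded into extension modules does not acquire a hard interpreter dependency. Linking `Python::Python` instead would force every consumer to carry that libpython link requirement, which is what the last commit removes. A minimal sketch of the linkage pattern follows; the project, target, and source names (`example_ext`, `_example_tensor_common`, `helpers.cpp`, `module.cpp`) are illustrative assumptions, not files from this series.

    # Illustrative sketch only; names are assumptions, not part of this patch series.
    cmake_minimum_required(VERSION 3.21)
    project(example_ext LANGUAGES CXX)

    # Development.Module provides Python::Module: Python headers plus, on Windows,
    # the import library, but no hard libpython link on Linux/macOS -- the
    # interpreter that loads the extension resolves those symbols at runtime.
    find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)
    find_package(pybind11 CONFIG REQUIRED)

    # Static helper library that is only ever linked into extension modules.
    add_library(_example_tensor_common STATIC helpers.cpp)
    target_link_libraries(_example_tensor_common
        PRIVATE pybind11::headers Python::Module)
    set_target_properties(_example_tensor_common
        PROPERTIES POSITION_INDEPENDENT_CODE ON)

    # The extension module itself consumes the static helper library.
    pybind11_add_module(_example_impl MODULE module.cpp)
    target_link_libraries(_example_impl PRIVATE _example_tensor_common)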