From d4fd805dda7c60a2c09983a9cd5aa1b04d9477d1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Feb 2026 09:57:04 -0800 Subject: [PATCH 01/38] Rename folder dpctl to dpctl_ext --- .../tensor/libtensor/include/kernels/alignment.hpp | 0 .../tensor/libtensor/include/kernels/dpctl_tensor_types.hpp | 0 .../libtensor/include/kernels/elementwise_functions/common.hpp | 0 .../include/kernels/elementwise_functions/common_detail.hpp | 0 .../include/kernels/elementwise_functions/logaddexp.hpp | 0 .../libtensor/include/kernels/elementwise_functions/maximum.hpp | 0 .../libtensor/include/kernels/elementwise_functions/minimum.hpp | 0 .../include/kernels/elementwise_functions/sycl_complex.hpp | 0 .../include/kernels/elementwise_functions/vec_size_util.hpp | 0 .../tensor/libtensor/include/utils/indexing_utils.hpp | 0 .../tensor/libtensor/include/utils/math_utils.hpp | 0 .../tensor/libtensor/include/utils/memory_overlap.hpp | 0 .../tensor/libtensor/include/utils/offset_utils.hpp | 0 .../tensor/libtensor/include/utils/output_validation.hpp | 0 .../tensor/libtensor/include/utils/strided_iters.hpp | 0 .../tensor/libtensor/include/utils/sycl_alloc_utils.hpp | 0 .../tensor/libtensor/include/utils/sycl_utils.hpp | 0 .../tensor/libtensor/include/utils/type_dispatch.hpp | 0 .../tensor/libtensor/include/utils/type_dispatch_building.hpp | 0 .../tensor/libtensor/include/utils/type_utils.hpp | 0 dpnp/backend/extensions/blas/CMakeLists.txt | 2 +- dpnp/backend/extensions/fft/CMakeLists.txt | 2 +- dpnp/backend/extensions/indexing/CMakeLists.txt | 2 +- dpnp/backend/extensions/lapack/CMakeLists.txt | 2 +- dpnp/backend/extensions/statistics/CMakeLists.txt | 2 +- dpnp/backend/extensions/ufunc/CMakeLists.txt | 2 +- dpnp/backend/extensions/vm/CMakeLists.txt | 2 +- dpnp/backend/extensions/window/CMakeLists.txt | 2 +- 28 files changed, 8 insertions(+), 8 deletions(-) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/alignment.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/common.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/indexing_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/math_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/memory_overlap.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/offset_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/output_validation.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/strided_iters.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/sycl_alloc_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/sycl_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/type_dispatch.hpp (100%) rename {dpctl 
=> dpctl_ext}/tensor/libtensor/include/utils/type_dispatch_building.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/type_utils.hpp (100%) diff --git a/dpctl/tensor/libtensor/include/kernels/alignment.hpp b/dpctl_ext/tensor/libtensor/include/kernels/alignment.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/alignment.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/alignment.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp b/dpctl_ext/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp diff --git a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/indexing_utils.hpp similarity index 100% rename from 
dpctl/tensor/libtensor/include/utils/indexing_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/indexing_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/math_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/math_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/math_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/math_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp b/dpctl_ext/tensor/libtensor/include/utils/memory_overlap.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/memory_overlap.hpp rename to dpctl_ext/tensor/libtensor/include/utils/memory_overlap.hpp diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/offset_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/offset_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/offset_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/output_validation.hpp b/dpctl_ext/tensor/libtensor/include/utils/output_validation.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/output_validation.hpp rename to dpctl_ext/tensor/libtensor/include/utils/output_validation.hpp diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl_ext/tensor/libtensor/include/utils/strided_iters.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/strided_iters.hpp rename to dpctl_ext/tensor/libtensor/include/utils/strided_iters.hpp diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/sycl_alloc_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/sycl_alloc_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/sycl_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/type_dispatch.hpp rename to dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp rename to dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/type_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/type_utils.hpp diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt index 0015eda84843..cbc3e31d923b 100644 --- a/dpnp/backend/extensions/blas/CMakeLists.txt +++ b/dpnp/backend/extensions/blas/CMakeLists.txt @@ -68,7 +68,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) 
target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/fft/CMakeLists.txt b/dpnp/backend/extensions/fft/CMakeLists.txt index 0569ecc8bca4..edc7bff7dce4 100644 --- a/dpnp/backend/extensions/fft/CMakeLists.txt +++ b/dpnp/backend/extensions/fft/CMakeLists.txt @@ -61,7 +61,7 @@ target_include_directories( ${python_module_name} PRIVATE ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt index c0de75ae3146..39f68ffba846 100644 --- a/dpnp/backend/extensions/indexing/CMakeLists.txt +++ b/dpnp/backend/extensions/indexing/CMakeLists.txt @@ -65,7 +65,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index 76b25c3a6d10..59499a3b28f8 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -86,7 +86,7 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt index e04279b75e49..8544e816e8d6 100644 --- a/dpnp/backend/extensions/statistics/CMakeLists.txt +++ b/dpnp/backend/extensions/statistics/CMakeLists.txt @@ -70,7 +70,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/ufunc/CMakeLists.txt b/dpnp/backend/extensions/ufunc/CMakeLists.txt index 55a750f8423f..293cef0ab326 100644 --- a/dpnp/backend/extensions/ufunc/CMakeLists.txt +++ b/dpnp/backend/extensions/ufunc/CMakeLists.txt @@ -88,7 +88,7 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt index 32d6a6765a00..551c43842af2 100644 --- a/dpnp/backend/extensions/vm/CMakeLists.txt +++ b/dpnp/backend/extensions/vm/CMakeLists.txt @@ -110,7 +110,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) 
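The include-directory switch above is what lets every dpnp extension keep its relative #include paths unchanged after the folder rename. Below is a minimal sketch, not part of this patch, of how an extension source resolves the relocated headers once ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include is on the include path; the demo function-pointer type and table name are illustrative, and only the td_ns::num_types constant is assumed from the existing type_dispatch headers.

    // Illustrative sketch: headers are still included by the same relative path,
    // now resolved through dpctl_ext/tensor/libtensor/include.
    #include "utils/type_dispatch.hpp"

    namespace td_ns = dpctl::tensor::type_dispatch;

    // A (src typenum, dst typenum) function-pointer table, the dispatch pattern
    // the copy-and-cast kernels later in this series plug into.
    typedef void (*demo_fn_ptr_t)(void);
    static demo_fn_ptr_t demo_dispatch_table[td_ns::num_types][td_ns::num_types];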
diff --git a/dpnp/backend/extensions/window/CMakeLists.txt b/dpnp/backend/extensions/window/CMakeLists.txt index 6fe04e334f42..01274317782d 100644 --- a/dpnp/backend/extensions/window/CMakeLists.txt +++ b/dpnp/backend/extensions/window/CMakeLists.txt @@ -66,7 +66,7 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) From c040713d50cd10c628990b628cb74b0a5029f99b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:04:36 -0800 Subject: [PATCH 02/38] Add simplify_iteration_space implementation to libtensor --- .../source/simplify_iteration_space.cpp | 544 ++++++++++++++++++ .../source/simplify_iteration_space.hpp | 130 +++++ 2 files changed, 674 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp new file mode 100644 index 000000000000..2526f022e0ac --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp @@ -0,0 +1,544 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "simplify_iteration_space.hpp" +#include "utils/strided_iters.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace py = pybind11; + +void simplify_iteration_space_1(int &nd, + const py::ssize_t *const &shape, + std::vector const &strides, + // output + std::vector &simplified_shape, + std::vector &simplified_strides, + py::ssize_t &offset) +{ + using dpctl::tensor::strides::simplify_iteration_stride; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + + simplified_strides.reserve(nd); + simplified_strides.insert(std::end(simplified_strides), + std::begin(strides), std::end(strides)); + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + int contracted_nd = simplify_iteration_stride( + nd, simplified_shape.data(), simplified_strides.data(), + offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + + simplified_strides.reserve(nd); + simplified_strides.push_back((strides[0] >= 0) ? strides[0] + : -strides[0]); + if ((strides[0] < 0) && (shape[0] > 1)) { + offset += (shape[0] - 1) * strides[0]; + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space(int &nd, + const py::ssize_t *const &shape, + std::vector const &src_strides, + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_two_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::begin(simplified_shape), shape, + shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src_strides.reserve(nd); + simplified_src_strides.insert(std::end(simplified_src_strides), + std::begin(src_strides), + std::end(src_strides)); + assert(simplified_src_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_two_strides( + nd, simplified_shape.data(), simplified_src_strides.data(), + simplified_dst_strides.data(), + src_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src_offset = 0; + dst_offset = 0; + // Populate 
vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if (src_strides[0] < 0 && dst_strides[0] < 0) { + simplified_src_strides.push_back(-src_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + src_offset += (shape[0] - 1) * src_strides[0]; + dst_offset += (shape[0] - 1) * dst_strides[0]; + } + } + else { + simplified_src_strides.push_back(src_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space_3( + int &nd, + const py::ssize_t *const &shape, + // src1 + std::vector const &src1_strides, + // src2 + std::vector const &src2_strides, + // dst + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_three_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src1_strides.insert(std::end(simplified_src1_strides), + std::begin(src1_strides), + std::end(src1_strides)); + assert(simplified_src1_strides.size() == static_cast(nd)); + + simplified_src2_strides.reserve(nd); + simplified_src2_strides.insert(std::end(simplified_src2_strides), + std::begin(src2_strides), + std::end(src2_strides)); + assert(simplified_src2_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_three_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); + simplified_src2_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src1_offset = 0; + src2_offset = 0; + dst_offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if ((src1_strides[0] < 0) && (src2_strides[0] < 0) && + (dst_strides[0] < 0)) { + simplified_src1_strides.push_back(-src1_strides[0]); + simplified_src2_strides.push_back(-src2_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + src1_offset += src1_strides[0] * (shape[0] - 1); + src2_offset += src2_strides[0] * (shape[0] - 1); + dst_offset += dst_strides[0] * (shape[0] - 1); + } + } + else { + 
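+            // not all three strides are negative: keep strides as-is; offsets remain 0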
simplified_src1_strides.push_back(src1_strides[0]); + simplified_src2_strides.push_back(src2_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space_4( + int &nd, + const py::ssize_t *const &shape, + // src1 + std::vector const &src1_strides, + // src2 + std::vector const &src2_strides, + // src3 + std::vector const &src3_strides, + // dst + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_src3_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &src3_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_four_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src1_strides.insert(std::end(simplified_src1_strides), + std::begin(src1_strides), + std::end(src1_strides)); + assert(simplified_src1_strides.size() == static_cast(nd)); + + simplified_src2_strides.reserve(nd); + simplified_src2_strides.insert(std::end(simplified_src2_strides), + std::begin(src2_strides), + std::end(src2_strides)); + assert(simplified_src2_strides.size() == static_cast(nd)); + + simplified_src3_strides.reserve(nd); + simplified_src3_strides.insert(std::end(simplified_src3_strides), + std::begin(src3_strides), + std::end(src3_strides)); + assert(simplified_src3_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_four_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_src3_strides.data(), + simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + src3_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); + simplified_src2_strides.resize(contracted_nd); + simplified_src3_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src1_offset = 0; + src2_offset = 0; + src3_offset = 0; + dst_offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_src3_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if ((src1_strides[0] < 0) && (src2_strides[0] < 0) && + (src3_strides[0] < 0) && (dst_strides[0] < 0)) + { + simplified_src1_strides.push_back(-src1_strides[0]); + simplified_src2_strides.push_back(-src2_strides[0]); + simplified_src3_strides.push_back(-src3_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + 
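+                // strides were negated above; move each offset to the element
+                // that was last along the axis, so the same range is traversed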
src1_offset += src1_strides[0] * (shape[0] - 1); + src2_offset += src2_strides[0] * (shape[0] - 1); + src3_offset += src3_strides[0] * (shape[0] - 1); + dst_offset += dst_strides[0] * (shape[0] - 1); + } + } + else { + simplified_src1_strides.push_back(src1_strides[0]); + simplified_src2_strides.push_back(src2_strides[0]); + simplified_src3_strides.push_back(src3_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_src3_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void compact_iteration_space(int &nd, + const py::ssize_t *const &shape, + std::vector const &strides, + // output + std::vector &compact_shape, + std::vector &compact_strides) +{ + using dpctl::tensor::strides::compact_iteration; + if (nd > 1) { + // Compact iteration space to reduce dimensionality + // and improve access pattern + compact_shape.reserve(nd); + compact_shape.insert(std::begin(compact_shape), shape, shape + nd); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.insert(std::end(compact_strides), std::begin(strides), + std::end(strides)); + assert(compact_strides.size() == static_cast(nd)); + + int contracted_nd = + compact_iteration(nd, compact_shape.data(), compact_strides.data()); + compact_shape.resize(contracted_nd); + compact_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + compact_shape.reserve(nd); + compact_shape.push_back(shape[0]); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.push_back(strides[0]); + assert(compact_strides.size() == static_cast(nd)); + } +} + +/* @brief Split shape/strides into dir1 (complementary to axis_start <= i < + * axis_end) and dir2 (along given set of axes) + */ +void split_iteration_space(const std::vector &shape_vec, + const std::vector &strides_vec, + int axis_start, + int axis_end, + std::vector &dir1_shape_vec, + std::vector &dir2_shape_vec, + std::vector &dir1_strides_vec, + std::vector &dir2_strides_vec) +{ + int nd = static_cast(shape_vec.size()); + int dir2_sz = axis_end - axis_start; + int dir1_sz = nd - dir2_sz; + + assert(dir1_sz > 0); + assert(dir2_sz > 0); + + dir1_shape_vec.resize(dir1_sz); + dir2_shape_vec.resize(dir2_sz); + + std::copy(shape_vec.begin(), shape_vec.begin() + axis_start, + dir1_shape_vec.begin()); + std::copy(shape_vec.begin() + axis_end, shape_vec.end(), + dir1_shape_vec.begin() + axis_start); + + std::copy(shape_vec.begin() + axis_start, shape_vec.begin() + axis_end, + dir2_shape_vec.begin()); + + dir1_strides_vec.resize(dir1_sz); + dir2_strides_vec.resize(dir2_sz); + + std::copy(strides_vec.begin(), strides_vec.begin() + axis_start, + dir1_strides_vec.begin()); + std::copy(strides_vec.begin() + axis_end, strides_vec.end(), + dir1_strides_vec.begin() + axis_start); + + std::copy(strides_vec.begin() + axis_start, strides_vec.begin() + axis_end, + dir2_strides_vec.begin()); + + return; +} + +py::ssize_t _ravel_multi_index_c(std::vector const &mi, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + if (nd != mi.size()) { + throw py::value_error( + "Multi-index and shape vectors must have the same length."); + } + + py::ssize_t flat_index = 0; + py::ssize_t s = 1; + for (std::size_t i = 0; i < nd; ++i) { + flat_index += mi.at(nd - 1 - i) * s; + s *= 
shape.at(nd - 1 - i); + } + + return flat_index; +} + +py::ssize_t _ravel_multi_index_f(std::vector const &mi, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + if (nd != mi.size()) { + throw py::value_error( + "Multi-index and shape vectors must have the same length."); + } + + py::ssize_t flat_index = 0; + py::ssize_t s = 1; + for (std::size_t i = 0; i < nd; ++i) { + flat_index += mi.at(i) * s; + s *= shape.at(i); + } + + return flat_index; +} + +std::vector _unravel_index_c(py::ssize_t flat_index, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + std::vector mi; + mi.resize(nd); + + py::ssize_t i_ = flat_index; + for (std::size_t dim = 0; dim + 1 < nd; ++dim) { + const py::ssize_t si = shape[nd - 1 - dim]; + const py::ssize_t q = i_ / si; + const py::ssize_t r = (i_ - q * si); + mi[nd - 1 - dim] = r; + i_ = q; + } + if (nd) { + mi[0] = i_; + } + return mi; +} + +std::vector _unravel_index_f(py::ssize_t flat_index, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + std::vector mi; + mi.resize(nd); + + py::ssize_t i_ = flat_index; + for (std::size_t dim = 0; dim + 1 < nd; ++dim) { + const py::ssize_t si = shape[dim]; + const py::ssize_t q = i_ / si; + const py::ssize_t r = (i_ - q * si); + mi[dim] = r; + i_ = q; + } + if (nd) { + mi[nd - 1] = i_; + } + return mi; +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp new file mode 100644 index 000000000000..d3448ee1f5fd --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp @@ -0,0 +1,130 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace py = pybind11; + +void simplify_iteration_space_1(int &, + const py::ssize_t *const &, + std::vector const &, + std::vector &, + std::vector &, + py::ssize_t &); + +void simplify_iteration_space(int &, + const py::ssize_t *const &, + std::vector const &, + std::vector const &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &); + +void simplify_iteration_space_3(int &, + const py::ssize_t *const &, + // src1 + std::vector const &, + // src2 + std::vector const &, + // dst + std::vector const &, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + +void simplify_iteration_space_4(int &, + const py::ssize_t *const &, + // src1 + std::vector const &, + // src2 + std::vector const &, + // src3 + std::vector const &, + // dst + std::vector const &, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + +void compact_iteration_space(int &, + const py::ssize_t *const &, + std::vector const &, + // output + std::vector &, + std::vector &); + +void split_iteration_space(const std::vector &, + const std::vector &, + int, + int, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &); + +py::ssize_t _ravel_multi_index_c(std::vector const &, + std::vector const &); +py::ssize_t _ravel_multi_index_f(std::vector const &, + std::vector const &); +std::vector _unravel_index_c(py::ssize_t, + std::vector const &); +std::vector _unravel_index_f(py::ssize_t, + std::vector const &); +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From 14b466facfe6b23f92113ccc2dbb224e2727bf3c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:14:43 -0800 Subject: [PATCH 03/38] Extend codespell ignore list for libtensor --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index cdf592535d11..67fb75cb5f54 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314'] [tool.codespell] builtin = "clear,rare,informal,names" check-filenames = true -ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT" +ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT" quiet-level = 3 [tool.coverage.report] From dcc421bc61c36549d3e6865927f495abab15d078 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:15:09 -0800 Subject: [PATCH 04/38] Add copy_and_cast kernels to libtensor --- .../include/kernels/copy_and_cast.hpp | 1288 +++++++++++++++++ .../include/kernels/copy_as_contiguous.hpp | 655 +++++++++ .../libtensor/source/copy_as_contig.cpp | 758 ++++++++++ .../libtensor/source/copy_as_contig.hpp | 61 + 4 files changed, 2762 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp create mode 100644 
dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp new file mode 100644 index 000000000000..a07d311a7fcb --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -0,0 +1,1288 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor copying and value casting. 
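+/// (generic strided, contiguous, from-host, and copy-for-reshape variants)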
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copy_and_cast +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class copy_cast_generic_kernel; + +template +class copy_cast_contig_kernel; + +template +class copy_cast_from_host_kernel; + +template +class copy_cast_from_host_contig_kernel; + +template +class Caster +{ +public: + Caster() = default; + dstTy operator()(const srcTy &src) const + { + using dpctl::tensor::type_utils::convert_impl; + return convert_impl(src); + } +}; + +template +class GenericCopyFunctor +{ +private: + const srcT *src_ = nullptr; + dstT *dst_ = nullptr; + IndexerT indexer_; + +public: + GenericCopyFunctor(const srcT *src_p, dstT *dst_p, const IndexerT &indexer) + : src_(src_p), dst_(dst_p), indexer_(indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const auto &offsets = indexer_(static_cast(wiid.get(0))); + const ssize_t &src_offset = offsets.get_first_offset(); + const ssize_t &dst_offset = offsets.get_second_offset(); + + static constexpr CastFnT fn{}; + dst_[dst_offset] = fn(src_[src_offset]); + } +}; + +/*! + @defgroup CopyAndCastKernels + */ + +/*! + * @brief Function pointer type for generic array cast and copying function. + */ +typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +/*! + * @brief Generic function to copy `nelems` elements from `src` usm_ndarray to + `dst` usm_ndarray while casting from `srcTy` to `dstTy`. + + Both arrays have array dimensionality specified via argument `nd`. The + `shape_and_strides` is kernel accessible USM array of length `3*nd`, where the + first `nd` elements encode common shape, second `nd` elements contain strides + of `src` array, and the trailing `nd` elements contain strides of `dst` array. + `src_p` and `dst_p` represent pointers into respective arrays, but the start of + iteration begins at offset of `src_offset` elements for `src` array and at + offset `dst_offset` elements for `dst` array. Kernel is submitted to sycl queue + `q` with events `depends` and `additional_depends` as dependencies. + + @param q Sycl queue to which the kernel is submitted. + @param nelems Number of elements to cast and copy. + @param nd Array dimensionality, i.e. number of indices needed to + identify an element of each array. + @param shape_and_strides Kernel accessible USM pointer to packed shape and + strides. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of + elements of source array from `src_p`. 
+ @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of + elements of destination array from `dst_p`. + @param depends List of events to wait for before starting computations, if + any. + @param additional_depends Additional list of events to wait for before + starting computations, if any. + + @return Event to wait on to ensure that computation completes. + @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_generic_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + const TwoOffsets_StridedIndexer indexer{nd, src_offset, dst_offset, + shape_and_strides}; + const srcTy *src_tp = reinterpret_cast(src_p); + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyFunctor, + TwoOffsets_StridedIndexer>(src_tp, dst_tp, + indexer)); + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get generic function pointer of type `fnT` for given source + * data type `S` and destination data type `D`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCastGenericFactory +{ + fnT get() + { + fnT f = copy_and_cast_generic_impl; + return f; + } +}; + +// Specialization of copy_and_cast for contiguous arrays + +template +class ContigCopyFunctor +{ +private: + std::size_t nelems; + const srcT *src_p = nullptr; + dstT *dst_p = nullptr; + +public: + ContigCopyFunctor(const std::size_t nelems_, + const srcT *src_p_, + dstT *dst_p_) + : nelems(nelems_), src_p(src_p_), dst_p(dst_p_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr CastFnT fn{}; + + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + + using dpctl::tensor::type_utils::is_complex_v; + if constexpr (!enable_sg_loadstore || is_complex_v || + is_complex_v) { + std::uint16_t sgSize = ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + + // start = (gid / sgSize) * elems_per_sg + (gid % sgSize) + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + dst_p[offset] = fn(src_p[offset]); + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems) { + sycl::vec dst_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto src_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&src_p[offset]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[offset]); + + const sycl::vec src_vec = + 
sub_group_load(sg, src_multi_ptr); +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; k++) { + dst_vec[k] = fn(src_vec[k]); + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t start = base + sg.get_local_id()[0]; + for (std::size_t k = start; k < nelems; k += sgSize) { + dst_p[k] = fn(src_p[k]); + } + } + } + } +}; + +/*! + * @brief Function pointer type for contiguous array cast and copy function. + */ +typedef sycl::event (*copy_and_cast_contig_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const std::vector &); + +/*! + * @brief Function to copy `nelems` elements from contiguous `src` usm_ndarray + to contiguous `dst` usm_ndarray while casting from `srcTy` to `dstTy`. + + Both arrays have the same number of elements `nelems`. + `src_cp` and `dst_cp` represent char pointers to the start of respective + arrays. Kernel is submitted to sycl queue `q` with events `depends` as + dependencies. + + @param q Sycl queue to which the kernel is submitted. + @param nelems Number of elements to cast and copy. + @param src_p Kernel accessible USM pointer for the source array + @param dst_p Kernel accessible USM pointer for the destination array + @param depends List of events to wait for before starting computations, if + any. + + @return Event to wait on to ensure that computation completes. + @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_contig_impl(sycl::queue &q, + std::size_t nelems, + const char *src_cp, + char *dst_cp, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const srcTy *src_tp = reinterpret_cast(src_cp); + dstTy *dst_tp = reinterpret_cast(dst_cp); + + std::size_t lws = 64; + static constexpr std::uint32_t vec_sz = 4; + static constexpr std::uint32_t n_vecs = 2; + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + if (is_aligned(src_cp) && + is_aligned(dst_cp)) + { + static constexpr bool enable_sg_loadstore = true; + using KernelName = + copy_cast_contig_kernel; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigCopyFunctor, vec_sz, + n_vecs, enable_sg_loadstore>(nelems, src_tp, + dst_tp)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using InnerKernelName = + copy_cast_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigCopyFunctor, vec_sz, + n_vecs, disable_sg_loadstore>(nelems, src_tp, + dst_tp)); + } + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get specialized function pointer for casting and copying + * contiguous arrays. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCastContigFactory +{ + fnT get() + { + fnT f = copy_and_cast_contig_impl; + return f; + } +}; + +// Specialization of copy_and_cast for 1D arrays + +/*! + * @brief Factory to get function pointer for casting and copying 1D arrays. + * @ingroup CopyAndCastKernels + */ +typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const std::array &, + const std::array &, + const std::array &, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +/*! 
+ * @brief Factory to get function pointer for casting and copying 2D arrays. + * @ingroup CopyAndCastKernels + */ +typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const std::array &, + const std::array &, + const std::array &, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +/*! + * @brief Specialized for given array dimension function to copy `nelems` + elements from `src` usm_ndarray to `dst` usm_ndarray while casting from `srcTy` + to `dstTy`. + + Both arrays have array dimensionality known at compile time and specified in + template parameters `nd`. Arrays' shape and strides are provided as + `std::array`. `src_p` and `dst_p` represent pointers into respective arrays, + but the start of iteration begins at offset of `src_offset` elements for `src` + array and at offset `dst_offset` elements for `dst` array. Kernel is submitted + to sycl queue `q` with events `depends` as dependencies. + + @param q The queue where the routine should be executed. + @param nelems Number of elements to cast and copy. + @param shape Common shape of the arrays. + @param src_strides Strides of the source array. + @param dst_strides Strides of the destination array. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of elements + of the source array from `src_p`. + @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of elements + of the destination array from `src_p`. + @param depends List of events to wait for before starting computations, if + any. + + @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_nd_specialized_impl( + sycl::queue &q, + std::size_t nelems, + const std::array &shape, + const std::array &src_strides, + const std::array &dst_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + using IndexerT = TwoOffsets_FixedDimStridedIndexer; + const IndexerT indexer{shape, src_strides, dst_strides, src_offset, + dst_offset}; + const srcTy *src_tp = reinterpret_cast(src_p); + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.depends_on(depends); + cgh.parallel_for< + class copy_cast_generic_kernel>( + sycl::range<1>(nelems), + GenericCopyFunctor, IndexerT>( + src_tp, dst_tp, indexer)); + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get 1D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCast1DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +/*! + * @brief Factory to get 2D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. 
+ * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCast2DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +// ====================== Copying from host to USM + +template +class GenericCopyFromHostFunctor +{ +private: + AccessorT src_acc_; + dstTy *dst_ = nullptr; + IndexerT indexer_; + +public: + GenericCopyFromHostFunctor(const AccessorT &src_acc, + dstTy *dst_p, + const IndexerT &indexer) + : src_acc_(src_acc), dst_(dst_p), indexer_(indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const auto &offsets = indexer_(static_cast(wiid.get(0))); + const ssize_t &src_offset = offsets.get_first_offset(); + const ssize_t &dst_offset = offsets.get_second_offset(); + + CastFnT fn{}; + dst_[dst_offset] = fn(src_acc_[src_offset]); + } +}; + +typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + ssize_t, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +/*! + * @brief Function to copy from NumPy's ndarray with elements of type `srcTy` + * into usm_ndarray with elements of type `srcTy`. + * + * Function to cast and copy elements from numpy.ndarray specified by typeless + * `host_src_p` and the `src_offset` given in the number of array elements. + * Arrays' metadata are given in packed USM vector of length `3*nd` whose first + * `nd` elements contain arrays' shape, next `nd` elements specify source + * strides in elements (not bytes), and trailing `nd` elements specify + * destination array strides. Kernel dependencies are given by two vectors of + * events: `depends` and `additional_depends`. The function execution is + * complete at the return. + * + * @param q The queue where the routine should be executed. + * @param nelems Number of elements to cast and copy. + * @param nd The dimensionality of arrays + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides. + * @param host_src_p Host (not USM allocated) pointer associated with the + * source array. + * @param src_offset Offset to the beginning of iteration in number of elements + * of the source array from `host_src_p`. + * @param src_min_nelem_offset Smallest value of offset relative to + * `host_src_p` in number of elements attained while iterating over elements of + * the source array. + * @param src_max_nelem_offset Largest value of offset relative to `host_src_p` + * in number of elements attained while iterating over elements of the source + * array. + * @param dst_p USM pointer associated with the destination array. + * @param dst_offset Offset to the beginning of iteration in number of elements + * of the destination array from `dst_p`. + * @param depends List of events to wait for before starting computations, if + * any. + * @param additional_depends List of additional events to wait for before + * starting computations, if any. 
+ * + * @ingroup CopyAndCastKernels + */ +template +void copy_and_cast_from_host_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *host_src_p, + ssize_t src_offset, + ssize_t src_min_nelem_offset, + ssize_t src_max_nelem_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1; + + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::buffer npy_buf( + reinterpret_cast(host_src_p) + src_min_nelem_offset, + sycl::range<1>(nelems_range), {sycl::property::buffer::use_host_ptr{}}); + + sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only); + + const TwoOffsets_StridedIndexer indexer{ + nd, src_offset - src_min_nelem_offset, dst_offset, + const_cast(shape_and_strides)}; + + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyFromHostFunctor, + TwoOffsets_StridedIndexer>( + npy_acc, dst_tp, indexer)); + }); + + // perform explicit synchronization. Implicit synchronization would be + // performed by sycl::buffer destructor. + copy_and_cast_from_host_ev.wait(); + + return; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given NumPy array + * source data type `S` and destination data type `D`. + * @defgroup CopyAndCastKernels + */ +template +struct CopyAndCastFromHostFactory +{ + fnT get() + { + fnT f = copy_and_cast_from_host_impl; + return f; + } +}; + +typedef void (*copy_and_cast_from_host_contig_blocking_fn_ptr_t)( + sycl::queue &, + std::size_t, /* nelems */ + const char *, /* src_pointer */ + ssize_t, /* src_offset */ + char *, /* dst_pointer */ + ssize_t, /* dst_offset */ + const std::vector &); + +/*! + * @brief Function to copy from NumPy's ndarray with elements of type `srcTy` + * into usm_ndarray with elements of type `srcTy` for contiguous arrays. + * + * Function to cast and copy elements from numpy.ndarray specified by typeless + * `host_src_p` and the `src_offset` given in the number of array elements. + * Kernel dependencies are given by two vectors of + * events: `depends` and `additional_depends`. The function execution is + * complete at the return. + * + * @param q The queue where the routine should be executed. + * @param nelems Number of elements to cast and copy. + * @param src_stride The stride of source array in elements + * @param dst_stride The stride of destimation array in elements + * @param host_src_p Host (not USM allocated) pointer associated with the + * source array. + * @param src_offset Offset to the beginning of iteration in number of elements + * of the source array from `host_src_p`. + * @param dst_p USM pointer associated with the destination array. + * @param dst_offset Offset to the beginning of iteration in number of elements + * of the destination array from `dst_p`. + * @param depends List of events to wait for before starting computations, if + * any. 
+ *
+ * @ingroup CopyAndCastKernels
+ */
+template <typename dstTy, typename srcTy>
+void copy_and_cast_from_host_contig_impl(
+    sycl::queue &q,
+    std::size_t nelems,
+    const char *host_src_p,
+    ssize_t src_offset,
+    char *dst_p,
+    ssize_t dst_offset,
+    const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
+    dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
+
+    sycl::buffer<srcTy, 1> npy_buf(
+        reinterpret_cast<const srcTy *>(host_src_p) + src_offset,
+        sycl::range<1>(nelems), {sycl::property::buffer::use_host_ptr{}});
+
+    sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only);
+
+        using IndexerT = TwoOffsets_CombinedIndexer<NoOpIndexer, NoOpIndexer>;
+        static constexpr NoOpIndexer src_indexer{};
+        static constexpr NoOpIndexer dst_indexer{};
+        static constexpr IndexerT indexer{src_indexer, dst_indexer};
+
+        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p) + dst_offset;
+
+        cgh.parallel_for<
+            copy_cast_from_host_contig_kernel<srcTy, dstTy>>(
+            sycl::range<1>(nelems),
+            GenericCopyFromHostFunctor<decltype(npy_acc), dstTy,
+                                       Caster<srcTy, dstTy>, IndexerT>(
+                npy_acc, dst_tp, indexer));
+    });
+
+    // perform explicit synchronization. Implicit synchronization would be
+    // performed by sycl::buffer destructor.
+    copy_and_cast_from_host_ev.wait();
+
+    return;
+}
+
+/*!
+ * @brief Factory to get function pointer of type `fnT` for given NumPy array
+ * source data type `S` and destination data type `D`.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename D, typename S>
+struct CopyAndCastFromHostContigFactory
+{
+    fnT get()
+    {
+        fnT f = copy_and_cast_from_host_contig_impl<D, S>;
+        return f;
+    }
+};
+
+// =============== Copying for reshape ================== //
+
+template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
+class copy_for_reshape_generic_kernel;
+
+template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
+class GenericCopyForReshapeFunctor
+{
+private:
+    const Ty *src_p = nullptr;
+    Ty *dst_p = nullptr;
+    SrcIndexerT src_indexer_;
+    DstIndexerT dst_indexer_;
+
+public:
+    GenericCopyForReshapeFunctor(const char *src_ptr,
+                                 char *dst_ptr,
+                                 const SrcIndexerT &src_indexer,
+                                 const DstIndexerT &dst_indexer)
+        : src_p(reinterpret_cast<const Ty *>(src_ptr)),
+          dst_p(reinterpret_cast<Ty *>(dst_ptr)), src_indexer_(src_indexer),
+          dst_indexer_(dst_indexer)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        const ssize_t src_offset = src_indexer_(wiid.get(0));
+        const ssize_t dst_offset = dst_indexer_(wiid.get(0));
+
+        dst_p[dst_offset] = src_p[src_offset];
+    }
+};
+
+// define function type
+typedef sycl::event (*copy_for_reshape_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,     // num_elements
+    int,             // src_nd
+    int,             // dst_nd
+    const ssize_t *, // packed shapes and strides
+    const char *,    // src_data_ptr
+    char *,          // dst_data_ptr
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to copy content of array while reshaping.
+ *
+ * Submits a kernel to perform a copy `dst[unravel_index(i,
+ * dst.shape)] = src[unravel_index(i, src.shape)]`.
+ *
+ * @param q The execution queue where kernel is submitted.
+ * @param nelems The number of elements to copy
+ * @param src_nd Array dimension of the source array
+ * @param dst_nd Array dimension of the destination array
+ * @param packed_shapes_and_strides Kernel accessible USM array of size
+ * `2*src_nd + 2*dst_nd` with content `[src_shape, src_strides, dst_shape,
+ * dst_strides]`.
+ * @param src_p Typeless USM pointer to the buffer of the source array
+ * @param dst_p Typeless USM pointer to the buffer of the destination array
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ * + * @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event + copy_for_reshape_generic_impl(sycl::queue &q, + std::size_t nelems, + int src_nd, + int dst_nd, + const ssize_t *packed_shapes_and_strides, + const char *src_p, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides: + // USM array of size 2*(src_nd + dst_nd) + // [ src_shape; src_strides; dst_shape; dst_strides ] + + const ssize_t *src_shape_and_strides = + const_cast(packed_shapes_and_strides); + + const ssize_t *dst_shape_and_strides = const_cast( + packed_shapes_and_strides + (2 * src_nd)); + + const StridedIndexer src_indexer{src_nd, 0, src_shape_and_strides}; + const StridedIndexer dst_indexer{dst_nd, 0, dst_shape_and_strides}; + + using KernelName = + copy_for_reshape_generic_kernel; + + cgh.parallel_for( + sycl::range<1>(nelems), + GenericCopyForReshapeFunctor( + src_p, dst_p, src_indexer, dst_indexer)); + }); + + return copy_for_reshape_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForReshapeGenericFactory +{ + fnT get() + { + fnT f = copy_for_reshape_generic_impl; + return f; + } +}; + +// ================== Copying for roll ================== // + +/*! @brief Functor to cyclically roll global_id to the left */ +struct LeftRolled1DTransformer +{ + LeftRolled1DTransformer(std::size_t offset, std::size_t size) + : offset_(offset), size_(size) + { + } + + std::size_t operator()(std::size_t gid) const + { + const std::size_t shifted_gid = + ((gid < offset_) ? gid + size_ - offset_ : gid - offset_); + return shifted_gid; + } + +private: + std::size_t offset_ = 0; + std::size_t size_ = 1; +}; + +/*! @brief Indexer functor to compose indexer and transformer */ +template +struct CompositionIndexer +{ + CompositionIndexer(IndexerT f, TransformerT t) : f_(f), t_(t) {} + + auto operator()(std::size_t gid) const + { + return f_(t_(gid)); + } + +private: + IndexerT f_; + TransformerT t_; +}; + +/*! 
@brief Indexer functor to find offset for nd-shifted indices lifted from + * iteration id */ +struct RolledNDIndexer +{ + RolledNDIndexer(int nd, + const ssize_t *shape, + const ssize_t *strides, + const ssize_t *ndshifts, + ssize_t starting_offset) + : nd_(nd), shape_(shape), strides_(strides), ndshifts_(ndshifts), + starting_offset_(starting_offset) + { + } + + ssize_t operator()(std::size_t gid) const + { + return compute_offset(gid); + } + +private: + int nd_ = -1; + const ssize_t *shape_ = nullptr; + const ssize_t *strides_ = nullptr; + const ssize_t *ndshifts_ = nullptr; + ssize_t starting_offset_ = 0; + + ssize_t compute_offset(ssize_t gid) const + { + using dpctl::tensor::strides::CIndexer_vector; + + CIndexer_vector _ind(nd_); + ssize_t relative_offset_(0); + _ind.get_left_rolled_displacement( + gid, + shape_, // shape ptr + strides_, // strides ptr + ndshifts_, // shifts ptr + relative_offset_); + return starting_offset_ + relative_offset_; + } +}; + +template +class copy_for_roll_strided_kernel; + +template +class StridedCopyForRollFunctor +{ +private: + const Ty *src_p = nullptr; + Ty *dst_p = nullptr; + SrcIndexerT src_indexer_; + DstIndexerT dst_indexer_; + +public: + StridedCopyForRollFunctor(const Ty *src_ptr, + Ty *dst_ptr, + const SrcIndexerT &src_indexer, + const DstIndexerT &dst_indexer) + : src_p(src_ptr), dst_p(dst_ptr), src_indexer_(src_indexer), + dst_indexer_(dst_indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const std::size_t gid = wiid.get(0); + + const ssize_t src_offset = src_indexer_(gid); + const ssize_t dst_offset = dst_indexer_(gid); + + dst_p[dst_offset] = src_p[src_offset]; + } +}; + +// define function type +typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)( + sycl::queue &, + std::size_t, // shift + std::size_t, // num_elements + int, // common_nd + const ssize_t *, // packed shapes and strides + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset + const std::vector &); + +/*! + * @brief Function to copy content of array with a shift. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_undex(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing, must be non-negative. + * @param nelems The number of elements to copy + * @param nd Array dimensionality of the destination and source arrays + * @param packed_shapes_and_strides Kernel accessible USM array + * of size `3*nd` with content `[common_shape, src_strides, dst_strides]`. + * @param src_p Typeless USM pointer to the buffer of the source array + * @param src_offset Displacement of first element of src relative src_p in + * elements + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param dst_offset Displacement of first element of dst relative dst_p in + * elements + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_for_roll_strided_impl(sycl::queue &q, + std::size_t shift, + std::size_t nelems, + int nd, + const ssize_t *packed_shapes_and_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides: + // USM array of size 3 * nd + // [ common_shape; src_strides; dst_strides ] + + const StridedIndexer src_indexer{nd, src_offset, + packed_shapes_and_strides}; + const LeftRolled1DTransformer left_roll_transformer{shift, nelems}; + + using CompositeIndexerT = + CompositionIndexer; + + const CompositeIndexerT rolled_src_indexer(src_indexer, + left_roll_transformer); + + UnpackedStridedIndexer dst_indexer{nd, dst_offset, + packed_shapes_and_strides, + packed_shapes_and_strides + 2 * nd}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + src_tp, dst_tp, rolled_src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +// define function type +typedef sycl::event (*copy_for_roll_contig_fn_ptr_t)( + sycl::queue &, + std::size_t, // shift + std::size_t, // num_elements + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset + const std::vector &); + +template +class copy_for_roll_contig_kernel; + +/*! + * @brief Function to copy content of array with a shift. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_undex(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing, must be non-negative. + * @param nelems The number of elements to copy + * @param src_p Typeless USM pointer to the buffer of the source array + * @param src_offset Displacement of the start of array src relative src_p in + * elements + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param dst_offset Displacement of the start of array dst relative dst_p in + * elements + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_for_roll_contig_impl(sycl::queue &q, + std::size_t shift, + std::size_t nelems, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + static constexpr NoOpIndexer src_indexer{}; + const LeftRolled1DTransformer roller{shift, nelems}; + + const CompositionIndexer + left_rolled_src_indexer{src_indexer, roller}; + static constexpr NoOpIndexer dst_indexer{}; + + using KernelName = copy_for_roll_contig_kernel; + + const Ty *src_tp = reinterpret_cast(src_p) + src_offset; + Ty *dst_tp = reinterpret_cast(dst_p) + dst_offset; + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor< + Ty, CompositionIndexer, + NoOpIndexer>(src_tp, dst_tp, left_rolled_src_indexer, + dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! 
+ * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollStridedFactory +{ + fnT get() + { + fnT f = copy_for_roll_strided_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollContigFactory +{ + fnT get() + { + fnT f = copy_for_roll_contig_impl; + return f; + } +}; + +template +class copy_for_roll_ndshift_strided_kernel; + +// define function type +typedef sycl::event (*copy_for_roll_ndshift_strided_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + int, // common_nd + const ssize_t *, // packed shape, strides, shifts + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset + const std::vector &); + +template +sycl::event copy_for_roll_ndshift_strided_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *packed_shapes_and_strides_and_shifts, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides_and_shifts: + // USM array of size 4 * nd + // [ common_shape; src_strides; dst_strides; shifts ] + + const ssize_t *shape_ptr = packed_shapes_and_strides_and_shifts; + const ssize_t *src_strides_ptr = + packed_shapes_and_strides_and_shifts + nd; + const ssize_t *dst_strides_ptr = + packed_shapes_and_strides_and_shifts + 2 * nd; + const ssize_t *shifts_ptr = + packed_shapes_and_strides_and_shifts + 3 * nd; + + const RolledNDIndexer src_indexer{nd, shape_ptr, src_strides_ptr, + shifts_ptr, src_offset}; + + const UnpackedStridedIndexer dst_indexer{nd, dst_offset, shape_ptr, + dst_strides_ptr}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + src_tp, dst_tp, src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollNDShiftFactory +{ + fnT get() + { + fnT f = copy_for_roll_ndshift_strided_impl; + return f; + } +}; + +} // namespace copy_and_cast +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp new file mode 100644 index 000000000000..b4f367448758 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -0,0 +1,655 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor copying and value casting. +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copy_as_contig +{ + +using dpctl::tensor::ssize_t; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class CopyAsCContigFunctor +{ +private: + std::size_t nelems; + const T *src_p = nullptr; + T *dst_p = nullptr; + IndexerT src_indexer; + +public: + CopyAsCContigFunctor(std::size_t n, + const T *src_, + T *dst_, + const IndexerT &src_indexer_) + : nelems(n), src_p(src_), dst_p(dst_), src_indexer(src_indexer_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static_assert(vec_sz > 0); + static_assert(n_vecs > 0); + + static constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs; + + using dpctl::tensor::type_utils::is_complex; + if constexpr (!enable_sg_loadstore || is_complex::value) { + const std::uint16_t sgSize = + ndit.get_sub_group().get_max_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + + // start = (gid / sgSize) * sgSize * elems_per_wi + (gid % sgSize) + // gid % sgSize == gid - (gid / sgSize) * sgSize + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + elems_per_sg); + + for (std::size_t offset = start; offset < end; offset += sgSize) { + auto src_offset = src_indexer(offset); + dst_p[offset] = src_p[src_offset]; + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + const std::uint16_t elems_per_sg = elems_per_wi * sgSize; + + if (base + elems_per_sg < nelems) { +#pragma unroll 
+ for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + // it == vec_id * vec_sz, for 0 <= vec_id < n_vecs + const std::size_t block_start_id = base + it * sgSize; + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[block_start_id]); + + const std::size_t elem_id0 = + block_start_id + sg.get_local_id(); + sycl::vec dst_vec; +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; ++k) { + const std::size_t elem_id = elem_id0 + k * sgSize; + const ssize_t src_offset = src_indexer(elem_id); + dst_vec[k] = src_p[src_offset]; + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + const std::size_t k0 = base + lane_id; + for (std::size_t k = k0; k < nelems; k += sgSize) { + const ssize_t src_offset = src_indexer(k); + dst_p[k] = src_p[src_offset]; + } + } + } + } +}; + +template +sycl::event submit_c_contiguous_copy(sycl::queue &exec_q, + std::size_t nelems, + const T *src, + T *dst, + const IndexerT &src_indexer, + const std::vector &depends) +{ + static_assert(vec_sz > 0); + static_assert(n_vecs > 0); + + static constexpr std::size_t preferred_lws = 256; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + const std::size_t lws = + ((preferred_lws + max_sg_size - 1) / max_sg_size) * max_sg_size; + + static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; + + const std::size_t nelems_per_group = nelems_per_wi * lws; + const std::size_t n_groups = + (nelems + nelems_per_group - 1) / (nelems_per_group); + + sycl::event copy_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.use_kernel_bundle(kb); + + const sycl::range<1> gRange{n_groups * lws}; + const sycl::range<1> lRange{lws}; + + cgh.parallel_for( + sycl::nd_range<1>(gRange, lRange), + CopyAsCContigFunctor( + nelems, src, dst, src_indexer)); + }); + return copy_ev; +} + +template +class as_contig_krn; + +template +sycl::event + as_c_contiguous_array_generic_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *src_p, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + using IndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const IndexerT src_indexer(nd, ssize_t(0), shape_and_strides); + + static constexpr std::uint8_t vec_sz = 4u; + static constexpr std::uint8_t n_vecs = 2u; + + using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; + using dpctl::tensor::kernels::alignment_utils::is_aligned; + using dpctl::tensor::kernels::alignment_utils::required_alignment; + + sycl::event copy_ev; + if (is_aligned(dst_p)) { + static constexpr bool enable_sg_load = true; + using KernelName = + as_contig_krn; + copy_ev = submit_c_contiguous_copy( + exec_q, nelems, src_tp, dst_tp, src_indexer, depends); + } + else { + static constexpr bool disable_sg_load = false; + using InnerKernelName = + as_contig_krn; + using KernelName = disabled_sg_loadstore_wrapper_krn; + copy_ev = submit_c_contiguous_copy( + exec_q, nelems, src_tp, 
dst_tp, src_indexer, depends); + } + + return copy_ev; +} + +typedef sycl::event (*as_c_contiguous_array_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + char *, + const std::vector &); + +template +struct AsCContigFactory +{ + fnT get() + { + return as_c_contiguous_array_generic_impl; + } +}; + +template +class as_contig_batch_of_square_matrices_krn; + +namespace detail +{ +/*! @brief batch of matrices (n, n), source strides (1, src_ld), destination + strides (dst_ld, 1) src and destination arrays must be disjoint memory blocks + to avoid race condition + */ +template +sycl::event as_c_contiguous_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + const BatchIndexerT &batch_two_offsets_indexer, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + static constexpr std::uint16_t private_tile_size = 4; + static constexpr std::uint16_t n_lines = 2; + static constexpr std::uint16_t block_size = + n_lines * private_tile_size * private_tile_size; + + static constexpr std::uint16_t lws0 = block_size; + static constexpr std::uint16_t lws1 = n_lines; + static constexpr std::uint16_t nelems_per_wi = (block_size / lws1); + + static_assert(nelems_per_wi * lws1 == block_size); + static_assert(nelems_per_wi == private_tile_size * private_tile_size); + + static constexpr std::uint32_t lws = lws0 * lws1; + + const std::size_t n_tiles = (n + block_size - 1) / block_size; + + const ssize_t src_stride = src_ld; + const ssize_t dst_stride = dst_ld; + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{batch_nelems * n_tiles * n_tiles * lws}; + + sycl::nd_range<1> ndRange{gRange, lRange}; + + using KernelName = + as_contig_batch_of_square_matrices_krn; + + sycl::event e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::local_accessor local_block(block_size * block_size, cgh); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> nd_it) { + // 1. 
Read block from source array into SLM + const std::uint32_t lid_lin = nd_it.get_local_linear_id(); + const std::size_t gr_id_lin = nd_it.get_group_linear_id(); + + const std::size_t batch_id = gr_id_lin / (n_tiles * n_tiles); + const std::size_t rem = gr_id_lin - batch_id * (n_tiles * n_tiles); + + const auto &batch_two_offsets = batch_two_offsets_indexer(batch_id); + const auto &src_batch_offset = batch_two_offsets.get_first_offset(); + const auto &dst_batch_offset = + batch_two_offsets.get_second_offset(); + + // Block id + /* 0 <= src_gr_i1 < n_groups_n1 */ + const std::size_t src_tile_i1 = rem / n_tiles; + /* 0 <= src_gr_i0 < n_groups_n0 */ + const std::size_t src_tile_i0 = rem - src_tile_i1 * n_tiles; + + // ID of element within the block + /* 0 <= src_i1 < lws1 */ + const std::uint32_t src_i1 = lid_lin / lws0; + /* 0 <= src_i0 < lws0 */ + const std::uint32_t src_i0 = lid_lin - src_i1 * lws0; + + // Matrix element ID + const std::size_t src_tile_start0 = src_tile_i0 * block_size; + const std::size_t src_tile_start1 = src_tile_i1 * block_size; + const std::size_t src_gid0 = (src_tile_start0 + src_i0); + const std::size_t src_gid1 = (src_tile_start1 + src_i1); + + // src_offset = src_gid0 * 1 + (src_gid1 + pr_id * lws1) * + // src_stride + const std::size_t src_offset0 = + src_batch_offset + src_gid0 * 1 + src_gid1 * src_stride; + const std::size_t pr_step_src = lws1 * src_stride; + + const std::uint32_t local_offset0 = src_i0 + src_i1 * block_size; + const std::uint32_t pr_step_local = lws1 * block_size; + + for (std::uint32_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + local_block[local_offset0 + pr_step_local * pr_id] = + (src_gid0 < n && src_gid1 + pr_id * lws1 < n) + ? src_tp[src_offset0 + pr_step_src * pr_id] + : T(0); + } + + const std::uint32_t local_dim0 = static_cast( + std::min(src_tile_start0 + block_size, n) - + src_tile_start0); + const std::uint32_t local_dim1 = static_cast( + std::min(src_tile_start1 + block_size, n) - + src_tile_start1); + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 2. 
Permute the block matrix in SLM using two private arrays + std::array private_block_01 = {T(0)}; + std::array private_block_10 = {T(0)}; + + // 0 <= lid_lin < lws0 * lws1 == + // (block_size * block_size / nelems_per_wi) == + // (block_size/private_tile_size)**2 + static constexpr std::uint16_t n_private_tiles_per_axis = + block_size / private_tile_size; + const std::uint16_t local_tile_id0 = + lid_lin / n_private_tiles_per_axis; + const std::uint16_t local_tile_id1 = + lid_lin - local_tile_id0 * n_private_tiles_per_axis; + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) { + for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) { + const std::uint16_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint16_t t1_offset = + local_tile_id1 * private_tile_size; + + const std::uint16_t pr_offset = + pr_i1 * private_tile_size + pr_i0; + const std::uint16_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // read (local_tile_id0, local_tile_id1) + const std::uint16_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + private_block_01[pr_offset] = + local_block[local_01_offset]; + + // read (local_tile_id1, local_tile_id0) + const std::uint16_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + private_block_10[pr_offset] = + local_block[local_10_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) { + for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) { + const std::uint16_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint16_t t1_offset = + local_tile_id1 * private_tile_size; + const std::uint16_t pr_offset = + pr_i0 * private_tile_size + pr_i1; + + const std::uint16_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // write back permuted private blocks + const std::uint32_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + local_block[local_01_offset] = + private_block_10[pr_offset]; + + const std::uint16_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + local_block[local_10_offset] = + private_block_01[pr_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 3. 
Write out permuted SLM to destination array + + const std::size_t dst_tile_start0 = src_tile_start0; + const std::size_t dst_tile_start1 = src_tile_start1; + + if (local_dim0 == block_size && local_dim1 == block_size) { + const std::uint16_t dst_i0 = src_i1; + const std::uint16_t dst_i1 = src_i0; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset0 = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::size_t pr_step_dst = lws1 * dst_stride; + + const std::uint16_t _local_offset0 = + dst_i0 * block_size + dst_i1; + const std::uint16_t _pr_step_local = lws1 * block_size; + + for (std::uint16_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + if ((dst_gid1 < n) && ((dst_gid0 + pr_id * lws1) < n)) { + dst_tp[dst_offset0 + pr_step_dst * pr_id] = + local_block[_local_offset0 + + _pr_step_local * pr_id]; + } + } + } + else { + // map local_linear_id into (local_dim0, local_dim1) + for (std::uint16_t el_id = lid_lin; + el_id < local_dim0 * local_dim1; el_id += lws0 * lws1) + { + + // 0 <= local_i0 < local_dim0 + const std::uint16_t loc_i0 = el_id / local_dim1; + // 0 <= local_i1 < local_dim1 + const std::uint16_t loc_i1 = el_id - loc_i0 * local_dim1; + + const std::uint16_t dst_i0 = loc_i0; + const std::uint16_t dst_i1 = loc_i1; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::uint16_t local_offset = + loc_i0 * block_size + loc_i1; + + if ((dst_gid1 < n) && (dst_gid0 < n)) { + dst_tp[dst_offset] = local_block[local_offset]; + } + } + } + }); + }); + + return e; +} + +} // end of namespace detail + +template +sycl::event as_c_contiguous_1d_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + ssize_t src_batch_step, + ssize_t dst_batch_step, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = + TwoOffsets_CombinedIndexer; + + const auto &src_batch_indexer = + Strided1DIndexer(batch_nelems, src_batch_step); + const auto &dst_batch_indexer = + Strided1DIndexer(batch_nelems, dst_batch_step); + + const BatchIndexerT batch_two_indexer{src_batch_indexer, dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_indexer, n, src_p, src_ld, dst_p, + dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + std::size_t, /* number of batch elements */ + ssize_t, /* distance between batches in source array */ + ssize_t, /* distance between batches in destination array */ + std::size_t, /* size of square matrices in the batch */ + const char *, + ssize_t, /* untyped pointer to F-contig source array, and matrix leading + dimension */ + char *, + ssize_t, /* untyped pointer to C-contig destination array, and matrix + leading dimension */ + const std::vector &); + +template +struct AsCContig1DBatchOfSquareMatricesFactory +{ + fnT get() + { + return as_c_contiguous_1d_batch_of_square_matrices_impl; + } +}; + +template +sycl::event as_c_contiguous_nd_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + 
int batch_nd, + const ssize_t *src_batch_shape_strides, + const ssize_t dst_batch_step, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using SrcIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using DstIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = TwoOffsets_CombinedIndexer; + + static constexpr ssize_t zero_offset{0}; + + const SrcIndexerT src_batch_indexer{batch_nd, zero_offset, + src_batch_shape_strides}; + const DstIndexerT dst_batch_indexer{/* size */ batch_nelems, + /* step */ dst_batch_step}; + + const BatchIndexerT batch_two_offsets_indexer{src_batch_indexer, + dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_offsets_indexer, n, src_p, src_ld, + dst_p, dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + std::size_t, /* number of matrices in the batch */ + int, + const ssize_t *, /* dimensionality, and packed [shape, src_strides] + describing iteration over batch in source array */ + ssize_t, /* distance between batches in destination array */ + std::size_t, /* matrix size */ + const char *, + ssize_t, /* untyped pointer to source array of F-contig matrices, and + leading dimension of the matrix */ + char *, + ssize_t, /* untyped pointer to destination array of F-contig matrices, and + leading dimension of the matrix */ + const std::vector &); + +template +struct AsCContigNDBatchOfSquareMatricesFactory +{ + fnT get() + { + return as_c_contiguous_nd_batch_of_square_matrices_impl; + } +}; + +} // namespace copy_as_contig +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp new file mode 100644 index 000000000000..53b39ff5874c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp @@ -0,0 +1,758 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/copy_as_contiguous.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_as_contig.hpp" +#include "simplify_iteration_space.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_array_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +static as_c_contiguous_array_impl_fn_ptr_t + as_c_contig_array_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_1d_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +void init_copy_as_contig_dispatch_vectors(void) +{ + + using dpctl::tensor::kernels::copy_as_contig:: + AsCContig1DBatchOfSquareMatricesFactory; + using dpctl::tensor::kernels::copy_as_contig::AsCContigFactory; + using dpctl::tensor::kernels::copy_as_contig:: + AsCContigNDBatchOfSquareMatricesFactory; + using td_ns::DispatchVectorBuilder; + + // Generic to c-contig + DispatchVectorBuilder + dtv_as_c_contig_array; + + dtv_as_c_contig_array.populate_dispatch_vector( + as_c_contig_array_dispatch_vector); + + // 1D batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t, + AsCContig1DBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_1d_batch_of_square_matrices; + + dtv_as_c_contig_1d_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_1d_batch_of_square_matrices_dispatch_vector); + + // ND batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t, + AsCContigNDBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_nd_batch_of_square_matrices; + + dtv_as_c_contig_nd_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_nd_batch_of_square_matrices_dispatch_vector); +} + +namespace +{ + +template +std::size_t get_nelems(const std::vector &shape) +{ 
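+    // Total number of elements is the product of all extents in `shape`;
+    // for an empty shape std::accumulate returns the initial value of 1,
+    // which matches the size of a zero-dimensional array.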
+ auto mult_fn = [](std::size_t prod, const dimT &term) -> std::size_t { + return prod * static_cast(term); + }; + + static constexpr std::size_t unit{1}; + + const std::size_t nelems = + std::accumulate(std::begin(shape), std::end(shape), unit, mult_fn); + return nelems; +} + +} // end of anonymous namespace + +std::pair + py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair + py_as_c_contig(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. + */ + const int src_nd = src.get_ndim(); + const int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.back(); + if (n == dst_shape_vec[src_nd - 2]) { + static constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[src_nd - 2] == unit_stride) { + return py_as_c_contig_f2c(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + auto ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = 
shape_stride_owner.get(); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair + py_as_f_contig(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is F-contiguous. + */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_f_contiguous()) { + throw py::value_error("Destination array must be F-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.front(); + if (n == dst_shape_vec[1]) { + static constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[1] == unit_stride) { + return py_as_f_contig_c2f(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == 
src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + auto ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = shape_stride_owner.get(); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. + */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same."); + } + if (src_nd < 2) { + throw py::value_error("Arrays must have 2 or more axes"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + std::size_t nelems{1}; + bool equal_shapes = true; + + for (int i = 0; equal_shapes && (i < src_nd); ++i) { + auto sh_i = src_shape_vec[i]; + equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]); + nelems *= static_cast(sh_i); + } + + if (!equal_shapes) { + throw py::value_error("Shapes must be equal"); + } + + const auto n = src_shape_vec.back(); + if (src_shape_vec[src_nd - 2] != n) { + throw py::value_error("Matrices must be square"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_strides_vec[src_nd - 2] != py::ssize_t(1)) { + throw py::value_error("Unexpected destination array layout"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + const auto &dst_strides_vec = dst.get_strides_vector(); + + const std::size_t 
batch_nelems = + (src_nd == 2) ? std::size_t(1) : (nelems / (n * n)); + const py::ssize_t dst_batch_step = + (src_nd == 2) ? py::ssize_t(0) : dst_strides_vec[src_nd - 3]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec), + std::end(src_shape_vec) - 2); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec), + std::end(src_strides_vec) - 2); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec), + std::end(dst_strides_vec) - 2); + } + + // simplify batch iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) + { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + [src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], all_depends); + + // async free of shape_strides temporary + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {ascontig_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, 
dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is F-contiguous. + */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same."); + } + if (src_nd < 2) { + throw py::value_error("Arrays must have 2 or more axes"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_f_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + std::size_t nelems{1}; + bool equal_shapes = true; + + for (int i = 0; equal_shapes && (i < src_nd); ++i) { + auto sh_i = src_shape_vec[i]; + equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]); + nelems *= static_cast(sh_i); + } + + if (!equal_shapes) { + throw py::value_error("Shapes must be equal"); + } + + const auto n = dst_shape_vec.front(); + if (dst_shape_vec[1] != n) { + throw py::value_error("Matrices must be square"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_strides_vec[1] != py::ssize_t(1)) { + throw py::value_error("Unexpected destination array layout"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + const auto &dst_strides_vec = dst.get_strides_vector(); + + const std::size_t batch_nelems = + (src_nd == 2) ? std::size_t(1) : (nelems / (n * n)); + const py::ssize_t dst_batch_step = + (src_nd == 2) ? 
py::ssize_t(0) : dst_strides_vec[2]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec) + 2, + std::end(src_shape_vec)); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec) + 2, + std::end(src_strides_vec)); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec) + 2, + std::end(dst_strides_vec)); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) + { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + [src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], all_depends); + + // async free of shape_strides + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {ascontig_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + 
ascontig_ev); +} + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp new file mode 100644 index 000000000000..2de67098b7fa --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp @@ -0,0 +1,61 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#pragma once + +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +std::pair + py_as_c_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +std::pair + py_as_f_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +void init_copy_as_contig_dispatch_vectors(void); + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl From 5a9c14cd5ac07cf0a79da70e67b1cd9c28f063c6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:16:36 -0800 Subject: [PATCH 05/38] Add copy_usm_ndarray_into_usm_ndarray implementation --- .../source/copy_and_cast_usm_to_usm.cpp | 310 ++++++++++++++++++ .../source/copy_and_cast_usm_to_usm.hpp | 60 ++++ 2 files changed, 370 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp new file mode 100644 index 000000000000..0458aa75ac32 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -0,0 +1,310 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include +#include + +#include "kernels/copy_and_cast.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "copy_as_contig.hpp" +#include "simplify_iteration_space.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_1d_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_generic_fn_ptr_t; + +static copy_and_cast_generic_fn_ptr_t + copy_and_cast_generic_dispatch_table[td_ns::num_types][td_ns::num_types]; +static copy_and_cast_1d_fn_ptr_t + copy_and_cast_1d_dispatch_table[td_ns::num_types][td_ns::num_types]; +static copy_and_cast_contig_fn_ptr_t + copy_and_cast_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +namespace py = pybind11; + +using dpctl::utils::keep_args_alive; + +std::pair copy_usm_ndarray_into_usm_ndarray( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}) +{ + // array dimensions must be the same + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Array dimensions are not the same."); + } + + // shapes must be the same + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + std::size_t src_nelems(1); + + for (int i = 0; shapes_equal && (i < src_nd); ++i) { + src_nelems *= static_cast(src_shape[i]); + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + if (src_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + // check that arrays do not overlap, and concurrent copying is safe. 
+ auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + // TODO: could use a temporary, but this is done by the caller + throw py::value_error("Arrays index overlapping segments of memory"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + // check for applicability of special cases: + // (both C-contiguous || both F-contiguous) + bool both_c_contig = (is_src_c_contig && is_dst_c_contig); + bool both_f_contig = (is_src_f_contig && is_dst_f_contig); + if (both_c_contig || both_f_contig) { + + sycl::event copy_ev; + if (src_type_id == dst_type_id) { + + int src_elem_size = src.get_elemsize(); + + copy_ev = exec_q.memcpy(static_cast(dst_data), + static_cast(src_data), + src_nelems * src_elem_size, depends); + } + else { + auto contig_fn = + copy_and_cast_contig_dispatch_table[dst_type_id][src_type_id]; + copy_ev = + contig_fn(exec_q, src_nelems, src_data, dst_data, depends); + } + // make sure src and dst are not GC-ed before copy_ev is complete + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + if ((src_type_id == dst_type_id) && (src_nd > 1)) { + if (is_dst_c_contig) { + return py_as_c_contig(src, dst, exec_q, depends); + } + else if (is_dst_f_contig) { + return py_as_f_contig(src, dst, exec_q, depends); + } + } + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, shape, src_strides, dst_strides, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (nd < 2) { + if (nd == 1) { + std::array shape_arr = {simplified_shape[0]}; + std::array src_strides_arr = { + simplified_src_strides[0]}; + std::array dst_strides_arr = { + simplified_dst_strides[0]}; + + sycl::event copy_and_cast_1d_event; + if ((src_strides_arr[0] == 1) && (dst_strides_arr[0] == 1) && + (src_offset == 0) && (dst_offset == 0)) + { + auto contig_fn = + copy_and_cast_contig_dispatch_table[dst_type_id] + [src_type_id]; + copy_and_cast_1d_event = + contig_fn(exec_q, src_nelems, src_data, dst_data, depends); + } + else { + auto fn = + copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + copy_and_cast_1d_event = + fn(exec_q, src_nelems, shape_arr, src_strides_arr, + dst_strides_arr, src_data, src_offset, dst_data, + dst_offset, depends); + } + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_1d_event}), + copy_and_cast_1d_event); + } + else if (nd == 0) { // case of a scalar + assert(src_nelems == 1); + std::array shape_arr = {1}; + std::array src_strides_arr = {1}; + std::array dst_strides_arr = {1}; + + auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + + sycl::event copy_and_cast_0d_event = fn( + exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, + src_data, src_offset, dst_data, dst_offset, depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_0d_event}), + copy_and_cast_0d_event); + } + } + + // Generic implementation + auto 
copy_and_cast_fn = + copy_and_cast_generic_dispatch_table[dst_type_id][src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + const sycl::event ©_and_cast_generic_ev = copy_and_cast_fn( + exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data, + dst_offset, depends, {copy_shape_ev}); + + // async free of shape_strides temporary + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_and_cast_generic_ev}, shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_and_cast_generic_ev); +} + +void init_copy_and_cast_usm_to_usm_dispatch_tables(void) +{ + using namespace td_ns; + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastContigFactory; + DispatchTableBuilder + dtb_contig; + dtb_contig.populate_dispatch_table(copy_and_cast_contig_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastGenericFactory; + DispatchTableBuilder + dtb_generic; + dtb_generic.populate_dispatch_table(copy_and_cast_generic_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCast1DFactory; + DispatchTableBuilder + dtb_1d; + dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp new file mode 100644 index 000000000000..d2a2dcaf7b85 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp @@ -0,0 +1,60 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair copy_usm_ndarray_into_usm_ndarray( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_and_cast_usm_to_usm_dispatch_tables(); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From 4f6334054fc08df7c2c2f7657bc5f4569ee4363a Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:18:36 -0800 Subject: [PATCH 06/38] Add pybind11 bindings for dpctl_ext.tensor._tensor_impl --- .../tensor/libtensor/source/tensor_ctors.cpp | 502 ++++++++++++++++++ 1 file changed, 502 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp new file mode 100644 index 000000000000..b41b5c9ce423 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -0,0 +1,502 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" + +// #include "accumulators.hpp" +// #include "boolean_advanced_indexing.hpp" +// #include "clip.hpp" +#include "copy_and_cast_usm_to_usm.hpp" +#include "copy_as_contig.hpp" +// #include "copy_for_reshape.hpp" +// #include "copy_for_roll.hpp" +// #include "copy_numpy_ndarray_into_usm_ndarray.hpp" +// #include "device_support_queries.hpp" +// #include "eye_ctor.hpp" +// #include "full_ctor.hpp" +// #include "integer_advanced_indexing.hpp" +#include "kernels/dpctl_tensor_types.hpp" +// #include "linear_sequences.hpp" +// #include "repeat.hpp" +#include "simplify_iteration_space.hpp" +// #include "triul_ctor.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/strided_iters.hpp" +// #include "where.hpp" +// #include "zeros_ctor.hpp" + +namespace py = pybind11; + +static_assert(std::is_same_v); + +namespace +{ + +using dpctl::tensor::c_contiguous_strides; +using dpctl::tensor::f_contiguous_strides; + +using dpctl::tensor::overlap::MemoryOverlap; +using dpctl::tensor::overlap::SameLogicalTensors; + +using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray; +using dpctl::tensor::py_internal::py_as_c_contig; +using dpctl::tensor::py_internal::py_as_f_contig; + +/* =========================== Copy for reshape ============================= */ + +// using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; + +/* =========================== Copy for roll ============================= */ + +// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; +// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; + +/* ============= Copy from numpy.ndarray to usm_ndarray ==================== */ + +// using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; + +/* ============= linear-sequence ==================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; + +/* ================ Full ================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_full; + +/* ================ Zeros ================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_zeros; + +/* ============== Advanced Indexing ============= */ +// using dpctl::tensor::py_internal::usm_ndarray_put; +// using dpctl::tensor::py_internal::usm_ndarray_take; + +// using dpctl::tensor::py_internal::py_extract; +// using dpctl::tensor::py_internal::py_mask_positions; +// using dpctl::tensor::py_internal::py_nonzero; +// using dpctl::tensor::py_internal::py_place; + +/* ================= 
Repeat ====================*/ +// using dpctl::tensor::py_internal::py_cumsum_1d; +// using dpctl::tensor::py_internal::py_repeat_by_scalar; +// using dpctl::tensor::py_internal::py_repeat_by_sequence; + +/* ================ Eye ================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_eye; + +/* =========================== Tril and triu ============================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_triul; + +/* =========================== Where ============================== */ + +// using dpctl::tensor::py_internal::py_where; + +/* =========================== Clip ============================== */ +// using dpctl::tensor::py_internal::py_clip; + +// populate dispatch tables +void init_dispatch_tables(void) +{ + using namespace dpctl::tensor::py_internal; + + init_copy_and_cast_usm_to_usm_dispatch_tables(); + // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); + // init_advanced_indexing_dispatch_tables(); + // init_where_dispatch_tables(); + return; +} + +// populate dispatch vectors +void init_dispatch_vectors(void) +{ + using namespace dpctl::tensor::py_internal; + + init_copy_as_contig_dispatch_vectors(); + // init_copy_for_reshape_dispatch_vectors(); + // init_copy_for_roll_dispatch_vectors(); + // init_linear_sequences_dispatch_vectors(); + // init_full_ctor_dispatch_vectors(); + // init_zeros_ctor_dispatch_vectors(); + // init_eye_ctor_dispatch_vectors(); + // init_triul_ctor_dispatch_vectors(); + + // populate_masked_extract_dispatch_vectors(); + // populate_masked_place_dispatch_vectors(); + + // populate_mask_positions_dispatch_vectors(); + + // populate_cumsum_1d_dispatch_vectors(); + // init_repeat_dispatch_vectors(); + + // init_clip_dispatch_vectors(); + + return; +} + +} // namespace + +PYBIND11_MODULE(_tensor_impl, m) +{ + init_dispatch_tables(); + init_dispatch_vectors(); + + using dpctl::tensor::strides::contract_iter; + m.def( + "_contract_iter", &contract_iter, + "Simplifies iteration of array of given shape & stride. Returns " + "a triple: shape, stride and offset for the new iterator of possible " + "smaller dimension, which traverses the same elements as the original " + "iterator, possibly in a different order."); + + m.def("_copy_usm_ndarray_into_usm_ndarray", + ©_usm_ndarray_into_usm_ndarray, + "Copies from usm_ndarray `src` into usm_ndarray `dst` of the same " + "shape. " + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_as_c_contig", &py_as_c_contig, + "Copies from usm_ndarray `src` into C-contiguous usm_ndarray " + "`dst` of the same shape and the same data type. " + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_as_f_contig", &py_as_f_contig, + "Copies from usm_ndarray `src` into F-contiguous usm_ndarray " + "`dst` of the same shape and the same data type. " + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + using dpctl::tensor::strides::contract_iter2; + m.def( + "_contract_iter2", &contract_iter2, + "Simplifies iteration over elements of pair of arrays of given shape " + "with strides stride1 and stride2. 
Returns " + "a 5-tuple: shape, stride and offset for the new iterator of possible " + "smaller dimension for each array, which traverses the same elements " + "as the original " + "iterator, possibly in a different order."); + + using dpctl::tensor::strides::contract_iter3; + m.def( + "_contract_iter3", &contract_iter3, + "Simplifies iteration over elements of 3-tuple of arrays of given " + "shape " + "with strides stride1, stride2, and stride3. Returns " + "a 7-tuple: shape, stride and offset for the new iterator of possible " + "smaller dimension for each array, which traverses the same elements " + "as the original " + "iterator, possibly in a different order."); + + using dpctl::tensor::strides::contract_iter4; + m.def( + "_contract_iter4", &contract_iter4, + "Simplifies iteration over elements of 4-tuple of arrays of given " + "shape " + "with strides stride1, stride2, stride3, and stride4. Returns " + "a 9-tuple: shape, stride and offset for the new iterator of possible " + "smaller dimension for each array, which traverses the same elements " + "as the original " + "iterator, possibly in a different order."); + + static constexpr char orderC = 'C'; + m.def( + "_ravel_multi_index", + [](const std::vector &mi, + const std::vector &shape, char order = 'C') { + if (order == orderC) { + return dpctl::tensor::py_internal::_ravel_multi_index_c(mi, + shape); + } + else { + return dpctl::tensor::py_internal::_ravel_multi_index_f(mi, + shape); + } + }, + ""); + + m.def( + "_unravel_index", + [](py::ssize_t flat_index, const std::vector &shape, + char order = 'C') { + if (order == orderC) { + return dpctl::tensor::py_internal::_unravel_index_c(flat_index, + shape); + } + else { + return dpctl::tensor::py_internal::_unravel_index_f(flat_index, + shape); + } + }, + ""); + + // m.def("_copy_usm_ndarray_for_reshape", ©_usm_ndarray_for_reshape, + // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same + // " "number of elements using underlying 'C'-contiguous order for + // flat " "traversal. " "Returns a tuple of events: (ht_event, + // comp_event)", py::arg("src"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_copy_usm_ndarray_for_roll_1d", ©_usm_ndarray_for_roll_1d, + // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same + // " "shapes using underlying 'C'-contiguous order for flat " + // "traversal with shift. " + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("src"), py::arg("dst"), py::arg("shift"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_copy_usm_ndarray_for_roll_nd", ©_usm_ndarray_for_roll_nd, + // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same + // " "shapes using underlying 'C'-contiguous order for " "traversal + // with shifts along each axis. " "Returns a tuple of events: + // (ht_event, comp_event)", py::arg("src"), py::arg("dst"), + // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = + // py::list()); + + // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, + // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + // "specified by " + // "starting point `start` and step `dt`. 
" + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("start"), py::arg("dt"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, + // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + // "specified by " + // "starting point `start` and end point `end`. " + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("start"), py::arg("end"), py::arg("dst"), + // py::arg("include_endpoint"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_copy_numpy_ndarray_into_usm_ndarray", + // ©_numpy_ndarray_into_usm_ndarray, + // "Copy from numpy array `src` into usm_ndarray `dst` + // synchronously.", py::arg("src"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, + // "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_full_usm_ndarray", &usm_ndarray_full, + // "Populate usm_ndarray `dst` with given fill_value.", + // py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_take", &usm_ndarray_take, + // "Takes elements at usm_ndarray indices `ind` and axes starting " + // "at axis `axis_start` from array `src` and copies them " + // "into usm_ndarray `dst` synchronously." + // "Returns a tuple of events: (hev, ev)", + // py::arg("src"), py::arg("ind"), py::arg("dst"), + // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_put", &usm_ndarray_put, + // "Puts elements at usm_ndarray indices `ind` and axes starting " + // "at axis `axis_start` into array `dst` from " + // "usm_ndarray `val` synchronously." + // "Returns a tuple of events: (hev, ev)", + // py::arg("dst"), py::arg("ind"), py::arg("val"), + // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_eye", &usm_ndarray_eye, + // "Fills input 2D contiguous usm_ndarray `dst` with " + // "zeros outside of the diagonal " + // "specified by " + // "the diagonal index `k` " + // "which is filled with ones." 
+ // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("k"), py::arg("dst"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("default_device_fp_type", + // dpctl::tensor::py_internal::default_device_fp_type, + // "Gives default floating point type supported by device.", + // py::arg("dev")); + + // m.def("default_device_int_type", + // dpctl::tensor::py_internal::default_device_int_type, + // "Gives default signed integer type supported by device.", + // py::arg("dev")); + + // m.def("default_device_uint_type", + // dpctl::tensor::py_internal::default_device_uint_type, + // "Gives default unsigned integer type supported by device.", + // py::arg("dev")); + + // m.def("default_device_bool_type", + // dpctl::tensor::py_internal::default_device_bool_type, + // "Gives default boolean type supported by device.", py::arg("dev")); + + // m.def("default_device_complex_type", + // dpctl::tensor::py_internal::default_device_complex_type, + // "Gives default complex floating point type supported by device.", + // py::arg("dev")); + + // m.def("default_device_index_type", + // dpctl::tensor::py_internal::default_device_index_type, + // "Gives default index type supported by device.", py::arg("dev")); + + // auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + // sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends); + // }; + // m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"), + // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // auto triu_fn = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + // sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends); + // }; + // m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), + // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), + // py::arg("cumsum"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), + // py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto overlap = [](const dpctl::tensor::usm_ndarray &x1, + const dpctl::tensor::usm_ndarray &x2) -> bool { + auto const &overlap = MemoryOverlap(); + return overlap(x1, x2); + }; + m.def("_array_overlap", overlap, + "Determines if the memory regions indexed by each array overlap", + py::arg("array1"), py::arg("array2")); + + // auto same_logical_tensors = + // [](const dpctl::tensor::usm_ndarray &x1, + // const dpctl::tensor::usm_ndarray &x2) -> bool { + // auto const &same_logical_tensors = SameLogicalTensors(); + // return same_logical_tensors(x1, x2); + // }; + // m.def("_same_logical_tensors", same_logical_tensors, + // "Determines if the memory regions indexed by each array are the + // same", py::arg("array1"), py::arg("array2")); + + // m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), + // py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), + // 
py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"), + // py::arg("mask_shape"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"), + // py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, + // const dpctl::tensor::usm_ndarray &reps, + // const dpctl::tensor::usm_ndarray &cumsum, + // std::optional axis, sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // if (axis) { + // return py_repeat_by_sequence(src, dst, reps, cumsum, + // axis.value(), + // exec_q, depends); + // } + // else { + // return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q, + // depends); + // } + // }; + // m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"), + // py::arg("dst"), py::arg("reps"), py::arg("cumsum"), + // py::arg("axis"), py::arg("sycl_queue"), py::arg("depends") = + // py::list()); + + // auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, + // const py::ssize_t reps, std::optional axis, + // sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // if (axis) { + // return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q, + // depends); + // } + // else { + // return py_repeat_by_scalar(src, dst, reps, exec_q, depends); + // } + // }; + // m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), py::arg("dst"), + // py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_clip", &py_clip, + // "Clamps elements of array `x` to the range " + // "[`min`, `max] and writes the result to the " + // "array `dst` for each element of `x`, `min`, and `max`." + // "Returns a tuple of events: (hev, ev)", + // py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); +} From 634579c5f0d64d44805d0a020cb4ca5ae1d5e774 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:24:11 -0800 Subject: [PATCH 07/38] Add CMake build files for dpctl_ext --- dpctl_ext/CMakeLists.txt | 205 ++++++++++++++++++++++++++++++++ dpctl_ext/tensor/CMakeLists.txt | 175 +++++++++++++++++++++++++++ 2 files changed, 380 insertions(+) create mode 100644 dpctl_ext/CMakeLists.txt create mode 100644 dpctl_ext/tensor/CMakeLists.txt diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt new file mode 100644 index 000000000000..bb33a4f57332 --- /dev/null +++ b/dpctl_ext/CMakeLists.txt @@ -0,0 +1,205 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +find_package(Python REQUIRED COMPONENTS NumPy) + +# -t is to only Cythonize sources with timestamps newer than existing CXX files (if present) +# -w is to set working directory (and correctly set __pyx_f[] array of filenames) +set(CYTHON_FLAGS "-t -w \"${CMAKE_SOURCE_DIR}\"") +find_package(Cython REQUIRED) + +if(WIN32) + string( + CONCAT WARNING_FLAGS + "-Wall " + "-Wextra " + "-Winit-self " + "-Wunused-function " + "-Wuninitialized " + "-Wmissing-declarations " + "-Wstrict-prototypes " + "-Wno-unused-parameter " + ) + string(CONCAT SDL_FLAGS "/GS " "/DynamicBase ") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS}") + set(CMAKE_C_FLAGS_DEBUG + "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_CXX_FLAGS_DEBUG + "${CMAKE_CXX_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_C_FLAGS_COVERAGE + "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O1 -g1 -DDEBUG" + ) + set(CMAKE_CXX_FLAGS_COVERAGE + "${CMAKE_CXX_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O1 -g1 -DDEBUG" + ) + set(CMAKE_MODULE_LINKER_FLAGS_COVERAGE "${CMAKE_MODULE_LINKER_FLAGS_DEBUG}") + set(DPCTL_LDFLAGS "/NXCompat;/DynamicBase") + mark_as_advanced( + CMAKE_CXX_FLAGS_COVERAGE + CMAKE_C_FLAGS_COVERAGE + CMAKE_MODULE_LINKER_FLAGS_COVERAGE + ) +elseif(UNIX) + string( + CONCAT WARNING_FLAGS + "-Wall " + "-Wextra " + "-Winit-self " + "-Wunused-function " + "-Wuninitialized " + "-Wmissing-declarations " + "-Wstrict-prototypes " + "-Wno-unused-parameter " + "-fdiagnostics-color=auto " + ) + string( + CONCAT SDL_FLAGS + "-fstack-protector " + "-fstack-protector-all " + "-fpic " + "-fPIC " + "-D_FORTIFY_SOURCE=2 " + "-Wformat " + "-Wformat-security " + # "-fno-strict-overflow " # no-strict-overflow is implied by -fwrapv + "-fno-delete-null-pointer-checks " + "-fwrapv " + ) + string(CONCAT CFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}") + string(CONCAT CXXFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 ${CFLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 ${CXXFLAGS}") + set(CMAKE_C_FLAGS_DEBUG + "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O0 -g -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_CXX_FLAGS_DEBUG + "${CMAKE_CXX_FLAGS_DEBUG} ${CXXFLAGS} -O0 -g -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + 
set(CMAKE_C_FLAGS_COVERAGE "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O1 -g1 -DDEBUG") + set(CMAKE_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG} ${CXXFLAGS} -O1 -g1 -DDEBUG") + set(CMAKE_MODULE_LINKER_FLAGS_COVERAGE "${CMAKE_MODULE_LINKER_FLAGS_DEBUG}") + set(DPCTL_LDFLAGS "-z,noexecstack,-z,relro,-z,now") + mark_as_advanced( + CMAKE_CXX_FLAGS_COVERAGE + CMAKE_C_FLAGS_COVERAGE + CMAKE_MODULE_LINKER_FLAGS_COVERAGE + ) +else() + message(FATAL_ERROR "Unsupported system.") +endif() + +# at build time create include/ directory and copy header files over +set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) + +set(CMAKE_INSTALL_RPATH "$ORIGIN") + +function(build_dpctl_ext _trgt _src _dest) + set(options SYCL) + cmake_parse_arguments(BUILD_DPCTL_EXT "${options}" "RELATIVE_PATH" "" ${ARGN}) + add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src) + set(_cythonize_trgt "${_trgt}_cythonize_pyx") + python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src}) + if(BUILD_DPCTL_EXT_SYCL) + add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src}) + target_compile_options(${_trgt} PRIVATE -fno-sycl-id-queries-fit-in-int) + target_link_options(${_trgt} PRIVATE -fsycl-device-code-split=per_kernel) + if(DPCTL_OFFLOAD_COMPRESS) + target_link_options(${_trgt} PRIVATE --offload-compress) + endif() + if(_dpctl_sycl_targets) + # make fat binary + target_compile_options( + ${_trgt} + PRIVATE ${_dpctl_sycl_target_compile_options} + ) + target_link_options(${_trgt} PRIVATE ${_dpctl_sycl_target_link_options}) + endif() + endif() + target_link_libraries(${_trgt} PRIVATE Python::NumPy) + if(DPCTL_GENERATE_COVERAGE) + target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1) + if(BUILD_DPCTL_EXT_SYCL) + target_compile_options(${_trgt} PRIVATE -fno-sycl-use-footer) + endif() + endif() + target_link_libraries(${_trgt} PRIVATE DPCTLSyclInterface) + set(_linker_options "LINKER:${DPCTL_LDFLAGS}") + target_link_options(${_trgt} PRIVATE ${_linker_options}) + get_filename_component(_name_wle ${_generated_src} NAME_WLE) + get_filename_component(_generated_src_dir ${_generated_src} DIRECTORY) + set(_generated_public_h "${_generated_src_dir}/${_name_wle}.h") + set(_generated_api_h "${_generated_src_dir}/${_name_wle}_api.h") + + # TODO: create separate folder inside build folder that contains only + # headers related to this target and appropriate folder structure to + # eliminate shadow dependencies + get_filename_component(_generated_src_dir_dir ${_generated_src_dir} DIRECTORY) + # TODO: do not set directory if we did not generate header + target_include_directories(${_trgt} INTERFACE ${_generated_src_dir_dir}) + set(_rpath_value "$ORIGIN") + if(BUILD_DPCTL_EXT_RELATIVE_PATH) + set(_rpath_value "${_rpath_value}/${BUILD_DPCTL_EXT_RELATIVE_PATH}") + endif() + if(DPCTL_WITH_REDIST) + set(_rpath_value "${_rpath_value}:${_rpath_value}/../../..") + endif() + set_target_properties(${_trgt} PROPERTIES INSTALL_RPATH ${_rpath_value}) + + install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest}) + install( + FILES ${_generated_api_h} + # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` + DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} + OPTIONAL + ) + install( + FILES ${_generated_public_h} + # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` + DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} + OPTIONAL + ) + if(DPCTL_GENERATE_COVERAGE) + get_filename_component(_original_src_dir ${_src} DIRECTORY) + file(RELATIVE_PATH _rel_dir 
${CMAKE_SOURCE_DIR} ${_original_src_dir}) + install(FILES ${_generated_src} DESTINATION ${CMAKE_INSTALL_PREFIX}/${_rel_dir}) + endif() + + # Create target with headers only, because python is managing all the + # library imports at runtime + set(_trgt_headers ${_trgt}_headers) + add_library(${_trgt_headers} INTERFACE) + add_dependencies(${_trgt_headers} ${_trgt}) + get_target_property(_trgt_headers_dir ${_trgt} INTERFACE_INCLUDE_DIRECTORIES) + target_include_directories(${_trgt_headers} INTERFACE ${_trgt_headers_dir}) +endfunction() + +add_subdirectory(tensor) diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt new file mode 100644 index 000000000000..ed8294b76615 --- /dev/null +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -0,0 +1,175 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +if(WIN32) + if(${CMAKE_VERSION} VERSION_LESS "3.23") + # this is a work-around for target_link_options inserting option after -link option, cause + # linker to ignore it. 
+ set(CMAKE_CXX_LINK_FLAGS + "${CMAKE_CXX_LINK_FLAGS} -fsycl-device-code-split=per_kernel" + ) + endif() +endif() + +set(_static_lib_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp +) +set(_tensor_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp +) + +set(_static_lib_trgt simplify_iteration_space) + +add_library(${_static_lib_trgt} STATIC ${_static_lib_sources}) +target_include_directories( + ${_static_lib_trgt} + PRIVATE + ${Python_INCLUDE_DIRS} + ${DPCTL_INCLUDE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include +) +target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers ${Python_LIBRARIES}) +set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON) + +set(_py_trgts) + +set(python_module_name _tensor_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + +set(_clang_prefix "") +if(WIN32) + set(_clang_prefix "/clang:") +endif() + +set(_no_fast_math_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp +) +list( + APPEND _no_fast_math_sources + # ${_elementwise_sources} + # ${_reduction_sources} + # ${_sorting_sources} + # ${_linalg_sources} + # ${_accumulator_sources} +) + +foreach(_src_fn ${_no_fast_math_sources}) + get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) + set(_combined_options_prop ${_cmpl_options_prop} "${_clang_prefix}-fno-fast-math") + set_source_files_properties( + ${_src_fn} + PROPERTIES COMPILE_OPTIONS "${_combined_options_prop}" + ) +endforeach() + +set(_compiler_definitions "") + +set(_linker_options "LINKER:${DPCTL_LDFLAGS}") +foreach(python_module_name ${_py_trgts}) + target_compile_options( + ${python_module_name} + PRIVATE -fno-sycl-id-queries-fit-in-int + ) + target_link_options( + ${python_module_name} + PRIVATE -fsycl-device-code-split=per_kernel + ) + if(DPCTL_OFFLOAD_COMPRESS) + 
target_link_options(${python_module_name} PRIVATE --offload-compress) + endif() + + target_include_directories( + ${python_module_name} + PRIVATE + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${Dpctl_INCLUDE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ + ) + target_link_options(${python_module_name} PRIVATE ${_linker_options}) + if(DPCTL_GENERATE_COVERAGE) + if(DPCTL_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS) + target_compile_options( + ${python_module_name} + PRIVATE -fprofile-instr-generate -fcoverage-mapping + ) + endif() + target_link_options( + ${python_module_name} + PRIVATE -fprofile-instr-generate -fcoverage-mapping + ) + endif() + if(_dpctl_sycl_targets) + # make fat binary + target_compile_options( + ${python_module_name} + PRIVATE ${_dpctl_sycl_target_compile_options} + ) + target_link_options( + ${python_module_name} + PRIVATE ${_dpctl_sycl_target_link_options} + ) + endif() + # TODO: update source so they reference individual libraries instead of + # dpctl4pybind11.hpp. It will allow to simplify dependency tree + # NOTE: dpctl C-API is resolved at runtime via Python + # target_link_libraries(${python_module_name} PRIVATE DpctlCAPI) + if(DPCTL_WITH_REDIST) + set_target_properties( + ${python_module_name} + PROPERTIES INSTALL_RPATH "$ORIGIN/../../../.." + ) + endif() + # TODO: revert to `DESTINATION "dpctl/tensor"` + install(TARGETS ${python_module_name} DESTINATION "dpctl_ext/tensor") +endforeach() From 79d40f235d10d1b9d514d9db07939d0bb447086c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:31:12 -0800 Subject: [PATCH 08/38] Add empty __init__ to dpctl_ext/ --- dpctl_ext/__init__.py | 27 +++++++++++++++++++++++++++ dpctl_ext/tensor/__init__.py | 27 +++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 dpctl_ext/__init__.py create mode 100644 dpctl_ext/tensor/__init__.py diff --git a/dpctl_ext/__init__.py b/dpctl_ext/__init__.py new file mode 100644 index 000000000000..a71324cb88d8 --- /dev/null +++ b/dpctl_ext/__init__.py @@ -0,0 +1,27 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py new file mode 100644 index 000000000000..a71324cb88d8 --- /dev/null +++ b/dpctl_ext/tensor/__init__.py @@ -0,0 +1,27 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** From 7949c17c3586a4ad0222c6abbf3a616202834c68 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:53:03 -0800 Subject: [PATCH 09/38] Enable _same_logical_tensors in _tensor_impl --- .../tensor/libtensor/source/tensor_ctors.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index b41b5c9ce423..ca3b7bd49116 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -430,15 +430,15 @@ PYBIND11_MODULE(_tensor_impl, m) "Determines if the memory regions indexed by each array overlap", py::arg("array1"), py::arg("array2")); - // auto same_logical_tensors = - // [](const dpctl::tensor::usm_ndarray &x1, - // const dpctl::tensor::usm_ndarray &x2) -> bool { - // auto const &same_logical_tensors = SameLogicalTensors(); - // return same_logical_tensors(x1, x2); - // }; - // m.def("_same_logical_tensors", same_logical_tensors, - // "Determines if the memory regions indexed by each array are the - // same", py::arg("array1"), py::arg("array2")); + auto same_logical_tensors = + [](const dpctl::tensor::usm_ndarray &x1, + const dpctl::tensor::usm_ndarray &x2) -> bool { + auto const &same_logical_tensors = SameLogicalTensors(); + return same_logical_tensors(x1, x2); + }; + m.def("_same_logical_tensors", same_logical_tensors, + "Determines if the memory regions indexed by each array are the same", + py::arg("array1"), py::arg("array2")); // m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), // py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), From 29d6c029190714cab8a460c02f32130c7ea59cc6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 05:14:28 -0800 Subject: [PATCH 10/38] Add device_support_queries to enable default device types --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../source/device_support_queries.cpp | 184 ++++++++++++++++++ .../source/device_support_queries.hpp | 58 ++++++ .../tensor/libtensor/source/tensor_ctors.cpp | 56 +++--- 4 files changed, 271 insertions(+), 29 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/device_support_queries.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/device_support_queries.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ed8294b76615..ee8da2e49506 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -56,7 +56,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp new file mode 100644 index 000000000000..51eb7dba1b6c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp @@ -0,0 +1,184 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace +{ + +std::string _default_device_fp_type(const sycl::device &d) +{ + if (d.has(sycl::aspect::fp64)) { + return "f8"; + } + else { + return "f4"; + } +} + +int get_numpy_major_version() +{ + namespace py = pybind11; + + py::module_ numpy = py::module_::import("numpy"); + py::str version_string = numpy.attr("__version__"); + py::module_ numpy_lib = py::module_::import("numpy.lib"); + + py::object numpy_version = numpy_lib.attr("NumpyVersion")(version_string); + int major_version = numpy_version.attr("major").cast(); + + return major_version; +} + +std::string _default_device_int_type(const sycl::device &) +{ + const int np_ver = get_numpy_major_version(); + + if (np_ver >= 2) { + return "i8"; + } + else { + // code for numpy.dtype('long') to be consistent + // with NumPy's default integer type across + // platforms. + return "l"; + } +} + +std::string _default_device_uint_type(const sycl::device &) +{ + const int np_ver = get_numpy_major_version(); + + if (np_ver >= 2) { + return "u8"; + } + else { + // code for numpy.dtype('long') to be consistent + // with NumPy's default integer type across + // platforms. 
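+        // "L" is the array-interface typestr for numpy.dtype('ulong'), the unsigned counterpart of the "l" returned by _default_device_int_type above.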
+ return "L"; + } +} + +std::string _default_device_complex_type(const sycl::device &d) +{ + if (d.has(sycl::aspect::fp64)) { + return "c16"; + } + else { + return "c8"; + } +} + +std::string _default_device_bool_type(const sycl::device &) +{ + return "b1"; +} + +std::string _default_device_index_type(const sycl::device &) +{ + return "i8"; +} + +sycl::device _extract_device(const py::object &arg) +{ + auto const &api = dpctl::detail::dpctl_capi::get(); + + PyObject *source = arg.ptr(); + if (api.PySyclQueue_Check_(source)) { + const sycl::queue &q = py::cast(arg); + return q.get_device(); + } + else if (api.PySyclDevice_Check_(source)) { + return py::cast(arg); + } + else { + throw py::type_error( + "Expected type `dpctl.SyclQueue` or `dpctl.SyclDevice`."); + } +} + +} // namespace + +std::string default_device_fp_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_fp_type(d); +} + +std::string default_device_int_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_int_type(d); +} + +std::string default_device_uint_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_uint_type(d); +} + +std::string default_device_bool_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_bool_type(d); +} + +std::string default_device_complex_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_complex_type(d); +} + +std::string default_device_index_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_index_type(d); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp new file mode 100644 index 000000000000..6ea01dcd49d7 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp @@ -0,0 +1,58 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::string default_device_fp_type(const py::object &); +extern std::string default_device_int_type(const py::object &); +extern std::string default_device_uint_type(const py::object &); +extern std::string default_device_bool_type(const py::object &); +extern std::string default_device_complex_type(const py::object &); +extern std::string default_device_index_type(const py::object &); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index ca3b7bd49116..911d75ebd925 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -52,7 +52,7 @@ // #include "copy_for_reshape.hpp" // #include "copy_for_roll.hpp" // #include "copy_numpy_ndarray_into_usm_ndarray.hpp" -// #include "device_support_queries.hpp" +#include "device_support_queries.hpp" // #include "eye_ctor.hpp" // #include "full_ctor.hpp" // #include "integer_advanced_indexing.hpp" @@ -360,33 +360,33 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("k"), py::arg("dst"), py::arg("sycl_queue"), // py::arg("depends") = py::list()); - // m.def("default_device_fp_type", - // dpctl::tensor::py_internal::default_device_fp_type, - // "Gives default floating point type supported by device.", - // py::arg("dev")); - - // m.def("default_device_int_type", - // dpctl::tensor::py_internal::default_device_int_type, - // "Gives default signed integer type supported by device.", - // py::arg("dev")); - - // m.def("default_device_uint_type", - // dpctl::tensor::py_internal::default_device_uint_type, - // "Gives default unsigned integer type supported by device.", - // py::arg("dev")); - - // m.def("default_device_bool_type", - // dpctl::tensor::py_internal::default_device_bool_type, - // "Gives default boolean type supported by device.", py::arg("dev")); - - // m.def("default_device_complex_type", - // dpctl::tensor::py_internal::default_device_complex_type, - // "Gives default complex floating point type supported by device.", - // py::arg("dev")); - - // m.def("default_device_index_type", - // dpctl::tensor::py_internal::default_device_index_type, - // "Gives default index type supported by device.", py::arg("dev")); + m.def("default_device_fp_type", + dpctl::tensor::py_internal::default_device_fp_type, + "Gives default floating point type supported by device.", + py::arg("dev")); + + m.def("default_device_int_type", + 
dpctl::tensor::py_internal::default_device_int_type, + "Gives default signed integer type supported by device.", + py::arg("dev")); + + m.def("default_device_uint_type", + dpctl::tensor::py_internal::default_device_uint_type, + "Gives default unsigned integer type supported by device.", + py::arg("dev")); + + m.def("default_device_bool_type", + dpctl::tensor::py_internal::default_device_bool_type, + "Gives default boolean type supported by device.", py::arg("dev")); + + m.def("default_device_complex_type", + dpctl::tensor::py_internal::default_device_complex_type, + "Gives default complex floating point type supported by device.", + py::arg("dev")); + + m.def("default_device_index_type", + dpctl::tensor::py_internal::default_device_index_type, + "Gives default index type supported by device.", py::arg("dev")); // auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, From 936e7198e2014330b34c5918a63230ea699e063e Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 05:52:17 -0800 Subject: [PATCH 11/38] Enable building and packaging of dpctl_ext --- CMakeLists.txt | 1 + setup.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 386b17b44294..d2ee5e84c0c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -336,3 +336,4 @@ if(DEFINED SKBUILD) endif() add_subdirectory(dpnp) +add_subdirectory(dpctl_ext) diff --git a/setup.py b/setup.py index cc21221299c4..a0c54b066dcf 100644 --- a/setup.py +++ b/setup.py @@ -44,6 +44,9 @@ "dpnp.scipy", "dpnp.scipy.linalg", "dpnp.scipy.special", + # dpctl_ext + "dpctl_ext", + "dpctl_ext.tensor", ], package_data={ "dpnp": [ From cd85f1e333bcad154272946f71c127b9ea9a916b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 06:14:39 -0800 Subject: [PATCH 12/38] Use _tensor_impl from dpctl_ext.tensor in dpnp --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 2 +- dpnp/dpnp_iface.py | 2 +- dpnp/dpnp_iface_searching.py | 2 +- dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 2 +- dpnp/scipy/linalg/_utils.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 57bf50422fa0..b63bf61f8dad 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -31,7 +31,6 @@ import dpctl.tensor as dpt import dpctl.tensor._copy_utils as dtc -import dpctl.tensor._tensor_impl as dti import dpctl.tensor._type_utils as dtu import dpctl.utils as dpu import numpy @@ -45,6 +44,7 @@ _validate_dtype, ) +import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi from dpnp.dpnp_array import dpnp_array diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index fba1a215756a..832446c826ba 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -45,11 +45,11 @@ import dpctl import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._device import normalize_queue_device +import dpctl_ext.tensor._tensor_impl as ti import dpnp from .dpnp_array import dpnp_array diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 6eefe010b699..fdbd317d31dd 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -40,8 +40,8 @@ """ import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as dti +import dpctl_ext.tensor._tensor_impl as dti import dpnp from 
.dpnp_array import dpnp_array diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 30be5d1ff5cb..4d8e3cdfbd0d 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -28,7 +28,6 @@ import dpctl import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( @@ -38,6 +37,7 @@ ) from dpctl.utils import ExecutionPlacementError +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.blas._blas_impl as bi from dpnp.dpnp_array import dpnp_array diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py index 282c645d1095..8eb9187236bf 100644 --- a/dpnp/scipy/linalg/_utils.py +++ b/dpnp/scipy/linalg/_utils.py @@ -42,9 +42,9 @@ from warnings import warn -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li from dpnp.dpnp_utils import get_usm_allocations From 0c6780a8f8b45e87263fbf316bc17aac5ed91dc1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 09:56:50 -0800 Subject: [PATCH 13/38] Move put() and take() to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- dpctl_ext/tensor/__init__.py | 11 + dpctl_ext/tensor/_indexing_functions.py | 329 +++++++ dpctl_ext/tensor/_numpy_helper.py | 45 + .../kernels/integer_advanced_indexing.hpp | 427 +++++++++ .../source/integer_advanced_indexing.cpp | 819 ++++++++++++++++++ .../source/integer_advanced_indexing.hpp | 73 ++ .../tensor/libtensor/source/tensor_ctors.cpp | 42 +- 8 files changed, 1726 insertions(+), 22 deletions(-) create mode 100644 dpctl_ext/tensor/_indexing_functions.py create mode 100644 dpctl_ext/tensor/_numpy_helper.py create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ee8da2e49506..ae8b72d71873 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -49,7 +49,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index a71324cb88d8..35453dbf9a46 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -25,3 +25,14 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** + + +from dpctl_ext.tensor._indexing_functions import ( + put, + take, +) + +__all__ = [ + "put", + "take", +] diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py new file mode 100644 index 000000000000..106df09cf97e --- /dev/null +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -0,0 +1,329 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import operator + +import dpctl +import dpctl.tensor as dpt +import dpctl.utils + +import dpctl_ext.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_index + + +def _get_indexing_mode(name): + modes = {"wrap": 0, "clip": 1} + try: + return modes[name] + except KeyError: + raise ValueError( + "`mode` must be `wrap` or `clip`." "Got `{}`.".format(name) + ) + + +def put(x, indices, vals, /, *, axis=None, mode="wrap"): + """put(x, indices, vals, axis=None, mode="wrap") + + Puts values into an array along a given axis at given indices. + + Args: + x (usm_ndarray): + The array the values will be put into. + indices (usm_ndarray): + One-dimensional array of indices. + vals (usm_ndarray): + Array of values to be put into ``x``. + Must be broadcastable to the result shape + ``x.shape[:axis] + indices.shape + x.shape[axis+1:]``. + axis (int, optional): + The axis along which the values will be placed. + If ``x`` is one-dimensional, this argument is optional. + Default: ``None``. + mode (str, optional): + How out-of-bounds indices will be handled. Possible values + are: + + - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps + negative indices. + - ``"clip"``: clips indices to (``0 <= i < n``). + + Default: ``"wrap"``. + + .. 
note:: + + If input array ``indices`` contains duplicates, a race condition + occurs, and the value written into corresponding positions in ``x`` + may vary from run to run. Preserving sequential semantics in handing + the duplicates to achieve deterministic behavior requires additional + work, e.g. + + :Example: + + .. code-block:: python + + from dpctl import tensor as dpt + + def put_vec_duplicates(vec, ind, vals): + "Put values into vec, handling possible duplicates in ind" + assert vec.ndim, ind.ndim, vals.ndim == 1, 1, 1 + + # find positions of last occurrences of each + # unique index + ind_flipped = dpt.flip(ind) + ind_uniq = dpt.unique_all(ind_flipped).indices + has_dups = len(ind) != len(ind_uniq) + + if has_dups: + ind_uniq = dpt.subtract(vec.size - 1, ind_uniq) + ind = dpt.take(ind, ind_uniq) + vals = dpt.take(vals, ind_uniq) + + dpt.put(vec, ind, vals) + + n = 512 + ind = dpt.concat((dpt.arange(n), dpt.arange(n, -1, step=-1))) + x = dpt.zeros(ind.size, dtype="int32") + vals = dpt.arange(ind.size, dtype=x.dtype) + + # Values corresponding to last positions of + # duplicate indices are written into the vector x + put_vec_duplicates(x, ind, vals) + + parts = (vals[-1:-n-2:-1], dpt.zeros(n, dtype=x.dtype)) + expected = dpt.concat(parts) + assert dpt.all(x == expected) + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(indices) + ) + ) + if isinstance(vals, dpt.usm_ndarray): + queues_ = [x.sycl_queue, indices.sycl_queue, vals.sycl_queue] + usm_types_ = [x.usm_type, indices.usm_type, vals.usm_type] + else: + queues_ = [x.sycl_queue, indices.sycl_queue] + usm_types_ = [x.usm_type, indices.usm_type] + if indices.ndim != 1: + raise ValueError( + "`indices` expected a 1D array, got `{}`".format(indices.ndim) + ) + if indices.dtype.kind not in "ui": + raise IndexError( + "`indices` expected integer data type, got `{}`".format( + indices.dtype + ) + ) + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + vals_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + + mode = _get_indexing_mode(mode) + + x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format( + x_ndim + ) + ) + axis = 0 + + if x_ndim > 0: + axis = normalize_axis_index(operator.index(axis), x_ndim) + x_sh = x.shape + if x_sh[axis] == 0 and indices.size != 0: + raise IndexError("cannot take non-empty indices from an empty axis") + val_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :] + else: + if axis != 0: + raise ValueError("`axis` must be 0 for an array of dimension 0.") + val_shape = indices.shape + + if not isinstance(vals, dpt.usm_ndarray): + vals = dpt.asarray( + vals, dtype=x.dtype, usm_type=vals_usm_type, sycl_queue=exec_q + ) + # choose to throw here for consistency with `place` + if vals.size == 0: + raise ValueError( + "cannot put into non-empty indices along an empty axis" + ) + if vals.dtype == x.dtype: + rhs = vals + else: + rhs = dpt.astype(vals, x.dtype) + rhs = dpt.broadcast_to(rhs, val_shape) + + _manager = dpctl.utils.SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + hev, put_ev = ti._put( + x, (indices,), rhs, axis, mode, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(hev, put_ev) + + +def 
take(x, indices, /, *, axis=None, out=None, mode="wrap"): + """take(x, indices, axis=None, out=None, mode="wrap") + + Takes elements from an array along a given axis at given indices. + + Args: + x (usm_ndarray): + The array that elements will be taken from. + indices (usm_ndarray): + One-dimensional array of indices. + axis (int, optional): + The axis along which the values will be selected. + If ``x`` is one-dimensional, this argument is optional. + Default: ``None``. + out (Optional[usm_ndarray]): + Output array to populate. Array must have the correct + shape and the expected data type. + mode (str, optional): + How out-of-bounds indices will be handled. Possible values + are: + + - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps + negative indices. + - ``"clip"``: clips indices to (``0 <= i < n``). + + Default: ``"wrap"``. + + Returns: + usm_ndarray: + Array with shape + ``x.shape[:axis] + indices.shape + x.shape[axis + 1:]`` + filled with elements from ``x``. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(indices) + ) + ) + if indices.dtype.kind not in "ui": + raise IndexError( + "`indices` expected integer data type, got `{}`".format( + indices.dtype + ) + ) + if indices.ndim != 1: + raise ValueError( + "`indices` expected a 1D array, got `{}`".format(indices.ndim) + ) + exec_q = dpctl.utils.get_execution_queue([x.sycl_queue, indices.sycl_queue]) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + res_usm_type = dpctl.utils.get_coerced_usm_type( + [x.usm_type, indices.usm_type] + ) + + mode = _get_indexing_mode(mode) + + x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format( + x_ndim + ) + ) + axis = 0 + + if x_ndim > 0: + axis = normalize_axis_index(operator.index(axis), x_ndim) + x_sh = x.shape + if x_sh[axis] == 0 and indices.size != 0: + raise IndexError("cannot take non-empty indices from an empty axis") + res_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :] + else: + if axis != 0: + raise ValueError("`axis` must be 0 for an array of dimension 0.") + res_shape = indices.shape + + dt = x.dtype + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {res_shape}, got {out.shape}" + ) + if dt != out.dtype: + raise ValueError( + f"Output array of type {dt} is needed, got {out.dtype}" + ) + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpctl.utils.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if ti._array_overlap(x, out): + out = dpt.empty_like(out) + else: + out = dpt.empty( + res_shape, dtype=dt, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = dpctl.utils.SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + hev, take_ev = ti._take( + x, (indices,), out, axis, mode, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(hev, take_ev) + + if not (orig_out is None or out is orig_out): + # Copy the out data from temporary buffer to original memory + ht_e_cpy, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[take_ev] + ) + _manager.add_event_pair(ht_e_cpy, cpy_ev) + out = orig_out + + return out diff --git a/dpctl_ext/tensor/_numpy_helper.py b/dpctl_ext/tensor/_numpy_helper.py new file mode 100644 index 000000000000..4ad735823cb3 --- /dev/null +++ b/dpctl_ext/tensor/_numpy_helper.py @@ -0,0 +1,45 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + + +import numpy as np + +_npver = np.lib.NumpyVersion(np.__version__) + +if _npver < "1.25.0": # pragma: no cover + from numpy import AxisError +else: + from numpy.exceptions import AxisError + +if _npver >= "2.0.0": + from numpy._core.numeric import normalize_axis_index, normalize_axis_tuple +else: # pragma: no cover + from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple + + +__all__ = ["AxisError", "normalize_axis_index", "normalize_axis_tuple"] diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp new file mode 100644 index 000000000000..1b2c79d2e2a5 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -0,0 +1,427 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for advanced tensor index operations. 
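+/// The TakeFunctor and PutFunctor kernels gather and scatter elements along the indexed axes; the Wrap/Clip factory structs below select how out-of-range indices are projected for the type-dispatch tables.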
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/indexing_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace indexing +{ + +using dpctl::tensor::ssize_t; + +template +class TakeFunctor +{ +private: + const char *src_ = nullptr; + char *dst_ = nullptr; + char **ind_ = nullptr; + int k_ = 0; + std::size_t ind_nelems_ = 0; + const ssize_t *axes_shape_and_strides_ = nullptr; + OrthogIndexer orthog_strider; + IndicesIndexer ind_strider; + AxesIndexer axes_strider; + +public: + TakeFunctor(const char *src_cp, + char *dst_cp, + char **ind_cp, + int k, + std::size_t ind_nelems, + const ssize_t *axes_shape_and_strides, + const OrthogIndexer &orthog_strider_, + const IndicesIndexer &ind_strider_, + const AxesIndexer &axes_strider_) + : src_(src_cp), dst_(dst_cp), ind_(ind_cp), k_(k), + ind_nelems_(ind_nelems), + axes_shape_and_strides_(axes_shape_and_strides), + orthog_strider(orthog_strider_), ind_strider(ind_strider_), + axes_strider(axes_strider_) + { + } + + void operator()(sycl::id<1> id) const + { + const T *src = reinterpret_cast(src_); + T *dst = reinterpret_cast(dst_); + + ssize_t i_orthog = id / ind_nelems_; + ssize_t i_along = id - (i_orthog * ind_nelems_); + + auto orthog_offsets = orthog_strider(i_orthog); + + ssize_t src_offset = orthog_offsets.get_first_offset(); + ssize_t dst_offset = orthog_offsets.get_second_offset(); + + static constexpr ProjectorT proj{}; + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + + ssize_t ind_offset = ind_strider(i_along, axis_idx); + // proj produces an index in the range of the given axis + ssize_t projected_idx = + proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]); + src_offset += + projected_idx * axes_shape_and_strides_[k_ + axis_idx]; + } + + dst_offset += axes_strider(i_along); + + dst[dst_offset] = src[src_offset]; + } +}; + +template +class take_kernel; + +typedef sycl::event (*take_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + int, + int, + int, + const ssize_t *, + const ssize_t *, + const ssize_t *, + const char *, + char *, + char **, + ssize_t, + ssize_t, + const ssize_t *, + const std::vector &); + +template +sycl::event take_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t ind_nelems, + int nd, + int ind_nd, + int k, + const ssize_t *orthog_shape_and_strides, + const ssize_t *axes_shape_and_strides, + const ssize_t *ind_shape_and_strides, + const char *src_p, + char *dst_p, + char **ind_p, + ssize_t src_offset, + ssize_t dst_offset, + const ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event take_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using OrthogIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + const OrthogIndexerT orthog_indexer{nd, src_offset, dst_offset, + orthog_shape_and_strides}; + + using NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset; + const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets, + ind_shape_and_strides}; + + using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const AxesIndexerT axes_indexer{ind_nd, 0, + axes_shape_and_strides + (2 * k)}; + + using KernelName = + take_kernel; + + const 
std::size_t gws = orthog_nelems * ind_nelems; + + cgh.parallel_for( + sycl::range<1>(gws), + TakeFunctor( + src_p, dst_p, ind_p, k, ind_nelems, axes_shape_and_strides, + orthog_indexer, indices_indexer, axes_indexer)); + }); + + return take_ev; +} + +template +class PutFunctor +{ +private: + char *dst_ = nullptr; + const char *val_ = nullptr; + char **ind_ = nullptr; + int k_ = 0; + std::size_t ind_nelems_ = 0; + const ssize_t *axes_shape_and_strides_ = nullptr; + OrthogIndexer orthog_strider; + IndicesIndexer ind_strider; + AxesIndexer axes_strider; + +public: + PutFunctor(char *dst_cp, + const char *val_cp, + char **ind_cp, + int k, + std::size_t ind_nelems, + const ssize_t *axes_shape_and_strides, + const OrthogIndexer &orthog_strider_, + const IndicesIndexer &ind_strider_, + const AxesIndexer &axes_strider_) + : dst_(dst_cp), val_(val_cp), ind_(ind_cp), k_(k), + ind_nelems_(ind_nelems), + axes_shape_and_strides_(axes_shape_and_strides), + orthog_strider(orthog_strider_), ind_strider(ind_strider_), + axes_strider(axes_strider_) + { + } + + void operator()(sycl::id<1> id) const + { + T *dst = reinterpret_cast(dst_); + const T *val = reinterpret_cast(val_); + + ssize_t i_orthog = id / ind_nelems_; + ssize_t i_along = id - (i_orthog * ind_nelems_); + + auto orthog_offsets = orthog_strider(i_orthog); + + ssize_t dst_offset = orthog_offsets.get_first_offset(); + ssize_t val_offset = orthog_offsets.get_second_offset(); + + static constexpr ProjectorT proj{}; + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + + ssize_t ind_offset = ind_strider(i_along, axis_idx); + + // proj produces an index in the range of the given axis + ssize_t projected_idx = + proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]); + dst_offset += + projected_idx * axes_shape_and_strides_[k_ + axis_idx]; + } + + val_offset += axes_strider(i_along); + + dst[dst_offset] = val[val_offset]; + } +}; + +template +class put_kernel; + +typedef sycl::event (*put_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + int, + int, + int, + const ssize_t *, + const ssize_t *, + const ssize_t *, + char *, + const char *, + char **, + ssize_t, + ssize_t, + const ssize_t *, + const std::vector &); + +template +sycl::event put_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t ind_nelems, + int nd, + int ind_nd, + int k, + const ssize_t *orthog_shape_and_strides, + const ssize_t *axes_shape_and_strides, + const ssize_t *ind_shape_and_strides, + char *dst_p, + const char *val_p, + char **ind_p, + ssize_t dst_offset, + ssize_t val_offset, + const ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event put_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using OrthogIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + const OrthogIndexerT orthog_indexer{nd, dst_offset, val_offset, + orthog_shape_and_strides}; + + using NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset; + const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets, + ind_shape_and_strides}; + + using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const AxesIndexerT axes_indexer{ind_nd, 0, + axes_shape_and_strides + (2 * k)}; + + using KernelName = + put_kernel; + + const std::size_t gws = orthog_nelems * ind_nelems; + + cgh.parallel_for( + sycl::range<1>(gws), + PutFunctor( + dst_p, val_p, ind_p, k, ind_nelems, axes_shape_and_strides, + orthog_indexer, 
indices_indexer, axes_indexer)); + }); + + return put_ev; +} + +template +struct TakeWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::WrapIndex; + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct TakeClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::ClipIndex; + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct PutWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::WrapIndex; + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct PutClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::ClipIndex; + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +} // namespace indexing +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp new file mode 100644 index 000000000000..244acfe3955f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -0,0 +1,819 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines implementation functions of dpctl.tensor.take and +/// dpctl.tensor.put +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "kernels/integer_advanced_indexing.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "integer_advanced_indexing.hpp" + +#define INDEXING_MODES 2 +#define WRAP_MODE 0 +#define CLIP_MODE 1 + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::indexing::put_fn_ptr_t; +using dpctl::tensor::kernels::indexing::take_fn_ptr_t; + +static take_fn_ptr_t take_dispatch_table[INDEXING_MODES][td_ns::num_types] + [td_ns::num_types]; + +static put_fn_ptr_t put_dispatch_table[INDEXING_MODES][td_ns::num_types] + [td_ns::num_types]; + +namespace py = pybind11; + +using dpctl::utils::keep_args_alive; + +std::vector + _populate_kernel_params(sycl::queue &exec_q, + std::vector &host_task_events, + char **device_ind_ptrs, + py::ssize_t *device_ind_sh_st, + py::ssize_t *device_ind_offsets, + py::ssize_t *device_orthog_sh_st, + py::ssize_t *device_along_sh_st, + const py::ssize_t *inp_shape, + const py::ssize_t *arr_shape, + std::vector &inp_strides, + std::vector &arr_strides, + std::vector &ind_sh_sts, + std::vector &ind_ptrs, + std::vector &ind_offsets, + int axis_start, + int k, + int ind_nd, + int inp_nd, + int orthog_sh_elems, + int ind_sh_elems) +{ + + using usm_host_allocator_T = + dpctl::tensor::alloc_utils::usm_host_allocator; + using ptrT = std::vector; + + usm_host_allocator_T ptr_allocator(exec_q); + std::shared_ptr host_ind_ptrs_shp = + std::make_shared(k, ptr_allocator); + + using usm_host_allocatorT = + dpctl::tensor::alloc_utils::usm_host_allocator; + using shT = std::vector; + + usm_host_allocatorT sz_allocator(exec_q); + std::shared_ptr host_ind_sh_st_shp = + std::make_shared(ind_sh_elems * (k + 1), sz_allocator); + + std::shared_ptr host_ind_offsets_shp = + std::make_shared(k, sz_allocator); + + std::shared_ptr host_orthog_sh_st_shp = + std::make_shared(3 * orthog_sh_elems, sz_allocator); + + std::shared_ptr host_along_sh_st_shp = + std::make_shared(2 * (k + ind_sh_elems), sz_allocator); + + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_sh_st_shp->begin()); + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); + + const sycl::event &device_ind_ptrs_copy_ev = exec_q.copy( + host_ind_ptrs_shp->data(), device_ind_ptrs, host_ind_ptrs_shp->size()); + + const sycl::event &device_ind_sh_st_copy_ev = + exec_q.copy(host_ind_sh_st_shp->data(), device_ind_sh_st, + host_ind_sh_st_shp->size()); + + const sycl::event &device_ind_offsets_copy_ev = exec_q.copy( + host_ind_offsets_shp->data(), device_ind_offsets, + host_ind_offsets_shp->size()); + + int orthog_nd = inp_nd - k; + + if (orthog_nd > 0) { + if (axis_start > 0) { + std::copy(inp_shape, inp_shape + axis_start, + host_orthog_sh_st_shp->begin()); + 
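+            // host_orthog_sh_st_shp is packed as three consecutive blocks of length orthog_sh_elems: the orthogonal shape, the input array strides, and the strides of the output/values array.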
std::copy(inp_strides.begin(), inp_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + orthog_sh_elems); + std::copy(arr_strides.begin(), arr_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems); + } + if (inp_nd > (axis_start + k)) { + std::copy(inp_shape + axis_start + k, inp_shape + inp_nd, + host_orthog_sh_st_shp->begin() + axis_start); + std::copy(inp_strides.begin() + axis_start + k, inp_strides.end(), + host_orthog_sh_st_shp->begin() + orthog_sh_elems + + axis_start); + + std::copy(arr_strides.begin() + axis_start + ind_nd, + arr_strides.end(), + host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems + + axis_start); + } + } + + if (inp_nd > 0) { + std::copy(inp_shape + axis_start, inp_shape + axis_start + k, + host_along_sh_st_shp->begin()); + + std::copy(inp_strides.begin() + axis_start, + inp_strides.begin() + axis_start + k, + host_along_sh_st_shp->begin() + k); + } + + if (ind_nd > 0) { + std::copy(arr_shape + axis_start, arr_shape + axis_start + ind_nd, + host_along_sh_st_shp->begin() + 2 * k); + std::copy(arr_strides.begin() + axis_start, + arr_strides.begin() + axis_start + ind_nd, + host_along_sh_st_shp->begin() + 2 * k + ind_nd); + } + + const sycl::event &device_orthog_sh_st_copy_ev = exec_q.copy( + host_orthog_sh_st_shp->data(), device_orthog_sh_st, + host_orthog_sh_st_shp->size()); + + const sycl::event &device_along_sh_st_copy_ev = exec_q.copy( + host_along_sh_st_shp->data(), device_along_sh_st, + host_along_sh_st_shp->size()); + + const sycl::event &shared_ptr_cleanup_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on({device_along_sh_st_copy_ev, + device_orthog_sh_st_copy_ev, + device_ind_offsets_copy_ev, + device_ind_sh_st_copy_ev, device_ind_ptrs_copy_ev}); + cgh.host_task( + [host_ind_offsets_shp = std::move(host_ind_offsets_shp), + host_ind_sh_st_shp = std::move(host_ind_sh_st_shp), + host_ind_ptrs_shp = std::move(host_ind_ptrs_shp), + host_orthog_sh_st_shp = std::move(host_orthog_sh_st_shp), + host_along_sh_st_shp = std::move(host_along_sh_st_shp)] {}); + }); + host_task_events.push_back(shared_ptr_cleanup_ev); + + std::vector sh_st_pack_deps{ + device_ind_ptrs_copy_ev, device_ind_sh_st_copy_ev, + device_ind_offsets_copy_ev, device_orthog_sh_st_copy_ev, + device_along_sh_st_copy_ev}; + return sh_st_pack_deps; +} + +/* Utility to parse python object py_ind into vector of `usm_ndarray`s */ +std::vector parse_py_ind(const sycl::queue &q, + const py::object &py_ind) +{ + std::size_t ind_count = py::len(py_ind); + std::vector res; + res.reserve(ind_count); + + bool nd_is_known = false; + int nd = -1; + for (std::size_t i = 0; i < ind_count; ++i) { + py::object el_i = py_ind[py::cast(i)]; + dpctl::tensor::usm_ndarray arr_i = + py::cast(el_i); + if (!dpctl::utils::queues_are_compatible(q, {arr_i})) { + throw py::value_error("Index allocation queue is not compatible " + "with execution queue"); + } + if (nd_is_known) { + if (nd != arr_i.get_ndim()) { + throw py::value_error( + "Indices must have the same number of dimensions."); + } + } + else { + nd_is_known = true; + nd = arr_i.get_ndim(); + } + res.push_back(arr_i); + } + + return res; +} + +std::pair + usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, + const py::object &py_ind, + const dpctl::tensor::usm_ndarray &dst, + int axis_start, + std::uint8_t mode, + sycl::queue &exec_q, + const std::vector &depends) +{ + std::vector ind = parse_py_ind(exec_q, py_ind); + + int k = ind.size(); + + if (k == 0) { + throw py::value_error("List of indices is 
empty."); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; + + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = std::max(src_nd, 1); + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of range for array of dimension " + + std::to_string(src_nd)); + } + if (src_nd == 0) { + if (dst_nd != ind_nd) { + throw py::value_error( + "Destination is not of appropriate dimension for take kernel."); + } + } + else { + if (dst_nd != (src_nd - k + ind_nd)) { + throw py::value_error( + "Destination is not of appropriate dimension for take kernel."); + } + } + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool orthog_shapes_equal(true); + std::size_t orthog_nelems(1); + for (int i = 0; i < (src_nd - k); ++i) { + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? i : i + ind_nd; + + orthog_nelems *= static_cast(src_shape[idx1]); + orthog_shapes_equal = + orthog_shapes_equal && (src_shape[idx1] == dst_shape[idx2]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Array memory overlap."); + } + + py::ssize_t src_offset = py::ssize_t(0); + py::ssize_t dst_offset = py::ssize_t(0); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::type_error("Array data types are not the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + std::size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == dst_shape[axis_start + i])) { + throw py::value_error( + "Indices shape does not match shape of axis in destination."); + } + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, orthog_nelems * ind_nelems); + + int ind_sh_elems = std::max(ind_nd, 1); + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + + std::vector ind_offsets; + ind_offsets.reserve(k); + + std::vector ind_sh_sts((k + 1) * ind_sh_elems, 0); + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_nd, ind_sh_sts.begin()); + } + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() 
== ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + if (overlap(dst, ind_)) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + char *ind_data = ind_.get_data(); + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); + } + + ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + if (ind_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = packed_ind_ptrs_owner.get(); + + // rearrange to past where indices shapes are checked + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); + py::ssize_t *packed_ind_shapes_strides = + packed_ind_shapes_strides_owner.get(); + + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); + + int orthog_sh_elems = std::max(src_nd - k, 1); + + // packed_shapes_strides = [src_shape[:axis] + src_shape[axis+k:], + // src_strides[:axis] + src_strides[axis+k:], + // dst_strides[:axis] + + // dst_strides[axis+ind.ndim:]] + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); + + // packed_axes_shapes_strides = [src_shape[axis:axis+k], + // src_strides[axis:axis+k], + // dst_shape[axis:axis+ind.ndim], + // dst_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); + py::ssize_t *packed_axes_shapes_strides = + packed_axes_shapes_strides_owner.get(); + + auto src_strides = src.get_strides_vector(); + auto dst_strides = dst.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + src_shape, dst_shape, src_strides, dst_strides, ind_sh_sts, ind_ptrs, + ind_offsets, axis_start, k, ind_nd, src_nd, orthog_sh_elems, + ind_sh_elems); + + std::vector all_deps; + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + auto fn = take_dispatch_table[mode][src_type_id][ind_type_id]; + + if (fn == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + sycl::event 
take_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs, + src_offset, dst_offset, packed_ind_offsets, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {take_generic_ev}, packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {src, py_ind, dst}, host_task_events); + + return std::make_pair(arg_cleanup_ev, take_generic_ev); +} + +std::pair + usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, + const py::object &py_ind, + const dpctl::tensor::usm_ndarray &val, + int axis_start, + std::uint8_t mode, + sycl::queue &exec_q, + const std::vector &depends) +{ + std::vector ind = parse_py_ind(exec_q, py_ind); + int k = ind.size(); + + if (k == 0) { + // no indices to write to + throw py::value_error("List of indices is empty."); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; + + int dst_nd = dst.get_ndim(); + int val_nd = val.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = std::max(dst_nd, 1); + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of range for array of dimension " + + std::to_string(dst_nd)); + } + if (dst_nd == 0) { + if (val_nd != ind_nd) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + else { + if (val_nd != (dst_nd - k + ind_nd)) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + + std::size_t dst_nelems = dst.get_size(); + + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *val_shape = val.get_shape_raw(); + + bool orthog_shapes_equal(true); + std::size_t orthog_nelems(1); + for (int i = 0; i < (dst_nd - k); ++i) { + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? 
i : i + ind_nd; + + orthog_nelems *= static_cast(dst_shape[idx1]); + orthog_shapes_equal = + orthog_shapes_equal && (dst_shape[idx1] == val_shape[idx2]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + char *dst_data = dst.get_data(); + char *val_data = val.get_data(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst, val})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(val, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + py::ssize_t dst_offset = py::ssize_t(0); + py::ssize_t val_offset = py::ssize_t(0); + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + int dst_typenum = dst.get_typenum(); + int val_typenum = val.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + int val_type_id = array_types.typenum_to_lookup_id(val_typenum); + + if (dst_type_id != val_type_id) { + throw py::type_error("Array data types are not the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + std::size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == val_shape[axis_start + i])) { + throw py::value_error( + "Indices shapes does not match shape of axis in vals."); + } + } + + auto ind_sh_elems = std::max(ind_nd, 1); + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + std::vector ind_offsets; + ind_offsets.reserve(k); + std::vector ind_sh_sts((k + 1) * ind_sh_elems, py::ssize_t(0)); + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_sh_elems, ind_sh_sts.begin()); + } + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() == ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + if (overlap(ind_, dst)) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + char *ind_data = ind_.get_data(); + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); + } + + ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + if (ind_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = 
packed_ind_ptrs_owner.get(); + + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); + py::ssize_t *packed_ind_shapes_strides = + packed_ind_shapes_strides_owner.get(); + + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); + + int orthog_sh_elems = std::max(dst_nd - k, 1); + + // packed_shapes_strides = [dst_shape[:axis] + dst_shape[axis+k:], + // dst_strides[:axis] + dst_strides[axis+k:], + // val_strides[:axis] + + // val_strides[axis+ind.ndim:]] + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); + + // packed_axes_shapes_strides = [dst_shape[axis:axis+k], + // dst_strides[axis:axis+k], + // val_shape[axis:axis+ind.ndim], + // val_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); + py::ssize_t *packed_axes_shapes_strides = + packed_axes_shapes_strides_owner.get(); + + auto dst_strides = dst.get_strides_vector(); + auto val_strides = val.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + dst_shape, val_shape, dst_strides, val_strides, ind_sh_sts, ind_ptrs, + ind_offsets, axis_start, k, ind_nd, dst_nd, orthog_sh_elems, + ind_sh_elems); + + std::vector all_deps; + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + auto fn = put_dispatch_table[mode][dst_type_id][ind_type_id]; + + if (fn == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + sycl::event put_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, dst_data, val_data, packed_ind_ptrs, + dst_offset, val_offset, packed_ind_offsets, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {put_generic_ev}, packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {dst, py_ind, val}, host_task_events); + + return std::make_pair(arg_cleanup_ev, put_generic_ev); +} + +void init_advanced_indexing_dispatch_tables(void) +{ + using namespace td_ns; + + using dpctl::tensor::kernels::indexing::TakeClipFactory; + DispatchTableBuilder + dtb_takeclip; + dtb_takeclip.populate_dispatch_table(take_dispatch_table[CLIP_MODE]); + + using dpctl::tensor::kernels::indexing::TakeWrapFactory; + DispatchTableBuilder + dtb_takewrap; + dtb_takewrap.populate_dispatch_table(take_dispatch_table[WRAP_MODE]); + + using 
dpctl::tensor::kernels::indexing::PutClipFactory; + DispatchTableBuilder dtb_putclip; + dtb_putclip.populate_dispatch_table(put_dispatch_table[CLIP_MODE]); + + using dpctl::tensor::kernels::indexing::PutWrapFactory; + DispatchTableBuilder dtb_putwrap; + dtb_putwrap.populate_dispatch_table(put_dispatch_table[WRAP_MODE]); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp new file mode 100644 index 000000000000..57f0ddda132c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -0,0 +1,73 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
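For context, a minimal Python sketch of driving the `_take`/`_put` bindings registered below in tensor_ctors.cpp; the array contents, shapes, and queue here are illustrative assumptions, while the keyword names and the mode encoding (WRAP_MODE=0, CLIP_MODE=1) come from the sources above:

    # Illustrative only: exercises the low-level bindings once dpctl_ext is built.
    import dpctl
    import dpctl.tensor as dpt
    import dpctl_ext.tensor._tensor_impl as ti_ext

    q = dpctl.SyclQueue()
    src = dpt.arange(10, dtype="i4", sycl_queue=q)
    ind = dpt.asarray([0, 3, 7], dtype="i8", sycl_queue=q)
    dst = dpt.empty(3, dtype="i4", sycl_queue=q)

    # mode=0 -> WRAP_MODE, mode=1 -> CLIP_MODE; indices are passed as a tuple
    # of usm_ndarrays, matching what parse_py_ind() above expects.
    hev, take_ev = ti_ext._take(
        src=src, ind=(ind,), dst=dst, axis_start=0, mode=0, sycl_queue=q
    )
    hev.wait()  # host event keeps arguments and temporaries alive until kernels finish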
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares Python API for implementation functions of +/// dpctl.tensor.take and dpctl.tensor.put +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_take(const dpctl::tensor::usm_ndarray &, + const py::object &, + const dpctl::tensor::usm_ndarray &, + int, + std::uint8_t, + sycl::queue &, + const std::vector & = {}); + +extern std::pair + usm_ndarray_put(const dpctl::tensor::usm_ndarray &, + const py::object &, + const dpctl::tensor::usm_ndarray &, + int, + std::uint8_t, + sycl::queue &, + const std::vector & = {}); + +extern void init_advanced_indexing_dispatch_tables(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 911d75ebd925..c18761031fd0 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -55,7 +55,7 @@ #include "device_support_queries.hpp" // #include "eye_ctor.hpp" // #include "full_ctor.hpp" -// #include "integer_advanced_indexing.hpp" +#include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" // #include "linear_sequences.hpp" // #include "repeat.hpp" @@ -110,8 +110,8 @@ using dpctl::tensor::py_internal::py_as_f_contig; // using dpctl::tensor::py_internal::usm_ndarray_zeros; /* ============== Advanced Indexing ============= */ -// using dpctl::tensor::py_internal::usm_ndarray_put; -// using dpctl::tensor::py_internal::usm_ndarray_take; +using dpctl::tensor::py_internal::usm_ndarray_put; +using dpctl::tensor::py_internal::usm_ndarray_take; // using dpctl::tensor::py_internal::py_extract; // using dpctl::tensor::py_internal::py_mask_positions; @@ -145,7 +145,7 @@ void init_dispatch_tables(void) init_copy_and_cast_usm_to_usm_dispatch_tables(); // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); - // init_advanced_indexing_dispatch_tables(); + init_advanced_indexing_dispatch_tables(); // init_where_dispatch_tables(); return; } @@ -332,23 +332,23 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), // py::arg("depends") = py::list()); - // m.def("_take", &usm_ndarray_take, - // "Takes elements at usm_ndarray indices `ind` and axes starting " - // "at axis `axis_start` from array `src` and copies them " - // "into usm_ndarray `dst` synchronously." - // "Returns a tuple of events: (hev, ev)", - // py::arg("src"), py::arg("ind"), py::arg("dst"), - // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); - - // m.def("_put", &usm_ndarray_put, - // "Puts elements at usm_ndarray indices `ind` and axes starting " - // "at axis `axis_start` into array `dst` from " - // "usm_ndarray `val` synchronously." 
- // "Returns a tuple of events: (hev, ev)", - // py::arg("dst"), py::arg("ind"), py::arg("val"), - // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_take", &usm_ndarray_take, + "Takes elements at usm_ndarray indices `ind` and axes starting " + "at axis `axis_start` from array `src` and copies them " + "into usm_ndarray `dst` synchronously." + "Returns a tuple of events: (hev, ev)", + py::arg("src"), py::arg("ind"), py::arg("dst"), py::arg("axis_start"), + py::arg("mode"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_put", &usm_ndarray_put, + "Puts elements at usm_ndarray indices `ind` and axes starting " + "at axis `axis_start` into array `dst` from " + "usm_ndarray `val` synchronously." + "Returns a tuple of events: (hev, ev)", + py::arg("dst"), py::arg("ind"), py::arg("val"), py::arg("axis_start"), + py::arg("mode"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("_eye", &usm_ndarray_eye, // "Fills input 2D contiguous usm_ndarray `dst` with " From 87e5482f2faf3bff2549b48c999bbab516fce168 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 09:59:18 -0800 Subject: [PATCH 14/38] Use put/take from dpctl_ext.tensor in dpnp --- dpnp/dpnp_iface_indexing.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 6e7ab778299b..6421f39fd4e4 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -52,6 +52,8 @@ from dpctl.tensor._indexing_functions import _get_indexing_mode from dpctl.tensor._numpy_helper import normalize_axis_index +import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._tensor_impl as ti_ext import dpnp # pylint: disable=no-name-in-module @@ -295,7 +297,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): "Input and output allocation queues are not compatible" ) - if ti._array_overlap(x, out): + if ti_ext._array_overlap(x, out): # Allocate a temporary buffer to avoid memory overlapping. 
out = dpt.empty_like(out) else: @@ -304,7 +306,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): _manager = dpu.SequentialOrderManager[q] dep_evs = _manager.submitted_events - h_ev, take_ev = ti._take( + h_ev, take_ev = ti_ext._take( src=x, ind=(inds,), dst=out, @@ -813,7 +815,7 @@ def extract(condition, a): usm_a = dpt.reshape(usm_a, -1) usm_cond = dpt.reshape(usm_cond, -1) - usm_res = dpt.take(usm_a, dpt.nonzero(usm_cond)[0]) + usm_res = dpt_ext.take(usm_a, dpt.nonzero(usm_cond)[0]) else: if usm_cond.shape != usm_a.shape: usm_a = dpt.reshape(usm_a, -1) @@ -1713,7 +1715,7 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"): if axis is None and usm_a.ndim > 1: usm_a = dpt.reshape(usm_a, -1) - dpt.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode) + dpt_ext.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode) if in_usm_a._pointer != usm_a._pointer: # pylint: disable=protected-access in_usm_a[:] = dpt.reshape(usm_a, in_usm_a.shape, copy=False) From b537f30115be31858782e6a7ace1fc52f54c5f9d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 10:33:51 -0800 Subject: [PATCH 15/38] Move full() to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_ctors.py | 169 ++++++++++ .../include/kernels/constructors.hpp | 171 ++++++++++ .../tensor/libtensor/source/full_ctor.cpp | 315 ++++++++++++++++++ .../tensor/libtensor/source/full_ctor.hpp | 60 ++++ .../tensor/libtensor/source/tensor_ctors.cpp | 14 +- 7 files changed, 727 insertions(+), 8 deletions(-) create mode 100644 dpctl_ext/tensor/_ctors.py create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/full_ctor.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/full_ctor.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ae8b72d71873..0c52d766afbf 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -52,7 +52,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 35453dbf9a46..9f4c27608a99 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -27,12 +27,16 @@ # ***************************************************************************** +from dpctl_ext.tensor._ctors import ( + full, +) from dpctl_ext.tensor._indexing_functions import ( put, take, ) __all__ = [ + "full", "put", "take", ] diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py new file mode 100644 index 000000000000..5caa07099c56 --- /dev/null +++ b/dpctl_ext/tensor/_ctors.py @@ -0,0 +1,169 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from numbers import Number + +import dpctl +import dpctl.tensor as dpt +import dpctl.utils +import numpy as np +from dpctl.tensor._data_types import _get_dtype +from dpctl.tensor._device import normalize_queue_device + +import dpctl_ext.tensor._tensor_impl as ti + + +def _cast_fill_val(fill_val, dt): + """ + Casts the Python scalar `fill_val` to another Python type coercible to the + requested data type `dt`, if necessary. + """ + val_type = type(fill_val) + if val_type in [float, complex] and np.issubdtype(dt, np.integer): + return int(fill_val.real) + elif val_type is complex and np.issubdtype(dt, np.floating): + return fill_val.real + elif val_type is int and np.issubdtype(dt, np.integer): + return _to_scalar(fill_val, dt) + else: + return fill_val + + +def _to_scalar(obj, sc_ty): + """A way to convert object to NumPy scalar type. + Raises OverflowError if obj can not be represented + using the requested scalar type. + """ + zd_arr = np.asarray(obj, dtype=sc_ty) + return zd_arr[()] + + +def _validate_fill_value(fill_val): + """Validates that `fill_val` is a numeric or boolean scalar.""" + # TODO: verify if `np.True_` and `np.False_` should be instances of + # Number in NumPy, like other NumPy scalars and like Python bools + # check for `np.bool_` separately as NumPy<2 has no `np.bool` + if not isinstance(fill_val, Number) and not isinstance(fill_val, np.bool_): + raise TypeError( + f"array cannot be filled with scalar of type {type(fill_val)}" + ) + + +def full( + shape, + fill_value, + *, + dtype=None, + order="C", + device=None, + usm_type=None, + sycl_queue=None, +): + """ + Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified + shape and filled with `fill_value`. + + Args: + shape (tuple): + Dimensions of the array to be created. + fill_value (int,float,complex,usm_ndarray): + fill value + dtype (optional): data type of the array. 
Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: ``None`` + order ("C", or "F"): + memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + New array initialized with given value. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + dpctl.utils.validate_usm_type(usm_type, allow_none=True) + + if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)): + if ( + isinstance(fill_value, dpt.usm_ndarray) + and sycl_queue is None + and device is None + ): + sycl_queue = fill_value.sycl_queue + else: + sycl_queue = normalize_queue_device( + sycl_queue=sycl_queue, device=device + ) + X = dpt.asarray( + fill_value, + dtype=dtype, + order=order, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + return dpt.copy(dpt.broadcast_to(X, shape), order=order) + else: + _validate_fill_value(fill_value) + + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + usm_type = usm_type if usm_type is not None else "device" + dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value)) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + fill_value = _cast_fill_val(fill_value, dtype) + + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp new file mode 100644 index 000000000000..dfd1b889aafe --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -0,0 +1,171 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
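A short usage sketch of the `full` constructor added in dpctl_ext/tensor/_ctors.py; the shapes and fill values are illustrative, and the keyword arguments follow the docstring above:

    # Illustrative only.
    import dpctl
    import dpctl_ext.tensor as dpt_ext

    q = dpctl.SyclQueue()

    # Scalar fill value: allocates the result and submits _full_usm_ndarray.
    a = dpt_ext.full((2, 3), 7, dtype="i4", sycl_queue=q)

    # Sequence fill value: takes the asarray + broadcast_to + copy path instead.
    b = dpt_ext.full((2, 3), [1, 2, 3], sycl_queue=q)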
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor constructors. +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/offset_utils.hpp" +#include "utils/strided_iters.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace constructors +{ + +using dpctl::tensor::ssize_t; + +/*! + @defgroup CtorKernels + */ + +template +class full_strided_kernel; + +using namespace dpctl::tensor::offset_utils; + +/* ================ Full ================== */ + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param fill_v Value to fill the array with + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue &q, + std::size_t nelems, + dstTy fill_v, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + dstTy *p = reinterpret_cast(dst_p); + cgh.fill(p, fill_v, nelems); + }); + + return fill_ev; +} + +template +class FullStridedFunctor +{ +private: + Ty *p = nullptr; + Ty fill_v; + IndexerT indexer; + +public: + FullStridedFunctor(Ty *p_, const Ty &fill_v_, const IndexerT &indexer_) + : p(p_), fill_v(fill_v_), indexer(indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + auto offset = indexer(id.get(0)); + p[offset] = fill_v; + } +}; + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. 
+ * @param nd Array dimensionality + * @param nelems Length of the sequence + * @param shape_strides Kernel accessible USM pointer to packed shape and + * strides of array. + * @param fill_v Value to fill the array with + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_strided_impl(sycl::queue &q, + int nd, + std::size_t nelems, + const ssize_t *shape_strides, + dstTy fill_v, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + dstTy *dst_tp = reinterpret_cast(dst_p); + + using dpctl::tensor::offset_utils::StridedIndexer; + const StridedIndexer strided_indexer(nd, 0, shape_strides); + + sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = full_strided_kernel; + using Impl = FullStridedFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(dst_tp, fill_v, strided_indexer)); + }); + + return fill_ev; +} + +} // namespace constructors +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp new file mode 100644 index 000000000000..e1f61be4a12a --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -0,0 +1,315 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
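The strided path above writes `fill_v` through a `StridedIndexer`, which maps each flat work-item id to a memory offset computed from the packed shape and strides. A plain-Python approximation of that mapping (illustrative only; the real indexer lives in utils/offset_utils.hpp):

    # Illustrative only: how a flat id maps to a strided element offset.
    def strided_offset(flat_id, shape, strides):
        # Unravel flat_id over `shape` in C order and dot with `strides`
        # (strides expressed in elements, as in the packed shape_strides).
        offset = 0
        for extent, stride in zip(reversed(shape), reversed(strides)):
            offset += (flat_id % extent) * stride
            flat_id //= extent
        return offset

    # Example: a 2x3 F-contiguous array has element strides (1, 2).
    offsets = [strided_offset(i, (2, 3), (1, 2)) for i in range(6)]
    assert offsets == [0, 2, 4, 1, 3, 5]  # each element is visited exactly once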
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "full_ctor.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue &, + std::size_t, + const py::object &, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v = py::cast(py_value); + + sycl::event fill_ev; + + if constexpr (sizeof(dstTy) == sizeof(char)) { + const auto memset_val = sycl::bit_cast(fill_v); + fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + } + else { + bool is_zero = false; + if constexpr (sizeof(dstTy) == 1) { + is_zero = (std::uint8_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 2) { + is_zero = + (std::uint16_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 4) { + is_zero = + (std::uint32_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 8) { + is_zero = + (std::uint64_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 16) { + struct UInt128 + { + + constexpr UInt128() : v1{}, v2{} {} + UInt128(const UInt128 &) = default; + + operator bool() const + { + return bool(!v1) && bool(!v2); + } + + std::uint64_t v1; + std::uint64_t v2; + }; + is_zero = static_cast(sycl::bit_cast(fill_v)); + } + + if (is_zero) { + static constexpr int memset_val = 0; + fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + } + else { + using dpctl::tensor::kernels::constructors::full_contig_impl; + + fill_ev = + full_contig_impl(exec_q, nelems, fill_v, dst_p, depends); + } + } + + return fill_ev; +} + +template +struct FullContigFactory +{ + fnT get() + { + fnT f = full_contig_impl; + return f; + } +}; + +typedef sycl::event (*full_strided_fn_ptr_t)(sycl::queue &, + int, + std::size_t, + py::ssize_t *, + const py::object &, + char *, + const std::vector &); + +/*! 
+ * @brief Function to submit kernel to fill given strided memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nd Array dimensionality + * @param nelems Length of the sequence + * @param shape_strides Kernel accessible USM pointer to packed shape and + * strides of array. + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_strided_impl(sycl::queue &exec_q, + int nd, + std::size_t nelems, + py::ssize_t *shape_strides, + const py::object &py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v = py::cast(py_value); + + using dpctl::tensor::kernels::constructors::full_strided_impl; + sycl::event fill_ev = full_strided_impl( + exec_q, nd, nelems, shape_strides, fill_v, dst_p, depends); + + return fill_ev; +} + +template +struct FullStridedFactory +{ + fnT get() + { + fnT f = full_strided_impl; + return f; + } +}; + +static full_contig_fn_ptr_t full_contig_dispatch_vector[td_ns::num_types]; +static full_strided_fn_ptr_t full_strided_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_full(const py::object &py_value, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // py_value should be coercible into data type of dst + + py::ssize_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + char *dst_data = dst.get_data(); + + if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) { + auto fn = full_contig_dispatch_vector[dst_typeid]; + + sycl::event full_contig_event = + fn(exec_q, static_cast(dst_nelems), py_value, dst_data, + depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {full_contig_event}), + full_contig_event); + } + else { + int nd = dst.get_ndim(); + auto const &dst_shape = dst.get_shape_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + auto fn = full_strided_dispatch_vector[dst_typeid]; + + std::vector host_task_events; + host_task_events.reserve(2); + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, dst_shape, dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + py::ssize_t *shape_strides = shape_strides_owner.get(); + + const sycl::event &full_strided_ev = + fn(exec_q, nd, dst_nelems, shape_strides, py_value, dst_data, + {copy_shape_ev}); + + // free shape_strides + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {full_strided_ev}, shape_strides_owner); + 
host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {dst}, host_task_events), + full_strided_ev); + } +} + +void init_full_ctor_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(full_contig_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(full_strided_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp new file mode 100644 index 000000000000..d664b2013506 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp @@ -0,0 +1,60 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
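The contiguous fill in full_ctor.cpp above short-circuits to a memset whenever the bit pattern of the cast fill value is all zeros (the `is_zero` checks built on `sycl::bit_cast`). A small NumPy illustration of which values qualify (illustrative only):

    # Illustrative only: the all-zero-bit-pattern test behind the memset fast path.
    import numpy as np

    def has_all_zero_bits(value, dtype):
        return not any(np.asarray(value, dtype=dtype).tobytes())

    print(has_all_zero_bits(0.0, np.float64))    # True  -> memset(0)
    print(has_all_zero_bits(-0.0, np.float64))   # False -> sign bit set, fill kernel
    print(has_all_zero_bits(0, np.int32))        # True  -> memset(0)
    print(has_all_zero_bits(0j, np.complex128))  # True  -> memset(0)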
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_full(const py::object &py_value, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_full_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index c18761031fd0..c72c0b49622a 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -54,7 +54,7 @@ // #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" // #include "eye_ctor.hpp" -// #include "full_ctor.hpp" +#include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" // #include "linear_sequences.hpp" @@ -103,7 +103,7 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* ================ Full ================== */ -// using dpctl::tensor::py_internal::usm_ndarray_full; +using dpctl::tensor::py_internal::usm_ndarray_full; /* ================ Zeros ================== */ @@ -159,7 +159,7 @@ void init_dispatch_vectors(void) // init_copy_for_reshape_dispatch_vectors(); // init_copy_for_roll_dispatch_vectors(); // init_linear_sequences_dispatch_vectors(); - // init_full_ctor_dispatch_vectors(); + init_full_ctor_dispatch_vectors(); // init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); // init_triul_ctor_dispatch_vectors(); @@ -327,10 +327,10 @@ PYBIND11_MODULE(_tensor_impl, m) // "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), // py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_full_usm_ndarray", &usm_ndarray_full, - // "Populate usm_ndarray `dst` with given fill_value.", - // py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_full_usm_ndarray", &usm_ndarray_full, + "Populate usm_ndarray `dst` with given fill_value.", + py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); m.def("_take", &usm_ndarray_take, "Takes elements at usm_ndarray indices `ind` and axes starting " From d50f263f089dfd52edb4daa15edd3f86807965e5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:06:00 -0800 Subject: [PATCH 16/38] Use full and _full_usm_ndarray from dpctl_ext in dpnp --- dpnp/dpnp_algo/dpnp_fill.py | 6 ++++-- dpnp/dpnp_container.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index 112ea3af0fdb..f7e6f0f608b1 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -32,12 +32,14 @@ import dpctl.utils as dpu from dpctl.tensor._ctors import _cast_fill_val from dpctl.tensor._tensor_impl import ( - _copy_usm_ndarray_into_usm_ndarray, - _full_usm_ndarray, _zeros_usm_ndarray, ) import dpnp +from dpctl_ext.tensor._tensor_impl import ( + _copy_usm_ndarray_into_usm_ndarray, + 
_full_usm_ndarray, +) def dpnp_fill(arr, val): diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index 4975db17c717..b13bf96cda28 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -38,6 +38,7 @@ import dpctl.tensor as dpt import dpctl.utils as dpu +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_array import dpnp_array @@ -228,7 +229,7 @@ def full( fill_value = fill_value.get_array() """Creates `dpnp_array` having a specified shape, filled with fill_value.""" - array_obj = dpt.full( + array_obj = dpt_ext.full( shape, fill_value, dtype=dtype, From f189dc540477ceadf35dcb127325056c5e0c406b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:22:55 -0800 Subject: [PATCH 17/38] Update .gitignore to ignore .so files in dpctl_ext --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 5d2725d3186f..4ae07ccbbdb9 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,5 @@ dpnp/**/*.cpython*.so dpnp/**/*.pyd *~ core + +dpctl_ext/**/*.cpython*.so From f9a181721784c843907c16e2e1d5569c487cf9e3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:23:51 -0800 Subject: [PATCH 18/38] Move _zeros_usm_ndarray to dpctl_ext --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../tensor/libtensor/source/tensor_ctors.cpp | 12 +- .../tensor/libtensor/source/zeros_ctor.cpp | 168 ++++++++++++++++++ .../tensor/libtensor/source/zeros_ctor.hpp | 59 ++++++ 4 files changed, 234 insertions(+), 7 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 0c52d766afbf..cb468b9a226d 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -53,7 +53,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index c72c0b49622a..b55439162f90 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -64,7 +64,7 @@ #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" // #include "where.hpp" -// #include "zeros_ctor.hpp" +#include "zeros_ctor.hpp" namespace py = pybind11; @@ -107,7 +107,7 @@ using dpctl::tensor::py_internal::usm_ndarray_full; /* ================ Zeros ================== */ -// using dpctl::tensor::py_internal::usm_ndarray_zeros; +using dpctl::tensor::py_internal::usm_ndarray_zeros; /* ============== Advanced Indexing ============= */ using dpctl::tensor::py_internal::usm_ndarray_put; @@ -160,7 +160,7 @@ void init_dispatch_vectors(void) // init_copy_for_roll_dispatch_vectors(); // init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); - // init_zeros_ctor_dispatch_vectors(); + init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); // init_triul_ctor_dispatch_vectors(); @@ 
-323,9 +323,9 @@ PYBIND11_MODULE(_tensor_impl, m) // synchronously.", py::arg("src"), py::arg("dst"), // py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, - // "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, + "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); m.def("_full_usm_ndarray", &usm_ndarray_full, "Populate usm_ndarray `dst` with given fill_value.", diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp new file mode 100644 index 000000000000..4558743b3c22 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -0,0 +1,168 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "zeros_ctor.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +typedef sycl::event (*zeros_contig_fn_ptr_t)(sycl::queue &, + std::size_t, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with zeros. 
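+ * The fill is implemented as a single handler::memset submission over the
+ * destination allocation, which is why only contiguous layouts are handled
+ * here.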
+ * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event zeros_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + char *dst_p, + const std::vector &depends) +{ + + static constexpr int memset_val(0); + sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + + return fill_ev; +} + +template +struct ZerosContigFactory +{ + fnT get() + { + fnT f = zeros_contig_impl; + return f; + } +}; + +static zeros_contig_fn_ptr_t zeros_contig_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + py::ssize_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + char *dst_data = dst.get_data(); + + if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) { + auto fn = zeros_contig_dispatch_vector[dst_typeid]; + + sycl::event zeros_contig_event = + fn(exec_q, static_cast(dst_nelems), dst_data, depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {zeros_contig_event}), + zeros_contig_event); + } + else { + throw std::runtime_error( + "Only population of contiguous usm_ndarray objects is supported."); + } +} + +void init_zeros_ctor_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(zeros_contig_dispatch_vector); + + return; +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp new file mode 100644 index 000000000000..51270a3443cc --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp @@ -0,0 +1,59 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_zeros_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From 4b8505acf111ec2636afa0d2a9a25cf8677e02c7 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:25:05 -0800 Subject: [PATCH 19/38] Use _zeros_usm_ndarray from dpctl_ext in dpnp_fill.py --- dpnp/dpnp_algo/dpnp_fill.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index f7e6f0f608b1..0d6640c3b8b5 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -31,14 +31,12 @@ import dpctl.tensor as dpt import dpctl.utils as dpu from dpctl.tensor._ctors import _cast_fill_val -from dpctl.tensor._tensor_impl import ( - _zeros_usm_ndarray, -) import dpnp from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, _full_usm_ndarray, + _zeros_usm_ndarray, ) From 61106b2e208d7f331bebc3335a49bc23212510c1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:39:35 -0800 Subject: [PATCH 20/38] Move linear-sequence implementations to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/constructors.hpp | 178 ++++++++++ .../libtensor/source/linear_sequences.cpp | 312 ++++++++++++++++++ .../libtensor/source/linear_sequences.hpp | 69 ++++ .../tensor/libtensor/source/tensor_ctors.cpp | 38 +-- 5 files changed, 579 insertions(+), 20 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index cb468b9a226d..af0e2a7aa49f 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -48,7 +48,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp - # 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index dfd1b889aafe..20775b071ea8 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -58,11 +58,189 @@ using dpctl::tensor::ssize_t; @defgroup CtorKernels */ +template +class linear_sequence_step_kernel; +template +class linear_sequence_affine_kernel; template class full_strided_kernel; +// template class eye_kernel; using namespace dpctl::tensor::offset_utils; +template +class LinearSequenceStepFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty step_v; + +public: + LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) + : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + p[i] = Ty{start_v.real() + i * step_v.real(), + start_v.imag() + i * step_v.imag()}; + } + else { + p[i] = start_v + i * step_v; + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting value and + * increment. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start_v Typed starting value of the sequence + * @param step_v Typed increment of the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty step_v, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>{nelems}, + LinearSequenceStepFunctor(array_data, start_v, step_v)); + }); + + return lin_space_step_event; +} + +// Constructor to populate tensor with linear sequence defined by +// start and and data + +template +class LinearSequenceAffineFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty end_v; + std::size_t n; + +public: + LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, std::size_t den) + : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), + n((den == 0) ? 
1 : den) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + wTy wc = wTy(i) / n; + wTy w = wTy(n - i) / n; + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using reT = typename Ty::value_type; + auto _w = static_cast(w); + auto _wc = static_cast(wc); + auto re_comb = sycl::fma(start_v.real(), _w, reT(0)); + re_comb = + sycl::fma(end_v.real(), _wc, + re_comb); // start_v.real() * _w + end_v.real() * _wc; + auto im_comb = + sycl::fma(start_v.imag(), _w, + reT(0)); // start_v.imag() * _w + end_v.imag() * _wc; + im_comb = sycl::fma(end_v.imag(), _wc, im_comb); + Ty affine_comb = Ty{re_comb, im_comb}; + p[i] = affine_comb; + } + else if constexpr (std::is_floating_point::value) { + Ty _w = static_cast(w); + Ty _wc = static_cast(wc); + auto affine_comb = + sycl::fma(start_v, _w, Ty(0)); // start_v * w + end_v * wc; + affine_comb = sycl::fma(end_v, _wc, affine_comb); + p[i] = affine_comb; + } + else { + using dpctl::tensor::type_utils::convert_impl; + auto affine_comb = start_v * w + end_v * wc; + p[i] = convert_impl(affine_comb); + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting and end values. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence. + * @param start_v Stating value of the sequence. + * @param end_v End-value of the sequence. + * @param include_endpoint Whether the end-value is included in the sequence. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty end_v, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const bool device_supports_doubles = + exec_q.get_device().has(sycl::aspect::fp64); + const std::size_t den = (include_endpoint) ? nelems - 1 : nelems; + + sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + if (device_supports_doubles) { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + else { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + }); + + return lin_space_affine_event; +} + /* ================ Full ================== */ /*! diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp new file mode 100644 index 000000000000..02c4a8ad0fa1 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp @@ -0,0 +1,312 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpnp4pybind11.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "linear_sequences.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +// Constructor to populate tensor with linear sequence defined by +// start and step data + +typedef sycl::event (*lin_space_step_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &step, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting value and increment + * given as Python objects. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start Starting value of the sequence as Python object. Must be + * convertible to array element data type `Ty`. + * @param step Increment of the sequence as Python object. Must be convertible + * to array element data type `Ty`. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
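+ * The Python objects are converted to the element type `Ty` with py::cast
+ * before the typed kernel from kernels/constructors.hpp is submitted.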
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &step, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty step_v = py::cast(step); + + using dpctl::tensor::kernels::constructors::lin_space_step_impl; + + auto lin_space_step_event = lin_space_step_impl( + exec_q, nelems, start_v, step_v, array_data, depends); + + return lin_space_step_event; +} + +typedef sycl::event (*lin_space_affine_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &end, + bool include_endpoint, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting and end values given + * as Python objects. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param start Stating value of the sequence as Python object. Must be + * convertible to array data element type `Ty`. + * @param end End-value of the sequence as Python object. Must be convertible + * to array data element type `Ty`. + * @param include_endpoint Whether the end-value is included in the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &end, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty end_v = py::cast(end); + + using dpctl::tensor::kernels::constructors::lin_space_affine_impl; + + auto lin_space_affine_event = lin_space_affine_impl( + exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); + + return lin_space_affine_event; +} + +using dpctl::utils::keep_args_alive; + +static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types]; + +static lin_space_affine_fn_ptr_t + lin_space_affine_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_linear_sequence_step(const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_step_event; + + auto fn = lin_space_step_dispatch_vector[dst_typeid]; 
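+    // fn is the implementation registered for the destination type id; it
+    // casts `start` and `dt` to the element type and submits the fill kernel.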
+ + linspace_step_event = + fn(exec_q, static_cast(len), start, dt, dst_data, depends); + + return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), + linspace_step_event); +} + +std::pair + usm_ndarray_linear_sequence_affine(const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue context is not the same as allocation context"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_affine_event; + + auto fn = lin_space_affine_dispatch_vector[dst_typeid]; + + linspace_affine_event = fn(exec_q, static_cast(len), start, + end, include_endpoint, dst_data, depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {linspace_affine_event}), + linspace_affine_event); +} + +/*! + * @brief Factor to get function pointer of type `fnT` for array with elements + * of type `Ty`. + * @defgroup CtorKernels + */ +template +struct LinSpaceStepFactory +{ + fnT get() + { + fnT f = lin_space_step_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for array data type + * `Ty`. + */ +template +struct LinSpaceAffineFactory +{ + fnT get() + { + fnT f = lin_space_affine_impl; + return f; + } +}; + +void init_linear_sequences_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp new file mode 100644 index 000000000000..321cd2f23efe --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp @@ -0,0 +1,69 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair usm_ndarray_linear_sequence_step( + const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern std::pair usm_ndarray_linear_sequence_affine( + const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_linear_sequences_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index b55439162f90..dd660c497f9a 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -57,7 +57,7 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" -// #include "linear_sequences.hpp" +#include "linear_sequences.hpp" // #include "repeat.hpp" #include "simplify_iteration_space.hpp" // #include "triul_ctor.hpp" @@ -98,8 +98,8 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* ============= linear-sequence ==================== */ -// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; -// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; /* ================ Full ================== */ @@ -158,7 +158,7 @@ void init_dispatch_vectors(void) init_copy_as_contig_dispatch_vectors(); // init_copy_for_reshape_dispatch_vectors(); // init_copy_for_roll_dispatch_vectors(); - // init_linear_sequences_dispatch_vectors(); + init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); @@ -300,22 +300,22 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("shifts"), 
py::arg("sycl_queue"), py::arg("depends") = // py::list()); - // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, - // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - // "specified by " - // "starting point `start` and step `dt`. " - // "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("start"), py::arg("dt"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_linspace_step", &usm_ndarray_linear_sequence_step, + "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + "specified by " + "starting point `start` and step `dt`. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("start"), py::arg("dt"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, - // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - // "specified by " - // "starting point `start` and end point `end`. " - // "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("start"), py::arg("end"), py::arg("dst"), - // py::arg("include_endpoint"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, + "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + "specified by " + "starting point `start` and end point `end`. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("start"), py::arg("end"), py::arg("dst"), + py::arg("include_endpoint"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("_copy_numpy_ndarray_into_usm_ndarray", // ©_numpy_ndarray_into_usm_ndarray, From a030579be8525d6f23674d5c9a4a171ab842f500 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:40:33 -0800 Subject: [PATCH 21/38] Use _tensor_impl from dpctl_ext in dpnp_utils_fft.py --- dpnp/fft/dpnp_utils_fft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index 4e2b7aaaf842..c692774a424f 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -42,7 +42,6 @@ from collections.abc import Sequence import dpctl -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( @@ -51,6 +50,7 @@ ) from dpctl.utils import ExecutionPlacementError +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.fft._fft_impl as fi From a1d6fa39ba8607b191177d6acb0ca2f3cf8f49fc Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 03:03:08 -0800 Subject: [PATCH 22/38] Move tril()/triu() to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_ctors.py | 157 +++++++++++ .../include/kernels/constructors.hpp | 138 ++++++++++ .../tensor/libtensor/source/tensor_ctors.cpp | 46 ++-- .../tensor/libtensor/source/triul_ctor.cpp | 253 ++++++++++++++++++ .../tensor/libtensor/source/triul_ctor.hpp | 62 +++++ 7 files changed, 638 insertions(+), 24 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/triul_ctor.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/triul_ctor.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index af0e2a7aa49f..1375c8316754 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -54,7 +54,7 @@ set(_tensor_impl_sources # 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 9f4c27608a99..3c6939eff7a0 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -29,6 +29,8 @@ from dpctl_ext.tensor._ctors import ( full, + tril, + triu, ) from dpctl_ext.tensor._indexing_functions import ( put, @@ -39,4 +41,6 @@ "full", "put", "take", + "tril", + "triu", ] diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 5caa07099c56..a0e7b28e66ff 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -26,6 +26,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** +import operator from numbers import Number import dpctl @@ -167,3 +168,159 @@ def full( hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) _manager.add_event_pair(hev, full_ev) return res + + +def tril(x, /, *, k=0): + """ + Returns the lower triangular part of a matrix (or a stack of matrices) + ``x``. + + The lower triangular part of the matrix is defined as the elements on and + below the specified diagonal ``k``. + + Args: + x (usm_ndarray): + Input array + k (int, optional): + Specifies the diagonal above which to set + elements to zero. If ``k = 0``, the diagonal is the main diagonal. + If ``k < 0``, the diagonal is below the main diagonal. + If ``k > 0``, the diagonal is above the main diagonal. + Default: ``0`` + + Returns: + usm_ndarray: + A lower-triangular array or a stack of lower-triangular arrays. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected argument of type dpctl.tensor.usm_ndarray, " + f"got {type(x)}." + ) + + k = operator.index(k) + + order = "F" if (x.flags.f_contiguous) else "C" + + shape = x.shape + nd = x.ndim + if nd < 2: + raise ValueError("Array dimensions less than 2.") + + q = x.sycl_queue + if k >= shape[nd - 1] - 1: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + elif k < -shape[nd - 2]: + res = dpt.zeros( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + else: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, tril_ev = ti._tril( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, tril_ev) + + return res + + +def triu(x, /, *, k=0): + """ + Returns the upper triangular part of a matrix (or a stack of matrices) + ``x``. + + The upper triangular part of the matrix is defined as the elements on and + above the specified diagonal ``k``. 
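+    Elements strictly below this diagonal are set to zero in the result.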
+ + Args: + x (usm_ndarray): + Input array + k (int, optional): + Specifies the diagonal below which to set + elements to zero. If ``k = 0``, the diagonal is the main diagonal. + If ``k < 0``, the diagonal is below the main diagonal. + If ``k > 0``, the diagonal is above the main diagonal. + Default: ``0`` + + Returns: + usm_ndarray: + An upper-triangular array or a stack of upper-triangular arrays. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected argument of type dpctl.tensor.usm_ndarray, " + f"got {type(x)}." + ) + + k = operator.index(k) + + order = "F" if (x.flags.f_contiguous) else "C" + + shape = x.shape + nd = x.ndim + if nd < 2: + raise ValueError("Array dimensions less than 2.") + + q = x.sycl_queue + if k > shape[nd - 1]: + res = dpt.zeros( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + elif k <= -shape[nd - 2] + 1: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + else: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, triu_ev = ti._triu( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, triu_ev) + + return res diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 20775b071ea8..8d53655b2754 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -343,6 +343,144 @@ sycl::event full_strided_impl(sycl::queue &q, return fill_ev; } +/* =========================== Tril and triu ============================== */ + +// define function type +typedef sycl::event (*tri_fn_ptr_t)(sycl::queue &, + ssize_t, // inner_range //ssize_t + ssize_t, // outer_range + char *, // src_data_ptr + char *, // dst_data_ptr + ssize_t, // nd + ssize_t *, // shape_and_strides + ssize_t, // k + const std::vector &, + const std::vector &); + +/*! + * @brief Function to copy triangular matrices from source stack to destination + * stack. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param inner_range Number of elements in each matrix. + * @param outer_range Number of matrices to copy. + * @param src_p Kernel accessible USM pointer for the source array. + * @param dst_p Kernel accessible USM pointer for the destination array. + * @param nd The array dimensionality of source and destination arrays. + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides of arrays. + * @param k Position of the diagonal above/below which to copy filling the rest + * with zero elements. + * @param depends List of events to wait for before starting computations, if + * any. + * @param additional_depends List of additional events to wait for before + * starting computations, if any. + * + * @return Event to wait on to ensure that computation completes. 
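+ * The stack of matrices is traversed as one flat 1D range of
+ * inner_range * outer_range work items; each item either copies the source
+ * element or writes Ty(0), depending on its position relative to diagonal k.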
+ * @defgroup CtorKernels + */ +template +class tri_kernel; +template +sycl::event tri_impl(sycl::queue &exec_q, + ssize_t inner_range, + ssize_t outer_range, + char *src_p, + char *dst_p, + ssize_t nd, + ssize_t *shape_and_strides, + ssize_t k, + const std::vector &depends, + const std::vector &additional_depends) +{ + static constexpr int d2 = 2; + ssize_t src_s = nd; + ssize_t dst_s = 2 * nd; + ssize_t nd_1 = nd - 1; + ssize_t nd_2 = nd - 2; + Ty *src = reinterpret_cast(src_p); + Ty *dst = reinterpret_cast(dst_p); + + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + sycl::event tri_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + cgh.parallel_for>( + sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) { + ssize_t outer_gid = idx[0] / inner_range; + ssize_t inner_gid = idx[0] - inner_range * outer_gid; + + ssize_t src_inner_offset = 0, dst_inner_offset = 0; + bool to_copy{false}; + + { + using dpctl::tensor::strides::CIndexer_array; + CIndexer_array indexer_i( + {shape_and_strides[nd_2], shape_and_strides[nd_1]}); + indexer_i.set(inner_gid); + const std::array &inner = indexer_i.get(); + src_inner_offset = + inner[0] * shape_and_strides[src_s + nd_2] + + inner[1] * shape_and_strides[src_s + nd_1]; + dst_inner_offset = + inner[0] * shape_and_strides[dst_s + nd_2] + + inner[1] * shape_and_strides[dst_s + nd_1]; + + if constexpr (upper) + to_copy = (inner[0] + k >= inner[1]); + else + to_copy = (inner[0] + k <= inner[1]); + } + + ssize_t src_offset = 0; + ssize_t dst_offset = 0; + { + using dpctl::tensor::strides::CIndexer_vector; + CIndexer_vector outer(nd - d2); + outer.get_displacement( + outer_gid, shape_and_strides, shape_and_strides + src_s, + shape_and_strides + dst_s, src_offset, dst_offset); + } + + src_offset += src_inner_offset; + dst_offset += dst_inner_offset; + + dst[dst_offset] = (to_copy) ? src[src_offset] : Ty(0); + }); + }); + return tri_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ +template +struct TrilGenericFactory +{ + fnT get() + { + fnT f = tri_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. 
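+ * Selects the tri_impl instantiation that performs the upper-triangular
+ * (triu) copy.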
+ * @ingroup CtorKernels + */ +template +struct TriuGenericFactory +{ + fnT get() + { + fnT f = tri_impl; + return f; + } +}; + } // namespace constructors } // namespace kernels } // namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index dd660c497f9a..f2afce105f7f 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -60,7 +60,7 @@ #include "linear_sequences.hpp" // #include "repeat.hpp" #include "simplify_iteration_space.hpp" -// #include "triul_ctor.hpp" +#include "triul_ctor.hpp" #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" // #include "where.hpp" @@ -129,7 +129,7 @@ using dpctl::tensor::py_internal::usm_ndarray_take; /* =========================== Tril and triu ============================== */ -// using dpctl::tensor::py_internal::usm_ndarray_triul; +using dpctl::tensor::py_internal::usm_ndarray_triul; /* =========================== Where ============================== */ @@ -162,7 +162,7 @@ void init_dispatch_vectors(void) init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); - // init_triul_ctor_dispatch_vectors(); + init_triul_ctor_dispatch_vectors(); // populate_masked_extract_dispatch_vectors(); // populate_masked_place_dispatch_vectors(); @@ -388,27 +388,27 @@ PYBIND11_MODULE(_tensor_impl, m) dpctl::tensor::py_internal::default_device_index_type, "Gives default index type supported by device.", py::arg("dev")); - // auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, - // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, - // sycl::queue &exec_q, - // const std::vector depends) - // -> std::pair { - // return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends); - // }; - // m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"), - // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + sycl::queue &exec_q, + const std::vector depends) + -> std::pair { + return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends); + }; + m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"), + py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + py::arg("depends") = py::list()); - // auto triu_fn = [](const dpctl::tensor::usm_ndarray &src, - // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, - // sycl::queue &exec_q, - // const std::vector depends) - // -> std::pair { - // return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends); - // }; - // m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), - // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + auto triu_fn = [](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + sycl::queue &exec_q, + const std::vector depends) + -> std::pair { + return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends); + }; + m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), + py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), // py::arg("cumsum"), py::arg("sycl_queue"), diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp new file mode 100644 index 
000000000000..0890dfdb4766 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp @@ -0,0 +1,253 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include // for std::copy +#include // for std::size_t +#include // for std::make_shared +#include // for std::runtime_error +#include // for std::pair, std::move +#include // for std::vector, std::begin, std::end + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/constructors.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +using dpctl::tensor::kernels::constructors::tri_fn_ptr_t; + +static tri_fn_ptr_t tril_generic_dispatch_vector[td_ns::num_types]; +static tri_fn_ptr_t triu_generic_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_triul(sycl::queue &exec_q, + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + char part, + py::ssize_t k = 0, + const std::vector &depends = {}) +{ + // array dimensions must be the same + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("Array dimensions are not the same."); + } + + if (src_nd < 2) { + throw py::value_error("Array dimensions less than 2."); + } + + // shapes must be the same + 
const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + std::size_t src_nelems(1); + + for (int i = 0; shapes_equal && i < src_nd; ++i) { + src_nelems *= static_cast(src_shape[i]); + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + if (src_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + // TODO: could use a temporary, but this is done by the caller + throw py::value_error("Arrays index overlapping segments of memory"); + } + + auto array_types = td_ns::usm_ndarray_types(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (dst_typeid != src_typeid) { + throw py::value_error("Array dtype are not the same."); + } + + // check same queues + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue context is not the same as allocation contexts"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto src_strides = src.get_strides_vector(); + auto dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd - 2; + const py::ssize_t *shape = src_shape; + + const shT iter_src_strides(std::begin(src_strides), + std::begin(src_strides) + nd); + const shT iter_dst_strides(std::begin(dst_strides), + std::begin(dst_strides) + nd); + + simplify_iteration_space(nd, shape, iter_src_strides, iter_dst_strides, + // output + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + if (src_offset != 0 || dst_offset != 0) { + throw py::value_error("Reversed slice for dst is not supported"); + } + + nd += 2; + + using usm_host_allocatorT = + dpctl::tensor::alloc_utils::usm_host_allocator; + using usmshT = std::vector; + + usm_host_allocatorT allocator(exec_q); + auto shp_host_shape_and_strides = + std::make_shared(3 * nd, allocator); + + std::copy(simplified_shape.begin(), simplified_shape.end(), + shp_host_shape_and_strides->begin()); + (*shp_host_shape_and_strides)[nd - 2] = src_shape[src_nd - 2]; + (*shp_host_shape_and_strides)[nd - 1] = src_shape[src_nd - 1]; + + std::copy(simplified_src_strides.begin(), simplified_src_strides.end(), + shp_host_shape_and_strides->begin() + nd); + (*shp_host_shape_and_strides)[2 * nd - 2] = src_strides[src_nd - 2]; + (*shp_host_shape_and_strides)[2 * nd - 1] = src_strides[src_nd - 1]; + + std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(), + shp_host_shape_and_strides->begin() + 2 * nd); + (*shp_host_shape_and_strides)[3 * nd - 2] = dst_strides[src_nd - 2]; + (*shp_host_shape_and_strides)[3 * nd - 1] = dst_strides[src_nd - 1]; + + auto dev_shape_and_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(3 * nd, + exec_q); + py::ssize_t *dev_shape_and_strides = dev_shape_and_strides_owner.get(); + + const sycl::event 
©_shape_and_strides = exec_q.copy( + shp_host_shape_and_strides->data(), dev_shape_and_strides, 3 * nd); + + py::ssize_t inner_range = src_shape[src_nd - 1] * src_shape[src_nd - 2]; + py::ssize_t outer_range = src_nelems / inner_range; + + sycl::event tri_ev; + if (part == 'l') { + auto fn = tril_generic_dispatch_vector[src_typeid]; + tri_ev = + fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, + dev_shape_and_strides, k, depends, {copy_shape_and_strides}); + } + else { + auto fn = triu_generic_dispatch_vector[src_typeid]; + tri_ev = + fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, + dev_shape_and_strides, k, depends, {copy_shape_and_strides}); + } + + const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(tri_ev); + const auto &ctx = exec_q.get_context(); + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + cgh.host_task( + [shp_host_shape_and_strides = std::move(shp_host_shape_and_strides), + dev_shape_and_strides, ctx]() { + // capture of shp_host_shape_and_strides ensure the underlying + // vector exists for the entire execution of copying kernel + sycl_free_noexcept(dev_shape_and_strides, ctx); + }); + }); + // since host_task now owns USM allocation, release ownership by smart + // pointer + dev_shape_and_strides_owner.release(); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {temporaries_cleanup_ev}), tri_ev); +} + +void init_triul_ctor_dispatch_vectors(void) +{ + + using namespace td_ns; + using dpctl::tensor::kernels::constructors::TrilGenericFactory; + using dpctl::tensor::kernels::constructors::TriuGenericFactory; + + DispatchVectorBuilder dvb1; + dvb1.populate_dispatch_vector(tril_generic_dispatch_vector); + + DispatchVectorBuilder dvb2; + dvb2.populate_dispatch_vector(triu_generic_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp new file mode 100644 index 000000000000..08889df6227f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp @@ -0,0 +1,62 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_triul(sycl::queue &exec_q, + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + char part, + py::ssize_t k = 0, + const std::vector &depends = {}); + +extern void init_triul_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From f1d6e5650910eec6f330b2de902a93a1ae95df5f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 03:05:03 -0800 Subject: [PATCH 23/38] Use tril/triu/_tril from dpctl_ext.tensor in dpnp --- dpnp/dpnp_container.py | 4 ++-- dpnp/linalg/dpnp_utils_linalg.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index b13bf96cda28..c8e28529cd57 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -270,13 +270,13 @@ def ones( def tril(x1, /, *, k=0): """Creates `dpnp_array` as lower triangular part of an input array.""" - array_obj = dpt.tril(dpnp.get_usm_ndarray(x1), k=k) + array_obj = dpt_ext.tril(dpnp.get_usm_ndarray(x1), k=k) return dpnp_array._create_from_usm_ndarray(array_obj) def triu(x1, /, *, k=0): """Creates `dpnp_array` as upper triangular part of an input array.""" - array_obj = dpt.triu(dpnp.get_usm_ndarray(x1), k=k) + array_obj = dpt_ext.triu(dpnp.get_usm_ndarray(x1), k=k) return dpnp_array._create_from_usm_ndarray(array_obj) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 196cd2ae9da5..5fb1c099dde2 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -42,12 +42,12 @@ from typing import NamedTuple -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import normalize_axis_index from numpy import prod +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li from dpnp.dpnp_utils import get_usm_allocations From 668079060d9ece02fbb6887c2313edca9e6ecbef Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 9 Feb 2026 02:47:35 -0800 Subject: [PATCH 24/38] Disable pylint no-name-in-module for dpctl_ext --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 1 + dpnp/dpnp_iface.py | 3 +-- dpnp/dpnp_iface_searching.py | 1 + dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index b63bf61f8dad..d8235b84e2d0 
100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -44,6 +44,7 @@ _validate_dtype, ) +# pylint: disable=no-name-in-module import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 832446c826ba..6220c61db6d9 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -40,6 +40,7 @@ """ # pylint: disable=protected-access +# pylint: disable=no-name-in-module import os @@ -53,8 +54,6 @@ import dpnp from .dpnp_array import dpnp_array - -# pylint: disable=no-name-in-module from .dpnp_utils import ( dpnp_descriptor, map_dtype_to_device, diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index fdbd317d31dd..74fbc9b37d13 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -41,6 +41,7 @@ import dpctl.tensor as dpt +# pylint: disable=no-name-in-module import dpctl_ext.tensor._tensor_impl as dti import dpnp diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 4d8e3cdfbd0d..2de2bc15372c 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -37,6 +37,7 @@ ) from dpctl.utils import ExecutionPlacementError +# pylint: disable=no-name-in-module import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.blas._blas_impl as bi From 263b7175f4aab799cd4fa100602011e8e23d046b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 04:31:01 -0800 Subject: [PATCH 25/38] Add TODO comments --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 2 ++ dpnp/dpnp_iface.py | 2 ++ dpnp/dpnp_iface_searching.py | 2 ++ dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 2 ++ dpnp/scipy/linalg/_utils.py | 2 ++ setup.py | 2 +- 6 files changed, 11 insertions(+), 1 deletion(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index d8235b84e2d0..88abcee5035c 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -45,6 +45,8 @@ ) # pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 6220c61db6d9..50b474014666 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -50,6 +50,8 @@ import numpy from dpctl.tensor._device import normalize_queue_device +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 74fbc9b37d13..16ab633d506b 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -42,6 +42,8 @@ import dpctl.tensor as dpt # pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as dti import dpnp diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 2de2bc15372c..3dfd3c23ee7f 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -38,6 +38,8 @@ from dpctl.utils import ExecutionPlacementError # pylint: disable=no-name-in-module +# TODO: revert to `import 
dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.blas._blas_impl as bi diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py index 8eb9187236bf..ce832d8f4529 100644 --- a/dpnp/scipy/linalg/_utils.py +++ b/dpnp/scipy/linalg/_utils.py @@ -44,6 +44,8 @@ import dpctl.utils as dpu +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li diff --git a/setup.py b/setup.py index a0c54b066dcf..7ffef3bed9d8 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ "dpnp.scipy", "dpnp.scipy.linalg", "dpnp.scipy.special", - # dpctl_ext + # TODO: replace with dpctl; dpctl.tensor "dpctl_ext", "dpctl_ext.tensor", ], From 4130c1b80aa108ca127040a6c4ea15bcaa86173f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 04:53:39 -0800 Subject: [PATCH 26/38] Use default_device_complex_type from dpctl_ext on test_array_api_info.py --- dpnp/tests/test_array_api_info.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dpnp/tests/test_array_api_info.py b/dpnp/tests/test_array_api_info.py index b310192ffc59..32730c8724dc 100644 --- a/dpnp/tests/test_array_api_info.py +++ b/dpnp/tests/test_array_api_info.py @@ -1,9 +1,11 @@ -import numpy import pytest from dpctl import SyclDeviceCreationError, get_devices, select_default_device -from dpctl.tensor._tensor_impl import default_device_complex_type import dpnp + +# TODO: revert to `from dpctl.tensor....` +# when dpnp fully migrates dpctl/tensor +from dpctl_ext.tensor._tensor_impl import default_device_complex_type from dpnp.tests.helper import ( has_support_aspect64, is_win_platform, From 17ca9ab52368f3bbdbfbdf6410b82823c98c53c0 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 06:59:55 -0800 Subject: [PATCH 27/38] Remove unused build_dpctl_ext function --- dpctl_ext/CMakeLists.txt | 80 ---------------------------------------- 1 file changed, 80 deletions(-) diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt index bb33a4f57332..cdb007a2d230 100644 --- a/dpctl_ext/CMakeLists.txt +++ b/dpctl_ext/CMakeLists.txt @@ -122,84 +122,4 @@ set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) set(CMAKE_INSTALL_RPATH "$ORIGIN") -function(build_dpctl_ext _trgt _src _dest) - set(options SYCL) - cmake_parse_arguments(BUILD_DPCTL_EXT "${options}" "RELATIVE_PATH" "" ${ARGN}) - add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src) - set(_cythonize_trgt "${_trgt}_cythonize_pyx") - python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src}) - if(BUILD_DPCTL_EXT_SYCL) - add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src}) - target_compile_options(${_trgt} PRIVATE -fno-sycl-id-queries-fit-in-int) - target_link_options(${_trgt} PRIVATE -fsycl-device-code-split=per_kernel) - if(DPCTL_OFFLOAD_COMPRESS) - target_link_options(${_trgt} PRIVATE --offload-compress) - endif() - if(_dpctl_sycl_targets) - # make fat binary - target_compile_options( - ${_trgt} - PRIVATE ${_dpctl_sycl_target_compile_options} - ) - target_link_options(${_trgt} PRIVATE ${_dpctl_sycl_target_link_options}) - endif() - endif() - target_link_libraries(${_trgt} PRIVATE Python::NumPy) - if(DPCTL_GENERATE_COVERAGE) - target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1) - if(BUILD_DPCTL_EXT_SYCL) - target_compile_options(${_trgt} PRIVATE 
-fno-sycl-use-footer) - endif() - endif() - target_link_libraries(${_trgt} PRIVATE DPCTLSyclInterface) - set(_linker_options "LINKER:${DPCTL_LDFLAGS}") - target_link_options(${_trgt} PRIVATE ${_linker_options}) - get_filename_component(_name_wle ${_generated_src} NAME_WLE) - get_filename_component(_generated_src_dir ${_generated_src} DIRECTORY) - set(_generated_public_h "${_generated_src_dir}/${_name_wle}.h") - set(_generated_api_h "${_generated_src_dir}/${_name_wle}_api.h") - - # TODO: create separate folder inside build folder that contains only - # headers related to this target and appropriate folder structure to - # eliminate shadow dependencies - get_filename_component(_generated_src_dir_dir ${_generated_src_dir} DIRECTORY) - # TODO: do not set directory if we did not generate header - target_include_directories(${_trgt} INTERFACE ${_generated_src_dir_dir}) - set(_rpath_value "$ORIGIN") - if(BUILD_DPCTL_EXT_RELATIVE_PATH) - set(_rpath_value "${_rpath_value}/${BUILD_DPCTL_EXT_RELATIVE_PATH}") - endif() - if(DPCTL_WITH_REDIST) - set(_rpath_value "${_rpath_value}:${_rpath_value}/../../..") - endif() - set_target_properties(${_trgt} PROPERTIES INSTALL_RPATH ${_rpath_value}) - - install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest}) - install( - FILES ${_generated_api_h} - # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` - DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} - OPTIONAL - ) - install( - FILES ${_generated_public_h} - # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` - DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} - OPTIONAL - ) - if(DPCTL_GENERATE_COVERAGE) - get_filename_component(_original_src_dir ${_src} DIRECTORY) - file(RELATIVE_PATH _rel_dir ${CMAKE_SOURCE_DIR} ${_original_src_dir}) - install(FILES ${_generated_src} DESTINATION ${CMAKE_INSTALL_PREFIX}/${_rel_dir}) - endif() - - # Create target with headers only, because python is managing all the - # library imports at runtime - set(_trgt_headers ${_trgt}_headers) - add_library(${_trgt_headers} INTERFACE) - add_dependencies(${_trgt_headers} ${_trgt}) - get_target_property(_trgt_headers_dir ${_trgt} INTERFACE_INCLUDE_DIRECTORIES) - target_include_directories(${_trgt_headers} INTERFACE ${_trgt_headers_dir}) -endfunction() - add_subdirectory(tensor) From 79cb2a45f28f5099701c0728a6def5c8961c5279 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 07:51:45 -0800 Subject: [PATCH 28/38] Apply remarks for CMake files --- dpctl_ext/CMakeLists.txt | 10 ++------- dpctl_ext/tensor/CMakeLists.txt | 38 ++++++++++++++++++--------------- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt index cdb007a2d230..e58693091422 100644 --- a/dpctl_ext/CMakeLists.txt +++ b/dpctl_ext/CMakeLists.txt @@ -27,13 +27,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** -find_package(Python REQUIRED COMPONENTS NumPy) - -# -t is to only Cythonize sources with timestamps newer than existing CXX files (if present) -# -w is to set working directory (and correctly set __pyx_f[] array of filenames) -set(CYTHON_FLAGS "-t -w \"${CMAKE_SOURCE_DIR}\"") -find_package(Cython REQUIRED) - +# TODO: rework this logic to remove current duplication if(WIN32) string( CONCAT WARNING_FLAGS @@ -118,7 +112,7 @@ else() endif() # at build time create include/ directory and copy header files over -set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) +# set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) set(CMAKE_INSTALL_RPATH "$ORIGIN") diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ee8da2e49506..28e7a4cb55f4 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -27,8 +27,10 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** +find_package(Python COMPONENTS Development) + if(WIN32) - if(${CMAKE_VERSION} VERSION_LESS "3.23") + if(${CMAKE_VERSION} VERSION_LESS "3.27") # this is a work-around for target_link_options inserting option after -link option, cause # linker to ignore it. set(CMAKE_CXX_LINK_FLAGS @@ -37,6 +39,7 @@ if(WIN32) endif() endif() +# TODO: reuse this library for dpnp ufunc extension build set(_static_lib_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp ) @@ -67,11 +70,11 @@ add_library(${_static_lib_trgt} STATIC ${_static_lib_sources}) target_include_directories( ${_static_lib_trgt} PRIVATE - ${Python_INCLUDE_DIRS} - ${DPCTL_INCLUDE_DIR} + # ${Python_INCLUDE_DIRS} + # ${Dpctl_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include ) -target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers ${Python_LIBRARIES}) +target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Python) set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON) set(_py_trgts) @@ -94,14 +97,14 @@ set(_no_fast_math_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ) -list( - APPEND _no_fast_math_sources - # ${_elementwise_sources} - # ${_reduction_sources} - # ${_sorting_sources} - # ${_linalg_sources} - # ${_accumulator_sources} -) +#list( +#APPEND _no_fast_math_sources +# ${_elementwise_sources} +# ${_reduction_sources} +# ${_sorting_sources} +# ${_linalg_sources} +# ${_accumulator_sources} +#) foreach(_src_fn ${_no_fast_math_sources}) get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) @@ -114,7 +117,7 @@ endforeach() set(_compiler_definitions "") -set(_linker_options "LINKER:${DPCTL_LDFLAGS}") +set(_linker_options "LINKER:${DPNP_LDFLAGS}") foreach(python_module_name ${_py_trgts}) target_compile_options( ${python_module_name} @@ -124,6 +127,7 @@ foreach(python_module_name ${_py_trgts}) ${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel ) + # TODO: expand DPCTL_OFFLOAD_COMPRESS to the whole dpnp level if(DPCTL_OFFLOAD_COMPRESS) target_link_options(${python_module_name} PRIVATE --offload-compress) endif() @@ -149,22 +153,22 @@ foreach(python_module_name ${_py_trgts}) PRIVATE -fprofile-instr-generate -fcoverage-mapping ) endif() - if(_dpctl_sycl_targets) + if(_dpnp_sycl_targets) # make fat binary target_compile_options( ${python_module_name} - PRIVATE 
${_dpctl_sycl_target_compile_options} + PRIVATE ${_dpnp_sycl_target_compile_options} ) target_link_options( ${python_module_name} - PRIVATE ${_dpctl_sycl_target_link_options} + PRIVATE ${_dpnp_sycl_target_link_options} ) endif() # TODO: update source so they reference individual libraries instead of # dpctl4pybind11.hpp. It will allow to simplify dependency tree # NOTE: dpctl C-API is resolved at runtime via Python # target_link_libraries(${python_module_name} PRIVATE DpctlCAPI) - if(DPCTL_WITH_REDIST) + if(DPNP_WITH_REDIST) set_target_properties( ${python_module_name} PROPERTIES INSTALL_RPATH "$ORIGIN/../../../.." From 4bf080edc0e5d277441fe39b31733571fbad0de3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 08:30:03 -0800 Subject: [PATCH 29/38] Apply remarks for c++ files --- .../include/kernels/copy_and_cast.hpp | 18 ++++----------- .../include/kernels/copy_as_contiguous.hpp | 19 ++++----------- .../source/copy_and_cast_usm_to_usm.cpp | 23 ++++--------------- .../source/copy_and_cast_usm_to_usm.hpp | 11 ++------- .../libtensor/source/copy_as_contig.cpp | 14 ++++------- .../libtensor/source/copy_as_contig.hpp | 11 ++------- .../source/device_support_queries.cpp | 13 ++++------- .../source/device_support_queries.hpp | 12 ++-------- .../source/simplify_iteration_space.cpp | 12 ++++------ .../source/simplify_iteration_space.hpp | 11 +++------ .../tensor/libtensor/source/tensor_ctors.cpp | 10 ++++---- 11 files changed, 43 insertions(+), 111 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp index a07d311a7fcb..d6001a11e471 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -33,11 +33,12 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include +#include #include #include #include -#include +#include #include "dpctl_tensor_types.hpp" #include "kernels/alignment.hpp" @@ -45,13 +46,7 @@ #include "utils/sycl_utils.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace copy_and_cast +namespace dpctl::tensor::kernels::copy_and_cast { using dpctl::tensor::ssize_t; @@ -1282,7 +1277,4 @@ struct CopyForRollNDShiftFactory } }; -} // namespace copy_and_cast -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::kernels::copy_and_cast diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp index b4f367448758..37126a22dc64 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -33,11 +33,12 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include +#include #include #include #include -#include +#include #include "dpctl_tensor_types.hpp" #include "kernels/alignment.hpp" @@ -45,13 +46,7 @@ #include "utils/sycl_utils.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace copy_as_contig +namespace dpctl::tensor::kernels::copy_as_contig { using dpctl::tensor::ssize_t; @@ -648,8 +643,4 @@ struct AsCContigNDBatchOfSquareMatricesFactory return as_c_contiguous_nd_batch_of_square_matrices_impl; } }; - -} // namespace copy_as_contig -} // 
namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::kernels::copy_as_contig diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp index 0458aa75ac32..3d20be02f885 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -32,21 +32,15 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -#include -#include +#include #include -#include -#include #include -#include -#include +#include #include +#include #include "dpnp4pybind11.hpp" -#include -#include #include -#include #include "kernels/copy_and_cast.hpp" #include "utils/memory_overlap.hpp" @@ -54,16 +48,11 @@ #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" #include "copy_as_contig.hpp" #include "simplify_iteration_space.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace td_ns = dpctl::tensor::type_dispatch; @@ -305,6 +294,4 @@ void init_copy_and_cast_usm_to_usm_dispatch_tables(void) dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp index d2a2dcaf7b85..d2e07b08d38f 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp @@ -38,13 +38,8 @@ #include #include "dpnp4pybind11.hpp" -#include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair copy_usm_ndarray_into_usm_ndarray( @@ -55,6 +50,4 @@ extern std::pair copy_usm_ndarray_into_usm_ndarray( extern void init_copy_and_cast_usm_to_usm_dispatch_tables(); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp index 53b39ff5874c..7105202fe2ff 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp @@ -32,10 +32,11 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -#include #include +#include #include #include +#include #include #include @@ -54,13 +55,10 @@ #include "copy_as_contig.hpp" #include "simplify_iteration_space.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { +namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::kernels::copy_as_contig:: @@ -753,6 +751,4 @@ std::pair ascontig_ev); } -} // end of namespace py_internal -} // end of namespace tensor -} // end of namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp index 2de67098b7fa..bfe3159c8813 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp +++ 
b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp @@ -32,14 +32,9 @@ #include #include "dpnp4pybind11.hpp" -#include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { std::pair @@ -56,6 +51,4 @@ std::pair void init_copy_as_contig_dispatch_vectors(void); -} // end of namespace py_internal -} // end of namespace tensor -} // end of namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp index 51eb7dba1b6c..97a8ba83831e 100644 --- a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp @@ -39,13 +39,11 @@ #include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { +namespace py = pybind11; + namespace { @@ -61,7 +59,6 @@ std::string _default_device_fp_type(const sycl::device &d) int get_numpy_major_version() { - namespace py = pybind11; py::module_ numpy = py::module_::import("numpy"); py::str version_string = numpy.attr("__version__"); @@ -179,6 +176,4 @@ std::string default_device_index_type(const py::object &arg) return _default_device_index_type(d); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp index 6ea01dcd49d7..adde7aefe3dd 100644 --- a/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp @@ -36,14 +36,8 @@ #include "dpnp4pybind11.hpp" #include -#include -#include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::string default_device_fp_type(const py::object &); @@ -53,6 +47,4 @@ extern std::string default_device_bool_type(const py::object &); extern std::string default_device_complex_type(const py::object &); extern std::string default_device_index_type(const py::object &); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp index 2526f022e0ac..e3cff701ed50 100644 --- a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp @@ -34,15 +34,13 @@ #include "simplify_iteration_space.hpp" #include "utils/strided_iters.hpp" +#include #include +#include #include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace py = pybind11; @@ -539,6 +537,4 @@ std::vector _unravel_index_f(py::ssize_t flat_index, return mi; } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp index d3448ee1f5fd..acbc833157d1 100644 --- a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp @@ -36,11 +36,7 @@ #include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace 
dpctl::tensor::py_internal { namespace py = pybind11; @@ -125,6 +121,5 @@ std::vector _unravel_index_c(py::ssize_t, std::vector const &); std::vector _unravel_index_f(py::ssize_t, std::vector const &); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 911d75ebd925..be69ee1a8c7e 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -32,15 +32,17 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -#include -#include -#include +// #include +// #include +// #include #include #include #include -#include +// #include +#include #include #include +#include #include "dpnp4pybind11.hpp" From cfa6cd69735591e79ca3437cc05c326ce115ffc9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 11:31:42 -0800 Subject: [PATCH 30/38] Remove linear-sequence implementations --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/constructors.hpp | 177 ---------- .../libtensor/source/linear_sequences.cpp | 312 ------------------ .../libtensor/source/linear_sequences.hpp | 69 ---- .../tensor/libtensor/source/tensor_ctors.cpp | 38 +-- 5 files changed, 19 insertions(+), 579 deletions(-) delete mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.cpp delete mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 1375c8316754..baf8ef5ce5f6 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -48,7 +48,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 8d53655b2754..f43614e13766 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -58,189 +58,12 @@ using dpctl::tensor::ssize_t; @defgroup CtorKernels */ -template -class linear_sequence_step_kernel; -template -class linear_sequence_affine_kernel; template class full_strided_kernel; // template class eye_kernel; using namespace dpctl::tensor::offset_utils; -template -class LinearSequenceStepFunctor -{ -private: - Ty *p = nullptr; - Ty start_v; - Ty step_v; - -public: - LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) - : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) - { - } - - void operator()(sycl::id<1> wiid) const - { - auto i = wiid.get(0); - using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { - p[i] = Ty{start_v.real() + i * step_v.real(), - start_v.imag() + i * step_v.imag()}; - } - else { - p[i] = start_v + i * step_v; - } - } -}; - -/*! 
- * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by typed starting value and - * increment. - * - * @param q Sycl queue to which the kernel is submitted - * @param nelems Length of the sequence - * @param start_v Typed starting value of the sequence - * @param step_v Typed increment of the sequence - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. - * @defgroup CtorKernels - */ -template -sycl::event lin_space_step_impl(sycl::queue &exec_q, - std::size_t nelems, - Ty start_v, - Ty step_v, - char *array_data, - const std::vector &depends) -{ - dpctl::tensor::type_utils::validate_type_for_device(exec_q); - sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>{nelems}, - LinearSequenceStepFunctor(array_data, start_v, step_v)); - }); - - return lin_space_step_event; -} - -// Constructor to populate tensor with linear sequence defined by -// start and and data - -template -class LinearSequenceAffineFunctor -{ -private: - Ty *p = nullptr; - Ty start_v; - Ty end_v; - std::size_t n; - -public: - LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, std::size_t den) - : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), - n((den == 0) ? 1 : den) - { - } - - void operator()(sycl::id<1> wiid) const - { - auto i = wiid.get(0); - wTy wc = wTy(i) / n; - wTy w = wTy(n - i) / n; - using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { - using reT = typename Ty::value_type; - auto _w = static_cast(w); - auto _wc = static_cast(wc); - auto re_comb = sycl::fma(start_v.real(), _w, reT(0)); - re_comb = - sycl::fma(end_v.real(), _wc, - re_comb); // start_v.real() * _w + end_v.real() * _wc; - auto im_comb = - sycl::fma(start_v.imag(), _w, - reT(0)); // start_v.imag() * _w + end_v.imag() * _wc; - im_comb = sycl::fma(end_v.imag(), _wc, im_comb); - Ty affine_comb = Ty{re_comb, im_comb}; - p[i] = affine_comb; - } - else if constexpr (std::is_floating_point::value) { - Ty _w = static_cast(w); - Ty _wc = static_cast(wc); - auto affine_comb = - sycl::fma(start_v, _w, Ty(0)); // start_v * w + end_v * wc; - affine_comb = sycl::fma(end_v, _wc, affine_comb); - p[i] = affine_comb; - } - else { - using dpctl::tensor::type_utils::convert_impl; - auto affine_comb = start_v * w + end_v * wc; - p[i] = convert_impl(affine_comb); - } - } -}; - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by typed starting and end values. - * - * @param exec_q Sycl queue to which kernel is submitted for execution. - * @param nelems Length of the sequence. - * @param start_v Stating value of the sequence. - * @param end_v End-value of the sequence. - * @param include_endpoint Whether the end-value is included in the sequence. - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. 
- * @defgroup CtorKernels - */ -template -sycl::event lin_space_affine_impl(sycl::queue &exec_q, - std::size_t nelems, - Ty start_v, - Ty end_v, - bool include_endpoint, - char *array_data, - const std::vector &depends) -{ - dpctl::tensor::type_utils::validate_type_for_device(exec_q); - - const bool device_supports_doubles = - exec_q.get_device().has(sycl::aspect::fp64); - const std::size_t den = (include_endpoint) ? nelems - 1 : nelems; - - sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - if (device_supports_doubles) { - using KernelName = linear_sequence_affine_kernel; - using Impl = LinearSequenceAffineFunctor; - - cgh.parallel_for(sycl::range<1>{nelems}, - Impl(array_data, start_v, end_v, den)); - } - else { - using KernelName = linear_sequence_affine_kernel; - using Impl = LinearSequenceAffineFunctor; - - cgh.parallel_for(sycl::range<1>{nelems}, - Impl(array_data, start_v, end_v, den)); - } - }); - - return lin_space_affine_event; -} - /* ================ Full ================== */ /*! diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp deleted file mode 100644 index 02c4a8ad0fa1..000000000000 --- a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp +++ /dev/null @@ -1,312 +0,0 @@ -//***************************************************************************** -// Copyright (c) 2026, Intel Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// - Neither the name of the copyright holder nor the names of its contributors -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -// THE POSSIBILITY OF SUCH DAMAGE. 
-//***************************************************************************** -// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#include "dpnp4pybind11.hpp" -#include -#include -#include -#include -#include -#include -#include - -#include "kernels/constructors.hpp" -#include "utils/output_validation.hpp" -#include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" - -#include "linear_sequences.hpp" - -namespace py = pybind11; -namespace td_ns = dpctl::tensor::type_dispatch; - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -// Constructor to populate tensor with linear sequence defined by -// start and step data - -typedef sycl::event (*lin_space_step_fn_ptr_t)( - sycl::queue &, - std::size_t, // num_elements - const py::object &start, - const py::object &step, - char *, // dst_data_ptr - const std::vector &); - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by starting value and increment - * given as Python objects. - * - * @param q Sycl queue to which the kernel is submitted - * @param nelems Length of the sequence - * @param start Starting value of the sequence as Python object. Must be - * convertible to array element data type `Ty`. - * @param step Increment of the sequence as Python object. Must be convertible - * to array element data type `Ty`. - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. - * @defgroup CtorKernels - */ -template -sycl::event lin_space_step_impl(sycl::queue &exec_q, - std::size_t nelems, - const py::object &start, - const py::object &step, - char *array_data, - const std::vector &depends) -{ - Ty start_v = py::cast(start); - Ty step_v = py::cast(step); - - using dpctl::tensor::kernels::constructors::lin_space_step_impl; - - auto lin_space_step_event = lin_space_step_impl( - exec_q, nelems, start_v, step_v, array_data, depends); - - return lin_space_step_event; -} - -typedef sycl::event (*lin_space_affine_fn_ptr_t)( - sycl::queue &, - std::size_t, // num_elements - const py::object &start, - const py::object &end, - bool include_endpoint, - char *, // dst_data_ptr - const std::vector &); - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by starting and end values given - * as Python objects. - * - * @param exec_q Sycl queue to which kernel is submitted for execution. - * @param nelems Length of the sequence - * @param start Stating value of the sequence as Python object. Must be - * convertible to array data element type `Ty`. - * @param end End-value of the sequence as Python object. Must be convertible - * to array data element type `Ty`. - * @param include_endpoint Whether the end-value is included in the sequence - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. 
- * @defgroup CtorKernels - */ -template -sycl::event lin_space_affine_impl(sycl::queue &exec_q, - std::size_t nelems, - const py::object &start, - const py::object &end, - bool include_endpoint, - char *array_data, - const std::vector &depends) -{ - Ty start_v = py::cast(start); - Ty end_v = py::cast(end); - - using dpctl::tensor::kernels::constructors::lin_space_affine_impl; - - auto lin_space_affine_event = lin_space_affine_impl( - exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); - - return lin_space_affine_event; -} - -using dpctl::utils::keep_args_alive; - -static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types]; - -static lin_space_affine_fn_ptr_t - lin_space_affine_dispatch_vector[td_ns::num_types]; - -std::pair - usm_ndarray_linear_sequence_step(const py::object &start, - const py::object &dt, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends) -{ - // dst must be 1D and C-contiguous - // start, end should be coercible into data type of dst - - if (dst.get_ndim() != 1) { - throw py::value_error( - "usm_ndarray_linspace: Expecting 1D array to populate"); - } - - if (!dst.is_c_contiguous()) { - throw py::value_error( - "usm_ndarray_linspace: Non-contiguous arrays are not supported"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { - throw py::value_error( - "Execution queue is not compatible with the allocation queue"); - } - - dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); - - auto array_types = td_ns::usm_ndarray_types(); - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - py::ssize_t len = dst.get_shape(0); - if (len == 0) { - // nothing to do - return std::make_pair(sycl::event{}, sycl::event{}); - } - - char *dst_data = dst.get_data(); - sycl::event linspace_step_event; - - auto fn = lin_space_step_dispatch_vector[dst_typeid]; - - linspace_step_event = - fn(exec_q, static_cast(len), start, dt, dst_data, depends); - - return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), - linspace_step_event); -} - -std::pair - usm_ndarray_linear_sequence_affine(const py::object &start, - const py::object &end, - const dpctl::tensor::usm_ndarray &dst, - bool include_endpoint, - sycl::queue &exec_q, - const std::vector &depends) -{ - // dst must be 1D and C-contiguous - // start, end should be coercible into data type of dst - - if (dst.get_ndim() != 1) { - throw py::value_error( - "usm_ndarray_linspace: Expecting 1D array to populate"); - } - - if (!dst.is_c_contiguous()) { - throw py::value_error( - "usm_ndarray_linspace: Non-contiguous arrays are not supported"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { - throw py::value_error( - "Execution queue context is not the same as allocation context"); - } - - dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); - - auto array_types = td_ns::usm_ndarray_types(); - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - py::ssize_t len = dst.get_shape(0); - if (len == 0) { - // nothing to do - return std::make_pair(sycl::event{}, sycl::event{}); - } - - char *dst_data = dst.get_data(); - sycl::event linspace_affine_event; - - auto fn = lin_space_affine_dispatch_vector[dst_typeid]; - - linspace_affine_event = fn(exec_q, static_cast(len), start, - end, include_endpoint, dst_data, depends); - - return std::make_pair( - keep_args_alive(exec_q, {dst}, 
{linspace_affine_event}), - linspace_affine_event); -} - -/*! - * @brief Factor to get function pointer of type `fnT` for array with elements - * of type `Ty`. - * @defgroup CtorKernels - */ -template -struct LinSpaceStepFactory -{ - fnT get() - { - fnT f = lin_space_step_impl; - return f; - } -}; - -/*! - * @brief Factory to get function pointer of type `fnT` for array data type - * `Ty`. - */ -template -struct LinSpaceAffineFactory -{ - fnT get() - { - fnT f = lin_space_affine_impl; - return f; - } -}; - -void init_linear_sequences_dispatch_vectors(void) -{ - using namespace td_ns; - - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); - - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); -} - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp deleted file mode 100644 index 321cd2f23efe..000000000000 --- a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp +++ /dev/null @@ -1,69 +0,0 @@ -//***************************************************************************** -// Copyright (c) 2026, Intel Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// - Neither the name of the copyright holder nor the names of its contributors -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -// THE POSSIBILITY OF SUCH DAMAGE. 
-//***************************************************************************** -// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#pragma once -#include -#include -#include - -#include "dpnp4pybind11.hpp" -#include - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -extern std::pair usm_ndarray_linear_sequence_step( - const py::object &start, - const py::object &dt, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = {}); - -extern std::pair usm_ndarray_linear_sequence_affine( - const py::object &start, - const py::object &end, - const dpctl::tensor::usm_ndarray &dst, - bool include_endpoint, - sycl::queue &exec_q, - const std::vector &depends = {}); - -extern void init_linear_sequences_dispatch_vectors(void); - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index f2afce105f7f..7e4253c0cbb6 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -57,7 +57,7 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" -#include "linear_sequences.hpp" +// #include "linear_sequences.hpp" // #include "repeat.hpp" #include "simplify_iteration_space.hpp" #include "triul_ctor.hpp" @@ -98,8 +98,8 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* ============= linear-sequence ==================== */ -using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; -using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; /* ================ Full ================== */ @@ -158,7 +158,7 @@ void init_dispatch_vectors(void) init_copy_as_contig_dispatch_vectors(); // init_copy_for_reshape_dispatch_vectors(); // init_copy_for_roll_dispatch_vectors(); - init_linear_sequences_dispatch_vectors(); + // init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); @@ -300,22 +300,20 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = // py::list()); - m.def("_linspace_step", &usm_ndarray_linear_sequence_step, - "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - "specified by " - "starting point `start` and step `dt`. " - "Returns a tuple of events: (ht_event, comp_event)", - py::arg("start"), py::arg("dt"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, - "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - "specified by " - "starting point `start` and end point `end`. 
" - "Returns a tuple of events: (ht_event, comp_event)", - py::arg("start"), py::arg("end"), py::arg("dst"), - py::arg("include_endpoint"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); + // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, + // "Fills input 1D contiguous usm_ndarray `dst` with linear + // sequence " "specified by " "starting point `start` and step + // `dt`. " "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("start"), py::arg("dt"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, + // "Fills input 1D contiguous usm_ndarray `dst` with linear + // sequence " "specified by " "starting point `start` and end + // point `end`. " "Returns a tuple of events: (ht_event, + // comp_event)", py::arg("start"), py::arg("end"), py::arg("dst"), + // py::arg("include_endpoint"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); // m.def("_copy_numpy_ndarray_into_usm_ndarray", // ©_numpy_ndarray_into_usm_ndarray, From 087a2ecbfff6262224ff115c9948202ecf45e6ba Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 11:58:15 -0800 Subject: [PATCH 31/38] Use _tensor_impl from dpctl_ext in dpnp --- dpnp/dpnp_algo/dpnp_fill.py | 3 +++ dpnp/dpnp_iface.py | 1 + dpnp/dpnp_iface_indexing.py | 11 +++++++---- dpnp/fft/dpnp_utils_fft.py | 14 +++++++++++--- dpnp/linalg/dpnp_utils_linalg.py | 3 +++ dpnp/scipy/linalg/_utils.py | 1 + 6 files changed, 26 insertions(+), 7 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index 0d6640c3b8b5..4137a2794747 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -33,6 +33,9 @@ from dpctl.tensor._ctors import _cast_fill_val import dpnp + +# TODO: revert to `from dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, _full_usm_ndarray, diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 50b474014666..533bdc36c617 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -50,6 +50,7 @@ import numpy from dpctl.tensor._device import normalize_queue_device +# pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 6421f39fd4e4..a01a036e16cc 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -45,7 +45,6 @@ from collections.abc import Iterable import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._copy_utils import _nonzero_impl @@ -53,7 +52,11 @@ from dpctl.tensor._numpy_helper import normalize_axis_index import dpctl_ext.tensor as dpt_ext -import dpctl_ext.tensor._tensor_impl as ti_ext + +# pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti import dpnp # pylint: disable=no-name-in-module @@ -297,7 +300,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): "Input and output allocation queues are not compatible" ) - if ti_ext._array_overlap(x, out): + if ti._array_overlap(x, out): # Allocate a temporary buffer to avoid memory overlapping. 
out = dpt.empty_like(out) else: @@ -306,7 +309,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): _manager = dpu.SequentialOrderManager[q] dep_evs = _manager.submitted_events - h_ev, take_ev = ti_ext._take( + h_ev, take_ev = ti._take( src=x, ind=(inds,), dst=out, diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index c692774a424f..60f89a933284 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -42,6 +42,11 @@ from collections.abc import Sequence import dpctl + +# pylint: disable=no-name-in-module +# TODO: remove it when ti.__linspace_step +# is migrated to dpctl_ext/tensor +import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( @@ -50,7 +55,10 @@ ) from dpctl.utils import ExecutionPlacementError -import dpctl_ext.tensor._tensor_impl as ti +# pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti_ext import dpnp import dpnp.backend.extensions.fft._fft_impl as fi @@ -196,7 +204,7 @@ def _compute_result(dsc, a, out, forward, c2c, out_strides): if ( out is not None and out.strides == tuple(out_strides) - and not ti._array_overlap(a_usm, dpnp.get_usm_ndarray(out)) + and not ti_ext._array_overlap(a_usm, dpnp.get_usm_ndarray(out)) ): res_usm = dpnp.get_usm_ndarray(out) result = out @@ -524,7 +532,7 @@ def _truncate_or_pad(a, shape, axes): ) _manager = dpu.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events - ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_ev, copy_ev = ti_ext._copy_usm_ndarray_into_usm_ndarray( src=dpnp.get_usm_ndarray(a), dst=z.get_array()[tuple(index)], sycl_queue=exec_q, diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 5fb1c099dde2..171ac38a141c 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -47,6 +47,9 @@ from dpctl.tensor._numpy_helper import normalize_axis_index from numpy import prod +# pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py index ce832d8f4529..665a4e1595ad 100644 --- a/dpnp/scipy/linalg/_utils.py +++ b/dpnp/scipy/linalg/_utils.py @@ -44,6 +44,7 @@ import dpctl.utils as dpu +# pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti From f4492fbc8048d2fcc598a089715b85ed6504f02d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 12:28:16 -0800 Subject: [PATCH 32/38] Add missing include --- .../tensor/libtensor/include/kernels/constructors.hpp | 3 ++- .../include/kernels/integer_advanced_indexing.hpp | 4 +--- dpctl_ext/tensor/libtensor/source/full_ctor.cpp | 8 ++++---- dpctl_ext/tensor/libtensor/source/full_ctor.hpp | 5 ++++- .../libtensor/source/integer_advanced_indexing.cpp | 10 ++++++---- .../libtensor/source/integer_advanced_indexing.hpp | 6 +++++- dpctl_ext/tensor/libtensor/source/triul_ctor.cpp | 3 +-- dpctl_ext/tensor/libtensor/source/triul_ctor.hpp | 2 ++ dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp | 7 ++----- dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp | 3 ++- 10 files changed, 29 insertions(+), 22 deletions(-) diff --git 
a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index f43614e13766..3bc4a1d16271 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -33,8 +33,9 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include #include +#include #include diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index 1b2c79d2e2a5..d0ec5227731c 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -33,12 +33,10 @@ //===----------------------------------------------------------------------===// #pragma once -#include -#include #include -#include #include #include +#include #include "dpctl_tensor_types.hpp" #include "utils/indexing_utils.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp index e1f61be4a12a..279bb9f470bc 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -32,15 +32,15 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// -#include #include -#include -#include +#include +#include #include #include +#include + #include "dpnp4pybind11.hpp" -#include #include #include "kernels/constructors.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp index d664b2013506..43b30fc8341c 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp @@ -33,13 +33,16 @@ //===--------------------------------------------------------------------===// #pragma once -#include #include #include +#include + #include "dpnp4pybind11.hpp" #include +namespace py = pybind11; + namespace dpctl { namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp index 244acfe3955f..ed72096bff8f 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -34,21 +34,23 @@ //===----------------------------------------------------------------------===// #include -#include #include #include +#include +#include #include -#include +#include #include +#include + +#include #include "dpnp4pybind11.hpp" -#include #include #include #include "kernels/integer_advanced_indexing.hpp" #include "utils/memory_overlap.hpp" -#include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp index 57f0ddda132c..5dfbd2f04d93 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -34,13 +34,17 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include #include #include +#include + #include "dpnp4pybind11.hpp" #include +namespace py = 
pybind11; + namespace dpctl { namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp index 0890dfdb4766..f0f592c52938 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp @@ -34,8 +34,8 @@ #include // for std::copy #include // for std::size_t +#include // for std::begin, std::end #include // for std::make_shared -#include // for std::runtime_error #include // for std::pair, std::move #include // for std::vector, std::begin, std::end @@ -47,7 +47,6 @@ #include "kernels/constructors.hpp" #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" -#include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp index 08889df6227f..c61d95eef7ec 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp @@ -40,6 +40,8 @@ #include "dpnp4pybind11.hpp" #include +namespace py = pybind11; + namespace dpctl { namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp index 4558743b3c22..d7370f55e8cb 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -32,21 +32,18 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// -#include #include #include -#include #include #include +#include + #include "dpnp4pybind11.hpp" -#include #include -#include "kernels/constructors.hpp" #include "utils/output_validation.hpp" #include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" #include "zeros_ctor.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp index 51270a3443cc..ec3bce994ef6 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp @@ -33,10 +33,11 @@ //===--------------------------------------------------------------------===// #pragma once -#include #include #include +#include + #include "dpnp4pybind11.hpp" #include From b367c9fd3b4b538e132afb5838584137a6f8a25c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 12:36:24 -0800 Subject: [PATCH 33/38] Use nested namespace syntax --- .../libtensor/include/kernels/constructors.hpp | 13 ++----------- .../include/kernels/integer_advanced_indexing.hpp | 13 ++----------- dpctl_ext/tensor/libtensor/source/full_ctor.cpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/full_ctor.hpp | 10 ++-------- .../libtensor/source/integer_advanced_indexing.cpp | 10 ++-------- .../libtensor/source/integer_advanced_indexing.hpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/triul_ctor.cpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/triul_ctor.hpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp | 10 ++-------- 10 files changed, 20 insertions(+), 86 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 3bc4a1d16271..47726319b3e1 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp 
+++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -44,13 +44,7 @@ #include "utils/strided_iters.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace constructors +namespace dpctl::tensor::kernels::constructors { using dpctl::tensor::ssize_t; @@ -305,7 +299,4 @@ struct TriuGenericFactory } }; -} // namespace constructors -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::kernels::constructors diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index d0ec5227731c..7351502dbc11 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -43,13 +43,7 @@ #include "utils/offset_utils.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace indexing +namespace dpctl::tensor::kernels::indexing { using dpctl::tensor::ssize_t; @@ -419,7 +413,4 @@ struct PutClipFactory } }; -} // namespace indexing -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::kernels::indexing diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp index 279bb9f470bc..ca4a17f28f77 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -53,11 +53,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { using dpctl::utils::keep_args_alive; @@ -310,6 +306,4 @@ void init_full_ctor_dispatch_vectors(void) dvb2.populate_dispatch_vector(full_strided_dispatch_vector); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp index 43b30fc8341c..18c15de87a40 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp @@ -43,11 +43,7 @@ namespace py = pybind11; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair @@ -58,6 +54,4 @@ extern std::pair extern void init_full_ctor_dispatch_vectors(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp index ed72096bff8f..77322381d517 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -62,11 +62,7 @@ #define WRAP_MODE 0 #define CLIP_MODE 1 -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace td_ns = dpctl::tensor::type_dispatch; @@ -816,6 +812,4 @@ void init_advanced_indexing_dispatch_tables(void) dtb_putwrap.populate_dispatch_table(put_dispatch_table[WRAP_MODE]); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp 
b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp index 5dfbd2f04d93..bc0136288e1c 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -45,11 +45,7 @@ namespace py = pybind11; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair @@ -72,6 +68,4 @@ extern std::pair extern void init_advanced_indexing_dispatch_tables(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp index f0f592c52938..13e909196460 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp @@ -54,11 +54,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { using dpctl::utils::keep_args_alive; @@ -247,6 +243,4 @@ void init_triul_ctor_dispatch_vectors(void) dvb2.populate_dispatch_vector(triu_generic_dispatch_vector); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp index c61d95eef7ec..47cc4ce8892d 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp @@ -42,11 +42,7 @@ namespace py = pybind11; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair @@ -59,6 +55,4 @@ extern std::pair extern void init_triul_ctor_dispatch_vectors(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp index d7370f55e8cb..b9a2e01bea4a 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -50,11 +50,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { using dpctl::utils::keep_args_alive; @@ -160,6 +156,4 @@ void init_zeros_ctor_dispatch_vectors(void) return; } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp index ec3bce994ef6..51a1903a0f36 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp @@ -41,11 +41,7 @@ #include "dpnp4pybind11.hpp" #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair @@ -55,6 +51,4 @@ extern std::pair extern void init_zeros_ctor_dispatch_vectors(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal From 3113716a13a131dc44f819140489176be5ff7cba Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 02:50:47 -0800 Subject: [PATCH 34/38] Add missing include complex --- 
dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp | 1 + .../libtensor/include/kernels/integer_advanced_indexing.hpp | 4 +++- dpctl_ext/tensor/libtensor/source/full_ctor.cpp | 2 ++ .../tensor/libtensor/source/integer_advanced_indexing.cpp | 2 ++ dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp | 2 ++ 5 files changed, 10 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 47726319b3e1..22189ee3129c 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -34,6 +34,7 @@ #pragma once #include +#include #include #include diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index 7351502dbc11..7be2b3ea8591 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -33,11 +33,13 @@ //===----------------------------------------------------------------------===// #pragma once +#include #include -#include #include #include +#include + #include "dpctl_tensor_types.hpp" #include "utils/indexing_utils.hpp" #include "utils/offset_utils.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp index ca4a17f28f77..aef57836666e 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -32,6 +32,7 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// +#include #include #include #include @@ -41,6 +42,7 @@ #include #include "dpnp4pybind11.hpp" +#include #include #include "kernels/constructors.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp index 77322381d517..925cc2e895ed 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -34,6 +34,7 @@ //===----------------------------------------------------------------------===// #include +#include #include #include #include @@ -46,6 +47,7 @@ #include #include "dpnp4pybind11.hpp" +#include #include #include diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp index b9a2e01bea4a..2eb05e49f382 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -32,6 +32,7 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// +#include #include #include #include @@ -40,6 +41,7 @@ #include #include "dpnp4pybind11.hpp" +#include #include #include "utils/output_validation.hpp" From 978afee9115d8feaebe72c80ce3e827e13c66770 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 03:13:50 -0800 Subject: [PATCH 35/38] Add missing memory and queue checks --- .../libtensor/source/copy_as_contig.cpp | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp index 7105202fe2ff..bbee24c95d4d 100644 --- 
a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp @@ -189,6 +189,12 @@ std::pair "Execution queue is not compatible with allocation queues"); } + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + const auto &src_strides_vec = src.get_strides_vector(); if (src_nd >= 2) { @@ -314,6 +320,12 @@ std::pair "Execution queue is not compatible with allocation queues"); } + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + const auto &src_strides_vec = src.get_strides_vector(); if (src_nd >= 2) { @@ -459,6 +471,12 @@ std::pair "Execution queue is not compatible with allocation queues"); } + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + if (nelems == 0) { // nothing to do return std::make_pair(sycl::event(), sycl::event()); @@ -624,6 +642,20 @@ std::pair throw py::value_error("Unexpected destination array layout"); } + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // check that arrays do not overlap, and concurrent copying is safe. 
+ auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + int src_typenum = src.get_typenum(); int dst_typenum = dst.get_typenum(); From 19e93b99c7c2c238f1b697dfefe5b70525370819 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 18 Feb 2026 04:34:09 -0800 Subject: [PATCH 36/38] Update .gitignore to ignore .so files in dpctl_ext --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 5d2725d3186f..0cfebe53f623 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,7 @@ dpnp/**/*.cpython*.so dpnp/**/*.pyd *~ core + +# TODO: revert to `dpctl/` +# when dpnp fully migrates dpctl/tensor +dpctl_ext/**/*.cpython*.so From b111e49b784168180c835569d5dbe97958521f16 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 18 Feb 2026 04:35:23 -0800 Subject: [PATCH 37/38] Remove unused includes in tensor_ctors.cpp --- dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index be69ee1a8c7e..54d6adbc8f6e 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -32,18 +32,15 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -// #include -// #include -// #include -#include -#include -#include -// #include #include #include #include #include +#include +#include +#include + #include "dpnp4pybind11.hpp" // #include "accumulators.hpp" From c082224e07df5e4d4960112ef5ec4e5faef2a452 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 18 Feb 2026 05:40:59 -0800 Subject: [PATCH 38/38] Use Python::Module for dpctl_ext static lib to avoid libpython dependency --- dpctl_ext/tensor/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 28e7a4cb55f4..ed69b4f10cba 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -27,7 +27,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** -find_package(Python COMPONENTS Development) +find_package(Python COMPONENTS Development.Module) if(WIN32) if(${CMAKE_VERSION} VERSION_LESS "3.27") @@ -74,7 +74,7 @@ target_include_directories( # ${Dpctl_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include ) -target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Python) +target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Module) set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON) set(_py_trgts)
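For context on the final change above: CMake's `Development.Module` component exposes the `Python::Module` target, which supplies the Python headers (and, on Windows, the import library) without linking the full `libpython`, so a static helper library that is only ever folded into extension modules does not acquire a hard interpreter dependency. Linking `Python::Python` instead would force every consumer to carry that libpython link requirement, which is what the last commit removes. A minimal sketch of the linkage pattern follows; the project, target, and source names (`example_ext`, `_example_tensor_common`, `helpers.cpp`, `module.cpp`) are illustrative assumptions, not files from this series.

    # Illustrative sketch only; names are assumptions, not part of this patch series.
    cmake_minimum_required(VERSION 3.21)
    project(example_ext LANGUAGES CXX)

    # Development.Module provides Python::Module: Python headers plus, on Windows,
    # the import library, but no hard libpython link on Linux/macOS -- the
    # interpreter that loads the extension resolves those symbols at runtime.
    find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)
    find_package(pybind11 CONFIG REQUIRED)

    # Static helper library that is only ever linked into extension modules.
    add_library(_example_tensor_common STATIC helpers.cpp)
    target_link_libraries(_example_tensor_common
        PRIVATE pybind11::headers Python::Module)
    set_target_properties(_example_tensor_common
        PROPERTIES POSITION_INDEPENDENT_CODE ON)

    # The extension module itself consumes the static helper library.
    pybind11_add_module(_example_impl MODULE module.cpp)
    target_link_libraries(_example_impl PRIVATE _example_tensor_common)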