From afc461e1e57c611ac1103cf21d877a523144f765 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Fri, 2 Jan 2026 07:47:19 -0500
Subject: [PATCH 1/7] Initial support for CUDA + factorizations

---
 test/cuda/factorizations.jl | 499 ++++++++++++++++++++++++++++++++++++
 test/runtests.jl            |   1 +
 2 files changed, 500 insertions(+)
 create mode 100644 test/cuda/factorizations.jl

diff --git a/test/cuda/factorizations.jl b/test/cuda/factorizations.jl
new file mode 100644
index 000000000..d44bb878a
--- /dev/null
+++ b/test/cuda/factorizations.jl
@@ -0,0 +1,499 @@
+using Adapt, CUDA, cuTENSOR
+using Test, TestExtras
+using TensorKit
+using LinearAlgebra: LinearAlgebra
+using MatrixAlgebraKit: diagview
+const CUDAExt = Base.get_extension(TensorKit, :TensorKitCUDAExt)
+@assert !isnothing(CUDAExt)
+const CuTensorMap = getglobal(CUDAExt, :CuTensorMap)
+const curand = getglobal(CUDAExt, :curand)
+const curandn = getglobal(CUDAExt, :curandn)
+const curand! = getglobal(CUDAExt, :curand!)
+using CUDA: rand as curand, rand! as curand!, randn as curandn, randn! as curandn!
+
+@isdefined(TestSetup) || include("../setup.jl")
+using .TestSetup
+
+spacelist = try
+    if ENV["CI"] == "true"
+        println("Detected running on CI")
+        if Sys.iswindows()
+            (Vtr, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VIB_diag)
+        elseif Sys.isapple()
+            (Vtr, Vℤ₃, VfU₁, VfSU₂, VIB_M)
+        else
+            (Vtr, VU₁, VCU₁, VSU₂, VfSU₂, VIB_diag, VIB_M)
+        end
+    else
+        (Vtr, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂, VIB_diag, VIB_M)
+    end
+catch
+    (Vtr, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂, VIB_diag, VIB_M)
+end
+
+eltypes = (Float32, ComplexF64)
+
+for V in spacelist
+    I = sectortype(first(V))
+    Istr = TensorKit.type_repr(I)
+    println("---------------------------------------")
+    println("Factorizations with symmetry: $Istr")
+    println("---------------------------------------")
+    @timedtestset "Factorizations with symmetry: $Istr" verbose = true begin
+        V1, V2, V3, V4, V5 = V
+        W = V1 ⊗ V2
+        @assert !isempty(blocksectors(W))
+        @assert !isempty(intersect(blocksectors(V4), blocksectors(W)))
+
+        @testset "QR decomposition" begin
+            for T in eltypes,
+                    t in (
+                        CUDA.rand(T, W, W),
+                        #CUDA.rand(T, W, W)',
+                        CUDA.rand(T, W, V4),
+                        #CUDA.rand(T, V4, W)',
+                        DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
+                    )
+
+                Q, R = @constinferred qr_full(t)
+                @test Q * R ≈ t
+                @test isunitary(Q)
+
+                Q, R = @constinferred qr_compact(t)
+                @test Q * R ≈ t
+                @test isisometric(Q)
+
+                Q, R = @constinferred left_orth(t)
+                @test Q * R ≈ t
+                @test isisometric(Q)
+
+                N = @constinferred qr_null(t)
+                @test isisometric(N)
+                @test norm(N' * t) ≈ 0 atol = 100 * eps(norm(t))
+
+                N = @constinferred left_null(t)
+                @test isisometric(N)
+                @test norm(N' * t) ≈ 0 atol = 100 * eps(norm(t))
+            end
+
+            # empty tensor
+            for T in eltypes
+                t = CUDA.rand(T, V1 ⊗ V2, zerospace(V1))
+
+                Q, R = @constinferred qr_full(t)
+                @test Q * R ≈ t
+                @test isunitary(Q)
+                @test dim(R) == dim(t) == 0
+
+                Q, R = @constinferred qr_compact(t)
+                @test Q * R ≈ t
+                @test isisometric(Q)
+                @test dim(Q) == dim(R) == dim(t)
+
+                Q, R = @constinferred left_orth(t)
+                @test Q * R ≈ t
+                @test isisometric(Q)
+                @test dim(Q) == dim(R) == dim(t)
+
+                N = @constinferred qr_null(t)
+                @test isunitary(N)
+                @test norm(N' * t) ≈ 0 atol = 100 * eps(norm(t))
+            end
+        end
+
+        @testset "LQ decomposition" begin
+            for T in eltypes,
+                    t in (
+                        CUDA.rand(T, W, W),
+                        #CUDA.rand(T, W, W)',
+                        CUDA.rand(T, W, V4),
+                        #CUDA.rand(T, V4, W)',
+                        DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
+                    )
+
+                L, Q = @constinferred lq_full(t)
+                @test L * Q ≈ t
+                @test isunitary(Q)
+
+                L, Q = @constinferred lq_compact(t)
+                @test L * Q ≈ t
+                @test isisometric(Q; side = :right)
+
+                L, Q = @constinferred right_orth(t)
+                @test L * Q ≈ t
+                @test isisometric(Q; side = :right)
+
+                Nᴴ = @constinferred lq_null(t)
+                @test isisometric(Nᴴ; side = :right)
+                @test norm(t * Nᴴ') ≈ 0 atol = 100 * eps(norm(t))
+            end
+
+            for T in eltypes
+                # empty tensor
+                t = CUDA.rand(T, zerospace(V1), V1 ⊗ V2)
+
+                L, Q = @constinferred lq_full(t)
+                @test L * Q ≈ t
+                @test isunitary(Q)
+                @test dim(L) == dim(t) == 0
+
+                L, Q = @constinferred lq_compact(t)
+                @test L * Q ≈ t
+                @test isisometric(Q; side = :right)
+                @test dim(Q) == dim(L) == dim(t)
+
+                L, Q = @constinferred right_orth(t)
+                @test L * Q ≈ t
+                @test isisometric(Q; side = :right)
+                @test dim(Q) == dim(L) == dim(t)
+
+                Nᴴ = @constinferred lq_null(t)
+                @test isunitary(Nᴴ)
+                @test norm(t * Nᴴ') ≈ 0 atol = 100 * eps(norm(t))
+            end
+        end
+
+        @testset "Polar decomposition" begin
+            for T in eltypes,
+                    t in (
+                        CUDA.rand(T, W, W),
+                        #CUDA.rand(T, W, W)',
+                        CUDA.rand(T, W, V4),
+                        #CUDA.rand(T, V4, W)',
+                        DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
+                    )
+
+                @assert domain(t) ≾ codomain(t)
+                w, p = @constinferred left_polar(t)
+                @test w * p ≈ t
+                @test isisometric(w)
+                @test isposdef(p)
+
+                w, p = @constinferred left_orth(t; alg = :polar)
+                @test w * p ≈ t
+                @test isisometric(w)
+            end
+
+            for T in eltypes,
+                    t in (
+                        CUDA.rand(T, W, W),
+                        #CUDA.rand(T, W, W)',
+                        CUDA.rand(T, V4, W),
+                        #CUDA.rand(T, W, V4)',
+                    )
+
+                @assert codomain(t) ≾ domain(t)
+                p, wᴴ = @constinferred right_polar(t)
+                @test p * wᴴ ≈ t
+                @test isisometric(wᴴ; side = :right)
+                @test isposdef(p)
+
+                p, wᴴ = @constinferred right_orth(t; alg = :polar)
+                @test p * wᴴ ≈ t
+                @test isisometric(wᴴ; side = :right)
+            end
+        end
+
+        @testset "SVD" begin
+            for T in eltypes,
+                    t in (
+                        CUDA.rand(T, W, W),
+                        #CUDA.rand(T, W, W)',
+                        CUDA.rand(T, W, V4),
+                        CUDA.rand(T, V4, W),
+                        #CUDA.rand(T, W, V4)',
+                        #CUDA.rand(T, V4, W)',
+                        DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
+                    )
+
+                u, s, vᴴ = @constinferred svd_full(t)
+                @test u * s * vᴴ ≈ t
+                @test isunitary(u)
+                @test isunitary(vᴴ)
+
+                u, s, vᴴ = @constinferred svd_compact(t)
+                @test u * s * vᴴ ≈ t
+                @test isisometric(u)
+                @test isposdef(s)
+                @test isisometric(vᴴ; side = :right)
+
+                s′ = @constinferred svd_vals(t)
+                @test parent(s′) ≈ parent(diagview(s))
+                @test s′ isa TensorKit.SectorVector
+
+                v, c = @constinferred left_orth(t; alg = :svd)
+                @test v * c ≈ t
+                @test isisometric(v)
+
+                c, vᴴ = @constinferred right_orth(t; alg = :svd)
+                @test c * vᴴ ≈ t
+                @test isisometric(vᴴ; side = :right)
+
+                N = @constinferred left_null(t; alg = :svd)
+                @test isisometric(N)
+                @test norm(N' * t) ≈ 0 atol = 100 * eps(norm(t))
+
+                #N = @constinferred left_null(t; trunc = (; atol = 100 * eps(norm(t))))
+                #@test isisometric(N)
+                #@test norm(N' * t) ≈ 0 atol = 100 * eps(norm(t))
+
+                Nᴴ = @constinferred right_null(t; alg = :svd)
+                @test isisometric(Nᴴ; side = :right)
+                @test norm(t * Nᴴ') ≈ 0 atol = 100 * eps(norm(t))
+
+                #Nᴴ = @constinferred right_null(t; trunc = (; atol = 100 * eps(norm(t))))
+                #@test isisometric(Nᴴ; side = :right)
+                #@test norm(t * Nᴴ') ≈ 0 atol = 100 * eps(norm(t))
+            end
+
+            # empty tensor
+            for T in eltypes, t in (
+                        CUDA.rand(T, W, zerospace(V1)),
+                        CUDA.rand(T, zerospace(V1), W),
+                    )
+                U, S, Vᴴ = @constinferred svd_full(t)
+                @test U * S * Vᴴ ≈ t
+                @test isunitary(U)
+                @test isunitary(Vᴴ)
+
+                U, S, Vᴴ = @constinferred svd_compact(t)
+                @test U * S * Vᴴ ≈ t
+                @test dim(U) == dim(S) == dim(Vᴴ) == dim(t) == 0
+            end
+        end
+
+        #=@testset "truncated SVD" begin
+            for T in eltypes,
+                    t in (
+                        CUDA.randn(T, W, W),
+                        #CUDA.randn(T, W, W)',
+                        CUDA.randn(T, W, V4),
+                        CUDA.randn(T, V4, W),
+                        #CUDA.randn(T, W, V4)',
+                        #CUDA.randn(T, V4, W)',
+                        DiagonalTensorMap(CUDA.randn(T, reduceddim(V1)), V1),
+                    )
+
+                @constinferred normalize!(t)
+
+                U, S, Vᴴ, ϵ = @constinferred svd_trunc(t; trunc = notrunc())
+                @test U * S * Vᴴ ≈ t
+                @test ϵ ≈ 0
+                @test isisometric(U)
+                @test isisometric(Vᴴ; side = :right)
+
+                # dimension of S is a float for IsingBimodule
+                nvals = round(Int, dim(domain(S)) / 2)
+                trunc = truncrank(nvals)
+                U1, S1, Vᴴ1, ϵ1 = @constinferred svd_trunc(t; trunc)
+                @test t * Vᴴ1' ≈ U1 * S1
+                @test isisometric(U1)
+                @test isisometric(Vᴴ1; side = :right)
+                @test norm(t - U1 * S1 * Vᴴ1) ≈ ϵ1 atol = eps(real(T))^(4 / 5)
+                @test dim(domain(S1)) <= nvals
+
+                λ = minimum(diagview(S1))
+                trunc = trunctol(; atol = λ - 10eps(λ))
+                U2, S2, Vᴴ2, ϵ2 = @constinferred svd_trunc(t; trunc)
+                @test t * Vᴴ2' ≈ U2 * S2
+                @test isisometric(U2)
+                @test isisometric(Vᴴ2; side = :right)
+                @test norm(t - U2 * S2 * Vᴴ2) ≈ ϵ2 atol = eps(real(T))^(4 / 5)
+                @test minimum(diagview(S1)) >= λ
+                @test U2 ≈ U1
+                @test S2 ≈ S1
+                @test Vᴴ2 ≈ Vᴴ1
+                @test ϵ1 ≈ ϵ2
+
+                trunc = truncspace(space(S2, 1))
+                U3, S3, Vᴴ3, ϵ3 = @constinferred svd_trunc(t; trunc)
+                @test t * Vᴴ3' ≈ U3 * S3
+                @test isisometric(U3)
+                @test isisometric(Vᴴ3; side = :right)
+                @test norm(t - U3 * S3 * Vᴴ3) ≈ ϵ3 atol = eps(real(T))^(4 / 5)
+                @test space(S3, 1) ≾ space(S2, 1)
+
+                for trunc in (truncerror(; atol = ϵ2), truncerror(; rtol = ϵ2 / norm(t)))
+                    U4, S4, Vᴴ4, ϵ4 = @constinferred svd_trunc(t; trunc)
+                    @test t * Vᴴ4' ≈ U4 * S4
+                    @test isisometric(U4)
+                    @test isisometric(Vᴴ4; side = :right)
+                    @test norm(t - U4 * S4 * Vᴴ4) ≈ ϵ4 atol = eps(real(T))^(4 / 5)
+                    @test ϵ4 ≤ ϵ2
+                end
+
+                trunc = truncrank(nvals) & trunctol(; atol = λ - 10eps(λ))
+                U5, S5, Vᴴ5, ϵ5 = @constinferred svd_trunc(t; trunc)
+                @test t * Vᴴ5' ≈ U5 * S5
+                @test isisometric(U5)
+                @test isisometric(Vᴴ5; side = :right)
+                @test norm(t - U5 * S5 * Vᴴ5) ≈ ϵ5 atol = eps(real(T))^(4 / 5)
+                @test minimum(diagview(S5)) >= λ
+                @test dim(domain(S5)) ≤ nvals
+            end
+        end=# # TODO
+
+        @testset "Eigenvalue decomposition" begin
+            for T in eltypes,
+                    t in (
+                        CUDA.rand(T, V1, V1),
+                        CUDA.rand(T, W, W),
+                        #CUDA.rand(T, W, W)',
+                        DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
+                    )
+
+                d, v = @constinferred eig_full(t)
+                @test t * v ≈ v * d
+
+                d′ = @constinferred eig_vals(t)
+                @test parent(d′) ≈ parent(diagview(d))
+                @test d′ isa TensorKit.SectorVector
+
+                vdv = project_hermitian!(v' * v)
+                @test @constinferred isposdef(vdv)
+                t isa DiagonalTensorMap || @test !isposdef(t) # unlikely for non-hermitian map
+
+                #=nvals = round(Int, dim(domain(t)) / 2)
+                d, v = @constinferred eig_trunc(t; trunc = truncrank(nvals))
+                @test t * v ≈ v * d
+                @test dim(domain(d)) ≤ nvals=#
+
+                t2 = @constinferred project_hermitian(t)
+                D, V = eigen(t2)
+                @test isisometric(V)
+                D̃, Ṽ = @constinferred eigh_full(t2)
+                @test D ≈ D̃
+                @test V ≈ Ṽ
+                λ = minimum(real, parent(diagview(D)))
+                @test cond(Ṽ) ≈ one(real(T))
+                @test isposdef(t2) == isposdef(λ)
+                @test isposdef(t2 - λ * one(t2) + 0.1 * one(t2))
+                @test !isposdef(t2 - λ * one(t2) - 0.1 * one(t2))
+
+                d, v = @constinferred eigh_full(t2)
+                @test t2 * v ≈ v * d
+                @test isunitary(v)
+
+                d′ = @constinferred eigh_vals(t2)
+                @test parent(d′) ≈ parent(diagview(d))
+                @test d′ isa TensorKit.SectorVector
+
+                λ = minimum(real, parent(diagview(d)))
+                @test cond(v) ≈ one(real(T))
+                @test isposdef(t2) == isposdef(λ)
+                @test isposdef(t2 - λ * one(t) + 0.1 * one(t2))
+                @test !isposdef(t2 - λ * one(t) - 0.1 * one(t2))
+
+                # TODO
+                #=d, v = @constinferred eigh_trunc(t2; trunc = truncrank(nvals))
+                @test t2 * v ≈ v * d
+                @test dim(domain(d)) ≤ nvals=#
+            end
+        end
+
+        @testset "Condition number and rank" begin
+            for T in eltypes,
+                    t in (
+                        CUDA.rand(T, W, W),
+                        #CUDA.rand(T, W, W)',
+                        CUDA.rand(T, W, V4),
+                        CUDA.rand(T, V4, W),
+                        #CUDA.rand(T, W, V4)',
+                        #CUDA.rand(T, V4, W)',
+                        DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
+                    )
+
+                d1, d2 = dim(codomain(t)), dim(domain(t))
+                r = rank(t)
+                @test r == min(d1, d2)
+                @test typeof(r) == typeof(d1)
+                M = left_null(t)
+                @test @constinferred(rank(M)) + r ≈ d1
+                Mᴴ = right_null(t)
+                @test rank(Mᴴ) + r ≈ d2
+            end
+            for T in eltypes
+                u = unitary(T, V1 ⊗ V2, V1 ⊗ V2)
+                @test @constinferred(cond(u)) ≈ one(real(T))
+                @test @constinferred(rank(u)) == dim(V1 ⊗ V2)
+
+                t = CUDA.rand(T, zerospace(V1), W)
+                @test rank(t) == 0
+                t2 = CUDA.rand(T, zerospace(V1) * zerospace(V2), zerospace(V1) * zerospace(V2))
+                @test rank(t2) == 0
+                @test cond(t2) == 0.0
+            end
+            for T in eltypes, t in (
+                        CUDA.rand(T, W, W),
+                        #CUDA.rand(T, W, W)',
+                    )
+                project_hermitian!(t)
+                vals = @constinferred LinearAlgebra.eigvals(t)
+                λmax = maximum(s -> maximum(abs, s), values(vals))
+                λmin = minimum(s -> minimum(abs, s), values(vals))
+                @test cond(t) ≈ λmax / λmin
+            end
+        end
+
+        @testset "Hermitian projections" begin
+            for T in eltypes,
+                    t in (
+                        CUDA.rand(T, V1, V1),
+                        CUDA.rand(T, W, W),
+                        #CUDA.rand(T, W, W)',
+                        DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
+                    )
+                normalize!(t)
+                noisefactor = eps(real(T))^(3 / 4)
+
+                th = (t + t') / 2
+                ta = (t - t') / 2
+                tc = copy(t)
+
+                th′ = @constinferred project_hermitian(t)
+                @test ishermitian(th′)
+                @test th′ ≈ th
+                @test t == tc
+                th_approx = th + noisefactor * ta
+                @test !ishermitian(th_approx) || (T <: Real && t isa DiagonalTensorMap)
+                @test ishermitian(th_approx; atol = 10 * noisefactor)
+
+                ta′ = project_antihermitian(t)
+                @test isantihermitian(ta′)
+                @test ta′ ≈ ta
+                @test t == tc
+                ta_approx = ta + noisefactor * th
+                @test !isantihermitian(ta_approx)
+                @test isantihermitian(ta_approx; atol = 10 * noisefactor) || (T <: Real && t isa DiagonalTensorMap)
+            end
+        end
+
+        @testset "Isometric projections" begin
+            for T in eltypes,
+                    t in (
+                        CUDA.randn(T, W, W),
+                        #CUDA.randn(T, W, W)',
+                        CUDA.randn(T, W, V4),
+                        #CUDA.randn(T, V4, W)',
+                    )
+                t2 = project_isometric(t)
+                @test isisometric(t2)
+                t3 = project_isometric(t2)
+                @test t3 ≈ t2 # stability of the projection
+                @test t2 * (t2' * t) ≈ t
+
+                tc = similar(t)
+                t3 = @constinferred project_isometric!(copy!(tc, t), t2)
+                @test t3 === t2
+                @test isisometric(t2)
+
+                # test that t2 is closer to A then any other isometry
+                for k in 1:10
+                    δt = CUDA.randn!(similar(t))
+                    t3 = project_isometric(t + δt / 100)
+                    @test norm(t - t3) > norm(t - t2)
+                end
+            end
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 9513378db..3b0bfe8b0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -50,6 +50,7 @@ istestfile(fn) = endswith(fn, ".jl") && !contains(fn, "setup")
         using CUDA
         CUDA.functional() || continue
         @time include("cuda/tensors.jl")
+        @time include("cuda/factorizations.jl")
     elseif is_buildkite
         continue
     end

From ed8f52b7a05734e0f4bc7f0ab945351abdc8857d Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Fri, 2 Jan 2026 13:56:36 -0500
Subject: [PATCH 2/7] Working adjoint

---
 ext/TensorKitCUDAExt/cutensormap.jl |  8 ++++++
 src/factorizations/adjoint.jl       |  3 +++
 test/cuda/factorizations.jl         | 42 ++++++++++++++---------------
 3 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
index b4aa7f7a2..28927687b 100644
--- a/ext/TensorKitCUDAExt/cutensormap.jl
+++ b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -164,3 +164,11 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth)
         return tf
     end
 end
+
+function Base.copy!(tdst::CuTensorMap, tsrc::AdjointTensorMap)
+    space(tdst) == space(tsrc) || throw(SpaceMismatch("$(space(tdst)) ≠ $(space(tsrc))"))
+    for ((c, bdst), (_, bsrc)) in zip(blocks(tdst), blocks(tsrc))
+        copy!(bdst, bsrc)
+    end
+    return tdst
+end
diff --git a/src/factorizations/adjoint.jl b/src/factorizations/adjoint.jl
index 29f27203c..8a75fc2fd 100644
--- a/src/factorizations/adjoint.jl
+++ b/src/factorizations/adjoint.jl
@@ -9,6 +9,9 @@ _adjoint(alg::MAK.LAPACK_HouseholderRQ) = MAK.LAPACK_HouseholderQL(; alg.kwargs.
 _adjoint(alg::MAK.PolarViaSVD) = MAK.PolarViaSVD(_adjoint(alg.svd_alg))
 _adjoint(alg::AbstractAlgorithm) = alg
 
+_adjoint(alg::MAK.CUSOLVER_HouseholderQR) = MAK.LQViaTransposedQR(alg)
+_adjoint(alg::MAK.LQViaTransposedQR) = alg.qr_alg
+
 for f in
     [
         :svd_compact, :svd_full, :svd_vals,
diff --git a/test/cuda/factorizations.jl b/test/cuda/factorizations.jl
index d44bb878a..8e03368ab 100644
--- a/test/cuda/factorizations.jl
+++ b/test/cuda/factorizations.jl
@@ -49,9 +49,9 @@ for V in spacelist
             for T in eltypes,
                     t in (
                         CUDA.rand(T, W, W),
-                        #CUDA.rand(T, W, W)',
+                        CUDA.rand(T, W, W)',
                         CUDA.rand(T, W, V4),
-                        #CUDA.rand(T, V4, W)',
+                        CUDA.rand(T, V4, W)',
                         DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
                     )
 
@@ -105,9 +105,9 @@ for V in spacelist
             for T in eltypes,
                     t in (
                         CUDA.rand(T, W, W),
-                        #CUDA.rand(T, W, W)',
+                        CUDA.rand(T, W, W)',
                         CUDA.rand(T, W, V4),
-                        #CUDA.rand(T, V4, W)',
+                        CUDA.rand(T, V4, W)',
                         DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
                     )
 
@@ -157,9 +157,9 @@ for V in spacelist
             for T in eltypes,
                     t in (
                         CUDA.rand(T, W, W),
-                        #CUDA.rand(T, W, W)',
+                        CUDA.rand(T, W, W)',
                         CUDA.rand(T, W, V4),
-                        #CUDA.rand(T, V4, W)',
+                        CUDA.rand(T, V4, W)',
                         DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
                     )
 
@@ -167,7 +167,7 @@ for V in spacelist
                 w, p = @constinferred left_polar(t)
                 @test w * p ≈ t
                 @test isisometric(w)
-                @test isposdef(p)
+                @test isposdef(project_hermitian!(p))
 
                 w, p = @constinferred left_orth(t; alg = :polar)
                 @test w * p ≈ t
@@ -177,16 +177,16 @@ for V in spacelist
             for T in eltypes,
                     t in (
                         CUDA.rand(T, W, W),
-                        #CUDA.rand(T, W, W)',
+                        CUDA.rand(T, W, W)',
                         CUDA.rand(T, V4, W),
-                        #CUDA.rand(T, W, V4)',
+                        CUDA.rand(T, W, V4)',
                     )
 
                 @assert codomain(t) ≾ domain(t)
                 p, wᴴ = @constinferred right_polar(t)
                 @test p * wᴴ ≈ t
                 @test isisometric(wᴴ; side = :right)
-                @test isposdef(p)
+                @test isposdef(project_hermitian!(p))
 
                 p, wᴴ = @constinferred right_orth(t; alg = :polar)
                 @test p * wᴴ ≈ t
@@ -198,11 +198,11 @@ for V in spacelist
             for T in eltypes,
                     t in (
                         CUDA.rand(T, W, W),
-                        #CUDA.rand(T, W, W)',
+                        CUDA.rand(T, W, W)',
                         CUDA.rand(T, W, V4),
                         CUDA.rand(T, V4, W),
-                        #CUDA.rand(T, W, V4)',
-                        #CUDA.rand(T, V4, W)',
+                        CUDA.rand(T, W, V4)',
+                        CUDA.rand(T, V4, W)',
                         DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
                     )
 
@@ -338,7 +338,7 @@ for V in spacelist
                     t in (
                         CUDA.rand(T, V1, V1),
                         CUDA.rand(T, W, W),
-                        #CUDA.rand(T, W, W)',
+                        CUDA.rand(T, W, W)',
                         DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
                     )
 
@@ -395,11 +395,11 @@ for V in spacelist
             for T in eltypes,
                     t in (
                         CUDA.rand(T, W, W),
-                        #CUDA.rand(T, W, W)',
+                        CUDA.rand(T, W, W)',
                         CUDA.rand(T, W, V4),
                         CUDA.rand(T, V4, W),
-                        #CUDA.rand(T, W, V4)',
-                        #CUDA.rand(T, V4, W)',
+                        CUDA.rand(T, W, V4)',
+                        CUDA.rand(T, V4, W)',
                         DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
                     )
 
@@ -425,7 +425,7 @@ for V in spacelist
             end
             for T in eltypes, t in (
                         CUDA.rand(T, W, W),
-                        #CUDA.rand(T, W, W)',
+                        CUDA.rand(T, W, W)',
                     )
                 project_hermitian!(t)
                 vals = @constinferred LinearAlgebra.eigvals(t)
@@ -440,7 +440,7 @@ for V in spacelist
                     t in (
                         CUDA.rand(T, V1, V1),
                         CUDA.rand(T, W, W),
-                        #CUDA.rand(T, W, W)',
+                        CUDA.rand(T, W, W)',
                         DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
                     )
                 normalize!(t)
@@ -472,9 +472,9 @@ for V in spacelist
             for T in eltypes,
                     t in (
                         CUDA.randn(T, W, W),
-                        #CUDA.randn(T, W, W)',
+                        CUDA.randn(T, W, W)',
                         CUDA.randn(T, W, V4),
-                        #CUDA.randn(T, V4, W)',
+                        CUDA.randn(T, V4, W)',
                     )
                 t2 = project_isometric(t)
                 @test isisometric(t2)

From 54259b15abf9474e10d70a5f189fb2ca7e2a33e3 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <kshyatt@users.noreply.github.com>
Date: Mon, 5 Jan 2026 07:13:33 +0100
Subject: [PATCH 3/7] Update test/cuda/factorizations.jl

Co-authored-by: Lukas Devos <ldevos98@gmail.com>
---
 test/cuda/factorizations.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/cuda/factorizations.jl b/test/cuda/factorizations.jl
index 8e03368ab..9155e5212 100644
--- a/test/cuda/factorizations.jl
+++ b/test/cuda/factorizations.jl
@@ -429,8 +429,8 @@ for V in spacelist
                     )
                 project_hermitian!(t)
                 vals = @constinferred LinearAlgebra.eigvals(t)
-                λmax = maximum(s -> maximum(abs, s), values(vals))
-                λmin = minimum(s -> minimum(abs, s), values(vals))
+                λmax = maximum(abs, parent(vals))
+                λmin = minimum(abs, parent(vals))
                 @test cond(t) ≈ λmax / λmin
             end
         end

From 07a89b5867d7d75cfe79c9b66e34cef519376921 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <kshyatt@users.noreply.github.com>
Date: Mon, 5 Jan 2026 07:13:47 +0100
Subject: [PATCH 4/7] Update test/cuda/factorizations.jl

Co-authored-by: Jutho <Jutho@users.noreply.github.com>
---
 test/cuda/factorizations.jl | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/test/cuda/factorizations.jl b/test/cuda/factorizations.jl
index 9155e5212..aa7936bd4 100644
--- a/test/cuda/factorizations.jl
+++ b/test/cuda/factorizations.jl
@@ -14,20 +14,16 @@ using CUDA: rand as curand, rand! as curand!, randn as curandn, randn! as curand
 @isdefined(TestSetup) || include("../setup.jl")
 using .TestSetup
 
-spacelist = try
-    if ENV["CI"] == "true"
-        println("Detected running on CI")
-        if Sys.iswindows()
-            (Vtr, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VIB_diag)
-        elseif Sys.isapple()
-            (Vtr, Vℤ₃, VfU₁, VfSU₂, VIB_M)
-        else
-            (Vtr, VU₁, VCU₁, VSU₂, VfSU₂, VIB_diag, VIB_M)
-        end
+spacelist = if get(ENV, "CI", "false") == "true"
+    println("Detected running on CI")
+    if Sys.iswindows()
+        (Vtr, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VIB_diag)
+    elseif Sys.isapple()
+        (Vtr, Vℤ₃, VfU₁, VfSU₂, VIB_M)
     else
-        (Vtr, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂, VIB_diag, VIB_M)
+        (Vtr, VU₁, VCU₁, VSU₂, VfSU₂, VIB_diag, VIB_M)
     end
-catch
+else
     (Vtr, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂, VIB_diag, VIB_M)
 end
 

From 02266d80e45a180657729636d3c1fc097c9ab047 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Mon, 5 Jan 2026 04:31:23 -0500
Subject: [PATCH 5/7] Fix posdef and copy

---
 ext/TensorKitCUDAExt/cutensormap.jl | 14 +++++---------
 src/tensors/linalg.jl               |  4 +++-
 test/cuda/factorizations.jl         |  8 ++++----
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
index 28927687b..09032c3b0 100644
--- a/ext/TensorKitCUDAExt/cutensormap.jl
+++ b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -121,11 +121,15 @@ function LinearAlgebra.isposdef(t::CuTensorMap)
         # do our own hermitian check
         isherm = MatrixAlgebraKit.ishermitian(b; atol = eps(real(eltype(b))), rtol = eps(real(eltype(b))))
         isherm || return false
-        isposdef(Hermitian(b)) || return false
+        isposdef(project_hermitian!(b)) || return false
     end
     return true
 end
 
+function LinearAlgebra.isposdef(t::TensorKit.AdjointTensorMap{T, S, N₁, N₂, <:CuTensorMap}) where {T, S, N₁, N₂}
+    return isposdef(adjoint(t))
+end
+
 function Base.promote_rule(
         ::Type{<:TT₁},
         ::Type{<:TT₂}
@@ -164,11 +168,3 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth)
         return tf
     end
 end
-
-function Base.copy!(tdst::CuTensorMap, tsrc::AdjointTensorMap)
-    space(tdst) == space(tsrc) || throw(SpaceMismatch("$(space(tdst)) ≠ $(space(tsrc))"))
-    for ((c, bdst), (_, bsrc)) in zip(blocks(tdst), blocks(tsrc))
-        copy!(bdst, bsrc)
-    end
-    return tdst
-end
diff --git a/src/tensors/linalg.jl b/src/tensors/linalg.jl
index 4b2c7d4ee..f37d2d0e9 100644
--- a/src/tensors/linalg.jl
+++ b/src/tensors/linalg.jl
@@ -197,13 +197,15 @@ LinearAlgebra.isdiag(t::AbstractTensorMap) = all(LinearAlgebra.isdiag ∘ last,
 # In-place methods
 #------------------
 # Wrapping the blocks in a StridedView enables multithreading if JULIA_NUM_THREADS > 1
+# but can cause problems with underlying array types (like CuArray) that don't yet play
+# nicely with StridedViews
 # TODO: reconsider this strategy, consider spawning different threads for different blocks
 
 # Copy, adjoint and fill:
 function Base.copy!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap)
     space(tdst) == space(tsrc) || throw(SpaceMismatch("$(space(tdst)) ≠ $(space(tsrc))"))
     for ((c, bdst), (_, bsrc)) in zip(blocks(tdst), blocks(tsrc))
-        copy!(StridedView(bdst), StridedView(bsrc))
+        copy!(bdst, bsrc)
     end
     return tdst
 end
diff --git a/test/cuda/factorizations.jl b/test/cuda/factorizations.jl
index aa7936bd4..f7b6ad6d6 100644
--- a/test/cuda/factorizations.jl
+++ b/test/cuda/factorizations.jl
@@ -150,7 +150,7 @@ for V in spacelist
         end
 
         @testset "Polar decomposition" begin
-            for T in eltypes,
+            @testset for T in eltypes,
                     t in (
                         CUDA.rand(T, W, W),
                         CUDA.rand(T, W, W)',
@@ -163,14 +163,14 @@ for V in spacelist
                 w, p = @constinferred left_polar(t)
                 @test w * p ≈ t
                 @test isisometric(w)
-                @test isposdef(project_hermitian!(p))
+                @test isposdef(p)
 
                 w, p = @constinferred left_orth(t; alg = :polar)
                 @test w * p ≈ t
                 @test isisometric(w)
             end
 
-            for T in eltypes,
+            @testset for T in eltypes,
                     t in (
                         CUDA.rand(T, W, W),
                         CUDA.rand(T, W, W)',
@@ -182,7 +182,7 @@ for V in spacelist
                 p, wᴴ = @constinferred right_polar(t)
                 @test p * wᴴ ≈ t
                 @test isisometric(wᴴ; side = :right)
-                @test isposdef(project_hermitian!(p))
+                @test isposdef(p)
 
                 p, wᴴ = @constinferred right_orth(t; alg = :polar)
                 @test p * wᴴ ≈ t

From 788fed186c61052c0f890fd5cfa34db4d164e29d Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Mon, 5 Jan 2026 09:57:54 -0500
Subject: [PATCH 6/7] Comments

---
 ext/TensorKitCUDAExt/cutensormap.jl | 8 ++------
 src/factorizations/adjoint.jl       | 4 ++++
 src/tensors/linalg.jl               | 4 ----
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
index 09032c3b0..3274e654a 100644
--- a/ext/TensorKitCUDAExt/cutensormap.jl
+++ b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -119,17 +119,13 @@ function LinearAlgebra.isposdef(t::CuTensorMap)
     InnerProductStyle(spacetype(t)) === EuclideanInnerProduct() || return false
     for (c, b) in blocks(t)
         # do our own hermitian check
-        isherm = MatrixAlgebraKit.ishermitian(b; atol = eps(real(eltype(b))), rtol = eps(real(eltype(b))))
+        isherm = MatrixAlgebraKit.ishermitian(b)
         isherm || return false
-        isposdef(project_hermitian!(b)) || return false
+        isposdef(Hermitian(b)) || return false
     end
     return true
 end
 
-function LinearAlgebra.isposdef(t::TensorKit.AdjointTensorMap{T, S, N₁, N₂, <:CuTensorMap}) where {T, S, N₁, N₂}
-    return isposdef(adjoint(t))
-end
-
 function Base.promote_rule(
         ::Type{<:TT₁},
         ::Type{<:TT₂}
diff --git a/src/factorizations/adjoint.jl b/src/factorizations/adjoint.jl
index 8a75fc2fd..20d6d5986 100644
--- a/src/factorizations/adjoint.jl
+++ b/src/factorizations/adjoint.jl
@@ -111,3 +111,7 @@ function MAK.svd_compact!(t::AdjointTensorMap, F, alg::DiagonalAlgorithm)
     F′ = svd_compact!(adjoint(t), reverse(adjoint.(F)), _adjoint(alg))
     return reverse(adjoint.(F′))
 end
+
+function LinearAlgebra.isposdef(t::AdjointTensorMap)
+    return isposdef(adjoint(t))
+end
diff --git a/src/tensors/linalg.jl b/src/tensors/linalg.jl
index f37d2d0e9..38c45c1f9 100644
--- a/src/tensors/linalg.jl
+++ b/src/tensors/linalg.jl
@@ -196,10 +196,6 @@ LinearAlgebra.isdiag(t::AbstractTensorMap) = all(LinearAlgebra.isdiag ∘ last,
 
 # In-place methods
 #------------------
-# Wrapping the blocks in a StridedView enables multithreading if JULIA_NUM_THREADS > 1
-# but can cause problems with underlying array types (like CuArray) that don't yet play
-# nicely with StridedViews
-# TODO: reconsider this strategy, consider spawning different threads for different blocks
 
 # Copy, adjoint and fill:
 function Base.copy!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap)

From cf0418eee19340aba868fba47c553aeac66f437e Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Wed, 7 Jan 2026 10:24:45 -0500
Subject: [PATCH 7/7] Try using truncation on CUDA

---
 Project.toml                |  4 ++++
 test/cuda/factorizations.jl | 31 +++++++++++++++----------------
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/Project.toml b/Project.toml
index 8ca217db8..02195559d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -79,3 +79,7 @@ cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [targets]
 test = ["ArgParse", "Adapt", "Aqua", "Combinatorics", "CUDA", "cuTENSOR", "GPUArrays", "LinearAlgebra", "SafeTestsets", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"]
+
+[sources]
+GPUArrays = {rev = "master", url = "https://github.com/JuliaGPU/GPUArrays.jl"}
+MatrixAlgebraKit = {rev = "main", url = "https://github.com/QuantumKitHub/MatrixAlgebraKit.jl"}
diff --git a/test/cuda/factorizations.jl b/test/cuda/factorizations.jl
index f7b6ad6d6..2f8cf876e 100644
--- a/test/cuda/factorizations.jl
+++ b/test/cuda/factorizations.jl
@@ -1,4 +1,4 @@
-using Adapt, CUDA, cuTENSOR
+using Adapt, GPUArrays, CUDA, cuTENSOR
 using Test, TestExtras
 using TensorKit
 using LinearAlgebra: LinearAlgebra
@@ -229,17 +229,17 @@ for V in spacelist
                 @test isisometric(N)
                 @test norm(N' * t) ≈ 0 atol = 100 * eps(norm(t))
 
-                #N = @constinferred left_null(t; trunc = (; atol = 100 * eps(norm(t))))
-                #@test isisometric(N)
-                #@test norm(N' * t) ≈ 0 atol = 100 * eps(norm(t))
+                #=N = @constinferred left_null(t; trunc = (; atol = 100 * eps(norm(t))))
+                @test isisometric(N)
+                @test norm(N' * t) ≈ 0 atol = 100 * eps(norm(t))=#
 
                 Nᴴ = @constinferred right_null(t; alg = :svd)
                 @test isisometric(Nᴴ; side = :right)
                 @test norm(t * Nᴴ') ≈ 0 atol = 100 * eps(norm(t))
 
-                #Nᴴ = @constinferred right_null(t; trunc = (; atol = 100 * eps(norm(t))))
-                #@test isisometric(Nᴴ; side = :right)
-                #@test norm(t * Nᴴ') ≈ 0 atol = 100 * eps(norm(t))
+                #=Nᴴ = @constinferred right_null(t; trunc = (; atol = 100 * eps(norm(t))))
+                @test isisometric(Nᴴ; side = :right)
+                @test norm(t * Nᴴ') ≈ 0 atol = 100 * eps(norm(t))=#
             end
 
             # empty tensor
@@ -262,11 +262,11 @@ for V in spacelist
             for T in eltypes,
                     t in (
                         CUDA.randn(T, W, W),
-                        #CUDA.randn(T, W, W)',
+                        CUDA.randn(T, W, W)',
                         CUDA.randn(T, W, V4),
                         CUDA.randn(T, V4, W),
-                        #CUDA.randn(T, W, V4)',
-                        #CUDA.randn(T, V4, W)',
+                        CUDA.randn(T, W, V4)',
+                        CUDA.randn(T, V4, W)',
                         DiagonalTensorMap(CUDA.randn(T, reduceddim(V1)), V1),
                     )
 
@@ -327,7 +327,7 @@ for V in spacelist
                 @test minimum(diagview(S5)) >= λ
                 @test dim(domain(S5)) ≤ nvals
             end
-        end=# # TODO
+        end=#
 
         @testset "Eigenvalue decomposition" begin
             for T in eltypes,
@@ -349,10 +349,10 @@ for V in spacelist
                 @test @constinferred isposdef(vdv)
                 t isa DiagonalTensorMap || @test !isposdef(t) # unlikely for non-hermitian map
 
-                #=nvals = round(Int, dim(domain(t)) / 2)
+                nvals = round(Int, dim(domain(t)) / 2)
                 d, v = @constinferred eig_trunc(t; trunc = truncrank(nvals))
                 @test t * v ≈ v * d
-                @test dim(domain(d)) ≤ nvals=#
+                @test dim(domain(d)) ≤ nvals
 
                 t2 = @constinferred project_hermitian(t)
                 D, V = eigen(t2)
@@ -380,10 +380,9 @@ for V in spacelist
                 @test isposdef(t2 - λ * one(t) + 0.1 * one(t2))
                 @test !isposdef(t2 - λ * one(t) - 0.1 * one(t2))
 
-                # TODO
-                #=d, v = @constinferred eigh_trunc(t2; trunc = truncrank(nvals))
+                d, v = @constinferred eigh_trunc(t2; trunc = truncrank(nvals))
                 @test t2 * v ≈ v * d
-                @test dim(domain(d)) ≤ nvals=#
+                @test dim(domain(d)) ≤ nvals
             end
         end