diff --git a/Project.toml b/Project.toml
index 607558519..8defa8104 100644
--- a/Project.toml
+++ b/Project.toml
@@ -89,3 +89,6 @@ cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [targets]
 test = ["ArgParse", "Adapt", "Aqua", "AllocCheck", "Combinatorics", "CUDA", "cuTENSOR", "GPUArrays", "JET", "LinearAlgebra", "SafeTestsets", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote", "Mooncake"]
+
+[sources]
+Strided = {url = "https://github.com/QuantumKitHub/Strided.jl/", rev = "ksh/copyto"}
diff --git a/ext/TensorKitAdaptExt.jl b/ext/TensorKitAdaptExt.jl
index 30c223820..9539822a1 100644
--- a/ext/TensorKitAdaptExt.jl
+++ b/ext/TensorKitAdaptExt.jl
@@ -15,8 +15,11 @@ function Adapt.adapt_structure(to, x::DiagonalTensorMap)
     data′ = adapt(to, x.data)
     return DiagonalTensorMap(data′, x.domain)
 end
-function Adapt.adapt_structure(::Type{TorA}, x::BraidingTensor) where {TorA <: Union{Number, DenseArray{<:Number}}}
-    return BraidingTensor{scalartype(TorA)}(space(x), x.adjoint)
+function Adapt.adapt_structure(::Type{T}, x::BraidingTensor{T′, S, A}) where {T <: Number, T′, S, A}
+    return BraidingTensor(space(x), TensorKit.similarstoragetype(A, T), x.adjoint)
+end
+function Adapt.adapt_structure(::Type{TA}, x::BraidingTensor{T, S, A}) where {TA <: DenseArray{<:Number}, T, S, A}
+    return BraidingTensor(space(x), TA, x.adjoint)
 end
 
 end
diff --git a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
index f5efb98bb..43bca1b52 100644
--- a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
+++ b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
@@ -3,14 +3,18 @@ module TensorKitCUDAExt
 using CUDA, CUDA.CUBLAS, CUDA.CUSOLVER, LinearAlgebra
 using CUDA: @allowscalar
 using cuTENSOR: cuTENSOR
+using Strided: StridedViews
 import CUDA: rand as curand, rand! as curand!, randn as curandn, randn! as curandn!
 
+using CUDA: KernelAbstractions
+using CUDA.KernelAbstractions: @kernel, @index
+
 using TensorKit
 using TensorKit.Factorizations
 using TensorKit.Strided
 using TensorKit.Factorizations: AbstractAlgorithm
 using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap, scalartype, project_symmetric_and_check
-import TensorKit: randisometry, rand, randn
+import TensorKit: randisometry, rand, randn, similarmatrixtype, _set_subblock!
 
 using TensorKit: MatrixAlgebraKit
 
@@ -19,4 +23,18 @@ using Random
 include("cutensormap.jl")
 include("truncation.jl")
 
+TensorKit.similarmatrixtype(::Type{A}) where {T <: Number, M, A <: CuVector{T, M}} = CuMatrix{T, M}
+
+function TensorKit._set_subblock!(data::TD, val) where {T, TD <: Union{<:CuMatrix{T}, <:StridedViews.StridedView{T, 4, <:CuArray{T}}}}
+    @kernel function fill_subblock_kernel!(subblock, val)
+        idx = @index(Global, Cartesian)
+        @inbounds subblock[idx[1], idx[2], idx[2], idx[1]] = val
+    end
+    kernel = fill_subblock_kernel!(KernelAbstractions.get_backend(data))
+    d1 = size(data, 1)
+    d2 = size(data, 2)
+    kernel(data, val; ndrange = (d1, d2))
+    return data
+end
+
 end
diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
index f065c2ec1..41928fed7 100644
--- a/ext/TensorKitCUDAExt/cutensormap.jl
+++ b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -168,3 +168,23 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth)
         return tf
     end
 end
+
+
+function TensorKit.add_kernel_nonthreaded!(
+        ::TensorKit.FusionStyle,
+        tdst::CuTensorMap, tsrc::CuTensorMap, p, transformer::TensorKit.GenericTreeTransformer, α, β, backend...
+    )
+    # preallocate buffers
+    buffers = TensorKit.allocate_buffers(tdst, tsrc, transformer)
+
+    for subtransformer in transformer.data
+        # Special case without intermediate buffers whenever there is only a single block
+        if length(subtransformer[1]) == 1
+            TensorKit._add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...)
+        else
+            cu_subtransformer = tuple(CUDA.adapt(CuArray, subtransformer[1]), subtransformer[2:end]...)
+            TensorKit._add_transform_multi!(tdst, tsrc, p, cu_subtransformer, buffers, α, β, backend...)
+        end
+    end
+    return nothing
+end
diff --git a/src/tensors/abstracttensor.jl b/src/tensors/abstracttensor.jl
index d7d520b43..ecf87c631 100644
--- a/src/tensors/abstracttensor.jl
+++ b/src/tensors/abstracttensor.jl
@@ -122,6 +122,19 @@ similarstoragetype(::Type{D}, ::Type{T}) where {D <: AbstractDict{<:Sector, <:Ab
 # default storage type for numbers
 similarstoragetype(::Type{T}) where {T <: Number} = Vector{T}
 
+@doc """
+    similarmatrixtype(T::Type{<:Number}) -> Matrix{T}
+    similarmatrixtype(A::Type{T, <:DenseVector{T}}) -> Matrix{T}
+
+For a given dense vector type `A` or number type `T`, compute an appropriate
+**matrix** storage type for tensors. This function is used internally for
+[`BraidingTensor`](@ref) to determine the output storage format for indexing
+and other operations with other tensor types.
+""" similarmatrixtype
+
+similarmatrixtype(::Type{T}) where {T <: Number} = Matrix{T}
+similarmatrixtype(::Type{A}) where {T <: Number, A <: DenseVector{T}} = Matrix{T}
+
 @doc """
     promote_storagetype([T], A, B, C...)
     promote_storagetype([T], TA, TB, TC...)
@@ -607,10 +620,10 @@ Base.similar(t::AbstractTensorMap, ::Type{T}, codomain::TensorSpace) where {T} =
 # 2 arguments
 Base.similar(t::AbstractTensorMap, codomain::TensorSpace) =
     similar(t, codomain ← one(codomain))
-Base.similar(t::AbstractTensorMap, V::TensorMapSpace) = similar(t, scalartype(t), V)
+Base.similar(t::AbstractTensorMap, V::TensorMapSpace) = similar(t, storagetype(t), V)
 Base.similar(t::AbstractTensorMap, ::Type{T}) where {T} = similar(t, T, space(t))
 # 1 argument
-Base.similar(t::AbstractTensorMap) = similar(t, scalartype(t), space(t))
+Base.similar(t::AbstractTensorMap) = similar(t, storagetype(t), space(t))
 
 # generic implementation for AbstractTensorMap -> returns `TensorMap`
 function Base.similar(t::AbstractTensorMap, ::Type{TorA}, V::TensorMapSpace) where {TorA}
diff --git a/src/tensors/braidingtensor.jl b/src/tensors/braidingtensor.jl
index 3ff8a9abf..31b954bb8 100644
--- a/src/tensors/braidingtensor.jl
+++ b/src/tensors/braidingtensor.jl
@@ -2,72 +2,89 @@
 # special (2,2) tensor that implements a standard braiding operation
 #====================================================================#
 """
-    struct BraidingTensor{T,S<:IndexSpace} <: AbstractTensorMap{T, S, 2, 2}
-    BraidingTensor(V1::S, V2::S, adjoint::Bool=false) where {S<:IndexSpace}
+    struct BraidingTensor{T,S<:IndexSpace,A<:DenseVector{T}} <: AbstractTensorMap{T, S, 2, 2}
+    BraidingTensor(V1::S, V2::S, ::Type{A}, adjoint::Bool=false) where {S<:IndexSpace, A <: DenseVector{<:Number}}
 
 Specific subtype of [`AbstractTensorMap`](@ref) for representing the braiding tensor that
 braids the first input over the second input; its inverse can be obtained as the adjoint.
 
 It holds that `domain(BraidingTensor(V1, V2)) == V1 ⊗ V2` and
-`codomain(BraidingTensor(V1, V2)) == V2 ⊗ V1`.
+`codomain(BraidingTensor(V1, V2)) == V2 ⊗ V1`. The storage type `TA`
+controls the array type of the braiding tensor used when indexing
+and multiplying with other tensors.
 """
-struct BraidingTensor{T, S} <: AbstractTensorMap{T, S, 2, 2}
+struct BraidingTensor{T, S, A} <: AbstractTensorMap{T, S, 2, 2}
     V1::S
     V2::S
     adjoint::Bool
-    function BraidingTensor{T, S}(V1::S, V2::S, adjoint::Bool = false) where {T, S <: IndexSpace}
-        for a in sectors(V1)
-            for b in sectors(V2)
-                for c in (a ⊗ b)
-                    Nsymbol(a, b, c) == Nsymbol(b, a, c) ||
-                        throw(ArgumentError("Cannot define a braiding between $a and $b"))
-                end
-            end
+    function BraidingTensor{T, S, A}(V1::S, V2::S, ::Type{A}, adjoint::Bool = false) where {T, S <: IndexSpace, A <: DenseVector{T}}
+        for a in sectors(V1), b in sectors(V2), c in (a ⊗ b)
+            Nsymbol(a, b, c) == Nsymbol(b, a, c) ||
+                throw(ArgumentError("Cannot define a braiding between $a and $b"))
         end
-        return new{T, S}(V1, V2, adjoint)
+        return new{T, S, A}(V1, V2, adjoint)
         # partial construction: only construct rowr and colr when needed
     end
 end
+function BraidingTensor{T, S}(V1::S, V2::S, ::Type{A}, adjoint::Bool = false) where {T, S <: IndexSpace, A}
+    return BraidingTensor{T, S, A}(V1, V2, A, adjoint)
+end
+function BraidingTensor{T}(V1::S, V2::S, A, adjoint::Bool = false) where {T, S <: IndexSpace}
+    return BraidingTensor{T, S}(V1, V2, A, adjoint)
+end
 function BraidingTensor{T}(V1::S, V2::S, adjoint::Bool = false) where {T, S <: IndexSpace}
-    return BraidingTensor{T, S}(V1, V2, adjoint)
+    return BraidingTensor{T, S}(V1, V2, Vector{T}, adjoint)
+end
+function BraidingTensor{T}(V1::IndexSpace, V2::IndexSpace, A, adjoint::Bool = false) where {T}
+    return BraidingTensor{T}(promote(V1, V2)..., A, adjoint)
 end
 function BraidingTensor{T}(V1::IndexSpace, V2::IndexSpace, adjoint::Bool = false) where {T}
-    return BraidingTensor{T}(promote(V1, V2)..., adjoint)
+    return BraidingTensor{T}(V1, V2, Vector{T}, adjoint)
+end
+function BraidingTensor(V1::IndexSpace, V2::IndexSpace, ::Type{A}, adjoint::Bool = false) where {T, A <: DenseVector{T}}
+    return BraidingTensor{T}(promote(V1, V2)..., A, adjoint)
+end
+function BraidingTensor(V1::IndexSpace, V2::IndexSpace, ::Type{T}, adjoint::Bool = false) where {T}
+    return BraidingTensor{T}(promote(V1, V2)..., Vector{T}, adjoint)
 end
 function BraidingTensor(V1::IndexSpace, V2::IndexSpace, adjoint::Bool = false)
     return BraidingTensor(promote(V1, V2)..., adjoint)
 end
 function BraidingTensor(V1::S, V2::S, adjoint::Bool = false) where {S <: IndexSpace}
     T = BraidingStyle(sectortype(S)) isa SymmetricBraiding ? Float64 : ComplexF64
-    return BraidingTensor{T, S}(V1, V2, adjoint)
+    return BraidingTensor{T, S}(V1, V2, Vector{T}, adjoint)
+end
+function BraidingTensor(V1::S, V2::S, ::Type{A}, adjoint::Bool = false) where {S <: IndexSpace, A <: AbstractArray}
+    T = BraidingStyle(sectortype(S)) isa SymmetricBraiding ? Float64 : ComplexF64
+    A′ = similarstoragetype(A, T)
+    return BraidingTensor{T, S}(V1, V2, A′, adjoint)
 end
 function BraidingTensor(V::HomSpace, adjoint::Bool = false)
     domain(V) == reverse(codomain(V)) ||
         throw(SpaceMismatch("Cannot define a braiding on $V"))
     return BraidingTensor(V[2], V[1], adjoint)
 end
+function BraidingTensor(V::HomSpace, ::Type{A}, adjoint::Bool = false) where {A}
+    domain(V) == reverse(codomain(V)) ||
+        throw(SpaceMismatch("Cannot define a braiding on $V"))
+    return BraidingTensor(V[2], V[1], A, adjoint)
+end
 function BraidingTensor{T}(V::HomSpace, adjoint::Bool = false) where {T}
     domain(V) == reverse(codomain(V)) ||
         throw(SpaceMismatch("Cannot define a braiding on $V"))
     return BraidingTensor{T}(V[2], V[1], adjoint)
 end
-function Base.adjoint(b::BraidingTensor{T, S}) where {T, S}
-    return BraidingTensor{T, S}(b.V1, b.V2, !b.adjoint)
+function Base.adjoint(b::BraidingTensor{T, S, A}) where {T, S, A}
+    return BraidingTensor{T, S, A}(b.V1, b.V2, A, !b.adjoint)
 end
 
+storagetype(::Type{BraidingTensor{T, S, A}}) where {T, S, A} = A
 space(b::BraidingTensor) = b.adjoint ? b.V1 ⊗ b.V2 ← b.V2 ⊗ b.V1 : b.V2 ⊗ b.V1 ← b.V1 ⊗ b.V2
 
-# specializations to ignore the storagetype of BraidingTensor
-promote_storagetype(::Type{A}, ::Type{B}) where {A <: BraidingTensor, B <: AbstractTensorMap} = storagetype(B)
-promote_storagetype(::Type{A}, ::Type{B}) where {A <: AbstractTensorMap, B <: BraidingTensor} = storagetype(A)
-promote_storagetype(::Type{A}, ::Type{B}) where {A <: BraidingTensor, B <: BraidingTensor} = storagetype(A)
-
-promote_storagetype(::Type{T}, ::Type{A}, ::Type{B}) where {T <: Number, A <: BraidingTensor, B <: AbstractTensorMap} =
-    similarstoragetype(B, T)
-promote_storagetype(::Type{T}, ::Type{A}, ::Type{B}) where {T <: Number, A <: AbstractTensorMap, B <: BraidingTensor} =
-    similarstoragetype(A, T)
-promote_storagetype(::Type{T}, ::Type{A}, ::Type{B}) where {T <: Number, A <: BraidingTensor, B <: BraidingTensor} =
-    similarstoragetype(A, T)
+promote_storagetype(::Type{B}, ::Type{T}) where {B <: BraidingTensor, T <: AbstractTensorMap} =
+    promote_storagetype(storagetype(B), storagetype(T))
+promote_storagetype(::Type{T}, ::Type{B}) where {B <: BraidingTensor, T <: AbstractTensorMap} =
+    promote_storagetype(storagetype(B), storagetype(T))
 
 function Base.getindex(b::BraidingTensor)
     sectortype(b) === Trivial || throw(SectorMismatch())
@@ -99,6 +116,14 @@ function _braiding_factor(f₁, f₂, inv::Bool = false)
     return r
 end
 
+function _set_subblock!(data, val)
+    @inbounds for i in axes(data, 1), j in axes(data, 2)
+        data[i, j, j, i] = val
+    end
+    return data
+end
+
+
 @inline function subblock(
         b::BraidingTensor, (f₁, f₂)::Tuple{FusionTree{I, 2}, FusionTree{I, 2}}
     ) where {I <: Sector}
@@ -115,15 +140,12 @@ end
     d = (dims(codomain(b), f₁.uncoupled)..., dims(domain(b), f₂.uncoupled)...)
     n1 = d[1] * d[2]
     n2 = d[3] * d[4]
-    data = sreshape(StridedView(Matrix{eltype(b)}(undef, n1, n2)), d)
+    data_t = similarmatrixtype(storagetype(b))(undef, (n1, n2))
+    data = sreshape(StridedView(data_t), d)
     fill!(data, zero(eltype(b)))
 
     r = _braiding_factor(f₁, f₂, b.adjoint)
-    if !isnothing(r)
-        @inbounds for i in axes(data, 1), j in axes(data, 2)
-            data[i, j, j, i] = r
-        end
-    end
+    !isnothing(r) && _set_subblock!(data, r)
     return data
 end
 
@@ -134,8 +156,31 @@ TensorMap(b::BraidingTensor) = copy!(similar(b), b)
 Base.convert(::Type{TensorMap}, b::BraidingTensor) = TensorMap(b)
 
 Base.complex(b::BraidingTensor{<:Complex}) = b
-function Base.complex(b::BraidingTensor)
-    return BraidingTensor{complex(scalartype(b))}(space(b), b.adjoint)
+function Base.complex(b::BraidingTensor{T, S, A}) where {T, S, A}
+    Ac = similarstoragetype(A, complex(T))
+    return BraidingTensor(space(b), Ac, b.adjoint)
+end
+
+function _trivial_subblock!(data, b::BraidingTensor)
+    V1, V2 = codomain(b)
+    d1, d2 = dim(V1), dim(V2)
+    subblock = sreshape(StridedView(data), (d1, d2, d2, d1))
+    _set_subblock!(subblock, one(eltype(b)))
+    return data
+end
+
+function _nontrivial_subblock!(data, b::BraidingTensor, s::Sector)
+    base_offset = first(blockstructure(b)[s][2]) - 1
+
+    for ((f₁, f₂), (sz, str, off)) in pairs(subblockstructure(space(b)))
+        (f₁.coupled == f₂.coupled == s) || continue
+        r = _braiding_factor(f₁, f₂, b.adjoint)
+        isnothing(r) && continue
+        # change offset to account for single block
+        subblock = StridedView(data, sz, str, off - base_offset)
+        _set_subblock!(subblock, r)
+    end
+    return data
 end
 
 function block(b::BraidingTensor, s::Sector)
@@ -145,36 +190,17 @@ function block(b::BraidingTensor, s::Sector)
     # TODO: probably always square?
     m = blockdim(codomain(b), s)
     n = blockdim(domain(b), s)
-    data = Matrix{eltype(b)}(undef, (m, n))
 
-    length(data) == 0 && return data # s ∉ blocksectors(b)
+    m * n == 0  && return similarmatrixtype(storagetype(b))(undef, (m, n)) # s ∉ blocksectors(b)
 
+    data = similarmatrixtype(storagetype(b))(undef, (m, n))
     data = fill!(data, zero(eltype(b)))
 
-    V1, V2 = codomain(b)
     if sectortype(b) === Trivial
-        d1, d2 = dim(V1), dim(V2)
-        subblock = sreshape(StridedView(data), (d1, d2, d2, d1))
-        @inbounds for i in axes(subblock, 1), j in axes(subblock, 2)
-            subblock[i, j, j, i] = one(eltype(b))
-        end
-        return data
-    end
-
-    base_offset = first(blockstructure(b)[s][2]) - 1
-
-    for ((f₁, f₂), (sz, str, off)) in pairs(subblockstructure(space(b)))
-        (f₁.coupled == f₂.coupled == s) || continue
-        r = _braiding_factor(f₁, f₂, b.adjoint)
-        isnothing(r) && continue
-        # change offset to account for single block
-        subblock = StridedView(data, sz, str, off - base_offset)
-        @inbounds for i in axes(subblock, 1), j in axes(subblock, 2)
-            subblock[i, j, j, i] = r
-        end
+        return _trivial_subblock!(data, b)
+    else
+        return _nontrivial_subblock!(data, b, s)
     end
-
-    return data
 end
 
 # Index manipulations
diff --git a/test/cuda/planar.jl b/test/cuda/planar.jl
new file mode 100644
index 000000000..067cedc13
--- /dev/null
+++ b/test/cuda/planar.jl
@@ -0,0 +1,297 @@
+using Test, TestExtras
+using Adapt
+using TensorKit
+using TensorKit: PlanarTrivial, ℙ
+using TensorKit: planaradd!, planartrace!, planarcontract!
+using TensorOperations, CUDA, cuTENSOR
+
+@isdefined(TestSetup) || include("../setup.jl")
+using .TestSetup
+
+@testset "Braiding tensor" begin
+    for V in (Vtr, VU₁, VfU₁, VfSU₂, Vfib)
+        W = V[1] ⊗ V[2] ← V[2] ⊗ V[1]
+        T = isreal(sectortype(W)) ? Float64 : ComplexF64
+        t1 = @constinferred BraidingTensor(W, CuVector{T, CUDA.DeviceMemory})
+        @test space(t1) == W
+        @test codomain(t1) == codomain(W)
+        @test domain(t1) == domain(W)
+        @test scalartype(t1) == (isreal(sectortype(W)) ? Float64 : ComplexF64)
+        @test storagetype(t1) == CuVector{scalartype(t1), CUDA.DeviceMemory}
+        t2 = @constinferred BraidingTensor(W, CuVector{ComplexF64, CUDA.DeviceMemory})
+        @test scalartype(t2) == ComplexF64
+        @test storagetype(t2) == CuVector{ComplexF64, CUDA.DeviceMemory}
+        t3 = @testinferred adapt(storagetype(t2), t1)
+        @test storagetype(t3) == storagetype(t2)
+        # allowscalar needed for the StridedView comparison
+        CUDA.@allowscalar begin
+            @test t3 == t2
+        end
+
+        W2 = reverse(codomain(W)) ← domain(W)
+        @test_throws SpaceMismatch BraidingTensor(W2)
+
+        @test adjoint(t1) isa BraidingTensor
+        @test complex(t1) isa BraidingTensor
+        @test scalartype(complex(t1)) <: Complex
+
+        t3 = @inferred TensorMap(t2)
+        @test storagetype(t3) == CuVector{ComplexF64, CUDA.DeviceMemory}
+        t4 = braid(adapt(CuArray, id(scalartype(t2), domain(t2))), ((2, 1), (3, 4)), (1, 2, 3, 4))
+        CUDA.@allowscalar begin
+            @test t1 ≈ t4
+        end
+        for (c, b) in blocks(t1)
+            @test block(t1, c) ≈ b ≈ block(t3, c)
+        end
+
+        CUDA.@allowscalar begin
+            for (f1, f2) in fusiontrees(t1)
+                @test t1[f1, f2] ≈ t3[f1, f2]
+            end
+        end
+
+        t5 = @inferred TensorMap(t2')
+        @test storagetype(t5) == CuVector{ComplexF64, CUDA.DeviceMemory}
+        t6 = braid(adapt(CuArray, id(scalartype(t2), domain(t2'))), ((2, 1), (3, 4)), (4, 3, 2, 1))
+        CUDA.@allowscalar begin
+            @test t5 ≈ t6
+            for (c, b) in blocks(t1')
+                @test block(t1', c) ≈ b ≈ block(t5, c)
+            end
+            for (f1, f2) in fusiontrees(t1')
+                # needed here for broadcasting the - in isapprox
+                @test t1'[f1, f2] ≈ t5[f1, f2]
+            end
+        end
+    end
+end
+
+@testset "planar methods" verbose = true begin
+    @testset "planaradd" begin
+        A = CUDA.randn(ℂ^2 ⊗ ℂ^3 ← ℂ^6 ⊗ ℂ^5 ⊗ ℂ^4)
+        C = CUDA.randn((ℂ^5)' ⊗ (ℂ^6)' ← ℂ^4 ⊗ (ℂ^3)' ⊗ (ℂ^2)')
+        A′ = force_planar(A)
+        C′ = force_planar(C)
+        p = ((4, 3), (5, 2, 1))
+
+        @test force_planar(tensoradd!(C, A, p, false, true, true)) ≈
+            planaradd!(C′, A′, p, true, true)
+    end
+
+    @testset "planartrace" begin
+        A = CUDA.randn(ℂ^2 ⊗ ℂ^3 ← ℂ^2 ⊗ ℂ^5 ⊗ ℂ^4)
+        C = CUDA.randn((ℂ^5)' ⊗ ℂ^3 ← ℂ^4)
+        A′ = force_planar(A)
+        C′ = force_planar(C)
+        p = ((4, 2), (5,))
+        q = ((1,), (3,))
+
+        @test force_planar(tensortrace!(C, A, p, q, false, true, true)) ≈
+            planartrace!(C′, A′, p, q, true, true)
+    end
+
+    @testset "planarcontract" begin
+        A = CUDA.randn(ℂ^2 ⊗ ℂ^3 ← ℂ^2 ⊗ ℂ^5 ⊗ ℂ^4)
+        B = CUDA.randn(ℂ^2 ⊗ ℂ^4 ← ℂ^4 ⊗ ℂ^3)
+        C = CUDA.randn((ℂ^5)' ⊗ (ℂ^2)' ⊗ ℂ^2 ← (ℂ^2)' ⊗ ℂ^4)
+
+        A′ = force_planar(A)
+        B′ = force_planar(B)
+        C′ = force_planar(C)
+
+        pA = ((1, 3, 4), (5, 2))
+        pB = ((2, 4), (1, 3))
+        pAB = ((3, 2, 1), (4, 5))
+
+        @test force_planar(tensorcontract!(C, A, pA, false, B, pB, false, pAB, true, true)) ≈
+            planarcontract!(C′, A′, pA, B′, pB, pAB, true, true)
+    end
+end
+
+@testset "@planar" verbose = true begin
+    T = ComplexF64
+
+    @testset "contractcheck" begin
+        V = ℂ^2
+        A = CUDA.rand(T, V ⊗ V ← V)
+        B = CUDA.rand(T, V ⊗ V ← V')
+        @tensor C1[i j; k l] := A[i j; m] * B[k l; m]
+        @tensor contractcheck = true C2[i j; k l] := A[i j; m] * B[k l; m]
+        @test C1 ≈ C2
+        B2 = CUDA.rand(T, V ⊗ V ← V) # wrong duality for third space
+        @test_throws SpaceMismatch("incompatible spaces for m: $V ≠ $(V')") begin
+            @tensor contractcheck = true C3[i j; k l] := A[i j; m] * B2[k l; m]
+        end
+
+        #= # TODO NEEDS UPDATES TO planar/preprocessors
+        A = CUDA.rand(T, V ← V ⊗ V)
+        B = CUDA.rand(T, V ⊗ V ← V)
+        @planar C1[i; j] := A[i; k l] * τ[k l; m n] * B[m n; j]
+        @planar contractcheck = true C2[i; j] := A[i; k l] * τ[k l; m n] * B[m n; j]
+        @test C1 ≈ C2
+        @test_throws SpaceMismatch("incompatible spaces for m: $V ≠ $(V')") begin
+            @planar contractcheck = true C3[i; j] := A[i; k l] * τ[k l; m n] * B[n j; m]
+        end=#
+    end
+
+    @testset "MPS networks" begin
+        P = ℂ^2
+        Vmps = ℂ^12
+        Vmpo = ℂ^4
+
+        # ∂AC
+        # -------
+        x = CUDA.randn(T, Vmps ⊗ P ← Vmps)
+        O = CUDA.randn(T, Vmpo ⊗ P ← P ⊗ Vmpo)
+        GL = CUDA.randn(T, Vmps ⊗ Vmpo' ← Vmps)
+        GR = CUDA.randn(T, Vmps ⊗ Vmpo ← Vmps)
+
+        x′ = force_planar(x)
+        O′ = force_planar(O)
+        GL′ = force_planar(GL)
+        GR′ = force_planar(GR)
+
+        for alloc in
+            (TensorOperations.DefaultAllocator(), TensorOperations.CUDAAllocator())
+            @tensor allocator = alloc y[-1 -2; -3] := GL[-1 2; 1] * x[1 3; 4] *
+                O[2 -2; 3 5] * GR[4 5; -3]
+            @planar allocator = alloc y′[-1 -2; -3] := GL′[-1 2; 1] * x′[1 3; 4] *
+                O′[2 -2; 3 5] * GR′[4 5; -3]
+            @test force_planar(y) ≈ y′
+        end
+
+        # ∂AC2
+        # -------
+        x2 = CUDA.randn(T, Vmps ⊗ P ← Vmps ⊗ P')
+        x2′ = force_planar(x2)
+        @tensor contractcheck = true y2[-1 -2; -3 -4] := GL[-1 7; 6] * x2[6 5; 1 3] *
+            O[7 -2; 5 4] * O[4 -4; 3 2] *
+            GR[1 2; -3]
+        @planar y2′[-1 -2; -3 -4] := GL′[-1 7; 6] * x2′[6 5; 1 3] * O′[7 -2; 5 4] *
+            O′[4 -4; 3 2] * GR′[1 2; -3]
+        @test force_planar(y2) ≈ y2′
+
+        # transfer matrix
+        # ----------------
+        v = CUDA.randn(T, Vmps ← Vmps)
+        v′ = force_planar(v)
+        @tensor ρ[-1; -2] := x[-1 2; 1] * conj(x[-2 2; 3]) * v[1; 3]
+        @planar ρ′[-1; -2] := x′[-1 2; 1] * conj(x′[-2 2; 3]) * v′[1; 3]
+        @test force_planar(ρ) ≈ ρ′
+
+        @tensor ρ2[-1 -2; -3] := GL[1 -2; 3] * x[3 2; -3] * conj(x[1 2; -1])
+        @plansor ρ3[-1 -2; -3] := GL[1 2; 4] * x[4 5; -3] * τ[2 3; 5 -2] * conj(x[1 3; -1])
+        #@planar ρ2′[-1 -2; -3] := GL′[1 2; 4] * x′[4 5; -3] * τ[2 3; 5 -2] *
+        #    conj(x′[1 3; -1])
+        #@test force_planar(ρ2) ≈ ρ2′
+        @test ρ2 ≈ ρ3
+
+        # Periodic boundary conditions
+        # ----------------------------
+        f1 = isomorphism(storagetype(O), fuse(Vmpo^3), Vmpo ⊗ Vmpo' ⊗ Vmpo)
+        f2 = isomorphism(storagetype(O), fuse(Vmpo^3), Vmpo ⊗ Vmpo' ⊗ Vmpo)
+        f1′ = force_planar(f1)
+        f2′ = force_planar(f2)
+        @tensor O_periodic1[-1 -2; -3 -4] := O[1 -2; -3 2] * f1[-1; 1 3 4] *
+            conj(f2[-4; 2 3 4])
+        @plansor O_periodic2[-1 -2; -3 -4] := O[1 2; -3 6] * f1[-1; 1 3 5] *
+            conj(f2[-4; 6 7 8]) * τ[2 3; 7 4] *
+            τ[4 5; 8 -2]
+        #=@planar O_periodic′[-1 -2; -3 -4] := O′[1 2; -3 6] * f1′[-1; 1 3 5] *
+            conj(f2′[-4; 6 7 8]) * τ[2 3; 7 4] *
+            τ[4 5; 8 -2]=#
+        @test O_periodic1 ≈ O_periodic2
+        #@test force_planar(O_periodic1) ≈ O_periodic′
+    end
+
+    @testset "MERA networks" begin
+        Vmera = ℂ^2
+
+        u = CUDA.randn(T, Vmera ⊗ Vmera ← Vmera ⊗ Vmera)
+        w = CUDA.randn(T, Vmera ⊗ Vmera ← Vmera)
+        ρ = CUDA.randn(T, Vmera ⊗ Vmera ⊗ Vmera ← Vmera ⊗ Vmera ⊗ Vmera)
+        h = CUDA.randn(T, Vmera ⊗ Vmera ⊗ Vmera ← Vmera ⊗ Vmera ⊗ Vmera)
+
+        u′ = force_planar(u)
+        w′ = force_planar(w)
+        ρ′ = force_planar(ρ)
+        h′ = force_planar(h)
+
+        for alloc in
+            (TensorOperations.DefaultAllocator(), TensorOperations.CUDAAllocator())
+            @tensor allocator = alloc begin
+                C = (
+                    (
+                        (
+                            (
+                                (
+                                    ((h[9 3 4; 5 1 2] * u[1 2; 7 12]) * conj(u[3 4; 11 13])) *
+                                        (u[8 5; 15 6] * w[6 7; 19])
+                                ) *
+                                    (conj(u[8 9; 17 10]) * conj(w[10 11; 22]))
+                            ) *
+                                ((w[12 14; 20] * conj(w[13 14; 23])) * ρ[18 19 20; 21 22 23])
+                        ) *
+                            w[16 15; 18]
+                    ) * conj(w[16 17; 21])
+                )
+            end
+            @planar allocator = alloc begin
+                C′ = (
+                    (
+                        (
+                            (
+                                (
+                                    ((h′[9 3 4; 5 1 2] * u′[1 2; 7 12]) * conj(u′[3 4; 11 13])) *
+                                        (u′[8 5; 15 6] * w′[6 7; 19])
+                                ) *
+                                    (conj(u′[8 9; 17 10]) * conj(w′[10 11; 22]))
+                            ) *
+                                ((w′[12 14; 20] * conj(w′[13 14; 23])) * ρ′[18 19 20; 21 22 23])
+                        ) *
+                            w′[16 15; 18]
+                    ) * conj(w′[16 17; 21])
+                )
+            end
+            @test C ≈ C′
+        end
+    end
+
+    @testset "Issue 93" begin
+        T = Float64
+        V1 = ℂ^2
+        V2 = ℂ^3
+        t1 = CUDA.rand(T, V1 ← V2)
+        t2 = CUDA.rand(T, V2 ← V1)
+
+        tr1 = @planar opt = true t1[a; b] * t2[b; a] / 2
+        #=tr2 = @planar opt = true t1[d; a] * t2[b; c] * 1 / 2 * τ[c b; a d]
+        tr3 = @planar opt = true t1[d; a] * t2[b; c] * τ[a c; d b] / 2
+        tr4 = @planar opt = true t1[f; a] * 1 / 2 * t2[c; d] * τ[d b; c e] * τ[e b; a f]
+        tr5 = @planar opt = true t1[f; a] * t2[c; d] / 2 * τ[d b; c e] * τ[a e; f b]
+        tr6 = @planar opt = true t1[f; a] * t2[c; d] * τ[c d; e b] / 2 * τ[e b; a f]
+        tr7 = @planar opt = true t1[f; a] * t2[c; d] * (τ[c d; e b] * τ[a e; f b] / 2)=#
+
+        #@test tr1 ≈ tr2 ≈ tr3 ≈ tr4 ≈ tr5 ≈ tr6 ≈ tr7
+
+        tr1 = @plansor opt = true t1[a; b] * t2[b; a] / 2
+        #=tr2 = @plansor opt = true t1[d; a] * t2[b; c] * 1 / 2 * τ[c b; a d]
+        tr3 = @plansor opt = true t1[d; a] * t2[b; c] * τ[a c; d b] / 2
+        tr4 = @plansor opt = true t1[f; a] * 1 / 2 * t2[c; d] * τ[d b; c e] * τ[e b; a f]
+        tr5 = @plansor opt = true t1[f; a] * t2[c; d] / 2 * τ[d b; c e] * τ[a e; f b]
+        tr6 = @plansor opt = true t1[f; a] * t2[c; d] * τ[c d; e b] / 2 * τ[e b; a f]
+        tr7 = @plansor opt = true t1[f; a] * t2[c; d] * (τ[c d; e b] * τ[a e; f b] / 2)=#
+
+        #@test tr1 ≈ tr2 ≈ tr3 ≈ tr4 ≈ tr5 ≈ tr6 ≈ tr7
+    end
+    @testset "Issue 262" begin
+        V = ℂ^2
+        A = CUDA.rand(T, V ← V)
+        B = CUDA.rand(T, V ← V')
+        C = CUDA.rand(T, V' ← V)
+        @planar D1[i; j] := A[i; j] + B[i; k] * C[k; j]
+        @planar D2[i; j] := B[i; k] * C[k; j] + A[i; j]
+        @test D1 ≈ D2
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index ad7b4006e..7344785af 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -51,6 +51,7 @@ istestfile(fn) = endswith(fn, ".jl") && !contains(fn, "setup")
         CUDA.functional() || continue
         @time include("cuda/tensors.jl")
         @time include("cuda/factorizations.jl")
+        @time include("cuda/planar.jl")
     elseif is_buildkite
         continue
     end
diff --git a/test/setup.jl b/test/setup.jl
index afc8091a5..c668211d8 100644
--- a/test/setup.jl
+++ b/test/setup.jl
@@ -103,7 +103,7 @@ function force_planar(V::GradedSpace)
 end
 force_planar(V::ProductSpace) = mapreduce(force_planar, ⊗, V)
 function force_planar(tsrc::TensorMap{<:Any, ComplexSpace})
-    tdst = TensorMap{scalartype(tsrc)}(
+    tdst = TensorKit.TensorMapWithStorage{scalartype(tsrc), storagetype(tsrc)}(
         undef,
         force_planar(codomain(tsrc)) ←
             force_planar(domain(tsrc))
@@ -112,7 +112,7 @@ function force_planar(tsrc::TensorMap{<:Any, ComplexSpace})
     return tdst
 end
 function force_planar(tsrc::TensorMap{<:Any, <:GradedSpace})
-    tdst = TensorMap{scalartype(tsrc)}(
+    tdst = TensorKit.TensorMapWithStorage{scalartype(tsrc), storagetype(tsrc)}(
         undef,
         force_planar(codomain(tsrc)) ←
             force_planar(domain(tsrc))