Wrapper for Blocksparse CuTensor code (#3057)
Conversation
… to make it a union type of CuTensorBS and AbstractArray?
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.diff --git a/lib/cutensor/src/blocksparse/interfaces.jl b/lib/cutensor/src/blocksparse/interfaces.jl
index c6eef0e5b..0a479ddf8 100644
--- a/lib/cutensor/src/blocksparse/interfaces.jl
+++ b/lib/cutensor/src/blocksparse/interfaces.jl
@@ -1,4 +1,4 @@
-## For now call contract in ITensor and rely on UnallocatedArrays to make
+## For now call contract in ITensor and rely on UnallocatedArrays to make
## C in a dry-run of the contraction.
# function Base.:(*)(A::CuTensorBS, B::CuTensorBs)
# tC = promote_type(eltype(A), eltype(B))
@@ -18,11 +18,13 @@
using LinearAlgebra
function LinearAlgebra.mul!(C::CuTensorBS, A::CuTensorBS, B::CuTensorBS, α::Number, β::Number)
- contract!(α,
- A, A.inds, CUTENSOR_OP_IDENTITY,
- B, B.inds, CUTENSOR_OP_IDENTITY,
- β,
- C, C.inds, CUTENSOR_OP_IDENTITY,
- CUTENSOR_OP_IDENTITY; jit=CUTENSOR_JIT_MODE_DEFAULT)
- return C
-end
\ No newline at end of file
+ contract!(
+ α,
+ A, A.inds, CUTENSOR_OP_IDENTITY,
+ B, B.inds, CUTENSOR_OP_IDENTITY,
+ β,
+ C, C.inds, CUTENSOR_OP_IDENTITY,
+ CUTENSOR_OP_IDENTITY; jit = CUTENSOR_JIT_MODE_DEFAULT
+ )
+ return C
+end
diff --git a/lib/cutensor/src/blocksparse/operations.jl b/lib/cutensor/src/blocksparse/operations.jl
index 19542e5de..0f98c92ef 100644
--- a/lib/cutensor/src/blocksparse/operations.jl
+++ b/lib/cutensor/src/blocksparse/operations.jl
@@ -9,23 +9,26 @@ function contract!(
@nospecialize(beta::Number),
@nospecialize(C), Cinds::ModeType, opC::cutensorOperator_t,
opOut::cutensorOperator_t;
- jit::cutensorJitMode_t=JIT_MODE_NONE,
- workspace::cutensorWorksizePreference_t=WORKSPACE_DEFAULT,
- algo::cutensorAlgo_t=ALGO_DEFAULT,
- compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing}=nothing,
- plan::Union{CuTensorPlan, Nothing}=nothing)
+ jit::cutensorJitMode_t = JIT_MODE_NONE,
+ workspace::cutensorWorksizePreference_t = WORKSPACE_DEFAULT,
+ algo::cutensorAlgo_t = ALGO_DEFAULT,
+ compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing} = nothing,
+ plan::Union{CuTensorPlan, Nothing} = nothing
+ )
actual_plan = if plan === nothing
- plan_contraction(A, Ainds, opA, B, Binds, opB, C, Cinds, opC, opOut;
- jit, workspace, algo, compute_type)
+ plan_contraction(
+ A, Ainds, opA, B, Binds, opB, C, Cinds, opC, opOut;
+ jit, workspace, algo, compute_type
+ )
else
plan
end
contractBS!(actual_plan, alpha, nonzero_blocks(A), nonzero_blocks(B), beta, nonzero_blocks(C))
-
+
if plan === nothing
- CUDA.unsafe_free!(actual_plan)
+ CUDA.unsafe_free!(actual_plan)
end
return C
@@ -33,12 +36,14 @@ end
## This function assumes A, B, and C are Arrays of pointers to CuArrays.
## Please overwrite the `nonzero_blocks` function for your datatype to access this function from contract!
-function contractBS!(plan::CuTensorPlan,
- @nospecialize(alpha::Number),
- @nospecialize(A::AbstractArray),
- @nospecialize(B::AbstractArray),
- @nospecialize(beta::Number),
- @nospecialize(C::AbstractArray))
+function contractBS!(
+ plan::CuTensorPlan,
+ @nospecialize(alpha::Number),
+ @nospecialize(A::AbstractArray),
+ @nospecialize(B::AbstractArray),
+ @nospecialize(beta::Number),
+ @nospecialize(C::AbstractArray)
+ )
scalar_type = plan.scalar_type
# Extract GPU pointers from each CuArray block
@@ -46,11 +51,13 @@ function contractBS!(plan::CuTensorPlan,
A_ptrs = CuPtr{Cvoid}[pointer(block) for block in A]
B_ptrs = CuPtr{Cvoid}[pointer(block) for block in B]
C_ptrs = CuPtr{Cvoid}[pointer(block) for block in C]
-
- cutensorBlockSparseContract(handle(), plan,
- Ref{scalar_type}(alpha), A_ptrs, B_ptrs,
- Ref{scalar_type}(beta), C_ptrs, C_ptrs,
- plan.workspace, sizeof(plan.workspace), stream())
+
+ cutensorBlockSparseContract(
+ handle(), plan,
+ Ref{scalar_type}(alpha), A_ptrs, B_ptrs,
+ Ref{scalar_type}(beta), C_ptrs, C_ptrs,
+ plan.workspace, sizeof(plan.workspace), stream()
+ )
synchronize(stream())
return C
end
@@ -60,21 +67,22 @@ function plan_contraction(
@nospecialize(B), Binds::ModeType, opB::cutensorOperator_t,
@nospecialize(C), Cinds::ModeType, opC::cutensorOperator_t,
opOut::cutensorOperator_t;
- jit::cutensorJitMode_t=JIT_MODE_NONE,
- workspace::cutensorWorksizePreference_t=WORKSPACE_DEFAULT,
- algo::cutensorAlgo_t=ALGO_DEFAULT,
- compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing}=nothing)
+ jit::cutensorJitMode_t = JIT_MODE_NONE,
+ workspace::cutensorWorksizePreference_t = WORKSPACE_DEFAULT,
+ algo::cutensorAlgo_t = ALGO_DEFAULT,
+ compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing} = nothing
+ )
!is_unary(opA) && throw(ArgumentError("opA must be a unary op!"))
!is_unary(opB) && throw(ArgumentError("opB must be a unary op!"))
!is_unary(opC) && throw(ArgumentError("opC must be a unary op!"))
!is_unary(opOut) && throw(ArgumentError("opOut must be a unary op!"))
-
+
descA = CuTensorBSDescriptor(A)
descB = CuTensorBSDescriptor(B)
descC = CuTensorBSDescriptor(C)
# for now, D must be identical to C (and thus, descD must be identical to descC)
-
+
modeA = collect(Cint, Ainds)
modeB = collect(Cint, Binds)
modeC = collect(Cint, Cinds)
@@ -87,17 +95,19 @@ function plan_contraction(
desc = Ref{cutensorOperationDescriptor_t}()
- cutensorCreateBlockSparseContraction(handle(),
- desc,
- descA, modeA, opA,
- descB, modeB, opB,
- descC, modeC, opC,
- descC, modeC, actual_compute_type)
+ cutensorCreateBlockSparseContraction(
+ handle(),
+ desc,
+ descA, modeA, opA,
+ descB, modeB, opB,
+ descC, modeC, opC,
+ descC, modeC, actual_compute_type
+ )
plan_pref = Ref{cutensorPlanPreference_t}()
cutensorCreatePlanPreference(handle(), plan_pref, algo, jit)
- plan = CuTensorPlan(desc[], plan_pref[]; workspacePref=workspace)
+ plan = CuTensorPlan(desc[], plan_pref[]; workspacePref = workspace)
# cutensorDestroyOperationDescriptor(desc[])
cutensorDestroyPlanPreference(plan_pref[])
return plan
diff --git a/lib/cutensor/src/blocksparse/types.jl b/lib/cutensor/src/blocksparse/types.jl
index 292dc4d00..41cbebdbd 100644
--- a/lib/cutensor/src/blocksparse/types.jl
+++ b/lib/cutensor/src/blocksparse/types.jl
@@ -12,20 +12,26 @@ mutable struct CuTensorBS{T, N}
## This expects a Vector{Tuple(Int)} right now
nonzero_block_coords
- function CuTensorBS{T, N}(nonzero_data::Vector{<:CuArray},
- blocks_per_mode::Vector{Int}, block_extents, nonzero_block_coords, inds::Vector) where {T<:Number, N}
+ function CuTensorBS{T, N}(
+ nonzero_data::Vector{<:CuArray},
+ blocks_per_mode::Vector{Int}, block_extents, nonzero_block_coords, inds::Vector
+ ) where {T <: Number, N}
CuArrayT = eltype(nonzero_data)
@assert eltype(CuArrayT) == T
# @assert ndims(CuArrayT) == N
@assert length(block_extents) == N
- new(nonzero_data, inds, blocks_per_mode, block_extents, nonzero_block_coords)
+ return new(nonzero_data, inds, blocks_per_mode, block_extents, nonzero_block_coords)
end
end
-function CuTensorBS(nonzero_data::Vector{<:CuArray{T}},
- blocks_per_mode, block_extents, nonzero_block_coords, inds::Vector) where {T<:Number}
- CuTensorBS{T,length(block_extents)}(nonzero_data,
- blocks_per_mode, block_extents, nonzero_block_coords, inds)
+function CuTensorBS(
+ nonzero_data::Vector{<:CuArray{T}},
+ blocks_per_mode, block_extents, nonzero_block_coords, inds::Vector
+ ) where {T <: Number}
+ return CuTensorBS{T, length(block_extents)}(
+ nonzero_data,
+ blocks_per_mode, block_extents, nonzero_block_coords, inds
+ )
end
# array interface
function Base.size(T::CuTensorBS)
@@ -39,8 +45,8 @@ Base.strides(T::CuTensorBS) = vcat([[st...] for st in strides.(T.nonzero_data)].
Base.eltype(T::CuTensorBS) = eltype(eltype(T.nonzero_data))
function block_extents(T::CuTensorBS)
- extents = Vector{Int64}()
-
+ extents = Vector{Int64}()
+
for ex in T.block_extents
extents = vcat(extents, ex...)
end
@@ -66,18 +72,21 @@ mutable struct CuTensorBSDescriptor
handle::cutensorBlockSparseTensorDescriptor_t
# inner constructor handles creation and finalizer of the descriptor
function CuTensorBSDescriptor(
- numModes,
- numNonZeroBlocks,
- numSectionsPerMode,
- extent,
- nonZeroCoordinates,
- stride,
- eltype)
+ numModes,
+ numNonZeroBlocks,
+ numSectionsPerMode,
+ extent,
+ nonZeroCoordinates,
+ stride,
+ eltype
+ )
desc = Ref{cuTENSOR.cutensorBlockSparseTensorDescriptor_t}()
- cutensorCreateBlockSparseTensorDescriptor(handle(), desc,
- numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates,
- stride, eltype)
+ cutensorCreateBlockSparseTensorDescriptor(
+ handle(), desc,
+ numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates,
+ stride, eltype
+ )
obj = new(desc[])
finalizer(unsafe_destroy!, obj)
@@ -86,12 +95,13 @@ mutable struct CuTensorBSDescriptor
end
function CuTensorBSDescriptor(
- numModes,
- numNonZeroBlocks,
- numSectionsPerMode,
- extent,
- nonZeroCoordinates,
- eltype)
+ numModes,
+ numNonZeroBlocks,
+ numSectionsPerMode,
+ extent,
+ nonZeroCoordinates,
+ eltype
+ )
return CuTensorBSDescriptor(numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates, C_NULL, eltype)
end
@@ -101,7 +111,7 @@ Base.show(io::IO, desc::CuTensorBSDescriptor) = @printf(io, "CuTensorBSDescripto
Base.unsafe_convert(::Type{cutensorBlockSparseTensorDescriptor_t}, obj::CuTensorBSDescriptor) = obj.handle
function unsafe_destroy!(obj::CuTensorBSDescriptor)
- cutensorDestroyBlockSparseTensorDescriptor(obj)
+ return cutensorDestroyBlockSparseTensorDescriptor(obj)
end
## Descriptor function for CuTensorBS type. Please overwrite for custom objects
@@ -110,11 +120,13 @@ function CuTensorBSDescriptor(A::CuTensorBS)
numNonZeroBlocks = Int64(length(A.nonzero_block_coords))
numSectionsPerMode = collect(Int32, A.blocks_per_mode)
extent = block_extents(A)
- nonZeroCoordinates = Int32.(vcat([[x...] for x in A.nonzero_block_coords]...) .- 1)
+ nonZeroCoordinates = Int32.(vcat([[x...] for x in A.nonzero_block_coords]...) .- 1)
st = strides(A)
- dataType = eltype(A)#convert(cuTENSOR.cutensorDataType_t, eltype(A))
+ dataType = eltype(A) #convert(cuTENSOR.cutensorDataType_t, eltype(A))
## Right now assume stride is NULL. I am not sure if stride works, need to discuss with cuTENSOR team.
- CuTensorBSDescriptor(numModes, numNonZeroBlocks,
- numSectionsPerMode, extent, nonZeroCoordinates, dataType)
+ return CuTensorBSDescriptor(
+ numModes, numNonZeroBlocks,
+ numSectionsPerMode, extent, nonZeroCoordinates, dataType
+ )
end
diff --git a/lib/cutensor/src/libcutensor.jl b/lib/cutensor/src/libcutensor.jl
index b33560b72..4e7ba168d 100644
--- a/lib/cutensor/src/libcutensor.jl
+++ b/lib/cutensor/src/libcutensor.jl
@@ -545,12 +545,12 @@ end
@gcsafe_ccall libcutensor.cutensorBlockSparseContract(handle::cutensorHandle_t,
plan::cutensorPlan_t,
alpha::Ptr{Cvoid},
- A::Ptr{CuPtr{Cvoid}},
- B::Ptr{CuPtr{Cvoid}},
+ A::Ptr{CuPtr{Cvoid}},
+ B::Ptr{CuPtr{Cvoid}},
beta::Ptr{Cvoid},
- C::Ptr{CuPtr{Cvoid}},
- D::Ptr{CuPtr{Cvoid}},
- workspace::CuPtr{Cvoid},
+ C::Ptr{CuPtr{Cvoid}},
+ D::Ptr{CuPtr{Cvoid}},
+ workspace::CuPtr{Cvoid},
workspaceSize::UInt64,
stream::cudaStream_t)::cutensorStatus_t
end
diff --git a/lib/cutensor/test/contractions.jl b/lib/cutensor/test/contractions.jl
index 636600a74..baf56949a 100644
--- a/lib/cutensor/test/contractions.jl
+++ b/lib/cutensor/test/contractions.jl
@@ -188,62 +188,73 @@ end
end
end
-eltypes_compact = [
- (Float32, Float32, Float32, Float32),
- (ComplexF32, ComplexF32, ComplexF32, Float32),
- (Float64, Float64, Float64, Float64),
- (ComplexF64, ComplexF64, ComplexF64, Float64)
-]
-@testset "Blocksparse Contraction" begin
- ## There are many unsupported types because this is a new functionality
- ## So I will test with Float32 and ComplexF32 only
- @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes_compact
- ## i = [20,20,25]
- ## k = [10,10,15]
- ## l = [30,30,35]
- ## A = Tensor(k,i,l)
- ## Nonzero blocks are
- ## [1,1,1], [1,1,3], [1,3,1], [1,3,3], [3,1,1], [3,1,3], [3,3,1], [3,3,3]
- A = Vector{CuArray{eltyA, 3}}()
- for k in [10,15]
- for i in [20,25]
- for l in [30,35]
- push!(A, CuArray(ones(eltyA, k,i,l)))
+ eltypes_compact = [
+ (Float32, Float32, Float32, Float32),
+ (ComplexF32, ComplexF32, ComplexF32, Float32),
+ (Float64, Float64, Float64, Float64),
+ (ComplexF64, ComplexF64, ComplexF64, Float64),
+ ]
+ @testset "Blocksparse Contraction" begin
+ ## There are many unsupported types because this is a new functionality
+ ## So I will test with Float32 and ComplexF32 only
+ @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes_compact
+ ## i = [20,20,25]
+ ## k = [10,10,15]
+ ## l = [30,30,35]
+ ## A = Tensor(k,i,l)
+ ## Nonzero blocks are
+ ## [1,1,1], [1,1,3], [1,3,1], [1,3,3], [3,1,1], [3,1,3], [3,3,1], [3,3,3]
+ A = Vector{CuArray{eltyA, 3}}()
+ for k in [10, 15]
+ for i in [20, 25]
+ for l in [30, 35]
+ push!(A, CuArray(ones(eltyA, k, i, l)))
+ end
end
end
- end
- ## B = Tensor(k,l)
- ## Nonzero blocks are
- ## [1,1], [2,3]
- B = Array{CuArray{eltyB, 2}}(
- [CuArray(randn(eltyB, 10, 30)),
- CuArray(randn(eltyB, 10, 35))])
-
- ## C = Tensor(i)
- ## Nonzero blocks are
- ## [1,], [3,]
- C = Vector{CuArray{eltyC, 1}}(
- [CuArray(zeros(eltyC, 20)),
- CuArray(zeros(eltyC, 25))]
- )
-
- cuTenA = cuTENSOR.CuTensorBS(A, [3,3,3],
- [(10,10,15), (20,20,25), (30,30,35)],
- [(1,1,1), (1,1,3), (1,3,1), (1,3,3), (3,1,1), (3,1,3), (3,3,1), (3,3,3)],
- [1,3,2])
- cuTenB = cuTENSOR.CuTensorBS(B, [3,3],
- [(10,10,15), (30,30,35)],
- [(1,1),(2,3)], [1,2], )
- cuTenC = cuTENSOR.CuTensorBS(C, [3],
- [(20,20,25)],[(1,),(3,)], [3])
-
- mul!(cuTenC, cuTenA, cuTenB, 1, 0)
- ## C[1] = A[1,1,1] * B[1,1]
- @test C[1] ≈ reshape(permutedims(A[1], (2,1,3)), (20, 10 * 30)) * reshape(B[1], (10 * 30))
- ## C[3] = A[1,3,1] * B[1,1]
- @test C[2] ≈ reshape(permutedims(A[3], (2,1,3)), (25, 10 * 30)) * reshape(B[1], (10 * 30))
+ ## B = Tensor(k,l)
+ ## Nonzero blocks are
+ ## [1,1], [2,3]
+ B = Array{CuArray{eltyB, 2}}(
+ [
+ CuArray(randn(eltyB, 10, 30)),
+ CuArray(randn(eltyB, 10, 35)),
+ ]
+ )
+
+ ## C = Tensor(i)
+ ## Nonzero blocks are
+ ## [1,], [3,]
+ C = Vector{CuArray{eltyC, 1}}(
+ [
+ CuArray(zeros(eltyC, 20)),
+ CuArray(zeros(eltyC, 25)),
+ ]
+ )
+
+ cuTenA = cuTENSOR.CuTensorBS(
+ A, [3, 3, 3],
+ [(10, 10, 15), (20, 20, 25), (30, 30, 35)],
+ [(1, 1, 1), (1, 1, 3), (1, 3, 1), (1, 3, 3), (3, 1, 1), (3, 1, 3), (3, 3, 1), (3, 3, 3)],
+ [1, 3, 2]
+ )
+ cuTenB = cuTENSOR.CuTensorBS(
+ B, [3, 3],
+ [(10, 10, 15), (30, 30, 35)],
+ [(1, 1), (2, 3)], [1, 2],
+ )
+ cuTenC = cuTENSOR.CuTensorBS(
+ C, [3],
+ [(20, 20, 25)], [(1,), (3,)], [3]
+ )
+
+ mul!(cuTenC, cuTenA, cuTenB, 1, 0)
+ ## C[1] = A[1,1,1] * B[1,1]
+ @test C[1] ≈ reshape(permutedims(A[1], (2, 1, 3)), (20, 10 * 30)) * reshape(B[1], (10 * 30))
+ ## C[3] = A[1,3,1] * B[1,1]
+ @test C[2] ≈ reshape(permutedims(A[3], (2, 1, 3)), (25, 10 * 30)) * reshape(B[1], (10 * 30))
+ end
end
-end
end |
|
There were some issues in Clang.jl's conversion of the cuTENSOR.h file into Julia wrapper functions. Specifically, I had a runtime issue when trying to convert arrays of CuArray into |
…mp5VT/CUDA.jl into kmp5/feature/wrap_blocksparse_cutensor
Codecov Report❌ Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## master #3057 +/- ##
===========================================
+ Coverage 76.94% 89.44% +12.49%
===========================================
Files 148 151 +3
Lines 12984 13149 +165
===========================================
+ Hits 9991 11761 +1770
+ Misses 2993 1388 -1605 ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
There was a problem hiding this comment.
CUDA.jl Benchmarks
Details
| Benchmark suite | Current: 04a39d7 | Previous: c27d64b | Ratio |
|---|---|---|---|
array/accumulate/Float32/1d |
101959.5 ns |
100597.5 ns |
1.01 |
array/accumulate/Float32/dims=1 |
77501 ns |
75664.5 ns |
1.02 |
array/accumulate/Float32/dims=1L |
1586844 ns |
1584793 ns |
1.00 |
array/accumulate/Float32/dims=2 |
144685 ns |
142976.5 ns |
1.01 |
array/accumulate/Float32/dims=2L |
658861.5 ns |
657415 ns |
1.00 |
array/accumulate/Int64/1d |
119293 ns |
117924 ns |
1.01 |
array/accumulate/Int64/dims=1 |
80763 ns |
78997.5 ns |
1.02 |
array/accumulate/Int64/dims=1L |
1695476 ns |
1693971 ns |
1.00 |
array/accumulate/Int64/dims=2 |
156776 ns |
155028.5 ns |
1.01 |
array/accumulate/Int64/dims=2L |
962655 ns |
960575 ns |
1.00 |
array/broadcast |
20717 ns |
20338 ns |
1.02 |
array/construct |
1304.8 ns |
1309.9 ns |
1.00 |
array/copy |
18330.5 ns |
17908 ns |
1.02 |
array/copyto!/cpu_to_gpu |
216111 ns |
212897 ns |
1.02 |
array/copyto!/gpu_to_cpu |
282716 ns |
282235.5 ns |
1.00 |
array/copyto!/gpu_to_gpu |
10952 ns |
10729 ns |
1.02 |
array/iteration/findall/bool |
135067 ns |
134618 ns |
1.00 |
array/iteration/findall/int |
150561 ns |
149283.5 ns |
1.01 |
array/iteration/findfirst/bool |
81787 ns |
80787 ns |
1.01 |
array/iteration/findfirst/int |
84131 ns |
83132 ns |
1.01 |
array/iteration/findmin/1d |
88252 ns |
83706 ns |
1.05 |
array/iteration/findmin/2d |
117861.5 ns |
116228 ns |
1.01 |
array/iteration/logical |
203684.5 ns |
198950 ns |
1.02 |
array/iteration/scalar |
67490 ns |
67702 ns |
1.00 |
array/permutedims/2d |
52686 ns |
51714 ns |
1.02 |
array/permutedims/3d |
53285 ns |
52456 ns |
1.02 |
array/permutedims/4d |
52232.5 ns |
50766 ns |
1.03 |
array/random/rand/Float32 |
12313 ns |
12307 ns |
1.00 |
array/random/rand/Int64 |
36579 ns |
35801.5 ns |
1.02 |
array/random/rand!/Float32 |
8508 ns |
8286 ns |
1.03 |
array/random/rand!/Int64 |
27463 ns |
28673 ns |
0.96 |
array/random/randn/Float32 |
37996 ns |
43560.5 ns |
0.87 |
array/random/randn!/Float32 |
30803 ns |
29768 ns |
1.03 |
array/reductions/mapreduce/Float32/1d |
34711 ns |
34251 ns |
1.01 |
array/reductions/mapreduce/Float32/dims=1 |
39918.5 ns |
40740 ns |
0.98 |
array/reductions/mapreduce/Float32/dims=1L |
51608 ns |
51112 ns |
1.01 |
array/reductions/mapreduce/Float32/dims=2 |
57182 ns |
56063 ns |
1.02 |
array/reductions/mapreduce/Float32/dims=2L |
69934.5 ns |
69133.5 ns |
1.01 |
array/reductions/mapreduce/Int64/1d |
43020 ns |
42250 ns |
1.02 |
array/reductions/mapreduce/Int64/dims=1 |
50137 ns |
41799.5 ns |
1.20 |
array/reductions/mapreduce/Int64/dims=1L |
87260 ns |
86968 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=2 |
59597 ns |
58884 ns |
1.01 |
array/reductions/mapreduce/Int64/dims=2L |
84786 ns |
84459 ns |
1.00 |
array/reductions/reduce/Float32/1d |
35251 ns |
34461 ns |
1.02 |
array/reductions/reduce/Float32/dims=1 |
43874 ns |
40852 ns |
1.07 |
array/reductions/reduce/Float32/dims=1L |
51700.5 ns |
51381 ns |
1.01 |
array/reductions/reduce/Float32/dims=2 |
56938 ns |
56491 ns |
1.01 |
array/reductions/reduce/Float32/dims=2L |
69985 ns |
69612.5 ns |
1.01 |
array/reductions/reduce/Int64/1d |
43070 ns |
41763 ns |
1.03 |
array/reductions/reduce/Int64/dims=1 |
43398 ns |
41776 ns |
1.04 |
array/reductions/reduce/Int64/dims=1L |
87216 ns |
86961 ns |
1.00 |
array/reductions/reduce/Int64/dims=2 |
59529 ns |
59365.5 ns |
1.00 |
array/reductions/reduce/Int64/dims=2L |
84404 ns |
84511 ns |
1.00 |
array/reverse/1d |
17855 ns |
17725 ns |
1.01 |
array/reverse/1dL |
68447 ns |
68319 ns |
1.00 |
array/reverse/1dL_inplace |
65718 ns |
65673 ns |
1.00 |
array/reverse/1d_inplace |
10293.333333333334 ns |
10072.666666666666 ns |
1.02 |
array/reverse/2d |
21114 ns |
20326 ns |
1.04 |
array/reverse/2dL |
73074 ns |
72203 ns |
1.01 |
array/reverse/2dL_inplace |
65751 ns |
65612 ns |
1.00 |
array/reverse/2d_inplace |
10038 ns |
9721 ns |
1.03 |
array/sorting/1d |
2735966.5 ns |
2734503 ns |
1.00 |
array/sorting/2d |
1077088 ns |
1067854 ns |
1.01 |
array/sorting/by |
3305788 ns |
3303246.5 ns |
1.00 |
cuda/synchronization/context/auto |
1180.9 ns |
1178.5 ns |
1.00 |
cuda/synchronization/context/blocking |
940.8571428571429 ns |
927.5833333333334 ns |
1.01 |
cuda/synchronization/context/nonblocking |
8690.7 ns |
7544.4 ns |
1.15 |
cuda/synchronization/stream/auto |
1006 ns |
992.2666666666667 ns |
1.01 |
cuda/synchronization/stream/blocking |
840.71875 ns |
797.5125 ns |
1.05 |
cuda/synchronization/stream/nonblocking |
7503.8 ns |
6980.9 ns |
1.07 |
integration/byval/reference |
143853 ns |
143557 ns |
1.00 |
integration/byval/slices=1 |
146010 ns |
145519 ns |
1.00 |
integration/byval/slices=2 |
284582 ns |
284229.5 ns |
1.00 |
integration/byval/slices=3 |
423155 ns |
422757 ns |
1.00 |
integration/cudadevrt |
102462 ns |
102157 ns |
1.00 |
integration/volumerhs |
23437240.5 ns |
23436970 ns |
1.00 |
kernel/indexing |
13249 ns |
13029 ns |
1.02 |
kernel/indexing_checked |
14108 ns |
13875 ns |
1.02 |
kernel/launch |
2019.1 ns |
2122.4444444444443 ns |
0.95 |
kernel/occupancy |
667.4654088050314 ns |
697.9281045751634 ns |
0.96 |
kernel/rand |
18214 ns |
17197 ns |
1.06 |
latency/import |
3804845451.5 ns |
3802346045 ns |
1.00 |
latency/precompile |
4599414911.5 ns |
4589885396.5 ns |
1.00 |
latency/ttfp |
4397998402.5 ns |
4414744302 ns |
1.00 |
This comment was automatically generated by workflow using github-action-benchmark.
|
Thanks very much for putting this together, I'm happy to help with the header issues if needed! |
…but the C++ code is still in flux)
|
@kshyatt I removed the extra code, made the functions that linked to the library relatively agnostic (i.e. you are not forced to use CuTensorBS but can buy in if you'd like) and added a unit test. If you could help with the Clang.jl issue, that would be amazing! |
|
I'll try to take a look today! |
|
Did you use the scripts in |
…mp5VT/CUDA.jl into kmp5/feature/wrap_blocksparse_cutensor
Yes I did use the scripts but this produced the ERROR: MethodError: no method matching unsafe_convert(::Type{Ptr{Nothing}}, ::CuPtr{Nothing})
The function `unsafe_convert` exists, but no method is defined for this combination of argument types.
Closest candidates are:
unsafe_convert(::Type{Ptr{Nothing}}, ::LibGit2.GitBlame)
@ LibGit2 ~/.julia/juliaup/julia-1.12.1+0.x64.linux.gnu/share/julia/stdlib/v1.12/LibGit2/src/types.jl:1096
unsafe_convert(::Type{Ptr{Nothing}}, ::LibGit2.GitRevWalker)
@ LibGit2 ~/.julia/juliaup/julia-1.12.1+0.x64.linux.gnu/share/julia/stdlib/v1.12/LibGit2/src/types.jl:1096
unsafe_convert(::Type{Ptr{Nothing}}, ::LibGit2.GitDiffStats)
@ LibGit2 ~/.julia/juliaup/julia-1.12.1+0.x64.linux.gnu/share/julia/stdlib/v1.12/LibGit2/src/types.jl:1096
...
Stacktrace:
[1] Ref{Ptr{Nothing}}(a::Vector{CuPtr{Nothing}})
@ Base ./refpointer.jl:166
[2] cconvert
@ ./refpointer.jl:178 [inlined]
[3] macro expansion
@ ~/.julia/dev/CUDA.jl/lib/cutensor/src/libcutensor.jl:545 [inlined]
[4] (::cuTENSOR.var"#cutensorBlockSparseContract##0#cutensorBlockSparseContract##1"{…})()
@ cuTENSOR ~/.julia/packages/GPUToolbox/JLBB1/src/ccalls.jl:34
[5] retry_reclaim
@ ~/.julia/packages/CUDA/Il00B/src/memory.jl:434 [inlined]
[6] check
@ ~/.julia/dev/CUDA.jl/lib/cutensor/src/libcutensor.jl:22 [inlined]
[7] cutensorBlockSparseContract
@ ~/.julia/packages/GPUToolbox/JLBB1/src/ccalls.jl:33 [inlined]
[8]
@ cuTENSOR ~/.julia/dev/CUDA.jl/lib/cutensor/src/blocksparse/operations.jl:50
[9] contract!(alpha::Number, A::Any, Ainds::Vector{…}, opA::cuTENSOR.cutensorOperator_t, B::Any, Binds::Vector{…}, opB::cuTENSOR.cutensorOperator_t, beta::Number, C::Any, Cinds::Vector{…}, opC::cuTENSOR.cutensorOperator_t, opOut::cuTENSOR.cutensorOperator_t; jit::cuTENSOR.cutensorJitMode_t, workspace::cuTENSOR.cutensorWorksizePreference_t, algo::cuTENSOR.cutensorAlgo_t, compute_type::Nothing, plan::Nothing)
@ cuTENSOR ~/.julia/dev/CUDA.jl/lib/cutensor/src/blocksparse/operations.jl:25
[10] mul!(C::CuTensorBS{Float64, 1}, A::CuTensorBS{Float64, 3}, B::CuTensorBS{Float64, 2}, α::Float64, β::Float64)
@ cuTENSOR ~/.julia/dev/CUDA.jl/lib/cutensor/src/blocksparse/interfaces.jl:21However, I found that If I modify the code to be |
|
Probably you missed some of the weird esoterica in |
17806da to
cc4b826
Compare
|
Thanks for doing all the work to get this going, I think it will be quite useful for a bunch of TN packages... |
lkdvos
left a comment
There was a problem hiding this comment.
Left some remaining comments, but for me I think most of the parts that I would use are there, since I don't really see myself going through the CuTensorBS construction (we also never used the CuTensor in TensorOperations so that is completely fine)
Remove left over code. Will need to make something like this to define mul! in the future Co-authored-by: Lukas Devos <ldevos98@gmail.com>
…mp5VT/CUDA.jl into kmp5/feature/wrap_blocksparse_cutensor
…to a contiguous memory block)
|
@kshyatt |
Hi,
This is a wrapper type and functions to access the newly introduced blocksparse cuTENSOR backend. Right now the code is expert-level, i.e. users need to write a type that converts their objects to CuTensorBS types, or implement the low-level operations required by cuTENSOR kernels themselves. I am still writing a test, but the code is fully operational.
Thanks,
Karl