Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions test/core/device/intrinsics/math.jl
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,68 @@ using SpecialFunctions
@assert !contains(asm, "__nv") # from libdevice
end

@testset "inv" begin
    # Base.inv must lower to the accurate reciprocal instruction (rcp.rn),
    # not the approximate one, for both single and double precision.
    for T in (Float32, Float64)
        @test testf(x -> inv.(x), rand(T, 10) .+ T(0.1))
        @test testf(x -> inv.(x), T[0.1, 0.5, 1.0, 2.0, 10.0, 100.0])
    end

    # Inspect the generated PTX to confirm the exact instruction emitted.
    function kernel_inv(a)
        @inbounds a[] = inv(a[])
        return
    end
    for (T, instr) in ((Float32, "rcp.rn.f32"), (Float64, "rcp.rn.f64"))
        asm = sprint(io -> CUDA.code_ptx(io, kernel_inv, NTuple{1, CuDeviceArray{T, 1, AS.Global}}))
        @test contains(asm, instr)
    end
end

@testset "inv_fast" begin
    # Generic kernel computing the fast-math reciprocal in place; the
    # element type is fixed by the signature passed to code_ptx below.
    function kernel_inv_fast(a)
        @inbounds a[] = @fastmath inv(a[])
        return
    end
    approx_inv(x) = @fastmath inv(x)

    # inv_fast(Float32) uses rcp.approx.ftz.f32 (~14 bits of mantissa)
    asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_fast, NTuple{1, CuDeviceArray{Float32, 1, AS.Global}}))
    @test contains(asm, "rcp.approx.ftz.f32")

    vals32 = Float32[0.1, 0.5, 1.0, 2.0, 10.0, 100.0]
    @test Array(map(approx_inv, cu(vals32))) ≈ inv.(vals32) rtol = 1.0f-4

    # inv_fast(Float64) uses rcp.approx.ftz.f64 refined with Newton-Raphson
    asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_fast, NTuple{1, CuDeviceArray{Float64, 1, AS.Global}}))
    @test contains(asm, "rcp.approx.ftz.f64")

    vals64 = Float64[0.1, 0.5, 1.0, 2.0, 10.0, 100.0]
    @test Array(map(approx_inv, CuArray(vals64))) ≈ inv.(vals64) rtol = 1.0e-10
end

@testset "div_fast Float64" begin
    # FastMath division of Float64 should lower to multiplication by the
    # fast reciprocal (x * inv_fast(y)), visible as rcp.approx.ftz.f64 in PTX.
    function kernel_div_fast(a, b, c)
        @inbounds c[] = @fastmath a[] / b[]
        return
    end
    asm = sprint(io -> CUDA.code_ptx(io, kernel_div_fast, NTuple{3, CuDeviceArray{Float64, 1, AS.Global}}))
    @test contains(asm, "rcp.approx.ftz.f64")

    # Numerical sanity check against the accurate host-side division.
    approx_div(x, y) = @fastmath x / y
    numer = rand(Float64, 10) .+ 0.1
    denom = rand(Float64, 10) .+ 0.1
    @test Array(map(approx_div, CuArray(numer), CuArray(denom))) ≈ numer ./ denom rtol = 1.0e-10
end

@testset "JuliaGPU/CUDA.jl#2111: min/max should return NaN" begin
for T in [Float32, Float64]
AT = CuArray{T}
Expand Down