4 changes: 2 additions & 2 deletions Project.toml
@@ -1,6 +1,6 @@
name = "OpenCL"
uuid = "08131aa3-fb12-5dee-8b74-c09406e224a2"
version = "0.10.8"
version = "0.10.9"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -38,7 +38,7 @@ Random = "1"
Random123 = "1.7.1"
RandomNumbers = "1.6.0"
Reexport = "1"
SPIRVIntrinsics = "0.5"
SPIRVIntrinsics = "0.5.7"
SPIRV_LLVM_Backend_jll = "20"
SPIRV_Tools_jll = "2025.1"
StaticArrays = "1"
51 changes: 51 additions & 0 deletions lib/cl/device.jl
@@ -139,6 +139,20 @@ end
return tuple([Int(r) for r in result]...)
end

# error handling inspired by rusticl
# https://gitlab.freedesktop.org/mesa/mesa/-/blob/c4385d6fb0938231114eb3023082cd33788b89b4/src/gallium/frontends/rusticl/api/device.rs#L314-320
if s == :sub_group_sizes
res_size = Ref{Csize_t}()
err = unchecked_clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, C_NULL, C_NULL, res_size)
if err == CL_SUCCESS && res_size[] > 1
result = Vector{Csize_t}(undef, res_size[] ÷ sizeof(Csize_t))
clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, sizeof(result), result, C_NULL)
return tuple([Int(r) for r in result]...)
else
return tuple(0, 1)
end
end

if s == :max_image2d_shape
width = Ref{Csize_t}()
height = Ref{Csize_t}()
@@ -273,3 +287,40 @@ function cl_device_type(dtype::Symbol)
end
return cl_dtype
end

sub_groups_supported(d::Device) = "cl_khr_subgroups" in d.extensions || "cl_intel_subgroups" in d.extensions
function sub_group_size(d::Device)
sub_groups_supported(d) || return 0
if "cl_amd_device_attribute_query" in d.extensions
scalar = Ref{cl_uint}()
clGetDeviceInfo(d, CL_DEVICE_WAVEFRONT_WIDTH_AMD, sizeof(cl_uint), scalar, C_NULL)
return Int(scalar[])
elseif "cl_nv_device_attribute_query" in d.extensions
scalar = Ref{cl_uint}()
clGetDeviceInfo(d, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), scalar, C_NULL)
return Int(scalar[])
else
sg_sizes = d.sub_group_sizes
return if length(sg_sizes) == 1
Int(only(sg_sizes))
elseif 32 in sg_sizes
32
elseif 64 in sg_sizes
64
elseif 16 in sg_sizes
16
else
Int(first(sg_sizes))
end
end
end
function sub_group_shuffle_supported_types(d::Device)
if "cl_khr_subgroup_shuffle" in d.extensions
res = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Float32]
"cl_khr_fp16" in d.extensions && push!(res, Float16)
"cl_khr_fp64" in d.extensions && push!(res, Float64)
res
else
DataType[]
end
end
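Taken together, these helpers let user code probe sub-group capabilities before launching a kernel. A minimal sketch, assuming the usual `cl.device()` entry point from OpenCL.jl (as used in the tests below):

```julia
using OpenCL

dev = cl.device()
if cl.sub_groups_supported(dev)
    # Preferred width: wavefront size on AMD, warp size on NVIDIA,
    # otherwise chosen from dev.sub_group_sizes as above.
    sg = cl.sub_group_size(dev)

    # Element types accepted by sub_group_shuffle on this device.
    Ts = cl.sub_group_shuffle_supported_types(dev)
    @info "sub-group support" size = sg shuffle_types = Ts
end
```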
20 changes: 18 additions & 2 deletions src/compiler/compilation.jl
@@ -1,6 +1,9 @@
## gpucompiler interface

struct OpenCLCompilerParams <: AbstractCompilerParams end
Base.@kwdef struct OpenCLCompilerParams <: AbstractCompilerParams
sub_group_size::Int # some devices support multiple sizes; this forces a specific one when needed
end

const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams}
const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget,OpenCLCompilerParams}

@@ -29,6 +32,12 @@ function GPUCompiler.finish_module!(@nospecialize(job::OpenCLCompilerJob),
Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
job, mod, entry)

# Set the subgroup size if supported
sg_size = job.config.params.sub_group_size
if sg_size >= 0
metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(sg_size))])
end

# if this kernel uses our RNG, we should prime the shared state.
# XXX: these transformations should really happen at the Julia IR level...
if haskey(functions(mod), "julia.opencl.random_keys") && job.config.kernel
@@ -131,9 +140,16 @@ end
supports_fp16 = "cl_khr_fp16" in dev.extensions
supports_fp64 = "cl_khr_fp64" in dev.extensions

# Set to -1 if specifying a subgroup size is not supported
sub_group_size = if "cl_intel_required_subgroup_size" in dev.extensions
cl.sub_group_size(dev)
else
-1
end

# create GPUCompiler objects
target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, validate=true, kwargs...)
params = OpenCLCompilerParams()
params = OpenCLCompilerParams(; sub_group_size)
CompilerConfig(target, params; kernel, name, always_inline)
end

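The `intel_reqd_sub_group_size` metadata set in `finish_module!` is the module-level counterpart of the `__attribute__((intel_reqd_sub_group_size(N)))` kernel attribute from the `cl_intel_required_subgroup_size` extension, which is why a size is only requested when that extension is present. A sketch of the host-side selection logic mirroring the hunk above (`requested_sub_group_size` is a hypothetical helper for illustration, not part of the package; assumes a `dev::cl.Device` in scope):

```julia
# -1 means "leave the sub-group size to the driver"; any other value is
# attached to the kernel entry point as intel_reqd_sub_group_size metadata.
function requested_sub_group_size(dev)
    if "cl_intel_required_subgroup_size" in dev.extensions
        cl.sub_group_size(dev)
    else
        -1
    end
end
```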
102 changes: 102 additions & 0 deletions test/intrinsics.jl
@@ -20,6 +20,7 @@ const simd_ns = (Sys.iswindows() && ispocl) ? [3, 4] : [2, 3, 4, 8, 16]

@testset "barrier" begin

# work-group
@on_device barrier(OpenCL.LOCAL_MEM_FENCE)
@on_device barrier(OpenCL.GLOBAL_MEM_FENCE)
@on_device barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
@@ -38,6 +39,10 @@ const simd_ns = (Sys.iswindows() && ispocl) ? [3, 4] : [2, 3, 4, 8, 16]
cl.memory_backend() isa cl.SVMBackend && @on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_all_svm_devices)
@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_sub_group)

# sub-group
@on_device sub_group_barrier(OpenCL.LOCAL_MEM_FENCE)
@on_device sub_group_barrier(OpenCL.GLOBAL_MEM_FENCE)
@on_device sub_group_barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
end

@testset "mem_fence" begin
@@ -166,6 +171,103 @@ end
@test call_on_device(OpenCL.mad, x, y, z) ≈ x * y + z
end

if cl.sub_groups_supported(cl.device())

struct SubgroupData
sub_group_size::UInt32
max_sub_group_size::UInt32
num_sub_groups::UInt32
sub_group_id::UInt32
sub_group_local_id::UInt32
end
function test_subgroup_kernel(results)
i = get_global_id(1)

if i <= length(results)
@inbounds results[i] = SubgroupData(
get_sub_group_size(),
get_max_sub_group_size(),
get_num_sub_groups(),
get_sub_group_id(),
get_sub_group_local_id()
)
end
return
end

@testset "Sub-groups" begin
sg_size = cl.sub_group_size(cl.device())

@testset "Indexing intrinsics" begin
# Test with small kernel
sg_n = 2
local_size = sg_size * sg_n
numworkgroups = 2
N = local_size * numworkgroups

results = CLVector{SubgroupData}(undef, N)
kernel = @opencl launch = false test_subgroup_kernel(results)

kernel(results; local_size, global_size=N)

host_results = Array(results)

# Verify results make sense
for (i, sg_data) in enumerate(host_results)
@test sg_data.sub_group_size == sg_size
@test sg_data.max_sub_group_size == sg_size
@test sg_data.num_sub_groups == sg_n

# Group ID should be 1-based
expected_sub_group = div(((i - 1) % local_size), sg_size) + 1
@test sg_data.sub_group_id == expected_sub_group

# Local ID should be 1-based within group
expected_sg_local = ((i - 1) % sg_size) + 1
@test sg_data.sub_group_local_id == expected_sg_local
end
end

@testset "shuffle idx" begin
function shfl_idx_kernel(d)
i = get_sub_group_local_id()
j = get_sub_group_size() - i + 0x1

d[i] = sub_group_shuffle(d[i], j)

return
end

@testset for T in cl.sub_group_shuffle_supported_types(cl.device())
a = rand(T, sg_size)
d_a = CLArray(a)
@opencl local_size = sg_size global_size = sg_size shfl_idx_kernel(d_a)
@test Array(d_a) == reverse(a)
end
end
@testset "shuffle xor" begin
function shfl_xor_kernel(in)
i = get_sub_group_local_id()

new_val = sub_group_shuffle_xor(in[i], 1)

in[i] = new_val
return
end

# test that adjacent pairs of values get swapped by sub_group_shuffle_xor
@testset for T in cl.sub_group_shuffle_supported_types(cl.device())
in = rand(T, sg_size)
idxs = xor.(0:(sg_size - 1), 1) .+ 1
d_in = CLArray(in)
@opencl local_size = sg_size global_size = sg_size shfl_xor_kernel(d_in)
@test Array(d_in) == in[idxs]
end
end
end
end # if cl.sub_groups_supported(cl.device())
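The xor test relies on `sub_group_shuffle_xor(x, 1)` exchanging values between adjacent lanes: each lane reads from the lane whose 0-based index differs in the lowest bit, so pairs (1,2), (3,4), … swap. The expected permutation can be checked on the host without a device:

```julia
sg_size = 8                           # any even sub-group width
idxs = xor.(0:(sg_size - 1), 1) .+ 1  # 1-based partner of each lane
@assert idxs == [2, 1, 4, 3, 6, 5, 8, 7]
@assert idxs[idxs] == 1:sg_size       # swapping twice is the identity
```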

@testset "SIMD - $N x $T" for N in simd_ns, T in float_types
# codegen emits i48 here, which SPIR-V doesn't support
# XXX: fix upstream?