diff --git a/Project.toml b/Project.toml
index 6f3e5b85..828e36be 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "OpenCL"
 uuid = "08131aa3-fb12-5dee-8b74-c09406e224a2"
-version = "0.10.8"
+version = "0.10.9"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -38,7 +38,7 @@ Random = "1"
 Random123 = "1.7.1"
 RandomNumbers = "1.6.0"
 Reexport = "1"
-SPIRVIntrinsics = "0.5"
+SPIRVIntrinsics = "0.5.7"
 SPIRV_LLVM_Backend_jll = "20"
 SPIRV_Tools_jll = "2025.1"
 StaticArrays = "1"
diff --git a/lib/cl/device.jl b/lib/cl/device.jl
index dd1c4a26..64bb1f9c 100644
--- a/lib/cl/device.jl
+++ b/lib/cl/device.jl
@@ -139,6 +139,20 @@ end
         return tuple([Int(r) for r in result]...)
     end
 
+    # error handling inspired by rusticl:
+    # https://gitlab.freedesktop.org/mesa/mesa/-/blob/c4385d6fb0938231114eb3023082cd33788b89b4/src/gallium/frontends/rusticl/api/device.rs#L314-320
+    if s == :sub_group_sizes
+        res_size = Ref{Csize_t}()
+        err = unchecked_clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, C_NULL, C_NULL, res_size)
+        if err == CL_SUCCESS && res_size[] > 1
+            result = Vector{Csize_t}(undef, res_size[] ÷ sizeof(Csize_t))
+            clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, sizeof(result), result, C_NULL)
+            return tuple([Int(r) for r in result]...)
+        else
+            return tuple(0) # a single zero, matching rusticl's vec![0; 1]
+        end
+    end
+
     if s == :max_image2d_shape
         width = Ref{Csize_t}()
         height = Ref{Csize_t}()
@@ -273,3 +287,40 @@ function cl_device_type(dtype::Symbol)
     end
     return cl_dtype
 end
+
+sub_groups_supported(d::Device) = "cl_khr_subgroups" in d.extensions || "cl_intel_subgroups" in d.extensions
+function sub_group_size(d::Device)
+    sub_groups_supported(d) || return 0
+    if "cl_amd_device_attribute_query" in d.extensions
+        scalar = Ref{cl_uint}()
+        clGetDeviceInfo(d, CL_DEVICE_WAVEFRONT_WIDTH_AMD, sizeof(cl_uint), scalar, C_NULL)
+        return Int(scalar[])
+    elseif "cl_nv_device_attribute_query" in d.extensions
+        scalar = Ref{cl_uint}()
+        clGetDeviceInfo(d, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), scalar, C_NULL)
+        return Int(scalar[])
+    else
+        sg_sizes = d.sub_group_sizes
+        return if length(sg_sizes) == 1
+            Int(only(sg_sizes))
+        elseif 32 in sg_sizes
+            32
+        elseif 64 in sg_sizes
+            64
+        elseif 16 in sg_sizes
+            16
+        else
+            Int(first(sg_sizes))
+        end
+    end
+end
+function sub_group_shuffle_supported_types(d::Device)
+    if "cl_khr_subgroup_shuffle" in d.extensions
+        res = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Float32]
+        "cl_khr_fp16" in d.extensions && push!(res, Float16)
+        "cl_khr_fp64" in d.extensions && push!(res, Float64)
+        res
+    else
+        DataType[]
+    end
+end
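The device.jl hunks above introduce three host-side helpers (`sub_groups_supported`, `sub_group_size`, `sub_group_shuffle_supported_types`) plus a `sub_group_sizes` device property. A minimal sketch of how they compose from the REPL, assuming a working OpenCL driver (`pocl_jll` is one way to get one):

    using OpenCL, pocl_jll

    dev = cl.device()
    if cl.sub_groups_supported(dev)                      # cl_khr_subgroups or cl_intel_subgroups
        @show cl.sub_group_size(dev)                     # AMD wavefront width, NV warp size, or a pick from sub_group_sizes
        @show dev.sub_group_sizes                        # CL_DEVICE_SUB_GROUP_SIZES_INTEL, (0,) when unavailable
        @show cl.sub_group_shuffle_supported_types(dev)  # empty unless cl_khr_subgroup_shuffle
    end

When a device reports several sizes and no vendor query applies, `sub_group_size` prefers 32, then 64, then 16, before falling back to the first reported size.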
diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
index 03637b97..5f9ae484 100644
--- a/src/compiler/compilation.jl
+++ b/src/compiler/compilation.jl
@@ -1,6 +1,9 @@
 ## gpucompiler interface
 
-struct OpenCLCompilerParams <: AbstractCompilerParams end
+Base.@kwdef struct OpenCLCompilerParams <: AbstractCompilerParams
+    sub_group_size::Int # some devices support multiple sizes; this is used to force one when needed
+end
+
 const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams}
 const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget,OpenCLCompilerParams}
 
@@ -29,6 +32,12 @@ function GPUCompiler.finish_module!(@nospecialize(job::OpenCLCompilerJob),
                    Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
                    job, mod, entry)
 
+    # set the required sub-group size, if supported
+    sg_size = job.config.params.sub_group_size
+    if sg_size >= 0
+        metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(sg_size))])
+    end
+
     # if this kernel uses our RNG, we should prime the shared state.
     # XXX: these transformations should really happen at the Julia IR level...
     if haskey(functions(mod), "julia.opencl.random_keys") && job.config.kernel
@@ -131,9 +140,16 @@ end
     supports_fp16 = "cl_khr_fp16" in dev.extensions
     supports_fp64 = "cl_khr_fp64" in dev.extensions
 
+    # set to -1 if specifying a sub-group size is not supported
+    sub_group_size = if "cl_intel_required_subgroup_size" in dev.extensions
+        cl.sub_group_size(dev)
+    else
+        -1
+    end
+
     # create GPUCompiler objects
     target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, validate=true, kwargs...)
-    params = OpenCLCompilerParams()
+    params = OpenCLCompilerParams(; sub_group_size)
     CompilerConfig(target, params; kernel, name, always_inline)
 end
 
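The compiler changes thread the chosen width through `OpenCLCompilerParams` and stamp it onto the kernel entry as `intel_reqd_sub_group_size` function metadata, the same encoding clang uses for OpenCL C's `__attribute__((intel_reqd_sub_group_size(N)))`, which the SPIR-V backend turns into a required sub-group size for the kernel. A sketch of the plumbing (the width 8 is illustrative only; the code above derives the value from the device, and -1 leaves the kernel unconstrained):

    # illustrative: force an 8-wide sub-group on a device that exposes
    # the cl_intel_required_subgroup_size extension
    target = SPIRVCompilerTarget(; supports_fp16=true, supports_fp64=true, validate=true)
    params = OpenCLCompilerParams(; sub_group_size=8)
    config = CompilerConfig(target, params; kernel=true)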
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index ebbf3aea..dbef9fd2 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -20,6 +20,7 @@
 const simd_ns = (Sys.iswindows() && ispocl) ? [3, 4] : [2, 3, 4, 8, 16]
 
 @testset "barrier" begin
+# work-group
 @on_device barrier(OpenCL.LOCAL_MEM_FENCE)
 @on_device barrier(OpenCL.GLOBAL_MEM_FENCE)
 @on_device barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
@@ -38,6 +39,10 @@ const simd_ns = (Sys.iswindows() && ispocl) ? [3, 4] : [2, 3, 4, 8, 16]
 cl.memory_backend() isa cl.SVMBackend &&
     @on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_all_svm_devices)
 @on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_sub_group)
+# sub-group
+@on_device sub_group_barrier(OpenCL.LOCAL_MEM_FENCE)
+@on_device sub_group_barrier(OpenCL.GLOBAL_MEM_FENCE)
+@on_device sub_group_barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
 end
 
 @testset "mem_fence" begin
@@ -166,6 +171,103 @@ end
     @test call_on_device(OpenCL.mad, x, y, z) ≈ x * y + z
 end
 
+if cl.sub_groups_supported(cl.device())
+
+struct SubgroupData
+    sub_group_size::UInt32
+    max_sub_group_size::UInt32
+    num_sub_groups::UInt32
+    sub_group_id::UInt32
+    sub_group_local_id::UInt32
+end
+function test_subgroup_kernel(results)
+    i = get_global_id(1)
+
+    if i <= length(results)
+        @inbounds results[i] = SubgroupData(
+            get_sub_group_size(),
+            get_max_sub_group_size(),
+            get_num_sub_groups(),
+            get_sub_group_id(),
+            get_sub_group_local_id()
+        )
+    end
+    return
+end
+
+@testset "Sub-groups" begin
+    sg_size = cl.sub_group_size(cl.device())
+
+    @testset "Indexing intrinsics" begin
+        # test with a small kernel
+        sg_n = 2
+        local_size = sg_size * sg_n
+        numworkgroups = 2
+        N = local_size * numworkgroups
+
+        results = CLVector{SubgroupData}(undef, N)
+        kernel = @opencl launch = false test_subgroup_kernel(results)
+
+        kernel(results; local_size, global_size=N)
+
+        host_results = Array(results)
+
+        # verify that the results make sense
+        for (i, sg_data) in enumerate(host_results)
+            @test sg_data.sub_group_size == sg_size
+            @test sg_data.max_sub_group_size == sg_size
+            @test sg_data.num_sub_groups == sg_n
+
+            # group IDs should be 1-based
+            expected_sub_group = div(((i - 1) % local_size), sg_size) + 1
+            @test sg_data.sub_group_id == expected_sub_group
+
+            # local IDs should be 1-based within the group
+            expected_sg_local = ((i - 1) % sg_size) + 1
+            @test sg_data.sub_group_local_id == expected_sg_local
+        end
+    end
+
+    @testset "shuffle idx" begin
+        function shfl_idx_kernel(d)
+            i = get_sub_group_local_id()
+            j = get_sub_group_size() - i + 0x1
+
+            d[i] = sub_group_shuffle(d[i], j)
+
+            return
+        end
+
+        @testset for T in cl.sub_group_shuffle_supported_types(cl.device())
+            a = rand(T, sg_size)
+            d_a = CLArray(a)
+            @opencl local_size = sg_size global_size = sg_size shfl_idx_kernel(d_a)
+            @test Array(d_a) == reverse(a)
+        end
+    end
+    @testset "shuffle xor" begin
+        function shfl_xor_kernel(in)
+            i = get_sub_group_local_id()
+
+            # val = in[i]
+            new_val = sub_group_shuffle_xor(in[i], 1)
+
+            in[i] = new_val
+            return
+        end
+
+        # tests that each pair of values gets swapped using sub_group_shuffle_xor
+        @testset for T in cl.sub_group_shuffle_supported_types(cl.device())
+            in = rand(T, sg_size)
+            idxs = xor.(0:(sg_size - 1), 1) .+ 1
+            d_in = CLArray(in)
+            @opencl local_size = sg_size global_size = sg_size shfl_xor_kernel(d_in)
+            @test Array(d_in) == in[idxs]
+        end
+    end
+end
+end # if cl.sub_groups_supported(cl.device())
+
 @testset "SIMD - $N x $T" for N in simd_ns, T in float_types
     # codegen emits i48 here, which SPIR-V doesn't support
     # XXX: fix upstream?
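A worked check of the permutation the "shuffle xor" test expects: `xor`ing each zero-based lane id with 1 swaps adjacent lanes, so for an illustrative sub-group of 8 the 1-based index map is

    julia> sg_size = 8;

    julia> xor.(0:(sg_size - 1), 1) .+ 1
    8-element Vector{Int64}:
     2
     1
     4
     3
     6
     5
     8
     7

i.e. lanes (1,2), (3,4), (5,6) and (7,8) trade values, which is exactly what `sub_group_shuffle_xor(in[i], 1)` computes on-device.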