From cb5d65b30e560eb020177bd5cc875455bd525eb6 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 18 Jan 2026 15:52:19 -0400
Subject: [PATCH 1/6] Sub-group shuffle
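
Adds `sub_group_shuffle` and `sub_group_shuffle_xor` wrappers for the
cl_khr_subgroup_shuffle builtins. The index-taking wrapper accepts a
1-based lane index and subtracts one before calling the 0-based OpenCL
builtin; the XOR variant passes its mask through unchanged. A rough
device-side sketch (illustrative only; it mirrors the lane-reversal
kernel added to the tests later in this series):

    function shfl_idx_kernel(d)
        i = get_sub_group_local_id()
        j = get_sub_group_size() - i + 1  # opposite lane, 1-based
        d[i] = sub_group_shuffle(d[i], j)
        return
    end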

Co-Authored-By: Simeon David Schaub
---
 lib/intrinsics/src/SPIRVIntrinsics.jl |  1 +
 lib/intrinsics/src/shuffle.jl         | 11 +++++++++++
 2 files changed, 12 insertions(+)
 create mode 100644 lib/intrinsics/src/shuffle.jl

diff --git a/lib/intrinsics/src/SPIRVIntrinsics.jl b/lib/intrinsics/src/SPIRVIntrinsics.jl
index bd15fdd9..b2bca59d 100644
--- a/lib/intrinsics/src/SPIRVIntrinsics.jl
+++ b/lib/intrinsics/src/SPIRVIntrinsics.jl
@@ -23,6 +23,7 @@ include("printf.jl")
 include("math.jl")
 include("integer.jl")
 include("atomic.jl")
+include("shuffle.jl")
 
 # helper macro to import all names from this package, even non-exported ones.
 macro import_all()
diff --git a/lib/intrinsics/src/shuffle.jl b/lib/intrinsics/src/shuffle.jl
new file mode 100644
index 00000000..4f8a0aa2
--- /dev/null
+++ b/lib/intrinsics/src/shuffle.jl
@@ -0,0 +1,11 @@
+export sub_group_shuffle, sub_group_shuffle_xor
+
+const gentypes = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Float16, Float32, Float64]
+
+for gentype in gentypes
+    @eval begin
+        # cl_khr_subgroup_shuffle extension operations
+        @device_function sub_group_shuffle(x::$gentype, i::Integer) = @builtin_ccall("sub_group_shuffle", $gentype, ($gentype, Int32), x, i % Int32 - 1i32)
+        @device_function sub_group_shuffle_xor(x::$gentype, mask::Integer) = @builtin_ccall("sub_group_shuffle_xor", $gentype, ($gentype, Int32), x, mask % Int32)
+    end
+end

From 48905ce547f1340fe028b69595e01f2d8f9246ee Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 18 Jan 2026 15:52:49 -0400
Subject: [PATCH 2/6] Sub-group barrier
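
Adds a sub-group-scoped counterpart to `work_group_barrier`, lowering
to a SPIR-V control barrier with Subgroup execution scope. A minimal
device-side sketch (illustrative only; fence flags as used elsewhere in
OpenCL.jl):

    # synchronize only the work-items of the calling sub-group
    sub_group_barrier(OpenCL.LOCAL_MEM_FENCE)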
---
 lib/intrinsics/Project.toml           | 2 +-
 lib/intrinsics/src/synchronization.jl | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/intrinsics/Project.toml b/lib/intrinsics/Project.toml
index 338bb329..5f2dafaa 100644
--- a/lib/intrinsics/Project.toml
+++ b/lib/intrinsics/Project.toml
@@ -1,7 +1,7 @@
 name = "SPIRVIntrinsics"
 uuid = "71d1d633-e7e8-4a92-83a1-de8814b09ba8"
 authors = ["Tim Besard "]
-version = "0.5.6"
+version = "0.5.7"
 
 [deps]
 ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
diff --git a/lib/intrinsics/src/synchronization.jl b/lib/intrinsics/src/synchronization.jl
index eab1349a..7d88b69f 100644
--- a/lib/intrinsics/src/synchronization.jl
+++ b/lib/intrinsics/src/synchronization.jl
@@ -149,10 +149,14 @@ write_mem_fence(flags) = atomic_work_item_fence(flags, memory_order_release, mem
 
 ## OpenCL execution barriers
 
-export barrier, work_group_barrier
+export barrier, work_group_barrier, sub_group_barrier
 
 @inline work_group_barrier(flags, scope = memory_scope_work_group) =
     control_barrier(Scope.Workgroup, cl_scope_to_spirv(scope),
                     MemorySemantics.SequentiallyConsistent | mem_fence_flags_to_semantics(flags))
 
+@inline sub_group_barrier(flags, scope = memory_scope_sub_group) =
+    control_barrier(Scope.Subgroup, cl_scope_to_spirv(scope),
+                    MemorySemantics.SequentiallyConsistent | mem_fence_flags_to_semantics(flags))
+
 barrier(flags) = work_group_barrier(flags)

From 1d3ec4f894d558eccbc74cee1394bcfc4f9fbf98 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 3 Jan 2026 15:35:34 -0400
Subject: [PATCH 3/6] Sub-group

---
 Project.toml                |  2 +-
 lib/cl/device.jl            | 51 ++++++++++++++++++++++++
 src/compiler/compilation.jl | 25 +++++++++++-
 test/intrinsics.jl          | 77 +++++++++++++++++++++++++++++++++++++
 4 files changed, 152 insertions(+), 3 deletions(-)

diff --git a/Project.toml b/Project.toml
index 6f3e5b85..8ea5539e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -38,7 +38,7 @@ Random = "1"
 Random123 = "1.7.1"
 RandomNumbers = "1.6.0"
 Reexport = "1"
-SPIRVIntrinsics = "0.5"
+SPIRVIntrinsics = "0.5.7"
 SPIRV_LLVM_Backend_jll = "20"
 SPIRV_Tools_jll = "2025.1"
 StaticArrays = "1"
diff --git a/lib/cl/device.jl b/lib/cl/device.jl
index dd1c4a26..64bb1f9c 100644
--- a/lib/cl/device.jl
+++ b/lib/cl/device.jl
@@ -139,6 +139,20 @@ end
         return tuple([Int(r) for r in result]...)
     end
 
+    # error handling inspired by rusticl
+    # https://gitlab.freedesktop.org/mesa/mesa/-/blob/c4385d6fb0938231114eb3023082cd33788b89b4/src/gallium/frontends/rusticl/api/device.rs#L314-320
+    if s == :sub_group_sizes
+        res_size = Ref{Csize_t}()
+        err = unchecked_clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, C_NULL, C_NULL, res_size)
+        if err == CL_SUCCESS && res_size[] > 1
+            result = Vector{Csize_t}(undef, res_size[] ÷ sizeof(Csize_t))
+            clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, sizeof(result), result, C_NULL)
+            return tuple([Int(r) for r in result]...)
+        else
+            return tuple(0, 1)
+        end
+    end
+
     if s == :max_image2d_shape
         width = Ref{Csize_t}()
        height = Ref{Csize_t}()
@@ -273,3 +287,40 @@ function cl_device_type(dtype::Symbol)
     end
     return cl_dtype
 end
+
+sub_groups_supported(d::Device) = "cl_khr_subgroups" in d.extensions || "cl_intel_subgroups" in d.extensions
+function sub_group_size(d::Device)
+    sub_groups_supported(d) || return 0
+    if "cl_amd_device_attribute_query" in d.extensions
+        scalar = Ref{cl_uint}()
+        clGetDeviceInfo(d, CL_DEVICE_WAVEFRONT_WIDTH_AMD, sizeof(cl_uint), scalar, C_NULL)
+        return Int(scalar[])
+    elseif "cl_nv_device_attribute_query" in d.extensions
+        scalar = Ref{cl_uint}()
+        clGetDeviceInfo(d, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), scalar, C_NULL)
+        return Int(scalar[])
+    else
+        sg_sizes = d.sub_group_sizes
+        return if length(sg_sizes) == 1
+            Int(only(sg_sizes))
+        elseif 32 in sg_sizes
+            32
+        elseif 64 in sg_sizes
+            64
+        elseif 16 in sg_sizes
+            16
+        else
+            Int(first(sg_sizes))
+        end
+    end
+end
+function sub_group_shuffle_supported_types(d::Device)
+    if "cl_khr_subgroup_shuffle" in d.extensions
+        res = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Float32]
+        "cl_khr_fp16" in d.extensions && push!(res, Float16)
+        "cl_khr_fp64" in d.extensions && push!(res, Float64)
+        res
+    else
+        DataType[]
+    end
+end
diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
index 03637b97..9c185037 100644
--- a/src/compiler/compilation.jl
+++ b/src/compiler/compilation.jl
@@ -1,6 +1,14 @@
 ## gpucompiler interface
 
-struct OpenCLCompilerParams <: AbstractCompilerParams end
+Base.@kwdef struct OpenCLCompilerParams <: AbstractCompilerParams
+    sub_group_size::Int # Some devices support multiple sizes. This is used to force one when needed.
+end
+function Base.hash(params::OpenCLCompilerParams, h::UInt)
+    h = hash(params.sub_group_size, h)
+
+    return h
+end
+
 const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams}
 const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget,OpenCLCompilerParams}
@@ -29,6 +37,12 @@ function GPUCompiler.finish_module!(@nospecialize(job::OpenCLCompilerJob),
            Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
            job, mod, entry)
 
+    # Set the subgroup size if supported
+    sg_size = job.config.params.sub_group_size
+    if sg_size >= 0
+        metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(sg_size))])
+    end
+
     # if this kernel uses our RNG, we should prime the shared state.
     # XXX: these transformations should really happen at the Julia IR level...
     if haskey(functions(mod), "julia.opencl.random_keys") && job.config.kernel
@@ -131,9 +145,16 @@
     supports_fp16 = "cl_khr_fp16" in dev.extensions
     supports_fp64 = "cl_khr_fp64" in dev.extensions
 
+    # Set to -1 if specifying a subgroup size is not supported
+    sub_group_size = if "cl_intel_required_subgroup_size" in dev.extensions
+        cl.sub_group_size(dev)
+    else
+        -1
+    end
+
     # create GPUCompiler objects
     target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, validate=true, kwargs...)
-    params = OpenCLCompilerParams()
+    params = OpenCLCompilerParams(; sub_group_size)
     CompilerConfig(target, params; kernel, name, always_inline)
 end
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index ebbf3aea..407faec2 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -166,6 +166,83 @@ end
     @test call_on_device(OpenCL.mad, x, y, z) ≈ x * y + z
 end
 
+if cl.sub_groups_supported(cl.device())
+
+struct SubgroupData
+    sub_group_size::UInt32
+    max_sub_group_size::UInt32
+    num_sub_groups::UInt32
+    sub_group_id::UInt32
+    sub_group_local_id::UInt32
+end
+function test_subgroup_kernel(results)
+    i = get_global_id(1)
+
+    if i <= length(results)
+        @inbounds results[i] = SubgroupData(
+            get_sub_group_size(),
+            get_max_sub_group_size(),
+            get_num_sub_groups(),
+            get_sub_group_id(),
+            get_sub_group_local_id()
+        )
+    end
+    return
+end
+
+@testset "Sub-groups" begin
+    sg_size = cl.sub_group_size(cl.device())
+
+    @testset "Indexing intrinsics" begin
+        # Test with small kernel
+        sg_n = 2
+        local_size = sg_size * sg_n
+        numworkgroups = 2
+        N = local_size * numworkgroups
+
+        results = CLVector{SubgroupData}(undef, N)
+        kernel = @opencl launch = false test_subgroup_kernel(results)
+
+        kernel(results; local_size, global_size=N)
+
+        host_results = Array(results)
+
+        # Verify results make sense
+        for (i, sg_data) in enumerate(host_results)
+            @test sg_data.sub_group_size == sg_size
+            @test sg_data.max_sub_group_size == sg_size
+            @test sg_data.num_sub_groups == sg_n
+
+            # Group ID should be 1-based
+            expected_sub_group = div(((i - 1) % local_size), sg_size) + 1
+            @test sg_data.sub_group_id == expected_sub_group
+
+            # Local ID should be 1-based within group
+            expected_sg_local = ((i - 1) % sg_size) + 1
+            @test sg_data.sub_group_local_id == expected_sg_local
+        end
+    end
+
+    @testset "shuffle idx" begin
+        function shfl_idx_kernel(d)
+            i = get_sub_group_local_id()
+            j = get_sub_group_size() - i + 1
+
+            d[i] = sub_group_shuffle(d[i], j)
+
+            return
+        end
+
+        @testset for T in cl.sub_group_shuffle_supported_types(cl.device())
+            a = rand(T, sg_size)
+            d_a = CLArray(a)
+            @opencl local_size = sg_size global_size = sg_size shfl_idx_kernel(d_a)
+            @test Array(d_a) == reverse(a)
+        end
+    end
+end
+end # if cl.sub_groups_supported(cl.device())
+
 @testset "SIMD - $N x $T" for N in simd_ns, T in float_types
     # codegen emits i48 here, which SPIR-V doesn't support
     # XXX: fix upstream?

From 922f426d996b61d4d16665f371d727c6ba1192c5 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 3 Jan 2026 15:35:43 -0400
Subject: [PATCH 4/6] Temp

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 8ea5539e..6f3e5b85 100644
--- a/Project.toml
+++ b/Project.toml
@@ -38,7 +38,7 @@ Random = "1"
 Random123 = "1.7.1"
 RandomNumbers = "1.6.0"
 Reexport = "1"
-SPIRVIntrinsics = "0.5.7"
+SPIRVIntrinsics = "0.5"
 SPIRV_LLVM_Backend_jll = "20"
 SPIRV_Tools_jll = "2025.1"
 StaticArrays = "1"

From 838ddcdbe4a7e10defa5e1b34046e919c0774461 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 5 Nov 2025 18:53:25 -0400
Subject: [PATCH 5/6] Add KernelIntrinsics support
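
Wires OpenCL.jl up to the KernelAbstractions.KernelIntrinsics API:
`KI.kernel_function` compiles through `clfunction`, and launching maps
`numworkgroups`/`workgroupsize` onto OpenCL's `global_size`/`local_size`.
A rough usage sketch (illustrative only; `memset_kernel` is a made-up
example, and `KI.get_global_id()` returns the 1-based NamedTuple defined
below):

    import KernelAbstractions.KernelIntrinsics as KI

    function memset_kernel(a, val)
        i = KI.get_global_id().x
        i <= length(a) && (@inbounds a[i] = val)
        return
    end

    backend = OpenCLBackend()
    a = CLArray{Float32}(undef, 64)
    tt = Tuple{map(typeof, map(x -> KI.argconvert(backend, x), (a, 1f0)))...}
    kern = KI.kernel_function(backend, memset_kernel, tt)
    kern(a, 1f0; numworkgroups = 4, workgroupsize = 16)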
---
 Project.toml         |  2 +-
 src/OpenCLKernels.jl | 68 +++++++++++++++++++++++++++++++-------------
 2 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/Project.toml b/Project.toml
index 6f3e5b85..65083819 100644
--- a/Project.toml
+++ b/Project.toml
@@ -28,7 +28,7 @@ SPIRVIntrinsics = {path = "lib/intrinsics"}
 Adapt = "4"
 GPUArrays = "11.2.1"
 GPUCompiler = "1.7.1"
-KernelAbstractions = "0.9.38"
+KernelAbstractions = "0.10"
 LLVM = "9.1"
 LinearAlgebra = "1"
 OpenCL_jll = "=2024.10.24"
diff --git a/src/OpenCLKernels.jl b/src/OpenCLKernels.jl
index e06102cf..626b7fa0 100644
--- a/src/OpenCLKernels.jl
+++ b/src/OpenCLKernels.jl
@@ -1,9 +1,10 @@
 module OpenCLKernels
 
 using ..OpenCL
-using ..OpenCL: @device_override, method_table
+using ..OpenCL: @device_override, method_table, kernel_convert, clfunction
 
 import KernelAbstractions as KA
+import KernelAbstractions.KernelIntrinsics as KI
 
 import StaticArrays
 
@@ -126,33 +127,62 @@ function (obj::KA.Kernel{OpenCLBackend})(args...; ndrange=nothing, workgroupsize
     return nothing
 end
 
+KI.argconvert(::OpenCLBackend, arg) = kernel_convert(arg)
+
+function KI.kernel_function(::OpenCLBackend, f::F, tt::TT=Tuple{}; name = nothing, kwargs...) where {F,TT}
+    kern = clfunction(f, tt; name, kwargs...)
+    KI.Kernel{OpenCLBackend, typeof(kern)}(OpenCLBackend(), kern)
+end
+
+function (obj::KI.Kernel{OpenCLBackend})(args...; numworkgroups = 1, workgroupsize = 1)
+    KI.check_launch_args(numworkgroups, workgroupsize)
+
+    local_size = (workgroupsize..., ntuple(_ -> 1, 3 - length(workgroupsize))...)
+
+    numworkgroups = (numworkgroups..., ntuple(_ -> 1, 3 - length(numworkgroups))...)
+    global_size = local_size .* numworkgroups
+
+    obj.kern(args...; local_size, global_size)
+    return nothing
+end
+
+
+function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:OpenCLBackend}; max_work_items::Int=typemax(Int))::Int
+    wginfo = cl.work_group_info(kernel.kern.fun, cl.device())
+    Int(min(wginfo.size, max_work_items))
+end
+function KI.max_work_group_size(::OpenCLBackend)::Int
+    Int(cl.device().max_work_group_size)
+end
+function KI.multiprocessor_count(::OpenCLBackend)::Int
+    Int(cl.device().max_compute_units)
+end
 
 ## Indexing Functions
+## COV_EXCL_START
 
-@device_override @inline function KA.__index_Local_Linear(ctx)
-    return get_local_id(1)
+@device_override @inline function KI.get_local_id()
+    return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3)))
 end
 
-@device_override @inline function KA.__index_Group_Linear(ctx)
-    return get_group_id(1)
+@device_override @inline function KI.get_group_id()
+    return (; x = Int(get_group_id(1)), y = Int(get_group_id(2)), z = Int(get_group_id(3)))
 end
 
-@device_override @inline function KA.__index_Global_Linear(ctx)
-    #return get_global_id(1) # JuliaGPU/OpenCL.jl#346
-    I = KA.__index_Global_Cartesian(ctx)
-    @inbounds LinearIndices(KA.__ndrange(ctx))[I]
+@device_override @inline function KI.get_global_id()
+    return (; x = Int(get_global_id(1)), y = Int(get_global_id(2)), z = Int(get_global_id(3)))
 end
 
-@device_override @inline function KA.__index_Local_Cartesian(ctx)
-    @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)]
+@device_override @inline function KI.get_local_size()
+    return (; x = Int(get_local_size(1)), y = Int(get_local_size(2)), z = Int(get_local_size(3)))
 end
 
-@device_override @inline function KA.__index_Group_Cartesian(ctx)
-    @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)]
+@device_override @inline function KI.get_num_groups()
+    return (; x = Int(get_num_groups(1)), y = Int(get_num_groups(2)), z = Int(get_num_groups(3)))
 end
 
-@device_override @inline function KA.__index_Global_Cartesian(ctx)
-    return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
+@device_override @inline function KI.get_global_size()
+    return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3)))
 end
 
 @device_override @inline function KA.__validindex(ctx)
@@ -167,7 +197,7 @@ end
 
 ## Shared and Scratch Memory
 
-@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
+@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
     ptr = OpenCL.emit_localmemory(T, Val(prod(Dims)))
     CLDeviceArray(Dims, ptr)
 end
@@ -179,14 +209,14 @@ end
 
 ## Synchronization and Printing
 
-@device_override @inline function KA.__synchronize()
+@device_override @inline function KI.barrier()
     work_group_barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
 end
 
-@device_override @inline function KA.__print(args...)
+@device_override @inline function KI._print(args...)
     OpenCL._print(args...)
 end
 
-
+## COV_EXCL_STOP
 
 ## Other

From 69ead4d91908282d6ad1f0c2df56432cce3af2c7 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Mon, 17 Nov 2025 22:02:15 -0400
Subject: [PATCH 6/6] Temp CI

---
 Project.toml      | 3 ++-
 test/Project.toml | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 65083819..e2ac2e64 100644
--- a/Project.toml
+++ b/Project.toml
@@ -23,12 +23,13 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 
 [sources]
 SPIRVIntrinsics = {path = "lib/intrinsics"}
+KernelAbstractions = {rev = "main", url = "https://github.com/JuliaGPU/KernelAbstractions.jl"}
 
 [compat]
 Adapt = "4"
 GPUArrays = "11.2.1"
 GPUCompiler = "1.7.1"
-KernelAbstractions = "0.10"
+KernelAbstractions = "0.9, 0.10"
 LLVM = "9.1"
 LinearAlgebra = "1"
 OpenCL_jll = "=2024.10.24"
diff --git a/test/Project.toml b/test/Project.toml
index 57ae7ff9..0f85866e 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -8,6 +8,7 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"