4 changes: 2 additions & 2 deletions Project.toml
@@ -1,6 +1,6 @@
name = "OpenCL"
uuid = "08131aa3-fb12-5dee-8b74-c09406e224a2"
version = "0.10.8"
version = "0.10.9"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -38,7 +38,7 @@ Random = "1"
Random123 = "1.7.1"
RandomNumbers = "1.6.0"
Reexport = "1"
SPIRVIntrinsics = "0.5"
SPIRVIntrinsics = "0.5.7"
SPIRV_LLVM_Backend_jll = "20"
SPIRV_Tools_jll = "2025.1"
StaticArrays = "1"
51 changes: 51 additions & 0 deletions lib/cl/device.jl
@@ -139,6 +139,20 @@ end
return tuple([Int(r) for r in result]...)
end

# error handling inspired by rusticl
# https://gitlab.freedesktop.org/mesa/mesa/-/blob/c4385d6fb0938231114eb3023082cd33788b89b4/src/gallium/frontends/rusticl/api/device.rs#L314-320
if s == :sub_group_sizes
res_size = Ref{Csize_t}()
err = unchecked_clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, C_NULL, C_NULL, res_size)
if err == CL_SUCCESS && res_size[] > 1
result = Vector{Csize_t}(undef, res_size[] ÷ sizeof(Csize_t))
clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, sizeof(result), result, C_NULL)
return tuple([Int(r) for r in result]...)
else
return tuple(0, 1)
end
end

if s == :max_image2d_shape
width = Ref{Csize_t}()
height = Ref{Csize_t}()
@@ -273,3 +287,40 @@ function cl_device_type(dtype::Symbol)
end
return cl_dtype
end

sub_groups_supported(d::Device) = "cl_khr_subgroups" in d.extensions || "cl_intel_subgroups" in d.extensions
function sub_group_size(d::Device)
sub_groups_supported(d) || return 0
if "cl_amd_device_attribute_query" in d.extensions
scalar = Ref{cl_uint}()
clGetDeviceInfo(d, CL_DEVICE_WAVEFRONT_WIDTH_AMD, sizeof(cl_uint), scalar, C_NULL)
return Int(scalar[])
elseif "cl_nv_device_attribute_query" in d.extensions
scalar = Ref{cl_uint}()
clGetDeviceInfo(d, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), scalar, C_NULL)
return Int(scalar[])
else
sg_sizes = d.sub_group_sizes
return if length(sg_sizes) == 1
Int(only(sg_sizes))
elseif 32 in sg_sizes
32
elseif 64 in sg_sizes
64
elseif 16 in sg_sizes
16
else
Int(first(sg_sizes))
end
end
end
function sub_group_shuffle_supported_types(d::Device)
if "cl_khr_subgroup_shuffle" in d.extensions
res = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Float32]
"cl_khr_fp16" in d.extensions && push!(res, Float16)
"cl_khr_fp64" in d.extensions && push!(res, Float64)
res
else
DataType[]
end
end
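Taken together, these helpers let user code probe sub-group capabilities before launching a kernel. A minimal sketch, assuming the usual `cl.device()` entry point from OpenCL.jl (as used in the tests below):

```julia
using OpenCL

dev = cl.device()
if cl.sub_groups_supported(dev)
    # Preferred width: wavefront size on AMD, warp size on NVIDIA,
    # otherwise chosen from dev.sub_group_sizes as above.
    sg = cl.sub_group_size(dev)

    # Element types accepted by sub_group_shuffle on this device.
    Ts = cl.sub_group_shuffle_supported_types(dev)
    @info "sub-group support" size = sg shuffle_types = Ts
end
```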
20 changes: 18 additions & 2 deletions src/compiler/compilation.jl
@@ -1,6 +1,9 @@
## gpucompiler interface

struct OpenCLCompilerParams <: AbstractCompilerParams end
Base.@kwdef struct OpenCLCompilerParams <: AbstractCompilerParams
sub_group_size::Int # some devices support multiple sizes; this forces a specific one when needed
end

const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams}
const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget,OpenCLCompilerParams}

@@ -29,6 +32,12 @@ function GPUCompiler.finish_module!(@nospecialize(job::OpenCLCompilerJob),
Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
job, mod, entry)

# Set the subgroup size if supported
sg_size = job.config.params.sub_group_size
if sg_size >= 0
metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(sg_size))])
end

# if this kernel uses our RNG, we should prime the shared state.
# XXX: these transformations should really happen at the Julia IR level...
if haskey(functions(mod), "julia.opencl.random_keys") && job.config.kernel
@@ -131,9 +140,16 @@ end
supports_fp16 = "cl_khr_fp16" in dev.extensions
supports_fp64 = "cl_khr_fp64" in dev.extensions

# Set to -1 if specifying a subgroup size is not supported
sub_group_size = if "cl_intel_required_subgroup_size" in dev.extensions
cl.sub_group_size(dev)
else
-1
end

# create GPUCompiler objects
target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, validate=true, kwargs...)
params = OpenCLCompilerParams()
params = OpenCLCompilerParams(; sub_group_size)
CompilerConfig(target, params; kernel, name, always_inline)
end

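The `intel_reqd_sub_group_size` metadata set in `finish_module!` is the module-level counterpart of the `__attribute__((intel_reqd_sub_group_size(N)))` kernel attribute from the `cl_intel_required_subgroup_size` extension, which is why a size is only requested when that extension is present. A sketch of the host-side selection logic mirroring the hunk above (`requested_sub_group_size` is a hypothetical helper for illustration, not part of the package; assumes a `dev::cl.Device` in scope):

```julia
# -1 means "leave the sub-group size to the driver"; any other value is
# attached to the kernel entry point as intel_reqd_sub_group_size metadata.
function requested_sub_group_size(dev)
    if "cl_intel_required_subgroup_size" in dev.extensions
        cl.sub_group_size(dev)
    else
        -1
    end
end
```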
102 changes: 102 additions & 0 deletions test/intrinsics.jl
@@ -20,6 +20,7 @@ const simd_ns = (Sys.iswindows() && ispocl) ? [3, 4] : [2, 3, 4, 8, 16]

@testset "barrier" begin

# work-group
@on_device barrier(OpenCL.LOCAL_MEM_FENCE)
@on_device barrier(OpenCL.GLOBAL_MEM_FENCE)
@on_device barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
@@ -38,6 +39,10 @@ const simd_ns = (Sys.iswindows() && ispocl) ? [3, 4] : [2, 3, 4, 8, 16]
cl.memory_backend() isa cl.SVMBackend && @on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_all_svm_devices)
@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_sub_group)

# sub-group
@on_device sub_group_barrier(OpenCL.LOCAL_MEM_FENCE)
@on_device sub_group_barrier(OpenCL.GLOBAL_MEM_FENCE)
@on_device sub_group_barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
end

@testset "mem_fence" begin
@@ -166,6 +171,103 @@ end
@test call_on_device(OpenCL.mad, x, y, z) ≈ x * y + z
end

if cl.sub_groups_supported(cl.device())

struct SubgroupData
sub_group_size::UInt32
max_sub_group_size::UInt32
num_sub_groups::UInt32
sub_group_id::UInt32
sub_group_local_id::UInt32
end
function test_subgroup_kernel(results)
i = get_global_id(1)

if i <= length(results)
@inbounds results[i] = SubgroupData(
get_sub_group_size(),
get_max_sub_group_size(),
get_num_sub_groups(),
get_sub_group_id(),
get_sub_group_local_id()
)
end
return
end

@testset "Sub-groups" begin
sg_size = cl.sub_group_size(cl.device())

@testset "Indexing intrinsics" begin
# Test with small kernel
sg_n = 2
local_size = sg_size * sg_n
numworkgroups = 2
N = local_size * numworkgroups

results = CLVector{SubgroupData}(undef, N)
kernel = @opencl launch = false test_subgroup_kernel(results)

kernel(results; local_size, global_size=N)

host_results = Array(results)

# Verify results make sense
for (i, sg_data) in enumerate(host_results)
@test sg_data.sub_group_size == sg_size
@test sg_data.max_sub_group_size == sg_size
@test sg_data.num_sub_groups == sg_n

# Group ID should be 1-based
expected_sub_group = div(((i - 1) % local_size), sg_size) + 1
@test sg_data.sub_group_id == expected_sub_group

# Local ID should be 1-based within group
expected_sg_local = ((i - 1) % sg_size) + 1
@test sg_data.sub_group_local_id == expected_sg_local
end
end

@testset "shuffle idx" begin
function shfl_idx_kernel(d)
i = get_sub_group_local_id()
j = get_sub_group_size() - i + 0x1

d[i] = sub_group_shuffle(d[i], j)

return
end

@testset for T in cl.sub_group_shuffle_supported_types(cl.device())
a = rand(T, sg_size)
d_a = CLArray(a)
@opencl local_size = sg_size global_size = sg_size shfl_idx_kernel(d_a)
@test Array(d_a) == reverse(a)
end
end
@testset "shuffle xor" begin
function shfl_xor_kernel(in)
i = get_sub_group_local_id()

new_val = sub_group_shuffle_xor(in[i], 1)

in[i] = new_val
return
end

# test that adjacent pairs of values get swapped by sub_group_shuffle_xor
@testset for T in cl.sub_group_shuffle_supported_types(cl.device())
in = rand(T, sg_size)
idxs = xor.(0:(sg_size - 1), 1) .+ 1
d_in = CLArray(in)
@opencl local_size = sg_size global_size = sg_size shfl_xor_kernel(d_in)
@test Array(d_in) == in[idxs]
end
end
end
end # if cl.sub_groups_supported(cl.device())
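The xor test relies on `sub_group_shuffle_xor(x, 1)` exchanging values between adjacent lanes: each lane reads from the lane whose 0-based index differs in the lowest bit, so pairs (1,2), (3,4), … swap. The expected permutation can be checked on the host without a device:

```julia
sg_size = 8                           # any even sub-group width
idxs = xor.(0:(sg_size - 1), 1) .+ 1  # 1-based partner of each lane
@assert idxs == [2, 1, 4, 3, 6, 5, 8, 7]
@assert idxs[idxs] == 1:sg_size       # swapping twice is the identity
```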

@testset "SIMD - $N x $T" for N in simd_ns, T in float_types
# codegen emits i48 here, which SPIR-V doesn't support
# XXX: fix upstream?