From cb5d65b30e560eb020177bd5cc875455bd525eb6 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 18 Jan 2026 15:52:19 -0400
Subject: [PATCH 1/6] Sub-group shuffle
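
Adds `sub_group_shuffle` and `sub_group_shuffle_xor` wrappers for the
cl_khr_subgroup_shuffle builtins. The index-taking wrapper accepts a
1-based lane index and subtracts one before calling the 0-based OpenCL
builtin; the XOR variant passes its mask through unchanged. A rough
device-side sketch (illustrative only; it mirrors the lane-reversal
kernel added to the tests later in this series):

    function shfl_idx_kernel(d)
        i = get_sub_group_local_id()
        j = get_sub_group_size() - i + 1  # opposite lane, 1-based
        d[i] = sub_group_shuffle(d[i], j)
        return
    end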

Co-Authored-By: Simeon David Schaub
---
 lib/intrinsics/src/SPIRVIntrinsics.jl |  1 +
 lib/intrinsics/src/shuffle.jl         | 11 +++++++++++
 2 files changed, 12 insertions(+)
 create mode 100644 lib/intrinsics/src/shuffle.jl

diff --git a/lib/intrinsics/src/SPIRVIntrinsics.jl b/lib/intrinsics/src/SPIRVIntrinsics.jl
index bd15fdd9..b2bca59d 100644
--- a/lib/intrinsics/src/SPIRVIntrinsics.jl
+++ b/lib/intrinsics/src/SPIRVIntrinsics.jl
@@ -23,6 +23,7 @@ include("printf.jl")
 include("math.jl")
 include("integer.jl")
 include("atomic.jl")
+include("shuffle.jl")
 
 # helper macro to import all names from this package, even non-exported ones.
 macro import_all()
diff --git a/lib/intrinsics/src/shuffle.jl b/lib/intrinsics/src/shuffle.jl
new file mode 100644
index 00000000..4f8a0aa2
--- /dev/null
+++ b/lib/intrinsics/src/shuffle.jl
@@ -0,0 +1,11 @@
+export sub_group_shuffle, sub_group_shuffle_xor
+
+const gentypes = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Float16, Float32, Float64]
+
+for gentype in gentypes
+    @eval begin
+        # cl_khr_subgroup_shuffle extension operations
+        @device_function sub_group_shuffle(x::$gentype, i::Integer) = @builtin_ccall("sub_group_shuffle", $gentype, ($gentype, Int32), x, i % Int32 - 1i32)
+        @device_function sub_group_shuffle_xor(x::$gentype, mask::Integer) = @builtin_ccall("sub_group_shuffle_xor", $gentype, ($gentype, Int32), x, mask % Int32)
+    end
+end

From 48905ce547f1340fe028b69595e01f2d8f9246ee Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 18 Jan 2026 15:52:49 -0400
Subject: [PATCH 2/6] Sub-group barrier
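
Adds a sub-group-scoped counterpart to `work_group_barrier`, lowering
to a SPIR-V control barrier with Subgroup execution scope. A minimal
device-side sketch (illustrative only; fence flags as used elsewhere in
OpenCL.jl):

    # synchronize only the work-items of the calling sub-group
    sub_group_barrier(OpenCL.LOCAL_MEM_FENCE)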
---
 lib/intrinsics/Project.toml           | 2 +-
 lib/intrinsics/src/synchronization.jl | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/intrinsics/Project.toml b/lib/intrinsics/Project.toml
index 338bb329..5f2dafaa 100644
--- a/lib/intrinsics/Project.toml
+++ b/lib/intrinsics/Project.toml
@@ -1,7 +1,7 @@
 name = "SPIRVIntrinsics"
 uuid = "71d1d633-e7e8-4a92-83a1-de8814b09ba8"
 authors = ["Tim Besard "]
-version = "0.5.6"
+version = "0.5.7"
 
 [deps]
 ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
diff --git a/lib/intrinsics/src/synchronization.jl b/lib/intrinsics/src/synchronization.jl
index eab1349a..7d88b69f 100644
--- a/lib/intrinsics/src/synchronization.jl
+++ b/lib/intrinsics/src/synchronization.jl
@@ -149,10 +149,14 @@ write_mem_fence(flags) = atomic_work_item_fence(flags, memory_order_release, mem
 
 ## OpenCL execution barriers
 
-export barrier, work_group_barrier
+export barrier, work_group_barrier, sub_group_barrier
 
 @inline work_group_barrier(flags, scope = memory_scope_work_group) =
     control_barrier(Scope.Workgroup, cl_scope_to_spirv(scope),
                     MemorySemantics.SequentiallyConsistent | mem_fence_flags_to_semantics(flags))
 
+@inline sub_group_barrier(flags, scope = memory_scope_sub_group) =
+    control_barrier(Scope.Subgroup, cl_scope_to_spirv(scope),
+                    MemorySemantics.SequentiallyConsistent | mem_fence_flags_to_semantics(flags))
+
 barrier(flags) = work_group_barrier(flags)

From 1d3ec4f894d558eccbc74cee1394bcfc4f9fbf98 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 3 Jan 2026 15:35:34 -0400
Subject: [PATCH 3/6] Sub-group

---
 Project.toml                |  2 +-
 lib/cl/device.jl            | 51 ++++++++++++++++++++++++
 src/compiler/compilation.jl | 25 +++++++++++-
 test/intrinsics.jl          | 77 +++++++++++++++++++++++++++++++++++++
 4 files changed, 152 insertions(+), 3 deletions(-)

diff --git a/Project.toml b/Project.toml
index 6f3e5b85..8ea5539e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -38,7 +38,7 @@ Random = "1"
 Random123 = "1.7.1"
 RandomNumbers = "1.6.0"
 Reexport = "1"
-SPIRVIntrinsics = "0.5"
+SPIRVIntrinsics = "0.5.7"
 SPIRV_LLVM_Backend_jll = "20"
 SPIRV_Tools_jll = "2025.1"
 StaticArrays = "1"
diff --git a/lib/cl/device.jl b/lib/cl/device.jl
index dd1c4a26..64bb1f9c 100644
--- a/lib/cl/device.jl
+++ b/lib/cl/device.jl
@@ -139,6 +139,20 @@ end
         return tuple([Int(r) for r in result]...)
     end
 
+    # error handling inspired by rusticl
+    # https://gitlab.freedesktop.org/mesa/mesa/-/blob/c4385d6fb0938231114eb3023082cd33788b89b4/src/gallium/frontends/rusticl/api/device.rs#L314-320
+    if s == :sub_group_sizes
+        res_size = Ref{Csize_t}()
+        err = unchecked_clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, C_NULL, C_NULL, res_size)
+        if err == CL_SUCCESS && res_size[] > 1
+            result = Vector{Csize_t}(undef, res_size[] ÷ sizeof(Csize_t))
+            clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, sizeof(result), result, C_NULL)
+            return tuple([Int(r) for r in result]...)
+        else
+            return tuple(0, 1)
+        end
+    end
+
     if s == :max_image2d_shape
         width = Ref{Csize_t}()
        height = Ref{Csize_t}()
@@ -273,3 +287,40 @@ function cl_device_type(dtype::Symbol)
     end
     return cl_dtype
 end
+
+sub_groups_supported(d::Device) = "cl_khr_subgroups" in d.extensions || "cl_intel_subgroups" in d.extensions
+function sub_group_size(d::Device)
+    sub_groups_supported(d) || return 0
+    if "cl_amd_device_attribute_query" in d.extensions
+        scalar = Ref{cl_uint}()
+        clGetDeviceInfo(d, CL_DEVICE_WAVEFRONT_WIDTH_AMD, sizeof(cl_uint), scalar, C_NULL)
+        return Int(scalar[])
+    elseif "cl_nv_device_attribute_query" in d.extensions
+        scalar = Ref{cl_uint}()
+        clGetDeviceInfo(d, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), scalar, C_NULL)
+        return Int(scalar[])
+    else
+        sg_sizes = d.sub_group_sizes
+        return if length(sg_sizes) == 1
+            Int(only(sg_sizes))
+        elseif 32 in sg_sizes
+            32
+        elseif 64 in sg_sizes
+            64
+        elseif 16 in sg_sizes
+            16
+        else
+            Int(first(sg_sizes))
+        end
+    end
+end
+function sub_group_shuffle_supported_types(d::Device)
+    if "cl_khr_subgroup_shuffle" in d.extensions
+        res = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Float32]
+        "cl_khr_fp16" in d.extensions && push!(res, Float16)
+        "cl_khr_fp64" in d.extensions && push!(res, Float64)
+        res
+    else
+        DataType[]
+    end
+end
diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
index 03637b97..9c185037 100644
--- a/src/compiler/compilation.jl
+++ b/src/compiler/compilation.jl
@@ -1,6 +1,14 @@
 ## gpucompiler interface
 
-struct OpenCLCompilerParams <: AbstractCompilerParams end
+Base.@kwdef struct OpenCLCompilerParams <: AbstractCompilerParams
+    sub_group_size::Int # Some devices support multiple sizes. This is used to force one when needed.
+end
+function Base.hash(params::OpenCLCompilerParams, h::UInt)
+    h = hash(params.sub_group_size, h)
+
+    return h
+end
+
 const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams}
 const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget,OpenCLCompilerParams}
@@ -29,6 +37,12 @@ function GPUCompiler.finish_module!(@nospecialize(job::OpenCLCompilerJob),
            Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
            job, mod, entry)
 
+    # Set the subgroup size if supported
+    sg_size = job.config.params.sub_group_size
+    if sg_size >= 0
+        metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(sg_size))])
+    end
+
     # if this kernel uses our RNG, we should prime the shared state.
     # XXX: these transformations should really happen at the Julia IR level...
     if haskey(functions(mod), "julia.opencl.random_keys") && job.config.kernel
@@ -131,9 +145,16 @@
     supports_fp16 = "cl_khr_fp16" in dev.extensions
     supports_fp64 = "cl_khr_fp64" in dev.extensions
 
+    # Set to -1 if specifying a subgroup size is not supported
+    sub_group_size = if "cl_intel_required_subgroup_size" in dev.extensions
+        cl.sub_group_size(dev)
+    else
+        -1
+    end
+
     # create GPUCompiler objects
     target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, validate=true, kwargs...)
-    params = OpenCLCompilerParams()
+    params = OpenCLCompilerParams(; sub_group_size)
     CompilerConfig(target, params; kernel, name, always_inline)
 end
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index ebbf3aea..407faec2 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -166,6 +166,83 @@ end
     @test call_on_device(OpenCL.mad, x, y, z) ≈ x * y + z
 end
 
+if cl.sub_groups_supported(cl.device())
+
+struct SubgroupData
+    sub_group_size::UInt32
+    max_sub_group_size::UInt32
+    num_sub_groups::UInt32
+    sub_group_id::UInt32
+    sub_group_local_id::UInt32
+end
+function test_subgroup_kernel(results)
+    i = get_global_id(1)
+
+    if i <= length(results)
+        @inbounds results[i] = SubgroupData(
+            get_sub_group_size(),
+            get_max_sub_group_size(),
+            get_num_sub_groups(),
+            get_sub_group_id(),
+            get_sub_group_local_id()
+        )
+    end
+    return
+end
+
+@testset "Sub-groups" begin
+    sg_size = cl.sub_group_size(cl.device())
+
+    @testset "Indexing intrinsics" begin
+        # Test with small kernel
+        sg_n = 2
+        local_size = sg_size * sg_n
+        numworkgroups = 2
+        N = local_size * numworkgroups
+
+        results = CLVector{SubgroupData}(undef, N)
+        kernel = @opencl launch = false test_subgroup_kernel(results)
+
+        kernel(results; local_size, global_size=N)
+
+        host_results = Array(results)
+
+        # Verify results make sense
+        for (i, sg_data) in enumerate(host_results)
+            @test sg_data.sub_group_size == sg_size
+            @test sg_data.max_sub_group_size == sg_size
+            @test sg_data.num_sub_groups == sg_n
+
+            # Group ID should be 1-based
+            expected_sub_group = div(((i - 1) % local_size), sg_size) + 1
+            @test sg_data.sub_group_id == expected_sub_group
+
+            # Local ID should be 1-based within group
+            expected_sg_local = ((i - 1) % sg_size) + 1
+            @test sg_data.sub_group_local_id == expected_sg_local
+        end
+    end
+
+    @testset "shuffle idx" begin
+        function shfl_idx_kernel(d)
+            i = get_sub_group_local_id()
+            j = get_sub_group_size() - i + 1
+
+            d[i] = sub_group_shuffle(d[i], j)
+
+            return
+        end
+
+        @testset for T in cl.sub_group_shuffle_supported_types(cl.device())
+            a = rand(T, sg_size)
+            d_a = CLArray(a)
+            @opencl local_size = sg_size global_size = sg_size shfl_idx_kernel(d_a)
+            @test Array(d_a) == reverse(a)
+        end
+    end
+end
+end # if cl.sub_groups_supported(cl.device())
+
 @testset "SIMD - $N x $T" for N in simd_ns, T in float_types
     # codegen emits i48 here, which SPIR-V doesn't support
     # XXX: fix upstream?

From 922f426d996b61d4d16665f371d727c6ba1192c5 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 3 Jan 2026 15:35:43 -0400
Subject: [PATCH 4/6] Temp

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 8ea5539e..6f3e5b85 100644
--- a/Project.toml
+++ b/Project.toml
@@ -38,7 +38,7 @@ Random = "1"
 Random123 = "1.7.1"
 RandomNumbers = "1.6.0"
 Reexport = "1"
-SPIRVIntrinsics = "0.5.7"
+SPIRVIntrinsics = "0.5"
 SPIRV_LLVM_Backend_jll = "20"
 SPIRV_Tools_jll = "2025.1"
 StaticArrays = "1"

From 838ddcdbe4a7e10defa5e1b34046e919c0774461 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 5 Nov 2025 18:53:25 -0400
Subject: [PATCH 5/6] Add KernelIntrinsics support
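
Wires OpenCL.jl up to the KernelAbstractions.KernelIntrinsics API:
`KI.kernel_function` compiles through `clfunction`, and launching maps
`numworkgroups`/`workgroupsize` onto OpenCL's `global_size`/`local_size`.
A rough usage sketch (illustrative only; `memset_kernel` is a made-up
example, and `KI.get_global_id()` returns the 1-based NamedTuple defined
below):

    import KernelAbstractions.KernelIntrinsics as KI

    function memset_kernel(a, val)
        i = KI.get_global_id().x
        i <= length(a) && (@inbounds a[i] = val)
        return
    end

    backend = OpenCLBackend()
    a = CLArray{Float32}(undef, 64)
    tt = Tuple{map(typeof, map(x -> KI.argconvert(backend, x), (a, 1f0)))...}
    kern = KI.kernel_function(backend, memset_kernel, tt)
    kern(a, 1f0; numworkgroups = 4, workgroupsize = 16)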
---
 Project.toml         |  2 +-
 src/OpenCLKernels.jl | 68 +++++++++++++++++++++++++++++++-------------
 2 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/Project.toml b/Project.toml
index 6f3e5b85..65083819 100644
--- a/Project.toml
+++ b/Project.toml
@@ -28,7 +28,7 @@ SPIRVIntrinsics = {path = "lib/intrinsics"}
 Adapt = "4"
 GPUArrays = "11.2.1"
 GPUCompiler = "1.7.1"
-KernelAbstractions = "0.9.38"
+KernelAbstractions = "0.10"
 LLVM = "9.1"
 LinearAlgebra = "1"
 OpenCL_jll = "=2024.10.24"
diff --git a/src/OpenCLKernels.jl b/src/OpenCLKernels.jl
index e06102cf..626b7fa0 100644
--- a/src/OpenCLKernels.jl
+++ b/src/OpenCLKernels.jl
@@ -1,9 +1,10 @@
 module OpenCLKernels
 
 using ..OpenCL
-using ..OpenCL: @device_override, method_table
+using ..OpenCL: @device_override, method_table, kernel_convert, clfunction
 
 import KernelAbstractions as KA
+import KernelAbstractions.KernelIntrinsics as KI
 
 import StaticArrays
 
@@ -126,33 +127,62 @@ function (obj::KA.Kernel{OpenCLBackend})(args...; ndrange=nothing, workgroupsize
     return nothing
 end
 
+KI.argconvert(::OpenCLBackend, arg) = kernel_convert(arg)
+
+function KI.kernel_function(::OpenCLBackend, f::F, tt::TT=Tuple{}; name = nothing, kwargs...) where {F,TT}
+    kern = clfunction(f, tt; name, kwargs...)
+    KI.Kernel{OpenCLBackend, typeof(kern)}(OpenCLBackend(), kern)
+end
+
+function (obj::KI.Kernel{OpenCLBackend})(args...; numworkgroups = 1, workgroupsize = 1)
+    KI.check_launch_args(numworkgroups, workgroupsize)
+
+    local_size = (workgroupsize..., ntuple(_ -> 1, 3 - length(workgroupsize))...)
+
+    numworkgroups = (numworkgroups..., ntuple(_ -> 1, 3 - length(numworkgroups))...)
+    global_size = local_size .* numworkgroups
+
+    obj.kern(args...; local_size, global_size)
+    return nothing
+end
+
+
+function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:OpenCLBackend}; max_work_items::Int=typemax(Int))::Int
+    wginfo = cl.work_group_info(kernel.kern.fun, cl.device())
+    Int(min(wginfo.size, max_work_items))
+end
+function KI.max_work_group_size(::OpenCLBackend)::Int
+    Int(cl.device().max_work_group_size)
+end
+function KI.multiprocessor_count(::OpenCLBackend)::Int
+    Int(cl.device().max_compute_units)
+end
 
 ## Indexing Functions
+## COV_EXCL_START
 
-@device_override @inline function KA.__index_Local_Linear(ctx)
-    return get_local_id(1)
+@device_override @inline function KI.get_local_id()
+    return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3)))
 end
 
-@device_override @inline function KA.__index_Group_Linear(ctx)
-    return get_group_id(1)
+@device_override @inline function KI.get_group_id()
+    return (; x = Int(get_group_id(1)), y = Int(get_group_id(2)), z = Int(get_group_id(3)))
 end
 
-@device_override @inline function KA.__index_Global_Linear(ctx)
-    #return get_global_id(1) # JuliaGPU/OpenCL.jl#346
-    I = KA.__index_Global_Cartesian(ctx)
-    @inbounds LinearIndices(KA.__ndrange(ctx))[I]
+@device_override @inline function KI.get_global_id()
+    return (; x = Int(get_global_id(1)), y = Int(get_global_id(2)), z = Int(get_global_id(3)))
 end
 
-@device_override @inline function KA.__index_Local_Cartesian(ctx)
-    @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)]
+@device_override @inline function KI.get_local_size()
+    return (; x = Int(get_local_size(1)), y = Int(get_local_size(2)), z = Int(get_local_size(3)))
 end
 
-@device_override @inline function KA.__index_Group_Cartesian(ctx)
-    @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)]
+@device_override @inline function KI.get_num_groups()
+    return (; x = Int(get_num_groups(1)), y = Int(get_num_groups(2)), z = Int(get_num_groups(3)))
 end
 
-@device_override @inline function KA.__index_Global_Cartesian(ctx)
-    return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
+@device_override @inline function KI.get_global_size()
+    return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3)))
 end
 
 @device_override @inline function KA.__validindex(ctx)
@@ -167,7 +197,7 @@ end
 
 ## Shared and Scratch Memory
 
-@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
+@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
     ptr = OpenCL.emit_localmemory(T, Val(prod(Dims)))
     CLDeviceArray(Dims, ptr)
 end
@@ -179,14 +209,14 @@ end
 
 ## Synchronization and Printing
 
-@device_override @inline function KA.__synchronize()
+@device_override @inline function KI.barrier()
     work_group_barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
 end
 
-@device_override @inline function KA.__print(args...)
+@device_override @inline function KI._print(args...)
     OpenCL._print(args...)
 end
 
-
+## COV_EXCL_STOP
 
 ## Other

From 69ead4d91908282d6ad1f0c2df56432cce3af2c7 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Mon, 17 Nov 2025 22:02:15 -0400
Subject: [PATCH 6/6] Temp CI

---
 Project.toml      | 3 ++-
 test/Project.toml | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 65083819..e2ac2e64 100644
--- a/Project.toml
+++ b/Project.toml
@@ -23,12 +23,13 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 
 [sources]
 SPIRVIntrinsics = {path = "lib/intrinsics"}
+KernelAbstractions = {rev = "main", url = "https://github.com/JuliaGPU/KernelAbstractions.jl"}
 
 [compat]
 Adapt = "4"
 GPUArrays = "11.2.1"
 GPUCompiler = "1.7.1"
-KernelAbstractions = "0.10"
+KernelAbstractions = "0.9, 0.10"
 LLVM = "9.1"
 LinearAlgebra = "1"
 OpenCL_jll = "=2024.10.24"
diff --git a/test/Project.toml b/test/Project.toml
index 57ae7ff9..0f85866e 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -8,6 +8,7 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"