Skip to content

[OpenCL] Subgroup support#413

Merged
christiangnrd merged 1 commit intomasterfrom
clsubgroups
Feb 18, 2026
Merged

[OpenCL] Subgroup support#413
christiangnrd merged 1 commit intomasterfrom
clsubgroups

Conversation

@christiangnrd
Copy link
Member

@christiangnrd christiangnrd commented Jan 11, 2026

Depends on #418
Add initial support for subgroups and subgroup shuffles.

@github-actions
Copy link
Contributor

github-actions bot commented Jan 11, 2026

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic master) to apply these changes.

Click here to view the suggested changes.
diff --git a/lib/cl/device.jl b/lib/cl/device.jl
index 64bb1f9..cdfa4d5 100644
--- a/lib/cl/device.jl
+++ b/lib/cl/device.jl
@@ -315,7 +315,7 @@ function sub_group_size(d::Device)
     end
 end
 function sub_group_shuffle_supported_types(d::Device)
-    if "cl_khr_subgroup_shuffle" in d.extensions
+    return if "cl_khr_subgroup_shuffle" in d.extensions
         res = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Float32]
         "cl_khr_fp16" in d.extensions && push!(res, Float16)
         "cl_khr_fp64" in d.extensions && push!(res, Float64)
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index dbef9fd..a3c9b4d 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -20,7 +20,7 @@ const simd_ns = (Sys.iswindows() && ispocl) ? [3, 4] : [2, 3, 4, 8, 16]
 
 @testset "barrier" begin
 
-# work-group
+        # work-group
 @on_device barrier(OpenCL.LOCAL_MEM_FENCE)
 @on_device barrier(OpenCL.GLOBAL_MEM_FENCE)
 @on_device barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
@@ -39,10 +39,10 @@ const simd_ns = (Sys.iswindows() && ispocl) ? [3, 4] : [2, 3, 4, 8, 16]
 cl.memory_backend() isa cl.SVMBackend && @on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_all_svm_devices)
 @on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_sub_group)
 
-# sub-group
-@on_device sub_group_barrier(OpenCL.LOCAL_MEM_FENCE)
-@on_device sub_group_barrier(OpenCL.GLOBAL_MEM_FENCE)
-@on_device sub_group_barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
+        # sub-group
+        @on_device sub_group_barrier(OpenCL.LOCAL_MEM_FENCE)
+        @on_device sub_group_barrier(OpenCL.GLOBAL_MEM_FENCE)
+        @on_device sub_group_barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
 end
 
 @testset "mem_fence" begin
@@ -171,102 +171,102 @@ end
     @test call_on_device(OpenCL.mad, x, y, z) ≈ x * y + z
 end
 
-if cl.sub_groups_supported(cl.device())
-
-struct SubgroupData
-    sub_group_size::UInt32
-    max_sub_group_size::UInt32
-    num_sub_groups::UInt32
-    sub_group_id::UInt32
-    sub_group_local_id::UInt32
-end
-function test_subgroup_kernel(results)
-    i = get_global_id(1)
-
-    if i <= length(results)
-        @inbounds results[i] = SubgroupData(
-            get_sub_group_size(),
-            get_max_sub_group_size(),
-            get_num_sub_groups(),
-            get_sub_group_id(),
-            get_sub_group_local_id()
-        )
-    end
-    return
-end
-
-@testset "Sub-groups" begin
-    sg_size = cl.sub_group_size(cl.device())
-
-    @testset "Indexing intrinsics" begin
-        # Test with small kernel
-        sg_n = 2
-        local_size = sg_size * sg_n
-        numworkgroups = 2
-        N = local_size * numworkgroups
-
-        results = CLVector{SubgroupData}(undef, N)
-        kernel = @opencl launch = false test_subgroup_kernel(results)
-
-        kernel(results; local_size, global_size=N)
-
-        host_results = Array(results)
-
-        # Verify results make sense
-        for (i, sg_data) in enumerate(host_results)
-            @test sg_data.sub_group_size == sg_size
-            @test sg_data.max_sub_group_size == sg_size
-            @test sg_data.num_sub_groups == sg_n
-
-            # Group ID should be 1-based
-            expected_sub_group = div(((i - 1) % local_size), sg_size) + 1
-            @test sg_data.sub_group_id == expected_sub_group
-
-            # Local ID should be 1-based within group
-            expected_sg_local = ((i - 1) % sg_size) + 1
-            @test sg_data.sub_group_local_id == expected_sg_local
-        end
-    end
-
-    @testset "shuffle idx" begin
-        function shfl_idx_kernel(d)
-            i = get_sub_group_local_id()
-            j = get_sub_group_size() - i + 0x1
-
-            d[i] = sub_group_shuffle(d[i], j)
-
-            return
-        end
-
-        @testset for T in cl.sub_group_shuffle_supported_types(cl.device())
-            a = rand(T, sg_size)
-            d_a = CLArray(a)
-            @opencl local_size = sg_size global_size = sg_size shfl_idx_kernel(d_a)
-            @test Array(d_a) == reverse(a)
-        end
-    end
-    @testset "shuffle xor" begin
-        function shfl_xor_kernel(in)
-            i = get_sub_group_local_id()
-
-            # val = in[i]
-            new_val = sub_group_shuffle_xor(in[i], 1)
-
-            in[i] = new_val
-            return
-        end
-
-        # tests that each pair of values a get swapped using sub_group_shuffle_xor
-        @testset for T in cl.sub_group_shuffle_supported_types(cl.device())
-            in = rand(T, sg_size)
-            idxs = xor.(0:(sg_size - 1), 1) .+ 1
-            d_in = CLArray(in)
-            @opencl local_size = sg_size global_size = sg_size shfl_xor_kernel(d_in)
-            @test Array(d_in) == in[idxs]
-        end
-    end
-end
-end # if cl.sub_groups_supported(cl.device())
+        if cl.sub_groups_supported(cl.device())
+
+            struct SubgroupData
+                sub_group_size::UInt32
+                max_sub_group_size::UInt32
+                num_sub_groups::UInt32
+                sub_group_id::UInt32
+                sub_group_local_id::UInt32
+            end
+            function test_subgroup_kernel(results)
+                i = get_global_id(1)
+
+                if i <= length(results)
+                    @inbounds results[i] = SubgroupData(
+                        get_sub_group_size(),
+                        get_max_sub_group_size(),
+                        get_num_sub_groups(),
+                        get_sub_group_id(),
+                        get_sub_group_local_id()
+                    )
+                end
+                return
+            end
+
+            @testset "Sub-groups" begin
+                sg_size = cl.sub_group_size(cl.device())
+
+                @testset "Indexing intrinsics" begin
+                    # Test with small kernel
+                    sg_n = 2
+                    local_size = sg_size * sg_n
+                    numworkgroups = 2
+                    N = local_size * numworkgroups
+
+                    results = CLVector{SubgroupData}(undef, N)
+                    kernel = @opencl launch = false test_subgroup_kernel(results)
+
+                    kernel(results; local_size, global_size = N)
+
+                    host_results = Array(results)
+
+                    # Verify results make sense
+                    for (i, sg_data) in enumerate(host_results)
+                        @test sg_data.sub_group_size == sg_size
+                        @test sg_data.max_sub_group_size == sg_size
+                        @test sg_data.num_sub_groups == sg_n
+
+                        # Group ID should be 1-based
+                        expected_sub_group = div(((i - 1) % local_size), sg_size) + 1
+                        @test sg_data.sub_group_id == expected_sub_group
+
+                        # Local ID should be 1-based within group
+                        expected_sg_local = ((i - 1) % sg_size) + 1
+                        @test sg_data.sub_group_local_id == expected_sg_local
+                    end
+                end
+
+                @testset "shuffle idx" begin
+                    function shfl_idx_kernel(d)
+                        i = get_sub_group_local_id()
+                        j = get_sub_group_size() - i + 0x01
+
+                        d[i] = sub_group_shuffle(d[i], j)
+
+                        return
+                    end
+
+                    @testset for T in cl.sub_group_shuffle_supported_types(cl.device())
+                        a = rand(T, sg_size)
+                        d_a = CLArray(a)
+                        @opencl local_size = sg_size global_size = sg_size shfl_idx_kernel(d_a)
+                        @test Array(d_a) == reverse(a)
+                    end
+                end
+                @testset "shuffle xor" begin
+                    function shfl_xor_kernel(in)
+                        i = get_sub_group_local_id()
+
+                        # val = in[i]
+                        new_val = sub_group_shuffle_xor(in[i], 1)
+
+                        in[i] = new_val
+                        return
+                    end
+
+                    # tests that each pair of values a get swapped using sub_group_shuffle_xor
+                    @testset for T in cl.sub_group_shuffle_supported_types(cl.device())
+                        in = rand(T, sg_size)
+                        idxs = xor.(0:(sg_size - 1), 1) .+ 1
+                        d_in = CLArray(in)
+                        @opencl local_size = sg_size global_size = sg_size shfl_xor_kernel(d_in)
+                        @test Array(d_in) == in[idxs]
+                    end
+                end
+            end
+        end # if cl.sub_groups_supported(cl.device())
 
 @testset "SIMD - $N x $T" for N in simd_ns, T in float_types
     # codegen emits i48 here, which SPIR-V doesn't support

@codecov
Copy link

codecov bot commented Jan 12, 2026

Codecov Report

✅ All modified and coverable lines are covered by tests.
✅ Project coverage is 81.02%. Comparing base (d476f50) to head (0155bd1).
⚠️ Report is 1 commits behind head on master.

Additional details and impacted files
@@            Coverage Diff             @@
##           master     #413      +/-   ##
==========================================
+ Coverage   80.84%   81.02%   +0.18%     
==========================================
  Files          12       12              
  Lines         736      743       +7     
==========================================
+ Hits          595      602       +7     
  Misses        141      141              

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@christiangnrd christiangnrd marked this pull request as draft January 15, 2026 20:54
@christiangnrd christiangnrd marked this pull request as ready for review January 16, 2026 03:00
@VarLad VarLad mentioned this pull request Jan 18, 2026
@christiangnrd christiangnrd force-pushed the clsubgroups branch 2 times, most recently from c58b3e9 to 92f0e6e Compare February 1, 2026 00:44
@christiangnrd christiangnrd changed the title Subgroup support [OpenCL] Subgroup support Feb 13, 2026
Copy link
Member

@simeonschaub simeonschaub left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very nice work! I have a couple of comments, but overall looks great!

Copy link
Member

@simeonschaub simeonschaub left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, this looks great from my side! Feel free to merge and tag a new version

@christiangnrd

This comment was marked as outdated.

@christiangnrd christiangnrd merged commit bea5703 into master Feb 18, 2026
37 of 39 checks passed
@christiangnrd christiangnrd deleted the clsubgroups branch February 18, 2026 22:17
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants

Comments