3 changes: 2 additions & 1 deletion Project.toml
@@ -23,12 +23,13 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

[sources]
SPIRVIntrinsics = {path = "lib/intrinsics"}
KernelAbstractions = {rev = "main", url = "https://github.com/JuliaGPU/KernelAbstractions.jl"}

[compat]
Adapt = "4"
GPUArrays = "11.2.1"
GPUCompiler = "1.7.1"
KernelAbstractions = "0.9.38"
KernelAbstractions = "0.9, 0.10"
LLVM = "9.1"
LinearAlgebra = "1"
OpenCL_jll = "=2024.10.24"
51 changes: 51 additions & 0 deletions lib/cl/device.jl
@@ -139,6 +139,20 @@ end
return tuple([Int(r) for r in result]...)
end

# error handling inspired by rusticl
# https://gitlab.freedesktop.org/mesa/mesa/-/blob/c4385d6fb0938231114eb3023082cd33788b89b4/src/gallium/frontends/rusticl/api/device.rs#L314-320
if s == :sub_group_sizes
res_size = Ref{Csize_t}()
err = unchecked_clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, 0, C_NULL, res_size)
if err == CL_SUCCESS && res_size[] > 1
result = Vector{Csize_t}(undef, res_size[] ÷ sizeof(Csize_t))
clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, sizeof(result), result, C_NULL)
return tuple([Int(r) for r in result]...)
else
return tuple(0, 1)
end
end

if s == :max_image2d_shape
width = Ref{Csize_t}()
height = Ref{Csize_t}()
@@ -273,3 +287,40 @@ function cl_device_type(dtype::Symbol)
end
return cl_dtype
end

sub_groups_supported(d::Device) = "cl_khr_subgroups" in d.extensions || "cl_intel_subgroups" in d.extensions
function sub_group_size(d::Device)
sub_groups_supported(d) || return 0
if "cl_amd_device_attribute_query" in d.extensions
scalar = Ref{cl_uint}()
clGetDeviceInfo(d, CL_DEVICE_WAVEFRONT_WIDTH_AMD, sizeof(cl_uint), scalar, C_NULL)
return Int(scalar[])
elseif "cl_nv_device_attribute_query" in d.extensions
scalar = Ref{cl_uint}()
clGetDeviceInfo(d, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), scalar, C_NULL)
return Int(scalar[])
else
sg_sizes = d.sub_group_sizes
return if length(sg_sizes) == 1
Int(only(sg_sizes))
elseif 32 in sg_sizes
32
elseif 64 in sg_sizes
64
elseif 16 in sg_sizes
16
else
Int(first(sg_sizes))
end
end
end
function sub_group_shuffle_supported_types(d::Device)
if "cl_khr_subgroup_shuffle" in d.extensions
res = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Float32]
"cl_khr_fp16" in d.extensions && push!(res, Float16)
"cl_khr_fp64" in d.extensions && push!(res, Float64)
res
else
DataType[]
end
end
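
The new host-side helpers can be exercised directly from a REPL; a minimal sketch, assuming a device has already been selected (OpenCL.jl picks a default):

```julia
using OpenCL

dev = cl.device()
if cl.sub_groups_supported(dev)
    @show cl.sub_group_size(dev)                     # preferred lane count, e.g. 32
    @show dev.sub_group_sizes                        # all sizes the device reports
    @show cl.sub_group_shuffle_supported_types(dev)  # element types valid for shuffles
end
```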
2 changes: 1 addition & 1 deletion lib/intrinsics/Project.toml
@@ -1,7 +1,7 @@
name = "SPIRVIntrinsics"
uuid = "71d1d633-e7e8-4a92-83a1-de8814b09ba8"
authors = ["Tim Besard <tim.besard@gmail.com>"]
version = "0.5.6"
version = "0.5.7"

[deps]
ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
1 change: 1 addition & 0 deletions lib/intrinsics/src/SPIRVIntrinsics.jl
@@ -23,6 +23,7 @@ include("printf.jl")
include("math.jl")
include("integer.jl")
include("atomic.jl")
include("shuffle.jl")

# helper macro to import all names from this package, even non-exported ones.
macro import_all()
11 changes: 11 additions & 0 deletions lib/intrinsics/src/shuffle.jl
@@ -0,0 +1,11 @@
export sub_group_shuffle, sub_group_shuffle_xor

const gentypes = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Float16, Float32, Float64]

for gentype in gentypes
@eval begin
# cl_khr_subgroup_shuffle extension operations
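# NB: the builtin's lane index is 0-based, hence the `- 1i32` below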
@device_function sub_group_shuffle(x::$gentype, i::Integer) = @builtin_ccall("sub_group_shuffle", $gentype, ($gentype, Int32), x, i % Int32 - 1i32)
@device_function sub_group_shuffle_xor(x::$gentype, mask::Integer) = @builtin_ccall("sub_group_shuffle_xor", $gentype, ($gentype, Int32), x, mask % Int32)
end
end
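
For illustration, a hypothetical device-side reduction built on these wrappers, assuming the existing `get_sub_group_local_id`/`get_sub_group_size` indexing intrinsics and a power-of-two subgroup size (note the second argument to `sub_group_shuffle_xor` is a lane mask, not an index):

```julia
# butterfly reduction: every lane ends up with the sum over its subgroup
function sub_group_sum_kernel(d)
    i = get_sub_group_local_id()        # 1-based lane id
    val = d[i]
    offset = get_sub_group_size() ÷ 2
    while offset > 0
        val += sub_group_shuffle_xor(val, offset)
        offset ÷= 2
    end
    d[i] = val
    return
end

# e.g.: @opencl local_size=sg_size global_size=sg_size sub_group_sum_kernel(d_a)
```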
6 changes: 5 additions & 1 deletion lib/intrinsics/src/synchronization.jl
@@ -149,10 +149,14 @@ write_mem_fence(flags) = atomic_work_item_fence(flags, memory_order_release, mem

## OpenCL execution barriers

export barrier, work_group_barrier
export barrier, work_group_barrier, sub_group_barrier

@inline work_group_barrier(flags, scope = memory_scope_work_group) =
control_barrier(Scope.Workgroup, cl_scope_to_spirv(scope),
MemorySemantics.SequentiallyConsistent | mem_fence_flags_to_semantics(flags))

@inline sub_group_barrier(flags, scope = memory_scope_sub_group) =
control_barrier(Scope.Subgroup, cl_scope_to_spirv(scope),
MemorySemantics.SequentiallyConsistent | mem_fence_flags_to_semantics(flags))

barrier(flags) = work_group_barrier(flags)
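
`sub_group_barrier` accepts the same fence flags as `work_group_barrier` but only synchronizes the lanes of the calling subgroup. A hypothetical call site, assuming a `memory_scope_device` constant defined alongside the scopes used above:

```julia
# make this subgroup's local-memory traffic visible to its own lanes
sub_group_barrier(OpenCL.LOCAL_MEM_FENCE)

# a wider memory scope can be requested explicitly
sub_group_barrier(OpenCL.GLOBAL_MEM_FENCE, OpenCL.memory_scope_device)
```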
68 changes: 49 additions & 19 deletions src/OpenCLKernels.jl
@@ -1,9 +1,10 @@
module OpenCLKernels

using ..OpenCL
using ..OpenCL: @device_override, method_table
using ..OpenCL: @device_override, method_table, kernel_convert, clfunction

import KernelAbstractions as KA
import KernelAbstractions.KernelIntrinsics as KI

import StaticArrays

@@ -126,33 +127,62 @@ function (obj::KA.Kernel{OpenCLBackend})(args...; ndrange=nothing, workgroupsize
return nothing
end

KI.argconvert(::OpenCLBackend, arg) = kernel_convert(arg)

function KI.kernel_function(::OpenCLBackend, f::F, tt::TT=Tuple{}; name = nothing, kwargs...) where {F,TT}
kern = clfunction(f, tt; name, kwargs...)
KI.Kernel{OpenCLBackend, typeof(kern)}(OpenCLBackend(), kern)
end

function (obj::KI.Kernel{OpenCLBackend})(args...; numworkgroups = 1, workgroupsize = 1)
KI.check_launch_args(numworkgroups, workgroupsize)

local_size = (workgroupsize..., ntuple(_ -> 1, 3 - length(workgroupsize))...)

numworkgroups = (numworkgroups..., ntuple(_ -> 1, 3 - length(numworkgroups))...)
global_size = local_size .* numworkgroups

obj.kern(args...; local_size, global_size)
return nothing
end


function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:OpenCLBackend}; max_work_items::Int=typemax(Int))::Int
wginfo = cl.work_group_info(kernel.kern.fun, cl.device())
Int(min(wginfo.size, max_work_items))
end
function KI.max_work_group_size(::OpenCLBackend)::Int
Int(cl.device().max_work_group_size)
end
function KI.multiprocessor_count(::OpenCLBackend)::Int
Int(cl.device().max_compute_units)
end

## Indexing Functions
## COV_EXCL_START

@device_override @inline function KA.__index_Local_Linear(ctx)
return get_local_id(1)
@device_override @inline function KI.get_local_id()
return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3)))
end

@device_override @inline function KA.__index_Group_Linear(ctx)
return get_group_id(1)
@device_override @inline function KI.get_group_id()
return (; x = Int(get_group_id(1)), y = Int(get_group_id(2)), z = Int(get_group_id(3)))
end

@device_override @inline function KA.__index_Global_Linear(ctx)
#return get_global_id(1) # JuliaGPU/OpenCL.jl#346
I = KA.__index_Global_Cartesian(ctx)
@inbounds LinearIndices(KA.__ndrange(ctx))[I]
@device_override @inline function KI.get_global_id()
return (; x = Int(get_global_id(1)), y = Int(get_global_id(2)), z = Int(get_global_id(3)))
end

@device_override @inline function KA.__index_Local_Cartesian(ctx)
@inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)]
@device_override @inline function KI.get_local_size()
return (; x = Int(get_local_size(1)), y = Int(get_local_size(2)), z = Int(get_local_size(3)))
end

@device_override @inline function KA.__index_Group_Cartesian(ctx)
@inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)]
@device_override @inline function KI.get_num_groups()
return (; x = Int(get_num_groups(1)), y = Int(get_num_groups(2)), z = Int(get_num_groups(3)))
end

@device_override @inline function KA.__index_Global_Cartesian(ctx)
return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
@device_override @inline function KI.get_global_size()
return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3)))
end

@device_override @inline function KA.__validindex(ctx)
@@ -167,7 +197,7 @@ end

## Shared and Scratch Memory

@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
ptr = OpenCL.emit_localmemory(T, Val(prod(Dims)))
CLDeviceArray(Dims, ptr)
end
Expand All @@ -179,14 +209,14 @@ end

## Synchronization and Printing

@device_override @inline function KA.__synchronize()
@device_override @inline function KI.barrier()
work_group_barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
end

@device_override @inline function KA.__print(args...)
@device_override @inline function KI._print(args...)
OpenCL._print(args...)
end

## COV_EXCL_STOP

## Other

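
To see how the KernelIntrinsics entry points above fit together, a hypothetical end-to-end launch; `copy_kernel!`, `n`, and the work-group size of 256 are illustrative, and `KI.argconvert` is applied by hand only to build the compilation signature:

```julia
import KernelAbstractions.KernelIntrinsics as KI
using OpenCL

function copy_kernel!(dst, src)
    i = KI.get_global_id().x            # 1-based linear index
    if i <= length(dst)
        @inbounds dst[i] = src[i]
    end
    return
end

n = 1024
src = CLArray(rand(Float32, n))
dst = similar(src)

backend = OpenCLBackend()
tt = Tuple{map(a -> typeof(KI.argconvert(backend, a)), (dst, src))...}
kern = KI.kernel_function(backend, copy_kernel!, tt)
kern(dst, src; numworkgroups = cld(n, 256), workgroupsize = 256)
```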
25 changes: 23 additions & 2 deletions src/compiler/compilation.jl
@@ -1,6 +1,14 @@
## gpucompiler interface

struct OpenCLCompilerParams <: AbstractCompilerParams end
Base.@kwdef struct OpenCLCompilerParams <: AbstractCompilerParams
sub_group_size::Int # Some devices support multiple sizes. This is used to force one when needed
end
function Base.hash(params::OpenCLCompilerParams, h::UInt)
h = hash(params.sub_group_size, h)

return h
end

const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams}
const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget,OpenCLCompilerParams}

Expand Down Expand Up @@ -29,6 +37,12 @@ function GPUCompiler.finish_module!(@nospecialize(job::OpenCLCompilerJob),
Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
job, mod, entry)

# Set the subgroup size if supported
sg_size = job.config.params.sub_group_size
if sg_size >= 0
metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(sg_size))])
end

# if this kernel uses our RNG, we should prime the shared state.
# XXX: these transformations should really happen at the Julia IR level...
if haskey(functions(mod), "julia.opencl.random_keys") && job.config.kernel
@@ -131,9 +145,16 @@ end
supports_fp16 = "cl_khr_fp16" in dev.extensions
supports_fp64 = "cl_khr_fp64" in dev.extensions

# Set to -1 if specifying a subgroup size is not supported
sub_group_size = if "cl_intel_required_subgroup_size" in dev.extensions
cl.sub_group_size(dev)
else
-1
end

# create GPUCompiler objects
target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, validate=true, kwargs...)
params = OpenCLCompilerParams()
params = OpenCLCompilerParams(; sub_group_size)
CompilerConfig(target, params; kernel, name, always_inline)
end

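
Hashing `OpenCLCompilerParams` (above) matters because the params now carry state: configs that force different subgroup sizes must not collide in the compilation cache. Which path a device takes can be checked from the host; a sketch, assuming the usual `dev.name` device-info property:

```julia
dev = cl.device()
if "cl_intel_required_subgroup_size" in dev.extensions
    @info "kernels on $(dev.name) get a fixed subgroup size" size = cl.sub_group_size(dev)
else
    @info "subgroup size on $(dev.name) is left to the driver"   # params.sub_group_size == -1
end
```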
1 change: 1 addition & 0 deletions test/Project.toml
@@ -8,6 +8,7 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
77 changes: 77 additions & 0 deletions test/intrinsics.jl
@@ -166,6 +166,83 @@ end
@test call_on_device(OpenCL.mad, x, y, z) ≈ x * y + z
end

if cl.sub_groups_supported(cl.device())

struct SubgroupData
sub_group_size::UInt32
max_sub_group_size::UInt32
num_sub_groups::UInt32
sub_group_id::UInt32
sub_group_local_id::UInt32
end
function test_subgroup_kernel(results)
i = get_global_id(1)

if i <= length(results)
@inbounds results[i] = SubgroupData(
get_sub_group_size(),
get_max_sub_group_size(),
get_num_sub_groups(),
get_sub_group_id(),
get_sub_group_local_id()
)
end
return
end

@testset "Sub-groups" begin
sg_size = cl.sub_group_size(cl.device())

@testset "Indexing intrinsics" begin
# Test with small kernel
sg_n = 2
local_size = sg_size * sg_n
numworkgroups = 2
N = local_size * numworkgroups

results = CLVector{SubgroupData}(undef, N)
kernel = @opencl launch = false test_subgroup_kernel(results)

kernel(results; local_size, global_size=N)

host_results = Array(results)

# Verify results make sense
for (i, sg_data) in enumerate(host_results)
@test sg_data.sub_group_size == sg_size
@test sg_data.max_sub_group_size == sg_size
@test sg_data.num_sub_groups == sg_n

# Group ID should be 1-based
expected_sub_group = div(((i - 1) % local_size), sg_size) + 1
@test sg_data.sub_group_id == expected_sub_group

# Local ID should be 1-based within group
expected_sg_local = ((i - 1) % sg_size) + 1
@test sg_data.sub_group_local_id == expected_sg_local
end
end

@testset "shuffle idx" begin
function shfl_idx_kernel(d)
i = get_sub_group_local_id()
j = get_sub_group_size() - i + 1

d[i] = sub_group_shuffle(d[i], j)

return
end

@testset for T in cl.sub_group_shuffle_supported_types(cl.device())
a = rand(T, sg_size)
d_a = CLArray(a)
@opencl local_size = sg_size global_size = sg_size shfl_idx_kernel(d_a)
@test Array(d_a) == reverse(a)
end
end
end
end # if cl.sub_groups_supported(cl.device())

@testset "SIMD - $N x $T" for N in simd_ns, T in float_types
# codegen emits i48 here, which SPIR-V doesn't support
# XXX: fix upstream?