From 39956e068df14804d4c15881a90d61615ba947e9 Mon Sep 17 00:00:00 2001
From: jariji <96840304+jariji@users.noreply.github.com>
Date: Thu, 5 Mar 2026 20:17:50 +0000
Subject: [PATCH] Add Enum type support to ArrowTypes

Enums are serialized as their integer base type (Int32 unless the enum
declares another one) with a type registry for roundtrip
deserialization. Includes `registertype!` for read-only scenarios where
the writing session is unavailable.

Co-Authored-By: Claude Opus 4.6
---
 src/ArrowTypes/src/ArrowTypes.jl | 38 +++++++++++++++++++++++++++++-
 src/ArrowTypes/test/tests.jl     | 47 +++++++++++++++++++++++++++++++++++++
 test/runtests.jl                 | 42 +++++++++++++++++++++++++++++++++
 3 files changed, 126 insertions(+), 1 deletion(-)

diff --git a/src/ArrowTypes/src/ArrowTypes.jl b/src/ArrowTypes/src/ArrowTypes.jl
index fe2223f..bba4606 100644
--- a/src/ArrowTypes/src/ArrowTypes.jl
+++ b/src/ArrowTypes/src/ArrowTypes.jl
@@ -36,7 +36,8 @@ export ArrowKind,
     toarrow,
     arrowname,
     fromarrow,
-    ToArrow
+    ToArrow,
+    registertype!
 
 """
     ArrowTypes.ArrowKind(T)
@@ -285,6 +286,41 @@ arrowname(::Type{IPv6}) = IPV6_SYMBOL
 JuliaType(::Val{IPV6_SYMBOL}) = IPv6
 fromarrow(::Type{IPv6}, x::NTuple{16,UInt8}) = IPv6(_cast(UInt128, x))
 
+# Enum support
+const ENUM_SYMBOL = Symbol("JuliaLang.Enum")
+# Registry used to resolve the metadata string back to an enum type on read.
+# NOTE(review): keys are unqualified names (`nameof(T)`), so same-named enums from
+# different modules overwrite each other; access is also unsynchronized.
+const ENUM_TYPES = Dict{String,DataType}()
+
+"""
+    registertype!(::Type{T}) where {T<:Enum}
+
+Register an `Enum` type for deserialization. This is only needed when reading
+Arrow data in a session that did not write it (i.e. read-only scenarios).
+During writing, enum types are registered automatically via `arrowmetadata`.
+"""
+function registertype!(::Type{T}) where {T<:Enum}
+    ENUM_TYPES[string(nameof(T))] = T
+    return T
+end
+
+# Store an enum as its own integer base type (`Int32` unless declared otherwise) so
+# values outside the `Int32` range round-trip instead of throwing `InexactError`.
+ArrowType(::Type{<:Enum{B}}) where {B<:Integer} = B
+toarrow(x::Enum{B}) where {B<:Integer} = B(x)
+arrowname(::Type{<:Enum}) = ENUM_SYMBOL
+
+function arrowmetadata(::Type{T}) where {T<:Enum}
+    key = string(nameof(T))
+    ENUM_TYPES[key] = T
+    return key
+end
+
+JuliaType(::Val{ENUM_SYMBOL}, S, meta::AbstractString) = get(ENUM_TYPES, meta, nothing)
+fromarrow(::Type{T}, x::Integer) where {T<:Enum} = T(x)
+default(::Type{T}) where {T<:Enum} = typemin(T)
+
 function _cast(::Type{Y}, x)::Y where {Y}
     y = Ref{Y}()
     _unsafe_cast!(y, Ref(x), 1)
diff --git a/src/ArrowTypes/test/tests.jl b/src/ArrowTypes/test/tests.jl
index 22d8dd0..70c7b66 100644
--- a/src/ArrowTypes/test/tests.jl
+++ b/src/ArrowTypes/test/tests.jl
@@ -233,4 +233,51 @@ end
             @test isequal(x, [missing])
         end
     end
+
+    @testset "Enum" begin
+        @enum Fruit apple=0 banana=1 cherry=2
+
+        # ArrowType and toarrow
+        @test ArrowTypes.ArrowType(Fruit) == Int32
+        @test ArrowTypes.toarrow(apple) === Int32(0)
+        @test ArrowTypes.toarrow(cherry) === Int32(2)
+
+        # arrowname
+        @test ArrowTypes.arrowname(Fruit) === ArrowTypes.ENUM_SYMBOL
+        @test ArrowTypes.hasarrowname(Fruit)
+
+        # arrowmetadata registers the type and returns the name
+        meta = ArrowTypes.arrowmetadata(Fruit)
+        @test meta == "Fruit"
+        @test ArrowTypes.ENUM_TYPES["Fruit"] === Fruit
+
+        # JuliaType lookup via registry
+        @test ArrowTypes.JuliaType(Val(ArrowTypes.ENUM_SYMBOL), Int32, "Fruit") === Fruit
+        @test ArrowTypes.JuliaType(Val(ArrowTypes.ENUM_SYMBOL), Int32, "NoSuchEnum") === nothing
+
+        # fromarrow
+        @test ArrowTypes.fromarrow(Fruit, Int32(0)) === apple
+        @test ArrowTypes.fromarrow(Fruit, Int32(2)) === cherry
+
+        # default
+        @test ArrowTypes.default(Fruit) === apple
+
+        # Non-default base types are stored at their own width
+        @enum Wide::Int64 narrow=0 huge=3_000_000_000
+        @test ArrowTypes.ArrowType(Wide) == Int64
+        @test ArrowTypes.toarrow(huge) === Int64(3_000_000_000)
+        @test ArrowTypes.fromarrow(Wide, Int64(3_000_000_000)) === huge
+
+        # Manual registertype!
+        @enum Planet mercury=0 venus=1 earth=2
+        # Not yet registered (unless arrowmetadata was called)
+        delete!(ArrowTypes.ENUM_TYPES, "Planet")
+        @test ArrowTypes.JuliaType(Val(ArrowTypes.ENUM_SYMBOL), Int32, "Planet") === nothing
+        ArrowTypes.registertype!(Planet)
+        @test ArrowTypes.JuliaType(Val(ArrowTypes.ENUM_SYMBOL), Int32, "Planet") === Planet
+
+        # Union{Enum, Missing} passthrough
+        @test ArrowTypes.arrowname(Union{Fruit,Missing}) === ArrowTypes.ENUM_SYMBOL
+        @test ArrowTypes.arrowmetadata(Union{Fruit,Missing}) == "Fruit"
+    end
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 315d1b6..57a2212 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1084,6 +1084,48 @@ end
             )
             @test t.reject_reason[end] == "POST_ONLY"
         end
+
+        @testset "Enum roundtrip" begin
+            @enum Direction north=0 south=1 east=2 west=3
+
+            # Basic roundtrip
+            orig = [north, south, east, west, north]
+            tbl = Arrow.Table(Arrow.tobuffer((dir=orig,)))
+            @test tbl.dir == orig
+            @test eltype(tbl.dir) == Direction
+
+            # Union{Enum, Missing}
+            orig_m = Union{Direction,Missing}[north, missing, east, missing, west]
+            tbl2 = Arrow.Table(Arrow.tobuffer((dir=orig_m,)))
+            @test isequal(tbl2.dir, orig_m)
+            @test eltype(tbl2.dir) == Union{Direction,Missing}
+
+            # Multiple enum columns
+            @enum Priority low=0 medium=1 high=2
+            orig_d = [north, south, east]
+            orig_p = [low, high, medium]
+            tbl3 = Arrow.Table(Arrow.tobuffer((dir=orig_d, pri=orig_p)))
+            @test tbl3.dir == orig_d
+            @test eltype(tbl3.dir) == Direction
+            @test tbl3.pri == orig_p
+            @test eltype(tbl3.pri) == Priority
+
+            # Multiple record batches
+            orig_long = repeat([north, south, east, west], 100)
+            io = IOBuffer()
+            Arrow.write(io, (dir=orig_long,); file=false)
+            seekstart(io)
+            tbl4 = Arrow.Table(io)
+            @test tbl4.dir == orig_long
+            @test eltype(tbl4.dir) == Direction
+
+            # Non-default base type with a value outside the Int32 range
+            @enum Offset::Int64 near=0 far=3_000_000_000
+            orig_o = [near, far, near]
+            tbl5 = Arrow.Table(Arrow.tobuffer((off=orig_o,)))
+            @test tbl5.off == orig_o
+            @test eltype(tbl5.off) == Offset
+        end
     end # @testset "misc"
 
     @testset "DataAPI.metadata" begin