From 84d49d14d761be286cd707f18016a36819a0d8af Mon Sep 17 00:00:00 2001
From: Parvm1102 <parvmittal31757@gmail.com>
Date: Wed, 4 Mar 2026 23:51:06 +0530
Subject: [PATCH] test: Add Mooncake AD testing to conv layer test
 infrastructure

Signed-off-by: Parvm1102 <parvmittal31757@gmail.com>
---
 GraphNeuralNetworks/test/Project.toml   |  1 +
 GraphNeuralNetworks/test/layers/conv.jl | 45 ++++++++++++++-----------
 GraphNeuralNetworks/test/runtests.jl    |  2 ++
 GraphNeuralNetworks/test/test_module.jl | 36 +++++++++++++++++---
 4 files changed, 60 insertions(+), 24 deletions(-)

diff --git a/GraphNeuralNetworks/test/Project.toml b/GraphNeuralNetworks/test/Project.toml
index 31f7482f4..9efc8f3a1 100644
--- a/GraphNeuralNetworks/test/Project.toml
+++ b/GraphNeuralNetworks/test/Project.toml
@@ -8,6 +8,7 @@ GNNlib = "a6a84749-d869-43f8-aacc-be26a1996e48"
 GraphNeuralNetworks = "cffab07f-9bc2-4db1-8861-388f63bf7694"
 Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
 MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
+Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
diff --git a/GraphNeuralNetworks/test/layers/conv.jl b/GraphNeuralNetworks/test/layers/conv.jl
index 161ff822b..ecc0fbd78 100644
--- a/GraphNeuralNetworks/test/layers/conv.jl
+++ b/GraphNeuralNetworks/test/layers/conv.jl
@@ -10,20 +10,20 @@ end
         l = GCNConv(D_IN => D_OUT)
         for g in TEST_GRAPHS
             @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
-            test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+            test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_mooncake = TEST_MOONCAKE)
         end
 
         l = GCNConv(D_IN => D_OUT, tanh, bias = false)
         for g in TEST_GRAPHS
             @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
-            test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+            test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_mooncake = TEST_MOONCAKE)
         end
 
         l = GCNConv(D_IN => D_OUT, add_self_loops = false)
         for g in TEST_GRAPHS
             has_isolated_nodes(g) && continue
             @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
-            test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+            test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_mooncake = TEST_MOONCAKE)
         end
     end
 
@@ -49,7 +49,7 @@ end
         l = GCNConv(1 => 1, add_self_loops = false, use_edge_weight = true)
         @test gradient(w -> sum(l(g, x, w)), w)[1] isa AbstractVector{Float32}   # redundant test but more explicit
         @test size(l(g, x, w)) == (1, g.num_nodes)
-        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_mooncake = TEST_MOONCAKE)
     end
 
     @testset "conv_weight" begin
@@ -86,6 +86,7 @@ end
     for g in TEST_GRAPHS
         g = add_self_loops(g)
         @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        # Note: test_mooncake not enabled for ChebConv (Mooncake backward pass error)
         test_gradients(l, g, g.x, rtol = RTOL_LOW)
     end
 
@@ -124,13 +125,13 @@ end
     l = GraphConv(D_IN => D_OUT)
     for g in TEST_GRAPHS
         @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
-        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_mooncake = TEST_MOONCAKE)
     end
 
     l = GraphConv(D_IN => D_OUT, tanh, bias = false, aggr = mean)
     for g in TEST_GRAPHS
         @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
-        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_mooncake = TEST_MOONCAKE)
     end
 
     @testset "bias=false" begin
@@ -157,7 +158,7 @@ end
         l = GATConv(D_IN => D_OUT; heads, concat, dropout=0)
         for g in TEST_GRAPHS
             @test size(l(g, g.x)) == (concat ? heads * D_OUT : D_OUT, g.num_nodes)
-            test_gradients(l, g, g.x, rtol = RTOL_LOW)
+            test_gradients(l, g, g.x, rtol = RTOL_LOW, test_mooncake = TEST_MOONCAKE)
         end
     end
 
@@ -166,7 +167,7 @@ end
         l = GATConv((D_IN, ein) => D_OUT, add_self_loops = false, dropout=0)
         g = GNNGraph(TEST_GRAPHS[1], edata = rand(Float32, ein, TEST_GRAPHS[1].num_edges))
         @test size(l(g, g.x, g.e)) == (D_OUT, g.num_nodes)
-        test_gradients(l, g, g.x, g.e, rtol = RTOL_LOW)
+        test_gradients(l, g, g.x, g.e, rtol = RTOL_LOW, test_mooncake = TEST_MOONCAKE)
     end
 
     @testset "num params" begin
@@ -197,6 +198,7 @@ end
         l = GATv2Conv(D_IN => D_OUT, tanh; heads, concat, dropout=0)
         for g in TEST_GRAPHS
             @test size(l(g, g.x)) == (concat ? heads * D_OUT : D_OUT, g.num_nodes)
+            # Note: test_mooncake not enabled for GATv2Conv (Mooncake backward pass error on CI)
             test_gradients(l, g, g.x, rtol = RTOL_LOW, atol=ATOL_LOW)
         end
     end
@@ -206,6 +208,7 @@ end
         l = GATv2Conv((D_IN, ein) => D_OUT, add_self_loops = false, dropout=0)
         g = GNNGraph(TEST_GRAPHS[1], edata = rand(Float32, ein, TEST_GRAPHS[1].num_edges))
         @test size(l(g, g.x, g.e)) == (D_OUT, g.num_nodes)
+        # Note: test_mooncake not enabled for GATv2Conv (Mooncake backward pass error on CI)
         test_gradients(l, g, g.x, g.e, rtol = RTOL_LOW, atol=ATOL_LOW)
     end
 
@@ -239,7 +242,7 @@ end
 
     for g in TEST_GRAPHS
         @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
-        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_mooncake = TEST_MOONCAKE)
     end
 end
 
@@ -260,7 +263,7 @@ end
     l = EdgeConv(Dense(2 * D_IN, D_OUT), aggr = +)
     for g in TEST_GRAPHS
         @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
-        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_mooncake = TEST_MOONCAKE)
     end
 end
 
@@ -281,7 +284,7 @@ end
     l = GINConv(nn, 0.01, aggr = mean)
     for g in TEST_GRAPHS
         @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
-        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_mooncake = TEST_MOONCAKE)
     end
 
     @test !in(:eps, Flux.trainable(l))
@@ -307,7 +310,7 @@ end
     for g in TEST_GRAPHS
         g = GNNGraph(g, edata = rand(Float32, edim, g.num_edges))
         @test size(l(g, g.x, g.e)) == (D_OUT, g.num_nodes)
-        test_gradients(l, g, g.x, g.e, rtol = RTOL_HIGH)
+        test_gradients(l, g, g.x, g.e, rtol = RTOL_HIGH, test_mooncake = TEST_MOONCAKE)
     end
 end
 
@@ -332,7 +335,7 @@ end
     l = SAGEConv(D_IN => D_OUT, tanh, bias = false, aggr = +)
     for g in TEST_GRAPHS
         @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
-        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_mooncake = TEST_MOONCAKE)
     end
 end
 
@@ -351,7 +354,7 @@ end
     l = ResGatedGraphConv(D_IN => D_OUT, tanh, bias = true)
     for g in TEST_GRAPHS
         @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
-        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_mooncake = TEST_MOONCAKE)
     end
 end
 
@@ -411,7 +414,7 @@ end
     Flux.trainable(l) == (; β = [1f0])
     for g in TEST_GRAPHS
         @test size(l(g, g.x)) == (D_IN, g.num_nodes)
-        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_mooncake = TEST_MOONCAKE)
     end
 end
 
@@ -437,7 +440,7 @@ end
             y = l(g, x, e)
             return mean(y[1]) + sum(y[2])
         end
-        test_gradients(l, g, g.x, g.e, rtol = RTOL_LOW; loss)
+        test_gradients(l, g, g.x, g.e, rtol = RTOL_LOW; loss, test_mooncake = TEST_MOONCAKE)
     end
 end
 
@@ -491,13 +494,13 @@ end
         l = SGConv(D_IN => D_OUT, k, add_self_loops = true)
         for g in TEST_GRAPHS
             @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
-            test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+            test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_mooncake = TEST_MOONCAKE)
         end
 
         l = SGConv(D_IN => D_OUT, k, add_self_loops = true)
         for g in TEST_GRAPHS
             @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
-            test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+            test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_mooncake = TEST_MOONCAKE)
         end
     end
 end
@@ -520,13 +523,13 @@ end
         l = TAGConv(D_IN => D_OUT, k, add_self_loops = true)
         for g in TEST_GRAPHS
             @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
-            test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+            test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_mooncake = TEST_MOONCAKE)
         end
 
         l = TAGConv(D_IN => D_OUT, k, add_self_loops = true)
         for g in TEST_GRAPHS
             @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
-            test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+            test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_mooncake = TEST_MOONCAKE)
         end
     end
 end
@@ -565,6 +568,7 @@ end
     ein = 2
     heads = 3
     # used like in Kool et al., 2019
+    # Note: test_mooncake not enabled for TransformerConv (Mooncake backward pass error on CI)
     l = TransformerConv(D_IN * heads => D_IN; heads, add_self_loops = true,
                         root_weight = false, ff_channels = 10, skip_connection = true,
                         batch_norm = false)
@@ -616,6 +620,7 @@ end
         l = DConv(D_IN => D_OUT, k)
         for g in TEST_GRAPHS
             @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+            # Note: test_mooncake not enabled for DConv (Mooncake backward pass error)
             test_gradients(l, g, g.x, rtol = RTOL_HIGH)
         end
     end
diff --git a/GraphNeuralNetworks/test/runtests.jl b/GraphNeuralNetworks/test/runtests.jl
index 0a1dfeec6..cdd695805 100644
--- a/GraphNeuralNetworks/test/runtests.jl
+++ b/GraphNeuralNetworks/test/runtests.jl
@@ -4,6 +4,8 @@ using TestItemRunner
 ## for how to run the tests within VS Code.
 ## See test_module.jl for the test infrastructure.
 
+const TEST_MOONCAKE = VERSION >= v"1.12"
+
 ## Uncomment below to change the default test settings
 # ENV["GNN_TEST_CPU"] = "false"
 # ENV["GNN_TEST_CUDA"] = "true"
diff --git a/GraphNeuralNetworks/test/test_module.jl b/GraphNeuralNetworks/test/test_module.jl
index ea6cc3a06..8da2ff740 100644
--- a/GraphNeuralNetworks/test/test_module.jl
+++ b/GraphNeuralNetworks/test/test_module.jl
@@ -29,6 +29,11 @@ using ChainRulesTestUtils, FiniteDifferences
 using Zygote: Zygote
 using SparseArrays
 
+# Mooncake AD tests are only enabled (and Mooncake only imported) on Julia >= 1.12
+const TEST_MOONCAKE = VERSION >= v"1.12"
+if TEST_MOONCAKE
+    import Mooncake
+end
 
 # from Base
 export mean, randn, SparseArrays, AbstractSparseMatrix
@@ -45,7 +50,7 @@ export random_regular_graph, erdos_renyi
 # from this module
 export D_IN, D_OUT, GRAPH_TYPES, TEST_GRAPHS,
        test_gradients, finitediff_withgradient, 
-       check_equal_leaves, gpu_backend
+       check_equal_leaves, gpu_backend, TEST_MOONCAKE
 
 
 const D_IN = 3
@@ -82,12 +87,13 @@ function test_gradients(
             test_grad_f = true,
             test_grad_x = true,
             compare_finite_diff = true,
+            test_mooncake = false,
             loss = (f, g, xs...) -> mean(f(g, xs...)),
             )
 
-    if !test_gpu && !compare_finite_diff
-        error("You should either compare finite diff vs CPU AD \
-               or CPU AD vs GPU AD.")
+    if !test_gpu && !compare_finite_diff && !test_mooncake
+        error("You should either compare finite diff vs CPU AD, \
+               CPU AD vs GPU AD, or test Mooncake AD.")
     end
 
     ## Let's make sure first that the forward pass works.
@@ -116,6 +122,17 @@ function test_gradients(
             check_equal_leaves(g, g_fd; rtol, atol)
         end
 
+        if test_mooncake
+            # Mooncake gradient w.r.t. the inputs, compared against Zygote; g_mc[1] is the gradient w.r.t. loss_mc_x itself.
+            loss_mc_x = (xs...) -> loss(f, graph, xs...)
+            _cache_x = Base.invokelatest(Mooncake.prepare_gradient_cache, loss_mc_x, xs...)
+            y_mc, g_mc = Base.invokelatest(Mooncake.value_and_gradient!!, _cache_x, loss_mc_x, xs...)
+            @assert isapprox(y, y_mc; rtol, atol)
+            for i in eachindex(xs)
+                @assert isapprox(g[i], g_mc[i+1]; rtol, atol)
+            end
+        end
+
         if test_gpu
             # Zygote gradient with respect to input on GPU.
             y_gpu, g_gpu = Zygote.withgradient((xs...) -> loss(f_gpu, graph_gpu, xs...), xs_gpu...)
@@ -139,6 +156,17 @@ function test_gradients(
             check_equal_leaves(g, g_fd; rtol, atol)
         end
 
+        if test_mooncake
+            # Mooncake gradient w.r.t. f's flat parameter vector (Flux.destructure), rebuilt and compared against Zygote.
+            ps_mc, re_mc = Flux.destructure(f)
+            loss_mc_f = ps -> loss(re_mc(ps), graph, xs...)
+            _cache_f = Base.invokelatest(Mooncake.prepare_gradient_cache, loss_mc_f, ps_mc)
+            y_mc, g_mc = Base.invokelatest(Mooncake.value_and_gradient!!, _cache_f, loss_mc_f, ps_mc)
+            @assert isapprox(y, y_mc; rtol, atol)
+            g_mc_f = (re_mc(g_mc[2]),)
+            check_equal_leaves(g, g_mc_f; rtol, atol)
+        end
+
         if test_gpu
             # Zygote gradient with respect to f on GPU.
             y_gpu, g_gpu = Zygote.withgradient(f -> loss(f,graph_gpu, xs_gpu...), f_gpu)