From dfb5173c009bc983cffa36ee032ee59c93418678 Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Fri, 16 Jan 2026 11:29:31 -0700
Subject: [PATCH 1/7] starting on openmp build

---
 cuBQL/builder/openmp.h                  |  26 ++
 cuBQL/builder/openmp/build_omp_target.h | 300 ++++++++++++++++++++++++
 2 files changed, 326 insertions(+)
 create mode 100644 cuBQL/builder/openmp.h
 create mode 100644 cuBQL/builder/openmp/build_omp_target.h
diff --git a/cuBQL/builder/openmp.h b/cuBQL/builder/openmp.h
new file mode 100644
index 0000000..396729d
--- /dev/null
+++ b/cuBQL/builder/openmp.h
@@ -0,0 +1,26 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <omp.h>
+
+namespace cuBQL {
+
+  /*! openmp based builder with #pragma omp target directives. */
+  template<typename T, int D>
+  void build_omp_target(BinaryBVH<T,D>   &bvh,
+                        /*! array of bounding boxes to build BVH over,
+                          must be in target device memory (ie, must be
+                          accessible in the device(gpuID) that the
+                          'gpuID' parameter refers to */
+                        const box_t<T,D> *d_boxes,
+                        uint32_t          numBoxes,
+                        BuildConfig       buildConfig=BuildConfig(),
+                        int               gpuID = 0);
+}
+#if CUBQL_OPENMP_BUILDER_IMPLEMENTATION
+# include "openmp/build_omp_target.h"
+#endif
+
+
diff --git a/cuBQL/builder/openmp/build_omp_target.h b/cuBQL/builder/openmp/build_omp_target.h
new file mode 100644
index 0000000..f53a47c
--- /dev/null
+++ b/cuBQL/builder/openmp/build_omp_target.h
@@ -0,0 +1,300 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <omp.h>
+#include <atomic>
+
+namespace cuBQL {
+  namespace omp {
+
+    template<typename T, typename count_t>
+    inline void _ALLOC(T *&ptr, count_t count, int gpuID)
+    { ptr = (T*)omp_target_alloc(count*sizeof(T),gpuID); }
+    
+    template<typename T>
+    inline void _FREE(T *&ptr, int gpuID)
+    { omp_target_free(ptr,gpuID); ptr = 0; }
+    
+    typedef enum : int8_t { OPEN_BRANCH, OPEN_NODE, DONE_NODE } NodeState;
+    
+    // ==================================================================
+    // atomicbox
+    // ==================================================================
+    template<typename box_t>
+    struct CUBQL_ALIGN(8) AtomicBox {
+      inline bool is_empty() const { return lower[0] > upper[0]; }
+      inline void  set_empty();
+      // set_empty, in owl::common-style naming
+      inline void  clear() { set_empty(); }
+      inline float get_center(int dim) const;
+      inline box_t make_box() const;
+
+      inline float get_lower(int dim) const {
+        if (box_t::numDims>4) 
+          return decode(lower[dim]);
+        else if (box_t::numDims==4) {
+          return decode(dim>1
+                        ?((dim>2)?lower[3]:lower[2])
+                        :((dim  )?lower[1]:lower[0]));
+        } else if (box_t::numDims==3) {
+          return decode(dim>1
+                        ?lower[2]
+                        :((dim  )?lower[1]:lower[0]));
+        } else
+          return decode(lower[dim]);
+      }
+      inline float get_upper(int dim) const {
+        if (box_t::numDims>4) 
+          return decode(upper[dim]);
+        else if (box_t::numDims==4) {
+          return decode(dim>1
+                        ?((dim>2)?upper[3]:upper[2])
+                        :((dim  )?upper[1]:upper[0]));
+        } else if (box_t::numDims==3)
+          return decode(dim>1
+                        ?upper[2]
+                        :((dim  )?upper[1]:upper[0]));
+        else
+          return decode(upper[dim]);
+      }
+      
+      int32_t lower[box_t::numDims];
+      int32_t upper[box_t::numDims];
+      
+      inline static int32_t encode(float f);
+      inline static float   decode(int32_t bits);
+    };
+
+#ifdef __cplusplus > 202302L
+    inline void atomicMin(int32_t *ptr, int32_t value)
+    { if (value < *ptr) ((std::atomic<int> *)ptr)->fetch_min(value, std::memory_order::seq_cst); }
+    inline void atomicMax(int32_t *ptr, int32_t value)
+    { if (value > *ptr) ((std::atomic<int> *)ptr)->fetch_max(value, std::memory_order::seq_cst); }
+#else
+    inline void atomicMin(int32_t *ptr, int32_t value)
+    { 
+      int current = *(volatile int *)addr;
+      while (current > value) {
+        bool wasChanged
+          = ((std::atomic<int>*)addr)->compare_exchange_weak((int&)current,(int&)value);
+        if (wasChanged) break;
+      }
+    }
+    
+    inline void atomicMax(int32_t *ptr, int32_t value)
+    { 
+      int current = *(volatile int *)addr;
+      while (current < value) {
+        bool wasChanged
+          = ((std::atomic<int>*)addr)->compare_exchange_weak((int&)current,(int&)value);
+        if (wasChanged) break;
+      }
+    }
+    
+#endif
+    
+    template<typename box_t>
+    inline float AtomicBox<box_t>::get_center(int dim) const
+    {
+      return 0.5f*(get_lower(dim)+get_upper(dim));
+      // return 0.5f*(decode(lower[dim])+decode(upper[dim]));
+    }
+
+    template<typename box_t>
+    inline box_t AtomicBox<box_t>::make_box() const
+    {
+      box_t box;
+#pragma unroll
+      for (int d=0;d<box_t::numDims;d++) {
+        box.lower[d] = decode(lower[d]);
+        box.upper[d] = decode(upper[d]);
+      }
+      return box;
+    }
+    
+    template<typename box_t>
+    inline int32_t AtomicBox<box_t>::encode(float f)
+    {
+      const int32_t sign = 0x80000000;
+      int32_t bits = __float_as_int(f);
+      if (bits & sign) bits ^= 0x7fffffff;
+      return bits;
+    }
+      
+    template<typename box_t>
+    inline float AtomicBox<box_t>::decode(int32_t bits)
+    {
+      const int32_t sign = 0x80000000;
+      if (bits & sign) bits ^= 0x7fffffff;
+      return __int_as_float(bits);
+    }
+    
+    template<typename box_t>
+    inline void AtomicBox<box_t>::set_empty()
+    {
+#pragma unroll
+      for (int d=0;d<box_t::numDims;d++) {
+        lower[d] = encode(+FLT_MAX);
+        upper[d] = encode(-FLT_MAX);
+      }
+    }
+
+    template<typename box_t> inline __device__
+    void atomic_grow(AtomicBox<box_t> &abox, const typename box_t::vec_t &other)
+    {
+#pragma unroll
+      for (int d=0;d<box_t::numDims;d++) {
+        const int32_t enc = AtomicBox<box_t>::encode(other[d]);//get(other,d));
+        if (enc < abox.lower[d])
+          atomicMin(&abox.lower[d],enc);
+        if (enc > abox.upper[d])
+          atomicMax(&abox.upper[d],enc);
+      }
+    } 
+    
+    template<typename box_t>
+    inline void atomic_grow(AtomicBox<box_t> &abox, const box_t &other)
+    {
+#pragma unroll
+      for (int d=0;d<box_t::numDims;d++) {
+        const int32_t enc_lower = AtomicBox<box_t>::encode(other.get_lower(d));
+        const int32_t enc_upper = AtomicBox<box_t>::encode(other.get_upper(d));
+        if (enc_lower < abox.lower[d]) atomicMin(&abox.lower[d],enc_lower);
+        if (enc_upper > abox.upper[d]) atomicMax(&abox.upper[d],enc_upper);
+      }
+    }
+
+    template<typename box_t>
+    inline void atomic_grow(AtomicBox<box_t> &abox, const AtomicBox<box_t> &other)
+    {
+#pragma unroll
+      for (int d=0;d<box_t::numDims;d++) {
+        const int32_t enc_lower = other.lower[d];
+        const int32_t enc_upper = other.upper[d];
+        if (enc_lower < abox.lower[d]) atomicMin(&abox.lower[d],enc_lower);
+        if (enc_upper > abox.upper[d]) atomicMax(&abox.upper[d],enc_upper);
+      }
+    }
+    
+
+    // ==================================================================
+    // internal states
+    // ==================================================================
+    struct BuildState {
+      uint32_t  numNodes;
+    };
+    
+    struct PrimState {
+      union {
+        /* careful with this order - this is intentionally chosen such
+           that all item with nodeID==-1 will end up at the end of the
+           list; and all others will be sorted by nodeID */
+        struct {
+          uint64_t primID:31; //!< prim we're talking about
+          uint64_t done  : 1;
+          uint64_t nodeID:32; //!< node the given prim is (currently) in.
+        };
+        uint64_t bits;
+      };
+    };
+
+    template<typename T, int D>
+    struct CUBQL_ALIGN(16) TempNode {
+      using box_t = cuBQL::box_t<T,D>;
+      union {
+        struct {
+          AtomicBox<box_t> centBounds;
+          uint32_t         count;
+          uint32_t         unused;
+        } openBranch;
+        struct {
+          uint32_t offset;
+          int      dim;
+          uint32_t tieBreaker;
+          float    pos;
+        } openNode;
+        struct {
+          uint32_t offset;
+          uint32_t count;
+          uint32_t unused[2];
+        } doneNode;
+      };
+    };
+
+    template<typename T, int D>
+    void initState(int tid,
+                   BuildState      *buildState,
+                   NodeState       *nodeStates,
+                   TempNode<T,D> *nodes)
+    {
+      if (tid >= 1) return;
+      
+      buildState->numNodes = 2;
+      
+      nodeStates[0]             = OPEN_BRANCH;
+      nodes[0].openBranch.count = 0;
+      nodes[0].openBranch.centBounds.set_empty();
+
+      nodeStates[1]            = DONE_NODE;
+      nodes[1].doneNode.offset = 0;
+      nodes[1].doneNode.count  = 0;
+    }
+
+    
+    /*! openmp based builder with #pragma omp target directives. */
+    template<typename T, int D>
+    inline
+    void build_omp_target_impl(BinaryBVH<T,D>   &bvh,
+                               /*! array of bounding boxes to build BVH over,
+                                 must be in target device memory (ie, must be
+                                 accessible in the device(gpuID) that the
+                                 'gpuID' parameter refers to */
+                               const box_t<T,D> *d_boxes,
+                               uint32_t          numPrims,
+                               BuildConfig       buildConfig,
+                               int               gpuID)
+    {
+      TempNode<T,D> *tempNodes = 0;
+      NodeState     *nodeStates = 0;
+      PrimState     *primStates = 0;
+      BuildState    *buildState = 0;
+      _ALLOC(tempNodes,2*numPrims,gpuID);
+      _ALLOC(nodeStates,2*numPrims,gpuID);
+      _ALLOC(primStates,numPrims,gpuID);
+      _ALLOC(buildState,1,gpuID);
+#pragma omp target device(gpuID) \
+  is_device_ptr(buildState)      \
+  is_device_ptr(nodeStates)      \
+  is_device_ptr(tempNodes)
+#pragma omp teams distribute parallel for
+      for (int i=0;i<1;i++)
+        initState(i,
+                  buildState,
+                  nodeStates,
+                  tempNodes);
+    
+    }
+  }
+    
+  
+  /*! openmp based builder with #pragma omp target directives. */
+  template<typename T, int D>
+  inline
+  void build_omp_target(BinaryBVH<T,D>   &bvh,
+                        /*! array of bounding boxes to build BVH over,
+                          must be in target device memory (ie, must be
+                          accessible in the device(gpuID) that the
+                          'gpuID' parameter refers to */
+                        const box_t<T,D> *d_boxes,
+                        uint32_t          numBoxes,
+                        BuildConfig       buildConfig,
+                        int               gpuID)
+  {
+    omp::build_omp_target_impl(bvh,d_boxes,numBoxes,buildConfig,gpuID);
+  }
+
+} // ::cuBQL
+
+

From b48290f7e983d9d8001fcb792fd87d283424c339 Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Sat, 17 Jan 2026 14:54:53 -0700
Subject: [PATCH 2/7] added default constructor for cuBQL::ray_t, so it can be
 used in structs

---
 cuBQL/math/Ray.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cuBQL/math/Ray.h b/cuBQL/math/Ray.h
index ae77c86..2ac64c2 100644
--- a/cuBQL/math/Ray.h
+++ b/cuBQL/math/Ray.h
@@ -16,6 +16,8 @@ namespace cuBQL {
   struct ray_t {
     using vec3 = vec_t<T,3>;
     
+    __cubql_both ray_t() = default;
+    __cubql_both ray_t(const ray_t &) = default;
     __cubql_both ray_t(vec3 org, vec3 dir, T tMin, T tMax);
     __cubql_both ray_t(vec3 org, vec3 dir);
     vec3 origin;

From 4968a23571ef0875be2b97c67ab39a8be870d687 Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Sat, 17 Jan 2026 15:32:39 -0700
Subject: [PATCH 3/7] added shorthand cuBQL::bvh3d typedef

---
 cuBQL/bvh.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cuBQL/bvh.h b/cuBQL/bvh.h
index effb41a..aeda611 100644
--- a/cuBQL/bvh.h
+++ b/cuBQL/bvh.h
@@ -140,6 +140,7 @@ namespace cuBQL {
   // easy short-hand - though cubql also supports other types of bvhs,
   // scalars, etc, this will likely be the most commonly used one.
   using bvh3f = BinaryBVH<float,3>;
+  using bvh3d = BinaryBVH<double,3>;
 
 #ifdef __CUDACC__
   typedef BinaryBVH<float,2> bvh_float2;

From 08515afdecc512af56488aae37ac3479b0176129 Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Sat, 17 Jan 2026 17:46:38 -0700
Subject: [PATCH 4/7] bugfix in two-level ray traversal

---
 cuBQL/math/affine.h                   |  5 ++--
 cuBQL/math/linear.h                   | 18 +++++++++++---
 cuBQL/math/vec.h                      |  4 ++-
 cuBQL/queries/triangleData/Triangle.h |  7 +++++-
 cuBQL/traversal/rayQueries.h          | 35 ++++++++++++++++++---------
 5 files changed, 50 insertions(+), 19 deletions(-)

diff --git a/cuBQL/math/affine.h b/cuBQL/math/affine.h
index 1a24ca5..c3718bc 100644
--- a/cuBQL/math/affine.h
+++ b/cuBQL/math/affine.h
@@ -164,6 +164,7 @@ namespace cuBQL {
   typename AffineSpaceT<L>::vector_t xfmPoint(const AffineSpaceT<L>& m,
                                              const typename AffineSpaceT<L>::vector_t &p)
   {
+    using vector_t = AffineSpaceT<L>::vector_t;
     return madd(vector_t(p.x),m.l.vx,
                 madd(vector_t(p.y),m.l.vy,
                      madd(vector_t(p.z),m.l.vz,
@@ -185,11 +186,11 @@ namespace cuBQL {
   /// Comparison Operators
   ////////////////////////////////////////////////////////////////////////////////
 
-  template<typename L> inline
+  template<typename L> inline __cubql_both
   bool operator ==( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b )
   { return a.l == b.l && a.p == b.p; }
   
-  template<typename L> inline
+  template<typename L> inline __cubql_both
   bool operator !=( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b )
   { return a.l != b.l || a.p != b.p; }
 
diff --git a/cuBQL/math/linear.h b/cuBQL/math/linear.h
index 9628990..5e7c57f 100644
--- a/cuBQL/math/linear.h
+++ b/cuBQL/math/linear.h
@@ -132,8 +132,13 @@ namespace cuBQL {
   /// Comparison Operators
   ////////////////////////////////////////////////////////////////////////////////
 
-  template<typename T> inline __cubql_both bool operator ==( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx == b.vx && a.vy == b.vy; }
-  template<typename T> inline __cubql_both bool operator !=( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx != b.vx || a.vy != b.vy; }
+  template<typename T> inline __cubql_both
+  bool operator ==( const LinearSpace2<T>& a, const LinearSpace2<T>& b )
+  { return a.vx == b.vx && a.vy == b.vy; }
+  
+  template<typename T> inline __cubql_both
+  bool operator !=( const LinearSpace2<T>& a, const LinearSpace2<T>& b )
+  { return a.vx != b.vx || a.vy != b.vy; }
 
   ////////////////////////////////////////////////////////////////////////////////
   /// Output Operators
@@ -307,8 +312,13 @@ namespace cuBQL {
   /// Comparison Operators
   ////////////////////////////////////////////////////////////////////////////////
 
-  template<typename T> inline bool operator ==( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; }
-  template<typename T> inline bool operator !=( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; }
+  template<typename T> inline __cubql_both
+  bool operator ==( const LinearSpace3<T>& a, const LinearSpace3<T>& b )
+  { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; }
+  
+  template<typename T> inline __cubql_both
+  bool operator !=( const LinearSpace3<T>& a, const LinearSpace3<T>& b )
+  { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; }
 
   ////////////////////////////////////////////////////////////////////////////////
   /// Output Operators
diff --git a/cuBQL/math/vec.h b/cuBQL/math/vec.h
index 474a110..10ffdb7 100644
--- a/cuBQL/math/vec.h
+++ b/cuBQL/math/vec.h
@@ -169,7 +169,9 @@ namespace cuBQL {
     using vec_t_data<T,3>::y;
     using vec_t_data<T,3>::z;
 
-    inline __cubql_both vec_t() {}
+    inline vec_t() = default;
+    inline vec_t(const vec_t &) = default;
+    // inline __cubql_both vec_t() {}
     inline __cubql_both vec_t(const T &t) { x = y = z = t; }
     inline __cubql_both vec_t(T x, T y, T z)
     { this->x = x; this->y = y; this->z = z; }
diff --git a/cuBQL/queries/triangleData/Triangle.h b/cuBQL/queries/triangleData/Triangle.h
index c8fb104..84b5166 100644
--- a/cuBQL/queries/triangleData/Triangle.h
+++ b/cuBQL/queries/triangleData/Triangle.h
@@ -34,7 +34,12 @@ namespace cuBQL {
   {
     using vec3 = vec_t<T,3>;
     using box3 = box_t<T,3>;
-    
+
+    inline __cubql_both triangle_t() = default;
+    inline __cubql_both triangle_t(vec3 a, vec3 b, vec3 c)
+      : a(a), b(b), c(c)
+    {}
+    inline __cubql_both triangle_t(const triangle_t &) = default;
     inline __cubql_both box3 bounds() const;
     inline __cubql_both vec3 sample(float u, float v) const;
     inline __cubql_both vec3 normal() const;
diff --git a/cuBQL/traversal/rayQueries.h b/cuBQL/traversal/rayQueries.h
index 9ee0067..41873c5 100644
--- a/cuBQL/traversal/rayQueries.h
+++ b/cuBQL/traversal/rayQueries.h
@@ -168,6 +168,21 @@ namespace cuBQL {
     return tin <= tout;
   }
 
+  template<typename T>
+  inline __cubql_both
+  void rayBoxTest(T &tin, T &tout,
+                  ray_t<T> ray, box_t<T,3> box)
+  {
+    using vec3 = vec_t<T,3>;
+    vec3 inv = rcp(ray.direction);
+    vec3 lo = (box.lower - ray.origin) * inv;
+    vec3 hi = (box.upper - ray.origin) * inv;
+    vec3 nr = min(lo,hi);
+    vec3 fr = max(lo,hi);
+    tin  = max(ray.tMin,reduce_max(nr));
+    tout = min(ray.tMax,reduce_min(fr));
+  }
+
   template<typename T>
   inline __cubql_both
   bool rayIntersectsBox(float &ret_t0,
@@ -462,7 +477,7 @@ namespace cuBQL {
 
     node_t   *tlasSavedNodePtr = 0;
     uint32_t *tlasSavedPrimIDs = 0;
-    vec3f saved_dir, saved_org;
+    vec_t<T,3> saved_dir, saved_org;
     
     if (ray.direction.x == (T)0) ray.direction.x = T(1e-20);
     if (ray.direction.y == (T)0) ray.direction.y = T(1e-20);
@@ -512,6 +527,9 @@ namespace cuBQL {
           enterBlas(transformed_ray,blas,instID);
           ray.origin = transformed_ray.origin;
           ray.direction = transformed_ray.direction;
+          if (ray.direction.x == (T)0) ray.direction.x = T(1e-20);
+          if (ray.direction.y == (T)0) ray.direction.y = T(1e-20);
+          if (ray.direction.z == (T)0) ray.direction.z = T(1e-20);
           rcp_dir = rcp(ray.direction);
           bvh.nodes     = blas.nodes;
           bvh.primIDs   = blas.primIDs;
@@ -531,10 +549,13 @@ namespace cuBQL {
         bool o0 = rayIntersectsBox(node_t0,ray,rcp_dir,n0.bounds);
         bool o1 = rayIntersectsBox(node_t1,ray,rcp_dir,n1.bounds);
 
-        if (dbg)
+        if (dbg) {
+          dout << " node L " << n0.bounds << "\n";
+          dout << " node R " << n1.bounds << "\n";
           printf("children L hit %i dist %f R hit %i dist %f\n",
                  int(o0),node_t0,
                  int(o1),node_t1);
+        }
         if (o0) {
           if (o1) {
             if (stackPtr-traversalStack >= STACK_DEPTH) {
@@ -620,15 +641,7 @@ namespace cuBQL {
       for (int i=0;i<count;i++) { 
         int primIdx = offset+i;
         if (primIDs) primIdx = primIDs[primIdx];
-        ray.tMax = min(ray.tMax,intersectPrim(offset+i));
-          
-      // if (primIDs == nullptr) {
-      //   if (dbg) printf("leaf %p offset %i\n",bvh.primIDs,(int)offset);
-      //   for (int i=0;i<count;i++) 
-      //     ray.tMax = min(ray.tMax,intersectPrim(offset+i));
-      // } else {
-      //   for (int i=0;i<count;i++) 
-      //     ray.tMax = min(ray.tMax,intersectPrim(primIDs[offset+i]));
+        ray.tMax = min(ray.tMax,intersectPrim(primIdx));
       }
       if (dbg) printf("LEAVING LEAF! t = %f\n",ray.tMax);
       return ray.tMax;

From fe24fbc42c2f109ffb37f55720bdaffd4078804c Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Sat, 17 Jan 2026 17:50:11 -0700
Subject: [PATCH 5/7] various warning fixes

---
 cuBQL/math/Ray.h                      | 4 ++--
 cuBQL/math/affine.h                   | 2 +-
 cuBQL/queries/triangleData/Triangle.h | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cuBQL/math/Ray.h b/cuBQL/math/Ray.h
index 2ac64c2..b5379d8 100644
--- a/cuBQL/math/Ray.h
+++ b/cuBQL/math/Ray.h
@@ -16,8 +16,8 @@ namespace cuBQL {
   struct ray_t {
     using vec3 = vec_t<T,3>;
     
-    __cubql_both ray_t() = default;
-    __cubql_both ray_t(const ray_t &) = default;
+    inline ray_t() = default;
+    inline ray_t(const ray_t &) = default;
     __cubql_both ray_t(vec3 org, vec3 dir, T tMin, T tMax);
     __cubql_both ray_t(vec3 org, vec3 dir);
     vec3 origin;
diff --git a/cuBQL/math/affine.h b/cuBQL/math/affine.h
index c3718bc..7655bef 100644
--- a/cuBQL/math/affine.h
+++ b/cuBQL/math/affine.h
@@ -164,7 +164,7 @@ namespace cuBQL {
   typename AffineSpaceT<L>::vector_t xfmPoint(const AffineSpaceT<L>& m,
                                              const typename AffineSpaceT<L>::vector_t &p)
   {
-    using vector_t = AffineSpaceT<L>::vector_t;
+    using vector_t = typename AffineSpaceT<L>::vector_t;
     return madd(vector_t(p.x),m.l.vx,
                 madd(vector_t(p.y),m.l.vy,
                      madd(vector_t(p.z),m.l.vz,
diff --git a/cuBQL/queries/triangleData/Triangle.h b/cuBQL/queries/triangleData/Triangle.h
index 84b5166..5f07f66 100644
--- a/cuBQL/queries/triangleData/Triangle.h
+++ b/cuBQL/queries/triangleData/Triangle.h
@@ -35,11 +35,11 @@ namespace cuBQL {
     using vec3 = vec_t<T,3>;
     using box3 = box_t<T,3>;
 
-    inline __cubql_both triangle_t() = default;
+    inline triangle_t() = default;
+    inline triangle_t(const triangle_t &) = default;
     inline __cubql_both triangle_t(vec3 a, vec3 b, vec3 c)
       : a(a), b(b), c(c)
     {}
-    inline __cubql_both triangle_t(const triangle_t &) = default;
     inline __cubql_both box3 bounds() const;
     inline __cubql_both vec3 sample(float u, float v) const;
     inline __cubql_both vec3 normal() const;

From 3d6a85b16e11a9348349313328e863b809228dc3 Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Sat, 17 Jan 2026 17:58:21 -0700
Subject: [PATCH 6/7] removed openmp experiments

---
 cuBQL/builder/openmp.h                  |  26 --
 cuBQL/builder/openmp/build_omp_target.h | 300 ------------------------
 cuBQL/traversal/rayQueries.h            |   4 +-
 3 files changed, 2 insertions(+), 328 deletions(-)
 delete mode 100644 cuBQL/builder/openmp.h
 delete mode 100644 cuBQL/builder/openmp/build_omp_target.h

diff --git a/cuBQL/builder/openmp.h b/cuBQL/builder/openmp.h
deleted file mode 100644
index 396729d..0000000
--- a/cuBQL/builder/openmp.h
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include <omp.h>
-
-namespace cuBQL {
-
-  /*! openmp based builder with #pragma omp target directives. */
-  template<typename T, int D>
-  void build_omp_target(BinaryBVH<T,D>   &bvh,
-                        /*! array of bounding boxes to build BVH over,
-                          must be in target device memory (ie, must be
-                          accessible in the device(gpuID) that the
-                          'gpuID' parameter refers to */
-                        const box_t<T,D> *d_boxes,
-                        uint32_t          numBoxes,
-                        BuildConfig       buildConfig=BuildConfig(),
-                        int               gpuID = 0);
-}
-#if CUBQL_OPENMP_BUILDER_IMPLEMENTATION
-# include "openmp/build_omp_target.h"
-#endif
-
-
diff --git a/cuBQL/builder/openmp/build_omp_target.h b/cuBQL/builder/openmp/build_omp_target.h
deleted file mode 100644
index f53a47c..0000000
--- a/cuBQL/builder/openmp/build_omp_target.h
+++ /dev/null
@@ -1,300 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include <omp.h>
-#include <atomic>
-
-namespace cuBQL {
-  namespace omp {
-
-    template<typename T, typename count_t>
-    inline void _ALLOC(T *&ptr, count_t count, int gpuID)
-    { ptr = (T*)omp_target_alloc(count*sizeof(T),gpuID); }
-    
-    template<typename T>
-    inline void _FREE(T *&ptr, int gpuID)
-    { omp_target_free(ptr,gpuID); ptr = 0; }
-    
-    typedef enum : int8_t { OPEN_BRANCH, OPEN_NODE, DONE_NODE } NodeState;
-    
-    // ==================================================================
-    // atomicbox
-    // ==================================================================
-    template<typename box_t>
-    struct CUBQL_ALIGN(8) AtomicBox {
-      inline bool is_empty() const { return lower[0] > upper[0]; }
-      inline void  set_empty();
-      // set_empty, in owl::common-style naming
-      inline void  clear() { set_empty(); }
-      inline float get_center(int dim) const;
-      inline box_t make_box() const;
-
-      inline float get_lower(int dim) const {
-        if (box_t::numDims>4) 
-          return decode(lower[dim]);
-        else if (box_t::numDims==4) {
-          return decode(dim>1
-                        ?((dim>2)?lower[3]:lower[2])
-                        :((dim  )?lower[1]:lower[0]));
-        } else if (box_t::numDims==3) {
-          return decode(dim>1
-                        ?lower[2]
-                        :((dim  )?lower[1]:lower[0]));
-        } else
-          return decode(lower[dim]);
-      }
-      inline float get_upper(int dim) const {
-        if (box_t::numDims>4) 
-          return decode(upper[dim]);
-        else if (box_t::numDims==4) {
-          return decode(dim>1
-                        ?((dim>2)?upper[3]:upper[2])
-                        :((dim  )?upper[1]:upper[0]));
-        } else if (box_t::numDims==3)
-          return decode(dim>1
-                        ?upper[2]
-                        :((dim  )?upper[1]:upper[0]));
-        else
-          return decode(upper[dim]);
-      }
-      
-      int32_t lower[box_t::numDims];
-      int32_t upper[box_t::numDims];
-      
-      inline static int32_t encode(float f);
-      inline static float   decode(int32_t bits);
-    };
-
-#ifdef __cplusplus > 202302L
-    inline void atomicMin(int32_t *ptr, int32_t value)
-    { if (value < *ptr) ((std::atomic<int> *)ptr)->fetch_min(value, std::memory_order::seq_cst); }
-    inline void atomicMax(int32_t *ptr, int32_t value)
-    { if (value > *ptr) ((std::atomic<int> *)ptr)->fetch_max(value, std::memory_order::seq_cst); }
-#else
-    inline void atomicMin(int32_t *ptr, int32_t value)
-    { 
-      int current = *(volatile int *)addr;
-      while (current > value) {
-        bool wasChanged
-          = ((std::atomic<int>*)addr)->compare_exchange_weak((int&)current,(int&)value);
-        if (wasChanged) break;
-      }
-    }
-    
-    inline void atomicMax(int32_t *ptr, int32_t value)
-    { 
-      int current = *(volatile int *)addr;
-      while (current < value) {
-        bool wasChanged
-          = ((std::atomic<int>*)addr)->compare_exchange_weak((int&)current,(int&)value);
-        if (wasChanged) break;
-      }
-    }
-    
-#endif
-    
-    template<typename box_t>
-    inline float AtomicBox<box_t>::get_center(int dim) const
-    {
-      return 0.5f*(get_lower(dim)+get_upper(dim));
-      // return 0.5f*(decode(lower[dim])+decode(upper[dim]));
-    }
-
-    template<typename box_t>
-    inline box_t AtomicBox<box_t>::make_box() const
-    {
-      box_t box;
-#pragma unroll
-      for (int d=0;d<box_t::numDims;d++) {
-        box.lower[d] = decode(lower[d]);
-        box.upper[d] = decode(upper[d]);
-      }
-      return box;
-    }
-    
-    template<typename box_t>
-    inline int32_t AtomicBox<box_t>::encode(float f)
-    {
-      const int32_t sign = 0x80000000;
-      int32_t bits = __float_as_int(f);
-      if (bits & sign) bits ^= 0x7fffffff;
-      return bits;
-    }
-      
-    template<typename box_t>
-    inline float AtomicBox<box_t>::decode(int32_t bits)
-    {
-      const int32_t sign = 0x80000000;
-      if (bits & sign) bits ^= 0x7fffffff;
-      return __int_as_float(bits);
-    }
-    
-    template<typename box_t>
-    inline void AtomicBox<box_t>::set_empty()
-    {
-#pragma unroll
-      for (int d=0;d<box_t::numDims;d++) {
-        lower[d] = encode(+FLT_MAX);
-        upper[d] = encode(-FLT_MAX);
-      }
-    }
-
-    template<typename box_t> inline __device__
-    void atomic_grow(AtomicBox<box_t> &abox, const typename box_t::vec_t &other)
-    {
-#pragma unroll
-      for (int d=0;d<box_t::numDims;d++) {
-        const int32_t enc = AtomicBox<box_t>::encode(other[d]);//get(other,d));
-        if (enc < abox.lower[d])
-          atomicMin(&abox.lower[d],enc);
-        if (enc > abox.upper[d])
-          atomicMax(&abox.upper[d],enc);
-      }
-    } 
-    
-    template<typename box_t>
-    inline void atomic_grow(AtomicBox<box_t> &abox, const box_t &other)
-    {
-#pragma unroll
-      for (int d=0;d<box_t::numDims;d++) {
-        const int32_t enc_lower = AtomicBox<box_t>::encode(other.get_lower(d));
-        const int32_t enc_upper = AtomicBox<box_t>::encode(other.get_upper(d));
-        if (enc_lower < abox.lower[d]) atomicMin(&abox.lower[d],enc_lower);
-        if (enc_upper > abox.upper[d]) atomicMax(&abox.upper[d],enc_upper);
-      }
-    }
-
-    template<typename box_t>
-    inline void atomic_grow(AtomicBox<box_t> &abox, const AtomicBox<box_t> &other)
-    {
-#pragma unroll
-      for (int d=0;d<box_t::numDims;d++) {
-        const int32_t enc_lower = other.lower[d];
-        const int32_t enc_upper = other.upper[d];
-        if (enc_lower < abox.lower[d]) atomicMin(&abox.lower[d],enc_lower);
-        if (enc_upper > abox.upper[d]) atomicMax(&abox.upper[d],enc_upper);
-      }
-    }
-    
-
-    // ==================================================================
-    // internal states
-    // ==================================================================
-    struct BuildState {
-      uint32_t  numNodes;
-    };
-    
-    struct PrimState {
-      union {
-        /* careful with this order - this is intentionally chosen such
-           that all item with nodeID==-1 will end up at the end of the
-           list; and all others will be sorted by nodeID */
-        struct {
-          uint64_t primID:31; //!< prim we're talking about
-          uint64_t done  : 1;
-          uint64_t nodeID:32; //!< node the given prim is (currently) in.
-        };
-        uint64_t bits;
-      };
-    };
-
-    template<typename T, int D>
-    struct CUBQL_ALIGN(16) TempNode {
-      using box_t = cuBQL::box_t<T,D>;
-      union {
-        struct {
-          AtomicBox<box_t> centBounds;
-          uint32_t         count;
-          uint32_t         unused;
-        } openBranch;
-        struct {
-          uint32_t offset;
-          int      dim;
-          uint32_t tieBreaker;
-          float    pos;
-        } openNode;
-        struct {
-          uint32_t offset;
-          uint32_t count;
-          uint32_t unused[2];
-        } doneNode;
-      };
-    };
-
-    template<typename T, int D>
-    void initState(int tid,
-                   BuildState      *buildState,
-                   NodeState       *nodeStates,
-                   TempNode<T,D> *nodes)
-    {
-      if (tid >= 1) return;
-      
-      buildState->numNodes = 2;
-      
-      nodeStates[0]             = OPEN_BRANCH;
-      nodes[0].openBranch.count = 0;
-      nodes[0].openBranch.centBounds.set_empty();
-
-      nodeStates[1]            = DONE_NODE;
-      nodes[1].doneNode.offset = 0;
-      nodes[1].doneNode.count  = 0;
-    }
-
-    
-    /*! openmp based builder with #pragma omp target directives. */
-    template<typename T, int D>
-    inline
-    void build_omp_target_impl(BinaryBVH<T,D>   &bvh,
-                               /*! array of bounding boxes to build BVH over,
-                                 must be in target device memory (ie, must be
-                                 accessible in the device(gpuID) that the
-                                 'gpuID' parameter refers to */
-                               const box_t<T,D> *d_boxes,
-                               uint32_t          numPrims,
-                               BuildConfig       buildConfig,
-                               int               gpuID)
-    {
-      TempNode<T,D> *tempNodes = 0;
-      NodeState     *nodeStates = 0;
-      PrimState     *primStates = 0;
-      BuildState    *buildState = 0;
-      _ALLOC(tempNodes,2*numPrims,gpuID);
-      _ALLOC(nodeStates,2*numPrims,gpuID);
-      _ALLOC(primStates,numPrims,gpuID);
-      _ALLOC(buildState,1,gpuID);
-#pragma omp target device(gpuID) \
-  is_device_ptr(buildState)      \
-  is_device_ptr(nodeStates)      \
-  is_device_ptr(tempNodes)
-#pragma omp teams distribute parallel for
-      for (int i=0;i<1;i++)
-        initState(i,
-                  buildState,
-                  nodeStates,
-                  tempNodes);
-    
-    }
-  }
-    
-  
-  /*! openmp based builder with #pragma omp target directives. */
-  template<typename T, int D>
-  inline
-  void build_omp_target(BinaryBVH<T,D>   &bvh,
-                        /*! array of bounding boxes to build BVH over,
-                          must be in target device memory (ie, must be
-                          accessible in the device(gpuID) that the
-                          'gpuID' parameter refers to */
-                        const box_t<T,D> *d_boxes,
-                        uint32_t          numBoxes,
-                        BuildConfig       buildConfig,
-                        int               gpuID)
-  {
-    omp::build_omp_target_impl(bvh,d_boxes,numBoxes,buildConfig,gpuID);
-  }
-
-} // ::cuBQL
-
-
diff --git a/cuBQL/traversal/rayQueries.h b/cuBQL/traversal/rayQueries.h
index 41873c5..f09bdb8 100644
--- a/cuBQL/traversal/rayQueries.h
+++ b/cuBQL/traversal/rayQueries.h
@@ -550,8 +550,8 @@ namespace cuBQL {
         bool o1 = rayIntersectsBox(node_t1,ray,rcp_dir,n1.bounds);
 
         if (dbg) {
-          dout << " node L " << n0.bounds << "\n";
-          dout << " node R " << n1.bounds << "\n";
+          // dout << " node L " << n0.bounds << "\n";
+          // dout << " node R " << n1.bounds << "\n";
           printf("children L hit %i dist %f R hit %i dist %f\n",
                  int(o0),node_t0,
                  int(o1),node_t1);

From b2b50fd086e67c2f701c631382f895b15f072c3d Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Sun, 18 Jan 2026 11:45:40 -0700
Subject: [PATCH 7/7] bugfix: sah builder now respects
 buildConfig::maxAllowedLeafSize

---
 cuBQL/builder/cuda/sah_builder.h | 50 +++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/cuBQL/builder/cuda/sah_builder.h b/cuBQL/builder/cuda/sah_builder.h
index ba68e22..dafb440 100644
--- a/cuBQL/builder/cuda/sah_builder.h
+++ b/cuBQL/builder/cuda/sah_builder.h
@@ -28,8 +28,14 @@ namespace cuBQL {
         struct {
           AtomicBox<box_t> centBounds;
           uint32_t offset;
+#if 1
+          int8_t dim;
+          int8_t bin;
+          int32_t atomicSideDeciderBits;
+#else
           int8_t   dim;
           int8_t   bin;
+#endif
         } openNode;
         struct {
           uint32_t offset;
@@ -54,7 +60,8 @@ namespace cuBQL {
     inline __device__
     void evaluateSAH(int &splitDim,
                      int &splitBin,
-                     const SAHBins<box_t> &sah)
+                     const SAHBins<box_t> &sah,
+                     int maxAllowedLeafSize)
     {
       float bestCost = CUBQL_INF;
       
@@ -69,7 +76,7 @@ namespace cuBQL {
           rAreas[b] = surfaceArea(box);
         }
         const float leafCost = rAreas[0] * rCount;
-        if (leafCost < bestCost) {
+        if (leafCost < bestCost && rCount <= maxAllowedLeafSize) {
           bestCost = leafCost;
           splitDim = -1;
         }
@@ -241,12 +248,17 @@ namespace cuBQL {
       int   splitDim = -1;
       int   splitBin;
       if (in.count > buildConfig.makeLeafThreshold) {
-        evaluateSAH(splitDim,splitBin,sah);
+        evaluateSAH(splitDim,splitBin,sah,buildConfig.maxAllowedLeafSize);
       }
-      if (splitDim < 0) {
+      bool makeLeaf
+        =  (splitDim < 0)
+        && (in.count <= buildConfig.maxAllowedLeafSize);
+      
+      if (makeLeaf) {
         nodeState   = DONE_NODE;
         auto &done  = nodes[nodeID].doneNode;
         done.count  = in.count;
+
         // set this to max-value, so the prims can later do atomicMin
         // with their position ion the leaf list; this value is
         // greater than any prim position.
@@ -295,22 +307,26 @@ namespace cuBQL {
 
       auto open = nodes[me.nodeID].openNode;
       const box_t<T,D> primBox = primBoxes[me.primID];
+      int side = -1;
+      if (open.dim == -1) {
+        // // use top bit to atomically 'sort' prims left vs right
+        // side = atomicAdd(&nodes[me.nodeID].openNode.atomicSideDeciderBits,1u<<31)>>31;
+        side = atomicAdd(&nodes[me.nodeID].openNode.atomicSideDeciderBits,1u)&1;
+      } else {
 
-      const int d = open.dim;
-      T lo = open.centBounds.get_lower(d);
-      T hi = open.centBounds.get_upper(d);
+        const int d = open.dim;
+        T lo = open.centBounds.get_lower(d);
+        T hi = open.centBounds.get_upper(d);
       
-      T prim_d = T(0.5)*(primBox.get_lower(d)+primBox.get_upper(d));
-      T rel
-        = (prim_d - lo)
-        / (hi - lo + 1e-20f);
-      int prim_bin = int(rel*(int)SAHBins<box_t<T,D>>::numBins);
-      prim_bin = max(0,min((int)SAHBins<box_t<T,D>>::numBins-1,prim_bin));
+        T prim_d = T(0.5)*(primBox.get_lower(d)+primBox.get_upper(d));
+        T rel
+          = (prim_d - lo)
+          / (hi - lo + 1e-20f);
+        int prim_bin = int(rel*(int)SAHBins<box_t<T,D>>::numBins);
+        prim_bin = max(0,min((int)SAHBins<box_t<T,D>>::numBins-1,prim_bin));
       
-      int side = (prim_bin >= open.bin);
-      // printf("updateprim %i node %i state %i dim %i bin %i -> prim bin %i -> side %i\n",
-      //        primID,me.nodeID,ns,open.dim,open.bin,
-      //        prim_bin,side);
+        side = (prim_bin >= open.bin);
+      }
       int newNodeID = open.offset+side;
       auto &myBranch = nodes[newNodeID].openBranch;
       atomicAdd(&myBranch.count,1);