From dfb5173c009bc983cffa36ee032ee59c93418678 Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Fri, 16 Jan 2026 11:29:31 -0700 Subject: [PATCH 1/7] starting on openmp build --- cuBQL/builder/openmp.h | 26 ++ cuBQL/builder/openmp/build_omp_target.h | 300 ++++++++++++++++++++++++ 2 files changed, 326 insertions(+) create mode 100644 cuBQL/builder/openmp.h create mode 100644 cuBQL/builder/openmp/build_omp_target.h diff --git a/cuBQL/builder/openmp.h b/cuBQL/builder/openmp.h new file mode 100644 index 0000000..396729d --- /dev/null +++ b/cuBQL/builder/openmp.h @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +namespace cuBQL { + + /*! openmp based builder with #pragma omp target directives. */ + template + void build_omp_target(BinaryBVH &bvh, + /*! array of bounding boxes to build BVH over, + must be in target device memory (ie, must be + accessible in the device(gpuID) that the + 'gpuID' parameter refers to */ + const box_t *d_boxes, + uint32_t numBoxes, + BuildConfig buildConfig=BuildConfig(), + int gpuID = 0); +} +#if CUBQL_OPENMP_BUILDER_IMPLEMENTATION +# include "openmp/build_omp_target.h" +#endif + + diff --git a/cuBQL/builder/openmp/build_omp_target.h b/cuBQL/builder/openmp/build_omp_target.h new file mode 100644 index 0000000..f53a47c --- /dev/null +++ b/cuBQL/builder/openmp/build_omp_target.h @@ -0,0 +1,300 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +namespace cuBQL { + namespace omp { + + template + inline void _ALLOC(T *&ptr, count_t count, int gpuID) + { ptr = (T*)omp_target_alloc(count*sizeof(T),gpuID); } + + template + inline void _FREE(T *&ptr, int gpuID) + { omp_target_free(ptr,gpuID); ptr = 0; } + + typedef enum : int8_t { OPEN_BRANCH, OPEN_NODE, DONE_NODE } NodeState; + + // ================================================================== + // atomicbox + // ================================================================== + template + struct CUBQL_ALIGN(8) AtomicBox { + inline bool is_empty() const { return lower[0] > upper[0]; } + inline void set_empty(); + // set_empty, in owl::common-style naming + inline void clear() { set_empty(); } + inline float get_center(int dim) const; + inline box_t make_box() const; + + inline float get_lower(int dim) const { + if (box_t::numDims>4) + return decode(lower[dim]); + else if (box_t::numDims==4) { + return decode(dim>1 + ?((dim>2)?lower[3]:lower[2]) + :((dim )?lower[1]:lower[0])); + } else if (box_t::numDims==3) { + return decode(dim>1 + ?lower[2] + :((dim )?lower[1]:lower[0])); + } else + return decode(lower[dim]); + } + inline float get_upper(int dim) const { + if (box_t::numDims>4) + return decode(upper[dim]); + else if (box_t::numDims==4) { + return decode(dim>1 + ?((dim>2)?upper[3]:upper[2]) + :((dim )?upper[1]:upper[0])); + } else if (box_t::numDims==3) + return decode(dim>1 + ?upper[2] + :((dim )?upper[1]:upper[0])); + else + return decode(upper[dim]); + } + + int32_t lower[box_t::numDims]; + int32_t upper[box_t::numDims]; + + inline static int32_t encode(float f); + inline static float decode(int32_t bits); + }; + +#ifdef __cplusplus > 202302L + inline void atomicMin(int32_t *ptr, int32_t value) + { if (value < *ptr) ((std::atomic *)ptr)->fetch_min(value, std::memory_order::seq_cst); } + inline void atomicMax(int32_t *ptr, int32_t value) + { if (value > *ptr) ((std::atomic *)ptr)->fetch_max(value, std::memory_order::seq_cst); } +#else + inline void atomicMin(int32_t *ptr, int32_t value) + { + int current = *(volatile int *)addr; + while (current > value) { + bool wasChanged + = ((std::atomic*)addr)->compare_exchange_weak((int&)current,(int&)value); + if (wasChanged) break; + } + } + + inline void atomicMax(int32_t *ptr, int32_t value) + { + int current = *(volatile int *)addr; + while (current < value) { + bool wasChanged + = ((std::atomic*)addr)->compare_exchange_weak((int&)current,(int&)value); + if (wasChanged) break; + } + } + +#endif + + template + inline float AtomicBox::get_center(int dim) const + { + return 0.5f*(get_lower(dim)+get_upper(dim)); + // return 0.5f*(decode(lower[dim])+decode(upper[dim])); + } + + template + inline box_t AtomicBox::make_box() const + { + box_t box; +#pragma unroll + for (int d=0;d + inline int32_t AtomicBox::encode(float f) + { + const int32_t sign = 0x80000000; + int32_t bits = __float_as_int(f); + if (bits & sign) bits ^= 0x7fffffff; + return bits; + } + + template + inline float AtomicBox::decode(int32_t bits) + { + const int32_t sign = 0x80000000; + if (bits & sign) bits ^= 0x7fffffff; + return __int_as_float(bits); + } + + template + inline void AtomicBox::set_empty() + { +#pragma unroll + for (int d=0;d inline __device__ + void atomic_grow(AtomicBox &abox, const typename box_t::vec_t &other) + { +#pragma unroll + for (int d=0;d::encode(other[d]);//get(other,d)); + if (enc < abox.lower[d]) + atomicMin(&abox.lower[d],enc); + if (enc > abox.upper[d]) + atomicMax(&abox.upper[d],enc); + } + } + + template + inline void atomic_grow(AtomicBox &abox, const box_t &other) + { +#pragma unroll + for (int d=0;d::encode(other.get_lower(d)); + const int32_t enc_upper = AtomicBox::encode(other.get_upper(d)); + if (enc_lower < abox.lower[d]) atomicMin(&abox.lower[d],enc_lower); + if (enc_upper > abox.upper[d]) atomicMax(&abox.upper[d],enc_upper); + } + } + + template + inline void atomic_grow(AtomicBox &abox, const AtomicBox &other) + { +#pragma unroll + for (int d=0;d abox.upper[d]) atomicMax(&abox.upper[d],enc_upper); + } + } + + + // ================================================================== + // internal states + // ================================================================== + struct BuildState { + uint32_t numNodes; + }; + + struct PrimState { + union { + /* careful with this order - this is intentionally chosen such + that all item with nodeID==-1 will end up at the end of the + list; and all others will be sorted by nodeID */ + struct { + uint64_t primID:31; //!< prim we're talking about + uint64_t done : 1; + uint64_t nodeID:32; //!< node the given prim is (currently) in. + }; + uint64_t bits; + }; + }; + + template + struct CUBQL_ALIGN(16) TempNode { + using box_t = cuBQL::box_t; + union { + struct { + AtomicBox centBounds; + uint32_t count; + uint32_t unused; + } openBranch; + struct { + uint32_t offset; + int dim; + uint32_t tieBreaker; + float pos; + } openNode; + struct { + uint32_t offset; + uint32_t count; + uint32_t unused[2]; + } doneNode; + }; + }; + + template + void initState(int tid, + BuildState *buildState, + NodeState *nodeStates, + TempNode *nodes) + { + if (tid >= 1) return; + + buildState->numNodes = 2; + + nodeStates[0] = OPEN_BRANCH; + nodes[0].openBranch.count = 0; + nodes[0].openBranch.centBounds.set_empty(); + + nodeStates[1] = DONE_NODE; + nodes[1].doneNode.offset = 0; + nodes[1].doneNode.count = 0; + } + + + /*! openmp based builder with #pragma omp target directives. */ + template + inline + void build_omp_target_impl(BinaryBVH &bvh, + /*! array of bounding boxes to build BVH over, + must be in target device memory (ie, must be + accessible in the device(gpuID) that the + 'gpuID' parameter refers to */ + const box_t *d_boxes, + uint32_t numPrims, + BuildConfig buildConfig, + int gpuID) + { + TempNode *tempNodes = 0; + NodeState *nodeStates = 0; + PrimState *primStates = 0; + BuildState *buildState = 0; + _ALLOC(tempNodes,2*numPrims,gpuID); + _ALLOC(nodeStates,2*numPrims,gpuID); + _ALLOC(primStates,numPrims,gpuID); + _ALLOC(buildState,1,gpuID); +#pragma omp target device(gpuID) \ + is_device_ptr(buildState) \ + is_device_ptr(nodeStates) \ + is_device_ptr(tempNodes) +#pragma omp teams distribute parallel for + for (int i=0;i<1;i++) + initState(i, + buildState, + nodeStates, + tempNodes); + + } + } + + + /*! openmp based builder with #pragma omp target directives. */ + template + inline + void build_omp_target(BinaryBVH &bvh, + /*! array of bounding boxes to build BVH over, + must be in target device memory (ie, must be + accessible in the device(gpuID) that the + 'gpuID' parameter refers to */ + const box_t *d_boxes, + uint32_t numBoxes, + BuildConfig buildConfig, + int gpuID) + { + omp::build_omp_target_impl(bvh,d_boxes,numBoxes,buildConfig,gpuID); + } + +} // ::cuBQL + + From b48290f7e983d9d8001fcb792fd87d283424c339 Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Sat, 17 Jan 2026 14:54:53 -0700 Subject: [PATCH 2/7] added default constructor for cuBQL::ray_t, so it can be used in structs --- cuBQL/math/Ray.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cuBQL/math/Ray.h b/cuBQL/math/Ray.h index ae77c86..2ac64c2 100644 --- a/cuBQL/math/Ray.h +++ b/cuBQL/math/Ray.h @@ -16,6 +16,8 @@ namespace cuBQL { struct ray_t { using vec3 = vec_t; + __cubql_both ray_t() = default; + __cubql_both ray_t(const ray_t &) = default; __cubql_both ray_t(vec3 org, vec3 dir, T tMin, T tMax); __cubql_both ray_t(vec3 org, vec3 dir); vec3 origin; From 4968a23571ef0875be2b97c67ab39a8be870d687 Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Sat, 17 Jan 2026 15:32:39 -0700 Subject: [PATCH 3/7] added shorthand cuBQL::bvh3d typedef --- cuBQL/bvh.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cuBQL/bvh.h b/cuBQL/bvh.h index effb41a..aeda611 100644 --- a/cuBQL/bvh.h +++ b/cuBQL/bvh.h @@ -140,6 +140,7 @@ namespace cuBQL { // easy short-hand - though cubql also supports other types of bvhs, // scalars, etc, this will likely be the most commonly used one. using bvh3f = BinaryBVH; + using bvh3d = BinaryBVH; #ifdef __CUDACC__ typedef BinaryBVH bvh_float2; From 08515afdecc512af56488aae37ac3479b0176129 Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Sat, 17 Jan 2026 17:46:38 -0700 Subject: [PATCH 4/7] bugfix in two-level ray traversal --- cuBQL/math/affine.h | 5 ++-- cuBQL/math/linear.h | 18 +++++++++++--- cuBQL/math/vec.h | 4 ++- cuBQL/queries/triangleData/Triangle.h | 7 +++++- cuBQL/traversal/rayQueries.h | 35 ++++++++++++++++++--------- 5 files changed, 50 insertions(+), 19 deletions(-) diff --git a/cuBQL/math/affine.h b/cuBQL/math/affine.h index 1a24ca5..c3718bc 100644 --- a/cuBQL/math/affine.h +++ b/cuBQL/math/affine.h @@ -164,6 +164,7 @@ namespace cuBQL { typename AffineSpaceT::vector_t xfmPoint(const AffineSpaceT& m, const typename AffineSpaceT::vector_t &p) { + using vector_t = AffineSpaceT::vector_t; return madd(vector_t(p.x),m.l.vx, madd(vector_t(p.y),m.l.vy, madd(vector_t(p.z),m.l.vz, @@ -185,11 +186,11 @@ namespace cuBQL { /// Comparison Operators //////////////////////////////////////////////////////////////////////////////// - template inline + template inline __cubql_both bool operator ==( const AffineSpaceT& a, const AffineSpaceT& b ) { return a.l == b.l && a.p == b.p; } - template inline + template inline __cubql_both bool operator !=( const AffineSpaceT& a, const AffineSpaceT& b ) { return a.l != b.l || a.p != b.p; } diff --git a/cuBQL/math/linear.h b/cuBQL/math/linear.h index 9628990..5e7c57f 100644 --- a/cuBQL/math/linear.h +++ b/cuBQL/math/linear.h @@ -132,8 +132,13 @@ namespace cuBQL { /// Comparison Operators //////////////////////////////////////////////////////////////////////////////// - template inline __cubql_both bool operator ==( const LinearSpace2& a, const LinearSpace2& b ) { return a.vx == b.vx && a.vy == b.vy; } - template inline __cubql_both bool operator !=( const LinearSpace2& a, const LinearSpace2& b ) { return a.vx != b.vx || a.vy != b.vy; } + template inline __cubql_both + bool operator ==( const LinearSpace2& a, const LinearSpace2& b ) + { return a.vx == b.vx && a.vy == b.vy; } + + template inline __cubql_both + bool operator !=( const LinearSpace2& a, const LinearSpace2& b ) + { return a.vx != b.vx || a.vy != b.vy; } //////////////////////////////////////////////////////////////////////////////// /// Output Operators @@ -307,8 +312,13 @@ namespace cuBQL { /// Comparison Operators //////////////////////////////////////////////////////////////////////////////// - template inline bool operator ==( const LinearSpace3& a, const LinearSpace3& b ) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; } - template inline bool operator !=( const LinearSpace3& a, const LinearSpace3& b ) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; } + template inline __cubql_both + bool operator ==( const LinearSpace3& a, const LinearSpace3& b ) + { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; } + + template inline __cubql_both + bool operator !=( const LinearSpace3& a, const LinearSpace3& b ) + { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; } //////////////////////////////////////////////////////////////////////////////// /// Output Operators diff --git a/cuBQL/math/vec.h b/cuBQL/math/vec.h index 474a110..10ffdb7 100644 --- a/cuBQL/math/vec.h +++ b/cuBQL/math/vec.h @@ -169,7 +169,9 @@ namespace cuBQL { using vec_t_data::y; using vec_t_data::z; - inline __cubql_both vec_t() {} + inline vec_t() = default; + inline vec_t(const vec_t &) = default; + // inline __cubql_both vec_t() {} inline __cubql_both vec_t(const T &t) { x = y = z = t; } inline __cubql_both vec_t(T x, T y, T z) { this->x = x; this->y = y; this->z = z; } diff --git a/cuBQL/queries/triangleData/Triangle.h b/cuBQL/queries/triangleData/Triangle.h index c8fb104..84b5166 100644 --- a/cuBQL/queries/triangleData/Triangle.h +++ b/cuBQL/queries/triangleData/Triangle.h @@ -34,7 +34,12 @@ namespace cuBQL { { using vec3 = vec_t; using box3 = box_t; - + + inline __cubql_both triangle_t() = default; + inline __cubql_both triangle_t(vec3 a, vec3 b, vec3 c) + : a(a), b(b), c(c) + {} + inline __cubql_both triangle_t(const triangle_t &) = default; inline __cubql_both box3 bounds() const; inline __cubql_both vec3 sample(float u, float v) const; inline __cubql_both vec3 normal() const; diff --git a/cuBQL/traversal/rayQueries.h b/cuBQL/traversal/rayQueries.h index 9ee0067..41873c5 100644 --- a/cuBQL/traversal/rayQueries.h +++ b/cuBQL/traversal/rayQueries.h @@ -168,6 +168,21 @@ namespace cuBQL { return tin <= tout; } + template + inline __cubql_both + void rayBoxTest(T &tin, T &tout, + ray_t ray, box_t box) + { + using vec3 = vec_t; + vec3 inv = rcp(ray.direction); + vec3 lo = (box.lower - ray.origin) * inv; + vec3 hi = (box.upper - ray.origin) * inv; + vec3 nr = min(lo,hi); + vec3 fr = max(lo,hi); + tin = max(ray.tMin,reduce_max(nr)); + tout = min(ray.tMax,reduce_min(fr)); + } + template inline __cubql_both bool rayIntersectsBox(float &ret_t0, @@ -462,7 +477,7 @@ namespace cuBQL { node_t *tlasSavedNodePtr = 0; uint32_t *tlasSavedPrimIDs = 0; - vec3f saved_dir, saved_org; + vec_t saved_dir, saved_org; if (ray.direction.x == (T)0) ray.direction.x = T(1e-20); if (ray.direction.y == (T)0) ray.direction.y = T(1e-20); @@ -512,6 +527,9 @@ namespace cuBQL { enterBlas(transformed_ray,blas,instID); ray.origin = transformed_ray.origin; ray.direction = transformed_ray.direction; + if (ray.direction.x == (T)0) ray.direction.x = T(1e-20); + if (ray.direction.y == (T)0) ray.direction.y = T(1e-20); + if (ray.direction.z == (T)0) ray.direction.z = T(1e-20); rcp_dir = rcp(ray.direction); bvh.nodes = blas.nodes; bvh.primIDs = blas.primIDs; @@ -531,10 +549,13 @@ namespace cuBQL { bool o0 = rayIntersectsBox(node_t0,ray,rcp_dir,n0.bounds); bool o1 = rayIntersectsBox(node_t1,ray,rcp_dir,n1.bounds); - if (dbg) + if (dbg) { + dout << " node L " << n0.bounds << "\n"; + dout << " node R " << n1.bounds << "\n"; printf("children L hit %i dist %f R hit %i dist %f\n", int(o0),node_t0, int(o1),node_t1); + } if (o0) { if (o1) { if (stackPtr-traversalStack >= STACK_DEPTH) { @@ -620,15 +641,7 @@ namespace cuBQL { for (int i=0;i Date: Sat, 17 Jan 2026 17:50:11 -0700 Subject: [PATCH 5/7] various warning fixes --- cuBQL/math/Ray.h | 4 ++-- cuBQL/math/affine.h | 2 +- cuBQL/queries/triangleData/Triangle.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cuBQL/math/Ray.h b/cuBQL/math/Ray.h index 2ac64c2..b5379d8 100644 --- a/cuBQL/math/Ray.h +++ b/cuBQL/math/Ray.h @@ -16,8 +16,8 @@ namespace cuBQL { struct ray_t { using vec3 = vec_t; - __cubql_both ray_t() = default; - __cubql_both ray_t(const ray_t &) = default; + inline ray_t() = default; + inline ray_t(const ray_t &) = default; __cubql_both ray_t(vec3 org, vec3 dir, T tMin, T tMax); __cubql_both ray_t(vec3 org, vec3 dir); vec3 origin; diff --git a/cuBQL/math/affine.h b/cuBQL/math/affine.h index c3718bc..7655bef 100644 --- a/cuBQL/math/affine.h +++ b/cuBQL/math/affine.h @@ -164,7 +164,7 @@ namespace cuBQL { typename AffineSpaceT::vector_t xfmPoint(const AffineSpaceT& m, const typename AffineSpaceT::vector_t &p) { - using vector_t = AffineSpaceT::vector_t; + using vector_t = typename AffineSpaceT::vector_t; return madd(vector_t(p.x),m.l.vx, madd(vector_t(p.y),m.l.vy, madd(vector_t(p.z),m.l.vz, diff --git a/cuBQL/queries/triangleData/Triangle.h b/cuBQL/queries/triangleData/Triangle.h index 84b5166..5f07f66 100644 --- a/cuBQL/queries/triangleData/Triangle.h +++ b/cuBQL/queries/triangleData/Triangle.h @@ -35,11 +35,11 @@ namespace cuBQL { using vec3 = vec_t; using box3 = box_t; - inline __cubql_both triangle_t() = default; + inline triangle_t() = default; + inline triangle_t(const triangle_t &) = default; inline __cubql_both triangle_t(vec3 a, vec3 b, vec3 c) : a(a), b(b), c(c) {} - inline __cubql_both triangle_t(const triangle_t &) = default; inline __cubql_both box3 bounds() const; inline __cubql_both vec3 sample(float u, float v) const; inline __cubql_both vec3 normal() const; From 3d6a85b16e11a9348349313328e863b809228dc3 Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Sat, 17 Jan 2026 17:58:21 -0700 Subject: [PATCH 6/7] removed openmp experiments --- cuBQL/builder/openmp.h | 26 -- cuBQL/builder/openmp/build_omp_target.h | 300 ------------------------ cuBQL/traversal/rayQueries.h | 4 +- 3 files changed, 2 insertions(+), 328 deletions(-) delete mode 100644 cuBQL/builder/openmp.h delete mode 100644 cuBQL/builder/openmp/build_omp_target.h diff --git a/cuBQL/builder/openmp.h b/cuBQL/builder/openmp.h deleted file mode 100644 index 396729d..0000000 --- a/cuBQL/builder/openmp.h +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include - -namespace cuBQL { - - /*! openmp based builder with #pragma omp target directives. */ - template - void build_omp_target(BinaryBVH &bvh, - /*! array of bounding boxes to build BVH over, - must be in target device memory (ie, must be - accessible in the device(gpuID) that the - 'gpuID' parameter refers to */ - const box_t *d_boxes, - uint32_t numBoxes, - BuildConfig buildConfig=BuildConfig(), - int gpuID = 0); -} -#if CUBQL_OPENMP_BUILDER_IMPLEMENTATION -# include "openmp/build_omp_target.h" -#endif - - diff --git a/cuBQL/builder/openmp/build_omp_target.h b/cuBQL/builder/openmp/build_omp_target.h deleted file mode 100644 index f53a47c..0000000 --- a/cuBQL/builder/openmp/build_omp_target.h +++ /dev/null @@ -1,300 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include - -namespace cuBQL { - namespace omp { - - template - inline void _ALLOC(T *&ptr, count_t count, int gpuID) - { ptr = (T*)omp_target_alloc(count*sizeof(T),gpuID); } - - template - inline void _FREE(T *&ptr, int gpuID) - { omp_target_free(ptr,gpuID); ptr = 0; } - - typedef enum : int8_t { OPEN_BRANCH, OPEN_NODE, DONE_NODE } NodeState; - - // ================================================================== - // atomicbox - // ================================================================== - template - struct CUBQL_ALIGN(8) AtomicBox { - inline bool is_empty() const { return lower[0] > upper[0]; } - inline void set_empty(); - // set_empty, in owl::common-style naming - inline void clear() { set_empty(); } - inline float get_center(int dim) const; - inline box_t make_box() const; - - inline float get_lower(int dim) const { - if (box_t::numDims>4) - return decode(lower[dim]); - else if (box_t::numDims==4) { - return decode(dim>1 - ?((dim>2)?lower[3]:lower[2]) - :((dim )?lower[1]:lower[0])); - } else if (box_t::numDims==3) { - return decode(dim>1 - ?lower[2] - :((dim )?lower[1]:lower[0])); - } else - return decode(lower[dim]); - } - inline float get_upper(int dim) const { - if (box_t::numDims>4) - return decode(upper[dim]); - else if (box_t::numDims==4) { - return decode(dim>1 - ?((dim>2)?upper[3]:upper[2]) - :((dim )?upper[1]:upper[0])); - } else if (box_t::numDims==3) - return decode(dim>1 - ?upper[2] - :((dim )?upper[1]:upper[0])); - else - return decode(upper[dim]); - } - - int32_t lower[box_t::numDims]; - int32_t upper[box_t::numDims]; - - inline static int32_t encode(float f); - inline static float decode(int32_t bits); - }; - -#ifdef __cplusplus > 202302L - inline void atomicMin(int32_t *ptr, int32_t value) - { if (value < *ptr) ((std::atomic *)ptr)->fetch_min(value, std::memory_order::seq_cst); } - inline void atomicMax(int32_t *ptr, int32_t value) - { if (value > *ptr) ((std::atomic *)ptr)->fetch_max(value, std::memory_order::seq_cst); } -#else - inline void atomicMin(int32_t *ptr, int32_t value) - { - int current = *(volatile int *)addr; - while (current > value) { - bool wasChanged - = ((std::atomic*)addr)->compare_exchange_weak((int&)current,(int&)value); - if (wasChanged) break; - } - } - - inline void atomicMax(int32_t *ptr, int32_t value) - { - int current = *(volatile int *)addr; - while (current < value) { - bool wasChanged - = ((std::atomic*)addr)->compare_exchange_weak((int&)current,(int&)value); - if (wasChanged) break; - } - } - -#endif - - template - inline float AtomicBox::get_center(int dim) const - { - return 0.5f*(get_lower(dim)+get_upper(dim)); - // return 0.5f*(decode(lower[dim])+decode(upper[dim])); - } - - template - inline box_t AtomicBox::make_box() const - { - box_t box; -#pragma unroll - for (int d=0;d - inline int32_t AtomicBox::encode(float f) - { - const int32_t sign = 0x80000000; - int32_t bits = __float_as_int(f); - if (bits & sign) bits ^= 0x7fffffff; - return bits; - } - - template - inline float AtomicBox::decode(int32_t bits) - { - const int32_t sign = 0x80000000; - if (bits & sign) bits ^= 0x7fffffff; - return __int_as_float(bits); - } - - template - inline void AtomicBox::set_empty() - { -#pragma unroll - for (int d=0;d inline __device__ - void atomic_grow(AtomicBox &abox, const typename box_t::vec_t &other) - { -#pragma unroll - for (int d=0;d::encode(other[d]);//get(other,d)); - if (enc < abox.lower[d]) - atomicMin(&abox.lower[d],enc); - if (enc > abox.upper[d]) - atomicMax(&abox.upper[d],enc); - } - } - - template - inline void atomic_grow(AtomicBox &abox, const box_t &other) - { -#pragma unroll - for (int d=0;d::encode(other.get_lower(d)); - const int32_t enc_upper = AtomicBox::encode(other.get_upper(d)); - if (enc_lower < abox.lower[d]) atomicMin(&abox.lower[d],enc_lower); - if (enc_upper > abox.upper[d]) atomicMax(&abox.upper[d],enc_upper); - } - } - - template - inline void atomic_grow(AtomicBox &abox, const AtomicBox &other) - { -#pragma unroll - for (int d=0;d abox.upper[d]) atomicMax(&abox.upper[d],enc_upper); - } - } - - - // ================================================================== - // internal states - // ================================================================== - struct BuildState { - uint32_t numNodes; - }; - - struct PrimState { - union { - /* careful with this order - this is intentionally chosen such - that all item with nodeID==-1 will end up at the end of the - list; and all others will be sorted by nodeID */ - struct { - uint64_t primID:31; //!< prim we're talking about - uint64_t done : 1; - uint64_t nodeID:32; //!< node the given prim is (currently) in. - }; - uint64_t bits; - }; - }; - - template - struct CUBQL_ALIGN(16) TempNode { - using box_t = cuBQL::box_t; - union { - struct { - AtomicBox centBounds; - uint32_t count; - uint32_t unused; - } openBranch; - struct { - uint32_t offset; - int dim; - uint32_t tieBreaker; - float pos; - } openNode; - struct { - uint32_t offset; - uint32_t count; - uint32_t unused[2]; - } doneNode; - }; - }; - - template - void initState(int tid, - BuildState *buildState, - NodeState *nodeStates, - TempNode *nodes) - { - if (tid >= 1) return; - - buildState->numNodes = 2; - - nodeStates[0] = OPEN_BRANCH; - nodes[0].openBranch.count = 0; - nodes[0].openBranch.centBounds.set_empty(); - - nodeStates[1] = DONE_NODE; - nodes[1].doneNode.offset = 0; - nodes[1].doneNode.count = 0; - } - - - /*! openmp based builder with #pragma omp target directives. */ - template - inline - void build_omp_target_impl(BinaryBVH &bvh, - /*! array of bounding boxes to build BVH over, - must be in target device memory (ie, must be - accessible in the device(gpuID) that the - 'gpuID' parameter refers to */ - const box_t *d_boxes, - uint32_t numPrims, - BuildConfig buildConfig, - int gpuID) - { - TempNode *tempNodes = 0; - NodeState *nodeStates = 0; - PrimState *primStates = 0; - BuildState *buildState = 0; - _ALLOC(tempNodes,2*numPrims,gpuID); - _ALLOC(nodeStates,2*numPrims,gpuID); - _ALLOC(primStates,numPrims,gpuID); - _ALLOC(buildState,1,gpuID); -#pragma omp target device(gpuID) \ - is_device_ptr(buildState) \ - is_device_ptr(nodeStates) \ - is_device_ptr(tempNodes) -#pragma omp teams distribute parallel for - for (int i=0;i<1;i++) - initState(i, - buildState, - nodeStates, - tempNodes); - - } - } - - - /*! openmp based builder with #pragma omp target directives. */ - template - inline - void build_omp_target(BinaryBVH &bvh, - /*! array of bounding boxes to build BVH over, - must be in target device memory (ie, must be - accessible in the device(gpuID) that the - 'gpuID' parameter refers to */ - const box_t *d_boxes, - uint32_t numBoxes, - BuildConfig buildConfig, - int gpuID) - { - omp::build_omp_target_impl(bvh,d_boxes,numBoxes,buildConfig,gpuID); - } - -} // ::cuBQL - - diff --git a/cuBQL/traversal/rayQueries.h b/cuBQL/traversal/rayQueries.h index 41873c5..f09bdb8 100644 --- a/cuBQL/traversal/rayQueries.h +++ b/cuBQL/traversal/rayQueries.h @@ -550,8 +550,8 @@ namespace cuBQL { bool o1 = rayIntersectsBox(node_t1,ray,rcp_dir,n1.bounds); if (dbg) { - dout << " node L " << n0.bounds << "\n"; - dout << " node R " << n1.bounds << "\n"; + // dout << " node L " << n0.bounds << "\n"; + // dout << " node R " << n1.bounds << "\n"; printf("children L hit %i dist %f R hit %i dist %f\n", int(o0),node_t0, int(o1),node_t1); From b2b50fd086e67c2f701c631382f895b15f072c3d Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Sun, 18 Jan 2026 11:45:40 -0700 Subject: [PATCH 7/7] bugfix: sah builder now respects buildConfig::maxAllowedLeafSize --- cuBQL/builder/cuda/sah_builder.h | 50 +++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/cuBQL/builder/cuda/sah_builder.h b/cuBQL/builder/cuda/sah_builder.h index ba68e22..dafb440 100644 --- a/cuBQL/builder/cuda/sah_builder.h +++ b/cuBQL/builder/cuda/sah_builder.h @@ -28,8 +28,14 @@ namespace cuBQL { struct { AtomicBox centBounds; uint32_t offset; +#if 1 + int8_t dim; + int8_t bin; + int32_t atomicSideDeciderBits; +#else int8_t dim; int8_t bin; +#endif } openNode; struct { uint32_t offset; @@ -54,7 +60,8 @@ namespace cuBQL { inline __device__ void evaluateSAH(int &splitDim, int &splitBin, - const SAHBins &sah) + const SAHBins &sah, + int maxAllowedLeafSize) { float bestCost = CUBQL_INF; @@ -69,7 +76,7 @@ namespace cuBQL { rAreas[b] = surfaceArea(box); } const float leafCost = rAreas[0] * rCount; - if (leafCost < bestCost) { + if (leafCost < bestCost && rCount <= maxAllowedLeafSize) { bestCost = leafCost; splitDim = -1; } @@ -241,12 +248,17 @@ namespace cuBQL { int splitDim = -1; int splitBin; if (in.count > buildConfig.makeLeafThreshold) { - evaluateSAH(splitDim,splitBin,sah); + evaluateSAH(splitDim,splitBin,sah,buildConfig.maxAllowedLeafSize); } - if (splitDim < 0) { + bool makeLeaf + = (splitDim < 0) + && (in.count <= buildConfig.maxAllowedLeafSize); + + if (makeLeaf) { nodeState = DONE_NODE; auto &done = nodes[nodeID].doneNode; done.count = in.count; + // set this to max-value, so the prims can later do atomicMin // with their position ion the leaf list; this value is // greater than any prim position. @@ -295,22 +307,26 @@ namespace cuBQL { auto open = nodes[me.nodeID].openNode; const box_t primBox = primBoxes[me.primID]; + int side = -1; + if (open.dim == -1) { + // // use top bit to atomically 'sort' prims left vs right + // side = atomicAdd(&nodes[me.nodeID].openNode.atomicSideDeciderBits,1u<<31)>>31; + side = atomicAdd(&nodes[me.nodeID].openNode.atomicSideDeciderBits,1u)&1; + } else { - const int d = open.dim; - T lo = open.centBounds.get_lower(d); - T hi = open.centBounds.get_upper(d); + const int d = open.dim; + T lo = open.centBounds.get_lower(d); + T hi = open.centBounds.get_upper(d); - T prim_d = T(0.5)*(primBox.get_lower(d)+primBox.get_upper(d)); - T rel - = (prim_d - lo) - / (hi - lo + 1e-20f); - int prim_bin = int(rel*(int)SAHBins>::numBins); - prim_bin = max(0,min((int)SAHBins>::numBins-1,prim_bin)); + T prim_d = T(0.5)*(primBox.get_lower(d)+primBox.get_upper(d)); + T rel + = (prim_d - lo) + / (hi - lo + 1e-20f); + int prim_bin = int(rel*(int)SAHBins>::numBins); + prim_bin = max(0,min((int)SAHBins>::numBins-1,prim_bin)); - int side = (prim_bin >= open.bin); - // printf("updateprim %i node %i state %i dim %i bin %i -> prim bin %i -> side %i\n", - // primID,me.nodeID,ns,open.dim,open.bin, - // prim_bin,side); + side = (prim_bin >= open.bin); + } int newNodeID = open.offset+side; auto &myBranch = nodes[newNodeID].openBranch; atomicAdd(&myBranch.count,1);