diff --git a/cuBQL/builder/cuda/builder_common.h b/cuBQL/builder/cuda/builder_common.h index ce103a5..c11b9ed 100644 --- a/cuBQL/builder/cuda/builder_common.h +++ b/cuBQL/builder/cuda/builder_common.h @@ -17,6 +17,24 @@ namespace cuBQL { namespace gpuBuilder_impl { + inline __device__ void atomic_min(int32_t *v, int32_t vv) + { atomicMin((int *)v,(int)vv); } + inline __device__ void atomic_min(int64_t *v, int64_t vv) + { atomicMin((long long *)v,(long long)vv); } + inline __device__ void atomic_min(uint32_t *v, uint32_t vv) + { atomicMin((unsigned int *)v,(unsigned int)vv); } + inline __device__ void atomic_min(uint64_t *v, uint64_t vv) + { atomicMin((unsigned long long *)v,(unsigned long long)vv); } + + inline __device__ void atomic_max(int32_t *v, int32_t vv) + { atomicMax((int *)v,(int)vv); } + inline __device__ void atomic_max(int64_t *v, int64_t vv) + { atomicMax((long long *)v,(long long)vv); } + inline __device__ void atomic_max(uint32_t *v, uint32_t vv) + { atomicMax((unsigned int *)v,(unsigned int)vv); } + inline __device__ void atomic_max(uint64_t *v, uint64_t vv) + { atomicMax((unsigned long long *)v,(unsigned long long)vv); } + template inline void _ALLOC(T *&ptr, count_t count, cudaStream_t s, GpuMemoryResource &mem_resource) @@ -28,131 +46,250 @@ namespace cuBQL { typedef enum : int8_t { OPEN_BRANCH, OPEN_NODE, DONE_NODE } NodeState; + template inline __device__ T empty_box_lower(); + template inline __device__ T empty_box_upper(); + + template<> inline __device__ + float empty_box_lower() { return +FLT_MAX; }; + + template<> inline __device__ + float empty_box_upper() { return -FLT_MAX; }; + + template<> inline __device__ + double empty_box_lower() { return +DBL_MAX; }; + + template<> inline __device__ + double empty_box_upper() { return -DBL_MAX; }; + + template<> inline __device__ + int empty_box_lower() { return INT_MAX; }; + + template<> inline __device__ + int empty_box_upper() { return INT_MIN; }; + + template<> inline __device__ + int64_t empty_box_lower() { return LLONG_MAX; }; + + template<> inline __device__ + int64_t empty_box_upper() { return LLONG_MIN; }; + + template struct int_type_of; + template<> struct int_type_of { typedef int32_t type; }; + template<> struct int_type_of { typedef int64_t type; }; + template<> struct int_type_of { typedef int32_t type; }; + template<> struct int_type_of { typedef int64_t type; }; + template<> struct int_type_of { typedef uint32_t type; }; + template<> struct int_type_of { typedef uint64_t type; }; + + template inline __device__ + typename int_type_of::type encode(T v); + + template<> inline __device__ + int32_t encode(float f) + { + const int32_t sign = 0x80000000; + int32_t bits = __float_as_int(f); + if (bits & sign) bits ^= 0x7fffffff; + return bits; + } + template<> inline __device__ + int64_t encode(double f) + { + const int64_t sign = 0x8000000000000000LL; + int64_t bits = __double_as_longlong(f); + if (bits & sign) bits ^= 0x7fffffffffffffffLL; + return bits; + } + template<> inline __device__ + int32_t encode(int32_t bits) + { + return bits; + } + template<> inline __device__ + int64_t encode(int64_t bits) + { + return bits; + } + + template inline __device__ + T decode(int32_t v); + template inline __device__ + T decode(uint32_t v); + template inline __device__ + T decode(int64_t v); + template inline __device__ + T decode(uint64_t v); + + template<> inline __device__ + float decode(int32_t bits) + { + const int32_t sign = 0x80000000; + if (bits & sign) bits ^= 0x7fffffff; + return __int_as_float(bits); + } + template<> inline __device__ + int32_t decode(int32_t bits) + { return bits; } + template<> inline __device__ + int64_t decode(int64_t bits) + { return bits; } + + template<> inline __device__ + double decode(int64_t bits) + { + const int64_t sign = 0x8000000000000000LL; + if (bits & sign) bits ^= 0x7fffffffffffffffLL; + return __longlong_as_double(bits); + } + template struct CUBQL_ALIGN(8) AtomicBox { + using scalar_t = typename box_t::scalar_t; inline __device__ bool is_empty() const { return lower[0] > upper[0]; } inline __device__ void set_empty(); // set_empty, in owl::common-style naming inline __device__ void clear() { set_empty(); } - inline __device__ float get_center(int dim) const; + inline __device__ scalar_t get_center(int dim) const; inline __device__ box_t make_box() const; - inline __device__ float get_lower(int dim) const { + inline __device__ scalar_t get_lower(int dim) const { if (box_t::numDims>4) - return decode(lower[dim]); + return decode(lower[dim]); else if (box_t::numDims==4) { - return decode(dim>1 - ?((dim>2)?lower[3]:lower[2]) - :((dim )?lower[1]:lower[0])); + return decode(dim>1 + ?((dim>2)?lower[3]:lower[2]) + :((dim )?lower[1]:lower[0])); } else if (box_t::numDims==3) { - return decode(dim>1 - ?lower[2] - :((dim )?lower[1]:lower[0])); + return decode(dim>1 + ?lower[2] + :((dim )?lower[1]:lower[0])); } else - return decode(lower[dim]); + return decode(lower[dim]); } - inline __device__ float get_upper(int dim) const { + inline __device__ scalar_t get_upper(int dim) const { if (box_t::numDims>4) - return decode(upper[dim]); + return decode(upper[dim]); else if (box_t::numDims==4) { - return decode(dim>1 - ?((dim>2)?upper[3]:upper[2]) - :((dim )?upper[1]:upper[0])); + return decode(dim>1 + ?((dim>2)?upper[3]:upper[2]) + :((dim )?upper[1]:upper[0])); } else if (box_t::numDims==3) - return decode(dim>1 - ?upper[2] - :((dim )?upper[1]:upper[0])); + return decode(dim>1 + ?upper[2] + :((dim )?upper[1]:upper[0])); else - return decode(upper[dim]); + return decode(upper[dim]); } - - int32_t lower[box_t::numDims]; - int32_t upper[box_t::numDims]; - - inline static __device__ int32_t encode(float f); - inline static __device__ float decode(int32_t bits); + + typename int_type_of::type lower[box_t::numDims]; + typename int_type_of::type upper[box_t::numDims]; + // int32_t lower[box_t::numDims]; + // int32_t upper[box_t::numDims]; + + // inline static __device__ int32_t encode(float f); + // inline static __device__ float decode(int32_t bits); }; template - inline __device__ float AtomicBox::get_center(int dim) const + inline __device__ typename AtomicBox::scalar_t + AtomicBox::get_center(int dim) const { - return 0.5f*(get_lower(dim)+get_upper(dim)); + return (get_lower(dim)+get_upper(dim))/(AtomicBox::scalar_t)2; // return 0.5f*(decode(lower[dim])+decode(upper[dim])); } + // template + // inline __device__ float AtomicBox::get_center(int dim) const + // { + // return 0.5f*(get_lower(dim)+get_upper(dim)); + // // return 0.5f*(decode(lower[dim])+decode(upper[dim])); + // } template inline __device__ box_t AtomicBox::make_box() const { + using scalar_t = typename box_t::scalar_t; box_t box; #pragma unroll for (int d=0;d(lower[d]); + box.upper[d] = decode(upper[d]); } return box; } - template - inline __device__ int32_t AtomicBox::encode(float f) - { - const int32_t sign = 0x80000000; - int32_t bits = __float_as_int(f); - if (bits & sign) bits ^= 0x7fffffff; - return bits; - } + // template + // inline __device__ int32_t AtomicBox::encode(float f) + // { + // const int32_t sign = 0x80000000; + // int32_t bits = __float_as_int(f); + // if (bits & sign) bits ^= 0x7fffffff; + // return bits; + // } - template - inline __device__ float AtomicBox::decode(int32_t bits) - { - const int32_t sign = 0x80000000; - if (bits & sign) bits ^= 0x7fffffff; - return __int_as_float(bits); - } + // template + // inline __device__ float AtomicBox::decode(int32_t bits) + // { + // const int32_t sign = 0x80000000; + // if (bits & sign) bits ^= 0x7fffffff; + // return __int_as_float(bits); + // } template inline __device__ void AtomicBox::set_empty() { + using scalar_t = typename box_t::scalar_t; #pragma unroll for (int d=0;d());//encode(+FLT_MAX); + upper[d] = encode(empty_box_upper());//encode(-FLT_MAX); } } template inline __device__ void atomic_grow(AtomicBox &abox, const typename box_t::vec_t &other) { + using scalar_t = typename AtomicBox::scalar_t; #pragma unroll for (int d=0;d::encode(other[d]);//get(other,d)); + const typename int_type_of::type enc + = //AtomicBox:: + encode(other[d]);//get(other,d)); if (enc < abox.lower[d]) - atomicMin(&abox.lower[d],enc); + atomic_min(&abox.lower[d],enc); if (enc > abox.upper[d]) - atomicMax(&abox.upper[d],enc); + atomic_max(&abox.upper[d],enc); } } template inline __device__ void atomic_grow(AtomicBox &abox, const box_t &other) { + using scalar_t = typename AtomicBox::scalar_t; #pragma unroll for (int d=0;d::encode(other.get_lower(d)); - const int32_t enc_upper = AtomicBox::encode(other.get_upper(d)); - if (enc_lower < abox.lower[d]) atomicMin(&abox.lower[d],enc_lower); - if (enc_upper > abox.upper[d]) atomicMax(&abox.upper[d],enc_upper); + const typename int_type_of::type + enc_lower = //AtomicBox:: + encode(other.get_lower(d)); + const typename int_type_of::type + enc_upper = //AtomicBox:: + encode(other.get_upper(d)); + if (enc_lower < abox.lower[d]) atomic_min(&abox.lower[d],enc_lower); + if (enc_upper > abox.upper[d]) atomic_max(&abox.upper[d],enc_upper); } } template inline __device__ void atomic_grow(AtomicBox &abox, const AtomicBox &other) { + using scalar_t = typename AtomicBox::scalar_t; #pragma unroll for (int d=0;d abox.upper[d]) atomicMax(&abox.upper[d],enc_upper); + const typename int_type_of::type + enc_lower = other.lower[d]; + const typename int_type_of::type + enc_upper = other.upper[d]; + if (enc_lower < abox.lower[d]) atomic_min(&abox.lower[d],enc_lower); + if (enc_upper > abox.upper[d]) atomic_max(&abox.upper[d],enc_upper); } } diff --git a/cuBQL/builder/cuda/instantiate_builders.cu b/cuBQL/builder/cuda/instantiate_builders.cu index 88332cf..b54d5ef 100644 --- a/cuBQL/builder/cuda/instantiate_builders.cu +++ b/cuBQL/builder/cuda/instantiate_builders.cu @@ -14,8 +14,8 @@ namespace cuBQL { \ namespace radixBuilder_impl { \ template \ - void build(BinaryBVH &bvh, \ - const typename BuildState::box_t *boxes, \ + void build(BinaryBVH &bvh, \ + const box_t *boxes, \ uint32_t numPrims, \ BuildConfig buildConfig, \ cudaStream_t s, \