NTrace/_cuda_tracer_kernels_8hpp_source.html

 /*

  *  Copyright (c) 2009-2011, NVIDIA Corporation

  *  All rights reserved.

  *

  *  Redistribution and use in source and binary forms, with or without

  *  modification, are permitted provided that the following conditions are met:

  *      * Redistributions of source code must retain the above copyright

  *        notice, this list of conditions and the following disclaimer.

  *      * Redistributions in binary form must reproduce the above copyright

  *        notice, this list of conditions and the following disclaimer in the

  *        documentation and/or other materials provided with the distribution.

  *      * Neither the name of NVIDIA Corporation nor the

  *        names of its contributors may be used to endorse or promote products

  *        derived from this software without specific prior written permission.

  *

  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

  *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

  *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

  *  DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY

  *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

  *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

  *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

  *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

  *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  */


 #pragma once

 #include <cuda.h>


 //------------------------------------------------------------------------

 // Constants.

 //------------------------------------------------------------------------


 // Unnamed enum holding compile-time integer constants used by the tracer kernels.

 enum

 {

     MaxBlockHeight      = 6,            // Upper bound for blockDim.y.

     EntrypointSentinel  = 0x76543210,   // Bottom-most stack entry, indicating the end of traversal.

 };


 // Macros for kd-tree build and traversal

 #define KDTREE_MASK 0xF0000000 // Binary flag mask (top four bits of a 32-bit word)

 #define KDTREE_UNMASK 0x0FFFFFFF // Mask for removing the flags (bitwise complement of KDTREE_MASK)

 #define KDTREE_LEAF 0xC0000000 // Leaf binary flag (two top bits set)

 #define KDTREE_EMPTYLEAF 0x80000000 // Empty-leaf binary flag (top bit only; this bit is also set in KDTREE_LEAF)

 #define KDTREE_DIMPOS 28 // Bit index of the lowest bit covered by KDTREE_MASK. NOTE(review): presumably the shift for the split-dimension field -- confirm against the kd-tree builder.


 //------------------------------------------------------------------------

 // BVH memory layout.

 //------------------------------------------------------------------------


 // Identifies how BVH nodes and triangles are laid out in memory.
 // Enumerator values are implicit (0, 1, 2, ...), so the declaration order
 // below determines the numeric value of each layout.

 enum BVHLayout

 {

     BVHLayout_AOS_AOS = 0,              // Nodes = array-of-structures, triangles = array-of-structures. Used by tesla_xxx kernels.

     BVHLayout_AOS_SOA,                  // Nodes = array-of-structures, triangles = structure-of-arrays.

     BVHLayout_SOA_AOS,                  // Nodes = structure-of-arrays, triangles = array-of-structures.

     BVHLayout_SOA_SOA,                  // Nodes = structure-of-arrays, triangles = structure-of-arrays.

     BVHLayout_Compact,                  // Variant of BVHLayout_AOS_AOS with implicit leaf nodes.

     // NOTE(review): the description below is identical to BVHLayout_Compact's; how the
     // two layouts differ is not visible in this header -- confirm and document.
     BVHLayout_Compact2,                 // Variant of BVHLayout_AOS_AOS with implicit leaf nodes.

     BVHLayout_CPU,                      // Variant of BVHLayout_AOS_AOS without woop triangles, suitable for low memory CPU traversal


     BVHLayout_Max                       // Equals the number of layouts declared above (7).

 };


 //------------------------------------------------------------------------

 // Kernel configuration. Written by queryConfig() in each CU file.

 //------------------------------------------------------------------------


 // Launch preferences reported by a kernel module; four plain ints.
 // An instance named g_config is declared as a __device__ global in this header.

 struct KernelConfig

 {

     int         bvhLayout;              // Desired BVHLayout.

     int         blockWidth;             // Desired blockDim.x.

     int         blockHeight;            // Desired blockDim.y.

     int         usePersistentThreads;   // True to enable persistent threads.

 };


 //------------------------------------------------------------------------

 // Function signature for trace().

 //------------------------------------------------------------------------


 // Expands to the declarator of the trace_kdtree() kernel, without a trailing
 // semicolon or body; this header uses it as `TRACE_FUNC_KDTREE;` to emit the
 // prototype.
 //
 // Every line of the definition must end in a backslash and be followed
 // immediately by the next line: a blank line after a continuation ends the
 // #define, which would leave the macro empty and the signature as stray text.
 #define TRACE_FUNC_KDTREE \
     extern "C" __global__ void trace_kdtree( \
         int             numRays,        /* Total number of rays in the batch. */ \
         bool            anyHit,         /* False if rays need to find the closest hit. */ \
         float*          bmin,           /* NOTE(review): presumably the scene bounding-box minimum corner -- confirm. */ \
         float*          bmax,           /* NOTE(review): presumably the scene bounding-box maximum corner -- confirm. */ \
         float           delta,          /* NOTE(review): meaning not visible in this header -- confirm against the kernel. */ \
         float4*         rays,           /* Ray input: float3 origin, float tmin, float3 direction, float tmax. */ \
         int4*           results,        /* Ray output: int triangleID, float hitT, int2 padding. */ \
         float4*         nodesA,         /* SOA: bytes 0-15 of each node, AOS/Compact: 64 bytes per node. */ \
         float4*         nodesB,         /* SOA: bytes 16-31 of each node, AOS/Compact: unused. */ \
         float4*         nodesC,         /* SOA: bytes 32-47 of each node, AOS/Compact: unused. */ \
         float4*         nodesD,         /* SOA: bytes 48-63 of each node, AOS/Compact: unused. */ \
         float4*         trisA,          /* SOA: bytes 0-15 of each triangle, AOS: 64 bytes per triangle, Compact: 48 bytes per triangle. */ \
         float4*         trisB,          /* SOA: bytes 16-31 of each triangle, AOS/Compact: unused. */ \
         float4*         trisC,          /* SOA: bytes 32-47 of each triangle, AOS/Compact: unused. */ \
         int*            triIndices)     /* Triangle index remapping table. */


 // Expands to the declarator of the trace_bvh() kernel, without a trailing
 // semicolon or body; this header uses it as `TRACE_FUNC_BVH;` to emit the
 // prototype.
 //
 // Every line of the definition must end in a backslash and be followed
 // immediately by the next line: a blank line after a continuation ends the
 // #define, which would leave the macro empty and the signature as stray text.
 #define TRACE_FUNC_BVH \
     extern "C" __global__ void trace_bvh( \
         int             numRays,        /* Total number of rays in the batch. */ \
         bool            anyHit,         /* False if rays need to find the closest hit. */ \
         float4*         rays,           /* Ray input: float3 origin, float tmin, float3 direction, float tmax. */ \
         int4*           results,        /* Ray output: int triangleID, float hitT, int2 padding. */ \
         float4*         nodesA,         /* SOA: bytes 0-15 of each node, AOS/Compact: 64 bytes per node. */ \
         float4*         nodesB,         /* SOA: bytes 16-31 of each node, AOS/Compact: unused. */ \
         float4*         nodesC,         /* SOA: bytes 32-47 of each node, AOS/Compact: unused. */ \
         float4*         nodesD,         /* SOA: bytes 48-63 of each node, AOS/Compact: unused. */ \
         float4*         trisA,          /* SOA: bytes 0-15 of each triangle, AOS: 64 bytes per triangle, Compact: 48 bytes per triangle. */ \
         float4*         trisB,          /* SOA: bytes 16-31 of each triangle, AOS/Compact: unused. */ \
         float4*         trisC,          /* SOA: bytes 32-47 of each triangle, AOS/Compact: unused. */ \
         int*            triIndices)     /* Triangle index remapping table. */


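 //------------------------------------------------------------------------

 // OTrace input. An instance is declared in constant memory as c_OtraceInput.

 //------------------------------------------------------------------------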


 // Bundle of scalar parameters and device addresses (CUdeviceptr) describing
 // one ray batch plus the scene, BVH, material and texture buffers.
 // NOTE(review): an instance lives in __constant__ memory (c_OtraceInput), so the
 // member order and padding (e.g. around the bool `anyHit`) presumably must
 // match whatever host code fills it -- confirm before reordering members.

 struct OtraceInput

 {

     int             numRays;            /* Total number of rays in the batch. */

     bool            anyHit;             /* False if rays need to find the closest hit. */

     CUdeviceptr     rays;               /* Ray input: float3 origin, float tmin, float3 direction, float tmax. */

     CUdeviceptr     results;            /* Ray output: int triangleID, float hitT, int2 padding. */

     CUdeviceptr     nodesA;             /* SOA: bytes 0-15 of each node, AOS/Compact: 64 bytes per node. */

     CUdeviceptr     nodesB;             /* SOA: bytes 16-31 of each node, AOS/Compact: unused. */

     CUdeviceptr     nodesC;             /* SOA: bytes 32-47 of each node, AOS/Compact: unused. */

     CUdeviceptr     nodesD;             /* SOA: bytes 48-63 of each node, AOS/Compact: unused. */

     CUdeviceptr     trisA;              /* SOA: bytes 0-15 of each triangle, AOS: 64 bytes per triangle, Compact: 48 bytes per triangle. */

     CUdeviceptr     trisB;              /* SOA: bytes 16-31 of each triangle, AOS/Compact: unused. */

     CUdeviceptr     trisC;              /* SOA: bytes 32-47 of each triangle, AOS/Compact: unused. */

     CUdeviceptr     triIndices;         /* Triangle index remapping table. */

     CUdeviceptr     texCoords;          /* Texture coordinates */

     CUdeviceptr     normals;            /* Normals */

     CUdeviceptr     triVertIndex;       /* Triangle vertex index */

     CUdeviceptr     atlasInfo;          /* Texture atlas */

     CUdeviceptr     matId;              /* Material ID */

     CUdeviceptr     matInfo;            /* Material data */

     int             emissiveNum;        /* Number of emissive triangles */

     CUdeviceptr     emissive;           /* Emissive triangles buffer */

     int             trisCount;          /* Number of triangles */

     int             vertsCount;         /* Number of vertices */

     CUdeviceptr     tris;               /* int3 vertex index data */

     CUdeviceptr     verts;              /* float3 vertices data */

     int             randomSeed;         /* RNG seed */

 };


 //------------------------------------------------------------------------

 // Temporary data stored in shared memory to reduce register pressure.

 //------------------------------------------------------------------------


 // Per-ray scratch values: the reciprocal direction components and tmin,
 // followed by one unused float, for five floats in total.

 struct RayStruct

 {

     float   idirx;  // 1.0f / ray.direction.x

     float   idiry;  // 1.0f / ray.direction.y

     float   idirz;  // 1.0f / ray.direction.z

     float   tmin;   // ray.tmin

     float   dummy;  // Padding to avoid bank conflicts.

 };


 //------------------------------------------------------------------------

 // Globals.

 //------------------------------------------------------------------------


 // Device-side globals and kernel prototypes. Compiled only when __CUDACC__ is
 // defined, and given C linkage so the symbol names are not C++-mangled.
 #ifdef __CUDACC__

 extern "C"

 {


 __device__ KernelConfig g_config;   // Output of queryConfig().


 // NOTE(review): `texture<T, dim>` is the legacy texture-reference API, which newer
 // CUDA toolkits no longer provide -- confirm the toolkit version this file targets.
 texture<float4, 1> t_rays;          // Linear textures wrapping the corresponding parameter arrays.

 texture<float4, 1> t_nodesA;

 texture<float4, 1> t_nodesB;

 texture<float4, 1> t_nodesC;

 texture<float4, 1> t_nodesD;

 texture<float4, 1> t_trisA;

 texture<float4, 1> t_trisB;

 texture<float4, 1> t_trisC;

 texture<int,  1>   t_triIndices;


 __global__ void queryConfig(void);  // Launched once when the kernel is loaded.


 // The two lines below rely on the TRACE_FUNC_* macros expanding to a full
 // kernel declarator; the trailing ';' turns each into a prototype.
 TRACE_FUNC_BVH;                     // Launched for each batch of rays.

 TRACE_FUNC_KDTREE;


 __constant__ OtraceInput c_OtraceInput; // Constant-memory instance of the OTrace parameter block.

 __global__ void otrace_kernel(void); // Otrace kernel

 texture<float4, 2> t_textureAtlas;  // texture atlases


 }

 #endif


 //------------------------------------------------------------------------

 // Utilities.

 //------------------------------------------------------------------------


 // Reads element IDX of NAME after reinterpreting NAME as `const TYPE*`.
 // NAME and IDX are parenthesized so that expression arguments such as
 // `base + offset` are fully evaluated before the cast and subscript apply
 // (a cast binds tighter than `+`, so without the parentheses only `base`
 // would be cast).
 #define FETCH_GLOBAL(NAME, IDX, TYPE) (((const TYPE*)(NAME))[(IDX)])

 // Reads element IDX through the texture reference t_<NAME> using tex1Dfetch().
 // NAME must be a bare identifier because it is token-pasted onto `t_`.
 // TYPE is not used; it is accepted so both FETCH_* macros take the same arguments.
 #define FETCH_TEXTURE(NAME, IDX, TYPE) (tex1Dfetch(t_ ## NAME, (IDX)))

 //#define STORE_RESULT(RAY, TRI, T) ((int2*)results)[(RAY) * 2] = make_int2(TRI, __float_as_int(T))

 // Stores one int4 record (TRI, bit pattern of T, bit pattern of U, bit pattern
 // of V) into results[RAY]. A variable named `results` must be in scope where
 // the macro is expanded.
 #define STORE_RESULT(RAY, TRI, T, U, V) (results[(RAY)] = make_int4((TRI), __float_as_int(T), __float_as_int(U), __float_as_int(V)))


 //------------------------------------------------------------------------


 #ifdef __CUDACC__


 // Exchanges the values held by a and b using one temporary copy of type T.
 template <class T> __device__ __inline__ void swap(T& a,T& b)
 {
     T held = b;     // remember b's value before it is overwritten
     b = a;
     a = held;
 }


 // Returns the smallest of four floats, folding left to right with fminf().
 __device__ __inline__ float min4(float a, float b, float c, float d)
 {
     float lowest = fminf(a, b);
     lowest = fminf(lowest, c);
     lowest = fminf(lowest, d);
     return lowest;
 }


 // Returns the largest of four floats, folding left to right with fmaxf().
 __device__ __inline__ float max4(float a, float b, float c, float d)
 {
     float highest = fmaxf(a, b);
     highest = fmaxf(highest, c);
     highest = fmaxf(highest, d);
     return highest;
 }


 // Returns the smallest of three floats, folding left to right with fminf().
 __device__ __inline__ float min3(float a, float b, float c)
 {
     const float lowestAB = fminf(a, b);
     return fminf(lowestAB, c);
 }


 // Returns the largest of three floats, folding left to right with fmaxf().
 __device__ __inline__ float max3(float a, float b, float c)
 {
     const float highestAB = fmaxf(a, b);
     return fmaxf(highestAB, c);
 }


 // Using integer min,max

 // Picks whichever argument has the smaller bit pattern when the two floats
 // are reinterpreted as signed 32-bit integers, and returns it as a float.
 // NOTE(review): a signed-integer comparison orders two negative floats the
 // opposite way from a float comparison, so this equals fminf() only when
 // a and b are not both negative (and neither is NaN) -- confirm that
 // callers guarantee this.
 __inline__ __device__ float fminf2(float a,float b)
 {
     const int bitsA = __float_as_int(a);
     const int bitsB = __float_as_int(b);
     int chosen = bitsB;
     if (bitsA < bitsB)
         chosen = bitsA;
     return __int_as_float(chosen);
 }


 // Picks whichever argument has the larger bit pattern when the two floats
 // are reinterpreted as signed 32-bit integers, and returns it as a float.
 // NOTE(review): a signed-integer comparison orders two negative floats the
 // opposite way from a float comparison, so this equals fmaxf() only when
 // a and b are not both negative (and neither is NaN) -- confirm that
 // callers guarantee this.
 __inline__ __device__ float fmaxf2(float a,float b)
 {
     const int bitsA = __float_as_int(a);
     const int bitsB = __float_as_int(b);
     int chosen = bitsB;
     if (bitsA > bitsB)
         chosen = bitsA;
     return __int_as_float(chosen);
 }


 // Using video instructions

 // Three-input signed-integer min/max helpers, each emitted as a single inline
 // PTX scalar video instruction (vmin / vmax with a secondary operation).
 // Per the PTX ISA, `vOP.s32.s32.s32.OP2 d, a, b, c` computes d = OP2(OP(a, b), c):
 //   min_min(a, b, c) = min(min(a, b), c)
 //   min_max(a, b, c) = max(min(a, b), c)
 //   max_min(a, b, c) = min(max(a, b), c)
 //   max_max(a, b, c) = max(max(a, b), c)
 // NOTE(review): the PTX ISA lists vmin/vmax as requiring sm_20 or newer -- confirm
 // the build targets.
 __device__ __inline__ int   min_min   (int a, int b, int c) { int v; asm("vmin.s32.s32.s32.min %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }

 __device__ __inline__ int   min_max   (int a, int b, int c) { int v; asm("vmin.s32.s32.s32.max %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }

 __device__ __inline__ int   max_min   (int a, int b, int c) { int v; asm("vmax.s32.s32.s32.min %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }

 __device__ __inline__ int   max_max   (int a, int b, int c) { int v; asm("vmax.s32.s32.s32.max %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }

 // Float front-ends for the three-input integer helpers: each reinterprets its
 // arguments as signed 32-bit integers, forwards them to the matching
 // min_min / min_max / max_min / max_max helper, and reinterprets the result
 // as a float. The comparisons are integer comparisons on the bit patterns.
 // NOTE(review): this agrees with float ordering only when the values being
 // compared are not both negative (and not NaN) -- confirm callers' ranges.
 __device__ __inline__ float fmin_fmin (float a, float b, float c)
 {
     const int ia = __float_as_int(a);
     const int ib = __float_as_int(b);
     const int ic = __float_as_int(c);
     return __int_as_float(min_min(ia, ib, ic));
 }

 __device__ __inline__ float fmin_fmax (float a, float b, float c)
 {
     const int ia = __float_as_int(a);
     const int ib = __float_as_int(b);
     const int ic = __float_as_int(c);
     return __int_as_float(min_max(ia, ib, ic));
 }

 __device__ __inline__ float fmax_fmin (float a, float b, float c)
 {
     const int ia = __float_as_int(a);
     const int ib = __float_as_int(b);
     const int ic = __float_as_int(c);
     return __int_as_float(max_min(ia, ib, ic));
 }

 __device__ __inline__ float fmax_fmax (float a, float b, float c)
 {
     const int ia = __float_as_int(a);
     const int ib = __float_as_int(b);
     const int ic = __float_as_int(c);
     return __int_as_float(max_max(ia, ib, ic));
 }


 // Chains three fmin_fmax() calls: the (a0, a1) pair is combined with d, then
 // the (b0, b1) pair with that result, then the (c0, c1) pair with that result.
 // The final value is returned.
 __device__ __inline__ float magic_max7(float a0, float a1, float b0, float b1, float c0, float c1, float d)
 {
     return fmin_fmax(c0, c1,
                      fmin_fmax(b0, b1,
                                fmin_fmax(a0, a1, d)));
 }


 // Chains three fmax_fmin() calls: the (a0, a1) pair is combined with d, then
 // the (b0, b1) pair with that result, then the (c0, c1) pair with that result.
 // The final value is returned.
 __device__ __inline__ float magic_min7(float a0, float a1, float b0, float b1, float c0, float c1, float d)
 {
     return fmax_fmin(c0, c1,
                      fmax_fmin(b0, b1,
                                fmax_fmin(a0, a1, d)));
 }


 // Experimentally determined best mix of float/int/video minmax instructions for Kepler.

 // Kepler variant: takes fminf() of the (a0, a1) and (b0, b1) pairs,
 // fmin_fmax() of (c0, c1, d), and merges the three results with fmax_fmax().
 // NOTE(review): presumably the ray/box entry distance clamped by d -- confirm at call sites.
 __device__ __inline__ float spanBeginKepler(float a0, float a1, float b0, float b1, float c0, float c1, float d)
 {
     const float lowA  = fminf(a0, a1);
     const float lowB  = fminf(b0, b1);
     const float lowCD = fmin_fmax(c0, c1, d);
     return fmax_fmax(lowA, lowB, lowCD);
 }

 // Kepler variant: takes fmaxf() of the (a0, a1) and (b0, b1) pairs,
 // fmax_fmin() of (c0, c1, d), and merges the three results with fmin_fmin().
 // NOTE(review): presumably the ray/box exit distance clamped by d -- confirm at call sites.
 __device__ __inline__ float spanEndKepler(float a0, float a1, float b0, float b1, float c0, float c1, float d)
 {
     const float highA  = fmaxf(a0, a1);
     const float highB  = fmaxf(b0, b1);
     const float highCD = fmax_fmin(c0, c1, d);
     return fmin_fmin(highA, highB, highCD);
 }


 // Same for Fermi.

 // Fermi variant of the span-begin computation: forwards all seven arguments,
 // unchanged and in order, to magic_max7().
 __device__ __inline__ float spanBeginFermi(float a0, float a1, float b0, float b1, float c0, float c1, float d)
 {
     const float begin = magic_max7(a0, a1, b0, b1, c0, c1, d);
     return begin;
 }

 // Fermi variant of the span-end computation: forwards all seven arguments,
 // unchanged and in order, to magic_min7().
 __device__ __inline__ float spanEndFermi(float a0, float a1, float b0, float b1, float c0, float c1, float d)
 {
     const float end = magic_min7(a0, a1, b0, b1, c0, c1, d);
     return end;
 }


 #endif


 //------------------------------------------------------------------------

OtraceInput::vertsCount
int vertsCount
Definition: CudaTracerKernels.hpp:140

RayStruct::idirz
float idirz
Definition: CudaTracerKernels.hpp:154

OtraceInput::anyHit
bool anyHit
Definition: CudaTracerKernels.hpp:120

KernelConfig
Definition: CudaTracerKernels.hpp:69

OtraceInput::normals
CUdeviceptr normals
Definition: CudaTracerKernels.hpp:132

OtraceInput::trisC
CUdeviceptr trisC
Definition: CudaTracerKernels.hpp:129

OtraceInput::emissiveNum
int emissiveNum
Definition: CudaTracerKernels.hpp:137

OtraceInput::matInfo
CUdeviceptr matInfo
Definition: CudaTracerKernels.hpp:136

OtraceInput::nodesD
CUdeviceptr nodesD
Definition: CudaTracerKernels.hpp:126

OtraceInput::triVertIndex
CUdeviceptr triVertIndex
Definition: CudaTracerKernels.hpp:133

BVHLayout
BVHLayout
Definition: CudaTracerKernels.hpp:52

KernelConfig::blockHeight
int blockHeight
Definition: CudaTracerKernels.hpp:73

TRACE_FUNC_KDTREE
#define TRACE_FUNC_KDTREE
Definition: CudaTracerKernels.hpp:81

OtraceInput::trisA
CUdeviceptr trisA
Definition: CudaTracerKernels.hpp:127

OtraceInput::nodesB
CUdeviceptr nodesB
Definition: CudaTracerKernels.hpp:124

OtraceInput::nodesC
CUdeviceptr nodesC
Definition: CudaTracerKernels.hpp:125

OtraceInput::triIndices
CUdeviceptr triIndices
Definition: CudaTracerKernels.hpp:130

OtraceInput::randomSeed
int randomSeed
Definition: CudaTracerKernels.hpp:143

BVHLayout_SOA_SOA
Definition: CudaTracerKernels.hpp:57

RayStruct::idirx
float idirx
Definition: CudaTracerKernels.hpp:152

EntrypointSentinel
Definition: CudaTracerKernels.hpp:38

MaxBlockHeight
Definition: CudaTracerKernels.hpp:37

BVHLayout_SOA_AOS
Definition: CudaTracerKernels.hpp:56

KernelConfig::bvhLayout
int bvhLayout
Definition: CudaTracerKernels.hpp:71

v
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v
Definition: DLLImports.inl:329

RayStruct::dummy
float dummy
Definition: CudaTracerKernels.hpp:156

OtraceInput::rays
CUdeviceptr rays
Definition: CudaTracerKernels.hpp:121

OtraceInput::matId
CUdeviceptr matId
Definition: CudaTracerKernels.hpp:135

OtraceInput::tris
CUdeviceptr tris
Definition: CudaTracerKernels.hpp:141

BVHLayout_Compact2
Definition: CudaTracerKernels.hpp:59

BVHLayout_Compact
Definition: CudaTracerKernels.hpp:58

BVHLayout_AOS_SOA
Definition: CudaTracerKernels.hpp:55

OtraceInput::emissive
CUdeviceptr emissive
Definition: CudaTracerKernels.hpp:138

RayStruct::idiry
float idiry
Definition: CudaTracerKernels.hpp:153

RayStruct
Definition: CudaTracerKernels.hpp:150

BVHLayout_AOS_AOS
Definition: CudaTracerKernels.hpp:54

OtraceInput::verts
CUdeviceptr verts
Definition: CudaTracerKernels.hpp:142

RayStruct::tmin
float tmin
Definition: CudaTracerKernels.hpp:155

OtraceInput::results
CUdeviceptr results
Definition: CudaTracerKernels.hpp:122

TRACE_FUNC_BVH
#define TRACE_FUNC_BVH
Definition: CudaTracerKernels.hpp:99

OtraceInput::nodesA
CUdeviceptr nodesA
Definition: CudaTracerKernels.hpp:123

OtraceInput::trisB
CUdeviceptr trisB
Definition: CudaTracerKernels.hpp:128

BVHLayout_CPU
Definition: CudaTracerKernels.hpp:60

OtraceInput::atlasInfo
CUdeviceptr atlasInfo
Definition: CudaTracerKernels.hpp:134

FW::swap
FW_CUDA_FUNC void swap(T &a, T &b)
Definition: Defs.hpp:183

BVHLayout_Max
Definition: CudaTracerKernels.hpp:62

OtraceInput::trisCount
int trisCount
Definition: CudaTracerKernels.hpp:139

OtraceInput::texCoords
CUdeviceptr texCoords
Definition: CudaTracerKernels.hpp:131

OtraceInput::numRays
int numRays
Definition: CudaTracerKernels.hpp:119

KernelConfig::blockWidth
int blockWidth
Definition: CudaTracerKernels.hpp:72

KernelConfig::usePersistentThreads
int usePersistentThreads
Definition: CudaTracerKernels.hpp:74

OtraceInput
Definition: CudaTracerKernels.hpp:117