NTrace
GPU ray tracing framework
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
CudaTracerKernels.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2009-2011, NVIDIA Corporation
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of NVIDIA Corporation nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 #include <cuda.h>
30 
31 //------------------------------------------------------------------------
32 // Constants.
33 //------------------------------------------------------------------------
34 
35 enum
36 {
37  MaxBlockHeight = 6, // Upper bound for blockDim.y.
38  EntrypointSentinel = 0x76543210, // Bottom-most stack entry, indicating the end of traversal.
39 };
40 
41 // Macros for kd-tree build and traversal
42 #define KDTREE_MASK 0xF0000000 // Binary flag mask
43 #define KDTREE_UNMASK 0x0FFFFFFF // Mask for removing the flags
44 #define KDTREE_LEAF 0xC0000000 // Leaf binary flag
45 #define KDTREE_EMPTYLEAF 0x80000000 // Leaf binary flag
46 #define KDTREE_DIMPOS 28
47 
48 //------------------------------------------------------------------------
49 // BVH memory layout.
50 //------------------------------------------------------------------------
51 
53 {
54  BVHLayout_AOS_AOS = 0, // Nodes = array-of-structures, triangles = array-of-structures. Used by tesla_xxx kernels.
55  BVHLayout_AOS_SOA, // Nodes = array-of-structures, triangles = structure-of-arrays.
56  BVHLayout_SOA_AOS, // Nodes = structure-of-arrays, triangles = array-of-structures.
57  BVHLayout_SOA_SOA, // Nodes = structure-of-arrays, triangles = structure-of-arrays.
58  BVHLayout_Compact, // Variant of BVHLayout_AOS_AOS with implicit leaf nodes.
59  BVHLayout_Compact2, // Variant of BVHLayout_AOS_AOS with implicit leaf nodes.
60  BVHLayout_CPU, // Variant of BVHLayout_AOS_AOS without woop triangles, suitable for low memory CPU traversal
61 
63 };
64 
65 //------------------------------------------------------------------------
66 // Kernel configuration. Written by queryConfig() in each CU file.
67 //------------------------------------------------------------------------
68 
70 {
71  int bvhLayout; // Desired BVHLayout.
72  int blockWidth; // Desired blockDim.x.
73  int blockHeight; // Desired blockDim.y.
74  int usePersistentThreads; // True to enable persistent threads.
75 };
76 
77 //------------------------------------------------------------------------
78 // Function signature for trace().
79 //------------------------------------------------------------------------
80 
81 #define TRACE_FUNC_KDTREE \
82  extern "C" __global__ void trace_kdtree( \
83  int numRays, /* Total number of rays in the batch. */ \
84  bool anyHit, /* False if rays need to find the closest hit. */ \
85  float* bmin, \
86  float* bmax, \
87  float delta, \
88  float4* rays, /* Ray input: float3 origin, float tmin, float3 direction, float tmax. */ \
89  int4* results, /* Ray output: int triangleID, float hitT, int2 padding. */ \
90  float4* nodesA, /* SOA: bytes 0-15 of each node, AOS/Compact: 64 bytes per node. */ \
91  float4* nodesB, /* SOA: bytes 16-31 of each node, AOS/Compact: unused. */ \
92  float4* nodesC, /* SOA: bytes 32-47 of each node, AOS/Compact: unused. */ \
93  float4* nodesD, /* SOA: bytes 48-63 of each node, AOS/Compact: unused. */ \
94  float4* trisA, /* SOA: bytes 0-15 of each triangle, AOS: 64 bytes per triangle, Compact: 48 bytes per triangle. */ \
95  float4* trisB, /* SOA: bytes 16-31 of each triangle, AOS/Compact: unused. */ \
96  float4* trisC, /* SOA: bytes 32-47 of each triangle, AOS/Compact: unused. */ \
97  int* triIndices) /* Triangle index remapping table. */
98 
99 #define TRACE_FUNC_BVH \
100  extern "C" __global__ void trace_bvh( \
101  int numRays, /* Total number of rays in the batch. */ \
102  bool anyHit, /* False if rays need to find the closest hit. */ \
103  float4* rays, /* Ray input: float3 origin, float tmin, float3 direction, float tmax. */ \
104  int4* results, /* Ray output: int triangleID, float hitT, int2 padding. */ \
105  float4* nodesA, /* SOA: bytes 0-15 of each node, AOS/Compact: 64 bytes per node. */ \
106  float4* nodesB, /* SOA: bytes 16-31 of each node, AOS/Compact: unused. */ \
107  float4* nodesC, /* SOA: bytes 32-47 of each node, AOS/Compact: unused. */ \
108  float4* nodesD, /* SOA: bytes 48-63 of each node, AOS/Compact: unused. */ \
109  float4* trisA, /* SOA: bytes 0-15 of each triangle, AOS: 64 bytes per triangle, Compact: 48 bytes per triangle. */ \
110  float4* trisB, /* SOA: bytes 16-31 of each triangle, AOS/Compact: unused. */ \
111  float4* trisC, /* SOA: bytes 32-47 of each triangle, AOS/Compact: unused. */ \
112  int* triIndices) /* Triangle index remapping table. */
113 
114 //------------------------------------------------------------------------
115 // OTrace input
116 
118 {
119  int numRays; /* Total number of rays in the batch. */
120  bool anyHit; /* False if rays need to find the closest hit. */
121  CUdeviceptr rays; /* Ray input: float3 origin, float tmin, float3 direction, float tmax. */
122  CUdeviceptr results; /* Ray output: int triangleID, float hitT, int2 padding. */
123  CUdeviceptr nodesA; /* SOA: bytes 0-15 of each node, AOS/Compact: 64 bytes per node. */
124  CUdeviceptr nodesB; /* SOA: bytes 16-31 of each node, AOS/Compact: unused. */
125  CUdeviceptr nodesC; /* SOA: bytes 32-47 of each node, AOS/Compact: unused. */
126  CUdeviceptr nodesD; /* SOA: bytes 48-63 of each node, AOS/Compact: unused. */
127  CUdeviceptr trisA; /* SOA: bytes 0-15 of each triangle, AOS: 64 bytes per triangle, Compact: 48 bytes per triangle. */
128  CUdeviceptr trisB; /* SOA: bytes 16-31 of each triangle, AOS/Compact: unused. */
129  CUdeviceptr trisC; /* SOA: bytes 32-47 of each triangle, AOS/Compact: unused. */
130  CUdeviceptr triIndices; /* Triangle index remapping table. */
131  CUdeviceptr texCoords; /* Texture coordinates */
132  CUdeviceptr normals; /* Normals */
133  CUdeviceptr triVertIndex; /* Triangle vertex index */
134  CUdeviceptr atlasInfo; /* Texture atlas */
135  CUdeviceptr matId; /* Material ID */
136  CUdeviceptr matInfo; /* Material data */
137  int emissiveNum; /* Number of emissive triangles */
138  CUdeviceptr emissive; /* Emissive triangles buffer */
139  int trisCount; /* Number of triangles */
140  int vertsCount; /* Number of vertices */
141  CUdeviceptr tris; /* int3 vertex index data */
142  CUdeviceptr verts; /* float3 vertices data */
143  int randomSeed; /* RNG seed */
144 };
145 
146 //------------------------------------------------------------------------
147 // Temporary data stored in shared memory to reduce register pressure.
148 //------------------------------------------------------------------------
149 
150 struct RayStruct
151 {
152  float idirx; // 1.0f / ray.direction.x
153  float idiry; // 1.0f / ray.direction.y
154  float idirz; // 1.0f / ray.direction.z
155  float tmin; // ray.tmin
156  float dummy; // Padding to avoid bank conflicts.
157 };
158 
159 //------------------------------------------------------------------------
160 // Globals.
161 //------------------------------------------------------------------------
162 
163 #ifdef __CUDACC__
164 extern "C"
165 {
166 
167 __device__ KernelConfig g_config; // Output of queryConfig().
168 
169 texture<float4, 1> t_rays; // Linear textures wrapping the corresponding parameter arrays.
170 texture<float4, 1> t_nodesA;
171 texture<float4, 1> t_nodesB;
172 texture<float4, 1> t_nodesC;
173 texture<float4, 1> t_nodesD;
174 texture<float4, 1> t_trisA;
175 texture<float4, 1> t_trisB;
176 texture<float4, 1> t_trisC;
177 texture<int, 1> t_triIndices;
178 
179 __global__ void queryConfig(void); // Launched once when the kernel is loaded.
180 
181 TRACE_FUNC_BVH; // Launched for each batch of rays.
183 
184 __constant__ OtraceInput c_OtraceInput;
185 __global__ void otrace_kernel(void); // Otrace kernel
186 texture<float4, 2> t_textureAtlas; // texture atlases
187 
188 }
189 #endif
190 
191 //------------------------------------------------------------------------
192 // Utilities.
193 //------------------------------------------------------------------------
194 
195 #define FETCH_GLOBAL(NAME, IDX, TYPE) ((const TYPE*)NAME)[IDX]
196 #define FETCH_TEXTURE(NAME, IDX, TYPE) tex1Dfetch(t_ ## NAME, IDX)
197 //#define STORE_RESULT(RAY, TRI, T) ((int2*)results)[(RAY) * 2] = make_int2(TRI, __float_as_int(T))
198 #define STORE_RESULT(RAY, TRI, T, U, V) results[RAY] = make_int4(TRI, __float_as_int(T), __float_as_int(U), __float_as_int(V))
199 
200 //------------------------------------------------------------------------
201 
202 #ifdef __CUDACC__
203 
204 template <class T> __device__ __inline__ void swap(T& a,T& b)
205 {
206  T t = a;
207  a = b;
208  b = t;
209 }
210 
211 __device__ __inline__ float min4(float a, float b, float c, float d)
212 {
213  return fminf(fminf(fminf(a, b), c), d);
214 }
215 
216 __device__ __inline__ float max4(float a, float b, float c, float d)
217 {
218  return fmaxf(fmaxf(fmaxf(a, b), c), d);
219 }
220 
221 __device__ __inline__ float min3(float a, float b, float c)
222 {
223  return fminf(fminf(a, b), c);
224 }
225 
226 __device__ __inline__ float max3(float a, float b, float c)
227 {
228  return fmaxf(fmaxf(a, b), c);
229 }
230 
231 // Using integer min,max
232 __inline__ __device__ float fminf2(float a,float b)
233 {
234  int a2 = __float_as_int(a);
235  int b2 = __float_as_int(b);
236  return __int_as_float( a2<b2 ? a2 : b2 );
237 }
238 
239 __inline__ __device__ float fmaxf2(float a,float b)
240 {
241  int a2 = __float_as_int(a);
242  int b2 = __float_as_int(b);
243  return __int_as_float( a2>b2 ? a2 : b2 );
244 }
245 
246 // Using video instructions
247 __device__ __inline__ int min_min (int a, int b, int c) { int v; asm("vmin.s32.s32.s32.min %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
248 __device__ __inline__ int min_max (int a, int b, int c) { int v; asm("vmin.s32.s32.s32.max %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
249 __device__ __inline__ int max_min (int a, int b, int c) { int v; asm("vmax.s32.s32.s32.min %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
250 __device__ __inline__ int max_max (int a, int b, int c) { int v; asm("vmax.s32.s32.s32.max %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
251 __device__ __inline__ float fmin_fmin (float a, float b, float c) { return __int_as_float(min_min(__float_as_int(a), __float_as_int(b), __float_as_int(c))); }
252 __device__ __inline__ float fmin_fmax (float a, float b, float c) { return __int_as_float(min_max(__float_as_int(a), __float_as_int(b), __float_as_int(c))); }
253 __device__ __inline__ float fmax_fmin (float a, float b, float c) { return __int_as_float(max_min(__float_as_int(a), __float_as_int(b), __float_as_int(c))); }
254 __device__ __inline__ float fmax_fmax (float a, float b, float c) { return __int_as_float(max_max(__float_as_int(a), __float_as_int(b), __float_as_int(c))); }
255 
256 
257 __device__ __inline__ float magic_max7(float a0, float a1, float b0, float b1, float c0, float c1, float d)
258 {
259  float t1 = fmin_fmax(a0, a1, d);
260  float t2 = fmin_fmax(b0, b1, t1);
261  float t3 = fmin_fmax(c0, c1, t2);
262  return t3;
263 }
264 
265 __device__ __inline__ float magic_min7(float a0, float a1, float b0, float b1, float c0, float c1, float d)
266 {
267  float t1 = fmax_fmin(a0, a1, d);
268  float t2 = fmax_fmin(b0, b1, t1);
269  float t3 = fmax_fmin(c0, c1, t2);
270  return t3;
271 }
272 
273 // Experimentally determined best mix of float/int/video minmax instructions for Kepler.
274 __device__ __inline__ float spanBeginKepler(float a0, float a1, float b0, float b1, float c0, float c1, float d){ return fmax_fmax( fminf(a0,a1), fminf(b0,b1), fmin_fmax(c0, c1, d)); }
275 __device__ __inline__ float spanEndKepler(float a0, float a1, float b0, float b1, float c0, float c1, float d) { return fmin_fmin( fmaxf(a0,a1), fmaxf(b0,b1), fmax_fmin(c0, c1, d)); }
276 
277 // Same for Fermi.
278 __device__ __inline__ float spanBeginFermi(float a0, float a1, float b0, float b1, float c0, float c1, float d) { return magic_max7(a0, a1, b0, b1, c0, c1, d); }
279 __device__ __inline__ float spanEndFermi(float a0, float a1, float b0, float b1, float c0, float c1, float d) { return magic_min7(a0, a1, b0, b1, c0, c1, d); }
280 
281 #endif
282 
283 //------------------------------------------------------------------------
CUdeviceptr normals
CUdeviceptr trisC
CUdeviceptr matInfo
CUdeviceptr nodesD
CUdeviceptr triVertIndex
#define TRACE_FUNC_KDTREE
CUdeviceptr trisA
CUdeviceptr nodesB
CUdeviceptr nodesC
CUdeviceptr triIndices
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v
Definition: DLLImports.inl:329
CUdeviceptr rays
CUdeviceptr matId
CUdeviceptr tris
CUdeviceptr emissive
CUdeviceptr verts
CUdeviceptr results
#define TRACE_FUNC_BVH
CUdeviceptr nodesA
CUdeviceptr trisB
CUdeviceptr atlasInfo
FW_CUDA_FUNC void swap(T &a, T &b)
Definition: Defs.hpp:183
CUdeviceptr texCoords