NTrace
GPU ray tracing framework
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
CudaPersistentBVHTracer.cpp
Go to the documentation of this file.
1 #include <sstream>
2 #include <iomanip>
3 #include "base/Random.hpp"
6 //#include "../../../../AppEnvironment.h"
7 
8 #define TASK_SIZE 430000
9 
10 using namespace FW;
11 
12 #define BENCHMARK
13 #define TEST_TASKS
14 //#define CUTOFF_DEPTH // Reports only data at specific level - for debugging purpouses
15 //#define TRACE_L1
16 
17 #ifndef TEST_TASKS
18 #include "kernels/thrustTest.hpp"
19 #endif
20 
22 {
23  m_epsilon = epsilon;
24 
25  // init
27  //m_compiler.addOptions("-use_fast_math");
28  m_compiler.addOptions("-use_fast_math -Xptxas -dlcm=cg");
29  m_compiler.clearDefines();
31  m_compiler.define("FERMI");
32 
33  // convert from scene
34  Vec3f light = Vec3f(1.0f, 2.0f, 3.0f).normalized();
35 
36  m_numTris = scene.getNumTriangles(); //scene.triangles.size();
37  m_numVerts = m_numTris * 3;
38  m_numMaterials = 1;
39  m_numShadingNormals = m_numVerts;
40  m_numTextureCoords = m_numVerts;
41  //m_bbox = scene.box;
42  scene.getBBox(m_bboxMin, m_bboxMax);
43 
44  m_tris.resizeDiscard(m_numTris * sizeof(Vec3f));
45  m_triNormals.resizeDiscard(m_numTris * sizeof(Vec3f));
46  //m_verts.resizeDiscard(m_numVerts * sizeof(Vec3f));
47  //m_materials.resizeDiscard(m_numMaterials * sizeof(Material));
48  //m_shadingNormals.resizeDiscard(m_numVerts * sizeof(Vec3f));
49  //m_shadedColor.resizeDiscard(m_numTris * sizeof(Vec3f));
50  //m_materialColor.resizeDiscard(m_numTris * sizeof(Vec3f));
51  //m_textureCoords.resizeDiscard(m_numTextureCoords * sizeof(Vec2f));
52  //m_trisBox.resizeDiscard(m_numTris * sizeof(CudaAABB));
53 
54  m_trisCompact.resizeDiscard(m_numTris * 3 * sizeof(Vec4f));
55  m_trisIndex.resizeDiscard(m_numTris * sizeof(S32));
56 
57  Vec3f* tout = (Vec3f*)m_tris.getMutablePtr();
58  Vec3f* nout = (Vec3f*)m_triNormals.getMutablePtr();
59  Vec3f* vout = (Vec3f*)m_verts.getMutablePtr();
60  //Material* mout = (Material*)m_materials.getMutablePtr();
61  //Vec3f* snout = (Vec3f*)m_shadingNormals.getMutablePtr();
62  //U32* scout = (U32*)m_shadedColor.getMutablePtr();
63  //U32* mcout = (U32*)m_materialColor.getMutablePtr();
64  //Vec2f* uvout = (Vec2f*)m_textureCoords.getMutablePtr();
65  float3* bout = (float3*)m_trisBox.getMutablePtr();
66 
67  Vec4f* tcout = (Vec4f*)m_trisCompact.getMutablePtr();
68  S32* tiout = (S32*)m_trisIndex.getMutablePtr();
69 
70  // load vertices
71  const Vec3i* tris = (const Vec3i*)(scene.getTriVtxIndexBuffer().getPtr());
72  const Vec3f* verts = (const Vec3f*)(scene.getVtxPosBuffer().getPtr());
73  for (int i = 0; i < m_numTris; i++)
74  {
75  for (int j = 0; j < 3; j++)
76  {
77  vout[i * 3 + j] = Vec3f(verts[tris[i][j]].x, verts[tris[i][j]].y, verts[tris[i][j]].z);
78  *tcout = Vec4f(verts[tris[i][j]].x, verts[tris[i][j]].y, verts[tris[i][j]].z,0);
79  tcout++;
80  }
81 
82  *tiout = i;
83  tiout++;
84 
85  Vec3f minV = min(vout[i*3+0], vout[i*3+1], vout[i*3+2]);
86  bout[i * 2 + 0] = make_float3(minV.x, minV.y, minV.z);
87  Vec3f maxV = max(vout[i*3+0], vout[i*3+1], vout[i*3+2]);
88  bout[i* 2 + 1] = make_float3(maxV.x, maxV.y, maxV.z);
89 
90  /*Triangle& tris = *scene.triangles[i];
91  for(int j = 0; j < 3; j++)
92  {
93  //vout[i*3+j] = Vec3f(tris.vertices[j].x,tris.vertices[j].y,tris.vertices[j].z);
94  snout[i*3+j] = Vec3f(tris.normals[j].x,tris.normals[j].y,tris.normals[j].z);
95  //uvout[i*3+j] = Vec2f(tris.uvs[j].xx,tris.uvs[j].yy);
96 
97  *tcout = Vec4f(tris.vertices[j].x,tris.vertices[j].y,tris.vertices[j].z,0);
98  tcout++;
99  }
100 
101  Vec3f minV = min(vout[i*3+0], vout[i*3+1], vout[i*3+2]);
102  bout[i].m_mn = make_float3(minV.x, minV.y, minV.z);
103  Vec3f maxV = max(vout[i*3+0], vout[i*3+1], vout[i*3+2]);
104  bout[i].m_mx = make_float3(maxV.x, maxV.y, maxV.z);*/
105  }
106 
107  // default material
108  /*Material m;
109  m.diffuse = Vec3f(1.0f,1.0f,1.0f);
110  m.specular = Vec3f(0.0f,0.0f,0.0f);
111  m.type = MeshBase::Material::MaterialType_Phong;
112  m.texID = -1; // no texture
113  m.gloss_alpha = Vec2f(0.0f, 0.f);
114  mout[0] = m;*/
115 
116  //unsigned int matid = 1;
117 
118  // load triangles
119  Vec4f defaultColor(1.0f,1.0f,1.0f,1.0f);
120  for(int i=0,j=0;i<m_numTris;i++,j+=3)
121  {
122  tout[i] = Vec3i(j, j + 1, j + 2);
123  nout[i] = ((vout[j + 1] - vout[j]).cross(vout[j + 2] - vout[j]));
124  nout[i].normalize();
125 
126  // triangle data
127  /*tout->vertices = Vec3i(j,j+1,j+2);
128  Triangle& tris = *scene.triangles[i];
129  Vector3 normalVec = tris.GetNormal();
130  tout->normal = Vec3f(normalVec.x,normalVec.y,normalVec.z);
131  *nout = tout->normal;
132 
133  // material
134  Material* mat;
135  mat = &mout[0];
136  matid = 0;
137 
138  Vec4f diffuseColor(mat->diffuse,1.0f);
139  tout->materialColor = diffuseColor.toABGR();
140  tout->shadedColor = Vec4f( diffuseColor.getXYZ() * (dot(tout->normal, light) * 0.5f + 0.5f), 1.0f).toABGR();
141  tout->materialId = matid;
142 
143  *scout = tout->shadedColor;
144  *mcout = tout->materialColor;
145  scout++;
146  mcout++;
147 
148  tout++;
149  nout++;*/
150  }
151 
152  m_sizeTask = 0.f;
153  m_sizeSplit = 0.f;
154  m_sizeADS = 0.f;
155  m_sizeTri = 0.f;
156  m_sizeTriIdx = 0.f;
157  m_heap = 0.f;
158 }
159 
161 {
162  m_kernelFile = "src/rt/kernels/persistent_nostruct.cu";
163  m_compiler.setSourceFile(m_kernelFile);
164  m_module = m_compiler.compile();
165  failIfError();
166 
167  m_numRays = rays.getSize();
168 
169 #ifdef TEST_TASKS
170 #ifdef DEBUG_PPS
171  Random rand;
172  m_numRays = rand.getU32(1, 1000000);
173  m_numTris = rand.getU32(1, 1000000);
174 #endif
175 #endif
176 
177  // Set triangle index buffer
178  S32* tiout = (S32*)m_trisIndex.getMutablePtr();
179 #ifdef DEBUG_PPS
180  S32* ptout = (S32*)m_ppsTris.getMutablePtr();
181  S32* cltout = (S32*)m_ppsTrisIndex.getMutablePtr();
182  S32* stout = (S32*)m_sortTris.getMutablePtr();
183 #endif
184  for(int i=0;i<m_numTris;i++)
185  {
186 #ifndef DEBUG_PPS
187  // indices
188  *tiout = i;
189  tiout++;
190 #else
191  int rnd = rand.getS32(-1, 2);
192  //*ptout = rnd;
193  *cltout = rnd;
194  *stout = (rnd >= 1);
195  //ptout++;
196  cltout++;
197  stout++;
198 #endif
199  }
200 
201  m_raysIndex.resizeDiscard(sizeof(int)*m_numRays);
202  rays.getResultBuffer().clear(-1); // Set as no hit
203  int *rayIdx = (int*)m_raysIndex.getMutablePtr();
204 #ifdef DEBUG_PPS
205  S32* prout = (S32*)m_ppsRays.getMutablePtr();
206  S32* clrout = (S32*)m_ppsRaysIndex.getMutablePtr();
207  S32* srout = (S32*)m_sortRays.getMutablePtr();
208 #endif
209  for(int i = 0; i < m_numRays; i++)
210  {
211 #ifndef DEBUG_PPS
212  // Set rays index buffer
213  *rayIdx = i;
214  rayIdx++;
215 
216 #ifdef TEST_TASKS
217 #if 0
218  // CPU Clipping
219  MiniMax::Ray mRay;
220  memcpy(&mRay.origin, &ray.origin, sizeof(Vec3f));
221  memcpy(&mRay.direction, &ray.direction, sizeof(Vec3f));
222  // Clip the rays to the extent of scene box
223  bool intersects = m_bbox.ComputeMinMaxT(mRay,
224  ray.tmin, ray.tmax);
225 
226  // clip the origin of the rays
227  if (ray.tmin < 1e-3f)
228  ray.tmin = 1e-3f;
229 #endif
230 #endif
231 #else
232  int rnd = rand.getS32(-1, 3);
233  //*prout = rnd;
234  *clrout = rnd;
235  *srout = (rnd >= 1);
236  //prout++;
237  clrout++;
238  srout++;
239 #endif
240  }
241 
242  // Start the timer
243  m_timer.unstart();
244  m_timer.start();
245 
246  // Create the taskData
247 #ifdef TEST_TASKS
248  m_taskData.resizeDiscard(TASK_SIZE * (sizeof(Task) + sizeof(int)));
249  m_taskData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
250 
251 #if SPLIT_TYPE == 3
252  m_splitData.resizeDiscard((S64)TASK_SIZE * (S64)sizeof(SplitInfo));
253  m_splitData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
254 #endif
255 #endif
256 
257  m_gpuTime = traceCudaRayBuffer(rays);
258  m_cpuTime = m_timer.end();
259 
260  m_raysIndex.reset();
261 #ifdef DEBUG_PPS
262  exit(0);
263 #endif
264 
265  return m_gpuTime;
266 }
267 
269 {
270 #ifdef MALLOC_SCRATCHPAD
271  // Set the memory limit according to triangle count
272 #ifndef BENCHMARK
273  printf("Setting dynamic memory limit to %fMB\n", (float)(m_trisIndex.getSize()*5*3)/(float)(1024*1024));
274 #endif
275  cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, m_trisIndex.getSize()*5*3);
276 #endif
277 
278  // Compile the kernel
279  if(!sbvh)
280  m_kernelFile = "src/rt/kernels/persistent_bvh.cu";
281  else
282  m_kernelFile = "src/rt/kernels/persistent_sbvh.cu";
283 
284  m_compiler.setSourceFile(m_kernelFile);
285  m_module = m_compiler.compile();
286  failIfError();
287 
288 #ifdef DEBUG_PPS
289  Random rand;
290  m_numTris = rand.getU32(1, 1000000);
291 #endif
292 
293  // Set triangle index buffer
294  S32* tiout = (S32*)m_trisIndex.getMutablePtr();
295 #ifdef DEBUG_PPS
296  S32* pout = (S32*)m_ppsTris.getMutablePtr();
297  S32* clout = (S32*)m_ppsTrisIndex.getMutablePtr();
298  S32* sout = (S32*)m_sortTris.getMutablePtr();
299 #endif
300  for(int i=0;i<m_numTris;i++)
301  {
302 #ifndef DEBUG_PPS
303  // indices
304  *tiout = i;
305  tiout++;
306 #else
307  int rnd = rand.getU32(0, 2);
308  //*pout = rnd;
309  *clout = rnd;
310  *sout = (rnd >= 1);
311  //pout++;
312  clout++;
313  sout++;
314 #endif
315  }
316 
317  // Start the timer
318  m_timer.unstart();
319  m_timer.start();
320 
321  // Create the taskData
322  m_taskData.resizeDiscard(TASK_SIZE * (sizeof(TaskBVH) + sizeof(int)));
323  m_taskData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
324  //S64 bvhSize = ((m_numTris * sizeof(CudaBVHNode)) + 4096 - 1) & -4096;
325  S64 bvhSize = ((m_numTris/2 * sizeof(CudaBVHNode)) + 4096 - 1) & -4096;
326  m_bvhData.resizeDiscard(bvhSize);
327  m_bvhData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
328  //m_bvhData.clearRange32(0, 0, bvhSize); // Mark all tasks as 0 (important for debug)
329 #ifdef COMPACT_LAYOUT
330  if(!sbvh)
331  {
332  m_trisCompactOut.resizeDiscard(m_numTris * (3+1) * sizeof(Vec4f));
333  m_trisIndexOut.resizeDiscard(m_numTris * (3+1) * sizeof(S32));
334  }
335  else
336  {
337  m_trisCompactOut.resizeDiscard(m_numTris*2 * (3+1) * sizeof(Vec4f));
338  m_trisIndexOut.resizeDiscard(m_numTris*2 * (3+1) * sizeof(S32));
339  }
340 #endif
341 
342 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
343  m_splitData.resizeDiscard((S64)(TASK_SIZE+1) * (S64)sizeof(SplitArray));
344  m_splitData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
345 #endif
346 
347  m_gpuTime = buildCudaBVH();
348  m_cpuTime = m_timer.end();
349 
350  // Resize to exact memory
351  trimBVHBuffers();
352 
353 #ifdef DEBUG_PPS
354  exit(0);
355 #endif
356 
357  return m_gpuTime;
358 }
359 
361 {
362 #ifdef TRACE_L1
363  // Set compiler options
364  m_compiler.clearOptions();
365 #endif
366 
367  m_compiler.setCachePath("cudacache"); // On the first compilation the cache path becames absolute which kills the second compilation
368 #ifdef COMPACT_LAYOUT
369 #ifdef WOOP_TRIANGLES
370  String kernelName("src/rt/kernels/fermi_speculative_while_while");
371 #else
372  String kernelName("src/rt/kernels/fermi_speculative_while_while_inter");
373 #endif
374 #ifdef TRACE_L1
375  m_compiler.addOptions("-use_fast_math");
376 #endif
377 #else
378  String kernelName("src/rt/kernels/fermi_persistent_speculative_while_while_inter");
379 #ifdef TRACE_L1
380  m_compiler.addOptions("-use_fast_math -maxrregcount 40");
381 #endif
382 #endif
383  if(stats != NULL)
384  {
385  kernelName += "_statistics";
386  }
387  kernelName += ".cu";
388  m_compiler.setSourceFile(kernelName);
389 
390  m_module = m_compiler.compile();
391  failIfError();
392 
393  CudaKernel queryKernel = m_module->getKernel("queryConfig");
394  //if (!queryKernel)
395  // fail("Config query kernel not found!");
396 
397  // Initialize config with default values.
398  KernelConfig& kernelConfig = *(KernelConfig*)m_module->getGlobal("g_config").getMutablePtr();
399  kernelConfig.bvhLayout = BVHLayout_Max;
400  kernelConfig.blockWidth = 0;
401  kernelConfig.blockHeight = 0;
402  kernelConfig.usePersistentThreads = 0;
403 
404  // Query config.
405 
406  queryKernel.launchTimed(Vec2i(1, 1));
407  //m_module->launchKernel(queryKernel, 1, 1);
408  kernelConfig = *(const KernelConfig*)m_module->getGlobal("g_config").getPtr();
409 
410  CudaKernel kernel;
411  if(stats != NULL)
412  kernel = m_module->getKernel("trace_stats");
413  else
414  kernel = m_module->getKernel("trace");
415  //if (!kernel)
416  // fail("Trace kernel not found!");
417 
418  KernelInput& in = *(KernelInput*)m_module->getGlobal("c_in").getMutablePtr();
419  // Start the timer
420  m_timer.unstart();
421  m_timer.start();
422 
423  CUdeviceptr nodePtr = m_bvhData.getCudaPtr();
424  Vec2i nodeOfsA = Vec2i(0, (S32)m_bvhData.getSize());
425 
426 #ifdef COMPACT_LAYOUT
427  CUdeviceptr triPtr = m_trisCompactOut.getCudaPtr();
428  Vec2i triOfsA = Vec2i(0, (S32)m_trisCompactOut.getSize());
429  Buffer& indexBuf = m_trisIndexOut;
430 #else
431  CUdeviceptr triPtr = m_trisCompact.getCudaPtr();
432  Vec2i triOfsA = Vec2i(0, (S32)m_trisCompact.getSize());
433  Buffer& indexBuf = m_trisIndex;
434 #endif
435 
436  // Set input.
437  // The new version has it via parameters, not const memory
438  in.numRays = rays.getSize();
439  in.anyHit = (rays.getNeedClosestHit() == false);
440  in.nodesA = nodePtr + nodeOfsA.x;
441  in.trisA = triPtr + triOfsA.x;
442  in.rays = rays.getRayBuffer().getCudaPtr();
443  in.results = rays.getResultBuffer().getMutableCudaPtr();
444  in.triIndices = indexBuf.getCudaPtr();
445 
446  // Set texture references.
447  m_module->setTexRef("t_rays", rays.getRayBuffer(), CU_AD_FORMAT_FLOAT, 4);
448  m_module->setTexRef("t_nodesA", nodePtr + nodeOfsA.x, nodeOfsA.y, CU_AD_FORMAT_FLOAT, 4);
449  m_module->setTexRef("t_trisA", triPtr + triOfsA.x, triOfsA.y, CU_AD_FORMAT_FLOAT, 4);
450  m_module->setTexRef("t_triIndices", indexBuf, CU_AD_FORMAT_SIGNED_INT32, 1);
451 
452  // Determine block and grid sizes.
453  int desiredWarps = (rays.getSize() + 31) / 32;
454  if (kernelConfig.usePersistentThreads != 0)
455  {
456  *(S32*)m_module->getGlobal("g_warpCounter").getMutablePtr() = 0;
457  desiredWarps = 720; // Tesla: 30 SMs * 24 warps, Fermi: 15 SMs * 48 warps
458  }
459 
460  Vec2i blockSize(kernelConfig.blockWidth, kernelConfig.blockHeight);
461  int blockWarps = (blockSize.x * blockSize.y + 31) / 32;
462  Vec2i gridSize((desiredWarps + blockWarps - 1) / blockWarps, 1);
463 
464  if(stats != NULL)
465  {
466  m_module->getGlobal("g_NumNodes").clear();
467  m_module->getGlobal("g_NumLeaves").clear();
468  m_module->getGlobal("g_NumEmptyLeaves").clear();
469  m_module->getGlobal("g_NumTris").clear();
470  m_module->getGlobal("g_NumFailedTris").clear();
471  m_module->getGlobal("g_NumHitTrisOutside").clear();
472  }
473 
474  // Launch.
475  F32 launchTime = m_module->launchKernelTimed(kernel, blockSize, gridSize);
476 
477  if(stats != NULL)
478  {
479  stats->numNodeTests += *(U32*)m_module->getGlobal("g_NumNodes").getPtr();
480  stats->numLeavesVisited += *(U32*)m_module->getGlobal("g_NumLeaves").getPtr();
481  stats->numEmptyLeavesVisited += *(U32*)m_module->getGlobal("g_NumEmptyLeaves").getPtr();
482  stats->numTriangleTests += *(U32*)m_module->getGlobal("g_NumTris").getPtr();
483  stats->numFailedTriangleTests += *(U32*)m_module->getGlobal("g_NumFailedTris").getPtr();
484  stats->numSuccessTriangleTestsOutside += *(U32*)m_module->getGlobal("g_NumHitTrisOutside").getPtr();
485  stats->numRays += rays.getSize();
486  }
487 
488  m_gpuTime = launchTime;
489  m_cpuTime = m_timer.end();
490 
491 #ifdef TRACE_L1
492  // reset options
493  m_compiler.clearOptions();
494  m_compiler.addOptions("-use_fast_math -Xptxas -dlcm=cg");
495 #endif
496 
497  return launchTime;
498 }
499 
501 {
502  // Compile the kernel
503  m_kernelFile = "src/rt/kernels/persistent_kdtree.cu";
504  m_compiler.setSourceFile(m_kernelFile);
505  m_module = m_compiler.compile();
506  failIfError();
507 
508  prepareDynamicMemory();
509 
510 #ifdef DEBUG_PPS
511  Random rand;
512  m_numTris = rand.getU32(1, 1000000);
513 #endif
514 
515  // Set triangle index buffer
516  S32* tiout = (S32*)m_trisIndex.getMutablePtr();
517 #ifdef DEBUG_PPS
518  S32* pout = (S32*)m_ppsTris.getMutablePtr();
519  S32* clout = (S32*)m_ppsTrisIndex.getMutablePtr();
520  S32* sout = (S32*)m_sortTris.getMutablePtr();
521 #endif
522  for(int i=0;i<m_numTris;i++)
523  {
524 #ifndef DEBUG_PPS
525  // indices
526  *tiout = i;
527  tiout++;
528 #else
529  int rnd = rand.getU32(0, 2);
530  //*pout = rnd;
531  *clout = rnd;
532  *sout = (rnd >= 1);
533  //pout++;
534  clout++;
535  sout++;
536 #endif
537  }
538 
539  // Start the timer
540  m_timer.unstart();
541  m_timer.start();
542 
543  // Create the taskData
544  m_taskData.resizeDiscard(TASK_SIZE * (sizeof(TaskBVH) + sizeof(int)));
545  m_taskData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
546 //#if SPLIT_TYPE == 3
547  m_splitData.resizeDiscard((S64)TASK_SIZE * (S64)sizeof(SplitInfoTri));
548 //#elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
549 // m_splitData.resizeDiscard((S64)(TASK_SIZE+1) * (S64)sizeof(SplitArray));
550 //#endif
551  m_splitData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
552 
553  // Node and triangle data
554 #ifndef INTERLEAVED_LAYOUT
555  //S64 kdtreeSize = ((m_numTris*5 * sizeof(CudaKdtreeNode)) + 4096 - 1) & -4096;
556  //S64 kdtreeSize = ((m_numTris*10 * sizeof(CudaKdtreeNode)) + 4096 - 1) & -4096;
557  //S64 kdtreeSize = ((m_numTris*3 * sizeof(CudaKdtreeNode)) + 4096 - 1) & -4096;
558  S64 kdtreeSize = ((m_numTris*20 * sizeof(CudaKdtreeNode)) + 4096 - 1) & -4096;
559  m_bvhData.resizeDiscard(kdtreeSize);
560  m_bvhData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
561  //m_bvhData.clearRange32(0, 0, kdtreeSize); // Mark all tasks as 0 (important for debug)
562 #ifndef COMPACT_LAYOUT
563  m_trisCompactOut.resizeDiscard(m_numTris*10 * 3 * sizeof(Vec4f));
564  m_trisIndexOut.resizeDiscard(m_numTris*10 * 3 * sizeof(S32));
565 #else
566 #ifdef DUPLICATE_REFERENCES
567  m_trisCompactOut.resizeDiscard(m_numTris*8 * (3+1) * sizeof(Vec4f));
568  m_trisIndexOut.resizeDiscard(m_numTris*8 * (3+1) * sizeof(S32));
569 #else
570  m_trisCompactOut.resizeDiscard(m_numTris * 3 * sizeof(Vec4f));
571  //m_trisIndexOut.resizeDiscard(m_numTris*12 * (1+1) * sizeof(S32));
572  m_trisIndexOut.resizeDiscard(m_numTris*20 * (1+1) * sizeof(S32));
573  //m_trisIndexOut.resizeDiscard(m_numTris*6 * (1+1) * sizeof(S32));
574 #endif
575 #endif
576 #else
577  // TODO: Rewrite this when double headed heap is realized
578  S64 kdtreeSize = ((m_numTris*5 * sizeof(CudaKdtreeNode) + m_numTris*10 * 3 * (sizeof(Vec4f)+sizeof(S32))) + 4096 - 1) & -4096;
579  //size_t f, t; cuMemGetInfo(&f, &t);
580  //S64 kdtreeSize = f & -4096;
581  m_bvhData.resizeDiscard(kdtreeSize);
582  m_bvhData.clearRange32(0, 0, kdtreeSize); // Mark all tasks as 0 (important for debug)
583 #endif
584 
585  m_gpuTime = buildCudaKdtree();
586  m_cpuTime = m_timer.end();
587 
588  // Resize to exact memory
590 
591 #ifdef DEBUG_PPS
592  exit(0);
593 #endif
594 
595  return m_gpuTime;
596 }
597 
599 {
600 #ifdef TRACE_L1
601  // Set compiler options
602  m_compiler.clearOptions();
603 #endif
604 
605  m_compiler.setCachePath("cudacache"); // On the first compilation the cache path becames absolute which kills the second compilation
606 #ifdef COMPACT_LAYOUT
607 #ifdef WOOP_TRIANGLES
608 #ifdef DUPLICATE_REFERENCES
609  String kernelName("src/rt/kernels/fermi_kdtree_while_while_childPtr");
610  //String kernelName("src/rt/kernels/fermi_kdtree_while_while_shortStack");
611 #else
612  String kernelName("src/rt/kernels/fermi_kdtree_while_while_leafRef");
613 #endif
614 #else
615 #error Undefined kernel
616 #endif
617 #else
618  String kernelName("src/rt/kernels/fermi_kdtree_while_while");
619 #endif
620 #ifdef TRACE_L1
621  m_compiler.addOptions("-use_fast_math");
622  //m_compiler.addOptions("-use_fast_math -maxrregcount 32");
623 #endif
624 
625  if(stats != NULL)
626  {
627  kernelName += "_statistics";
628  }
629  kernelName += ".cu";
630  m_compiler.setSourceFile(kernelName);
631 
632  m_module = m_compiler.compile();
633  failIfError();
634 
635  CUfunction queryKernel = m_module->getKernel("queryConfig");
636  if (!queryKernel)
637  fail("Config query kernel not found!");
638 
639  // Initialize config with default values.
640  KernelConfig& kernelConfig = *(KernelConfig*)m_module->getGlobal("g_config").getMutablePtr();
641  kernelConfig.bvhLayout = BVHLayout_Max;
642  kernelConfig.blockWidth = 0;
643  kernelConfig.blockHeight = 0;
644  kernelConfig.usePersistentThreads = 0;
645 
646  // Query config.
647 
648  m_module->launchKernel(queryKernel, 1, 1);
649  kernelConfig = *(const KernelConfig*)m_module->getGlobal("g_config").getPtr();
650 
651  CUfunction kernel;
652  if(stats != NULL)
653  kernel = m_module->getKernel("trace_stats");
654  else
655  kernel = m_module->getKernel("trace");
656  if (!kernel)
657  fail("Trace kernel not found!");
658 
659  KernelInput& in = *(KernelInput*)m_module->getGlobal("c_in").getMutablePtr();
660  // Start the timer
661  m_timer.unstart();
662  m_timer.start();
663 
664  CUdeviceptr nodePtr = m_bvhData.getCudaPtr();
665  Vec2i nodeOfsA = Vec2i(0, (S32)m_bvhData.getSize());
666 
667 #ifndef INTERLEAVED_LAYOUT
668  CUdeviceptr triPtr = m_trisCompactOut.getCudaPtr();
669  Vec2i triOfsA = Vec2i(0, (S32)m_trisCompactOut.getSize());
670  Buffer& indexBuf = m_trisIndexOut;
671 #else
672  CUdeviceptr triPtr = m_bvhData.getCudaPtr();
673  Vec2i triOfsA = Vec2i(0, (S32)m_bvhData.getSize());
674  Buffer& indexBuf = m_bvhData;
675 #endif
676 
677  // Set input.
678  // The new version has it via parameters, not const memory
679  in.numRays = rays.getSize();
680  in.anyHit = (rays.getNeedClosestHit() == false);
681  memcpy(&in.bmin, &m_bbox.min, sizeof(float3));
682  memcpy(&in.bmax, &m_bbox.max, sizeof(float3));
683  in.nodesA = nodePtr + nodeOfsA.x;
684  in.trisA = triPtr + triOfsA.x;
685  in.rays = rays.getRayBuffer().getCudaPtr();
686  in.results = rays.getResultBuffer().getMutableCudaPtr();
687  in.triIndices = indexBuf.getCudaPtr();
688 
689  // Set texture references.
690  m_module->setTexRef("t_rays", rays.getRayBuffer(), CU_AD_FORMAT_FLOAT, 4);
691  m_module->setTexRef("t_nodesI", nodePtr + nodeOfsA.x, nodeOfsA.y, CU_AD_FORMAT_FLOAT, 4);
692  m_module->setTexRef("t_trisA", triPtr + triOfsA.x, triOfsA.y, CU_AD_FORMAT_FLOAT, 4);
693  m_module->setTexRef("t_triIndices", indexBuf, CU_AD_FORMAT_SIGNED_INT32, 1);
694 
695  // Determine block and grid sizes.
696  int desiredWarps = (rays.getSize() + 31) / 32;
697  if (kernelConfig.usePersistentThreads != 0)
698  {
699  *(S32*)m_module->getGlobal("g_warpCounter").getMutablePtr() = 0;
700  desiredWarps = 720; // Tesla: 30 SMs * 24 warps, Fermi: 15 SMs * 48 warps
701  }
702 
703  Vec2i blockSize(kernelConfig.blockWidth, kernelConfig.blockHeight);
704  int blockWarps = (blockSize.x * blockSize.y + 31) / 32;
705  Vec2i gridSize((desiredWarps + blockWarps - 1) / blockWarps, 1);
706 
707  if(stats != NULL)
708  {
709  m_module->getGlobal("g_NumNodes").clear();
710  m_module->getGlobal("g_NumLeaves").clear();
711  m_module->getGlobal("g_NumEmptyLeaves").clear();
712  m_module->getGlobal("g_NumTris").clear();
713  m_module->getGlobal("g_NumFailedTris").clear();
714  m_module->getGlobal("g_NumHitTrisOutside").clear();
715  }
716 
717  // Launch.
718  F32 launchTime = m_module->launchKernelTimed(kernel, blockSize, gridSize);
719 
720  if(stats != NULL)
721  {
722  stats->numNodeTests += *(U32*)m_module->getGlobal("g_NumNodes").getPtr();
723  stats->numLeavesVisited += *(U32*)m_module->getGlobal("g_NumLeaves").getPtr();
724  stats->numEmptyLeavesVisited += *(U32*)m_module->getGlobal("g_NumEmptyLeaves").getPtr();
725  stats->numTriangleTests += *(U32*)m_module->getGlobal("g_NumTris").getPtr();
726  stats->numFailedTriangleTests += *(U32*)m_module->getGlobal("g_NumFailedTris").getPtr();
727  stats->numSuccessTriangleTestsOutside += *(U32*)m_module->getGlobal("g_NumHitTrisOutside").getPtr();
728  stats->numRays += rays.getSize();
729  }
730 
731  m_gpuTime = launchTime;
732  m_cpuTime = m_timer.end();
733 
734 #ifdef TRACE_L1
735  // reset options
736  m_compiler.clearOptions();
737  m_compiler.addOptions("-use_fast_math -Xptxas -dlcm=cg");
738 #endif
739 
740  return launchTime;
741 }
742 
743 F32 CudaPersistentBVHTracer::traceOnDemandBVH(RayBuffer& rays, bool rebuild, int numRays)
744 {
745  m_numRays = numRays;
746 
747  if(rebuild)
748  {
749  // Compile the kernel
750  m_kernelFile = "src/rt/kernels/persistent_ondemand.cu";
751  m_compiler.setSourceFile(m_kernelFile);
752  m_module = m_compiler.compile();
753  failIfError();
754 
755  // Set triangle index buffer
756  S32* tiout = (S32*)m_trisIndex.getMutablePtr();
757  for(int i=0;i<m_numTris;i++)
758  {
759  // indices
760  *tiout = i;
761  tiout++;
762  }
763  }
764 
765  // Start the timer
766  m_timer.unstart();
767  m_timer.start();
768 
769  if(rebuild)
770  {
771  // Create the taskData
772  m_taskData.resizeDiscard(TASK_SIZE * (sizeof(TaskBVH) + sizeof(int)));
773  m_taskData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
774  S64 bvhSize = ((m_numTris * sizeof(CudaBVHNode)) + 4096 - 1) & -4096;
775  //S64 bvhSize = ((m_numTris/2 * sizeof(CudaBVHNode)) + 4096 - 1) & -4096;
776  m_bvhData.resizeDiscard(bvhSize);
777  m_bvhData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
778 
779 #ifdef COMPACT_LAYOUT
780  m_trisCompactOut.resizeDiscard(m_numTris * (3+1) * sizeof(Vec4f));
781  m_trisIndexOut.resizeDiscard(m_numTris * (3+1) * sizeof(S32));
782 #endif
783 
784 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
785  m_splitData.resizeDiscard((S64)(TASK_SIZE+1) * (S64)sizeof(SplitArray));
786  m_splitData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
787 #endif
788  }
789 
790  // Build + trace
791  m_gpuTime = traceOnDemandBVHRayBuffer(rays, rebuild);
792  m_cpuTime = m_timer.end();
793 
794  // Save sizes of buffer so that they can be printed
795  if(rebuild)
796  saveBufferSizes();
797 
798  return m_gpuTime;
799 }
800 
802 {
803  m_numRays = numRays;
804 
805  if(rebuild)
806  {
807  // Compile the kernel
808  m_kernelFile = "src/rt/kernels/persistent_ondemand_kdtree.cu";
809  m_compiler.setSourceFile(m_kernelFile);
810  m_module = m_compiler.compile();
811  failIfError();
812 
813  prepareDynamicMemory();
814 
815  // Set triangle index buffer
816  S32* tiout = (S32*)m_trisIndex.getMutablePtr();
817  for(int i=0;i<m_numTris;i++)
818  {
819  // indices
820  *tiout = i;
821  tiout++;
822  }
823  }
824 
825  // Start the timer
826  m_timer.unstart();
827  m_timer.start();
828 
829  if(rebuild)
830  {
831  // Create the taskData
832  m_taskData.resizeDiscard(TASK_SIZE * (sizeof(TaskBVH) + sizeof(int)));
833  m_taskData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
834 //#if SPLIT_TYPE == 3
835  m_splitData.resizeDiscard((S64)TASK_SIZE * (S64)sizeof(SplitInfoTri));
836 //#elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
837  // m_splitData.resizeDiscard((S64)(TASK_SIZE+1) * (S64)sizeof(SplitArray));
838 //#endif
839  m_splitData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
840 
841  // Node and triangle data
842 #ifndef INTERLEAVED_LAYOUT
843  S64 kdtreeSize = ((m_numTris * sizeof(CudaKdtreeNode)) + 4096 - 1) & -4096;
844  m_bvhData.resizeDiscard(kdtreeSize);
845  m_bvhData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
846  //m_bvhData.clearRange32(0, 0, kdtreeSize); // Mark all tasks as 0 (important for debug)
847 #ifndef COMPACT_LAYOUT
848  m_trisCompactOut.resizeDiscard(m_numTris*10 * 3 * sizeof(Vec4f));
849  m_trisIndexOut.resizeDiscard(m_numTris*10 * 3 * sizeof(S32));
850 #else
851 #ifdef DUPLICATE_REFERENCES
852  m_trisCompactOut.resizeDiscard(m_numTris*8 * (3+1) * sizeof(Vec4f));
853  m_trisIndexOut.resizeDiscard(m_numTris*8 * (3+1) * sizeof(S32));
854 #else
855  m_trisCompactOut.resizeDiscard(m_numTris * 3 * sizeof(Vec4f));
856  m_trisIndexOut.resizeDiscard(m_numTris*7 * (1+1) * sizeof(S32));
857 #endif
858 #endif
859 #else
860  // TODO: Rewrite this when double headed heap is realized
861  S64 kdtreeSize = ((m_numTris*5 * sizeof(CudaKdtreeNode) + m_numTris*10 * 3 * (sizeof(Vec4f)+sizeof(S32))) + 4096 - 1) & -4096;
862  //size_t f, t; cuMemGetInfo(&f, &t);
863  //S64 kdtreeSize = f & -4096;
864  m_bvhData.resizeDiscard(kdtreeSize);
865  m_bvhData.clearRange32(0, 0, kdtreeSize); // Mark all tasks as 0 (important for debug)
866 #endif
867  }
868 
869  // Build + trace
870  m_gpuTime = traceOnDemandKdtreeRayBuffer(rays, rebuild);
871  m_cpuTime = m_timer.end();
872 
873  // Save sizes of buffer so that they can be printed
874  if(rebuild)
875  saveBufferSizes();
876 
877  return m_gpuTime;
878 }
879 
880 void CudaPersistentBVHTracer::traceOnDemandTrace(RayBuffer& rays, F32& GPUmegakernel, F32& CPUmegakernel, F32& GPUtravKernel, F32& CPUtravKernel, int& buildNodes, RayStats* stats)
881 {
882  m_compiler.setCachePath("cudacache"); // On the first compilation the cache path becames absolute which kills the second compilation
883  m_compiler.setSourceFile(m_kernelFile);
884  m_module = m_compiler.compile();
885  failIfError();
886 
887  CUfunction kernel;
888  kernel = m_module->getKernel("build");
889  if (!kernel)
890  fail("Build kernel not found!");
891 
892  F32 tTrace, tTraceCPU;
893 #ifndef ONDEMAND_FULL_BUILD
894 #if 0
895  // Needed for BVH since the data has been erased by module switch
896  // Set BVH input.
897  KernelInputBVH& inBVH = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
898  inBVH.numTris = m_numTris;
899  inBVH.tris = m_trisCompact.getCudaPtr();
900  inBVH.trisIndex = m_trisIndex.getMutableCudaPtr();
901  //inBVH.trisBox = m_trisBox.getCudaPtr();
902  inBVH.ppsTrisBuf = m_ppsTris.getMutableCudaPtr();
903  inBVH.ppsTrisIndex = m_ppsTrisIndex.getMutableCudaPtr();
904  inBVH.sortTris = m_sortTris.getMutableCudaPtr();
905 #ifdef COMPACT_LAYOUT
906  inBVH.trisOut = m_trisCompactOut.getMutableCudaPtr();
907  inBVH.trisIndexOut = m_trisIndexOut.getMutableCudaPtr();
908 #endif
909 
910  // Set traversal input
911  CUdeviceptr nodePtr = m_bvhData.getCudaPtr();
912  CUdeviceptr triPtr = m_trisCompact.getCudaPtr();
913  Buffer& indexBuf = m_trisIndex;
914  Vec2i nodeOfsA = Vec2i(0, (S32)m_bvhData.getSize());
915  Vec2i triOfsA = Vec2i(0, (S32)m_trisCompact.getSize());
916 
917  KernelInput& in = *(KernelInput*)m_module->getGlobal("c_in").getMutablePtr();
918  m_timer.start();
919  in.numRays = rays.getSize();
920  in.anyHit = (rays.getNeedClosestHit() == false);
921  in.nodesA = nodePtr + nodeOfsA.x;
922  in.trisA = triPtr + triOfsA.x;
923  in.rays = rays.getRayBuffer().getCudaPtr();
924  in.results = rays.getResultBuffer().getMutableCudaPtr();
925  in.triIndices = indexBuf.getCudaPtr();
926 
927  m_module->setTexRef("t_rays", rays.getRayBuffer(), CU_AD_FORMAT_FLOAT, 4);
928  m_module->setTexRef("t_nodesA", m_bvhData, CU_AD_FORMAT_FLOAT, 4);
929  m_module->setTexRef("t_trisA", m_trisCompact, CU_AD_FORMAT_FLOAT, 4);
930  m_module->setTexRef("t_triIndices", m_trisIndex, CU_AD_FORMAT_SIGNED_INT32, 1);
931 #endif
932 
933  int numWarpsPerBlock = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numWarpsPerBlock");
934  int numBlocksPerSM = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numBlockPerSM");
935  Vec2i blockSize(WARP_SIZE, numWarpsPerBlock); // threadIdx.x must equal the thread lane in warp
936  int gridSizeX = NUM_SM*numBlocksPerSM;
937  Vec2i gridSize(gridSizeX, 1); // Number of SMs * Number of blocks?
938 
939  // Run the kernel as long as the traversal order does not change
940  TaskStackBVH& tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getMutablePtr();
941  int oldNodes;
942  int i = 0;
943  do
944  {
945  oldNodes = tasks.numNodes;
946 
947  m_timer.unstart();
948  m_timer.start();
949  // Launch trace until traversal path convergence.
950 #if 0
951  // Needed for BVH since the task has been erased by module switch
952  tasks.header = (int*)m_taskData.getMutableCudaPtr();
953  tasks.tasks = (TaskBVH*)m_taskData.getMutableCudaPtr(TASK_SIZE * sizeof(int));
954  tasks.nodeTop = 2;
955 #endif
956 
957  tasks.warpCounter = rays.getSize();
958  tasks.unfinished = -NUM_WARPS;
959  tasks.launchFlag = 1;
960 
961  // Launch.
962  tTrace = m_module->launchKernelTimed(kernel, blockSize, gridSize);
963  tTraceCPU = m_timer.end();
964  TaskStackBVH& tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getMutablePtr();
965  buildNodes += tasks.numNodes - oldNodes;
966 
967  //printf("Verify run %d in %f (%d -> %d)\n", i, tTrace, oldNodes, tasks.numNodes);
968  i++;
969  } while(oldNodes != tasks.numNodes);
970 
971  // Launch just trace.
972  /*tasks.warpCounter = rays.getSize();
973  tasks.unfinished = -NUM_WARPS;
974  tasks.launchFlag = 2;
975  tTrace = m_module->launchKernelTimed(kernel, blockSize, gridSize);*/
976 
977  GPUmegakernel += tTrace; // Save the final traversal time inside the megakernel
978  CPUmegakernel += tTraceCPU;
979 #endif
980 
981  //cout << "Verify trace in " << tTrace << "s" << "\n";
982 
983  /*TaskStackBVH& tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getMutablePtr();
984  tasks.warpCounter = rays.getSize();
985  tasks.unfinished = -NUM_WARPS;
986 
987  for(int i = 0; i < rays.getSize(); i++)
988  {
989  // Set ray result buffer
990  RayResult& res = rays.getMutableResultForSlot(i);
991  res.clear();
992 
993  // Set ray tmax
994  Ray& ray = rays.getMutableRayForSlot(i);
995  ray.tmax = defTmax;
996  }
997  rays.getResultBuffer().getMutableCudaPtr();
998  float tTrace = m_module->launchKernelTimed(kernel, blockSize, gridSize);
999  //cout << "Verify trace in " << tTrace << "s" << "\n";
1000  */
1001 
1002  if(m_kernelFile.endsWith("kdtree.cu"))
1003  tTrace = traceBatchKdtree(rays, stats);
1004  else
1005  tTrace = traceBatchBVH(rays, stats);
1006  //tTrace = -1.f;
1007  tTraceCPU = getCPUTime();
1008  m_compiler.setCachePath("cudacache"); // On the first compilation the cache path becames absolute which kills the second compilation
1009  m_compiler.setSourceFile(m_kernelFile);
1010  m_module = m_compiler.compile();
1011 
1012  //printf("Verify kernel %d in %f\n", i, tTrace);
1013 
1014  GPUtravKernel += tTrace;
1015  CPUtravKernel += tTraceCPU;
1016 }
1017 
1019 {
1020  float minTime = FLT_MAX;
1021  float sumTime = 0.f;
1022  const int numRepeats = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numRepeats");
1023 
1024 // Create the taskData
1025  m_taskData.resizeDiscard(TASK_SIZE * (sizeof(TaskBVH) + sizeof(int)));
1026 
1027  const int width = Environment::GetSingleton()->GetInt("Aila.width");
1028  const int height = Environment::GetSingleton()->GetInt("Aila.height");
1029 
1030  cout << "Prefix scan for problem size of " << width << "x" << height << " = " << width*height << "\n";
1031 
1032 #ifdef TEST_TASKS
1033  cout << "Testing task pool" << "\n";
1034 #else
1035  cout << "Testing thrust" << "\n";
1036 #endif
1037 
1038  for(int i = 0; i < numRepeats; i++)
1039  {
1040 //#ifdef TEST_TASKS
1041  float t = testSort(width*height);
1042  //float t = testSort(m_numTris);
1043 /*#else
1044  float t = testThrustScan(width*height);
1045 #endif*/
1046 
1047  printf("Run %d sort in %fs\n", i, t);
1048  minTime = min(t, minTime);
1049  sumTime += t;
1050  }
1051 
1052  printf("Minimum time from %d runs = %fs\n", numRepeats, minTime);
1053  printf("Average time from %d runs = %fs\n", numRepeats, sumTime/numRepeats);
1054 
1055  //size_t f, t; cuMemGetInfo(&f, &t); fprintf(stdout,"CUDA Memory allocated:%dMB free:%dMB\n",(t-f)/1048576,f/1048576);
1056  return minTime;
1057 }
1058 
1059 void CudaPersistentBVHTracer::updateConstants()
1060 {
1061  RtEnvironment& cudaEnv = *(RtEnvironment*)m_module->getGlobal("c_env").getMutablePtr();
1062 
1063  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.maxDepth", cudaEnv.optMaxDepth);
1064 
1065  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.planeSelectionOverhead", cudaEnv.optPlaneSelectionOverhead);
1066 
1067  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.ci", cudaEnv.optCi);
1068 
1069  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.ct", cudaEnv.optCt);
1070 
1071  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.ctr", cudaEnv.optCtr);
1072 
1073  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.ctt", cudaEnv.optCtt);
1074 
1075  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.triangleBasedWeight", cudaEnv.optTriangleBasedWeight);
1076 
1077  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.rayBasedWeight", cudaEnv.optRayBasedWeight);
1078 
1079  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.axisAlignedWeight", cudaEnv.optAxisAlignedWeight);
1080 
1081  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.cutOffDepth", cudaEnv.optCutOffDepth);
1082  m_cutOffDepth = cudaEnv.optCutOffDepth;
1083 
1084  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.rayLimit", cudaEnv.rayLimit);
1085 
1086  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.triLimit", cudaEnv.triLimit);
1087  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.triMaxLimit", cudaEnv.triMaxLimit);
1088 
1089  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.popCount", cudaEnv.popCount);
1090 
1091  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.granularity", cudaEnv.granularity);
1092 
1093  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.failRq", cudaEnv.failRq);
1094 
1095  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.failureCount", cudaEnv.failureCount);
1096 
1097  int siblingLimit;
1098  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.siblingLimit", siblingLimit);
1099  cudaEnv.siblingLimit = siblingLimit / WARP_SIZE;
1100 
1101  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.childLimit", cudaEnv.childLimit);
1102 
1103  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.subtreeLimit", cudaEnv.subtreeLimit);
1104 
1105  cudaEnv.subdivThreshold = (m_bbox.SurfaceArea() / (float)m_numRays) * ((float)cudaEnv.optCt/10.0f);
1106 
1107  cudaEnv.epsilon = m_epsilon;
1108  //cudaEnv.epsilon = 0.f;
1109 }
1110 
1111 //------------------------------------------------------------------------
1112 
1113 int CudaPersistentBVHTracer::warpSubtasks(int threads)
1114 {
1115  //return (threads + WARP_SIZE - 1) / WARP_SIZE;
1116  return max((threads + WARP_SIZE - 1) / WARP_SIZE, 1); // Do not create empty tasks - at least on warp gets to clean this task
1117 }
1118 
1119 //------------------------------------------------------------------------
1120 
1121 int CudaPersistentBVHTracer::floatToOrderedInt(float floatVal)
1122 {
1123  int intVal = *((int*)&floatVal);
1124  return (intVal >= 0) ? intVal : intVal ^ 0x7FFFFFFF;
1125 }
1126 
1127 /*unsigned int CudaPersistentBVHTracer::floatToOrderedInt(float floatVal)
1128 {
1129  unsigned int f = *((unsigned int*)&floatVal);
1130  unsigned int mask = -(int)(f >> 31) | 0x80000000;
1131  return f ^ mask;
1132 }*/
1133 
1134 //------------------------------------------------------------------------
1135 
1136 void CudaPersistentBVHTracer::allocateSnapshots(Buffer &snapData)
1137 {
1138  // Prepare snapshot memory
1139 #ifdef SNAPSHOT_POOL
1140  snapData.resizeDiscard(sizeof(PoolInfo)*SNAPSHOT_POOL);
1141  PoolInfo* &snapshots = *(PoolInfo**)m_module->getGlobal("g_snapshots").getMutablePtr();
1142  snapshots = (PoolInfo*)snapData.getMutableCudaPtr();
1143  snapData.clearRange32(0, 0, SNAPSHOT_POOL*sizeof(PoolInfo)); // Mark all tasks as empty (important for debug)
1144 #endif
1145 #ifdef SNAPSHOT_WARP
1146  snapData.resizeDiscard(sizeof(WarpInfo)*SNAPSHOT_WARP*NUM_WARPS);
1147  WarpInfo* &snapshots = *(WarpInfo**)m_module->getGlobal("g_snapshots").getMutablePtr();
1148  snapshots = (WarpInfo*)snapData.getMutableCudaPtr();
1149  snapData.clearRange32(0, 0, SNAPSHOT_WARP*NUM_WARPS*sizeof(WarpInfo)); // Mark all tasks as empty (important for debug)
1150 #endif
1151 }
1152 
1153 //------------------------------------------------------------------------
1154 
1155 void CudaPersistentBVHTracer::printSnapshots(Buffer &snapData)
1156 {
1157 #ifdef SNAPSHOT_POOL
1158  PoolInfo* snapshots = (PoolInfo*)snapData.getPtr();
1159 
1160  if(snapshots[SNAPSHOT_POOL-1].pool != 0) // Full
1161  printf("\aSnapshot memory full!\n");
1162 
1163  long long int clockMin = snapshots[0].clockStart;
1164  long long int clockMax = 0;
1165  for(int i = 0; i < SNAPSHOT_POOL; i++)
1166  {
1167  if(snapshots[i].pool == 0)
1168  {
1169  clockMax = snapshots[i-1].clockEnd;
1170  break;
1171  }
1172  }
1173 
1174  ofstream snapfile("plots\\pool\\activity.dat");
1175  snapfile << "Snap#\tpool\t#tasks\t#active\t#chunks\tdepth\tclocks" << "\n";
1176  for(int i = 0; i < SNAPSHOT_POOL; i++)
1177  {
1178  if(snapshots[i].pool == 0)
1179  break;
1180 
1181  snapfile << i << "\t" << snapshots[i].pool << "\t" << snapshots[i].tasks << "\t" << snapshots[i].active << "\t" << snapshots[i].chunks << "\t" << snapshots[i].depth
1182  << "\t" << snapshots[i].clockEnd - snapshots[i].clockStart << "\n";
1183  }
1184  snapfile.close();
1185 
1186  snapfile.open("plots\\pool\\activity_clockCor.dat");
1187  snapfile << "Snap#\tpool\t#tasks\t#active\t#chunks\tdepth\tclocks" << "\n";
1188  for(int i = 0; i < SNAPSHOT_POOL; i++)
1189  {
1190  if(snapshots[i].pool == 0)
1191  break;
1192 
1193  snapfile << (float)((long double)(snapshots[i].clockEnd - clockMin) / (long double)(clockMax - clockMin)) << "\t" << snapshots[i].pool << "\t" << snapshots[i].tasks << "\t"
1194  << snapshots[i].active << "\t" << snapshots[i].chunks << "\t" << snapshots[i].depth << "\t" << snapshots[i].clockEnd - snapshots[i].clockStart << "\n";
1195  }
1196 
1197  snapfile.close();
1198 #endif
1199 #ifdef SNAPSHOT_WARP
1200  WarpInfo* snapshots = (WarpInfo*)snapData.getPtr();
1201 
1202  for(int w = 0; w < NUM_WARPS; w++)
1203  {
1204  if(snapshots[SNAPSHOT_WARP-1].reads != 0) // Full
1205  printf("\aSnapshot memory full for warp %d!\n", w);
1206 
1207  ostringstream filename;
1208  filename.fill('0');
1209  filename << "plots\\warps\\warp" << setw(3) << w << ".dat";
1210  //cout << filename.str() << "\n";
1211  ofstream snapfile(filename.str());
1212 
1213  snapfile << "Snap#\t#reads\t#rays\t#tris\ttype(leaf=8)\t#chunks\tpopCount\tdepth\tcDequeue\tcCompute\tstackTop\ttaskIdx" << "\n";
1214  for(int i = 0; i < SNAPSHOT_WARP; i++)
1215  {
1216  if(snapshots[i].reads == 0)
1217  break;
1218 
1219  if(snapshots[i].clockDequeue < snapshots[i].clockSearch || snapshots[i].clockFinished < snapshots[i].clockDequeue)
1220  cout << "Error timer for warp " << w << "\n";
1221 
1222  snapfile << i << "\t" << snapshots[i].reads << "\t" << snapshots[i].rays << "\t" << snapshots[i].tris << "\t" << snapshots[i].type << "\t"
1223  << snapshots[i].chunks << "\t" << snapshots[i].popCount << "\t" << snapshots[i].depth << "\t" << (snapshots[i].clockDequeue - snapshots[i].clockSearch) << "\t"
1224  << (snapshots[i].clockFinished - snapshots[i].clockDequeue) << "\t" << snapshots[i].stackTop << "\t" << snapshots[i].idx << "\n";
1225  }
1226 
1227  snapfile.close();
1228  snapshots += SNAPSHOT_WARP; // Next warp
1229  }
1230 #endif
1231 }
1232 
1233 //------------------------------------------------------------------------
1234 
1235 void CudaPersistentBVHTracer::initPool(int numRays, Buffer* rayBuffer, Buffer* nodeBuffer)
1236 {
1237  // Prepare the task data
1238  updateConstants();
1239 #if PARALLELISM_TEST >= 0
1240  int& numActive = *(int*)m_module->getGlobal("g_numActive").getMutablePtr();
1241  numActive = 1;
1242 #endif
1243 
1244 #ifndef MALLOC_SCRATCHPAD
1245  // Set PPS buffers
1246  m_ppsTris.resizeDiscard(sizeof(int)*m_numTris);
1247  m_ppsTrisIndex.resizeDiscard(sizeof(int)*m_numTris);
1248  m_sortTris.resizeDiscard(sizeof(int)*m_numTris);
1249 
1250  if(numRays > 0)
1251  {
1252  m_ppsRays.resizeDiscard(sizeof(int)*numRays);
1253  m_ppsRaysIndex.resizeDiscard(sizeof(int)*numRays);
1254  m_sortRays.resizeDiscard(sizeof(int)*numRays);
1255  }
1256 #endif
1257 
1258 #if defined(SNAPSHOT_POOL) || defined(SNAPSHOT_WARP)
1259  // Prepare snapshot memory
1260  Buffer snapData;
1261  allocateSnapshots(snapData);
1262 #endif
1263 
1264  // Set all headers empty
1265 #ifdef TEST_TASKS
1266  m_taskData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
1267 #ifdef BENCHMARK
1268  m_taskData.clearRange32(0, TaskHeader_Empty, TASK_SIZE * sizeof(int)); // Mark all tasks as empty
1269 #else
1270  m_taskData.clearRange32(0, TaskHeader_Empty, TASK_SIZE * (sizeof(int)+sizeof(Task))); // Mark all tasks as empty (important for debug)
1271 #endif
1272 #endif
1273 
1274  // Increase printf output size so that more can fit
1275  //cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, 536870912);
1276 
1277  /*cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED); // Driver does not seem to care and preffers L1
1278  cuFuncSetCacheConfig(kernel, CU_FUNC_CACHE_PREFER_SHARED);
1279  CUfunc_cache test;
1280  cuCtxGetCacheConfig(&test);
1281  if(test != CU_FUNC_CACHE_PREFER_SHARED)
1282  printf("Error\n");*/
1283 
1284  // Set texture references.
1285  if(rayBuffer != NULL)
1286  {
1287  m_module->setTexRef("t_rays", *rayBuffer, CU_AD_FORMAT_FLOAT, 4);
1288  }
1289  if(nodeBuffer != NULL)
1290  {
1291  m_module->setTexRef("t_nodesA", *nodeBuffer, CU_AD_FORMAT_FLOAT, 4);
1292  }
1293  m_module->setTexRef("t_trisA", m_trisCompact, CU_AD_FORMAT_FLOAT, 4);
1294  m_module->setTexRef("t_triIndices", m_trisIndex, CU_AD_FORMAT_SIGNED_INT32, 1);
1295 
1296 /*#ifdef COMPACT_LAYOUT
1297  if(numRays == 0)
1298  {
1299  m_module->setTexRef("t_trisAOut", m_trisCompactOut, CU_AD_FORMAT_FLOAT, 4);
1300  m_module->setTexRef("t_triIndicesOut", m_trisIndexOut, CU_AD_FORMAT_SIGNED_INT32, 1);
1301  }
1302 #endif*/
1303 }
1304 
1305 //------------------------------------------------------------------------
1306 
1307 void CudaPersistentBVHTracer::deinitPool(int numRays)
1308 {
1309  m_ppsTris.reset();
1310  m_ppsTrisIndex.reset();
1311  m_sortTris.reset();
1312 
1313  if(numRays > 0)
1314  {
1315  m_ppsRays.reset();
1316  m_ppsRaysIndex.reset();
1317  m_sortRays.reset();
1318  }
1319 }
1320 
1321 //------------------------------------------------------------------------
1322 
1323 void CudaPersistentBVHTracer::printPoolHeader(TaskStackBase* tasks, int* header, int numWarps, FW::String state)
1324 {
1325 #if PARALLELISM_TEST >= 0
1326  numActive = *(int*)m_module->getGlobal("g_numActive").getPtr();
1327  printf("Active: %d\n", numActive);
1328 #endif
1329 
1330 
1331 #if defined(SNAPSHOT_POOL) || defined(SNAPSHOT_WARP)
1332  printSnapshots(snapData);
1333 #endif
1334 
1335 #ifdef DEBUG_INFO
1336  Debug << "\nPRINTING DEBUG_INFO STATISTICS" << "\n\n";
1337 #else
1338  Debug << "\nPRINTING STATISTICS" << "\n\n";
1339 #endif
1340 
1341  float4* debugData = (float4*)m_debug.getPtr();
1342  float minAll[4] = {MAX_FLOAT, MAX_FLOAT, MAX_FLOAT, MAX_FLOAT};
1343  float maxAll[4] = {0, 0, 0, 0};
1344  float sumAll[4] = {0, 0, 0, 0};
1345  int countDead = 0;
1346  Debug << "Warp No. cnt_task_queues Avg. #Reads Max #Reads #Restarts" << "\n";
1347  for(int i = 0; i < numWarps; i++)
1348  {
1349  Debug << "Warp " << i << ": (" << debugData[i].x << ", " << debugData[i].y << ", " << debugData[i].z << ", " << debugData[i].w << ")" << "\n";
1350 
1351  //fabs is because we do not care whether the warp stopped prematurely or not
1352  minAll[0] = min(fabs(debugData[i].x), minAll[0]);
1353  minAll[1] = min(fabs(debugData[i].y), minAll[1]);
1354  minAll[2] = min(fabs(debugData[i].z), minAll[2]);
1355  minAll[3] = min(fabs(debugData[i].w), minAll[3]);
1356 
1357  maxAll[0] = max(fabs(debugData[i].x), maxAll[0]);
1358  maxAll[1] = max(fabs(debugData[i].y), maxAll[1]);
1359  maxAll[2] = max(fabs(debugData[i].z), maxAll[2]);
1360  maxAll[3] = max(fabs(debugData[i].w), maxAll[3]);
1361 
1362  sumAll[0] += fabs(debugData[i].x);
1363  sumAll[1] += fabs(debugData[i].y);
1364  sumAll[2] += fabs(debugData[i].z);
1365  sumAll[3] += fabs(debugData[i].w);
1366 
1367  if(debugData[i].x < 0)
1368  countDead++;
1369  }
1370  Debug << "Dead=" << countDead << " / All=" << numWarps << " = " << (float)countDead/(float)numWarps << "\n";
1371  Debug << "Min: " << minAll[0] << ", " << minAll[1] << ", " << minAll[2] << ", " << minAll[3] << "\n";
1372  Debug << "Max: " << maxAll[0] << ", " << maxAll[1] << ", " << maxAll[2] << ", " << maxAll[3] << "\n";
1373  Debug << "Sum: " << sumAll[0] << ", " << sumAll[1] << ", " << sumAll[2] << ", " << sumAll[3] << "\n";
1374  Debug << "Avg: " << sumAll[0]/numWarps << ", " << sumAll[1]/numWarps << ", " << sumAll[2]/numWarps << ", " << sumAll[3]/numWarps << "\n\n" << "\n";
1375  Debug << "cnt_task_queues per object = " << sumAll[0]/(float)m_numTris << "\n";
1376 
1377  Debug << "Pool" << "\n";
1378  Debug << "Top = " << tasks->top << "; Bottom = " << tasks->bottom << "; Unfinished = " << tasks->unfinished << "; Size = " << tasks->sizePool << "; ";
1379  Debug << state.getPtr() << "\n";
1380  Debug << "ActiveTop = " << tasks->activeTop << "; Active = ";
1381  for(int i = 0; i < ACTIVE_MAX+1; i++)
1382  Debug << tasks->active[i] << " ";
1383  Debug << "\n" << "\n";
1384  Debug << "EmptyTop = " << tasks->emptyTop << "; EmptyBottom = " << tasks->emptyBottom << "\nEmpty\n";
1385  for(int i = 0; i < EMPTY_MAX+1; i++)
1386  {
1387  if(i % 50 == 0)
1388  Debug << "\n";
1389  else
1390  Debug << " ";
1391  Debug << tasks->empty[i];
1392  }
1393 
1394  /*for(int i = 0; i < EMPTY_MAX+1; i++)
1395  Debug << tasks->empty[i] << " ";*/
1396  Debug << "\n" << "\n";
1397 
1398  int emptyItems = 0;
1399  int bellowEmpty = 0;
1400  Debug << "Header" << "\n";
1401  for(int i = 0; i < TASK_SIZE; i++)
1402  {
1403  if(i % 50 == 0)
1404  Debug << "\n";
1405  else
1406  Debug << " ";
1407  if(header[i] != TaskHeader_Empty)
1408  {
1409  Debug << header[i];
1410  }
1411  else
1412  {
1413  Debug << TaskHeader_Active;
1414  if(i < tasks->top)
1415  emptyItems++;
1416  }
1417 
1418  if(header[i] < TaskHeader_Empty)
1419  bellowEmpty++;
1420  }
1421 
1422  Debug << "\n\nEmptyItems = " << emptyItems << "\n";
1423  Debug << "BellowEmpty = " << bellowEmpty << "\n";
1424 }
1425 
1426 //------------------------------------------------------------------------
1427 
1428 void CudaPersistentBVHTracer::printPool(TaskStackBVH &tasks, int numWarps)
1429 {
1430 #ifdef LEAF_HISTOGRAM
1431  printf("Leaf histogram\n");
1432  unsigned int leafSum = 0;
1433  unsigned int triSum = 0;
1434  for(S32 i = 0; i <= Environment::GetSingleton()->GetInt("SubdivisionRayCaster.triLimit"); i++)
1435  {
1436  printf("%d: %d\n", i, tasks.leafHist[i]);
1437  leafSum += tasks.leafHist[i];
1438  triSum += i*tasks.leafHist[i];
1439  }
1440  printf("Leafs total %d, average leaf %.2f\n", leafSum, (float)triSum/(float)leafSum);
1441 #endif
1442 
1443  int* header = (int*)m_taskData.getPtr();
1444  FW::String state = sprintf("BVH Top = %d; Tri Top = %d; Warp counter = %d; ", tasks.nodeTop, tasks.triTop, tasks.warpCounter);
1445 #ifdef BVH_COUNT_NODES
1446  state.appendf("Number of inner nodes = %d; Number of leaves = %d; Sorted tris = %d; ", tasks.numNodes, tasks.numLeaves, tasks.numSortedTris);
1447 #endif
1448  printPoolHeader(&tasks, header, numWarps, state);
1449 
1450  Debug << "\n\nTasks" << "\n";
1451  TaskBVH* task = (TaskBVH*)m_taskData.getPtr(TASK_SIZE*sizeof(int));
1452  int stackMax = 0;
1453  int maxDepth = 0;
1454  int syncCount = 0;
1455  int maxTaskId = -1;
1456  long double sumTris = 0;
1457  long double maxTris = 0;
1458 
1459  int sortTasks = 0;
1460  long double cntSortTris = 0;
1461 
1462  int subFailed = 0;
1463 
1464 #ifdef DEBUG_INFO
1465  char terminatedNames[TerminatedBy_Max][255] = {
1466  "None", "Depth","TotalLimit","OverheadLimit","Cost","FailureCounter"
1467  };
1468 
1469  int terminatedBy[TerminatedBy_Max];
1470  memset(&terminatedBy,0,sizeof(int)*TerminatedBy_Max);
1471 #endif
1472 
1473  for(int i = 0; i < TASK_SIZE; i++)
1474  {
1475  if(task[i].nodeIdx != TaskHeader_Empty || task[i].parentIdx != TaskHeader_Empty)
1476  {
1477 #ifdef DEBUG_INFO
1478  _ASSERT(task[i].terminatedBy >= 0 && task[i].terminatedBy < TerminatedBy_Max);
1479  terminatedBy[ task[i].terminatedBy ]++;
1480 #endif
1481 
1482  Debug << "Task " << i << "\n";
1483  Debug << "Header: " << header[i] << "\n";
1484  Debug << "Unfinished: " << task[i].unfinished << "\n";
1485  Debug << "Type: " << task[i].type << "\n";
1486  Debug << "TriStart: " << task[i].triStart << "\n";
1487  Debug << "TriLeft: " << task[i].triLeft << "\n";
1488  Debug << "TriRight: " << task[i].triRight << "\n";
1489  Debug << "TriEnd: " << task[i].triEnd << "\n";
1490  Debug << "ParentIdx: " << task[i].parentIdx << "\n";
1491  Debug << "NodeIdx: " << task[i].nodeIdx << "\n";
1492  Debug << "TaskID: " << task[i].taskID << "\n";
1493  Debug << "Split: (" << task[i].splitPlane.x << ", " << task[i].splitPlane.y << ", " << task[i].splitPlane.z << ", " << task[i].splitPlane.w << ")\n";
1494  Debug << "Box: (" << task[i].bbox.m_mn.x << ", " << task[i].bbox.m_mn.y << ", " << task[i].bbox.m_mn.z << ") - ("
1495  << task[i].bbox.m_mx.x << ", " << task[i].bbox.m_mx.y << ", " << task[i].bbox.m_mx.z << ")\n";
1496  //Debug << "BoxLeft: (" << task[i].bboxLeft.m_mn.x << ", " << task[i].bboxLeft.m_mn.y << ", " << task[i].bboxLeft.m_mn.z << ") - ("
1497  // << task[i].bboxLeft.m_mx.x << ", " << task[i].bboxLeft.m_mx.y << ", " << task[i].bboxLeft.m_mx.z << ")\n";
1498  //Debug << "BoxRight: (" << task[i].bboxRight.m_mn.x << ", " << task[i].bboxRight.m_mn.y << ", " << task[i].bboxRight.m_mn.z << ") - ("
1499  // << task[i].bboxRight.m_mx.x << ", " << task[i].bboxRight.m_mx.y << ", " << task[i].bboxRight.m_mx.z << ")\n";
1500  Debug << "Axis: " << task[i].axis << "\n";
1501  Debug << "Depth: " << task[i].depth << "\n";
1502  Debug << "Step: " << task[i].step << "\n";
1503 #ifdef DEBUG_INFO
1504  //Debug << "Step: " << task[i].step << "\n";
1505  //Debug << "Lock: " << task[i].lock << "\n";
1506 #ifdef MALLOC_SCRATCHPAD
1507  Debug << "SubFailure: " << task[i].subFailureCounter << "\n";
1508 #endif
1509  Debug << "GMEMSync: " << task[i].sync << "\n";
1510  Debug << "Parent: " << task[i].parent << "\n";
1511 #endif
1512 
1513 #ifdef DEBUG_INFO
1514  Debug << "TerminatedBy: " << task[i].terminatedBy << "\n";
1515 #endif
1516  if(task[i].terminatedBy != TerminatedBy_None)
1517  Debug << "Triangles: " << task[i].triEnd - task[i].triStart << "\n";
1518 
1519  Debug << "\n";
1520  stackMax = i;
1521 
1522  if(header[i] > (int)0xFF800000) // Not waiting
1523  {
1524 #ifdef CUTOFF_DEPTH
1525  if(task[i].depth == m_cutOffDepth)
1526  {
1527 #endif
1528  long double tris = task[i].triEnd - task[i].triStart;
1529  if(task[i].terminatedBy != TerminatedBy_None)
1530  {
1531  if(tris > maxTris)
1532  {
1533  maxTris = tris;
1534  maxTaskId = i;
1535  }
1536  sumTris += tris;
1537  }
1538  sortTasks++;
1539  cntSortTris += tris;
1540 #ifdef CUTOFF_DEPTH
1541  }
1542 #endif
1543 
1544 #ifdef DEBUG_INFO
1545  maxDepth = max(task[i].depth, maxDepth);
1546  syncCount += task[i].sync;
1547 #endif
1548  }
1549  }
1550  }
1551 
1552  if(stackMax == TASK_SIZE-1)
1553  printf("\aIncomplete result!\n");
1554 #ifdef CUTOFF_DEPTH
1555  Debug << "\n\nStatistics for cutoff depth " << m_cutOffDepth << "\n\n";
1556 #else
1557  Debug << "\n\n";
1558 #endif
1559 
1560 #ifdef DEBUG_INFO
1561  Debug << "Avg naive task height (tris) = " << sumTris/(long double)sortTasks << "\n";
1562  Debug << "Max naive task height (tris) = " << maxTris << ", taskId: " << maxTaskId << "\n";
1563  Debug << "Cnt sorted operations = " << sortTasks << "\n";
1564  double cntTrisLog2Tris = (double(m_numTris) * (double)(logf(m_numTris)/logf(2.0f)));
1565  Debug << "Cnt sorted triangles = " << cntSortTris << "\n";
1566  Debug << "Cnt sorted triangles/(N log N), N=#tris = " << cntSortTris/cntTrisLog2Tris << "\n";
1567  Debug << "\n";
1568  Debug << "Max task depth = " << maxDepth << "\n";
1569  Debug << "Cnt gmem synchronizations: " << syncCount << "\n";
1570  Debug << "Leafs failed to subdivide = " << subFailed << " (*3) => total useless tasks " << subFailed * 3 << "\n";
1571  Debug << "Terminated by:" << "\n";
1572  for(int i = 0; i < TerminatedBy_Max; i++)
1573  {
1574  Debug << terminatedNames[i] << ": " << terminatedBy[i] << "\n";
1575  }
1576 #endif
1577 
1578  Debug << "max_queue_length = " << stackMax << "\n\n" << "\n";
1579 }
1580 
1581 //------------------------------------------------------------------------
1582 
1583 void CudaPersistentBVHTracer::printPool(TaskStack &tasks, int numWarps)
1584 {
1585  tasks = *(TaskStack*)m_module->getGlobal("g_taskStack").getPtr();
1586  int* header = (int*)m_taskData.getPtr();
1587  printPoolHeader(&tasks, header, numWarps, FW::sprintf(""));
1588 
1589  Debug << "\n\nTasks" << "\n";
1590  Task* task = (Task*)m_taskData.getPtr(TASK_SIZE*sizeof(int));
1591  int stackMax = 0;
1592  int maxDepth = 0;
1593  int syncCount = 0;
1594  int maxTaskId = -1;
1595  int rayIssues = 0;
1596  int triIssues = 0;
1597  long double sumRays = 0;
1598  long double maxRays = 0;
1599  long double sumTris = 0;
1600  long double maxTris = 0;
1601 
1602  int isectTasks = 0;
1603  long double cntIsect = 0;
1604  long double maxIsect = 0;
1605  long double clippedIsect = 0;
1606 
1607  int sortTasks = 0;
1608  long double cntSortRays = 0;
1609  long double cntClippedRays = 0;
1610  long double cntSortTris = 0;
1611 
1612  int subFailed = 0;
1613  int failureCount = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.failureCount");
1614 
1615 #ifdef DEBUG_INFO
1616  char terminatedNames[TerminatedBy_Max][255] = {
1617  "None", "Depth","TotalLimit","OverheadLimit","Cost","FailureCounter"
1618  };
1619 
1620  int terminatedBy[TerminatedBy_Max];
1621  memset(&terminatedBy,0,sizeof(int)*TerminatedBy_Max);
1622 #endif
1623 
1624  for(int i = 0; i < TASK_SIZE; i++)
1625  {
1626  if(task[i].depend1 != TaskHeader_Empty || task[i].depend2 != TaskHeader_Empty)
1627  {
1628 #ifdef DEBUG_INFO
1629  _ASSERT(task[i].terminatedBy >= 0 && task[i].terminatedBy < TerminatedBy_Max);
1630  terminatedBy[ task[i].terminatedBy ]++;
1631 #endif
1632 
1633  Debug << "Task " << i << "\n";
1634  Debug << "Header: " << header[i] << "\n";
1635  Debug << "Unfinished: " << task[i].unfinished << "\n";
1636  Debug << "Type: " << task[i].type << "\n";
1637  Debug << "RayStart: " << task[i].rayStart << "\n";
1638  Debug << "RayEnd: " << task[i].rayEnd << "\n";
1639  if(task[i].type != TaskType_Intersect) // Splitted
1640  {
1641  Debug << "RayLeft: " << task[i].rayLeft << "\n";
1642  Debug << "RayRight: " << task[i].rayRight << "\n";
1643  Debug << "RayActive: " << task[i].rayActive << "\n";
1644  }
1645 #ifdef CLIP_INTERSECT
1646  if(task[i].type == TaskType_Intersect)
1647  Debug << "RayActive: " << task[i].rayActive << "\n";
1648 #endif
1649  Debug << "TriStart: " << task[i].triStart << "\n";
1650  Debug << "TriEnd: " << task[i].triEnd << "\n";
1651  if(task[i].type != TaskType_Intersect) // Splitted
1652  {
1653  //Debug << "BestOrder: " << task[i].bestOrder << "\n";
1654  Debug << "TriLeft: " << task[i].triLeft << "\n";
1655  Debug << "TriRight: " << task[i].triRight << "\n";
1656  }
1657  Debug << "Depend1: " << task[i].depend1 << "\n";
1658  Debug << "Depend2: " << task[i].depend2 << "\n";
1659  if(task[i].type != TaskType_Intersect) // Splitted
1660  {
1661  Debug << "Split: (" << task[i].splitPlane.x << ", " << task[i].splitPlane.y << ", " << task[i].splitPlane.z << ", " << task[i].splitPlane.w << ")\n";
1662  }
1663  Debug << "Box: (" << task[i].bbox.m_mn.x << ", " << task[i].bbox.m_mn.y << ", " << task[i].bbox.m_mn.z << ") - ("
1664  << task[i].bbox.m_mx.x << ", " << task[i].bbox.m_mx.y << ", " << task[i].bbox.m_mx.z << ")\n";
1665  //Debug << "BoxLeft: (" << task[i].bboxLeft.m_mn.x << ", " << task[i].bboxLeft.m_mn.y << ", " << task[i].bboxLeft.m_mn.z << ") - ("
1666  // << task[i].bboxLeft.m_mx.x << ", " << task[i].bboxLeft.m_mx.y << ", " << task[i].bboxLeft.m_mx.z << ")\n";
1667  //Debug << "BoxMiddle (" << task[i].bboxMiddle.m_mn.x << ", " << task[i].bboxMiddle.m_mn.y << ", " << task[i].bboxMiddle.m_mn.z << ") - ("
1668  // << task[i].bboxMiddle.m_mx.x << ", " << task[i].bboxMiddle.m_mx.y << ", " << task[i].bboxMiddle.m_mx.z << ")\n";
1669  //Debug << "BoxRight: (" << task[i].bboxRight.m_mn.x << ", " << task[i].bboxRight.m_mn.y << ", " << task[i].bboxRight.m_mn.z << ") - ("
1670  // << task[i].bboxRight.m_mx.x << ", " << task[i].bboxRight.m_mx.y << ", " << task[i].bboxRight.m_mx.z << ")\n";
1671  Debug << "Depth: " << task[i].depth << "\n";
1672 #ifdef DEBUG_INFO
1673  //Debug << "Step: " << task[i].step << "\n";
1674  //Debug << "Lock: " << task[i].lock << "\n";
1675  Debug << "SubFailure: " << task[i].subFailureCounter << "\n";
1676  Debug << "GMEMSync: " << task[i].sync << "\n";
1677  Debug << "TaskID: " << task[i].taskID << "\n";
1678  Debug << "Parent: " << task[i].parent << "\n";
1679 #if AABB_TYPE < 3
1680  if(task[i].type == TaskType_AABB_Max)
1681 #elif AABB_TYPE == 3
1682  if(task[i].type == TaskType_AABB)
1683 #endif
1684  {
1685  Debug << "SubtaskIdx: " << task[i].subtaskIdx << "\n";
1686  Debug << "Clipped rays: " << task[i].rayEnd-task[i].rayActive << "\n";
1687  }
1688 #endif
1689 
1690 #ifdef CUTOFF_DEPTH
1691  if(task[i].depth == m_cutOffDepth)
1692 #endif
1693  if(task[i].type == TaskType_Intersect)
1694  {
1695 #ifdef CLIP_INTERSECT
1696  long double locRays = task[i].rayActive - task[i].rayStart;
1697 #else
1698  long double locRays = task[i].rayEnd - task[i].rayStart;
1699 #endif
1700  long double locTris = task[i].triEnd - task[i].triStart;
1701  Debug << "Intersections: " << locRays * locTris << "\n";
1702  //if(locRays > 1000 || locTris > 1000 )
1703  {
1704  if( locRays < sqrt((double)locTris) )
1705  triIssues++;
1706  if( locTris < sqrt((double)locRays) )
1707  rayIssues++;
1708  }
1709 
1710  Debug << "ClippedIntersections: " << task[i].clippedRays * locTris << "\n";
1711  clippedIsect += task[i].clippedRays * locTris;
1712  }
1713 
1714 #ifdef ONE_WARP_RUN
1715  //Debug << "Clock: " << task[i].clockEnd - task[i].clockStart << "\n";
1716  Debug << "Clock: " << task[i].clockEnd << "\n";
1717 #endif
1718 #ifdef DEBUG_INFO
1719  Debug << "TerminatedBy: " << task[i].terminatedBy << "\n";
1720 #endif
1721 
1722  Debug << "\n";
1723  stackMax = i;
1724 
1725 #ifdef CUTOFF_DEPTH
1726  if(task[i].depth == m_cutOffDepth)
1727  {
1728 #endif
1729 
1730 #ifdef CLIP_INTERSECT
1731  long double rays = task[i].rayActive - task[i].rayStart;
1732 #else
1733  long double rays = task[i].rayEnd - task[i].rayStart;
1734 #endif
1735 
1736  long double tris = task[i].triEnd - task[i].triStart;
1737  if(task[i].type == TaskType_Intersect)
1738  {
1739  isectTasks++;
1740  cntIsect += rays*tris;
1741  maxIsect = max<long double>(rays*tris, maxIsect);
1742  if(maxIsect==(rays*tris)) maxTaskId = i;
1743  sumRays += rays;
1744  maxRays = max<long double>(rays, maxRays);
1745  sumTris += tris;
1746  maxTris = max<long double>(tris, maxTris);
1747  if(task[i].subFailureCounter > failureCount)
1748  subFailed++;
1749  }
1750 #if AABB_TYPE < 3
1751  if(task[i].type == TaskType_AABB_Max)
1752 #elif AABB_TYPE == 3
1753  if(task[i].type == TaskType_AABB)
1754 #endif
1755  {
1756  sortTasks++;
1757  cntSortRays += rays;
1758  cntClippedRays += task[i].rayEnd-task[i].rayActive;
1759  cntSortTris += tris;
1760  }
1761 #ifdef CUTOFF_DEPTH
1762  }
1763 #endif
1764 
1765 #ifdef DEBUG_INFO
1766  maxDepth = max(task[i].depth, maxDepth);
1767  syncCount += task[i].sync;
1768 #endif
1769  }
1770  }
1771 
1772  if(stackMax == TASK_SIZE-1)
1773  printf("\aIncomplete result!\n");
1774 #ifdef CUTOFF_DEPTH
1775  Debug << "\n\nStatistics for cutoff depth " << m_cutOffDepth << "\n\n";
1776 #else
1777  Debug << "\n\n";
1778 #endif
1779 
1780 #ifdef DEBUG_INFO
1781  Debug << "ray_obj_intersections per ray = " << cntIsect/m_numRays << "\n";
1782  Debug << "cnt_leaves = " << isectTasks << "\n";
1783  Debug << "cnt_leaves per obj = " << (float)isectTasks/(float)m_numTris << "\n";
1784  Debug << "ray_obj_intersections = " << cntIsect << "\n";
1785  Debug << "Useless ray_obj_intersections = " << clippedIsect << "\n";
1786  Debug << "Avg ray_obj_intersections per leaf = " << cntIsect/(long double)isectTasks << "\n";
1787  Debug << "Max ray_obj_intersections per leaf = " << maxIsect << ", taskId: " << maxTaskId << "\n";
1788  Debug << "reduction [%] = " << 100.0f * (cntIsect/((long double)m_numRays*(long double)m_numTris)) << "\n";
1789  Debug << "Avg naive task width (rays) = " << sumRays/(long double)isectTasks << "\n";
1790  Debug << "Max naive task width (rays) = " << maxRays << "\n";
1791  Debug << "Avg naive task height (tris) = " << sumTris/(long double)isectTasks << "\n";
1792  Debug << "Max naive task height (tris) = " << maxTris << "\n";
1793  Debug << "Cnt sorted operations = " << sortTasks << "\n";
1794  double cntTrisLog2Tris = (double(m_numTris) * (double)(logf(m_numTris)/logf(2.0f)));
1795  double cntRaysLog2Tris = (double(m_numRays) * (double)(logf(m_numTris)/logf(2.0f)));
1796  Debug << "Cnt sorted triangles = " << cntSortTris << "\n";
1797  Debug << "Cnt sorted triangles/(N log N), N=#tris = " << cntSortTris/cntTrisLog2Tris << "\n";
1798  Debug << "Cnt sorted rays = " << cntSortRays << " BEFORE CLIPPING\n";
1799  Debug << "Cnt sorted rays/(log N)/R, N=#tris,R=#rays = " << cntSortRays/cntRaysLog2Tris << " BEFORE CLIPPING\n";
1800  Debug << "Cnt clipped rays = " << cntClippedRays << "\n";
1801  Debug << "\n";
1802  Debug << "Max task depth = " << maxDepth << "\n";
1803  Debug << "Cnt gmem synchronizations: " << syncCount << "\n";
1804  Debug << "Ray issues = " << rayIssues << ", tris issues = " << triIssues << "\n";
1805  Debug << "Leafs failed to subdivide = " << subFailed << " (*3) => total useless tasks " << subFailed * 3 << "\n";
1806 
1807  Debug << "Terminated by:" << "\n";
1808  for(int i = 0; i < TerminatedBy_Max; i++)
1809  {
1810  Debug << terminatedNames[i] << ": " << terminatedBy[i] << "\n";
1811  }
1812 #endif
1813 
1814  Debug << "max_queue_length = " << stackMax << "\n\n" << "\n";
1815 }
1816 
1817 //------------------------------------------------------------------------
1818 
1819 F32 CudaPersistentBVHTracer::traceCudaRayBuffer(RayBuffer& rb)
1820 {
1821  CUfunction kernel;
1822 #ifdef TEST_TASKS
1823  kernel = m_module->getKernel("trace");
1824  if (!kernel)
1825  fail("Trace kernel not found!");
1826 #endif
1827 
1828  // Prepare the task data
1829  initPool(m_numRays, &rb.getRayBuffer());
1830 
1831  // Set input.
1832  KernelInputNoStruct& in = *(KernelInputNoStruct*)m_module->getGlobal("c_ns_in").getMutablePtr();
1833  in.numRays = m_numRays;
1834  in.numTris = m_numTris;
1835  in.anyHit = !rb.getNeedClosestHit();
1836  in.rays = rb.getRayBuffer().getMutableCudaPtr();
1837  in.results = rb.getResultBuffer().getMutableCudaPtr();
1838  in.tris = m_trisCompact.getCudaPtr();
1839  in.trisIndex = m_trisIndex.getMutableCudaPtr();
1840  in.raysIndex = m_raysIndex.getMutableCudaPtr();
1841  in.ppsRaysBuf = m_ppsRays.getMutableCudaPtr();
1842  in.ppsTrisBuf = m_ppsTris.getMutableCudaPtr();
1843  in.ppsRaysIndex = m_ppsRaysIndex.getMutableCudaPtr();
1844  in.ppsTrisIndex = m_ppsTrisIndex.getMutableCudaPtr();
1845  in.sortRays = m_sortRays.getMutableCudaPtr();
1846  in.sortTris = m_sortTris.getMutableCudaPtr();
1847 
1848 
1849 #ifndef TEST_TASKS
1850  kernel = m_module->getKernel("__naive");
1851  if (!kernel)
1852  fail("Trace kernel not found!");
1853 
1854  Vec2i blockSizeN(1024, 1);
1855  Vec2i gridSizeN((m_numRays+1023)/1024, 1);
1856 
1857  float tNaive = m_module->launchKernelTimed(kernel, blockSizeN, gridSizeN);
1858 
1859  printf("Verifying GPU trace\n");
1860  /*for(int i = 0; i < m_numRays; i++)
1861  {
1862  const RayResult& res = rb.getResultForSlot(i);
1863  Debug << "Ray " << i << "\tGPU naive: id=" << res.id << ", t=" << res.t << "\n";
1864  }*/
1865 
1866  return tNaive;
1867 #endif
1868 
1869 
1870 #if SPLIT_TYPE == 3
1871  m_splitData.clearRange(0, 0, sizeof(SplitInfo)); // Set first split to zeros
1872  // Prepare split stack
1873  SplitInfo* &splits = *(SplitInfo**)m_module->getGlobal("g_splitStack").getMutablePtr();
1874  splits = (SplitInfo*)m_splitData.getMutableCudaPtr();
1875 #endif
1876 
1877  CudaAABB bbox;
1878  memcpy(&bbox.m_mn, &m_bbox.min, sizeof(float3));
1879  memcpy(&bbox.m_mx, &m_bbox.max, sizeof(float3));
1880 
1881  // Set parent task containing all the work
1882  Task all;
1883  all.rayStart = 0;
1884  all.rayLeft = 0;
1885  all.rayRight = m_numRays;
1886  all.rayEnd = m_numRays;
1887  all.triStart = 0;
1888  all.triLeft = 0;
1889  all.triRight = m_numTris;
1890  all.triEnd = m_numTris;
1891  all.bbox = bbox;
1892  all.step = 0;
1893  all.depend1 = DependType_Root;
1894  all.depend2 = DependType_None; // Only one task is dependent on this one - the unfinished counter
1895  all.lock = LockType_Free;
1896  all.bestCost = 1e38f;
1897  all.depth = 0;
1898  all.subFailureCounter = 0;
1899  Vector3 size = m_bbox.Diagonal();
1900  all.axis = size.MajorAxis();
1901  all.terminatedBy = TerminatedBy_None;
1902 #ifdef DEBUG_INFO
1903  all.sync = 0;
1904  all.parent = -1;
1905  all.taskID = 0;
1906  all.clippedRays = 0;
1907  all.clockStart = 0;
1908  all.clockEnd = 0;
1909 #endif
1910 
1911 #if SPLIT_TYPE == 0
1912 #if SCAN_TYPE == 0
1913  all.type = TaskType_Sort_PPS1;
1914 #elif SCAN_TYPE == 1
1915  all.type = TaskType_Sort_PPS1_Up;
1916 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
1917  all.type = TaskType_Sort_SORT1;
1918 #endif
1919 
1920  all.unfinished = warpSubtasks(m_numRays) + warpSubtasks(m_numTris);
1921  all.bestOrder = warpSubtasks(m_numRays);
1922  float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
1923  if(all.axis == 0)
1924  all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
1925  else if(all.axis == 1)
1926  all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
1927  else
1928  all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
1929 #else
1930  all.type = TaskType_Split;
1931 #if SPLIT_TYPE == 1
1932  int evaluatedCandidates = (int)sqrtf(m_numRays) + (int)sqrtf(m_numTris);
1933  int numPlanes = 0.5f * (m_numRays + m_numTris)/evaluatedCandidates;
1934  all.unfinished = warpSubtasks(numPlanes); // This must be the same as in the GPU code
1935 #elif SPLIT_TYPE == 2
1936  all.unfinished = 1;
1937 #elif SPLIT_TYPE == 3
1938  all.type = TaskType_SplitParallel;
1939  int evaluatedRays = warpSubtasks((int)sqrtf(m_numRays));
1940  int evaluatedTris = warpSubtasks((int)sqrtf(m_numTris));
1941  all.unfinished = PLANE_COUNT*(evaluatedRays+evaluatedTris); // Each WARP_SIZE rays and tris add their result to one plane
1942 #endif
1943 #endif
1944 
1945 #ifdef DEBUG_PPS
1946  all.type = TaskType_Sort_PPS1_Up;
1947  int pRays = warpSubtasks(m_numRays);
1948  all.bestOrder = pRays;
1949  int pTris = warpSubtasks(m_numTris);
1950  all.unfinished = pRays+pTris;
1951 #endif
1952 
1953  all.origSize = all.unfinished;
1954 
1955  m_taskData.setRange(TASK_SIZE * sizeof(int), &all, sizeof(Task)); // Set the first task
1956 
1957  // Set parent task header
1958  m_taskData.setRange(0, &all.unfinished, sizeof(int)); // Set the first task
1959 
1960  // Prepare the task stack
1961  TaskStack& tasks = *(TaskStack*)m_module->getGlobal("g_taskStack").getMutablePtr();
1962  tasks.header = (int*)m_taskData.getMutableCudaPtr();
1963  tasks.tasks = (Task*)m_taskData.getMutableCudaPtr(TASK_SIZE * sizeof(int));
1964  tasks.top = 0;
1965  tasks.bottom = 0;
1966  //memset(tasks.active, 0, sizeof(int)*(ACTIVE_MAX+1));
1967  memset(tasks.active, -1, sizeof(int)*(ACTIVE_MAX+1));
1968  tasks.active[0] = 0;
1969  //for(int i = 0; i < ACTIVE_MAX+1; i++)
1970  // tasks.active[i] = i;
1971  tasks.activeTop = 1;
1972  //tasks.empty[0] = 0;
1973  //int j = 1;
1974  //for(int i = EMPTY_MAX; i > 0; i--, j++)
1975  // tasks.empty[i] = j;
1976  memset(tasks.empty, 0, sizeof(int)*(EMPTY_MAX+1));
1977  tasks.emptyTop = 0;
1978  tasks.emptyBottom = 0;
1979  tasks.unfinished = -1; // We are waiting for one task to finish = task all
1980  tasks.sizePool = TASK_SIZE;
1981  tasks.sizeNodes = m_bvhData.getSize()/sizeof(CudaKdtreeNode);
1982  tasks.sizeTris = m_trisIndexOut.getSize()/sizeof(S32);
1983 
1984  // Determine block and grid sizes.
1985 #ifdef ONE_WARP_RUN
1986  Vec2i blockSize(WARP_SIZE, 1); // threadIdx.x must equal the thread lane in warp
1987  Vec2i gridSize(1, 1); // Number of SMs * Number of blocks?
1988  int numWarps = 1;
1989 #else
1990  int numWarpsPerBlock = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numWarpsPerBlock");
1991  int numBlocksPerSM = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numBlockPerSM");
1992  Vec2i blockSize(WARP_SIZE, numWarpsPerBlock); // threadIdx.x must equal the thread lane in warp
1993  int gridSizeX = NUM_SM*numBlocksPerSM;
1994  int numWarps = numWarpsPerBlock*gridSizeX;
1995  Vec2i gridSize(gridSizeX, 1); // Number of SMs * Number of blocks?
1996 
1997  if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
1998  printf("\aNUM_WARPS constant does not match the launch parameters\n");
1999 #endif
2000 
2001  m_debug.resizeDiscard(blockSize.y*gridSize.x*sizeof(float4));
2002  m_debug.clear();
2003  in.debug = m_debug.getMutableCudaPtr();
2004 
2005  // Launch.
2006  float tKernel = m_module->launchKernelTimed(kernel, blockSize, gridSize);
2007 
2008 #ifndef BENCHMARK
2009  cuCtxSynchronize(); // Flushes printfs
2010 #endif
2011 
2012 #ifdef DEBUG_PPS
2013  ptout = (S32*)m_ppsTris.getPtr();
2014  stout = (S32*)m_sortTris.getPtr();
2015 
2016  prout = (S32*)m_ppsRays.getPtr();
2017  srout = (S32*)m_sortRays.getPtr();
2018  S32 sum = 0;
2019  S32 error = 0;
2020  int j = 0;
2021  for(int i=0;i<m_numTris;i++)
2022  {
2023  sum += *stout; // Here for inclusive scan
2024  if(*ptout != sum)
2025  {
2026  cout << "PPS error for item " << i << ", CPU=" << sum << ", GPU=" << *ptout << " for " << m_numTris << " triangles!" << "\n";
2027  error = 1;
2028  if(j == 10)
2029  break;
2030  j++;
2031  }
2032  if(*stout < -1 || *stout > 1)
2033  {
2034  cout << "\nWTF " << i << " of " << m_numTris << ": " << *stout << "!\n" << "\n";
2035  break;
2036  }
2037  //sum += *stout; // Here for exclusive scan
2038  ptout++;
2039  stout++;
2040  }
2041 
2042  sum = 0;
2043  for(int i=0;i<m_numRays;i++)
2044  {
2045  sum += *srout; // Here for inclusive scan
2046  if(*prout != sum)
2047  {
2048  cout << "PPS error for item " << i << ", CPU=" << sum << ", GPU=" << *prout << " for " << m_numRays << " rays!" << "\n";
2049  error = 1;
2050  if(j == 10)
2051  break;
2052  j++;
2053  }
2054  if(*srout < -1 || *srout > 2)
2055  {
2056  cout << "\nWTF " << i << " of " << m_numRays << ": " << *srout << "!\n" << "\n";
2057  break;
2058  }
2059  //sum += *srout; // Here for exclusive scan
2060  prout++;
2061  srout++;
2062  }
2063 
2064  if(!error)
2065  cout << "PPS correct for " << m_numTris << " triangles and " << m_numRays << " rays!" << "\n";
2066  return 0;
2067 #endif
2068 
2069  // Set rays index buffer
2070  /*int* ind = (int*)m_raysIndex.getPtr();
2071  int count = 0;
2072  int mismatched = 0;
2073 
2074  // Validate if rays hit triangles
2075  printf("Verifying GPU trace\n");
2076  for(int i = 0; i < m_numRays; i++)
2077  {
2078  const RayResult& res = rb.getResultForSlot(i);
2079  //RayResult& res = rb.getMutableResultForSlot(i);
2080  //res.clear();
2081  Ray ray = rb.getRayForSlot(i);
2082  RayResult cpu;
2083  ray.tmax = 1e36;
2084 
2085  if(i % 10000 == 0)
2086  printf("rid: %d\n", i);
2087  traceCpuRay(ray, cpu, !rb.getNeedClosestHit());
2088  //traceCpuRay(ray, res, !rb.getNeedClosestHit());
2089 
2090  if(ind[i] != i)
2091  count++;
2092 
2093  if(res.id != cpu.id)
2094  {
2095  Debug << "Ray " << i << " CPU/GPU mismatch! Swapped: " << (ind[i] != i) << "\n"
2096  << "\tCPU: id=" << cpu.id << ", t=" << cpu.t << "\n"
2097  << "\tGPU: id=" << res.id << ", t=" << res.t << "\n";
2098  mismatched++;
2099  }
2100 
2101  //Debug << "Ray " << i << " CPU: id=" << cpu.id << ", t=" << cpu.t << "\n";
2102  }
2103  Debug << "Swaped " << count << "\n";
2104  Debug << "Mismatched " << mismatched << "\n";*/
2105 
2106  //Debug << "\nTraced in " << tKernel << "\n\n";
2107 
2108 #ifndef BENCHMARK
2109  tasks = *(TaskStack*)m_module->getGlobal("g_taskStack").getPtr();
2110  printPool(tasks, numWarps);
2111 
2112  /*for(int i = 0; i < m_numRays; i++)
2113  {
2114  const RayResult& res = rb.getResultForSlot(i);
2115  if(res.id == -2)
2116  {
2117  printf("Error on ray %d! Value: (%d, %f, %f, %f)\n", i, res.id, res.t, res.u, res.v);
2118  }
2119  //Debug << "Ray " << i << "! Value: (" << res.id << ", " << res.t << ", " << res.u << ", " << res.v << ")" << "\n";
2120  }*/
2121 
2122  /*CUcontext ctx;
2123  cuCtxPopCurrent(&ctx);
2124  cuCtxDestroy(ctx);
2125  exit(0);*/
2126 #endif
2127 
2128  return tKernel;
2129 }
2130 
2131 F32 CudaPersistentBVHTracer::buildCudaBVH()
2132 {
2133  CUfunction kernel;
2134  kernel = m_module->getKernel("build");
2135  if (!kernel)
2136  fail("Build kernel not found!");
2137 
2138 #ifdef MALLOC_SCRATCHPAD
2139  KernelInputBVH& in = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
2140  in.numTris = m_numTris;
2141  in.tris = m_trisCompact.getCudaPtr();
2142  in.trisIndex = m_trisIndex.getMutableCudaPtr();
2143 #ifdef COMPACT_LAYOUT
2144  in.trisOut = m_trisCompactOut.getMutableCudaPtr();
2145  in.trisIndexOut = m_trisIndexOut.getMutableCudaPtr();
2146 #endif
2147 #endif
2148 
2149  // Prepare the task data
2150  initPool();
2151 
2152 #ifndef MALLOC_SCRATCHPAD
2153  // Set input.
2154  KernelInputBVH& in = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
2155  in.numTris = m_numTris;
2156  in.tris = m_trisCompact.getCudaPtr();
2157  in.trisIndex = m_trisIndex.getMutableCudaPtr();
2158  //in.trisBox = m_trisBox.getCudaPtr();
2159  in.ppsTrisBuf = m_ppsTris.getMutableCudaPtr();
2160  in.ppsTrisIndex = m_ppsTrisIndex.getMutableCudaPtr();
2161  in.sortTris = m_sortTris.getMutableCudaPtr();
2162 #ifdef COMPACT_LAYOUT
2163  in.trisOut = m_trisCompactOut.getMutableCudaPtr();
2164  in.trisIndexOut = m_trisIndexOut.getMutableCudaPtr();
2165 #endif
2166 #else
2167  CUfunction kernelAlloc = m_module->getKernel("allocFreeableMemory", 2*sizeof(int));
2168  if (!kernelAlloc)
2169  fail("Memory allocation kernel not found!");
2170 
2171  int offset = 0;
2172  offset += m_module->setParami(kernelAlloc, offset, m_numTris);
2173  offset += m_module->setParami(kernelAlloc, offset, 0);
2174  F32 allocTime = m_module->launchKernelTimed(kernelAlloc, Vec2i(1,1), Vec2i(1, 1));
2175 
2176 #ifndef BENCHMARK
2177  printf("Memory allocated in %f\n", allocTime);
2178 #endif
2179 
2180  CUfunction kernelMemCpyIndex = m_module->getKernel("MemCpyIndex", sizeof(CUdeviceptr)+sizeof(int));
2181  if (!kernelMemCpyIndex)
2182  fail("Memory copy kernel not found!");
2183 
2184  int memSize = m_trisIndex.getSize()/sizeof(int);
2185  offset = 0;
2186  offset += m_module->setParamPtr(kernelMemCpyIndex, offset, m_trisIndex.getCudaPtr());
2187  offset += m_module->setParami(kernelMemCpyIndex, offset, memSize);
2188  F32 memcpyTime = m_module->launchKernelTimed(kernelMemCpyIndex, Vec2i(256,1), Vec2i((memSize-1+256)/256, 1));
2189 
2190 #ifndef BENCHMARK
2191  printf("Triangle indices copied in %f\n", memcpyTime);
2192 #endif
2193  in = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
2194 #endif
2195 
2196 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2197 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
2198  SplitRed split;
2199  for(int i = 0; i < 2; i++)
2200  {
2201  split.children[i].bbox.m_mn = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
2202  split.children[i].bbox.m_mx = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
2203  split.children[i].cnt = 0;
2204  }
2205 
2206  SplitArray sArray;
2207  for(int i = 0; i < NUM_WARPS; i++)
2208  {
2209  for(int j = 0; j < PLANE_COUNT; j++)
2210  sArray.splits[i][j] = split;
2211  }
2212 #else
2213  SplitRed split;
2214  for(int i = 0; i < 2; i++)
2215  {
2216  //split.children[i].bbox.m_mn = make_float3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
2217  //split.children[i].bbox.m_mx = make_float3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
2218  split.children[i].bbox.m_mn = make_int3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
2219  split.children[i].bbox.m_mx = make_int3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
2220  split.children[i].cnt = 0;
2221  }
2222 
2223  SplitArray sArray;
2224  for(int j = 0; j < PLANE_COUNT; j++)
2225  sArray.splits[j] = split;
2226 
2227  m_splitData.setRange(0, &sArray, sizeof(SplitArray)); // Set the first splits
2228 #endif
2229 
2230  m_splitData.setRange(TASK_SIZE * sizeof(SplitArray), &sArray, sizeof(SplitArray)); // Set the last splits for copy
2231 #endif
2232 
2233  CudaAABB bbox;
2234  memcpy(&bbox.m_mn, &m_bbox.min, sizeof(float3));
2235  memcpy(&bbox.m_mx, &m_bbox.max, sizeof(float3));
2236 
2237  // Set parent task containing all the work
2238  TaskBVH all;
2239  all.triStart = 0;
2240  all.triLeft = 0;
2241 #ifndef MALLOC_SCRATCHPAD
2242  all.triRight = m_numTris;
2243 #else
2244  all.triRight = 0;
2245 #endif
2246  all.triEnd = m_numTris;
2247  all.bbox = bbox;
2248  all.step = 0;
2249  all.lock = LockType_Free;
2250  all.bestCost = 1e38f;
2251  all.depth = 0;
2252  all.dynamicMemory= 0;
2253 #ifndef MALLOC_SCRATCHPAD
2254  all.triIdxCtr = 0;
2255 #endif
2256  all.parentIdx = -1;
2257  all.nodeIdx = 0;
2258  all.taskID = 0;
2259  Vector3 size = m_bbox.Diagonal();
2260  all.axis = size.MajorAxis();
2261  all.terminatedBy = TerminatedBy_None;
2262 #ifdef DEBUG_INFO
2263  all.sync = 0;
2264  all.parent = -1;
2265  all.clockStart = 0;
2266  all.clockEnd = 0;
2267 #endif
2268 
2269 #if SPLIT_TYPE == 0
2270 #if SCAN_TYPE == 0
2271  all.type = TaskType_Sort_PPS1;
2272 #elif SCAN_TYPE == 1
2273  all.type = TaskType_Sort_PPS1_Up;
2274 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
2275  all.type = TaskType_Sort_SORT1;
2276 #endif
2277  all.unfinished = warpSubtasks(m_numTris);
2278  float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
2279  if(all.axis == 0)
2280  all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
2281  else if(all.axis == 1)
2282  all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
2283  else
2284  all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
2285 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2286 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
2287  all.type = TaskType_InitMemory;
2288  all.unfinished = warpSubtasks(sizeof(SplitArray)/sizeof(int));
2289 #else
2290  all.type = TaskType_BinTriangles;
2291  all.unfinished = (warpSubtasks(m_numTris)+BIN_MULTIPLIER-1)/BIN_MULTIPLIER;
2292  /*all.type = TaskType_BuildObjectSAH;
2293  all.unfinished = 1;*/
2294 #endif
2295 #endif
2296  all.origSize = all.unfinished;
2297 
2298  m_taskData.setRange(TASK_SIZE * sizeof(int), &all, sizeof(TaskBVH)); // Set the first task
2299 
2300  // Set parent task header
2301  m_taskData.setRange(0, &all.unfinished, sizeof(int)); // Set the first task
2302 
2303  // Prepare the task stack
2304  TaskStackBVH& tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getMutablePtr();
2305  tasks.header = (int*)m_taskData.getMutableCudaPtr();
2306  tasks.tasks = (TaskBVH*)m_taskData.getMutableCudaPtr(TASK_SIZE * sizeof(int));
2307  tasks.nodeTop = 1;
2308  tasks.triTop = 0;
2309  tasks.top = 0;
2310  tasks.bottom = 0;
2311  //memset(tasks.active, 0, sizeof(int)*(ACTIVE_MAX+1));
2312  memset(tasks.active, -1, sizeof(int)*(ACTIVE_MAX+1));
2313  tasks.active[0] = 0;
2314  /*for(int i = 0; i < ACTIVE_MAX+1; i++)
2315  tasks.active[i] = i;*/
2316  tasks.activeTop = 1;
2317  //tasks.empty[0] = 0;
2318  //int j = 1;
2319  //for(int i = EMPTY_MAX; i > 0; i--, j++)
2320  // tasks.empty[i] = j;
2321  memset(tasks.empty, 0, sizeof(int)*(EMPTY_MAX+1));
2322  tasks.emptyTop = 0;
2323  tasks.emptyBottom = 0;
2324  tasks.unfinished = -1; // We are waiting for one task to finish = task all
2325  tasks.numSortedTris = 0;
2326  tasks.numNodes = 0;
2327  tasks.numLeaves = 0;
2328  tasks.numEmptyLeaves = 0;
2329  tasks.sizePool = TASK_SIZE;
2330  tasks.sizeNodes = m_bvhData.getSize()/sizeof(CudaKdtreeNode);
2331  tasks.sizeTris = m_trisIndexOut.getSize()/sizeof(S32);
2332  memset(tasks.leafHist, 0, sizeof(tasks.leafHist));
2333 
2334 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2335  // Prepare split stack
2336  SplitArray* &splits = *(SplitArray**)m_module->getGlobal("g_redSplits").getMutablePtr();
2337  splits = (SplitArray*)m_splitData.getMutableCudaPtr();
2338 #endif
2339 
2340  CudaBVHNode* &bvh = *(CudaBVHNode**)m_module->getGlobal("g_bvh").getMutablePtr();
2341  bvh = (CudaBVHNode*)m_bvhData.getMutableCudaPtr();
2342 
2343  // Determine block and grid sizes.
2344 #ifdef ONE_WARP_RUN
2345  Vec2i blockSize(WARP_SIZE, 1); // threadIdx.x must equal the thread lane in warp
2346  Vec2i gridSize(1, 1); // Number of SMs * Number of blocks?
2347  int numWarps = 1;
2348 #else
2349  int numWarpsPerBlock = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numWarpsPerBlock");
2350  int numBlocksPerSM = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numBlockPerSM");
2351  Vec2i blockSize(WARP_SIZE, numWarpsPerBlock); // threadIdx.x must equal the thread lane in warp
2352  int gridSizeX = NUM_SM*numBlocksPerSM;
2353  int numWarps = numWarpsPerBlock*gridSizeX;
2354  Vec2i gridSize(gridSizeX, 1); // Number of SMs * Number of blocks?
2355 
2356  if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
2357  printf("\aNUM_WARPS constant does not match the launch parameters\n");
2358 #endif
2359 
2360  m_debug.resizeDiscard(blockSize.y*gridSize.x*sizeof(float4));
2361  m_debug.clear();
2362  in.debug = m_debug.getMutableCudaPtr();
2363 
2364  // Launch.
2365  float tKernel = m_module->launchKernelTimed(kernel, blockSize, gridSize);
2366 
2367 /*#ifdef MALLOC_SCRATCHPAD
2368  CUfunction kernelDealloc = m_module->getKernel("deallocFreeableMemory", 0);
2369  if (!kernelDealloc)
2370  fail("Memory allocation kernel not found!");
2371 
2372  F32 deallocTime = m_module->launchKernelTimed(kernelDealloc, Vec2i(1,1), Vec2i(1, 1));
2373 
2374  printf("Memory freed in %f\n", deallocTime);
2375 #endif*/
2376 
2377 #ifndef BENCHMARK
2378  cuCtxSynchronize(); // Flushes printfs
2379 #endif
2380 
2381 #ifdef DEBUG_PPS
2382  pout = (S32*)m_ppsTris.getPtr();
2383  sout = (S32*)m_sortTris.getPtr();
2384  S32 sum = 0;
2385  S32 error = 0;
2386  int j = 0;
2387  for(int i=0;i<m_numTris;i++)
2388  {
2389  sum += *sout; // Here for inclusive scan
2390  if(*pout != sum)
2391  {
2392  cout << "PPS error for item " << i << ", CPU=" << sum << ", GPU=" << *pout << " for " << m_numTris << " triangles!" << "\n";
2393  error = 1;
2394  if(j == 10)
2395  break;
2396  j++;
2397  }
2398  if(*sout != 0 && *sout != 1)
2399  {
2400  cout << "\nWTF " << i << " of " << m_numTris << ": " << *sout << "!\n" << "\n";
2401  break;
2402  }
2403  //sum += *sout; // Here for exclusive scan
2404  pout++;
2405  sout++;
2406  }
2407 
2408  if(!error)
2409  cout << "PPS correct for " << m_numTris << " triangles!" << "\n";
2410  return 0;
2411 #endif
2412 
2413  tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getPtr();
2414  if(tasks.unfinished != 0 || tasks.top > tasks.sizePool || tasks.nodeTop > m_bvhData.getSize() / sizeof(CudaBVHNode) || tasks.triTop > m_trisIndexOut.getSize() / sizeof(S32)) // Something went fishy
2415  tKernel = 1e38f;
2416  //printf("%d (%d x %d) (%d x %d)\n", tasks.unfinished != 0, tasks.nodeTop, m_bvhData.getSize() / sizeof(CudaBVHNode), tasks.triTop, m_trisIndexOut.getSize() / sizeof(S32));
2417 
2418  //Debug << "\nBuild in " << tKernel << "\n\n";
2419 
2420 #ifndef BENCHMARK
2421  printPool(tasks, numWarps);
2422 
2423  /*Debug << "\n\nBVH" << "\n";
2424  CudaBVHNode* nodes = (CudaBVHNode*)m_bvhData.getPtr();
2425 
2426  for(int i = 0; i < tasks.nodeTop; i++)
2427  {
2428  Debug << "Node " << i << "\n";
2429  Debug << "BoxLeft: (" << nodes[i].c0xy.x << ", " << nodes[i].c0xy.z << ", " << nodes[i].c01z.x << ") - ("
2430  << nodes[i].c0xy.y << ", " << nodes[i].c0xy.w << ", " << nodes[i].c01z.y << ")\n";
2431  Debug << "BoxRight: (" << nodes[i].c1xy.x << ", " << nodes[i].c1xy.z << ", " << nodes[i].c01z.z << ") - ("
2432  << nodes[i].c1xy.y << ", " << nodes[i].c1xy.w << ", " << nodes[i].c01z.w << ")\n";
2433  Debug << "Children: " << nodes[i].children.x << ", " << nodes[i].children.y << "\n\n";
2434  }*/
2435 
2436  // Free data
2437  deinitPool();
2438 #endif
2439 
2440  return tKernel;
2441 }
2442 
2443 F32 CudaPersistentBVHTracer::buildCudaKdtree()
2444 {
2445  CUfunction kernel;
2446  kernel = m_module->getKernel("build");
2447  if (!kernel)
2448  fail("Build kernel not found!");
2449 
2450  KernelInputBVH& in = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
2451  in.numTris = m_numTris;
2452  in.tris = m_trisCompact.getCudaPtr();
2453  in.trisIndex = m_trisIndex.getMutableCudaPtr();
2454 
2455 #ifndef INTERLEAVED_LAYOUT
2456  in.trisOut = m_trisCompactOut.getMutableCudaPtr();
2457  in.trisIndexOut = m_trisIndexOut.getMutableCudaPtr();
2458 #endif
2459 
2460  // Prepare the task data
2461  initPool();
2462  // Set the maximum depth for the current triangle count
2463  RtEnvironment& cudaEnv = *(RtEnvironment*)m_module->getGlobal("c_env").getMutablePtr();
2464  float k1 = Environment::GetSingleton()->GetFloat("SubdivisionRayCaster.depthK1");
2465  float k2 = Environment::GetSingleton()->GetFloat("SubdivisionRayCaster.depthK2");
2466  float f1 = Environment::GetSingleton()->GetFloat("SubdivisionRayCaster.failK1");
2467  float f2 = Environment::GetSingleton()->GetFloat("SubdivisionRayCaster.failK2");
2468  cudaEnv.optMaxDepth = k1 * log2((F32)m_numTris) + k2;
2469  cudaEnv.failureCount = f1 * cudaEnv.optMaxDepth + f2;
2470 #ifndef BENCHMARK
2471  printf("Maximum depth = %d\n", cudaEnv.optMaxDepth);
2472  printf("Failure count = %d\n", cudaEnv.failureCount);
2473 #endif
2474 
2475  int baseOffset = setDynamicMemory();
2476  in = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
2477 
2478 #if SPLIT_TYPE == 3
2479  m_splitData.clearRange(0, 0, sizeof(SplitInfoTri)); // Set first split to zeros
2480 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2481 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
2482  SplitRed split;
2483  for(int i = 0; i < 2; i++)
2484  {
2485  split.children[i].bbox.m_mn = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
2486  split.children[i].bbox.m_mx = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
2487  split.children[i].cnt = 0;
2488  }
2489 
2490  SplitArray sArray;
2491  for(int i = 0; i < NUM_WARPS; i++)
2492  {
2493  for(int j = 0; j < PLANE_COUNT; j++)
2494  sArray.splits[i][j] = split;
2495  }
2496 #else
2497  //SplitRed split;
2498  //for(int i = 0; i < 2; i++)
2499  //{
2500  // //split.children[i].bbox.m_mn = make_float3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
2501  // //split.children[i].bbox.m_mx = make_float3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
2502  // split.children[i].bbox.m_mn = make_int3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
2503  // split.children[i].bbox.m_mx = make_int3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
2504  // split.children[i].cnt = 0;
2505  //}
2506 
2507  //SplitArray sArray;
2508  //for(int j = 0; j < PLANE_COUNT; j++)
2509  // sArray.splits[j] = split;
2510 
2511  //m_splitData.setRange(0, &sArray, sizeof(SplitArray)); // Set the first splits
2512  m_splitData.clearRange(0, 0, sizeof(SplitInfoTri)); // Set first split to zeros
2513 #endif
2514 
2515  //m_splitData.setRange(TASK_SIZE * sizeof(SplitArray), &sArray, sizeof(SplitArray)); // Set the last splits for copy
2516  // Prepare split stack
2517  //SplitArray* &splits = *(SplitArray**)m_module->getGlobal("g_redSplits").getMutablePtr();
2518  //splits = (SplitArray*)m_splitData.getMutableCudaPtr();
2519 
2520  SplitInfoTri* &splits = *(SplitInfoTri**)m_module->getGlobal("g_splitStack").getMutablePtr();
2521  splits = (SplitInfoTri*)m_splitData.getMutableCudaPtr();
2522 #endif
2523 
2524  CudaAABB bbox;
2525  memcpy(&bbox.m_mn, &m_bbox.min, sizeof(float3));
2526  memcpy(&bbox.m_mx, &m_bbox.max, sizeof(float3));
2527  /*bbox.m_mn.x -= m_epsilon;
2528  bbox.m_mn.y -= m_epsilon;
2529  bbox.m_mn.z -= m_epsilon;
2530  bbox.m_mx.x += m_epsilon;
2531  bbox.m_mx.y += m_epsilon;
2532  bbox.m_mx.z += m_epsilon;*/
2533 
2534  // Set parent task containing all the work
2535  TaskBVH all;
2536  all.triStart = 0;
2537  all.triLeft = 0;
2538  all.triRight = 0;
2539  all.triEnd = m_numTris;
2540  all.bbox = bbox;
2541  all.step = 0;
2542  all.lock = LockType_Free;
2543  all.bestCost = 1e38f;
2544  all.depth = 0;
2545  all.dynamicMemory= baseOffset;
2546 #ifdef MALLOC_SCRATCHPAD
2547  all.subFailureCounter = 0;
2548 #endif
2549  all.parentIdx = -1;
2550  all.nodeIdx = 0;
2551  all.taskID = 0;
2552  Vector3 size = m_bbox.Diagonal();
2553  all.axis = size.MajorAxis();
2554  all.terminatedBy = TerminatedBy_None;
2555 #ifdef DEBUG_INFO
2556  all.sync = 0;
2557  all.parent = -1;
2558  all.clockStart = 0;
2559  all.clockEnd = 0;
2560 #endif
2561 
2562 #if SPLIT_TYPE == 0
2563 #if SCAN_TYPE == 0
2564  all.type = TaskType_Sort_PPS1;
2565 #elif SCAN_TYPE == 1
2566  all.type = TaskType_Sort_PPS1_Up;
2567 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
2568  all.type = TaskType_Sort_SORT1;
2569 #endif
2570  all.unfinished = warpSubtasks(m_numTris);
2571  float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
2572  if(all.axis == 0)
2573  all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
2574  else if(all.axis == 1)
2575  all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
2576  else
2577  all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
2578 #elif SPLIT_TYPE == 1
2579  all.type = TaskType_Split;
2580 #if 0 // SQRT candidates
2581  int evaluatedCandidates = (int)sqrtf(m_numTris);
2582  int evaluatedCandidates = 1;
2583  int numPlanes = 0.5f * m_numTris/evaluatedCandidates;
2584 #elif 0 // Fixed candidates
2585  int numPlanes = 32768;
2586 #else // All candidates
2587  int numPlanes = m_numTris*6; // Number of warp sized subtasks
2588 #endif
2589  all.unfinished = warpSubtasks(numPlanes); // This must be the same as in the GPU code
2590 #elif SPLIT_TYPE == 2
2591  all.type = TaskType_Split;
2592  all.unfinished = 1;
2593 #elif SPLIT_TYPE == 3
2594  all.type = TaskType_SplitParallel;
2595  int evaluatedRays = warpSubtasks((int)sqrtf(m_numRays));
2596  int evaluatedTris = warpSubtasks((int)sqrtf(m_numTris));
2597  all.unfinished = PLANE_COUNT*(evaluatedRays+evaluatedTris); // Each WARP_SIZE rays and tris add their result to one plane
2598 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2599 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
2600  all.type = TaskType_InitMemory;
2601  all.unfinished = warpSubtasks(sizeof(SplitArray)/sizeof(int));
2602 #else
2603  all.type = TaskType_BinTriangles;
2604  all.unfinished = (warpSubtasks(m_numTris)+BIN_MULTIPLIER-1)/BIN_MULTIPLIER;
2605  /*all.type = TaskType_BuildObjectSAH;
2606  all.unfinished = 1;*/
2607 #endif
2608 #endif
2609  all.origSize = all.unfinished;
2610 
2611  m_taskData.setRange(TASK_SIZE * sizeof(int), &all, sizeof(TaskBVH)); // Set the first task
2612 
2613  // Set parent task header
2614  m_taskData.setRange(0, &all.unfinished, sizeof(int)); // Set the first task
2615 
2616  // Prepare the task stack
2617  TaskStackBVH& tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getMutablePtr();
2618  tasks.header = (int*)m_taskData.getMutableCudaPtr();
2619  tasks.tasks = (TaskBVH*)m_taskData.getMutableCudaPtr(TASK_SIZE * sizeof(int));
2620 #ifndef INTERLEAVED_LAYOUT
2621  tasks.nodeTop = 1;
2622 #else
2623  tasks.nodeTop = sizeof(CudaKdtreeNode);
2624 #endif
2625  tasks.triTop = 0;
2626  tasks.top = 0;
2627  tasks.bottom = 0;
2628  //memset(tasks.active, 0, sizeof(int)*(ACTIVE_MAX+1));
2629  memset(tasks.active, -1, sizeof(int)*(ACTIVE_MAX+1));
2630  tasks.active[0] = 0;
2631  /*for(int i = 0; i < ACTIVE_MAX+1; i++)
2632  tasks.active[i] = i;*/
2633  tasks.activeTop = 1;
2634  //tasks.empty[0] = 0;
2635  //int j = 1;
2636  //for(int i = EMPTY_MAX; i > 0; i--, j++)
2637  // tasks.empty[i] = j;
2638  memset(tasks.empty, 0, sizeof(int)*(EMPTY_MAX+1));
2639  tasks.emptyTop = 0;
2640  tasks.emptyBottom = 0;
2641  tasks.unfinished = -1; // We are waiting for one task to finish = task all
2642  tasks.numSortedTris = 0;
2643  tasks.numNodes = 0;
2644  tasks.numEmptyLeaves = 0;
2645  tasks.numLeaves = 0;
2646  tasks.sizePool = TASK_SIZE;
2647  tasks.sizeNodes = m_bvhData.getSize()/sizeof(CudaKdtreeNode);
2648  tasks.sizeTris = m_trisIndexOut.getSize()/sizeof(S32);
2649  memset(tasks.leafHist, 0, sizeof(tasks.leafHist));
2650 
2651  CudaKdtreeNode* &kdtree = *(CudaKdtreeNode**)m_module->getGlobal("g_kdtree").getMutablePtr();
2652  kdtree = (CudaKdtreeNode*)m_bvhData.getMutableCudaPtr();
2653 
2654  // Determine block and grid sizes.
2655 #ifdef ONE_WARP_RUN
2656  Vec2i blockSize(WARP_SIZE, 1); // threadIdx.x must equal the thread lane in warp
2657  Vec2i gridSize(1, 1); // Number of SMs * Number of blocks?
2658  int numWarps = 1;
2659 #else
2660  int numWarpsPerBlock = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numWarpsPerBlock");
2661  int numBlocksPerSM = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numBlockPerSM");
2662  Vec2i blockSize(WARP_SIZE, numWarpsPerBlock); // threadIdx.x must equal the thread lane in warp
2663  int gridSizeX = NUM_SM*numBlocksPerSM;
2664  int numWarps = numWarpsPerBlock*gridSizeX;
2665  Vec2i gridSize(gridSizeX, 1); // Number of SMs * Number of blocks?
2666 
2667  if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
2668  printf("\aNUM_WARPS constant does not match the launch parameters\n");
2669 #endif
2670 
2671  m_debug.resizeDiscard(blockSize.y*gridSize.x*sizeof(float4));
2672  m_debug.clear();
2673  in.debug = m_debug.getMutableCudaPtr();
2674 
2675  // Launch.
2676  float tKernel = 0.f;
2677 #ifndef DUPLICATE_REFERENCES
2678  tKernel += convertWoop();
2679 #endif
2680  tKernel += m_module->launchKernelTimed(kernel, blockSize, gridSize);
2681 
2682 /*#ifdef MALLOC_SCRATCHPAD
2683  CUfunction kernelDealloc = m_module->getKernel("deallocFreeableMemory", 0);
2684  if (!kernelDealloc)
2685  fail("Memory allocation kernel not found!");
2686 
2687  F32 deallocTime = m_module->launchKernelTimed(kernelDealloc, Vec2i(1,1), Vec2i(1, 1));
2688 
2689  printf("Memory freed in %f\n", deallocTime);
2690 #endif*/
2691 
2692 #ifndef BENCHMARK
2693  cuCtxSynchronize(); // Flushes printfs
2694 #endif
2695 
2696 #ifdef DEBUG_PPS
2697  pout = (S32*)m_ppsTris.getPtr();
2698  sout = (S32*)m_sortTris.getPtr();
2699  S32 sum = 0;
2700  S32 error = 0;
2701  int j = 0;
2702  for(int i=0;i<m_numTris;i++)
2703  {
2704  sum += *sout; // Here for inclusive scan
2705  if(*pout != sum)
2706  {
2707  cout << "PPS error for item " << i << ", CPU=" << sum << ", GPU=" << *pout << " for " << m_numTris << " triangles!" << "\n";
2708  error = 1;
2709  if(j == 10)
2710  break;
2711  j++;
2712  }
2713  if(*sout != 0 && *sout != 1)
2714  {
2715  cout << "\nWTF " << i << " of " << m_numTris << ": " << *sout << "!\n" << "\n";
2716  break;
2717  }
2718  //sum += *sout; // Here for exclusive scan
2719  pout++;
2720  sout++;
2721  }
2722 
2723  if(!error)
2724  cout << "PPS correct for " << m_numTris << " triangles!" << "\n";
2725  return 0;
2726 #endif
2727 
2728  tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getPtr();
2729 #ifndef INTERLEAVED_LAYOUT
2730  if(tasks.unfinished != 0 || tasks.top > tasks.sizePool || tasks.nodeTop > m_bvhData.getSize() / sizeof(CudaKdtreeNode) || tasks.triTop > m_trisIndexOut.getSize() / sizeof(S32)) // Something went fishy
2731 #else
2732  if(tasks.unfinished != 0 || tasks.nodeTop > m_bvhData.getSize()) // Something went fishy
2733 #endif
2734  tKernel = 1e38f;
2735  //printf("%d (%d x %d) (%d x %d)\n", tasks.unfinished != 0, tasks.nodeTop, m_bvhData.getSize() / sizeof(CudaKdtreeNode), tasks.triTop, m_trisIndexOut.getSize() / sizeof(S32));
2736 
2737  //Debug << "\nBuild in " << tKernel << "\n\n";
2738 
2739 #ifndef BENCHMARK
2740  printPool(tasks, numWarps);
2741 
2742  /*Debug << "\n\nKdtree" << "\n";
2743  CudaBVHNode* nodes = (CudaKdtreeNode*)m_bvhData.getPtr();
2744 
2745  for(int i = 0; i < tasks.nodeTop; i++)
2746  {
2747  Debug << "Node " << i << "\n";
2748  Debug << "BoxLeft: (" << nodes[i].c0xy.x << ", " << nodes[i].c0xy.z << ", " << nodes[i].c01z.x << ") - ("
2749  << nodes[i].c0xy.y << ", " << nodes[i].c0xy.w << ", " << nodes[i].c01z.y << ")\n";
2750  Debug << "BoxRight: (" << nodes[i].c1xy.x << ", " << nodes[i].c1xy.z << ", " << nodes[i].c01z.z << ") - ("
2751  << nodes[i].c1xy.y << ", " << nodes[i].c1xy.w << ", " << nodes[i].c01z.w << ")\n";
2752  Debug << "Children: " << nodes[i].children.x << ", " << nodes[i].children.y << "\n\n";
2753  }*/
2754 
2755  // Free data
2756  deinitPool();
2757 #endif
2758 
2759  return tKernel;
2760 }
2761 
2762 F32 CudaPersistentBVHTracer::testSort(S32 arraySize)
2763 {
2764  m_compiler.setSourceFile("src/rt/kernels/persistent_test.cu");
2765  m_module = m_compiler.compile();
2766  failIfError();
2767 
2768  CUfunction kernel;
2769  //kernel = m_module->getKernel("sort");
2770  //kernel = m_module->getKernel("testMemoryCamping");
2771  kernel = m_module->getKernel("testKeplerSort");
2772  if (!kernel)
2773  fail("Sort kernel not found!");
2774 
2775  // Prepare the task data
2776  initPool();
2777 
2778  // Set ppsTrisIndex
2779  /*S32* tid = (S32*)m_ppsTrisIndex.getMutablePtr();
2780  for(int i=0; i<arraySize/2; i++)
2781  {
2782  *tid = 0;
2783  tid++;
2784  }
2785  for(int i=arraySize/2; i<arraySize; i++)
2786  {
2787  *tid = 1;
2788  tid++;
2789  }*/
2790 
2791  m_trisIndex.resizeDiscard(sizeof(int)*arraySize);
2792  S32* tiout = (S32*)m_trisIndex.getMutablePtr();
2793  for(int i=0; i < arraySize; i++)
2794  {
2795  // indices
2796  *tiout = (arraySize-1) - i;
2797  tiout++;
2798  }
2799 
2800  // Set input.
2801  KernelInputBVH& in = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
2802  in.numTris = arraySize;
2803  in.trisIndex = m_trisIndex.getMutableCudaPtr();
2804  in.ppsTrisBuf = m_ppsTris.getMutableCudaPtr();
2805  in.ppsTrisIndex = m_ppsTrisIndex.getMutableCudaPtr();
2806  in.sortTris = m_sortTris.getMutableCudaPtr();
2807 
2808  // Set parent task containing all the work
2809  TaskBVH all;
2810  all.triStart = 0;
2811  all.triEnd = arraySize;
2812  //all.bbox = bbox;
2813  all.step = 0;
2814  all.lock = 0;
2815  all.bestCost = 1e38f;
2816  all.depth = 0;
2817  all.parentIdx = -1;
2818  all.nodeIdx = 0;
2819  all.taskID = 0;
2820  all.pivot = arraySize / 2;
2821 #ifdef DEBUG_INFO
2822  all.sync = 0;
2823  all.parent = -1;
2824  all.clockStart = 0;
2825  all.clockEnd = 0;
2826 #endif
2827 
2828  all.type = TaskType_Sort_PPS1;
2829  all.unfinished = warpSubtasks(arraySize);
2830  all.origSize = all.unfinished;
2831 
2832  m_taskData.setRange(TASK_SIZE * sizeof(int), &all, sizeof(TaskBVH)); // Set the first task
2833 
2834  // Set parent task header
2835  m_taskData.setRange(0, &all.unfinished, sizeof(int)); // Set the first task
2836 
2837  // Prepare the task stack
2838  TaskStackBVH& tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getMutablePtr();
2839  tasks.header = (int*)m_taskData.getMutableCudaPtr();
2840  tasks.tasks = (TaskBVH*)m_taskData.getMutableCudaPtr(TASK_SIZE * sizeof(int));
2841  tasks.nodeTop = 1;
2842  tasks.top = 0;
2843  tasks.bottom = 0;
2844  memset(tasks.active, 0, sizeof(int)*(ACTIVE_MAX+1));
2845  tasks.activeTop = 1;
2846  //tasks.empty[0] = 0;
2847  //int j = 1;
2848  //for(int i = EMPTY_MAX; i > 0; i--, j++)
2849  // tasks.empty[i] = j;
2850  tasks.emptyTop = 0;
2851  tasks.emptyBottom = 0;
2852  tasks.unfinished = -1; // We are waiting for one task to finish = task all
2853  tasks.sizePool = TASK_SIZE;
2854  tasks.sizeNodes = m_bvhData.getSize()/sizeof(CudaKdtreeNode);
2855  tasks.sizeTris = m_trisIndexOut.getSize()/sizeof(S32);
2856 
2857  // Determine block and grid sizes.
2858 #ifdef ONE_WARP_RUN
2859  Vec2i blockSize(WARP_SIZE, 1); // threadIdx.x must equal the thread lane in warp
2860  Vec2i gridSize(1, 1); // Number of SMs * Number of blocks?
2861 #else
2862  int numWarpsPerBlock = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numWarpsPerBlock");
2863  int numBlocksPerSM = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numBlockPerSM");
2864  Vec2i blockSize(WARP_SIZE, numWarpsPerBlock); // threadIdx.x must equal the thread lane in warp
2865  int gridSizeX = NUM_SM*numBlocksPerSM;
2866  Vec2i gridSize(gridSizeX, 1); // Number of SMs * Number of blocks?
2867 
2868  if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
2869  printf("\aNUM_WARPS constant does not match the launch parameters\n");
2870 #endif
2871 
2872  m_debug.resizeDiscard(blockSize.y*gridSize.x*sizeof(float4));
2873  m_debug.clear();
2874  in.debug = m_debug.getMutableCudaPtr();
2875 
2876  // Launch.
2877  float tKernel = m_module->launchKernelTimed(kernel, blockSize, gridSize, false, 0, false);
2878 
2879 #ifndef BENCHMARK
2880  cuCtxSynchronize(); // Flushes printfs
2881 #endif
2882 
2883  // Verify sort
2884  S32* tsort = (S32*)m_trisIndex.getPtr();
2885  for(int i=0; i < arraySize; i++)
2886  {
2887  if(*tsort != i)
2888  {
2889  printf("Sort error %d instead of %d\n", *tsort, i);
2890  break;
2891  }
2892  tsort++;
2893  }
2894 
2895  Debug << "\nSort in " << tKernel << "\n\n";
2896 
2897  tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getPtr();
2898  int* header = (int*)m_taskData.getPtr();
2899  printPoolHeader(&tasks, header, blockSize.y*gridSize.x, sprintf(""));
2900 
2901  Debug << "\n\nTasks" << "\n";
2902  TaskBVH* task = (TaskBVH*)m_taskData.getPtr(TASK_SIZE*sizeof(int));
2903  int stackMax = 0;
2904  int maxDepth = 0;
2905  int syncCount = 0;
2906  int maxTaskId = -1;
2907  long double sumTris = 0;
2908  long double maxTris = 0;
2909 
2910  int sortTasks = 0;
2911  long double cntSortTris = 0;
2912 
2913  int subFailed = 0;
2914 
2915  for(int i = 0; i < TASK_SIZE; i++)
2916  {
2917  if(task[i].nodeIdx != TaskHeader_Empty || task[i].parentIdx != TaskHeader_Empty)
2918  {
2919  Debug << "Task " << i << "\n";
2920  Debug << "Header: " << header[i] << "\n";
2921  Debug << "Unfinished: " << task[i].unfinished << "\n";
2922  Debug << "Type: " << task[i].type << "\n";
2923  Debug << "TriStart: " << task[i].triStart << "\n";
2924  Debug << "TriEnd: " << task[i].triEnd << "\n";
2925  Debug << "TriRight: " << task[i].triRight << "\n";
2926  Debug << "ParentIdx: " << task[i].parentIdx << "\n";
2927  Debug << "NodeIdx: " << task[i].nodeIdx << "\n";
2928  Debug << "TaskID: " << task[i].taskID << "\n";
2929  Debug << "Depth: " << task[i].depth << "\n";
2930 #ifdef DEBUG_INFO
2931  //Debug << "Step: " << task[i].step << "\n";
2932  //Debug << "Lock: " << task[i].lock << "\n";
2933  //Debug << "SubFailure: " << task[i].subFailureCounter << "\n";
2934  Debug << "GMEMSync: " << task[i].sync << "\n";
2935  Debug << "Parent: " << task[i].parent << "\n";
2936 #endif
2937  Debug << "Triangles: " << task[i].triEnd - task[i].triStart << "\n";
2938  Debug << "Pivot: " << task[i].pivot << "\n";
2939 
2940  Debug << "\n";
2941  stackMax = i;
2942 
2943 #ifdef CUTOFF_DEPTH
2944  if(task[i].depth == m_cutOffDepth)
2945  {
2946 #endif
2947  long double tris = task[i].triEnd - task[i].triStart;
2948  if(tris > maxTris)
2949  {
2950  maxTris = tris;
2951  maxTaskId = i;
2952  }
2953  sumTris += tris;
2954  sortTasks++;
2955  cntSortTris += tris;
2956 #ifdef CUTOFF_DEPTH
2957  }
2958 #endif
2959 
2960 #ifdef DEBUG_INFO
2961  maxDepth = max(task[i].depth, maxDepth);
2962  syncCount += task[i].sync;
2963 #endif
2964  }
2965  }
2966 
2967  if(stackMax == TASK_SIZE-1)
2968  printf("\aIncomplete result!\n");
2969 #ifdef CUTOFF_DEPTH
2970  Debug << "\n\nStatistics for cutoff depth " << m_cutOffDepth << "\n\n";
2971 #else
2972  Debug << "\n\n";
2973 #endif
2974 
2975 #ifdef DEBUG_INFO
2976  Debug << "Avg naive task height (tris) = " << sumTris/(long double)sortTasks << "\n";
2977  Debug << "Max naive task height (tris) = " << maxTris << ", taskId: " << maxTaskId << "\n";
2978  Debug << "Cnt sorted operations = " << sortTasks << "\n";
2979  double cntTrisLog2Tris = (double(arraySize) * (double)(logf(arraySize)/logf(2.0f)));
2980  Debug << "Cnt sorted triangles = " << cntSortTris << "\n";
2981  Debug << "Cnt sorted triangles/(N log N), N=#tris = " << cntSortTris/cntTrisLog2Tris << "\n";
2982  Debug << "\n";
2983  Debug << "Max task depth = " << maxDepth << "\n";
2984  Debug << "Cnt gmem synchronizations: " << syncCount << "\n";
2985  Debug << "Leafs failed to subdivide = " << subFailed << " (*3) => total useless tasks " << subFailed * 3 << "\n";
2986 #endif
2987 
2988  Debug << "max_queue_length = " << stackMax << "\n\n" << "\n";
2989 
2990  return tKernel;
2991 }
2992 
2993 F32 CudaPersistentBVHTracer::traceOnDemandBVHRayBuffer(RayBuffer& rays, bool rebuild)
2994 {
2995  CUfunction kernel;
2996  kernel = m_module->getKernel("build");
2997  if (!kernel)
2998  fail("Build kernel not found!");
2999 
3000  // Prepare the task data
3001  if(rebuild)
3002  {
3003  initPool(0, &rays.getRayBuffer(), &m_bvhData);
3004  }
3005 
3006  RtEnvironment& cudaEnv = *(RtEnvironment*)m_module->getGlobal("c_env").getMutablePtr();
3007  cudaEnv.subdivThreshold = (m_bbox.SurfaceArea() / (float)m_numRays) * ((float)cudaEnv.optCt/10.0f);
3008 
3009  // Set BVH input.
3010  KernelInputBVH& inBVH = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
3011  inBVH.numTris = m_numTris;
3012  inBVH.tris = m_trisCompact.getCudaPtr();
3013  inBVH.trisIndex = m_trisIndex.getMutableCudaPtr();
3014  //inBVH.trisBox = m_trisBox.getCudaPtr();
3015  inBVH.ppsTrisBuf = m_ppsTris.getMutableCudaPtr();
3016  inBVH.ppsTrisIndex = m_ppsTrisIndex.getMutableCudaPtr();
3017  inBVH.sortTris = m_sortTris.getMutableCudaPtr();
3018 #ifdef COMPACT_LAYOUT
3019  inBVH.trisOut = m_trisCompactOut.getMutableCudaPtr();
3020  inBVH.trisIndexOut = m_trisIndexOut.getMutableCudaPtr();
3021 #endif
3022 
3023  // Set traversal input
3024  CUdeviceptr nodePtr = m_bvhData.getCudaPtr();
3025  CUdeviceptr triPtr = m_trisCompact.getCudaPtr();
3026  Buffer& indexBuf = m_trisIndex;
3027  Vec2i nodeOfsA = Vec2i(0, (S32)m_bvhData.getSize());
3028  Vec2i triOfsA = Vec2i(0, (S32)m_trisCompact.getSize());
3029 
3030  // Stop the timer for this copy as it is stopped in other algorithms as well
3031  m_timer.end();
3032  KernelInput& in = *(KernelInput*)m_module->getGlobal("c_in").getMutablePtr();
3033  m_timer.start();
3034  in.numRays = rays.getSize();
3035  in.anyHit = (rays.getNeedClosestHit() == false);
3036  in.nodesA = nodePtr + nodeOfsA.x;
3037  in.trisA = triPtr + triOfsA.x;
3038  in.rays = rays.getRayBuffer().getCudaPtr();
3039  in.results = rays.getResultBuffer().getMutableCudaPtr();
3040  in.triIndices = indexBuf.getCudaPtr();
3041 
3042  if(rebuild)
3043  {
3044 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3045 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
3046  SplitRed split;
3047  for(int i = 0; i < 2; i++)
3048  {
3049  split.children[i].bbox.m_mn = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
3050  split.children[i].bbox.m_mx = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
3051  split.children[i].cnt = 0;
3052  }
3053 
3054  SplitArray sArray;
3055  for(int i = 0; i < NUM_WARPS; i++)
3056  {
3057  for(int j = 0; j < PLANE_COUNT; j++)
3058  sArray.splits[i][j] = split;
3059  }
3060 #else
3061  SplitRed split;
3062  for(int i = 0; i < 2; i++)
3063  {
3064  //split.children[i].bbox.m_mn = make_float3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
3065  //split.children[i].bbox.m_mx = make_float3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
3066  split.children[i].bbox.m_mn = make_int3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
3067  split.children[i].bbox.m_mx = make_int3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
3068  split.children[i].cnt = 0;
3069  }
3070 
3071  SplitArray sArray;
3072  for(int j = 0; j < PLANE_COUNT; j++)
3073  sArray.splits[j] = split;
3074 
3075  m_splitData.setRange(0, &sArray, sizeof(SplitArray)); // Set the first splits
3076 #endif
3077 
3078  m_splitData.setRange(TASK_SIZE * sizeof(SplitArray), &sArray, sizeof(SplitArray)); // Set the last splits for copy
3079 #endif
3080 
3081  m_bvhData.clearRange32(0, UNBUILD_FLAG, sizeof(CudaBVHNode)); // Set the root as unbuild
3082 
3083  CudaAABB bbox;
3084  memcpy(&bbox.m_mn, &m_bbox.min, sizeof(float3));
3085  memcpy(&bbox.m_mx, &m_bbox.max, sizeof(float3));
3086 
3087  // Set parent task containing all the work
3088  TaskBVH all;
3089  all.triStart = 0;
3090  all.triLeft = 0;
3091  all.triRight = m_numTris;
3092  all.triEnd = m_numTris;
3093  all.bbox = bbox;
3094  all.step = 0;
3095  all.lock = LockType_Free;
3096  all.bestCost = 1e38f;
3097  all.depth = 0;
3098 #ifndef MALLOC_SCRATCHPAD
3099  all.triIdxCtr = 0;
3100 #endif
3101  all.parentIdx = -1;
3102  all.nodeIdx = 0;
3103  all.taskID = 0;
3104  Vector3 size = m_bbox.Diagonal();
3105  all.axis = size.MajorAxis();
3106  all.pivot = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.subtreeLimit");
3107  all.terminatedBy = TerminatedBy_None;
3108 #ifdef DEBUG_INFO
3109  all.sync = 0;
3110  all.parent = -1;
3111  all.clockStart = 0;
3112  all.clockEnd = 0;
3113 #endif
3114  all.cached = LockType_None; // Mark this task as cached
3115 
3116 #if SPLIT_TYPE == 0
3117 #if SCAN_TYPE == 0
3118  all.type = TaskType_Sort_PPS1;
3119 #elif SCAN_TYPE == 1
3120  all.type = TaskType_Sort_PPS1_Up;
3121 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
3122  all.type = TaskType_Sort_SORT1;
3123 #endif
3124  all.unfinished = warpSubtasks(m_numTris);
3125  float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
3126  if(all.axis == 0)
3127  all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
3128  else if(all.axis == 1)
3129  all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
3130  else
3131  all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
3132 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3133 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
3134  all.type = TaskType_InitMemory;
3135  all.unfinished = warpSubtasks(sizeof(SplitArray)/sizeof(int));
3136 #else
3137  all.type = TaskType_BinTriangles;
3138  all.unfinished = (warpSubtasks(m_numTris)+BIN_MULTIPLIER-1)/BIN_MULTIPLIER;
3139 #endif
3140 #endif
3141  all.origSize = all.unfinished;
3142 
3143  m_taskData.setRange(TASK_SIZE * sizeof(int), &all, sizeof(TaskBVH)); // Set the first task
3144 
3145  // Set parent task header
3146  m_taskData.setRange(0, &all.unfinished, sizeof(int)); // Set the first task
3147  }
3148 
3149  // Prepare the task stack
3150  TaskStackBVH& tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getMutablePtr();
3151 
3152  tasks.header = (int*)m_taskData.getMutableCudaPtr();
3153  tasks.tasks = (TaskBVH*)m_taskData.getMutableCudaPtr(TASK_SIZE * sizeof(int));
3154  tasks.launchFlag = 0;
3155 
3156  if(rebuild)
3157  {
3158  tasks.nodeTop = 1;
3159  tasks.triTop = 0;
3160  tasks.top = 0;
3161  tasks.bottom = 0;
3162  //memset(tasks.active, 0, sizeof(int)*(ACTIVE_MAX+1));
3163  memset(tasks.active, -1, sizeof(int)*(ACTIVE_MAX+1));
3164  tasks.active[0] = 0;
3165  /*for(int i = 0; i < ACTIVE_MAX+1; i++)
3166  tasks.active[i] = i;*/
3167  tasks.activeTop = 1;
3168  //tasks.empty[0] = 0;
3169  //int j = 1;
3170  //for(int i = EMPTY_MAX; i > 0; i--, j++)
3171  // tasks.empty[i] = j;
3172  memset(tasks.empty, 0, sizeof(int)*(EMPTY_MAX+1));
3173  tasks.emptyTop = 0;
3174  tasks.emptyBottom = 0;
3175  tasks.numSortedTris = 0;
3176  tasks.numNodes = 0;
3177  tasks.numLeaves = 0;
3178  tasks.numEmptyLeaves = 0;
3179  tasks.sizePool = TASK_SIZE;
3180  tasks.sizeNodes = m_bvhData.getSize()/sizeof(CudaKdtreeNode);
3181  tasks.sizeTris = m_trisIndexOut.getSize()/sizeof(S32);
3182  memset(tasks.leafHist, 0, sizeof(tasks.leafHist));
3183  }
3184  /*else
3185  {
3186  tasks.emptyTop = 3;
3187  }*/
3188 
3189  tasks.warpCounter = rays.getSize();
3190  tasks.unfinished = -NUM_WARPS; // We are waiting for one task to finish = task all
3191 
3192 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3193  // Prepare split stack
3194  SplitArray* &splits = *(SplitArray**)m_module->getGlobal("g_redSplits").getMutablePtr();
3195  splits = (SplitArray*)m_splitData.getMutableCudaPtr();
3196 #endif
3197 
3198  CudaBVHNode* &bvh = *(CudaBVHNode**)m_module->getGlobal("g_bvh").getMutablePtr();
3199  bvh = (CudaBVHNode*)m_bvhData.getMutableCudaPtr();
3200 
3201  // Determine block and grid sizes.
3202 #ifdef ONE_WARP_RUN
3203  Vec2i blockSize(WARP_SIZE, 1); // threadIdx.x must equal the thread lane in warp
3204  Vec2i gridSize(1, 1); // Number of SMs * Number of blocks?
3205  int numWarps = 1;
3206 #else
3207  int numWarpsPerBlock = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numWarpsPerBlock");
3208  int numBlocksPerSM = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numBlockPerSM");
3209  Vec2i blockSize(WARP_SIZE, numWarpsPerBlock); // threadIdx.x must equal the thread lane in warp
3210  int gridSizeX = NUM_SM*numBlocksPerSM;
3211  int numWarps = numWarpsPerBlock*gridSizeX;
3212  Vec2i gridSize(gridSizeX, 1); // Number of SMs * Number of blocks?
3213 
3214  if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
3215  printf("\aNUM_WARPS constant does not match the launch parameters\n");
3216 #endif
3217 
3218  m_debug.resizeDiscard(blockSize.y*gridSize.x*sizeof(float4));
3219  m_debug.clear();
3220  inBVH.debug = m_debug.getMutableCudaPtr();
3221 
3222  // Launch.
3223  //cuFuncSetSharedSize(kernel, 0); // Set shared memory to force some launch configurations
3224  float tKernel = m_module->launchKernelTimed(kernel, blockSize, gridSize);
3225 
3226 #ifndef BENCHMARK
3227  cuCtxSynchronize(); // Flushes printfs
3228 #endif
3229 
3230  tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getPtr();
3231  if(tasks.unfinished != 0 || tasks.top > tasks.sizePool || tasks.nodeTop > m_bvhData.getSize() / sizeof(CudaBVHNode) || tasks.triTop > m_trisIndexOut.getSize() / sizeof(S32)) // Something went fishy
3232  tKernel = 1e38f;
3233 
3234  //Debug << "\nBuild in " << tKernel << "\n\n";
3235 
3236 #ifndef BENCHMARK
3237  printPool(tasks, numWarps);
3238 
3239  /*Debug << "\n\nBVH" << "\n";
3240  CudaBVHNode* nodes = (CudaBVHNode*)m_bvhData.getPtr();
3241 
3242  for(int i = 0; i < tasks.nodeTop; i++)
3243  {
3244  Debug << "Node " << i << "\n";
3245  Debug << "BoxLeft: (" << nodes[i].c0xy.x << ", " << nodes[i].c0xy.z << ", " << nodes[i].c01z.x << ") - ("
3246  << nodes[i].c0xy.y << ", " << nodes[i].c0xy.w << ", " << nodes[i].c01z.y << ")\n";
3247  Debug << "BoxRight: (" << nodes[i].c1xy.x << ", " << nodes[i].c1xy.z << ", " << nodes[i].c01z.z << ") - ("
3248  << nodes[i].c1xy.y << ", " << nodes[i].c1xy.w << ", " << nodes[i].c01z.w << ")\n";
3249  Debug << "Children: " << nodes[i].children.x << ", " << nodes[i].children.y << "\n\n";
3250  }*/
3251 #endif
3252 
3253  return tKernel;
3254 }
3255 
3256 F32 CudaPersistentBVHTracer::traceOnDemandKdtreeRayBuffer(RayBuffer& rays, bool rebuild)
3257 {
3258  CUfunction kernel;
3259  kernel = m_module->getKernel("build");
3260  if (!kernel)
3261  fail("Build kernel not found!");
3262 
3263  // Prepare the task data
3264  if(rebuild)
3265  {
3266  initPool(0, &rays.getRayBuffer(), &m_bvhData);
3267  }
3268 
3269  RtEnvironment& cudaEnv = *(RtEnvironment*)m_module->getGlobal("c_env").getMutablePtr();
3270  cudaEnv.subdivThreshold = (m_bbox.SurfaceArea() / (float)m_numRays) * ((float)cudaEnv.optCt/10.0f);
3271  float k1 = Environment::GetSingleton()->GetFloat("SubdivisionRayCaster.depthK1");
3272  float k2 = Environment::GetSingleton()->GetFloat("SubdivisionRayCaster.depthK2");
3273  cudaEnv.optMaxDepth = k1 * log2((F32)m_numTris) + k2;
3274  //cudaEnv.failureCount = 0.2f*cudaEnv.optMaxDepth + 1.0f;
3275 #ifndef BENCHMARK
3276  if(rebuild)
3277  {
3278  printf("Maximum depth = %d\n", cudaEnv.optMaxDepth);
3279  printf("Failure count = %d\n", cudaEnv.failureCount);
3280  }
3281 #endif
3282 
3283  // Set BVH input.
3284  KernelInputBVH& inBVH = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
3285  inBVH.numTris = m_numTris;
3286  inBVH.tris = m_trisCompact.getCudaPtr();
3287  inBVH.trisIndex = m_trisIndex.getMutableCudaPtr();
3288 #ifndef INTERLEAVED_LAYOUT
3289  inBVH.trisOut = m_trisCompactOut.getMutableCudaPtr();
3290  inBVH.trisIndexOut = m_trisIndexOut.getMutableCudaPtr();
3291 #endif
3292 
3293  // Set traversal input
3294  CUdeviceptr nodePtr = m_bvhData.getCudaPtr();
3295  Vec2i nodeOfsA = Vec2i(0, (S32)m_bvhData.getSize());
3296 #ifndef INTERLEAVED_LAYOUT
3297  CUdeviceptr triPtr = m_trisCompactOut.getCudaPtr();
3298  Vec2i triOfsA = Vec2i(0, (S32)m_trisCompactOut.getSize());
3299  Buffer& indexBuf = m_trisIndexOut;
3300 #else
3301  CUdeviceptr triPtr = m_bvhData.getCudaPtr();
3302  Vec2i triOfsA = Vec2i(0, (S32)m_bvhData.getSize());
3303  Buffer& indexBuf = m_bvhData;
3304 #endif
3305 
3306  // Stop the timer for this copy as it is stopped in other algorithms as well
3307  m_timer.end();
3308  KernelInput& in = *(KernelInput*)m_module->getGlobal("c_in").getMutablePtr();
3309  m_timer.start();
3310  in.numRays = rays.getSize();
3311  in.anyHit = (rays.getNeedClosestHit() == false);
3312  memcpy(&in.bmin, &m_bbox.min, sizeof(float3));
3313  memcpy(&in.bmax, &m_bbox.max, sizeof(float3));
3314  in.nodesA = nodePtr + nodeOfsA.x;
3315  in.trisA = triPtr + triOfsA.x;
3316  in.rays = rays.getRayBuffer().getCudaPtr();
3317  in.results = rays.getResultBuffer().getMutableCudaPtr();
3318  in.triIndices = indexBuf.getCudaPtr();
3319 
3320  // Set texture references.
3321  m_module->setTexRef("t_rays", rays.getRayBuffer(), CU_AD_FORMAT_FLOAT, 4);
3322  m_module->setTexRef("t_nodesI", nodePtr + nodeOfsA.x, nodeOfsA.y, CU_AD_FORMAT_FLOAT, 4);
3323  //m_module->setTexRef("t_trisA", triPtr + triOfsA.x, triOfsA.y, CU_AD_FORMAT_FLOAT, 4);
3324  //m_module->setTexRef("t_triIndices", indexBuf, CU_AD_FORMAT_SIGNED_INT32, 1);
3325 
3326  if(rebuild)
3327  {
3328  int baseOffset = setDynamicMemory();
3329  inBVH = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
3330 
3331 #if SPLIT_TYPE == 3
3332  m_splitData.clearRange(0, 0, sizeof(SplitInfoTri)); // Set first split to zeros
3333 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3334 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
3335  SplitRed split;
3336  for(int i = 0; i < 2; i++)
3337  {
3338  split.children[i].bbox.m_mn = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
3339  split.children[i].bbox.m_mx = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
3340  split.children[i].cnt = 0;
3341  }
3342 
3343  SplitArray sArray;
3344  for(int i = 0; i < NUM_WARPS; i++)
3345  {
3346  for(int j = 0; j < PLANE_COUNT; j++)
3347  sArray.splits[i][j] = split;
3348  }
3349 #else
3350  //SplitRed split;
3351  //for(int i = 0; i < 2; i++)
3352  //{
3353  // //split.children[i].bbox.m_mn = make_float3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
3354  // //split.children[i].bbox.m_mx = make_float3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
3355  // split.children[i].bbox.m_mn = make_int3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
3356  // split.children[i].bbox.m_mx = make_int3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
3357  // split.children[i].cnt = 0;
3358  //}
3359 
3360  //SplitArray sArray;
3361  //for(int j = 0; j < PLANE_COUNT; j++)
3362  // sArray.splits[j] = split;
3363 
3364  //m_splitData.setRange(0, &sArray, sizeof(SplitArray)); // Set the first splits
3365  m_splitData.clearRange(0, 0, sizeof(SplitInfoTri)); // Set first split to zeros
3366 #endif
3367 
3368  //m_splitData.setRange(TASK_SIZE * sizeof(SplitArray), &sArray, sizeof(SplitArray)); // Set the last splits for copy
3369  // Prepare split stack
3370  //SplitArray* &splits = *(SplitArray**)m_module->getGlobal("g_redSplits").getMutablePtr();
3371  //splits = (SplitArray*)m_splitData.getMutableCudaPtr();
3372 
3373  SplitInfoTri* &splits = *(SplitInfoTri**)m_module->getGlobal("g_splitStack").getMutablePtr();
3374  splits = (SplitInfoTri*)m_splitData.getMutableCudaPtr();
3375 #endif
3376 
3377  m_bvhData.clearRange32(0, UNBUILD_FLAG, sizeof(CudaKdtreeNode)); // Set the root as unbuild
3378 
3379  CudaAABB bbox;
3380  memcpy(&bbox.m_mn, &m_bbox.min, sizeof(float3));
3381  memcpy(&bbox.m_mx, &m_bbox.max, sizeof(float3));
3382 
3383  // Set parent task containing all the work
3384  TaskBVH all;
3385  all.triStart = 0;
3386  all.triLeft = 0;
3387  all.triRight = 0;
3388  all.triEnd = m_numTris;
3389  all.bbox = bbox;
3390  all.step = 0;
3391  all.lock = LockType_Free;
3392  all.bestCost = 1e38f;
3393  all.depth = 0;
3394  all.dynamicMemory= baseOffset;
3395 #ifdef MALLOC_SCRATCHPAD
3396  all.subFailureCounter = 0;
3397 #endif
3398  all.parentIdx = -1;
3399  all.nodeIdx = 0;
3400  all.taskID = 0;
3401  Vector3 size = m_bbox.Diagonal();
3402  all.axis = size.MajorAxis();
3403  all.pivot = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.subtreeLimit");
3404  all.terminatedBy = TerminatedBy_None;
3405 #ifdef DEBUG_INFO
3406  all.sync = 0;
3407  all.parent = -1;
3408  all.clockStart = 0;
3409  all.clockEnd = 0;
3410 #endif
3411  all.cached = LockType_None; // Mark this task as cached
3412 
3413 #if SPLIT_TYPE == 0
3414 #if SCAN_TYPE == 0
3415  all.type = TaskType_Sort_PPS1;
3416 #elif SCAN_TYPE == 1
3417  all.type = TaskType_Sort_PPS1_Up;
3418 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
3419  all.type = TaskType_Sort_SORT1;
3420 #endif
3421  all.unfinished = warpSubtasks(m_numTris);
3422  float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
3423  if(all.axis == 0)
3424  all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
3425  else if(all.axis == 1)
3426  all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
3427  else
3428  all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
3429 #elif SPLIT_TYPE == 1
3430  all.type = TaskType_Split;
3431 #if 0 // SQRT candidates
3432  int evaluatedCandidates = (int)sqrtf(m_numTris);
3433  int evaluatedCandidates = 1;
3434  int numPlanes = 0.5f * m_numTris/evaluatedCandidates;
3435 #elif 0 // Fixed candidates
3436  int numPlanes = 32768;
3437 #else // All candidates
3438  int numPlanes = m_numTris*6; // Number of warp sized subtasks
3439 #endif
3440  all.unfinished = warpSubtasks(numPlanes); // This must be the same as in the GPU code
3441 #elif SPLIT_TYPE == 2
3442  all.type = TaskType_Split;
3443  all.unfinished = 1;
3444 #elif SPLIT_TYPE == 3
3445  all.type = TaskType_SplitParallel;
3446  int evaluatedRays = warpSubtasks((int)sqrtf(m_numRays));
3447  int evaluatedTris = warpSubtasks((int)sqrtf(m_numTris));
3448  all.unfinished = PLANE_COUNT*(evaluatedRays+evaluatedTris); // Each WARP_SIZE rays and tris add their result to one plane
3449 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3450 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
3451  all.type = TaskType_InitMemory;
3452  all.unfinished = warpSubtasks(sizeof(SplitArray)/sizeof(int));
3453 #else
3454  all.type = TaskType_BinTriangles;
3455  all.unfinished = (warpSubtasks(m_numTris)+BIN_MULTIPLIER-1)/BIN_MULTIPLIER;
3456 #endif
3457 #endif
3458  all.origSize = all.unfinished;
3459 
3460  m_taskData.setRange(TASK_SIZE * sizeof(int), &all, sizeof(TaskBVH)); // Set the first task
3461 
3462  // Set parent task header
3463  m_taskData.setRange(0, &all.unfinished, sizeof(int)); // Set the first task
3464  }
3465 
3466  // Prepare the task stack
3467  TaskStackBVH& tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getMutablePtr();
3468 
3469  tasks.header = (int*)m_taskData.getMutableCudaPtr();
3470  tasks.tasks = (TaskBVH*)m_taskData.getMutableCudaPtr(TASK_SIZE * sizeof(int));
3471  tasks.launchFlag = 0;
3472 
3473  if(rebuild)
3474  {
3475 #ifndef INTERLEAVED_LAYOUT
3476  tasks.nodeTop = 1;
3477 #else
3478  tasks.nodeTop = sizeof(CudaKdtreeNode);
3479 #endif
3480  tasks.triTop = 0;
3481  tasks.top = 0;
3482  tasks.bottom = 0;
3483  //memset(tasks.active, 0, sizeof(int)*(ACTIVE_MAX+1));
3484  memset(tasks.active, -1, sizeof(int)*(ACTIVE_MAX+1));
3485  tasks.active[0] = 0;
3486  /*for(int i = 0; i < ACTIVE_MAX+1; i++)
3487  tasks.active[i] = i;*/
3488  tasks.activeTop = 1;
3489  //tasks.empty[0] = 0;
3490  //int j = 1;
3491  //for(int i = EMPTY_MAX; i > 0; i--, j++)
3492  // tasks.empty[i] = j;
3493  memset(tasks.empty, 0, sizeof(int)*(EMPTY_MAX+1));
3494  tasks.emptyTop = 0;
3495  tasks.emptyBottom = 0;
3496  tasks.numSortedTris = 0;
3497  tasks.numNodes = 0;
3498  tasks.numLeaves = 0;
3499  tasks.numEmptyLeaves = 0;
3500  tasks.sizePool = TASK_SIZE;
3501  tasks.sizeNodes = m_bvhData.getSize()/sizeof(CudaKdtreeNode);
3502  tasks.sizeTris = m_trisIndexOut.getSize()/sizeof(S32);
3503  memset(tasks.leafHist, 0, sizeof(tasks.leafHist));
3504  }
3505  /*else
3506  {
3507  tasks.emptyTop = 3;
3508  }*/
3509 
3510  tasks.warpCounter = rays.getSize();
3511 #ifndef ONDEMAND_FULL_BUILD
3512  tasks.unfinished = -NUM_WARPS; // We are waiting for all trace warps to finish
3513 #else
3514  tasks.unfinished = -1; // We are waiting for one task to finish = task all
3515 #endif
3516 
3517  CudaKdtreeNode* &kdtree = *(CudaKdtreeNode**)m_module->getGlobal("g_kdtree").getMutablePtr();
3518  kdtree = (CudaKdtreeNode*)m_bvhData.getMutableCudaPtr();
3519 
3520  // Determine block and grid sizes.
3521 #ifdef ONE_WARP_RUN
3522  Vec2i blockSize(WARP_SIZE, 1); // threadIdx.x must equal the thread lane in warp
3523  Vec2i gridSize(1, 1); // Number of SMs * Number of blocks?
3524  int numWarps = 1;
3525 #else
3526  int numWarpsPerBlock = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numWarpsPerBlock");
3527  int numBlocksPerSM = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numBlockPerSM");
3528  Vec2i blockSize(WARP_SIZE, numWarpsPerBlock); // threadIdx.x must equal the thread lane in warp
3529  int gridSizeX = NUM_SM*numBlocksPerSM;
3530  int numWarps = numWarpsPerBlock*gridSizeX;
3531  Vec2i gridSize(gridSizeX, 1); // Number of SMs * Number of blocks?
3532 
3533  if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
3534  printf("\aNUM_WARPS constant does not match the launch parameters\n");
3535 #endif
3536 
3537  m_debug.resizeDiscard(blockSize.y*gridSize.x*sizeof(float4));
3538  m_debug.clear();
3539  inBVH.debug = m_debug.getMutableCudaPtr();
3540 
3541  // Launch.
3542  //cuFuncSetSharedSize(kernel, 0); // Set shared memory to force some launch configurations
3543  float tKernel = 0.f;
3544 #ifndef DUPLICATE_REFERENCES
3545  if(rebuild)
3546  tKernel += convertWoop();
3547 #endif
3548  tKernel += m_module->launchKernelTimed(kernel, blockSize, gridSize);
3549 
3550 /*#ifdef MALLOC_SCRATCHPAD
3551  CUfunction kernelDealloc = m_module->getKernel("deallocFreeableMemory", 0);
3552  if (!kernelDealloc)
3553  fail("Memory allocation kernel not found!");
3554 
3555  F32 deallocTime = m_module->launchKernelTimed(kernelDealloc, Vec2i(1,1), Vec2i(1, 1));
3556 
3557  printf("Memory freed in %f\n", deallocTime);
3558 #endif*/
3559 
3560 #ifndef BENCHMARK
3561  cuCtxSynchronize(); // Flushes printfs
3562 #endif
3563 
3564  tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getPtr();
3565 #ifndef INTERLEAVED_LAYOUT
3566  if(tasks.unfinished != 0 || tasks.top > tasks.sizePool || tasks.nodeTop > m_bvhData.getSize() / sizeof(CudaKdtreeNode) || tasks.triTop > m_trisIndexOut.getSize() / sizeof(S32)) // Something went fishy
3567 #else
3568  if(tasks.unfinished != 0 || tasks.nodeTop > m_bvhData.getSize()) // Something went fishy
3569 #endif
3570  tKernel = 1e38f;
3571 
3572  //Debug << "\nBuild in " << tKernel << "\n\n";
3573 
3574 #ifndef BENCHMARK
3575  printPool(tasks, numWarps);
3576 
3577  /*Debug << "\n\nBVH" << "\n";
3578  CudaBVHNode* nodes = (CudaBVHNode*)m_bvhData.getPtr();
3579 
3580  for(int i = 0; i < tasks.nodeTop; i++)
3581  {
3582  Debug << "Node " << i << "\n";
3583  Debug << "BoxLeft: (" << nodes[i].c0xy.x << ", " << nodes[i].c0xy.z << ", " << nodes[i].c01z.x << ") - ("
3584  << nodes[i].c0xy.y << ", " << nodes[i].c0xy.w << ", " << nodes[i].c01z.y << ")\n";
3585  Debug << "BoxRight: (" << nodes[i].c1xy.x << ", " << nodes[i].c1xy.z << ", " << nodes[i].c01z.z << ") - ("
3586  << nodes[i].c1xy.y << ", " << nodes[i].c1xy.w << ", " << nodes[i].c01z.w << ")\n";
3587  Debug << "Children: " << nodes[i].children.x << ", " << nodes[i].children.y << "\n\n";
3588  }*/
3589 #endif
3590 
3591  return tKernel;
3592 }
3593 
3594 F32 CudaPersistentBVHTracer::traceCpuRayBuffer(RayBuffer& rb)
3595 {
3596  const Ray* rays = (const Ray*)rb.getRayBuffer().getPtr();
3597  RayResult* results = (RayResult*)rb.getResultBuffer().getMutablePtr();
3598  for(int rid=0; rid < rb.getSize(); rid++)
3599  {
3600  if(rid % 10000 == 0) printf("rid: %d\n",rid);
3601  traceCpuRay(rays[rid], results[rid], !rb.getNeedClosestHit());
3602  }
3603 
3604  return 0;
3605 }
3606 
3607 void CudaPersistentBVHTracer::traceCpuRay(const Ray& r, RayResult& result, bool anyHit)
3608 {
3609  const Vec4f *t_trisA = (Vec4f*)(m_trisCompact.getPtr());
3610  const S32 *t_trisIndices = (S32*)(m_trisIndex.getPtr());
3611 
3612  int hitIndex;
3613  float hitT;
3614  float hitU;
3615  float hitV;
3616  float tmin;
3617 
3618  hitIndex = -1;
3619  hitT = r.tmax;
3620  hitU = 0;
3621  hitV = 0;
3622  tmin = 0;
3623 
3624  // naive over all triangles
3625  for (int triAddr = 0; triAddr < m_numTris * 3 ; triAddr += 3)
3626  {
3627  const Vec3f &v00 = t_trisA[triAddr + 0].getXYZ();
3628  const Vec3f &v11 = t_trisA[triAddr + 1].getXYZ();
3629  const Vec3f &v22 = t_trisA[triAddr + 2].getXYZ();
3630 
3631  Vec3f nrmN = cross(v11-v00,v22-v00);
3632  const float den = dot(nrmN,r.direction);
3633 
3634  if(den >= 0.0f)
3635  continue;
3636 
3637  const float deni = 1.0f / den;
3638  const Vec3f org0 = v00-r.origin;
3639  float t = dot(nrmN,org0)*deni;
3640 
3641  if (t > tmin && t < hitT)
3642  {
3643  const Vec3f crossProd = cross(r.direction,org0);
3644  const float v = dot(v00-v22,crossProd)*deni;
3645  if (v >= 0.0f && v <= 1.0f)
3646  {
3647  const float u = -dot(v00-v11,crossProd)*deni;
3648  if (u >= 0.0f && u + v <= 1.0f)
3649  {
3650  hitT = t;
3651  hitU = u;
3652  hitV = v;
3653  hitIndex = triAddr;
3654  }
3655  }
3656  }
3657  }
3658 
3659  if(hitIndex != -1)
3660  hitIndex = hitIndex / 3;
3661 
3662  result.id = hitIndex;
3663  result.t = hitT;
3664  result.u = hitU;
3665  result.v = hitV;
3666 }
3667 
3668 void CudaPersistentBVHTracer::saveBufferSizes(bool ads, bool aux)
3669 {
3670  float MB = (float)(1024*1024);
3671 
3672  if(ads)
3673  {
3674  m_sizeADS = m_bvhData.getSize()/MB;
3675 #ifndef COMPACT_LAYOUT
3676  m_sizeTri = m_trisCompact.getSize()/MB;
3677  m_sizeTriIdx = m_trisIndex.getSize()/MB;
3678 #else
3679  m_sizeTri = m_trisCompactOut.getSize()/MB;
3680  m_sizeTriIdx = m_trisIndexOut.getSize()/MB;
3681 #endif
3682  }
3683 
3684  if(aux)
3685  {
3686  m_sizeTask = m_taskData.getSize()/MB;
3687  m_sizeSplit = m_splitData.getSize()/MB;
3688 #ifdef MALLOC_SCRATCHPAD
3689 #if !defined(ATOMIC_MALLOC) && !defined(SCATTER_ALLOC) && !defined(CIRCULAR_MALLOC)
3690  size_t heapSize;
3691  cuCtxGetLimit(&heapSize, CU_LIMIT_MALLOC_HEAP_SIZE);
3692  m_heap = heapSize/MB;
3693 #else
3694  m_heap = (m_mallocData.getSize()+m_mallocData2.getSize())/MB;
3695 #endif
3696 #else
3697  m_heap = 0.f;
3698 #endif
3699  }
3700 }
3701 
3702 void CudaPersistentBVHTracer::prepareDynamicMemory()
3703 {
3704  // Set the memory limit according to triangle count
3705  //U64 allocSize = (U64)m_trisIndex.getSize()*15ULL;
3706  //U64 allocSize = (U64)m_trisIndex.getSize()*20ULL;
3707  U64 allocSize = (U64)m_trisIndex.getSize()*150ULL;
3708  //U64 allocSize = (U64)m_trisIndex.getSize()*200ULL;
3709 
3710 #if defined(SCATTER_ALLOC) || defined(FDG_ALLOC)
3711  U64 allocSize = max(allocSize, 8ULL*1024ULL*1024ULL);
3712 #endif
3713 
3714 #if !defined(ATOMIC_MALLOC) && !defined(SCATTER_ALLOC) && !defined(CIRCULAR_MALLOC)
3715  cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, allocSize);
3716 #elif defined(ATOMIC_MALLOC) || defined(CIRCULAR_MALLOC)
3717  m_mallocData.resizeDiscard(allocSize);
3718 #ifdef WITH_SCATTER_ALLOC
3719  m_mallocData2.resizeDiscard(allocSize);
3720 #endif
3721 #elif defined(SCATTER_ALLOC)
3722  m_mallocData.resizeDiscard(allocSize);
3723 #endif
3724 
3725 #if defined(SCATTER_ALLOC) || defined(WITH_SCATTER_ALLOC)
3726  // CUDA Driver API cannot deal with templates -> use C++ mangled name
3727  CUfunction initHeap = m_module->getKernel("_ZN8GPUTools8initHeapILj4096ELj8ELj16ELj2ELb0ELb1EEEvPNS_10DeviceHeapIXT_EXT0_EXT1_EXT2_EXT3_EXT4_EEEPvj", 2*sizeof(CUdeviceptr)+sizeof(int));
3728  if (!initHeap)
3729  fail("Scatter alloc initialization kernel not found!");
3730 
3731  int offset = 0;
3732  offset += m_module->setParamPtr(initHeap, offset, m_module->getGlobal("theHeap").getMutableCudaPtr());
3733 #ifdef WITH_SCATTER_ALLOC
3734  offset += m_module->setParamPtr(initHeap, offset, m_mallocData2.getMutableCudaPtr());
3735 #else
3736  offset += m_module->setParamPtr(initHeap, offset, m_mallocData.getMutableCudaPtr());
3737 #endif
3738  offset += m_module->setParami(initHeap, offset, allocSize);
3739  F32 initTime = m_module->launchKernelTimed(initHeap, Vec2i(256,1), Vec2i(1, 1));
3740 
3741  printf("Scatter alloc initialized in %f\n", initTime);
3742 #endif
3743 }
3744 
3745 int CudaPersistentBVHTracer::setDynamicMemory()
3746 {
3747  int baseOffset = 0;
3748 #if !defined(ATOMIC_MALLOC) && !defined(CIRCULAR_MALLOC)
3749  CUfunction kernelAlloc = m_module->getKernel("allocFreeableMemory", 2*sizeof(int));
3750  if (!kernelAlloc)
3751  fail("Memory allocation kernel not found!");
3752 
3753  int offset = 0;
3754  offset += m_module->setParami(kernelAlloc, offset, m_numTris);
3755  offset += m_module->setParami(kernelAlloc, offset, 0);
3756  F32 allocTime = m_module->launchKernelTimed(kernelAlloc, Vec2i(1,1), Vec2i(1, 1));
3757 
3758 #ifndef BENCHMARK
3759  printf("Memory allocated in %f\n", allocTime);
3760 #endif
3761 #else
3762  // Set the heapBase, heapOffset and heapSize
3763  char*& heapBase = *(char**)m_module->getGlobal("g_heapBase").getMutablePtr();
3764  heapBase = (char*)m_mallocData.getMutableCudaPtr();
3765  int& heapOffset = *(int*)m_module->getGlobal("g_heapOffset").getMutablePtr();
3766 #if SCAN_TYPE < 2
3767  heapOffset = 4*m_numTris*sizeof(int);
3768 #else
3769  heapOffset = m_numTris*sizeof(int);
3770 #endif
3771 
3772  int& heapSize = *(int*)m_module->getGlobal("g_heapSize").getMutablePtr();
3773  heapSize = m_mallocData.getSize();
3774 
3775 #if defined(CIRCULAR_MALLOC)
3776 #ifndef DOUBLY_LINKED
3777  int headerSize = 2*sizeof(int);
3778 #else
3779  int headerSize = 3*sizeof(int);
3780 #endif
3781  heapOffset += headerSize;
3782 
3783  int& heapLock = *(int*)m_module->getGlobal("g_heapLock").getMutablePtr();
3784  heapLock = 0;
3785 
3786 #ifndef DOUBLY_LINKED
3787  Vec2i first(LockType_Set, heapOffset); // Locked, allocated memory for parent
3788  m_mallocData.setRange(0, &first, sizeof(Vec2i)); // Set the first header
3789 #else
3790  Vec3i first(LockType_Set, heapSize-headerSize, heapOffset); // Locked, allocated memory for parent
3791  m_mallocData.setRange(0, &first, sizeof(Vec3i)); // Set the first header
3792 #endif
3793 
3794 #ifdef GLOBAL_HEAP_LOCK
3795 #ifndef DOUBLY_LINKED
3796  Vec2i second(LockType_Free, heapSize-headerSize); // Unlocked, next at the end of allocated memory
3797  m_mallocData.setRange(heapOffset, &second, sizeof(Vec2i)); // Set the second header
3798 #else
3799  Vec3i second(LockType_Free, 0, heapSize-headerSize); // Unlocked, next at the end of allocated memory
3800  m_mallocData.setRange(heapOffset, &second, sizeof(Vec3i)); // Set the second header
3801 #endif
3802 #else
3803 #if 0
3804  // Create regular chunks
3805  int numChunks = m_mallocData.getSize()/heapOffset;
3806  for(int i = 1; i < numChunks; i++)
3807  {
3808  #ifndef DOUBLY_LINKED
3809  Vec2i next(0, (i+1)*heapOffset); // Unlocked, next at the multiple of heapOffset
3810  m_mallocData.setRange(i*heapOffset, &next, sizeof(Vec2i)); // Set the next header
3811  #else
3812  Vec3i next(0, (i-1)*heapOffset, (i+1)*heapOffset); // Unlocked, next at the multiple of heapOffset
3813  m_mallocData.setRange(i*heapOffset, &next, sizeof(Vec3i)); // Set the next header
3814  #endif
3815  }
3816 
3817 #else
3818  // Create hierarchical chunks
3819  int delta = ((int)(heapOffset)+headerSize+3) & -4;
3820  int prevOfs = 0;
3821  int ofs;
3822  int i = 0;
3823  int lvl = 2;
3824  for(ofs = heapOffset; true; ofs += delta, i++)
3825  {
3826  if(i == lvl) // New level in BFS order
3827  {
3828  delta = ((int)(delta * 0.8f)+headerSize+3) & -4;
3829  i = 0;
3830  lvl *= 2;
3831  }
3832 
3833  if(ofs+delta >= heapSize-2*headerSize) // We cannot make another chunk
3834  break;
3835 
3836 #ifndef DOUBLY_LINKED
3837  Vec2i next(LockType_Free, ofs+delta); // Unlocked, next at the multiple of heapOffset
3838  m_mallocData.setRange(ofs, &next, sizeof(Vec2i)); // Set the next header
3839 #else
3840  Vec3i next(LockType_Free, prevOfs, ofs+delta); // Unlocked, next at the multiple of heapOffset
3841  m_mallocData.setRange(ofs, &next, sizeof(Vec3i)); // Set the next header
3842 #endif
3843 
3844  prevOfs = ofs;
3845  }
3846 #endif
3847 
3848 #ifndef DOUBLY_LINKED
3849  Vec2i last(LockType_Free, heapSize-headerSize); // Unlocked, next at the end of allocated memory
3850  m_mallocData.setRange(ofs, &last, sizeof(Vec2i)); // Set the last header
3851 #else
3852  Vec3i last(LockType_Free, prevOfs, heapSize-headerSize); // Unlocked, next at the end of allocated memory
3853  m_mallocData.setRange(ofs, &last, sizeof(Vec3i)); // Set the last header
3854 #endif
3855 #endif
3856 
3857 #ifndef DOUBLY_LINKED
3858  Vec2i tail(LockType_Set, 0); // Locked, next at the start of heap
3859  m_mallocData.setRange(heapSize-headerSize, &tail, sizeof(Vec2i)); // Set the last header
3860 #else
3861  Vec3i tail(LockType_Set, ofs, 0); // Locked, next at the start of heap
3862  m_mallocData.setRange(heapSize-headerSize, &tail, sizeof(Vec3i)); // Set the last header
3863 #endif
3864 
3865  // Offset of the memory allocation
3866  baseOffset = headerSize;
3867 
3868 #ifdef WITH_SCATTER_ALLOC
3869  // With scatter alloc
3870  char*& heapBase2 = *(char**)m_module->getGlobal("g_heapBase2").getMutablePtr();
3871  heapBase2 = (char*)m_mallocData2.getMutableCudaPtr();
3872 #endif
3873 #endif
3874 
3875  int offset;
3876 #endif
3877 
3878  CUfunction kernelMemCpyIndex = m_module->getKernel("MemCpyIndex", sizeof(CUdeviceptr)+sizeof(int));
3879  if (!kernelMemCpyIndex)
3880  fail("Memory copy kernel not found!");
3881 
3882  int memSize = m_trisIndex.getSize()/sizeof(int);
3883  offset = 0;
3884  offset += m_module->setParamPtr(kernelMemCpyIndex, offset, m_trisIndex.getCudaPtr());
3885  offset += m_module->setParami(kernelMemCpyIndex, offset, memSize);
3886  F32 memcpyTime = m_module->launchKernelTimed(kernelMemCpyIndex, Vec2i(256,1), Vec2i((memSize-1+256)/256, 1));
3887 
3888 #ifndef BENCHMARK
3889  printf("Triangle indices copied in %f\n", memcpyTime);
3890 #endif
3891 
3892 #ifdef SCATTER_ALLOC
3893  CUdeviceptr& heap = *(CUdeviceptr*)m_module->getGlobal("g_heapBase").getMutablePtr();
3894  CUdeviceptr base = m_mallocData.getMutableCudaPtr();
3895  baseOffset = heap - base;
3896  heap = base;
3897  //if(heap != m_mallocData.getCudaPtr())
3898  // printf("Wrong base address!\n");
3899 #endif
3900 
3901  return baseOffset;
3902 }
3903 
3904 
3906 {
3907  // Convert woop triangles
3908  CUfunction kernelCreateWoop = m_module->getKernel("createWoop", 2*sizeof(CUdeviceptr)+sizeof(int));
3909  if (!kernelCreateWoop)
3910  fail("Regular triangle to Woop triangle conversion kernel not found!");
3911 
3912  int offset = 0;
3913  offset += m_module->setParamPtr(kernelCreateWoop, offset, m_trisCompact.getCudaPtr());
3914  offset += m_module->setParamPtr(kernelCreateWoop, offset, m_trisCompactOut.getMutableCudaPtr());
3915  offset += m_module->setParami(kernelCreateWoop, offset, m_numTris);
3916  F32 woopTime = m_module->launchKernelTimed(kernelCreateWoop, Vec2i(256,1), Vec2i((m_numTris-1+256)/256, 1));
3917 
3918 #ifndef BENCHMARK
3919  printf("Woop triangles created in %f\n", woopTime);
3920 #endif
3921 
3922  return woopTime;
3923 }
3924 
3925 void CudaPersistentBVHTracer::resetBuffers(bool resetADSBuffers)
3926 {
3927  // Reset buffers so that reuse of space does not cause timing disturbs
3928  if(resetADSBuffers)
3929  {
3930  m_bvhData.reset();
3931  m_trisCompactOut.reset();
3932  m_trisIndexOut.reset();
3933  }
3934 
3935  m_mallocData.reset();
3936  m_mallocData2.reset();
3937  m_taskData.reset();
3938  m_splitData.reset();
3939 
3940  m_raysIndex.reset();
3941 
3942  m_ppsTris.reset();
3943  m_ppsTrisIndex.reset();
3944  m_sortTris.reset();
3945  m_ppsRays.reset();
3946  m_ppsRaysIndex.reset();
3947  m_sortRays.reset();
3948 }
3949 
3951 {
3952  // Save sizes of auxiliary buffers so that they can be printed
3953  saveBufferSizes(false, true);
3954  // Free auxiliary buffers
3955  resetBuffers(false);
3956 
3957  // Resize to exact memory
3958  U32 nN, nL, eL, sT, bT, tT, sTr;
3959  getStats(nN, nL, eL, sT, bT, tT, sTr);
3960 #ifdef COMPACT_LAYOUT
3961  m_bvhData.resize(nN * sizeof(CudaBVHNode));
3962  m_trisCompactOut.resize(tT*3*sizeof(float4) + nL*sizeof(float4));
3963  m_trisIndexOut.resize(tT*3*sizeof(int) + nL*sizeof(int));
3964 #else
3965  m_bvhData.resize((nN + nL) * sizeof(CudaBVHNode));
3966 #endif
3967 
3968  // Save sizes of ads buffers so that they can be printed
3969  saveBufferSizes(true, false);
3970 }
3971 
3973 {
3974  // Save sizes of auxiliary buffers so that they can be printed
3975  saveBufferSizes(false, true);
3976  // Free auxiliary buffers
3977  resetBuffers(false);
3978 
3979  // Resize to exact memory
3980  U32 nN, nL, eL, sT, nT, tT, sTr;
3981  getStats(nN, nL, eL, sT, nT, tT, sTr);
3982 #ifndef INTERLEAVED_LAYOUT
3983 #ifndef COMPACT_LAYOUT
3984  getStats(nN, nL, eL, sT, nT, tT, sTr, false);
3985  m_bvhData.resize((nN + nL) * sizeof(CudaKdtreeNode));
3986  m_trisCompactOut.resize(tT*3*sizeof(float4));
3987  m_trisIndexOut.resize(tT*3*sizeof(int));
3988 #else
3989 #ifdef DUPLICATE_REFERENCES
3990  m_bvhData.resize(nN * sizeof(CudaKdtreeNode));
3991  m_trisCompactOut.resize(tT*3*sizeof(float4) + nL*sizeof(float4));
3992  m_trisIndexOut.resize(tT*3*sizeof(int) + nL*sizeof(int));
3993 #else
3994  m_bvhData.resize(nN * sizeof(CudaKdtreeNode));
3995  m_trisIndexOut.resize(tT*sizeof(int) + nL*sizeof(int));
3996 #endif
3997 #endif
3998 #else
3999  //m_bvhData.resize((nN + nL) * sizeof(CudaKdtreeNode) + tT*3*sizeof(float4) + tT*3*sizeof(int));
4000  m_bvhData.resize(nT);
4001 #endif
4002 
4003  // Save sizes of ads buffers so that they can be printed
4004  saveBufferSizes(true, false);
4005 }
4006 
4007 void CudaPersistentBVHTracer::getStats(U32& nodes, U32& leaves, U32& emptyLeaves, U32& stackTop, U32& nodeTop, U32& tris, U32& sortedTris, bool sub)
4008 {
4009  TaskStackBVH tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getPtr();
4010 
4011 #ifndef INTERLEAVED_LAYOUT
4012 #ifndef BVH_COUNT_NODES
4013 #ifndef COMPACT_LAYOUT
4014  nodes = tasks.nodeTop / 2;
4015  leaves = tasks.nodeTop - nodes;
4016 #else
4017  nodes = tasks.nodeTop;
4018  leaves = tasks.triTop;
4019  emptyLeaves = 0;
4020 #endif
4021 #else // BVH_COUNT_NODES
4022  nodes = tasks.numNodes;
4023  leaves = tasks.numLeaves;
4024  emptyLeaves = tasks.numEmptyLeaves;
4025 #endif // BVH_COUNT_NODES
4026 
4027 #ifdef COMPACT_LAYOUT
4028  tris = tasks.triTop;
4029  if(sub)
4030  tris -= (leaves-emptyLeaves);
4031 #ifdef DUPLICATE_REFERENCES
4032  tris /= 3;
4033 #endif
4034 #else
4035  if(sub)
4036  {
4037  tris = m_numTris;
4038  }
4039  else
4040  {
4041  tris = tasks.triTop;
4042  tris /= 3;
4043  }
4044 #endif
4045 #else
4046 #ifndef BVH_COUNT_NODES
4047  nodes = tasks.nodeTop / 2;
4048  leaves = tasks.nodeTop - nodes;
4049  emptyLeaves = 0;
4050 #else // BVH_COUNT_NODES
4051  nodes = tasks.numNodes;
4052  leaves = tasks.numLeaves;
4053  emptyLeaves = tasks.numEmptyLeaves;
4054 #endif // BVH_COUNT_NODES
4055 
4056  tris = tasks.nodeTop - (nodes+leaves)*sizeof(CudaKdtreeNode); // Subtract node memory
4057  tris /= 3*sizeof(float4)+sizeof(int); // Only approximate because of padding
4058 #endif
4059 
4060  nodeTop = tasks.nodeTop;
4061  sortedTris = tasks.numSortedTris;
4062  stackTop = tasks.top;
4063 }
4064 
4065 void CudaPersistentBVHTracer::getSizes(F32& task, F32& split, F32& ads, F32& tri, F32& triIdx, F32& heap)
4066 {
4067  task = m_sizeTask;
4068  split = m_sizeSplit;
4069  ads = m_sizeADS;
4070  tri = m_sizeTri;
4071  triIdx = m_sizeTriIdx;
4072  heap = m_heap;
4073 }
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int offset
Definition: DLLImports.inl:84
void resetBuffers(bool resetADSBuffers)
S32 getSize() const
Gets size of the buffer (number of rays).
Definition: RayBuffer.hpp:52
#define NULL
Definition: Defs.hpp:39
bool endsWith(const String &str) const
Definition: String.cpp:273
void setRange(S64 dstOfs, const void *src, S64 size, bool async=false, CUstream cudaStream=NULL)
Definition: Buffer.cpp:149
Buffer & getTriVtxIndexBuffer(void)
Returns buffer of triangle's vertex indieces.
Definition: Scene.hpp:75
float GetFloat(const char *name, const bool isFatal=false) const
void clear(int value=0)
Definition: Buffer.hpp:100
CudaModule * compile(bool enablePrints=true, bool autoFail=true)
int GetInt(const char *name, const bool isFatal=false) const
void getStats(U32 &nodes, U32 &leaves, U32 &emptyLeaves, U32 &stackTop, U32 &nodeTop, U32 &tris, U32 &sortedTris, bool sub=true)
Buffer & getVtxPosBuffer(void)
Returns vertex position buffer.
Definition: Scene.hpp:103
float t
Definition: Util.hpp:84
void unstart(void)
Definition: Timer.hpp:43
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N
Definition: DLLImports.inl:132
CUdeviceptr getCudaPtr(S64 ofs=0)
Definition: Buffer.hpp:108
unsigned __int64 U64
Definition: Defs.hpp:97
S32 numTriangleTests
Total number of ray-triangle tests.
Definition: BVH.hpp:63
S32 numRays
Total number of rays.
Definition: BVH.hpp:62
void start(void)
Definition: Timer.hpp:42
F32 traceBatchBVH(RayBuffer &rays, RayStats *stats=NULL)
CudaKernel getKernel(const String &name)
Definition: CudaModule.cpp:80
FW_CUDA_FUNC Vec3f getXYZ(void) const
Definition: Math.hpp:365
FW_CUDA_FUNC F32 sqrt(F32 a)
Definition: Math.hpp:39
int setParami(CUfunction kernel, int offset, S32 value)
Definition: CudaModule.cpp:90
Definition: Util.hpp:62
S64 getSize(void) const
Definition: Buffer.hpp:69
Structure holding ray statistics. Also provides print to the console. These statistics are used in a ...
Definition: BVH.hpp:45
const U8 * getPtr(S64 ofs=0)
Definition: Buffer.hpp:106
void clearOptions(void)
void setOwner(Module module, bool modify, bool async=false, CUstream cudaStream=NULL, S64 validSize=-1)
Definition: Buffer.cpp:220
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx cuCtxSynchronize
Definition: DLLImports.inl:58
FW_CUDA_FUNC T dot(const VectorBase< T, L, S > &a, const VectorBase< T, L, V > &b)
Definition: Math.hpp:477
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei GLint GLenum GLenum const GLvoid pixels GLint GLsizei const GLfloat value GLint GLfloat GLfloat v1 GLint GLfloat GLfloat GLfloat v2 GLint GLsizei const GLfloat value GLint GLsizei GLboolean const GLfloat value GLuint program GLuint GLfloat x
Definition: DLLImports.inl:363
Buffer & getRayBuffer()
Gets ray buffer.
Definition: RayBuffer.hpp:167
bool getNeedClosestHit() const
Returns whether the closest hit is needed.
Definition: RayBuffer.hpp:150
void setTexRef(const String &name, Buffer &buf, CUarray_format format, int numComponents)
Definition: CudaModule.cpp:193
void define(const String &key, const String &value="")
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei GLint GLenum GLenum const GLvoid pixels GLint GLsizei const GLfloat value GLint GLfloat GLfloat v1 GLint GLfloat GLfloat GLfloat v2 GLint GLsizei const GLfloat value GLint GLsizei GLboolean const GLfloat value GLuint program GLuint GLfloat GLfloat GLfloat z
Definition: DLLImports.inl:363
static int getComputeCapability(void)
Definition: CudaModule.cpp:508
void getBBox(Vec3f &lo, Vec3f &hi) const
Gets scene AABB's minimum and maximum vector.
Definition: Scene.hpp:163
S32 getS32(void)
Definition: Random.hpp:54
Vec3f origin
Definition: Util.hpp:67
FW_CUDA_FUNC T sum(const VectorBase< T, L, S > &v)
Definition: Math.hpp:463
float F32
Definition: Defs.hpp:89
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei GLint GLenum GLenum const GLvoid pixels GLint GLsizei const GLfloat value GLint GLfloat GLfloat v1 GLint GLfloat GLfloat GLfloat v2 GLint GLsizei const GLfloat value GLint GLsizei GLboolean const GLfloat value GLuint program GLuint GLfloat GLfloat y
Definition: DLLImports.inl:363
static Environment * GetSingleton()
CUdeviceptr getMutableCudaPtr(S64 ofs=0)
Definition: Buffer.hpp:112
F32 log2(F32 a)
Definition: Math.hpp:90
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v
Definition: DLLImports.inl:329
F32 traceOnDemandKdtree(RayBuffer &rays, bool rebuild, int numRays=0)
Buffer & getResultBuffer()
Gets ray result buffer.
Definition: RayBuffer.hpp:173
F32 traceOnDemandBVH(RayBuffer &rays, bool rebuild, int numRays=0)
Ray buffer class. Stores rays.
Definition: RayBuffer.hpp:38
U8 * getMutablePtr(S64 ofs=0)
Definition: Buffer.hpp:110
#define TASK_SIZE
FW_CUDA_FUNC T min(const VectorBase< T, L, S > &v)
Definition: Math.hpp:461
Vec3f direction
Definition: Util.hpp:69
FW_CUDA_FUNC T max(const VectorBase< T, L, S > &v)
Definition: Math.hpp:462
int getNumTriangles(void) const
Definition: Scene.hpp:61
float tmax
Definition: Util.hpp:70
signed int S32
Definition: Defs.hpp:88
void getSizes(F32 &task, F32 &split, F32 &ads, F32 &tri, F32 &triIdx, F32 &heap)
F32 end(void)
Definition: Timer.hpp:69
String sprintf(const char *fmt,...)
Definition: Defs.cpp:241
bool GetFloatValue(const char *name, float &value, const bool isFatal=false) const
S32 numNodeTests
Total number of ray-node tests.
Definition: BVH.hpp:64
signed __int64 S64
Definition: Defs.hpp:98
unsigned int U32
Definition: Defs.hpp:85
Class holding information about a split of a BVH node.
Definition: BVHNode.hpp:58
Class holding 3d scene.
Definition: Scene.hpp:44
FW_CUDA_FUNC void normalize(T len=(T) 1)
Definition: Math.hpp:145
Buffer & getGlobal(const String &name)
Definition: CudaModule.cpp:117
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction f
Definition: DLLImports.inl:88
FW_CUDA_FUNC F32 cross(const Vec2f &a, const Vec2f &b)
Definition: Math.hpp:481
void clearRange(S64 dstOfs, int value, S64 size, bool async=false, CUstream cudaStream=NULL)
Definition: Buffer.cpp:202
F32 traceBatchKdtree(RayBuffer &rays, RayStats *stats=NULL)
CudaPersistentBVHTracer(Scene &scene, F32 epsilon)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei n
Definition: DLLImports.inl:325
void printf(const char *fmt,...)
Definition: Defs.cpp:225
void traceOnDemandTrace(RayBuffer &rays, F32 &GPUmegakernel, F32 &CPUmegakernel, F32 &GPUtravKernel, F32 &CPUtravKernel, int &buildNodes, RayStats *stats=NULL)
String & appendf(const char *fmt,...)
Definition: String.cpp:207
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei width
Definition: DLLImports.inl:347
FW_CUDA_FUNC F64 log(F64 a)
Definition: Math.hpp:47
void failIfError(void)
Definition: Defs.cpp:361
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei GLsizei height
Definition: DLLImports.inl:347
FW_CUDA_FUNC S normalized(T len=(T) 1) const
Definition: Math.hpp:144
U32 getU32(void)
Definition: Random.hpp:51
int setParamPtr(CUfunction kernel, int offset, CUdeviceptr value)
Definition: CudaModule.cpp:108
const float MB
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei depth
Definition: DLLImports.inl:349
bool GetIntValue(const char *name, int &value, const bool isFatal=false) const
F32 launchTimed(bool yield=true)
Definition: CudaKernel.cpp:188
static void staticInit(void)
Definition: CudaModule.cpp:311
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type
Definition: DLLImports.inl:323
void fail(const char *fmt,...)
Definition: Defs.cpp:304
void addOptions(const String &options)
void resizeDiscard(S64 size)
Definition: Buffer.hpp:83
void clearDefines(void)
void setSourceFile(const String &path)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr size
Definition: DLLImports.inl:319
void resize(S64 size)
Definition: Buffer.hpp:82
void setCachePath(const String &path)
void reset(U32 hints, int align)
Definition: Buffer.hpp:76