NTrace
GPU ray tracing framework
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
CudaPersistentTracer.cpp
Go to the documentation of this file.
1 #include <sstream>
2 #include <iomanip>
3 #include "base/Random.hpp"
4 #include "CudaNoStructTracer.hpp"
6 #include "../../../../AppEnvironment.h"
7 
8 #define TASK_SIZE 430000
9 
10 using namespace FW;
11 
12 #define BENCHMARK
13 #define TEST_TASKS
14 //#define CUTOFF_DEPTH // Reports only data at specific level - for debugging purpouses
15 //#define TRACE_L1
16 
17 #ifndef TEST_TASKS
18 #include "kernels/thrustTest.hpp"
19 #endif
20 
21 CudaNoStructTracer::CudaNoStructTracer(MiniMax::Scene& scene, F32 epsilon):
22 m_epsilon(epsilon)
23 {
24  // init
26  //m_compiler.addOptions("-use_fast_math");
27  m_compiler.addOptions("-use_fast_math -Xptxas -dlcm=cg");
28  m_compiler.clearDefines();
30  m_compiler.define("FERMI");
31 
32  // convert from scene
33  Vec3f light = Vec3f(1.0f, 2.0f, 3.0f).normalized();
34 
35  m_numTris = scene.triangles.size();
36  m_numVerts = m_numTris * 3;
37  m_numMaterials = 1;
38  m_numShadingNormals = m_numVerts;
39  m_numTextureCoords = m_numVerts;
40  m_bbox = scene.box;
41 
42  m_tris.resizeDiscard(m_numTris * sizeof(SceneTriangle));
43  m_triNormals.resizeDiscard(m_numTris * sizeof(Vec3f));
44  //m_verts.resizeDiscard(m_numVerts * sizeof(Vec3f));
45  m_materials.resizeDiscard(m_numMaterials * sizeof(Material));
46  m_shadingNormals.resizeDiscard(m_numVerts * sizeof(Vec3f));
47  m_shadedColor.resizeDiscard(m_numTris * sizeof(Vec3f));
48  m_materialColor.resizeDiscard(m_numTris * sizeof(Vec3f));
49  //m_textureCoords.resizeDiscard(m_numTextureCoords * sizeof(Vec2f));
50  //m_trisBox.resizeDiscard(m_numTris * sizeof(CudaAABB));
51 
52  m_trisCompact.resizeDiscard(m_numTris * 3 * sizeof(Vec4f));
53  m_trisIndex.resizeDiscard(m_numTris * sizeof(S32));
54 
55  SceneTriangle* tout = (SceneTriangle*)m_tris.getMutablePtr();
56  Vec3f* nout = (Vec3f*)m_triNormals.getMutablePtr();
57  //Vec3f* vout = (Vec3f*)m_verts.getMutablePtr();
58  Material* mout = (Material*)m_materials.getMutablePtr();
59  Vec3f* snout = (Vec3f*)m_shadingNormals.getMutablePtr();
60  U32* scout = (U32*)m_shadedColor.getMutablePtr();
61  U32* mcout = (U32*)m_materialColor.getMutablePtr();
62  //Vec2f* uvout = (Vec2f*)m_textureCoords.getMutablePtr();
63  //CudaAABB* bout = (CudaAABB*)m_trisBox.getMutablePtr();
64 
65  Vec4f* tcout = (Vec4f*)m_trisCompact.getMutablePtr();
66  S32* tiout = (S32*)m_trisIndex.getMutablePtr();
67 
68  // load vertices
69  for (int i = 0; i < m_numTris; i++)
70  {
71  Triangle& tris = *scene.triangles[i];
72  for(int j = 0; j < 3; j++)
73  {
74  //vout[i*3+j] = Vec3f(tris.vertices[j].x,tris.vertices[j].y,tris.vertices[j].z);
75  snout[i*3+j] = Vec3f(tris.normals[j].x,tris.normals[j].y,tris.normals[j].z);
76  //uvout[i*3+j] = Vec2f(tris.uvs[j].xx,tris.uvs[j].yy);
77 
78  *tcout = Vec4f(tris.vertices[j].x,tris.vertices[j].y,tris.vertices[j].z,0);
79  tcout++;
80  }
81 
82  /*Vec3f minV = min(vout[i*3+0], vout[i*3+1], vout[i*3+2]);
83  bout[i].m_mn = make_float3(minV.x, minV.y, minV.z);
84  Vec3f maxV = max(vout[i*3+0], vout[i*3+1], vout[i*3+2]);
85  bout[i].m_mx = make_float3(maxV.x, maxV.y, maxV.z);*/
86  }
87 
88  // default material
89  Material m;
90  m.diffuse = Vec3f(1.0f,1.0f,1.0f);
91  m.specular = Vec3f(0.0f,0.0f,0.0f);
92  m.type = MeshBase::Material::MaterialType_Phong;
93  m.texID = -1; // no texture
94  m.gloss_alpha = Vec2f(0.0f, 0.f);
95  mout[0] = m;
96 
97  unsigned int matid = 1;
98 
99  // load triangles
100  Vec4f defaultColor(1.0f,1.0f,1.0f,1.0f);
101  for(int i=0,j=0;i<m_numTris;i++,j+=3)
102  {
103  // triangle data
104  tout->vertices = Vec3i(j,j+1,j+2);
105  Triangle& tris = *scene.triangles[i];
106  Vector3 normalVec = tris.GetNormal();
107  tout->normal = Vec3f(normalVec.x,normalVec.y,normalVec.z);
108  *nout = tout->normal;
109 
110  // material
111  Material* mat;
112  mat = &mout[0];
113  matid = 0;
114 
115  Vec4f diffuseColor(mat->diffuse,1.0f);
116  tout->materialColor = diffuseColor.toABGR();
117  tout->shadedColor = Vec4f( diffuseColor.getXYZ() * (dot(tout->normal, light) * 0.5f + 0.5f), 1.0f).toABGR();
118  tout->materialId = matid;
119 
120  *scout = tout->shadedColor;
121  *mcout = tout->materialColor;
122  scout++;
123  mcout++;
124 
125  tout++;
126  nout++;
127  }
128 
129  m_sizeTask = 0.f;
130  m_sizeSplit = 0.f;
131  m_sizeADS = 0.f;
132  m_sizeTri = 0.f;
133  m_sizeTriIdx = 0.f;
134  m_heap = 0.f;
135 }
136 
138 {
139  m_kernelFile = "src/rt/kernels/persistent_nostruct.cu";
140  m_compiler.setSourceFile(m_kernelFile);
141  m_module = m_compiler.compile();
142  failIfError();
143 
144  m_numRays = rays.getSize();
145 
146 #ifdef TEST_TASKS
147 #ifdef DEBUG_PPS
148  Random rand;
149  m_numRays = rand.getU32(1, 1000000);
150  m_numTris = rand.getU32(1, 1000000);
151 #endif
152 #endif
153 
154  // Set triangle index buffer
155  S32* tiout = (S32*)m_trisIndex.getMutablePtr();
156 #ifdef DEBUG_PPS
157  S32* ptout = (S32*)m_ppsTris.getMutablePtr();
158  S32* cltout = (S32*)m_ppsTrisIndex.getMutablePtr();
159  S32* stout = (S32*)m_sortTris.getMutablePtr();
160 #endif
161  for(int i=0;i<m_numTris;i++)
162  {
163 #ifndef DEBUG_PPS
164  // indices
165  *tiout = i;
166  tiout++;
167 #else
168  int rnd = rand.getS32(-1, 2);
169  //*ptout = rnd;
170  *cltout = rnd;
171  *stout = (rnd >= 1);
172  //ptout++;
173  cltout++;
174  stout++;
175 #endif
176  }
177 
178  m_raysIndex.resizeDiscard(sizeof(int)*m_numRays);
179  rays.getResultBuffer().clear(-1); // Set as no hit
180  int *rayIdx = (int*)m_raysIndex.getMutablePtr();
181 #ifdef DEBUG_PPS
182  S32* prout = (S32*)m_ppsRays.getMutablePtr();
183  S32* clrout = (S32*)m_ppsRaysIndex.getMutablePtr();
184  S32* srout = (S32*)m_sortRays.getMutablePtr();
185 #endif
186  for(int i = 0; i < m_numRays; i++)
187  {
188 #ifndef DEBUG_PPS
189  // Set rays index buffer
190  *rayIdx = i;
191  rayIdx++;
192 
193 #ifdef TEST_TASKS
194 #if 0
195  // CPU Clipping
196  MiniMax::Ray mRay;
197  memcpy(&mRay.origin, &ray.origin, sizeof(Vec3f));
198  memcpy(&mRay.direction, &ray.direction, sizeof(Vec3f));
199  // Clip the rays to the extent of scene box
200  bool intersects = m_bbox.ComputeMinMaxT(mRay,
201  ray.tmin, ray.tmax);
202 
203  // clip the origin of the rays
204  if (ray.tmin < 1e-3f)
205  ray.tmin = 1e-3f;
206 #endif
207 #endif
208 #else
209  int rnd = rand.getS32(-1, 3);
210  //*prout = rnd;
211  *clrout = rnd;
212  *srout = (rnd >= 1);
213  //prout++;
214  clrout++;
215  srout++;
216 #endif
217  }
218 
219  // Start the timer
220  m_timer.unstart();
221  m_timer.start();
222 
223  // Create the taskData
224 #ifdef TEST_TASKS
225  m_taskData.resizeDiscard(TASK_SIZE * (sizeof(Task) + sizeof(int)));
226  m_taskData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
227 
228 #if SPLIT_TYPE == 3
229  m_splitData.resizeDiscard((S64)TASK_SIZE * (S64)sizeof(SplitInfo));
230  m_splitData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
231 #endif
232 #endif
233 
234  m_gpuTime = traceCudaRayBuffer(rays);
235  m_cpuTime = m_timer.end();
236 
237  m_raysIndex.reset();
238 #ifdef DEBUG_PPS
239  exit(0);
240 #endif
241 
242  return m_gpuTime;
243 }
244 
246 {
247 #ifdef MALLOC_SCRATCHPAD
248  // Set the memory limit according to triangle count
249 #ifndef BENCHMARK
250  printf("Setting dynamic memory limit to %fMB\n", (float)(m_trisIndex.getSize()*5*3)/(float)(1024*1024));
251 #endif
252  cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, m_trisIndex.getSize()*5*3);
253 #endif
254 
255  // Compile the kernel
256  if(!sbvh)
257  m_kernelFile = "src/rt/kernels/persistent_bvh.cu";
258  else
259  m_kernelFile = "src/rt/kernels/persistent_sbvh.cu";
260 
261  m_compiler.setSourceFile(m_kernelFile);
262  m_module = m_compiler.compile();
263  failIfError();
264 
265 #ifdef DEBUG_PPS
266  Random rand;
267  m_numTris = rand.getU32(1, 1000000);
268 #endif
269 
270  // Set triangle index buffer
271  S32* tiout = (S32*)m_trisIndex.getMutablePtr();
272 #ifdef DEBUG_PPS
273  S32* pout = (S32*)m_ppsTris.getMutablePtr();
274  S32* clout = (S32*)m_ppsTrisIndex.getMutablePtr();
275  S32* sout = (S32*)m_sortTris.getMutablePtr();
276 #endif
277  for(int i=0;i<m_numTris;i++)
278  {
279 #ifndef DEBUG_PPS
280  // indices
281  *tiout = i;
282  tiout++;
283 #else
284  int rnd = rand.getU32(0, 2);
285  //*pout = rnd;
286  *clout = rnd;
287  *sout = (rnd >= 1);
288  //pout++;
289  clout++;
290  sout++;
291 #endif
292  }
293 
294  // Start the timer
295  m_timer.unstart();
296  m_timer.start();
297 
298  // Create the taskData
299  m_taskData.resizeDiscard(TASK_SIZE * (sizeof(TaskBVH) + sizeof(int)));
300  m_taskData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
301  //S64 bvhSize = ((m_numTris * sizeof(CudaBVHNode)) + 4096 - 1) & -4096;
302  S64 bvhSize = ((m_numTris/2 * sizeof(CudaBVHNode)) + 4096 - 1) & -4096;
303  m_bvhData.resizeDiscard(bvhSize);
304  m_bvhData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
305  //m_bvhData.clearRange32(0, 0, bvhSize); // Mark all tasks as 0 (important for debug)
306 #ifdef COMPACT_LAYOUT
307  if(!sbvh)
308  {
309  m_trisCompactOut.resizeDiscard(m_numTris * (3+1) * sizeof(Vec4f));
310  m_trisIndexOut.resizeDiscard(m_numTris * (3+1) * sizeof(S32));
311  }
312  else
313  {
314  m_trisCompactOut.resizeDiscard(m_numTris*2 * (3+1) * sizeof(Vec4f));
315  m_trisIndexOut.resizeDiscard(m_numTris*2 * (3+1) * sizeof(S32));
316  }
317 #endif
318 
319 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
320  m_splitData.resizeDiscard((S64)(TASK_SIZE+1) * (S64)sizeof(SplitArray));
321  m_splitData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
322 #endif
323 
324  m_gpuTime = buildCudaBVH();
325  m_cpuTime = m_timer.end();
326 
327  // Resize to exact memory
328  trimBVHBuffers();
329 
330 #ifdef DEBUG_PPS
331  exit(0);
332 #endif
333 
334  return m_gpuTime;
335 }
336 
338 {
339 #ifdef TRACE_L1
340  // Set compiler options
341  m_compiler.clearOptions();
342 #endif
343 
344  m_compiler.setCachePath("cudacache"); // On the first compilation the cache path becames absolute which kills the second compilation
345 #ifdef COMPACT_LAYOUT
346 #ifdef WOOP_TRIANGLES
347  String kernelName("src/rt/kernels/fermi_speculative_while_while");
348 #else
349  String kernelName("src/rt/kernels/fermi_speculative_while_while_inter");
350 #endif
351 #ifdef TRACE_L1
352  m_compiler.addOptions("-use_fast_math");
353 #endif
354 #else
355  String kernelName("src/rt/kernels/fermi_persistent_speculative_while_while_inter");
356 #ifdef TRACE_L1
357  m_compiler.addOptions("-use_fast_math -maxrregcount 40");
358 #endif
359 #endif
360  if(stats != NULL)
361  {
362  kernelName += "_statistics";
363  }
364  kernelName += ".cu";
365  m_compiler.setSourceFile(kernelName);
366 
367  m_module = m_compiler.compile();
368  failIfError();
369 
370  CUfunction queryKernel = m_module->getKernel("queryConfig");
371  if (!queryKernel)
372  fail("Config query kernel not found!");
373 
374  // Initialize config with default values.
375  KernelConfig& kernelConfig = *(KernelConfig*)m_module->getGlobal("g_config").getMutablePtr();
376  kernelConfig.bvhLayout = BVHLayout_Max;
377  kernelConfig.blockWidth = 0;
378  kernelConfig.blockHeight = 0;
379  kernelConfig.usePersistentThreads = 0;
380 
381  // Query config.
382 
383  m_module->launchKernel(queryKernel, 1, 1);
384  kernelConfig = *(const KernelConfig*)m_module->getGlobal("g_config").getPtr();
385 
386  CUfunction kernel;
387  if(stats != NULL)
388  kernel = m_module->getKernel("trace_stats");
389  else
390  kernel = m_module->getKernel("trace");
391  if (!kernel)
392  fail("Trace kernel not found!");
393 
394  KernelInput& in = *(KernelInput*)m_module->getGlobal("c_in").getMutablePtr();
395  // Start the timer
396  m_timer.unstart();
397  m_timer.start();
398 
399  CUdeviceptr nodePtr = m_bvhData.getCudaPtr();
400  Vec2i nodeOfsA = Vec2i(0, (S32)m_bvhData.getSize());
401 
402 #ifdef COMPACT_LAYOUT
403  CUdeviceptr triPtr = m_trisCompactOut.getCudaPtr();
404  Vec2i triOfsA = Vec2i(0, (S32)m_trisCompactOut.getSize());
405  Buffer& indexBuf = m_trisIndexOut;
406 #else
407  CUdeviceptr triPtr = m_trisCompact.getCudaPtr();
408  Vec2i triOfsA = Vec2i(0, (S32)m_trisCompact.getSize());
409  Buffer& indexBuf = m_trisIndex;
410 #endif
411 
412  // Set input.
413  // The new version has it via parameters, not const memory
414  in.numRays = rays.getSize();
415  in.anyHit = (rays.getNeedClosestHit() == false);
416  in.nodesA = nodePtr + nodeOfsA.x;
417  in.trisA = triPtr + triOfsA.x;
418  in.rays = rays.getRayBuffer().getCudaPtr();
419  in.results = rays.getResultBuffer().getMutableCudaPtr();
420  in.triIndices = indexBuf.getCudaPtr();
421 
422  // Set texture references.
423  m_module->setTexRef("t_rays", rays.getRayBuffer(), CU_AD_FORMAT_FLOAT, 4);
424  m_module->setTexRef("t_nodesA", nodePtr + nodeOfsA.x, nodeOfsA.y, CU_AD_FORMAT_FLOAT, 4);
425  m_module->setTexRef("t_trisA", triPtr + triOfsA.x, triOfsA.y, CU_AD_FORMAT_FLOAT, 4);
426  m_module->setTexRef("t_triIndices", indexBuf, CU_AD_FORMAT_SIGNED_INT32, 1);
427 
428  // Determine block and grid sizes.
429  int desiredWarps = (rays.getSize() + 31) / 32;
430  if (kernelConfig.usePersistentThreads != 0)
431  {
432  *(S32*)m_module->getGlobal("g_warpCounter").getMutablePtr() = 0;
433  desiredWarps = 720; // Tesla: 30 SMs * 24 warps, Fermi: 15 SMs * 48 warps
434  }
435 
436  Vec2i blockSize(kernelConfig.blockWidth, kernelConfig.blockHeight);
437  int blockWarps = (blockSize.x * blockSize.y + 31) / 32;
438  Vec2i gridSize((desiredWarps + blockWarps - 1) / blockWarps, 1);
439 
440  if(stats != NULL)
441  {
442  m_module->getGlobal("g_NumNodes").clear();
443  m_module->getGlobal("g_NumLeaves").clear();
444  m_module->getGlobal("g_NumEmptyLeaves").clear();
445  m_module->getGlobal("g_NumTris").clear();
446  m_module->getGlobal("g_NumFailedTris").clear();
447  m_module->getGlobal("g_NumHitTrisOutside").clear();
448  }
449 
450  // Launch.
451  F32 launchTime = m_module->launchKernelTimed(kernel, blockSize, gridSize);
452 
453  if(stats != NULL)
454  {
455  stats->numNodeTests += *(U32*)m_module->getGlobal("g_NumNodes").getPtr();
456  stats->numLeavesVisited += *(U32*)m_module->getGlobal("g_NumLeaves").getPtr();
457  stats->numEmptyLeavesVisited += *(U32*)m_module->getGlobal("g_NumEmptyLeaves").getPtr();
458  stats->numTriangleTests += *(U32*)m_module->getGlobal("g_NumTris").getPtr();
459  stats->numFailedTriangleTests += *(U32*)m_module->getGlobal("g_NumFailedTris").getPtr();
460  stats->numSuccessTriangleTestsOutside += *(U32*)m_module->getGlobal("g_NumHitTrisOutside").getPtr();
461  stats->numRays += rays.getSize();
462  }
463 
464  m_gpuTime = launchTime;
465  m_cpuTime = m_timer.end();
466 
467 #ifdef TRACE_L1
468  // reset options
469  m_compiler.clearOptions();
470  m_compiler.addOptions("-use_fast_math -Xptxas -dlcm=cg");
471 #endif
472 
473  return launchTime;
474 }
475 
477 {
478  // Compile the kernel
479  m_kernelFile = "src/rt/kernels/persistent_kdtree.cu";
480  m_compiler.setSourceFile(m_kernelFile);
481  m_module = m_compiler.compile();
482  failIfError();
483 
484  prepareDynamicMemory();
485 
486 #ifdef DEBUG_PPS
487  Random rand;
488  m_numTris = rand.getU32(1, 1000000);
489 #endif
490 
491  // Set triangle index buffer
492  S32* tiout = (S32*)m_trisIndex.getMutablePtr();
493 #ifdef DEBUG_PPS
494  S32* pout = (S32*)m_ppsTris.getMutablePtr();
495  S32* clout = (S32*)m_ppsTrisIndex.getMutablePtr();
496  S32* sout = (S32*)m_sortTris.getMutablePtr();
497 #endif
498  for(int i=0;i<m_numTris;i++)
499  {
500 #ifndef DEBUG_PPS
501  // indices
502  *tiout = i;
503  tiout++;
504 #else
505  int rnd = rand.getU32(0, 2);
506  //*pout = rnd;
507  *clout = rnd;
508  *sout = (rnd >= 1);
509  //pout++;
510  clout++;
511  sout++;
512 #endif
513  }
514 
515  // Start the timer
516  m_timer.unstart();
517  m_timer.start();
518 
519  // Create the taskData
520  m_taskData.resizeDiscard(TASK_SIZE * (sizeof(TaskBVH) + sizeof(int)));
521  m_taskData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
522 //#if SPLIT_TYPE == 3
523  m_splitData.resizeDiscard((S64)TASK_SIZE * (S64)sizeof(SplitInfoTri));
524 //#elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
525 // m_splitData.resizeDiscard((S64)(TASK_SIZE+1) * (S64)sizeof(SplitArray));
526 //#endif
527  m_splitData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
528 
529  // Node and triangle data
530 #ifndef INTERLEAVED_LAYOUT
531  //S64 kdtreeSize = ((m_numTris*5 * sizeof(CudaKdtreeNode)) + 4096 - 1) & -4096;
532  //S64 kdtreeSize = ((m_numTris*10 * sizeof(CudaKdtreeNode)) + 4096 - 1) & -4096;
533  //S64 kdtreeSize = ((m_numTris*3 * sizeof(CudaKdtreeNode)) + 4096 - 1) & -4096;
534  S64 kdtreeSize = ((m_numTris*20 * sizeof(CudaKdtreeNode)) + 4096 - 1) & -4096;
535  m_bvhData.resizeDiscard(kdtreeSize);
536  m_bvhData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
537  //m_bvhData.clearRange32(0, 0, kdtreeSize); // Mark all tasks as 0 (important for debug)
538 #ifndef COMPACT_LAYOUT
539  m_trisCompactOut.resizeDiscard(m_numTris*10 * 3 * sizeof(Vec4f));
540  m_trisIndexOut.resizeDiscard(m_numTris*10 * 3 * sizeof(S32));
541 #else
542 #ifdef DUPLICATE_REFERENCES
543  m_trisCompactOut.resizeDiscard(m_numTris*8 * (3+1) * sizeof(Vec4f));
544  m_trisIndexOut.resizeDiscard(m_numTris*8 * (3+1) * sizeof(S32));
545 #else
546  m_trisCompactOut.resizeDiscard(m_numTris * 3 * sizeof(Vec4f));
547  //m_trisIndexOut.resizeDiscard(m_numTris*12 * (1+1) * sizeof(S32));
548  m_trisIndexOut.resizeDiscard(m_numTris*20 * (1+1) * sizeof(S32));
549  //m_trisIndexOut.resizeDiscard(m_numTris*6 * (1+1) * sizeof(S32));
550 #endif
551 #endif
552 #else
553  // TODO: Rewrite this when double headed heap is realized
554  S64 kdtreeSize = ((m_numTris*5 * sizeof(CudaKdtreeNode) + m_numTris*10 * 3 * (sizeof(Vec4f)+sizeof(S32))) + 4096 - 1) & -4096;
555  //size_t f, t; cuMemGetInfo(&f, &t);
556  //S64 kdtreeSize = f & -4096;
557  m_bvhData.resizeDiscard(kdtreeSize);
558  m_bvhData.clearRange32(0, 0, kdtreeSize); // Mark all tasks as 0 (important for debug)
559 #endif
560 
561  m_gpuTime = buildCudaKdtree();
562  m_cpuTime = m_timer.end();
563 
564  // Resize to exact memory
566 
567 #ifdef DEBUG_PPS
568  exit(0);
569 #endif
570 
571  return m_gpuTime;
572 }
573 
575 {
576 #ifdef TRACE_L1
577  // Set compiler options
578  m_compiler.clearOptions();
579 #endif
580 
581  m_compiler.setCachePath("cudacache"); // On the first compilation the cache path becames absolute which kills the second compilation
582 #ifdef COMPACT_LAYOUT
583 #ifdef WOOP_TRIANGLES
584 #ifdef DUPLICATE_REFERENCES
585  String kernelName("src/rt/kernels/fermi_kdtree_while_while_childPtr");
586  //String kernelName("src/rt/kernels/fermi_kdtree_while_while_shortStack");
587 #else
588  String kernelName("src/rt/kernels/fermi_kdtree_while_while_leafRef");
589 #endif
590 #else
591 #error Undefined kernel
592 #endif
593 #else
594  String kernelName("src/rt/kernels/fermi_kdtree_while_while");
595 #endif
596 #ifdef TRACE_L1
597  m_compiler.addOptions("-use_fast_math");
598  //m_compiler.addOptions("-use_fast_math -maxrregcount 32");
599 #endif
600 
601  if(stats != NULL)
602  {
603  kernelName += "_statistics";
604  }
605  kernelName += ".cu";
606  m_compiler.setSourceFile(kernelName);
607 
608  m_module = m_compiler.compile();
609  failIfError();
610 
611  CUfunction queryKernel = m_module->getKernel("queryConfig");
612  if (!queryKernel)
613  fail("Config query kernel not found!");
614 
615  // Initialize config with default values.
616  KernelConfig& kernelConfig = *(KernelConfig*)m_module->getGlobal("g_config").getMutablePtr();
617  kernelConfig.bvhLayout = BVHLayout_Max;
618  kernelConfig.blockWidth = 0;
619  kernelConfig.blockHeight = 0;
620  kernelConfig.usePersistentThreads = 0;
621 
622  // Query config.
623 
624  m_module->launchKernel(queryKernel, 1, 1);
625  kernelConfig = *(const KernelConfig*)m_module->getGlobal("g_config").getPtr();
626 
627  CUfunction kernel;
628  if(stats != NULL)
629  kernel = m_module->getKernel("trace_stats");
630  else
631  kernel = m_module->getKernel("trace");
632  if (!kernel)
633  fail("Trace kernel not found!");
634 
635  KernelInput& in = *(KernelInput*)m_module->getGlobal("c_in").getMutablePtr();
636  // Start the timer
637  m_timer.unstart();
638  m_timer.start();
639 
640  CUdeviceptr nodePtr = m_bvhData.getCudaPtr();
641  Vec2i nodeOfsA = Vec2i(0, (S32)m_bvhData.getSize());
642 
643 #ifndef INTERLEAVED_LAYOUT
644  CUdeviceptr triPtr = m_trisCompactOut.getCudaPtr();
645  Vec2i triOfsA = Vec2i(0, (S32)m_trisCompactOut.getSize());
646  Buffer& indexBuf = m_trisIndexOut;
647 #else
648  CUdeviceptr triPtr = m_bvhData.getCudaPtr();
649  Vec2i triOfsA = Vec2i(0, (S32)m_bvhData.getSize());
650  Buffer& indexBuf = m_bvhData;
651 #endif
652 
653  // Set input.
654  // The new version has it via parameters, not const memory
655  in.numRays = rays.getSize();
656  in.anyHit = (rays.getNeedClosestHit() == false);
657  memcpy(&in.bmin, &m_bbox.min, sizeof(float3));
658  memcpy(&in.bmax, &m_bbox.max, sizeof(float3));
659  in.nodesA = nodePtr + nodeOfsA.x;
660  in.trisA = triPtr + triOfsA.x;
661  in.rays = rays.getRayBuffer().getCudaPtr();
662  in.results = rays.getResultBuffer().getMutableCudaPtr();
663  in.triIndices = indexBuf.getCudaPtr();
664 
665  // Set texture references.
666  m_module->setTexRef("t_rays", rays.getRayBuffer(), CU_AD_FORMAT_FLOAT, 4);
667  m_module->setTexRef("t_nodesI", nodePtr + nodeOfsA.x, nodeOfsA.y, CU_AD_FORMAT_FLOAT, 4);
668  m_module->setTexRef("t_trisA", triPtr + triOfsA.x, triOfsA.y, CU_AD_FORMAT_FLOAT, 4);
669  m_module->setTexRef("t_triIndices", indexBuf, CU_AD_FORMAT_SIGNED_INT32, 1);
670 
671  // Determine block and grid sizes.
672  int desiredWarps = (rays.getSize() + 31) / 32;
673  if (kernelConfig.usePersistentThreads != 0)
674  {
675  *(S32*)m_module->getGlobal("g_warpCounter").getMutablePtr() = 0;
676  desiredWarps = 720; // Tesla: 30 SMs * 24 warps, Fermi: 15 SMs * 48 warps
677  }
678 
679  Vec2i blockSize(kernelConfig.blockWidth, kernelConfig.blockHeight);
680  int blockWarps = (blockSize.x * blockSize.y + 31) / 32;
681  Vec2i gridSize((desiredWarps + blockWarps - 1) / blockWarps, 1);
682 
683  if(stats != NULL)
684  {
685  m_module->getGlobal("g_NumNodes").clear();
686  m_module->getGlobal("g_NumLeaves").clear();
687  m_module->getGlobal("g_NumEmptyLeaves").clear();
688  m_module->getGlobal("g_NumTris").clear();
689  m_module->getGlobal("g_NumFailedTris").clear();
690  m_module->getGlobal("g_NumHitTrisOutside").clear();
691  }
692 
693  // Launch.
694  F32 launchTime = m_module->launchKernelTimed(kernel, blockSize, gridSize);
695 
696  if(stats != NULL)
697  {
698  stats->numNodeTests += *(U32*)m_module->getGlobal("g_NumNodes").getPtr();
699  stats->numLeavesVisited += *(U32*)m_module->getGlobal("g_NumLeaves").getPtr();
700  stats->numEmptyLeavesVisited += *(U32*)m_module->getGlobal("g_NumEmptyLeaves").getPtr();
701  stats->numTriangleTests += *(U32*)m_module->getGlobal("g_NumTris").getPtr();
702  stats->numFailedTriangleTests += *(U32*)m_module->getGlobal("g_NumFailedTris").getPtr();
703  stats->numSuccessTriangleTestsOutside += *(U32*)m_module->getGlobal("g_NumHitTrisOutside").getPtr();
704  stats->numRays += rays.getSize();
705  }
706 
707  m_gpuTime = launchTime;
708  m_cpuTime = m_timer.end();
709 
710 #ifdef TRACE_L1
711  // reset options
712  m_compiler.clearOptions();
713  m_compiler.addOptions("-use_fast_math -Xptxas -dlcm=cg");
714 #endif
715 
716  return launchTime;
717 }
718 
719 F32 CudaNoStructTracer::traceOnDemandBVH(RayBuffer& rays, bool rebuild, int numRays)
720 {
721  m_numRays = numRays;
722 
723  if(rebuild)
724  {
725  // Compile the kernel
726  m_kernelFile = "src/rt/kernels/persistent_ondemand.cu";
727  m_compiler.setSourceFile(m_kernelFile);
728  m_module = m_compiler.compile();
729  failIfError();
730 
731  // Set triangle index buffer
732  S32* tiout = (S32*)m_trisIndex.getMutablePtr();
733  for(int i=0;i<m_numTris;i++)
734  {
735  // indices
736  *tiout = i;
737  tiout++;
738  }
739  }
740 
741  // Start the timer
742  m_timer.unstart();
743  m_timer.start();
744 
745  if(rebuild)
746  {
747  // Create the taskData
748  m_taskData.resizeDiscard(TASK_SIZE * (sizeof(TaskBVH) + sizeof(int)));
749  m_taskData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
750  S64 bvhSize = ((m_numTris * sizeof(CudaBVHNode)) + 4096 - 1) & -4096;
751  //S64 bvhSize = ((m_numTris/2 * sizeof(CudaBVHNode)) + 4096 - 1) & -4096;
752  m_bvhData.resizeDiscard(bvhSize);
753  m_bvhData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
754 
755 #ifdef COMPACT_LAYOUT
756  m_trisCompactOut.resizeDiscard(m_numTris * (3+1) * sizeof(Vec4f));
757  m_trisIndexOut.resizeDiscard(m_numTris * (3+1) * sizeof(S32));
758 #endif
759 
760 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
761  m_splitData.resizeDiscard((S64)(TASK_SIZE+1) * (S64)sizeof(SplitArray));
762  m_splitData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
763 #endif
764  }
765 
766  // Build + trace
767  m_gpuTime = traceOnDemandBVHRayBuffer(rays, rebuild);
768  m_cpuTime = m_timer.end();
769 
770  // Save sizes of buffer so that they can be printed
771  if(rebuild)
772  saveBufferSizes();
773 
774  return m_gpuTime;
775 }
776 
777 F32 CudaNoStructTracer::traceOnDemandKdtree(RayBuffer& rays, bool rebuild, int numRays)
778 {
779  m_numRays = numRays;
780 
781  if(rebuild)
782  {
783  // Compile the kernel
784  m_kernelFile = "src/rt/kernels/persistent_ondemand_kdtree.cu";
785  m_compiler.setSourceFile(m_kernelFile);
786  m_module = m_compiler.compile();
787  failIfError();
788 
789  prepareDynamicMemory();
790 
791  // Set triangle index buffer
792  S32* tiout = (S32*)m_trisIndex.getMutablePtr();
793  for(int i=0;i<m_numTris;i++)
794  {
795  // indices
796  *tiout = i;
797  tiout++;
798  }
799  }
800 
801  // Start the timer
802  m_timer.unstart();
803  m_timer.start();
804 
805  if(rebuild)
806  {
807  // Create the taskData
808  m_taskData.resizeDiscard(TASK_SIZE * (sizeof(TaskBVH) + sizeof(int)));
809  m_taskData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
810 //#if SPLIT_TYPE == 3
811  m_splitData.resizeDiscard((S64)TASK_SIZE * (S64)sizeof(SplitInfoTri));
812 //#elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
813  // m_splitData.resizeDiscard((S64)(TASK_SIZE+1) * (S64)sizeof(SplitArray));
814 //#endif
815  m_splitData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
816 
817  // Node and triangle data
818 #ifndef INTERLEAVED_LAYOUT
819  S64 kdtreeSize = ((m_numTris * sizeof(CudaKdtreeNode)) + 4096 - 1) & -4096;
820  m_bvhData.resizeDiscard(kdtreeSize);
821  m_bvhData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
822  //m_bvhData.clearRange32(0, 0, kdtreeSize); // Mark all tasks as 0 (important for debug)
823 #ifndef COMPACT_LAYOUT
824  m_trisCompactOut.resizeDiscard(m_numTris*10 * 3 * sizeof(Vec4f));
825  m_trisIndexOut.resizeDiscard(m_numTris*10 * 3 * sizeof(S32));
826 #else
827 #ifdef DUPLICATE_REFERENCES
828  m_trisCompactOut.resizeDiscard(m_numTris*8 * (3+1) * sizeof(Vec4f));
829  m_trisIndexOut.resizeDiscard(m_numTris*8 * (3+1) * sizeof(S32));
830 #else
831  m_trisCompactOut.resizeDiscard(m_numTris * 3 * sizeof(Vec4f));
832  m_trisIndexOut.resizeDiscard(m_numTris*7 * (1+1) * sizeof(S32));
833 #endif
834 #endif
835 #else
836  // TODO: Rewrite this when double headed heap is realized
837  S64 kdtreeSize = ((m_numTris*5 * sizeof(CudaKdtreeNode) + m_numTris*10 * 3 * (sizeof(Vec4f)+sizeof(S32))) + 4096 - 1) & -4096;
838  //size_t f, t; cuMemGetInfo(&f, &t);
839  //S64 kdtreeSize = f & -4096;
840  m_bvhData.resizeDiscard(kdtreeSize);
841  m_bvhData.clearRange32(0, 0, kdtreeSize); // Mark all tasks as 0 (important for debug)
842 #endif
843  }
844 
845  // Build + trace
846  m_gpuTime = traceOnDemandKdtreeRayBuffer(rays, rebuild);
847  m_cpuTime = m_timer.end();
848 
849  // Save sizes of buffer so that they can be printed
850  if(rebuild)
851  saveBufferSizes();
852 
853  return m_gpuTime;
854 }
855 
856 void CudaNoStructTracer::traceOnDemandTrace(RayBuffer& rays, F32& GPUmegakernel, F32& CPUmegakernel, F32& GPUtravKernel, F32& CPUtravKernel, int& buildNodes, RayStats* stats)
857 {
858  m_compiler.setCachePath("cudacache"); // On the first compilation the cache path becames absolute which kills the second compilation
859  m_compiler.setSourceFile(m_kernelFile);
860  m_module = m_compiler.compile();
861  failIfError();
862 
863  CUfunction kernel;
864  kernel = m_module->getKernel("build");
865  if (!kernel)
866  fail("Build kernel not found!");
867 
868  F32 tTrace, tTraceCPU;
869 #ifndef ONDEMAND_FULL_BUILD
870 #if 0
871  // Needed for BVH since the data has been erased by module switch
872  // Set BVH input.
873  KernelInputBVH& inBVH = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
874  inBVH.numTris = m_numTris;
875  inBVH.tris = m_trisCompact.getCudaPtr();
876  inBVH.trisIndex = m_trisIndex.getMutableCudaPtr();
877  //inBVH.trisBox = m_trisBox.getCudaPtr();
878  inBVH.ppsTrisBuf = m_ppsTris.getMutableCudaPtr();
879  inBVH.ppsTrisIndex = m_ppsTrisIndex.getMutableCudaPtr();
880  inBVH.sortTris = m_sortTris.getMutableCudaPtr();
881 #ifdef COMPACT_LAYOUT
882  inBVH.trisOut = m_trisCompactOut.getMutableCudaPtr();
883  inBVH.trisIndexOut = m_trisIndexOut.getMutableCudaPtr();
884 #endif
885 
886  // Set traversal input
887  CUdeviceptr nodePtr = m_bvhData.getCudaPtr();
888  CUdeviceptr triPtr = m_trisCompact.getCudaPtr();
889  Buffer& indexBuf = m_trisIndex;
890  Vec2i nodeOfsA = Vec2i(0, (S32)m_bvhData.getSize());
891  Vec2i triOfsA = Vec2i(0, (S32)m_trisCompact.getSize());
892 
893  KernelInput& in = *(KernelInput*)m_module->getGlobal("c_in").getMutablePtr();
894  m_timer.start();
895  in.numRays = rays.getSize();
896  in.anyHit = (rays.getNeedClosestHit() == false);
897  in.nodesA = nodePtr + nodeOfsA.x;
898  in.trisA = triPtr + triOfsA.x;
899  in.rays = rays.getRayBuffer().getCudaPtr();
900  in.results = rays.getResultBuffer().getMutableCudaPtr();
901  in.triIndices = indexBuf.getCudaPtr();
902 
903  m_module->setTexRef("t_rays", rays.getRayBuffer(), CU_AD_FORMAT_FLOAT, 4);
904  m_module->setTexRef("t_nodesA", m_bvhData, CU_AD_FORMAT_FLOAT, 4);
905  m_module->setTexRef("t_trisA", m_trisCompact, CU_AD_FORMAT_FLOAT, 4);
906  m_module->setTexRef("t_triIndices", m_trisIndex, CU_AD_FORMAT_SIGNED_INT32, 1);
907 #endif
908 
909  int numWarpsPerBlock = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numWarpsPerBlock");
910  int numBlocksPerSM = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numBlockPerSM");
911  Vec2i blockSize(WARP_SIZE, numWarpsPerBlock); // threadIdx.x must equal the thread lane in warp
912  int gridSizeX = NUM_SM*numBlocksPerSM;
913  Vec2i gridSize(gridSizeX, 1); // Number of SMs * Number of blocks?
914 
915  // Run the kernel as long as the traversal order does not change
916  TaskStackBVH& tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getMutablePtr();
917  int oldNodes;
918  int i = 0;
919  do
920  {
921  oldNodes = tasks.numNodes;
922 
923  m_timer.unstart();
924  m_timer.start();
925  // Launch trace until traversal path convergence.
926 #if 0
927  // Needed for BVH since the task has been erased by module switch
928  tasks.header = (int*)m_taskData.getMutableCudaPtr();
929  tasks.tasks = (TaskBVH*)m_taskData.getMutableCudaPtr(TASK_SIZE * sizeof(int));
930  tasks.nodeTop = 2;
931 #endif
932 
933  tasks.warpCounter = rays.getSize();
934  tasks.unfinished = -NUM_WARPS;
935  tasks.launchFlag = 1;
936 
937  // Launch.
938  tTrace = m_module->launchKernelTimed(kernel, blockSize, gridSize);
939  tTraceCPU = m_timer.end();
940  TaskStackBVH& tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getMutablePtr();
941  buildNodes += tasks.numNodes - oldNodes;
942 
943  //printf("Verify run %d in %f (%d -> %d)\n", i, tTrace, oldNodes, tasks.numNodes);
944  i++;
945  } while(oldNodes != tasks.numNodes);
946 
947  // Launch just trace.
948  /*tasks.warpCounter = rays.getSize();
949  tasks.unfinished = -NUM_WARPS;
950  tasks.launchFlag = 2;
951  tTrace = m_module->launchKernelTimed(kernel, blockSize, gridSize);*/
952 
953  GPUmegakernel += tTrace; // Save the final traversal time inside the megakernel
954  CPUmegakernel += tTraceCPU;
955 #endif
956 
957  //cout << "Verify trace in " << tTrace << "s" << "\n";
958 
959  /*TaskStackBVH& tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getMutablePtr();
960  tasks.warpCounter = rays.getSize();
961  tasks.unfinished = -NUM_WARPS;
962 
963  for(int i = 0; i < rays.getSize(); i++)
964  {
965  // Set ray result buffer
966  RayResult& res = rays.getMutableResultForSlot(i);
967  res.clear();
968 
969  // Set ray tmax
970  Ray& ray = rays.getMutableRayForSlot(i);
971  ray.tmax = defTmax;
972  }
973  rays.getResultBuffer().getMutableCudaPtr();
974  float tTrace = m_module->launchKernelTimed(kernel, blockSize, gridSize);
975  //cout << "Verify trace in " << tTrace << "s" << "\n";
976  */
977 
978  if(m_kernelFile.endsWith("kdtree.cu"))
979  tTrace = traceBatchKdtree(rays, stats);
980  else
981  tTrace = traceBatchBVH(rays, stats);
982  //tTrace = -1.f;
983  tTraceCPU = getCPUTime();
984  m_compiler.setCachePath("cudacache"); // On the first compilation the cache path becames absolute which kills the second compilation
985  m_compiler.setSourceFile(m_kernelFile);
986  m_module = m_compiler.compile();
987 
988  //printf("Verify kernel %d in %f\n", i, tTrace);
989 
990  GPUtravKernel += tTrace;
991  CPUtravKernel += tTraceCPU;
992 }
993 
995 {
996  float minTime = FLT_MAX;
997  float sumTime = 0.f;
998  const int numRepeats = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numRepeats");
999 
1000 // Create the taskData
1001  m_taskData.resizeDiscard(TASK_SIZE * (sizeof(TaskBVH) + sizeof(int)));
1002 
1003  const int width = Environment::GetSingleton()->GetInt("Aila.width");
1004  const int height = Environment::GetSingleton()->GetInt("Aila.height");
1005 
1006  cout << "Prefix scan for problem size of " << width << "x" << height << " = " << width*height << "\n";
1007 
1008 #ifdef TEST_TASKS
1009  cout << "Testing task pool" << "\n";
1010 #else
1011  cout << "Testing thrust" << "\n";
1012 #endif
1013 
1014  for(int i = 0; i < numRepeats; i++)
1015  {
1016 //#ifdef TEST_TASKS
1017  float t = testSort(width*height);
1018  //float t = testSort(m_numTris);
1019 /*#else
1020  float t = testThrustScan(width*height);
1021 #endif*/
1022 
1023  printf("Run %d sort in %fs\n", i, t);
1024  minTime = min(t, minTime);
1025  sumTime += t;
1026  }
1027 
1028  printf("Minimum time from %d runs = %fs\n", numRepeats, minTime);
1029  printf("Average time from %d runs = %fs\n", numRepeats, sumTime/numRepeats);
1030 
1031  //size_t f, t; cuMemGetInfo(&f, &t); fprintf(stdout,"CUDA Memory allocated:%dMB free:%dMB\n",(t-f)/1048576,f/1048576);
1032  return minTime;
1033 }
1034 
1035 void CudaNoStructTracer::updateConstants()
1036 {
1037  RtEnvironment& cudaEnv = *(RtEnvironment*)m_module->getGlobal("c_env").getMutablePtr();
1038 
1039  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.maxDepth", cudaEnv.optMaxDepth);
1040 
1041  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.planeSelectionOverhead", cudaEnv.optPlaneSelectionOverhead);
1042 
1043  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.ci", cudaEnv.optCi);
1044 
1045  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.ct", cudaEnv.optCt);
1046 
1047  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.ctr", cudaEnv.optCtr);
1048 
1049  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.ctt", cudaEnv.optCtt);
1050 
1051  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.triangleBasedWeight", cudaEnv.optTriangleBasedWeight);
1052 
1053  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.rayBasedWeight", cudaEnv.optRayBasedWeight);
1054 
1055  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.axisAlignedWeight", cudaEnv.optAxisAlignedWeight);
1056 
1057  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.cutOffDepth", cudaEnv.optCutOffDepth);
1058  m_cutOffDepth = cudaEnv.optCutOffDepth;
1059 
1060  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.rayLimit", cudaEnv.rayLimit);
1061 
1062  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.triLimit", cudaEnv.triLimit);
1063  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.triMaxLimit", cudaEnv.triMaxLimit);
1064 
1065  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.popCount", cudaEnv.popCount);
1066 
1067  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.granularity", cudaEnv.granularity);
1068 
1069  Environment::GetSingleton()->GetFloatValue("SubdivisionRayCaster.failRq", cudaEnv.failRq);
1070 
1071  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.failureCount", cudaEnv.failureCount);
1072 
1073  int siblingLimit;
1074  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.siblingLimit", siblingLimit);
1075  cudaEnv.siblingLimit = siblingLimit / WARP_SIZE;
1076 
1077  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.childLimit", cudaEnv.childLimit);
1078 
1079  Environment::GetSingleton()->GetIntValue("SubdivisionRayCaster.subtreeLimit", cudaEnv.subtreeLimit);
1080 
1081  cudaEnv.subdivThreshold = (m_bbox.SurfaceArea() / (float)m_numRays) * ((float)cudaEnv.optCt/10.0f);
1082 
1083  cudaEnv.epsilon = m_epsilon;
1084  //cudaEnv.epsilon = 0.f;
1085 }
1086 
1087 //------------------------------------------------------------------------
1088 
1089 int CudaNoStructTracer::warpSubtasks(int threads)
1090 {
1091  //return (threads + WARP_SIZE - 1) / WARP_SIZE;
1092  return max((threads + WARP_SIZE - 1) / WARP_SIZE, 1); // Do not create empty tasks - at least on warp gets to clean this task
1093 }
1094 
1095 //------------------------------------------------------------------------
1096 
1097 int CudaNoStructTracer::floatToOrderedInt(float floatVal)
1098 {
1099  int intVal = *((int*)&floatVal);
1100  return (intVal >= 0) ? intVal : intVal ^ 0x7FFFFFFF;
1101 }
1102 
1103 /*unsigned int CudaNoStructTracer::floatToOrderedInt(float floatVal)
1104 {
1105  unsigned int f = *((unsigned int*)&floatVal);
1106  unsigned int mask = -(int)(f >> 31) | 0x80000000;
1107  return f ^ mask;
1108 }*/
1109 
1110 //------------------------------------------------------------------------
1111 
1112 void CudaNoStructTracer::allocateSnapshots(Buffer &snapData)
1113 {
1114  // Prepare snapshot memory
1115 #ifdef SNAPSHOT_POOL
1116  snapData.resizeDiscard(sizeof(PoolInfo)*SNAPSHOT_POOL);
1117  PoolInfo* &snapshots = *(PoolInfo**)m_module->getGlobal("g_snapshots").getMutablePtr();
1118  snapshots = (PoolInfo*)snapData.getMutableCudaPtr();
1119  snapData.clearRange32(0, 0, SNAPSHOT_POOL*sizeof(PoolInfo)); // Mark all tasks as empty (important for debug)
1120 #endif
1121 #ifdef SNAPSHOT_WARP
1122  snapData.resizeDiscard(sizeof(WarpInfo)*SNAPSHOT_WARP*NUM_WARPS);
1123  WarpInfo* &snapshots = *(WarpInfo**)m_module->getGlobal("g_snapshots").getMutablePtr();
1124  snapshots = (WarpInfo*)snapData.getMutableCudaPtr();
1125  snapData.clearRange32(0, 0, SNAPSHOT_WARP*NUM_WARPS*sizeof(WarpInfo)); // Mark all tasks as empty (important for debug)
1126 #endif
1127 }
1128 
1129 //------------------------------------------------------------------------
1130 
1131 void CudaNoStructTracer::printSnapshots(Buffer &snapData)
1132 {
1133 #ifdef SNAPSHOT_POOL
1134  PoolInfo* snapshots = (PoolInfo*)snapData.getPtr();
1135 
1136  if(snapshots[SNAPSHOT_POOL-1].pool != 0) // Full
1137  printf("\aSnapshot memory full!\n");
1138 
1139  long long int clockMin = snapshots[0].clockStart;
1140  long long int clockMax = 0;
1141  for(int i = 0; i < SNAPSHOT_POOL; i++)
1142  {
1143  if(snapshots[i].pool == 0)
1144  {
1145  clockMax = snapshots[i-1].clockEnd;
1146  break;
1147  }
1148  }
1149 
1150  ofstream snapfile("plots\\pool\\activity.dat");
1151  snapfile << "Snap#\tpool\t#tasks\t#active\t#chunks\tdepth\tclocks" << "\n";
1152  for(int i = 0; i < SNAPSHOT_POOL; i++)
1153  {
1154  if(snapshots[i].pool == 0)
1155  break;
1156 
1157  snapfile << i << "\t" << snapshots[i].pool << "\t" << snapshots[i].tasks << "\t" << snapshots[i].active << "\t" << snapshots[i].chunks << "\t" << snapshots[i].depth
1158  << "\t" << snapshots[i].clockEnd - snapshots[i].clockStart << "\n";
1159  }
1160  snapfile.close();
1161 
1162  snapfile.open("plots\\pool\\activity_clockCor.dat");
1163  snapfile << "Snap#\tpool\t#tasks\t#active\t#chunks\tdepth\tclocks" << "\n";
1164  for(int i = 0; i < SNAPSHOT_POOL; i++)
1165  {
1166  if(snapshots[i].pool == 0)
1167  break;
1168 
1169  snapfile << (float)((long double)(snapshots[i].clockEnd - clockMin) / (long double)(clockMax - clockMin)) << "\t" << snapshots[i].pool << "\t" << snapshots[i].tasks << "\t"
1170  << snapshots[i].active << "\t" << snapshots[i].chunks << "\t" << snapshots[i].depth << "\t" << snapshots[i].clockEnd - snapshots[i].clockStart << "\n";
1171  }
1172 
1173  snapfile.close();
1174 #endif
1175 #ifdef SNAPSHOT_WARP
1176  WarpInfo* snapshots = (WarpInfo*)snapData.getPtr();
1177 
1178  for(int w = 0; w < NUM_WARPS; w++)
1179  {
1180  if(snapshots[SNAPSHOT_WARP-1].reads != 0) // Full
1181  printf("\aSnapshot memory full for warp %d!\n", w);
1182 
1183  ostringstream filename;
1184  filename.fill('0');
1185  filename << "plots\\warps\\warp" << setw(3) << w << ".dat";
1186  //cout << filename.str() << "\n";
1187  ofstream snapfile(filename.str());
1188 
1189  snapfile << "Snap#\t#reads\t#rays\t#tris\ttype(leaf=8)\t#chunks\tpopCount\tdepth\tcDequeue\tcCompute\tstackTop\ttaskIdx" << "\n";
1190  for(int i = 0; i < SNAPSHOT_WARP; i++)
1191  {
1192  if(snapshots[i].reads == 0)
1193  break;
1194 
1195  if(snapshots[i].clockDequeue < snapshots[i].clockSearch || snapshots[i].clockFinished < snapshots[i].clockDequeue)
1196  cout << "Error timer for warp " << w << "\n";
1197 
1198  snapfile << i << "\t" << snapshots[i].reads << "\t" << snapshots[i].rays << "\t" << snapshots[i].tris << "\t" << snapshots[i].type << "\t"
1199  << snapshots[i].chunks << "\t" << snapshots[i].popCount << "\t" << snapshots[i].depth << "\t" << (snapshots[i].clockDequeue - snapshots[i].clockSearch) << "\t"
1200  << (snapshots[i].clockFinished - snapshots[i].clockDequeue) << "\t" << snapshots[i].stackTop << "\t" << snapshots[i].idx << "\n";
1201  }
1202 
1203  snapfile.close();
1204  snapshots += SNAPSHOT_WARP; // Next warp
1205  }
1206 #endif
1207 }
1208 
1209 //------------------------------------------------------------------------
1210 
1211 void CudaNoStructTracer::initPool(int numRays, Buffer* rayBuffer, Buffer* nodeBuffer)
1212 {
1213  // Prepare the task data
1214  updateConstants();
1215 #if PARALLELISM_TEST >= 0
1216  int& numActive = *(int*)m_module->getGlobal("g_numActive").getMutablePtr();
1217  numActive = 1;
1218 #endif
1219 
1220 #ifndef MALLOC_SCRATCHPAD
1221  // Set PPS buffers
1222  m_ppsTris.resizeDiscard(sizeof(int)*m_numTris);
1223  m_ppsTrisIndex.resizeDiscard(sizeof(int)*m_numTris);
1224  m_sortTris.resizeDiscard(sizeof(int)*m_numTris);
1225 
1226  if(numRays > 0)
1227  {
1228  m_ppsRays.resizeDiscard(sizeof(int)*numRays);
1229  m_ppsRaysIndex.resizeDiscard(sizeof(int)*numRays);
1230  m_sortRays.resizeDiscard(sizeof(int)*numRays);
1231  }
1232 #endif
1233 
1234 #if defined(SNAPSHOT_POOL) || defined(SNAPSHOT_WARP)
1235  // Prepare snapshot memory
1236  Buffer snapData;
1237  allocateSnapshots(snapData);
1238 #endif
1239 
1240  // Set all headers empty
1241 #ifdef TEST_TASKS
1242  m_taskData.setOwner(Buffer::Cuda, true); // Make CUDA the owner so that CPU memory is never allocated
1243 #ifdef BENCHMARK
1244  m_taskData.clearRange32(0, TaskHeader_Empty, TASK_SIZE * sizeof(int)); // Mark all tasks as empty
1245 #else
1246  m_taskData.clearRange32(0, TaskHeader_Empty, TASK_SIZE * (sizeof(int)+sizeof(Task))); // Mark all tasks as empty (important for debug)
1247 #endif
1248 #endif
1249 
1250  // Increase printf output size so that more can fit
1251  //cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, 536870912);
1252 
1253  /*cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED); // Driver does not seem to care and preffers L1
1254  cuFuncSetCacheConfig(kernel, CU_FUNC_CACHE_PREFER_SHARED);
1255  CUfunc_cache test;
1256  cuCtxGetCacheConfig(&test);
1257  if(test != CU_FUNC_CACHE_PREFER_SHARED)
1258  printf("Error\n");*/
1259 
1260  // Set texture references.
1261  if(rayBuffer != NULL)
1262  {
1263  m_module->setTexRef("t_rays", *rayBuffer, CU_AD_FORMAT_FLOAT, 4);
1264  }
1265  if(nodeBuffer != NULL)
1266  {
1267  m_module->setTexRef("t_nodesA", *nodeBuffer, CU_AD_FORMAT_FLOAT, 4);
1268  }
1269  m_module->setTexRef("t_trisA", m_trisCompact, CU_AD_FORMAT_FLOAT, 4);
1270  m_module->setTexRef("t_triIndices", m_trisIndex, CU_AD_FORMAT_SIGNED_INT32, 1);
1271 
1272 /*#ifdef COMPACT_LAYOUT
1273  if(numRays == 0)
1274  {
1275  m_module->setTexRef("t_trisAOut", m_trisCompactOut, CU_AD_FORMAT_FLOAT, 4);
1276  m_module->setTexRef("t_triIndicesOut", m_trisIndexOut, CU_AD_FORMAT_SIGNED_INT32, 1);
1277  }
1278 #endif*/
1279 }
1280 
1281 //------------------------------------------------------------------------
1282 
1283 void CudaNoStructTracer::deinitPool(int numRays)
1284 {
1285  m_ppsTris.reset();
1286  m_ppsTrisIndex.reset();
1287  m_sortTris.reset();
1288 
1289  if(numRays > 0)
1290  {
1291  m_ppsRays.reset();
1292  m_ppsRaysIndex.reset();
1293  m_sortRays.reset();
1294  }
1295 }
1296 
1297 //------------------------------------------------------------------------
1298 
1299 void CudaNoStructTracer::printPoolHeader(TaskStackBase* tasks, int* header, int numWarps, FW::String state)
1300 {
1301 #if PARALLELISM_TEST >= 0
1302  numActive = *(int*)m_module->getGlobal("g_numActive").getPtr();
1303  printf("Active: %d\n", numActive);
1304 #endif
1305 
1306 
1307 #if defined(SNAPSHOT_POOL) || defined(SNAPSHOT_WARP)
1308  printSnapshots(snapData);
1309 #endif
1310 
1311 #ifdef DEBUG_INFO
1312  Debug << "\nPRINTING DEBUG_INFO STATISTICS" << "\n\n";
1313 #else
1314  Debug << "\nPRINTING STATISTICS" << "\n\n";
1315 #endif
1316 
1317  float4* debugData = (float4*)m_debug.getPtr();
1318  float minAll[4] = {MAX_FLOAT, MAX_FLOAT, MAX_FLOAT, MAX_FLOAT};
1319  float maxAll[4] = {0, 0, 0, 0};
1320  float sumAll[4] = {0, 0, 0, 0};
1321  int countDead = 0;
1322  Debug << "Warp No. cnt_task_queues Avg. #Reads Max #Reads #Restarts" << "\n";
1323  for(int i = 0; i < numWarps; i++)
1324  {
1325  Debug << "Warp " << i << ": (" << debugData[i].x << ", " << debugData[i].y << ", " << debugData[i].z << ", " << debugData[i].w << ")" << "\n";
1326 
1327  //fabs is because we do not care whether the warp stopped prematurely or not
1328  minAll[0] = min(fabs(debugData[i].x), minAll[0]);
1329  minAll[1] = min(fabs(debugData[i].y), minAll[1]);
1330  minAll[2] = min(fabs(debugData[i].z), minAll[2]);
1331  minAll[3] = min(fabs(debugData[i].w), minAll[3]);
1332 
1333  maxAll[0] = max(fabs(debugData[i].x), maxAll[0]);
1334  maxAll[1] = max(fabs(debugData[i].y), maxAll[1]);
1335  maxAll[2] = max(fabs(debugData[i].z), maxAll[2]);
1336  maxAll[3] = max(fabs(debugData[i].w), maxAll[3]);
1337 
1338  sumAll[0] += fabs(debugData[i].x);
1339  sumAll[1] += fabs(debugData[i].y);
1340  sumAll[2] += fabs(debugData[i].z);
1341  sumAll[3] += fabs(debugData[i].w);
1342 
1343  if(debugData[i].x < 0)
1344  countDead++;
1345  }
1346  Debug << "Dead=" << countDead << " / All=" << numWarps << " = " << (float)countDead/(float)numWarps << "\n";
1347  Debug << "Min: " << minAll[0] << ", " << minAll[1] << ", " << minAll[2] << ", " << minAll[3] << "\n";
1348  Debug << "Max: " << maxAll[0] << ", " << maxAll[1] << ", " << maxAll[2] << ", " << maxAll[3] << "\n";
1349  Debug << "Sum: " << sumAll[0] << ", " << sumAll[1] << ", " << sumAll[2] << ", " << sumAll[3] << "\n";
1350  Debug << "Avg: " << sumAll[0]/numWarps << ", " << sumAll[1]/numWarps << ", " << sumAll[2]/numWarps << ", " << sumAll[3]/numWarps << "\n\n" << "\n";
1351  Debug << "cnt_task_queues per object = " << sumAll[0]/(float)m_numTris << "\n";
1352 
1353  Debug << "Pool" << "\n";
1354  Debug << "Top = " << tasks->top << "; Bottom = " << tasks->bottom << "; Unfinished = " << tasks->unfinished << "; Size = " << tasks->sizePool << "; ";
1355  Debug << state.getPtr() << "\n";
1356  Debug << "ActiveTop = " << tasks->activeTop << "; Active = ";
1357  for(int i = 0; i < ACTIVE_MAX+1; i++)
1358  Debug << tasks->active[i] << " ";
1359  Debug << "\n" << "\n";
1360  Debug << "EmptyTop = " << tasks->emptyTop << "; EmptyBottom = " << tasks->emptyBottom << "\nEmpty\n";
1361  for(int i = 0; i < EMPTY_MAX+1; i++)
1362  {
1363  if(i % 50 == 0)
1364  Debug << "\n";
1365  else
1366  Debug << " ";
1367  Debug << tasks->empty[i];
1368  }
1369 
1370  /*for(int i = 0; i < EMPTY_MAX+1; i++)
1371  Debug << tasks->empty[i] << " ";*/
1372  Debug << "\n" << "\n";
1373 
1374  int emptyItems = 0;
1375  int bellowEmpty = 0;
1376  Debug << "Header" << "\n";
1377  for(int i = 0; i < TASK_SIZE; i++)
1378  {
1379  if(i % 50 == 0)
1380  Debug << "\n";
1381  else
1382  Debug << " ";
1383  if(header[i] != TaskHeader_Empty)
1384  {
1385  Debug << header[i];
1386  }
1387  else
1388  {
1389  Debug << TaskHeader_Active;
1390  if(i < tasks->top)
1391  emptyItems++;
1392  }
1393 
1394  if(header[i] < TaskHeader_Empty)
1395  bellowEmpty++;
1396  }
1397 
1398  Debug << "\n\nEmptyItems = " << emptyItems << "\n";
1399  Debug << "BellowEmpty = " << bellowEmpty << "\n";
1400 }
1401 
1402 //------------------------------------------------------------------------
1403 
1404 void CudaNoStructTracer::printPool(TaskStackBVH &tasks, int numWarps)
1405 {
1406 #ifdef LEAF_HISTOGRAM
1407  printf("Leaf histogram\n");
1408  unsigned int leafSum = 0;
1409  unsigned int triSum = 0;
1410  for(S32 i = 0; i <= Environment::GetSingleton()->GetInt("SubdivisionRayCaster.triLimit"); i++)
1411  {
1412  printf("%d: %d\n", i, tasks.leafHist[i]);
1413  leafSum += tasks.leafHist[i];
1414  triSum += i*tasks.leafHist[i];
1415  }
1416  printf("Leafs total %d, average leaf %.2f\n", leafSum, (float)triSum/(float)leafSum);
1417 #endif
1418 
1419  int* header = (int*)m_taskData.getPtr();
1420  FW::String state = sprintf("BVH Top = %d; Tri Top = %d; Warp counter = %d; ", tasks.nodeTop, tasks.triTop, tasks.warpCounter);
1421 #ifdef BVH_COUNT_NODES
1422  state.appendf("Number of inner nodes = %d; Number of leaves = %d; Sorted tris = %d; ", tasks.numNodes, tasks.numLeaves, tasks.numSortedTris);
1423 #endif
1424  printPoolHeader(&tasks, header, numWarps, state);
1425 
1426  Debug << "\n\nTasks" << "\n";
1427  TaskBVH* task = (TaskBVH*)m_taskData.getPtr(TASK_SIZE*sizeof(int));
1428  int stackMax = 0;
1429  int maxDepth = 0;
1430  int syncCount = 0;
1431  int maxTaskId = -1;
1432  long double sumTris = 0;
1433  long double maxTris = 0;
1434 
1435  int sortTasks = 0;
1436  long double cntSortTris = 0;
1437 
1438  int subFailed = 0;
1439 
1440 #ifdef DEBUG_INFO
1441  char terminatedNames[TerminatedBy_Max][255] = {
1442  "None", "Depth","TotalLimit","OverheadLimit","Cost","FailureCounter"
1443  };
1444 
1445  int terminatedBy[TerminatedBy_Max];
1446  memset(&terminatedBy,0,sizeof(int)*TerminatedBy_Max);
1447 #endif
1448 
1449  for(int i = 0; i < TASK_SIZE; i++)
1450  {
1451  if(task[i].nodeIdx != TaskHeader_Empty || task[i].parentIdx != TaskHeader_Empty)
1452  {
1453 #ifdef DEBUG_INFO
1454  _ASSERT(task[i].terminatedBy >= 0 && task[i].terminatedBy < TerminatedBy_Max);
1455  terminatedBy[ task[i].terminatedBy ]++;
1456 #endif
1457 
1458  Debug << "Task " << i << "\n";
1459  Debug << "Header: " << header[i] << "\n";
1460  Debug << "Unfinished: " << task[i].unfinished << "\n";
1461  Debug << "Type: " << task[i].type << "\n";
1462  Debug << "TriStart: " << task[i].triStart << "\n";
1463  Debug << "TriLeft: " << task[i].triLeft << "\n";
1464  Debug << "TriRight: " << task[i].triRight << "\n";
1465  Debug << "TriEnd: " << task[i].triEnd << "\n";
1466  Debug << "ParentIdx: " << task[i].parentIdx << "\n";
1467  Debug << "NodeIdx: " << task[i].nodeIdx << "\n";
1468  Debug << "TaskID: " << task[i].taskID << "\n";
1469  Debug << "Split: (" << task[i].splitPlane.x << ", " << task[i].splitPlane.y << ", " << task[i].splitPlane.z << ", " << task[i].splitPlane.w << ")\n";
1470  Debug << "Box: (" << task[i].bbox.m_mn.x << ", " << task[i].bbox.m_mn.y << ", " << task[i].bbox.m_mn.z << ") - ("
1471  << task[i].bbox.m_mx.x << ", " << task[i].bbox.m_mx.y << ", " << task[i].bbox.m_mx.z << ")\n";
1472  //Debug << "BoxLeft: (" << task[i].bboxLeft.m_mn.x << ", " << task[i].bboxLeft.m_mn.y << ", " << task[i].bboxLeft.m_mn.z << ") - ("
1473  // << task[i].bboxLeft.m_mx.x << ", " << task[i].bboxLeft.m_mx.y << ", " << task[i].bboxLeft.m_mx.z << ")\n";
1474  //Debug << "BoxRight: (" << task[i].bboxRight.m_mn.x << ", " << task[i].bboxRight.m_mn.y << ", " << task[i].bboxRight.m_mn.z << ") - ("
1475  // << task[i].bboxRight.m_mx.x << ", " << task[i].bboxRight.m_mx.y << ", " << task[i].bboxRight.m_mx.z << ")\n";
1476  Debug << "Axis: " << task[i].axis << "\n";
1477  Debug << "Depth: " << task[i].depth << "\n";
1478  Debug << "Step: " << task[i].step << "\n";
1479 #ifdef DEBUG_INFO
1480  //Debug << "Step: " << task[i].step << "\n";
1481  //Debug << "Lock: " << task[i].lock << "\n";
1482 #ifdef MALLOC_SCRATCHPAD
1483  Debug << "SubFailure: " << task[i].subFailureCounter << "\n";
1484 #endif
1485  Debug << "GMEMSync: " << task[i].sync << "\n";
1486  Debug << "Parent: " << task[i].parent << "\n";
1487 #endif
1488 
1489 #ifdef DEBUG_INFO
1490  Debug << "TerminatedBy: " << task[i].terminatedBy << "\n";
1491 #endif
1492  if(task[i].terminatedBy != TerminatedBy_None)
1493  Debug << "Triangles: " << task[i].triEnd - task[i].triStart << "\n";
1494 
1495  Debug << "\n";
1496  stackMax = i;
1497 
1498  if(header[i] > (int)0xFF800000) // Not waiting
1499  {
1500 #ifdef CUTOFF_DEPTH
1501  if(task[i].depth == m_cutOffDepth)
1502  {
1503 #endif
1504  long double tris = task[i].triEnd - task[i].triStart;
1505  if(task[i].terminatedBy != TerminatedBy_None)
1506  {
1507  if(tris > maxTris)
1508  {
1509  maxTris = tris;
1510  maxTaskId = i;
1511  }
1512  sumTris += tris;
1513  }
1514  sortTasks++;
1515  cntSortTris += tris;
1516 #ifdef CUTOFF_DEPTH
1517  }
1518 #endif
1519 
1520 #ifdef DEBUG_INFO
1521  maxDepth = max(task[i].depth, maxDepth);
1522  syncCount += task[i].sync;
1523 #endif
1524  }
1525  }
1526  }
1527 
1528  if(stackMax == TASK_SIZE-1)
1529  printf("\aIncomplete result!\n");
1530 #ifdef CUTOFF_DEPTH
1531  Debug << "\n\nStatistics for cutoff depth " << m_cutOffDepth << "\n\n";
1532 #else
1533  Debug << "\n\n";
1534 #endif
1535 
1536 #ifdef DEBUG_INFO
1537  Debug << "Avg naive task height (tris) = " << sumTris/(long double)sortTasks << "\n";
1538  Debug << "Max naive task height (tris) = " << maxTris << ", taskId: " << maxTaskId << "\n";
1539  Debug << "Cnt sorted operations = " << sortTasks << "\n";
1540  double cntTrisLog2Tris = (double(m_numTris) * (double)(logf(m_numTris)/logf(2.0f)));
1541  Debug << "Cnt sorted triangles = " << cntSortTris << "\n";
1542  Debug << "Cnt sorted triangles/(N log N), N=#tris = " << cntSortTris/cntTrisLog2Tris << "\n";
1543  Debug << "\n";
1544  Debug << "Max task depth = " << maxDepth << "\n";
1545  Debug << "Cnt gmem synchronizations: " << syncCount << "\n";
1546  Debug << "Leafs failed to subdivide = " << subFailed << " (*3) => total useless tasks " << subFailed * 3 << "\n";
1547  Debug << "Terminated by:" << "\n";
1548  for(int i = 0; i < TerminatedBy_Max; i++)
1549  {
1550  Debug << terminatedNames[i] << ": " << terminatedBy[i] << "\n";
1551  }
1552 #endif
1553 
1554  Debug << "max_queue_length = " << stackMax << "\n\n" << "\n";
1555 }
1556 
1557 //------------------------------------------------------------------------
1558 
1559 void CudaNoStructTracer::printPool(TaskStack &tasks, int numWarps)
1560 {
1561  tasks = *(TaskStack*)m_module->getGlobal("g_taskStack").getPtr();
1562  int* header = (int*)m_taskData.getPtr();
1563  printPoolHeader(&tasks, header, numWarps, FW::sprintf(""));
1564 
1565  Debug << "\n\nTasks" << "\n";
1566  Task* task = (Task*)m_taskData.getPtr(TASK_SIZE*sizeof(int));
1567  int stackMax = 0;
1568  int maxDepth = 0;
1569  int syncCount = 0;
1570  int maxTaskId = -1;
1571  int rayIssues = 0;
1572  int triIssues = 0;
1573  long double sumRays = 0;
1574  long double maxRays = 0;
1575  long double sumTris = 0;
1576  long double maxTris = 0;
1577 
1578  int isectTasks = 0;
1579  long double cntIsect = 0;
1580  long double maxIsect = 0;
1581  long double clippedIsect = 0;
1582 
1583  int sortTasks = 0;
1584  long double cntSortRays = 0;
1585  long double cntClippedRays = 0;
1586  long double cntSortTris = 0;
1587 
1588  int subFailed = 0;
1589  int failureCount = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.failureCount");
1590 
1591 #ifdef DEBUG_INFO
1592  char terminatedNames[TerminatedBy_Max][255] = {
1593  "None", "Depth","TotalLimit","OverheadLimit","Cost","FailureCounter"
1594  };
1595 
1596  int terminatedBy[TerminatedBy_Max];
1597  memset(&terminatedBy,0,sizeof(int)*TerminatedBy_Max);
1598 #endif
1599 
1600  for(int i = 0; i < TASK_SIZE; i++)
1601  {
1602  if(task[i].depend1 != TaskHeader_Empty || task[i].depend2 != TaskHeader_Empty)
1603  {
1604 #ifdef DEBUG_INFO
1605  _ASSERT(task[i].terminatedBy >= 0 && task[i].terminatedBy < TerminatedBy_Max);
1606  terminatedBy[ task[i].terminatedBy ]++;
1607 #endif
1608 
1609  Debug << "Task " << i << "\n";
1610  Debug << "Header: " << header[i] << "\n";
1611  Debug << "Unfinished: " << task[i].unfinished << "\n";
1612  Debug << "Type: " << task[i].type << "\n";
1613  Debug << "RayStart: " << task[i].rayStart << "\n";
1614  Debug << "RayEnd: " << task[i].rayEnd << "\n";
1615  if(task[i].type != TaskType_Intersect) // Splitted
1616  {
1617  Debug << "RayLeft: " << task[i].rayLeft << "\n";
1618  Debug << "RayRight: " << task[i].rayRight << "\n";
1619  Debug << "RayActive: " << task[i].rayActive << "\n";
1620  }
1621 #ifdef CLIP_INTERSECT
1622  if(task[i].type == TaskType_Intersect)
1623  Debug << "RayActive: " << task[i].rayActive << "\n";
1624 #endif
1625  Debug << "TriStart: " << task[i].triStart << "\n";
1626  Debug << "TriEnd: " << task[i].triEnd << "\n";
1627  if(task[i].type != TaskType_Intersect) // Splitted
1628  {
1629  //Debug << "BestOrder: " << task[i].bestOrder << "\n";
1630  Debug << "TriLeft: " << task[i].triLeft << "\n";
1631  Debug << "TriRight: " << task[i].triRight << "\n";
1632  }
1633  Debug << "Depend1: " << task[i].depend1 << "\n";
1634  Debug << "Depend2: " << task[i].depend2 << "\n";
1635  if(task[i].type != TaskType_Intersect) // Splitted
1636  {
1637  Debug << "Split: (" << task[i].splitPlane.x << ", " << task[i].splitPlane.y << ", " << task[i].splitPlane.z << ", " << task[i].splitPlane.w << ")\n";
1638  }
1639  Debug << "Box: (" << task[i].bbox.m_mn.x << ", " << task[i].bbox.m_mn.y << ", " << task[i].bbox.m_mn.z << ") - ("
1640  << task[i].bbox.m_mx.x << ", " << task[i].bbox.m_mx.y << ", " << task[i].bbox.m_mx.z << ")\n";
1641  //Debug << "BoxLeft: (" << task[i].bboxLeft.m_mn.x << ", " << task[i].bboxLeft.m_mn.y << ", " << task[i].bboxLeft.m_mn.z << ") - ("
1642  // << task[i].bboxLeft.m_mx.x << ", " << task[i].bboxLeft.m_mx.y << ", " << task[i].bboxLeft.m_mx.z << ")\n";
1643  //Debug << "BoxMiddle (" << task[i].bboxMiddle.m_mn.x << ", " << task[i].bboxMiddle.m_mn.y << ", " << task[i].bboxMiddle.m_mn.z << ") - ("
1644  // << task[i].bboxMiddle.m_mx.x << ", " << task[i].bboxMiddle.m_mx.y << ", " << task[i].bboxMiddle.m_mx.z << ")\n";
1645  //Debug << "BoxRight: (" << task[i].bboxRight.m_mn.x << ", " << task[i].bboxRight.m_mn.y << ", " << task[i].bboxRight.m_mn.z << ") - ("
1646  // << task[i].bboxRight.m_mx.x << ", " << task[i].bboxRight.m_mx.y << ", " << task[i].bboxRight.m_mx.z << ")\n";
1647  Debug << "Depth: " << task[i].depth << "\n";
1648 #ifdef DEBUG_INFO
1649  //Debug << "Step: " << task[i].step << "\n";
1650  //Debug << "Lock: " << task[i].lock << "\n";
1651  Debug << "SubFailure: " << task[i].subFailureCounter << "\n";
1652  Debug << "GMEMSync: " << task[i].sync << "\n";
1653  Debug << "TaskID: " << task[i].taskID << "\n";
1654  Debug << "Parent: " << task[i].parent << "\n";
1655 #if AABB_TYPE < 3
1656  if(task[i].type == TaskType_AABB_Max)
1657 #elif AABB_TYPE == 3
1658  if(task[i].type == TaskType_AABB)
1659 #endif
1660  {
1661  Debug << "SubtaskIdx: " << task[i].subtaskIdx << "\n";
1662  Debug << "Clipped rays: " << task[i].rayEnd-task[i].rayActive << "\n";
1663  }
1664 #endif
1665 
1666 #ifdef CUTOFF_DEPTH
1667  if(task[i].depth == m_cutOffDepth)
1668 #endif
1669  if(task[i].type == TaskType_Intersect)
1670  {
1671 #ifdef CLIP_INTERSECT
1672  long double locRays = task[i].rayActive - task[i].rayStart;
1673 #else
1674  long double locRays = task[i].rayEnd - task[i].rayStart;
1675 #endif
1676  long double locTris = task[i].triEnd - task[i].triStart;
1677  Debug << "Intersections: " << locRays * locTris << "\n";
1678  //if(locRays > 1000 || locTris > 1000 )
1679  {
1680  if( locRays < sqrt((double)locTris) )
1681  triIssues++;
1682  if( locTris < sqrt((double)locRays) )
1683  rayIssues++;
1684  }
1685 
1686  Debug << "ClippedIntersections: " << task[i].clippedRays * locTris << "\n";
1687  clippedIsect += task[i].clippedRays * locTris;
1688  }
1689 
1690 #ifdef ONE_WARP_RUN
1691  //Debug << "Clock: " << task[i].clockEnd - task[i].clockStart << "\n";
1692  Debug << "Clock: " << task[i].clockEnd << "\n";
1693 #endif
1694 #ifdef DEBUG_INFO
1695  Debug << "TerminatedBy: " << task[i].terminatedBy << "\n";
1696 #endif
1697 
1698  Debug << "\n";
1699  stackMax = i;
1700 
1701 #ifdef CUTOFF_DEPTH
1702  if(task[i].depth == m_cutOffDepth)
1703  {
1704 #endif
1705 
1706 #ifdef CLIP_INTERSECT
1707  long double rays = task[i].rayActive - task[i].rayStart;
1708 #else
1709  long double rays = task[i].rayEnd - task[i].rayStart;
1710 #endif
1711 
1712  long double tris = task[i].triEnd - task[i].triStart;
1713  if(task[i].type == TaskType_Intersect)
1714  {
1715  isectTasks++;
1716  cntIsect += rays*tris;
1717  maxIsect = max<long double>(rays*tris, maxIsect);
1718  if(maxIsect==(rays*tris)) maxTaskId = i;
1719  sumRays += rays;
1720  maxRays = max<long double>(rays, maxRays);
1721  sumTris += tris;
1722  maxTris = max<long double>(tris, maxTris);
1723  if(task[i].subFailureCounter > failureCount)
1724  subFailed++;
1725  }
1726 #if AABB_TYPE < 3
1727  if(task[i].type == TaskType_AABB_Max)
1728 #elif AABB_TYPE == 3
1729  if(task[i].type == TaskType_AABB)
1730 #endif
1731  {
1732  sortTasks++;
1733  cntSortRays += rays;
1734  cntClippedRays += task[i].rayEnd-task[i].rayActive;
1735  cntSortTris += tris;
1736  }
1737 #ifdef CUTOFF_DEPTH
1738  }
1739 #endif
1740 
1741 #ifdef DEBUG_INFO
1742  maxDepth = max(task[i].depth, maxDepth);
1743  syncCount += task[i].sync;
1744 #endif
1745  }
1746  }
1747 
1748  if(stackMax == TASK_SIZE-1)
1749  printf("\aIncomplete result!\n");
1750 #ifdef CUTOFF_DEPTH
1751  Debug << "\n\nStatistics for cutoff depth " << m_cutOffDepth << "\n\n";
1752 #else
1753  Debug << "\n\n";
1754 #endif
1755 
1756 #ifdef DEBUG_INFO
1757  Debug << "ray_obj_intersections per ray = " << cntIsect/m_numRays << "\n";
1758  Debug << "cnt_leaves = " << isectTasks << "\n";
1759  Debug << "cnt_leaves per obj = " << (float)isectTasks/(float)m_numTris << "\n";
1760  Debug << "ray_obj_intersections = " << cntIsect << "\n";
1761  Debug << "Useless ray_obj_intersections = " << clippedIsect << "\n";
1762  Debug << "Avg ray_obj_intersections per leaf = " << cntIsect/(long double)isectTasks << "\n";
1763  Debug << "Max ray_obj_intersections per leaf = " << maxIsect << ", taskId: " << maxTaskId << "\n";
1764  Debug << "reduction [%] = " << 100.0f * (cntIsect/((long double)m_numRays*(long double)m_numTris)) << "\n";
1765  Debug << "Avg naive task width (rays) = " << sumRays/(long double)isectTasks << "\n";
1766  Debug << "Max naive task width (rays) = " << maxRays << "\n";
1767  Debug << "Avg naive task height (tris) = " << sumTris/(long double)isectTasks << "\n";
1768  Debug << "Max naive task height (tris) = " << maxTris << "\n";
1769  Debug << "Cnt sorted operations = " << sortTasks << "\n";
1770  double cntTrisLog2Tris = (double(m_numTris) * (double)(logf(m_numTris)/logf(2.0f)));
1771  double cntRaysLog2Tris = (double(m_numRays) * (double)(logf(m_numTris)/logf(2.0f)));
1772  Debug << "Cnt sorted triangles = " << cntSortTris << "\n";
1773  Debug << "Cnt sorted triangles/(N log N), N=#tris = " << cntSortTris/cntTrisLog2Tris << "\n";
1774  Debug << "Cnt sorted rays = " << cntSortRays << " BEFORE CLIPPING\n";
1775  Debug << "Cnt sorted rays/(log N)/R, N=#tris,R=#rays = " << cntSortRays/cntRaysLog2Tris << " BEFORE CLIPPING\n";
1776  Debug << "Cnt clipped rays = " << cntClippedRays << "\n";
1777  Debug << "\n";
1778  Debug << "Max task depth = " << maxDepth << "\n";
1779  Debug << "Cnt gmem synchronizations: " << syncCount << "\n";
1780  Debug << "Ray issues = " << rayIssues << ", tris issues = " << triIssues << "\n";
1781  Debug << "Leafs failed to subdivide = " << subFailed << " (*3) => total useless tasks " << subFailed * 3 << "\n";
1782 
1783  Debug << "Terminated by:" << "\n";
1784  for(int i = 0; i < TerminatedBy_Max; i++)
1785  {
1786  Debug << terminatedNames[i] << ": " << terminatedBy[i] << "\n";
1787  }
1788 #endif
1789 
1790  Debug << "max_queue_length = " << stackMax << "\n\n" << "\n";
1791 }
1792 
1793 //------------------------------------------------------------------------
1794 
1795 F32 CudaNoStructTracer::traceCudaRayBuffer(RayBuffer& rb)
1796 {
1797  CUfunction kernel;
1798 #ifdef TEST_TASKS
1799  kernel = m_module->getKernel("trace");
1800  if (!kernel)
1801  fail("Trace kernel not found!");
1802 #endif
1803 
1804  // Prepare the task data
1805  initPool(m_numRays, &rb.getRayBuffer());
1806 
1807  // Set input.
1808  KernelInputNoStruct& in = *(KernelInputNoStruct*)m_module->getGlobal("c_ns_in").getMutablePtr();
1809  in.numRays = m_numRays;
1810  in.numTris = m_numTris;
1811  in.anyHit = !rb.getNeedClosestHit();
1812  in.rays = rb.getRayBuffer().getMutableCudaPtr();
1813  in.results = rb.getResultBuffer().getMutableCudaPtr();
1814  in.tris = m_trisCompact.getCudaPtr();
1815  in.trisIndex = m_trisIndex.getMutableCudaPtr();
1816  in.raysIndex = m_raysIndex.getMutableCudaPtr();
1817  in.ppsRaysBuf = m_ppsRays.getMutableCudaPtr();
1818  in.ppsTrisBuf = m_ppsTris.getMutableCudaPtr();
1819  in.ppsRaysIndex = m_ppsRaysIndex.getMutableCudaPtr();
1820  in.ppsTrisIndex = m_ppsTrisIndex.getMutableCudaPtr();
1821  in.sortRays = m_sortRays.getMutableCudaPtr();
1822  in.sortTris = m_sortTris.getMutableCudaPtr();
1823 
1824 
1825 #ifndef TEST_TASKS
1826  kernel = m_module->getKernel("__naive");
1827  if (!kernel)
1828  fail("Trace kernel not found!");
1829 
1830  Vec2i blockSizeN(1024, 1);
1831  Vec2i gridSizeN((m_numRays+1023)/1024, 1);
1832 
1833  float tNaive = m_module->launchKernelTimed(kernel, blockSizeN, gridSizeN);
1834 
1835  printf("Verifying GPU trace\n");
1836  /*for(int i = 0; i < m_numRays; i++)
1837  {
1838  const RayResult& res = rb.getResultForSlot(i);
1839  Debug << "Ray " << i << "\tGPU naive: id=" << res.id << ", t=" << res.t << "\n";
1840  }*/
1841 
1842  return tNaive;
1843 #endif
1844 
1845 
1846 #if SPLIT_TYPE == 3
1847  m_splitData.clearRange(0, 0, sizeof(SplitInfo)); // Set first split to zeros
1848  // Prepare split stack
1849  SplitInfo* &splits = *(SplitInfo**)m_module->getGlobal("g_splitStack").getMutablePtr();
1850  splits = (SplitInfo*)m_splitData.getMutableCudaPtr();
1851 #endif
1852 
1853  CudaAABB bbox;
1854  memcpy(&bbox.m_mn, &m_bbox.min, sizeof(float3));
1855  memcpy(&bbox.m_mx, &m_bbox.max, sizeof(float3));
1856 
1857  // Set parent task containing all the work
1858  Task all;
1859  all.rayStart = 0;
1860  all.rayLeft = 0;
1861  all.rayRight = m_numRays;
1862  all.rayEnd = m_numRays;
1863  all.triStart = 0;
1864  all.triLeft = 0;
1865  all.triRight = m_numTris;
1866  all.triEnd = m_numTris;
1867  all.bbox = bbox;
1868  all.step = 0;
1869  all.depend1 = DependType_Root;
1870  all.depend2 = DependType_None; // Only one task is dependent on this one - the unfinished counter
1871  all.lock = LockType_Free;
1872  all.bestCost = 1e38f;
1873  all.depth = 0;
1874  all.subFailureCounter = 0;
1875  Vector3 size = m_bbox.Diagonal();
1876  all.axis = size.MajorAxis();
1877  all.terminatedBy = TerminatedBy_None;
1878 #ifdef DEBUG_INFO
1879  all.sync = 0;
1880  all.parent = -1;
1881  all.taskID = 0;
1882  all.clippedRays = 0;
1883  all.clockStart = 0;
1884  all.clockEnd = 0;
1885 #endif
1886 
1887 #if SPLIT_TYPE == 0
1888 #if SCAN_TYPE == 0
1889  all.type = TaskType_Sort_PPS1;
1890 #elif SCAN_TYPE == 1
1891  all.type = TaskType_Sort_PPS1_Up;
1892 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
1893  all.type = TaskType_Sort_SORT1;
1894 #endif
1895 
1896  all.unfinished = warpSubtasks(m_numRays) + warpSubtasks(m_numTris);
1897  all.bestOrder = warpSubtasks(m_numRays);
1898  float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
1899  if(all.axis == 0)
1900  all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
1901  else if(all.axis == 1)
1902  all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
1903  else
1904  all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
1905 #else
1906  all.type = TaskType_Split;
1907 #if SPLIT_TYPE == 1
1908  int evaluatedCandidates = (int)sqrtf(m_numRays) + (int)sqrtf(m_numTris);
1909  int numPlanes = 0.5f * (m_numRays + m_numTris)/evaluatedCandidates;
1910  all.unfinished = warpSubtasks(numPlanes); // This must be the same as in the GPU code
1911 #elif SPLIT_TYPE == 2
1912  all.unfinished = 1;
1913 #elif SPLIT_TYPE == 3
1914  all.type = TaskType_SplitParallel;
1915  int evaluatedRays = warpSubtasks((int)sqrtf(m_numRays));
1916  int evaluatedTris = warpSubtasks((int)sqrtf(m_numTris));
1917  all.unfinished = PLANE_COUNT*(evaluatedRays+evaluatedTris); // Each WARP_SIZE rays and tris add their result to one plane
1918 #endif
1919 #endif
1920 
1921 #ifdef DEBUG_PPS
1922  all.type = TaskType_Sort_PPS1_Up;
1923  int pRays = warpSubtasks(m_numRays);
1924  all.bestOrder = pRays;
1925  int pTris = warpSubtasks(m_numTris);
1926  all.unfinished = pRays+pTris;
1927 #endif
1928 
1929  all.origSize = all.unfinished;
1930 
1931  m_taskData.setRange(TASK_SIZE * sizeof(int), &all, sizeof(Task)); // Set the first task
1932 
1933  // Set parent task header
1934  m_taskData.setRange(0, &all.unfinished, sizeof(int)); // Set the first task
1935 
1936  // Prepare the task stack
1937  TaskStack& tasks = *(TaskStack*)m_module->getGlobal("g_taskStack").getMutablePtr();
1938  tasks.header = (int*)m_taskData.getMutableCudaPtr();
1939  tasks.tasks = (Task*)m_taskData.getMutableCudaPtr(TASK_SIZE * sizeof(int));
1940  tasks.top = 0;
1941  tasks.bottom = 0;
1942  //memset(tasks.active, 0, sizeof(int)*(ACTIVE_MAX+1));
1943  memset(tasks.active, -1, sizeof(int)*(ACTIVE_MAX+1));
1944  tasks.active[0] = 0;
1945  //for(int i = 0; i < ACTIVE_MAX+1; i++)
1946  // tasks.active[i] = i;
1947  tasks.activeTop = 1;
1948  //tasks.empty[0] = 0;
1949  //int j = 1;
1950  //for(int i = EMPTY_MAX; i > 0; i--, j++)
1951  // tasks.empty[i] = j;
1952  memset(tasks.empty, 0, sizeof(int)*(EMPTY_MAX+1));
1953  tasks.emptyTop = 0;
1954  tasks.emptyBottom = 0;
1955  tasks.unfinished = -1; // We are waiting for one task to finish = task all
1956  tasks.sizePool = TASK_SIZE;
1957  tasks.sizeNodes = m_bvhData.getSize()/sizeof(CudaKdtreeNode);
1958  tasks.sizeTris = m_trisIndexOut.getSize()/sizeof(S32);
1959 
1960  // Determine block and grid sizes.
1961 #ifdef ONE_WARP_RUN
1962  Vec2i blockSize(WARP_SIZE, 1); // threadIdx.x must equal the thread lane in warp
1963  Vec2i gridSize(1, 1); // Number of SMs * Number of blocks?
1964  int numWarps = 1;
1965 #else
1966  int numWarpsPerBlock = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numWarpsPerBlock");
1967  int numBlocksPerSM = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numBlockPerSM");
1968  Vec2i blockSize(WARP_SIZE, numWarpsPerBlock); // threadIdx.x must equal the thread lane in warp
1969  int gridSizeX = NUM_SM*numBlocksPerSM;
1970  int numWarps = numWarpsPerBlock*gridSizeX;
1971  Vec2i gridSize(gridSizeX, 1); // Number of SMs * Number of blocks?
1972 
1973  if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
1974  printf("\aNUM_WARPS constant does not match the launch parameters\n");
1975 #endif
1976 
1977  m_debug.resizeDiscard(blockSize.y*gridSize.x*sizeof(float4));
1978  m_debug.clear();
1979  in.debug = m_debug.getMutableCudaPtr();
1980 
1981  // Launch.
1982  float tKernel = m_module->launchKernelTimed(kernel, blockSize, gridSize);
1983 
1984 #ifndef BENCHMARK
1985  cuCtxSynchronize(); // Flushes printfs
1986 #endif
1987 
1988 #ifdef DEBUG_PPS
1989  ptout = (S32*)m_ppsTris.getPtr();
1990  stout = (S32*)m_sortTris.getPtr();
1991 
1992  prout = (S32*)m_ppsRays.getPtr();
1993  srout = (S32*)m_sortRays.getPtr();
1994  S32 sum = 0;
1995  S32 error = 0;
1996  int j = 0;
1997  for(int i=0;i<m_numTris;i++)
1998  {
1999  sum += *stout; // Here for inclusive scan
2000  if(*ptout != sum)
2001  {
2002  cout << "PPS error for item " << i << ", CPU=" << sum << ", GPU=" << *ptout << " for " << m_numTris << " triangles!" << "\n";
2003  error = 1;
2004  if(j == 10)
2005  break;
2006  j++;
2007  }
2008  if(*stout < -1 || *stout > 1)
2009  {
2010  cout << "\nWTF " << i << " of " << m_numTris << ": " << *stout << "!\n" << "\n";
2011  break;
2012  }
2013  //sum += *stout; // Here for exclusive scan
2014  ptout++;
2015  stout++;
2016  }
2017 
2018  sum = 0;
2019  for(int i=0;i<m_numRays;i++)
2020  {
2021  sum += *srout; // Here for inclusive scan
2022  if(*prout != sum)
2023  {
2024  cout << "PPS error for item " << i << ", CPU=" << sum << ", GPU=" << *prout << " for " << m_numRays << " rays!" << "\n";
2025  error = 1;
2026  if(j == 10)
2027  break;
2028  j++;
2029  }
2030  if(*srout < -1 || *srout > 2)
2031  {
2032  cout << "\nWTF " << i << " of " << m_numRays << ": " << *srout << "!\n" << "\n";
2033  break;
2034  }
2035  //sum += *srout; // Here for exclusive scan
2036  prout++;
2037  srout++;
2038  }
2039 
2040  if(!error)
2041  cout << "PPS correct for " << m_numTris << " triangles and " << m_numRays << " rays!" << "\n";
2042  return 0;
2043 #endif
2044 
2045  // Set rays index buffer
2046  /*int* ind = (int*)m_raysIndex.getPtr();
2047  int count = 0;
2048  int mismatched = 0;
2049 
2050  // Validate if rays hit triangles
2051  printf("Verifying GPU trace\n");
2052  for(int i = 0; i < m_numRays; i++)
2053  {
2054  const RayResult& res = rb.getResultForSlot(i);
2055  //RayResult& res = rb.getMutableResultForSlot(i);
2056  //res.clear();
2057  Ray ray = rb.getRayForSlot(i);
2058  RayResult cpu;
2059  ray.tmax = 1e36;
2060 
2061  if(i % 10000 == 0)
2062  printf("rid: %d\n", i);
2063  traceCpuRay(ray, cpu, !rb.getNeedClosestHit());
2064  //traceCpuRay(ray, res, !rb.getNeedClosestHit());
2065 
2066  if(ind[i] != i)
2067  count++;
2068 
2069  if(res.id != cpu.id)
2070  {
2071  Debug << "Ray " << i << " CPU/GPU mismatch! Swapped: " << (ind[i] != i) << "\n"
2072  << "\tCPU: id=" << cpu.id << ", t=" << cpu.t << "\n"
2073  << "\tGPU: id=" << res.id << ", t=" << res.t << "\n";
2074  mismatched++;
2075  }
2076 
2077  //Debug << "Ray " << i << " CPU: id=" << cpu.id << ", t=" << cpu.t << "\n";
2078  }
2079  Debug << "Swaped " << count << "\n";
2080  Debug << "Mismatched " << mismatched << "\n";*/
2081 
2082  //Debug << "\nTraced in " << tKernel << "\n\n";
2083 
2084 #ifndef BENCHMARK
2085  tasks = *(TaskStack*)m_module->getGlobal("g_taskStack").getPtr();
2086  printPool(tasks, numWarps);
2087 
2088  /*for(int i = 0; i < m_numRays; i++)
2089  {
2090  const RayResult& res = rb.getResultForSlot(i);
2091  if(res.id == -2)
2092  {
2093  printf("Error on ray %d! Value: (%d, %f, %f, %f)\n", i, res.id, res.t, res.u, res.v);
2094  }
2095  //Debug << "Ray " << i << "! Value: (" << res.id << ", " << res.t << ", " << res.u << ", " << res.v << ")" << "\n";
2096  }*/
2097 
2098  /*CUcontext ctx;
2099  cuCtxPopCurrent(&ctx);
2100  cuCtxDestroy(ctx);
2101  exit(0);*/
2102 #endif
2103 
2104  return tKernel;
2105 }
2106 
2107 F32 CudaNoStructTracer::buildCudaBVH()
2108 {
2109  CUfunction kernel;
2110  kernel = m_module->getKernel("build");
2111  if (!kernel)
2112  fail("Build kernel not found!");
2113 
2114 #ifdef MALLOC_SCRATCHPAD
2115  KernelInputBVH& in = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
2116  in.numTris = m_numTris;
2117  in.tris = m_trisCompact.getCudaPtr();
2118  in.trisIndex = m_trisIndex.getMutableCudaPtr();
2119 #ifdef COMPACT_LAYOUT
2120  in.trisOut = m_trisCompactOut.getMutableCudaPtr();
2121  in.trisIndexOut = m_trisIndexOut.getMutableCudaPtr();
2122 #endif
2123 #endif
2124 
2125  // Prepare the task data
2126  initPool();
2127 
2128 #ifndef MALLOC_SCRATCHPAD
2129  // Set input.
2130  KernelInputBVH& in = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
2131  in.numTris = m_numTris;
2132  in.tris = m_trisCompact.getCudaPtr();
2133  in.trisIndex = m_trisIndex.getMutableCudaPtr();
2134  //in.trisBox = m_trisBox.getCudaPtr();
2135  in.ppsTrisBuf = m_ppsTris.getMutableCudaPtr();
2136  in.ppsTrisIndex = m_ppsTrisIndex.getMutableCudaPtr();
2137  in.sortTris = m_sortTris.getMutableCudaPtr();
2138 #ifdef COMPACT_LAYOUT
2139  in.trisOut = m_trisCompactOut.getMutableCudaPtr();
2140  in.trisIndexOut = m_trisIndexOut.getMutableCudaPtr();
2141 #endif
2142 #else
2143  CUfunction kernelAlloc = m_module->getKernel("allocFreeableMemory", 2*sizeof(int));
2144  if (!kernelAlloc)
2145  fail("Memory allocation kernel not found!");
2146 
2147  int offset = 0;
2148  offset += m_module->setParami(kernelAlloc, offset, m_numTris);
2149  offset += m_module->setParami(kernelAlloc, offset, 0);
2150  F32 allocTime = m_module->launchKernelTimed(kernelAlloc, Vec2i(1,1), Vec2i(1, 1));
2151 
2152 #ifndef BENCHMARK
2153  printf("Memory allocated in %f\n", allocTime);
2154 #endif
2155 
2156  CUfunction kernelMemCpyIndex = m_module->getKernel("MemCpyIndex", sizeof(CUdeviceptr)+sizeof(int));
2157  if (!kernelMemCpyIndex)
2158  fail("Memory copy kernel not found!");
2159 
2160  int memSize = m_trisIndex.getSize()/sizeof(int);
2161  offset = 0;
2162  offset += m_module->setParamPtr(kernelMemCpyIndex, offset, m_trisIndex.getCudaPtr());
2163  offset += m_module->setParami(kernelMemCpyIndex, offset, memSize);
2164  F32 memcpyTime = m_module->launchKernelTimed(kernelMemCpyIndex, Vec2i(256,1), Vec2i((memSize-1+256)/256, 1));
2165 
2166 #ifndef BENCHMARK
2167  printf("Triangle indices copied in %f\n", memcpyTime);
2168 #endif
2169  in = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
2170 #endif
2171 
2172 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2173 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
2174  SplitRed split;
2175  for(int i = 0; i < 2; i++)
2176  {
2177  split.children[i].bbox.m_mn = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
2178  split.children[i].bbox.m_mx = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
2179  split.children[i].cnt = 0;
2180  }
2181 
2182  SplitArray sArray;
2183  for(int i = 0; i < NUM_WARPS; i++)
2184  {
2185  for(int j = 0; j < PLANE_COUNT; j++)
2186  sArray.splits[i][j] = split;
2187  }
2188 #else
2189  SplitRed split;
2190  for(int i = 0; i < 2; i++)
2191  {
2192  //split.children[i].bbox.m_mn = make_float3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
2193  //split.children[i].bbox.m_mx = make_float3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
2194  split.children[i].bbox.m_mn = make_int3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
2195  split.children[i].bbox.m_mx = make_int3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
2196  split.children[i].cnt = 0;
2197  }
2198 
2199  SplitArray sArray;
2200  for(int j = 0; j < PLANE_COUNT; j++)
2201  sArray.splits[j] = split;
2202 
2203  m_splitData.setRange(0, &sArray, sizeof(SplitArray)); // Set the first splits
2204 #endif
2205 
2206  m_splitData.setRange(TASK_SIZE * sizeof(SplitArray), &sArray, sizeof(SplitArray)); // Set the last splits for copy
2207 #endif
2208 
2209  CudaAABB bbox;
2210  memcpy(&bbox.m_mn, &m_bbox.min, sizeof(float3));
2211  memcpy(&bbox.m_mx, &m_bbox.max, sizeof(float3));
2212 
2213  // Set parent task containing all the work
2214  TaskBVH all;
2215  all.triStart = 0;
2216  all.triLeft = 0;
2217 #ifndef MALLOC_SCRATCHPAD
2218  all.triRight = m_numTris;
2219 #else
2220  all.triRight = 0;
2221 #endif
2222  all.triEnd = m_numTris;
2223  all.bbox = bbox;
2224  all.step = 0;
2225  all.lock = LockType_Free;
2226  all.bestCost = 1e38f;
2227  all.depth = 0;
2228  all.dynamicMemory= 0;
2229 #ifndef MALLOC_SCRATCHPAD
2230  all.triIdxCtr = 0;
2231 #endif
2232  all.parentIdx = -1;
2233  all.nodeIdx = 0;
2234  all.taskID = 0;
2235  Vector3 size = m_bbox.Diagonal();
2236  all.axis = size.MajorAxis();
2237  all.terminatedBy = TerminatedBy_None;
2238 #ifdef DEBUG_INFO
2239  all.sync = 0;
2240  all.parent = -1;
2241  all.clockStart = 0;
2242  all.clockEnd = 0;
2243 #endif
2244 
2245 #if SPLIT_TYPE == 0
2246 #if SCAN_TYPE == 0
2247  all.type = TaskType_Sort_PPS1;
2248 #elif SCAN_TYPE == 1
2249  all.type = TaskType_Sort_PPS1_Up;
2250 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
2251  all.type = TaskType_Sort_SORT1;
2252 #endif
2253  all.unfinished = warpSubtasks(m_numTris);
2254  float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
2255  if(all.axis == 0)
2256  all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
2257  else if(all.axis == 1)
2258  all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
2259  else
2260  all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
2261 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2262 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
2263  all.type = TaskType_InitMemory;
2264  all.unfinished = warpSubtasks(sizeof(SplitArray)/sizeof(int));
2265 #else
2266  all.type = TaskType_BinTriangles;
2267  all.unfinished = (warpSubtasks(m_numTris)+BIN_MULTIPLIER-1)/BIN_MULTIPLIER;
2268  /*all.type = TaskType_BuildObjectSAH;
2269  all.unfinished = 1;*/
2270 #endif
2271 #endif
2272  all.origSize = all.unfinished;
2273 
2274  m_taskData.setRange(TASK_SIZE * sizeof(int), &all, sizeof(TaskBVH)); // Set the first task
2275 
2276  // Set parent task header
2277  m_taskData.setRange(0, &all.unfinished, sizeof(int)); // Set the first task
2278 
2279  // Prepare the task stack
2280  TaskStackBVH& tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getMutablePtr();
2281  tasks.header = (int*)m_taskData.getMutableCudaPtr();
2282  tasks.tasks = (TaskBVH*)m_taskData.getMutableCudaPtr(TASK_SIZE * sizeof(int));
2283  tasks.nodeTop = 1;
2284  tasks.triTop = 0;
2285  tasks.top = 0;
2286  tasks.bottom = 0;
2287  //memset(tasks.active, 0, sizeof(int)*(ACTIVE_MAX+1));
2288  memset(tasks.active, -1, sizeof(int)*(ACTIVE_MAX+1));
2289  tasks.active[0] = 0;
2290  /*for(int i = 0; i < ACTIVE_MAX+1; i++)
2291  tasks.active[i] = i;*/
2292  tasks.activeTop = 1;
2293  //tasks.empty[0] = 0;
2294  //int j = 1;
2295  //for(int i = EMPTY_MAX; i > 0; i--, j++)
2296  // tasks.empty[i] = j;
2297  memset(tasks.empty, 0, sizeof(int)*(EMPTY_MAX+1));
2298  tasks.emptyTop = 0;
2299  tasks.emptyBottom = 0;
2300  tasks.unfinished = -1; // We are waiting for one task to finish = task all
2301  tasks.numSortedTris = 0;
2302  tasks.numNodes = 0;
2303  tasks.numLeaves = 0;
2304  tasks.numEmptyLeaves = 0;
2305  tasks.sizePool = TASK_SIZE;
2306  tasks.sizeNodes = m_bvhData.getSize()/sizeof(CudaKdtreeNode);
2307  tasks.sizeTris = m_trisIndexOut.getSize()/sizeof(S32);
2308  memset(tasks.leafHist, 0, sizeof(tasks.leafHist));
2309 
2310 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2311  // Prepare split stack
2312  SplitArray* &splits = *(SplitArray**)m_module->getGlobal("g_redSplits").getMutablePtr();
2313  splits = (SplitArray*)m_splitData.getMutableCudaPtr();
2314 #endif
2315 
2316  CudaBVHNode* &bvh = *(CudaBVHNode**)m_module->getGlobal("g_bvh").getMutablePtr();
2317  bvh = (CudaBVHNode*)m_bvhData.getMutableCudaPtr();
2318 
2319  // Determine block and grid sizes.
2320 #ifdef ONE_WARP_RUN
2321  Vec2i blockSize(WARP_SIZE, 1); // threadIdx.x must equal the thread lane in warp
2322  Vec2i gridSize(1, 1); // Number of SMs * Number of blocks?
2323  int numWarps = 1;
2324 #else
2325  int numWarpsPerBlock = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numWarpsPerBlock");
2326  int numBlocksPerSM = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numBlockPerSM");
2327  Vec2i blockSize(WARP_SIZE, numWarpsPerBlock); // threadIdx.x must equal the thread lane in warp
2328  int gridSizeX = NUM_SM*numBlocksPerSM;
2329  int numWarps = numWarpsPerBlock*gridSizeX;
2330  Vec2i gridSize(gridSizeX, 1); // Number of SMs * Number of blocks?
2331 
2332  if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
2333  printf("\aNUM_WARPS constant does not match the launch parameters\n");
2334 #endif
2335 
2336  m_debug.resizeDiscard(blockSize.y*gridSize.x*sizeof(float4));
2337  m_debug.clear();
2338  in.debug = m_debug.getMutableCudaPtr();
2339 
2340  // Launch.
2341  float tKernel = m_module->launchKernelTimed(kernel, blockSize, gridSize);
2342 
2343 /*#ifdef MALLOC_SCRATCHPAD
2344  CUfunction kernelDealloc = m_module->getKernel("deallocFreeableMemory", 0);
2345  if (!kernelDealloc)
2346  fail("Memory allocation kernel not found!");
2347 
2348  F32 deallocTime = m_module->launchKernelTimed(kernelDealloc, Vec2i(1,1), Vec2i(1, 1));
2349 
2350  printf("Memory freed in %f\n", deallocTime);
2351 #endif*/
2352 
2353 #ifndef BENCHMARK
2354  cuCtxSynchronize(); // Flushes printfs
2355 #endif
2356 
2357 #ifdef DEBUG_PPS
2358  pout = (S32*)m_ppsTris.getPtr();
2359  sout = (S32*)m_sortTris.getPtr();
2360  S32 sum = 0;
2361  S32 error = 0;
2362  int j = 0;
2363  for(int i=0;i<m_numTris;i++)
2364  {
2365  sum += *sout; // Here for inclusive scan
2366  if(*pout != sum)
2367  {
2368  cout << "PPS error for item " << i << ", CPU=" << sum << ", GPU=" << *pout << " for " << m_numTris << " triangles!" << "\n";
2369  error = 1;
2370  if(j == 10)
2371  break;
2372  j++;
2373  }
2374  if(*sout != 0 && *sout != 1)
2375  {
2376  cout << "\nWTF " << i << " of " << m_numTris << ": " << *sout << "!\n" << "\n";
2377  break;
2378  }
2379  //sum += *sout; // Here for exclusive scan
2380  pout++;
2381  sout++;
2382  }
2383 
2384  if(!error)
2385  cout << "PPS correct for " << m_numTris << " triangles!" << "\n";
2386  return 0;
2387 #endif
2388 
2389  tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getPtr();
2390  if(tasks.unfinished != 0 || tasks.top > tasks.sizePool || tasks.nodeTop > m_bvhData.getSize() / sizeof(CudaBVHNode) || tasks.triTop > m_trisIndexOut.getSize() / sizeof(S32)) // Something went fishy
2391  tKernel = 1e38f;
2392  //printf("%d (%d x %d) (%d x %d)\n", tasks.unfinished != 0, tasks.nodeTop, m_bvhData.getSize() / sizeof(CudaBVHNode), tasks.triTop, m_trisIndexOut.getSize() / sizeof(S32));
2393 
2394  //Debug << "\nBuild in " << tKernel << "\n\n";
2395 
2396 #ifndef BENCHMARK
2397  printPool(tasks, numWarps);
2398 
2399  /*Debug << "\n\nBVH" << "\n";
2400  CudaBVHNode* nodes = (CudaBVHNode*)m_bvhData.getPtr();
2401 
2402  for(int i = 0; i < tasks.nodeTop; i++)
2403  {
2404  Debug << "Node " << i << "\n";
2405  Debug << "BoxLeft: (" << nodes[i].c0xy.x << ", " << nodes[i].c0xy.z << ", " << nodes[i].c01z.x << ") - ("
2406  << nodes[i].c0xy.y << ", " << nodes[i].c0xy.w << ", " << nodes[i].c01z.y << ")\n";
2407  Debug << "BoxRight: (" << nodes[i].c1xy.x << ", " << nodes[i].c1xy.z << ", " << nodes[i].c01z.z << ") - ("
2408  << nodes[i].c1xy.y << ", " << nodes[i].c1xy.w << ", " << nodes[i].c01z.w << ")\n";
2409  Debug << "Children: " << nodes[i].children.x << ", " << nodes[i].children.y << "\n\n";
2410  }*/
2411 
2412  // Free data
2413  deinitPool();
2414 #endif
2415 
2416  return tKernel;
2417 }
2418 
2419 F32 CudaNoStructTracer::buildCudaKdtree()
2420 {
2421  CUfunction kernel;
2422  kernel = m_module->getKernel("build");
2423  if (!kernel)
2424  fail("Build kernel not found!");
2425 
2426  KernelInputBVH& in = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
2427  in.numTris = m_numTris;
2428  in.tris = m_trisCompact.getCudaPtr();
2429  in.trisIndex = m_trisIndex.getMutableCudaPtr();
2430 
2431 #ifndef INTERLEAVED_LAYOUT
2432  in.trisOut = m_trisCompactOut.getMutableCudaPtr();
2433  in.trisIndexOut = m_trisIndexOut.getMutableCudaPtr();
2434 #endif
2435 
2436  // Prepare the task data
2437  initPool();
2438  // Set the maximum depth for the current triangle count
2439  RtEnvironment& cudaEnv = *(RtEnvironment*)m_module->getGlobal("c_env").getMutablePtr();
2440  float k1 = Environment::GetSingleton()->GetFloat("SubdivisionRayCaster.depthK1");
2441  float k2 = Environment::GetSingleton()->GetFloat("SubdivisionRayCaster.depthK2");
2442  float f1 = Environment::GetSingleton()->GetFloat("SubdivisionRayCaster.failK1");
2443  float f2 = Environment::GetSingleton()->GetFloat("SubdivisionRayCaster.failK2");
2444  cudaEnv.optMaxDepth = k1 * log2((F32)m_numTris) + k2;
2445  cudaEnv.failureCount = f1 * cudaEnv.optMaxDepth + f2;
2446 #ifndef BENCHMARK
2447  printf("Maximum depth = %d\n", cudaEnv.optMaxDepth);
2448  printf("Failure count = %d\n", cudaEnv.failureCount);
2449 #endif
2450 
2451  int baseOffset = setDynamicMemory();
2452  in = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
2453 
2454 #if SPLIT_TYPE == 3
2455  m_splitData.clearRange(0, 0, sizeof(SplitInfoTri)); // Set first split to zeros
2456 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2457 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
2458  SplitRed split;
2459  for(int i = 0; i < 2; i++)
2460  {
2461  split.children[i].bbox.m_mn = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
2462  split.children[i].bbox.m_mx = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
2463  split.children[i].cnt = 0;
2464  }
2465 
2466  SplitArray sArray;
2467  for(int i = 0; i < NUM_WARPS; i++)
2468  {
2469  for(int j = 0; j < PLANE_COUNT; j++)
2470  sArray.splits[i][j] = split;
2471  }
2472 #else
2473  //SplitRed split;
2474  //for(int i = 0; i < 2; i++)
2475  //{
2476  // //split.children[i].bbox.m_mn = make_float3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
2477  // //split.children[i].bbox.m_mx = make_float3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
2478  // split.children[i].bbox.m_mn = make_int3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
2479  // split.children[i].bbox.m_mx = make_int3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
2480  // split.children[i].cnt = 0;
2481  //}
2482 
2483  //SplitArray sArray;
2484  //for(int j = 0; j < PLANE_COUNT; j++)
2485  // sArray.splits[j] = split;
2486 
2487  //m_splitData.setRange(0, &sArray, sizeof(SplitArray)); // Set the first splits
2488  m_splitData.clearRange(0, 0, sizeof(SplitInfoTri)); // Set first split to zeros
2489 #endif
2490 
2491  //m_splitData.setRange(TASK_SIZE * sizeof(SplitArray), &sArray, sizeof(SplitArray)); // Set the last splits for copy
2492  // Prepare split stack
2493  //SplitArray* &splits = *(SplitArray**)m_module->getGlobal("g_redSplits").getMutablePtr();
2494  //splits = (SplitArray*)m_splitData.getMutableCudaPtr();
2495 
2496  SplitInfoTri* &splits = *(SplitInfoTri**)m_module->getGlobal("g_splitStack").getMutablePtr();
2497  splits = (SplitInfoTri*)m_splitData.getMutableCudaPtr();
2498 #endif
2499 
2500  CudaAABB bbox;
2501  memcpy(&bbox.m_mn, &m_bbox.min, sizeof(float3));
2502  memcpy(&bbox.m_mx, &m_bbox.max, sizeof(float3));
2503  /*bbox.m_mn.x -= m_epsilon;
2504  bbox.m_mn.y -= m_epsilon;
2505  bbox.m_mn.z -= m_epsilon;
2506  bbox.m_mx.x += m_epsilon;
2507  bbox.m_mx.y += m_epsilon;
2508  bbox.m_mx.z += m_epsilon;*/
2509 
2510  // Set parent task containing all the work
2511  TaskBVH all;
2512  all.triStart = 0;
2513  all.triLeft = 0;
2514  all.triRight = 0;
2515  all.triEnd = m_numTris;
2516  all.bbox = bbox;
2517  all.step = 0;
2518  all.lock = LockType_Free;
2519  all.bestCost = 1e38f;
2520  all.depth = 0;
2521  all.dynamicMemory= baseOffset;
2522 #ifdef MALLOC_SCRATCHPAD
2523  all.subFailureCounter = 0;
2524 #endif
2525  all.parentIdx = -1;
2526  all.nodeIdx = 0;
2527  all.taskID = 0;
2528  Vector3 size = m_bbox.Diagonal();
2529  all.axis = size.MajorAxis();
2530  all.terminatedBy = TerminatedBy_None;
2531 #ifdef DEBUG_INFO
2532  all.sync = 0;
2533  all.parent = -1;
2534  all.clockStart = 0;
2535  all.clockEnd = 0;
2536 #endif
2537 
2538 #if SPLIT_TYPE == 0
2539 #if SCAN_TYPE == 0
2540  all.type = TaskType_Sort_PPS1;
2541 #elif SCAN_TYPE == 1
2542  all.type = TaskType_Sort_PPS1_Up;
2543 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
2544  all.type = TaskType_Sort_SORT1;
2545 #endif
2546  all.unfinished = warpSubtasks(m_numTris);
2547  float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
2548  if(all.axis == 0)
2549  all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
2550  else if(all.axis == 1)
2551  all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
2552  else
2553  all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
2554 #elif SPLIT_TYPE == 1
2555  all.type = TaskType_Split;
2556 #if 0 // SQRT candidates
2557  int evaluatedCandidates = (int)sqrtf(m_numTris);
2558  int evaluatedCandidates = 1;
2559  int numPlanes = 0.5f * m_numTris/evaluatedCandidates;
2560 #elif 0 // Fixed candidates
2561  int numPlanes = 32768;
2562 #else // All candidates
2563  int numPlanes = m_numTris*6; // Number of warp sized subtasks
2564 #endif
2565  all.unfinished = warpSubtasks(numPlanes); // This must be the same as in the GPU code
2566 #elif SPLIT_TYPE == 2
2567  all.type = TaskType_Split;
2568  all.unfinished = 1;
2569 #elif SPLIT_TYPE == 3
2570  all.type = TaskType_SplitParallel;
2571  int evaluatedRays = warpSubtasks((int)sqrtf(m_numRays));
2572  int evaluatedTris = warpSubtasks((int)sqrtf(m_numTris));
2573  all.unfinished = PLANE_COUNT*(evaluatedRays+evaluatedTris); // Each WARP_SIZE rays and tris add their result to one plane
2574 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2575 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
2576  all.type = TaskType_InitMemory;
2577  all.unfinished = warpSubtasks(sizeof(SplitArray)/sizeof(int));
2578 #else
2579  all.type = TaskType_BinTriangles;
2580  all.unfinished = (warpSubtasks(m_numTris)+BIN_MULTIPLIER-1)/BIN_MULTIPLIER;
2581  /*all.type = TaskType_BuildObjectSAH;
2582  all.unfinished = 1;*/
2583 #endif
2584 #endif
2585  all.origSize = all.unfinished;
2586 
2587  m_taskData.setRange(TASK_SIZE * sizeof(int), &all, sizeof(TaskBVH)); // Set the first task
2588 
2589  // Set parent task header
2590  m_taskData.setRange(0, &all.unfinished, sizeof(int)); // Set the first task
2591 
2592  // Prepare the task stack
2593  TaskStackBVH& tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getMutablePtr();
2594  tasks.header = (int*)m_taskData.getMutableCudaPtr();
2595  tasks.tasks = (TaskBVH*)m_taskData.getMutableCudaPtr(TASK_SIZE * sizeof(int));
2596 #ifndef INTERLEAVED_LAYOUT
2597  tasks.nodeTop = 1;
2598 #else
2599  tasks.nodeTop = sizeof(CudaKdtreeNode);
2600 #endif
2601  tasks.triTop = 0;
2602  tasks.top = 0;
2603  tasks.bottom = 0;
2604  //memset(tasks.active, 0, sizeof(int)*(ACTIVE_MAX+1));
2605  memset(tasks.active, -1, sizeof(int)*(ACTIVE_MAX+1));
2606  tasks.active[0] = 0;
2607  /*for(int i = 0; i < ACTIVE_MAX+1; i++)
2608  tasks.active[i] = i;*/
2609  tasks.activeTop = 1;
2610  //tasks.empty[0] = 0;
2611  //int j = 1;
2612  //for(int i = EMPTY_MAX; i > 0; i--, j++)
2613  // tasks.empty[i] = j;
2614  memset(tasks.empty, 0, sizeof(int)*(EMPTY_MAX+1));
2615  tasks.emptyTop = 0;
2616  tasks.emptyBottom = 0;
2617  tasks.unfinished = -1; // We are waiting for one task to finish = task all
2618  tasks.numSortedTris = 0;
2619  tasks.numNodes = 0;
2620  tasks.numEmptyLeaves = 0;
2621  tasks.numLeaves = 0;
2622  tasks.sizePool = TASK_SIZE;
2623  tasks.sizeNodes = m_bvhData.getSize()/sizeof(CudaKdtreeNode);
2624  tasks.sizeTris = m_trisIndexOut.getSize()/sizeof(S32);
2625  memset(tasks.leafHist, 0, sizeof(tasks.leafHist));
2626 
2627  CudaKdtreeNode* &kdtree = *(CudaKdtreeNode**)m_module->getGlobal("g_kdtree").getMutablePtr();
2628  kdtree = (CudaKdtreeNode*)m_bvhData.getMutableCudaPtr();
2629 
2630  // Determine block and grid sizes.
2631 #ifdef ONE_WARP_RUN
2632  Vec2i blockSize(WARP_SIZE, 1); // threadIdx.x must equal the thread lane in warp
2633  Vec2i gridSize(1, 1); // Number of SMs * Number of blocks?
2634  int numWarps = 1;
2635 #else
2636  int numWarpsPerBlock = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numWarpsPerBlock");
2637  int numBlocksPerSM = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numBlockPerSM");
2638  Vec2i blockSize(WARP_SIZE, numWarpsPerBlock); // threadIdx.x must equal the thread lane in warp
2639  int gridSizeX = NUM_SM*numBlocksPerSM;
2640  int numWarps = numWarpsPerBlock*gridSizeX;
2641  Vec2i gridSize(gridSizeX, 1); // Number of SMs * Number of blocks?
2642 
2643  if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
2644  printf("\aNUM_WARPS constant does not match the launch parameters\n");
2645 #endif
2646 
2647  m_debug.resizeDiscard(blockSize.y*gridSize.x*sizeof(float4));
2648  m_debug.clear();
2649  in.debug = m_debug.getMutableCudaPtr();
2650 
2651  // Launch.
2652  float tKernel = 0.f;
2653 #ifndef DUPLICATE_REFERENCES
2654  tKernel += convertWoop();
2655 #endif
2656  tKernel += m_module->launchKernelTimed(kernel, blockSize, gridSize);
2657 
2658 /*#ifdef MALLOC_SCRATCHPAD
2659  CUfunction kernelDealloc = m_module->getKernel("deallocFreeableMemory", 0);
2660  if (!kernelDealloc)
2661  fail("Memory allocation kernel not found!");
2662 
2663  F32 deallocTime = m_module->launchKernelTimed(kernelDealloc, Vec2i(1,1), Vec2i(1, 1));
2664 
2665  printf("Memory freed in %f\n", deallocTime);
2666 #endif*/
2667 
2668 #ifndef BENCHMARK
2669  cuCtxSynchronize(); // Flushes printfs
2670 #endif
2671 
2672 #ifdef DEBUG_PPS
2673  pout = (S32*)m_ppsTris.getPtr();
2674  sout = (S32*)m_sortTris.getPtr();
2675  S32 sum = 0;
2676  S32 error = 0;
2677  int j = 0;
2678  for(int i=0;i<m_numTris;i++)
2679  {
2680  sum += *sout; // Here for inclusive scan
2681  if(*pout != sum)
2682  {
2683  cout << "PPS error for item " << i << ", CPU=" << sum << ", GPU=" << *pout << " for " << m_numTris << " triangles!" << "\n";
2684  error = 1;
2685  if(j == 10)
2686  break;
2687  j++;
2688  }
2689  if(*sout != 0 && *sout != 1)
2690  {
2691  cout << "\nWTF " << i << " of " << m_numTris << ": " << *sout << "!\n" << "\n";
2692  break;
2693  }
2694  //sum += *sout; // Here for exclusive scan
2695  pout++;
2696  sout++;
2697  }
2698 
2699  if(!error)
2700  cout << "PPS correct for " << m_numTris << " triangles!" << "\n";
2701  return 0;
2702 #endif
2703 
2704  tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getPtr();
2705 #ifndef INTERLEAVED_LAYOUT
2706  if(tasks.unfinished != 0 || tasks.top > tasks.sizePool || tasks.nodeTop > m_bvhData.getSize() / sizeof(CudaKdtreeNode) || tasks.triTop > m_trisIndexOut.getSize() / sizeof(S32)) // Something went fishy
2707 #else
2708  if(tasks.unfinished != 0 || tasks.nodeTop > m_bvhData.getSize()) // Something went fishy
2709 #endif
2710  tKernel = 1e38f;
2711  //printf("%d (%d x %d) (%d x %d)\n", tasks.unfinished != 0, tasks.nodeTop, m_bvhData.getSize() / sizeof(CudaKdtreeNode), tasks.triTop, m_trisIndexOut.getSize() / sizeof(S32));
2712 
2713  //Debug << "\nBuild in " << tKernel << "\n\n";
2714 
2715 #ifndef BENCHMARK
2716  printPool(tasks, numWarps);
2717 
2718  /*Debug << "\n\nKdtree" << "\n";
2719  CudaBVHNode* nodes = (CudaKdtreeNode*)m_bvhData.getPtr();
2720 
2721  for(int i = 0; i < tasks.nodeTop; i++)
2722  {
2723  Debug << "Node " << i << "\n";
2724  Debug << "BoxLeft: (" << nodes[i].c0xy.x << ", " << nodes[i].c0xy.z << ", " << nodes[i].c01z.x << ") - ("
2725  << nodes[i].c0xy.y << ", " << nodes[i].c0xy.w << ", " << nodes[i].c01z.y << ")\n";
2726  Debug << "BoxRight: (" << nodes[i].c1xy.x << ", " << nodes[i].c1xy.z << ", " << nodes[i].c01z.z << ") - ("
2727  << nodes[i].c1xy.y << ", " << nodes[i].c1xy.w << ", " << nodes[i].c01z.w << ")\n";
2728  Debug << "Children: " << nodes[i].children.x << ", " << nodes[i].children.y << "\n\n";
2729  }*/
2730 
2731  // Free data
2732  deinitPool();
2733 #endif
2734 
2735  return tKernel;
2736 }
2737 
2738 F32 CudaNoStructTracer::testSort(S32 arraySize)
2739 {
2740  m_compiler.setSourceFile("src/rt/kernels/persistent_test.cu");
2741  m_module = m_compiler.compile();
2742  failIfError();
2743 
2744  CUfunction kernel;
2745  //kernel = m_module->getKernel("sort");
2746  //kernel = m_module->getKernel("testMemoryCamping");
2747  kernel = m_module->getKernel("testKeplerSort");
2748  if (!kernel)
2749  fail("Sort kernel not found!");
2750 
2751  // Prepare the task data
2752  initPool();
2753 
2754  // Set ppsTrisIndex
2755  /*S32* tid = (S32*)m_ppsTrisIndex.getMutablePtr();
2756  for(int i=0; i<arraySize/2; i++)
2757  {
2758  *tid = 0;
2759  tid++;
2760  }
2761  for(int i=arraySize/2; i<arraySize; i++)
2762  {
2763  *tid = 1;
2764  tid++;
2765  }*/
2766 
2767  m_trisIndex.resizeDiscard(sizeof(int)*arraySize);
2768  S32* tiout = (S32*)m_trisIndex.getMutablePtr();
2769  for(int i=0; i < arraySize; i++)
2770  {
2771  // indices
2772  *tiout = (arraySize-1) - i;
2773  tiout++;
2774  }
2775 
2776  // Set input.
2777  KernelInputBVH& in = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
2778  in.numTris = arraySize;
2779  in.trisIndex = m_trisIndex.getMutableCudaPtr();
2780  in.ppsTrisBuf = m_ppsTris.getMutableCudaPtr();
2781  in.ppsTrisIndex = m_ppsTrisIndex.getMutableCudaPtr();
2782  in.sortTris = m_sortTris.getMutableCudaPtr();
2783 
2784  // Set parent task containing all the work
2785  TaskBVH all;
2786  all.triStart = 0;
2787  all.triEnd = arraySize;
2788  //all.bbox = bbox;
2789  all.step = 0;
2790  all.lock = 0;
2791  all.bestCost = 1e38f;
2792  all.depth = 0;
2793  all.parentIdx = -1;
2794  all.nodeIdx = 0;
2795  all.taskID = 0;
2796  all.pivot = arraySize / 2;
2797 #ifdef DEBUG_INFO
2798  all.sync = 0;
2799  all.parent = -1;
2800  all.clockStart = 0;
2801  all.clockEnd = 0;
2802 #endif
2803 
2804  all.type = TaskType_Sort_PPS1;
2805  all.unfinished = warpSubtasks(arraySize);
2806  all.origSize = all.unfinished;
2807 
2808  m_taskData.setRange(TASK_SIZE * sizeof(int), &all, sizeof(TaskBVH)); // Set the first task
2809 
2810  // Set parent task header
2811  m_taskData.setRange(0, &all.unfinished, sizeof(int)); // Set the first task
2812 
2813  // Prepare the task stack
2814  TaskStackBVH& tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getMutablePtr();
2815  tasks.header = (int*)m_taskData.getMutableCudaPtr();
2816  tasks.tasks = (TaskBVH*)m_taskData.getMutableCudaPtr(TASK_SIZE * sizeof(int));
2817  tasks.nodeTop = 1;
2818  tasks.top = 0;
2819  tasks.bottom = 0;
2820  memset(tasks.active, 0, sizeof(int)*(ACTIVE_MAX+1));
2821  tasks.activeTop = 1;
2822  //tasks.empty[0] = 0;
2823  //int j = 1;
2824  //for(int i = EMPTY_MAX; i > 0; i--, j++)
2825  // tasks.empty[i] = j;
2826  tasks.emptyTop = 0;
2827  tasks.emptyBottom = 0;
2828  tasks.unfinished = -1; // We are waiting for one task to finish = task all
2829  tasks.sizePool = TASK_SIZE;
2830  tasks.sizeNodes = m_bvhData.getSize()/sizeof(CudaKdtreeNode);
2831  tasks.sizeTris = m_trisIndexOut.getSize()/sizeof(S32);
2832 
2833  // Determine block and grid sizes.
2834 #ifdef ONE_WARP_RUN
2835  Vec2i blockSize(WARP_SIZE, 1); // threadIdx.x must equal the thread lane in warp
2836  Vec2i gridSize(1, 1); // Number of SMs * Number of blocks?
2837 #else
2838  int numWarpsPerBlock = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numWarpsPerBlock");
2839  int numBlocksPerSM = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numBlockPerSM");
2840  Vec2i blockSize(WARP_SIZE, numWarpsPerBlock); // threadIdx.x must equal the thread lane in warp
2841  int gridSizeX = NUM_SM*numBlocksPerSM;
2842  Vec2i gridSize(gridSizeX, 1); // Number of SMs * Number of blocks?
2843 
2844  if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
2845  printf("\aNUM_WARPS constant does not match the launch parameters\n");
2846 #endif
2847 
2848  m_debug.resizeDiscard(blockSize.y*gridSize.x*sizeof(float4));
2849  m_debug.clear();
2850  in.debug = m_debug.getMutableCudaPtr();
2851 
2852  // Launch.
2853  float tKernel = m_module->launchKernelTimed(kernel, blockSize, gridSize, false, 0, false);
2854 
2855 #ifndef BENCHMARK
2856  cuCtxSynchronize(); // Flushes printfs
2857 #endif
2858 
2859  // Verify sort
2860  S32* tsort = (S32*)m_trisIndex.getPtr();
2861  for(int i=0; i < arraySize; i++)
2862  {
2863  if(*tsort != i)
2864  {
2865  printf("Sort error %d instead of %d\n", *tsort, i);
2866  break;
2867  }
2868  tsort++;
2869  }
2870 
2871  Debug << "\nSort in " << tKernel << "\n\n";
2872 
2873  tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getPtr();
2874  int* header = (int*)m_taskData.getPtr();
2875  printPoolHeader(&tasks, header, blockSize.y*gridSize.x, sprintf(""));
2876 
2877  Debug << "\n\nTasks" << "\n";
2878  TaskBVH* task = (TaskBVH*)m_taskData.getPtr(TASK_SIZE*sizeof(int));
2879  int stackMax = 0;
2880  int maxDepth = 0;
2881  int syncCount = 0;
2882  int maxTaskId = -1;
2883  long double sumTris = 0;
2884  long double maxTris = 0;
2885 
2886  int sortTasks = 0;
2887  long double cntSortTris = 0;
2888 
2889  int subFailed = 0;
2890 
2891  for(int i = 0; i < TASK_SIZE; i++)
2892  {
2893  if(task[i].nodeIdx != TaskHeader_Empty || task[i].parentIdx != TaskHeader_Empty)
2894  {
2895  Debug << "Task " << i << "\n";
2896  Debug << "Header: " << header[i] << "\n";
2897  Debug << "Unfinished: " << task[i].unfinished << "\n";
2898  Debug << "Type: " << task[i].type << "\n";
2899  Debug << "TriStart: " << task[i].triStart << "\n";
2900  Debug << "TriEnd: " << task[i].triEnd << "\n";
2901  Debug << "TriRight: " << task[i].triRight << "\n";
2902  Debug << "ParentIdx: " << task[i].parentIdx << "\n";
2903  Debug << "NodeIdx: " << task[i].nodeIdx << "\n";
2904  Debug << "TaskID: " << task[i].taskID << "\n";
2905  Debug << "Depth: " << task[i].depth << "\n";
2906 #ifdef DEBUG_INFO
2907  //Debug << "Step: " << task[i].step << "\n";
2908  //Debug << "Lock: " << task[i].lock << "\n";
2909  //Debug << "SubFailure: " << task[i].subFailureCounter << "\n";
2910  Debug << "GMEMSync: " << task[i].sync << "\n";
2911  Debug << "Parent: " << task[i].parent << "\n";
2912 #endif
2913  Debug << "Triangles: " << task[i].triEnd - task[i].triStart << "\n";
2914  Debug << "Pivot: " << task[i].pivot << "\n";
2915 
2916  Debug << "\n";
2917  stackMax = i;
2918 
2919 #ifdef CUTOFF_DEPTH
2920  if(task[i].depth == m_cutOffDepth)
2921  {
2922 #endif
2923  long double tris = task[i].triEnd - task[i].triStart;
2924  if(tris > maxTris)
2925  {
2926  maxTris = tris;
2927  maxTaskId = i;
2928  }
2929  sumTris += tris;
2930  sortTasks++;
2931  cntSortTris += tris;
2932 #ifdef CUTOFF_DEPTH
2933  }
2934 #endif
2935 
2936 #ifdef DEBUG_INFO
2937  maxDepth = max(task[i].depth, maxDepth);
2938  syncCount += task[i].sync;
2939 #endif
2940  }
2941  }
2942 
2943  if(stackMax == TASK_SIZE-1)
2944  printf("\aIncomplete result!\n");
2945 #ifdef CUTOFF_DEPTH
2946  Debug << "\n\nStatistics for cutoff depth " << m_cutOffDepth << "\n\n";
2947 #else
2948  Debug << "\n\n";
2949 #endif
2950 
2951 #ifdef DEBUG_INFO
2952  Debug << "Avg naive task height (tris) = " << sumTris/(long double)sortTasks << "\n";
2953  Debug << "Max naive task height (tris) = " << maxTris << ", taskId: " << maxTaskId << "\n";
2954  Debug << "Cnt sorted operations = " << sortTasks << "\n";
2955  double cntTrisLog2Tris = (double(arraySize) * (double)(logf(arraySize)/logf(2.0f)));
2956  Debug << "Cnt sorted triangles = " << cntSortTris << "\n";
2957  Debug << "Cnt sorted triangles/(N log N), N=#tris = " << cntSortTris/cntTrisLog2Tris << "\n";
2958  Debug << "\n";
2959  Debug << "Max task depth = " << maxDepth << "\n";
2960  Debug << "Cnt gmem synchronizations: " << syncCount << "\n";
2961  Debug << "Leafs failed to subdivide = " << subFailed << " (*3) => total useless tasks " << subFailed * 3 << "\n";
2962 #endif
2963 
2964  Debug << "max_queue_length = " << stackMax << "\n\n" << "\n";
2965 
2966  return tKernel;
2967 }
2968 
2969 F32 CudaNoStructTracer::traceOnDemandBVHRayBuffer(RayBuffer& rays, bool rebuild)
2970 {
2971  CUfunction kernel;
2972  kernel = m_module->getKernel("build");
2973  if (!kernel)
2974  fail("Build kernel not found!");
2975 
2976  // Prepare the task data
2977  if(rebuild)
2978  {
2979  initPool(0, &rays.getRayBuffer(), &m_bvhData);
2980  }
2981 
2982  RtEnvironment& cudaEnv = *(RtEnvironment*)m_module->getGlobal("c_env").getMutablePtr();
2983  cudaEnv.subdivThreshold = (m_bbox.SurfaceArea() / (float)m_numRays) * ((float)cudaEnv.optCt/10.0f);
2984 
2985  // Set BVH input.
2986  KernelInputBVH& inBVH = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
2987  inBVH.numTris = m_numTris;
2988  inBVH.tris = m_trisCompact.getCudaPtr();
2989  inBVH.trisIndex = m_trisIndex.getMutableCudaPtr();
2990  //inBVH.trisBox = m_trisBox.getCudaPtr();
2991  inBVH.ppsTrisBuf = m_ppsTris.getMutableCudaPtr();
2992  inBVH.ppsTrisIndex = m_ppsTrisIndex.getMutableCudaPtr();
2993  inBVH.sortTris = m_sortTris.getMutableCudaPtr();
2994 #ifdef COMPACT_LAYOUT
2995  inBVH.trisOut = m_trisCompactOut.getMutableCudaPtr();
2996  inBVH.trisIndexOut = m_trisIndexOut.getMutableCudaPtr();
2997 #endif
2998 
2999  // Set traversal input
3000  CUdeviceptr nodePtr = m_bvhData.getCudaPtr();
3001  CUdeviceptr triPtr = m_trisCompact.getCudaPtr();
3002  Buffer& indexBuf = m_trisIndex;
3003  Vec2i nodeOfsA = Vec2i(0, (S32)m_bvhData.getSize());
3004  Vec2i triOfsA = Vec2i(0, (S32)m_trisCompact.getSize());
3005 
3006  // Stop the timer for this copy as it is stopped in other algorithms as well
3007  m_timer.end();
3008  KernelInput& in = *(KernelInput*)m_module->getGlobal("c_in").getMutablePtr();
3009  m_timer.start();
3010  in.numRays = rays.getSize();
3011  in.anyHit = (rays.getNeedClosestHit() == false);
3012  in.nodesA = nodePtr + nodeOfsA.x;
3013  in.trisA = triPtr + triOfsA.x;
3014  in.rays = rays.getRayBuffer().getCudaPtr();
3015  in.results = rays.getResultBuffer().getMutableCudaPtr();
3016  in.triIndices = indexBuf.getCudaPtr();
3017 
3018  if(rebuild)
3019  {
3020 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3021 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
3022  SplitRed split;
3023  for(int i = 0; i < 2; i++)
3024  {
3025  split.children[i].bbox.m_mn = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
3026  split.children[i].bbox.m_mx = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
3027  split.children[i].cnt = 0;
3028  }
3029 
3030  SplitArray sArray;
3031  for(int i = 0; i < NUM_WARPS; i++)
3032  {
3033  for(int j = 0; j < PLANE_COUNT; j++)
3034  sArray.splits[i][j] = split;
3035  }
3036 #else
3037  SplitRed split;
3038  for(int i = 0; i < 2; i++)
3039  {
3040  //split.children[i].bbox.m_mn = make_float3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
3041  //split.children[i].bbox.m_mx = make_float3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
3042  split.children[i].bbox.m_mn = make_int3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
3043  split.children[i].bbox.m_mx = make_int3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
3044  split.children[i].cnt = 0;
3045  }
3046 
3047  SplitArray sArray;
3048  for(int j = 0; j < PLANE_COUNT; j++)
3049  sArray.splits[j] = split;
3050 
3051  m_splitData.setRange(0, &sArray, sizeof(SplitArray)); // Set the first splits
3052 #endif
3053 
3054  m_splitData.setRange(TASK_SIZE * sizeof(SplitArray), &sArray, sizeof(SplitArray)); // Set the last splits for copy
3055 #endif
3056 
3057  m_bvhData.clearRange32(0, UNBUILD_FLAG, sizeof(CudaBVHNode)); // Set the root as unbuild
3058 
3059  CudaAABB bbox;
3060  memcpy(&bbox.m_mn, &m_bbox.min, sizeof(float3));
3061  memcpy(&bbox.m_mx, &m_bbox.max, sizeof(float3));
3062 
3063  // Set parent task containing all the work
3064  TaskBVH all;
3065  all.triStart = 0;
3066  all.triLeft = 0;
3067  all.triRight = m_numTris;
3068  all.triEnd = m_numTris;
3069  all.bbox = bbox;
3070  all.step = 0;
3071  all.lock = LockType_Free;
3072  all.bestCost = 1e38f;
3073  all.depth = 0;
3074 #ifndef MALLOC_SCRATCHPAD
3075  all.triIdxCtr = 0;
3076 #endif
3077  all.parentIdx = -1;
3078  all.nodeIdx = 0;
3079  all.taskID = 0;
3080  Vector3 size = m_bbox.Diagonal();
3081  all.axis = size.MajorAxis();
3082  all.pivot = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.subtreeLimit");
3083  all.terminatedBy = TerminatedBy_None;
3084 #ifdef DEBUG_INFO
3085  all.sync = 0;
3086  all.parent = -1;
3087  all.clockStart = 0;
3088  all.clockEnd = 0;
3089 #endif
3090  all.cached = LockType_None; // Mark this task as cached
3091 
3092 #if SPLIT_TYPE == 0
3093 #if SCAN_TYPE == 0
3094  all.type = TaskType_Sort_PPS1;
3095 #elif SCAN_TYPE == 1
3096  all.type = TaskType_Sort_PPS1_Up;
3097 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
3098  all.type = TaskType_Sort_SORT1;
3099 #endif
3100  all.unfinished = warpSubtasks(m_numTris);
3101  float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
3102  if(all.axis == 0)
3103  all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
3104  else if(all.axis == 1)
3105  all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
3106  else
3107  all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
3108 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3109 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
3110  all.type = TaskType_InitMemory;
3111  all.unfinished = warpSubtasks(sizeof(SplitArray)/sizeof(int));
3112 #else
3113  all.type = TaskType_BinTriangles;
3114  all.unfinished = (warpSubtasks(m_numTris)+BIN_MULTIPLIER-1)/BIN_MULTIPLIER;
3115 #endif
3116 #endif
3117  all.origSize = all.unfinished;
3118 
3119  m_taskData.setRange(TASK_SIZE * sizeof(int), &all, sizeof(TaskBVH)); // Set the first task
3120 
3121  // Set parent task header
3122  m_taskData.setRange(0, &all.unfinished, sizeof(int)); // Set the first task
3123  }
3124 
3125  // Prepare the task stack
3126  TaskStackBVH& tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getMutablePtr();
3127 
3128  tasks.header = (int*)m_taskData.getMutableCudaPtr();
3129  tasks.tasks = (TaskBVH*)m_taskData.getMutableCudaPtr(TASK_SIZE * sizeof(int));
3130  tasks.launchFlag = 0;
3131 
3132  if(rebuild)
3133  {
3134  tasks.nodeTop = 1;
3135  tasks.triTop = 0;
3136  tasks.top = 0;
3137  tasks.bottom = 0;
3138  //memset(tasks.active, 0, sizeof(int)*(ACTIVE_MAX+1));
3139  memset(tasks.active, -1, sizeof(int)*(ACTIVE_MAX+1));
3140  tasks.active[0] = 0;
3141  /*for(int i = 0; i < ACTIVE_MAX+1; i++)
3142  tasks.active[i] = i;*/
3143  tasks.activeTop = 1;
3144  //tasks.empty[0] = 0;
3145  //int j = 1;
3146  //for(int i = EMPTY_MAX; i > 0; i--, j++)
3147  // tasks.empty[i] = j;
3148  memset(tasks.empty, 0, sizeof(int)*(EMPTY_MAX+1));
3149  tasks.emptyTop = 0;
3150  tasks.emptyBottom = 0;
3151  tasks.numSortedTris = 0;
3152  tasks.numNodes = 0;
3153  tasks.numLeaves = 0;
3154  tasks.numEmptyLeaves = 0;
3155  tasks.sizePool = TASK_SIZE;
3156  tasks.sizeNodes = m_bvhData.getSize()/sizeof(CudaKdtreeNode);
3157  tasks.sizeTris = m_trisIndexOut.getSize()/sizeof(S32);
3158  memset(tasks.leafHist, 0, sizeof(tasks.leafHist));
3159  }
3160  /*else
3161  {
3162  tasks.emptyTop = 3;
3163  }*/
3164 
3165  tasks.warpCounter = rays.getSize();
3166  tasks.unfinished = -NUM_WARPS; // We are waiting for one task to finish = task all
3167 
3168 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3169  // Prepare split stack
3170  SplitArray* &splits = *(SplitArray**)m_module->getGlobal("g_redSplits").getMutablePtr();
3171  splits = (SplitArray*)m_splitData.getMutableCudaPtr();
3172 #endif
3173 
3174  CudaBVHNode* &bvh = *(CudaBVHNode**)m_module->getGlobal("g_bvh").getMutablePtr();
3175  bvh = (CudaBVHNode*)m_bvhData.getMutableCudaPtr();
3176 
3177  // Determine block and grid sizes.
3178 #ifdef ONE_WARP_RUN
3179  Vec2i blockSize(WARP_SIZE, 1); // threadIdx.x must equal the thread lane in warp
3180  Vec2i gridSize(1, 1); // Number of SMs * Number of blocks?
3181  int numWarps = 1;
3182 #else
3183  int numWarpsPerBlock = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numWarpsPerBlock");
3184  int numBlocksPerSM = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numBlockPerSM");
3185  Vec2i blockSize(WARP_SIZE, numWarpsPerBlock); // threadIdx.x must equal the thread lane in warp
3186  int gridSizeX = NUM_SM*numBlocksPerSM;
3187  int numWarps = numWarpsPerBlock*gridSizeX;
3188  Vec2i gridSize(gridSizeX, 1); // Number of SMs * Number of blocks?
3189 
3190  if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
3191  printf("\aNUM_WARPS constant does not match the launch parameters\n");
3192 #endif
3193 
3194  m_debug.resizeDiscard(blockSize.y*gridSize.x*sizeof(float4));
3195  m_debug.clear();
3196  inBVH.debug = m_debug.getMutableCudaPtr();
3197 
3198  // Launch.
3199  //cuFuncSetSharedSize(kernel, 0); // Set shared memory to force some launch configurations
3200  float tKernel = m_module->launchKernelTimed(kernel, blockSize, gridSize);
3201 
3202 #ifndef BENCHMARK
3203  cuCtxSynchronize(); // Flushes printfs
3204 #endif
3205 
3206  tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getPtr();
3207  if(tasks.unfinished != 0 || tasks.top > tasks.sizePool || tasks.nodeTop > m_bvhData.getSize() / sizeof(CudaBVHNode) || tasks.triTop > m_trisIndexOut.getSize() / sizeof(S32)) // Something went fishy
3208  tKernel = 1e38f;
3209 
3210  //Debug << "\nBuild in " << tKernel << "\n\n";
3211 
3212 #ifndef BENCHMARK
3213  printPool(tasks, numWarps);
3214 
3215  /*Debug << "\n\nBVH" << "\n";
3216  CudaBVHNode* nodes = (CudaBVHNode*)m_bvhData.getPtr();
3217 
3218  for(int i = 0; i < tasks.nodeTop; i++)
3219  {
3220  Debug << "Node " << i << "\n";
3221  Debug << "BoxLeft: (" << nodes[i].c0xy.x << ", " << nodes[i].c0xy.z << ", " << nodes[i].c01z.x << ") - ("
3222  << nodes[i].c0xy.y << ", " << nodes[i].c0xy.w << ", " << nodes[i].c01z.y << ")\n";
3223  Debug << "BoxRight: (" << nodes[i].c1xy.x << ", " << nodes[i].c1xy.z << ", " << nodes[i].c01z.z << ") - ("
3224  << nodes[i].c1xy.y << ", " << nodes[i].c1xy.w << ", " << nodes[i].c01z.w << ")\n";
3225  Debug << "Children: " << nodes[i].children.x << ", " << nodes[i].children.y << "\n\n";
3226  }*/
3227 #endif
3228 
3229  return tKernel;
3230 }
3231 
3232 F32 CudaNoStructTracer::traceOnDemandKdtreeRayBuffer(RayBuffer& rays, bool rebuild)
3233 {
3234  CUfunction kernel;
3235  kernel = m_module->getKernel("build");
3236  if (!kernel)
3237  fail("Build kernel not found!");
3238 
3239  // Prepare the task data
3240  if(rebuild)
3241  {
3242  initPool(0, &rays.getRayBuffer(), &m_bvhData);
3243  }
3244 
3245  RtEnvironment& cudaEnv = *(RtEnvironment*)m_module->getGlobal("c_env").getMutablePtr();
3246  cudaEnv.subdivThreshold = (m_bbox.SurfaceArea() / (float)m_numRays) * ((float)cudaEnv.optCt/10.0f);
3247  float k1 = Environment::GetSingleton()->GetFloat("SubdivisionRayCaster.depthK1");
3248  float k2 = Environment::GetSingleton()->GetFloat("SubdivisionRayCaster.depthK2");
3249  cudaEnv.optMaxDepth = k1 * log2((F32)m_numTris) + k2;
3250  //cudaEnv.failureCount = 0.2f*cudaEnv.optMaxDepth + 1.0f;
3251 #ifndef BENCHMARK
3252  if(rebuild)
3253  {
3254  printf("Maximum depth = %d\n", cudaEnv.optMaxDepth);
3255  printf("Failure count = %d\n", cudaEnv.failureCount);
3256  }
3257 #endif
3258 
3259  // Set BVH input.
3260  KernelInputBVH& inBVH = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
3261  inBVH.numTris = m_numTris;
3262  inBVH.tris = m_trisCompact.getCudaPtr();
3263  inBVH.trisIndex = m_trisIndex.getMutableCudaPtr();
3264 #ifndef INTERLEAVED_LAYOUT
3265  inBVH.trisOut = m_trisCompactOut.getMutableCudaPtr();
3266  inBVH.trisIndexOut = m_trisIndexOut.getMutableCudaPtr();
3267 #endif
3268 
3269  // Set traversal input
3270  CUdeviceptr nodePtr = m_bvhData.getCudaPtr();
3271  Vec2i nodeOfsA = Vec2i(0, (S32)m_bvhData.getSize());
3272 #ifndef INTERLEAVED_LAYOUT
3273  CUdeviceptr triPtr = m_trisCompactOut.getCudaPtr();
3274  Vec2i triOfsA = Vec2i(0, (S32)m_trisCompactOut.getSize());
3275  Buffer& indexBuf = m_trisIndexOut;
3276 #else
3277  CUdeviceptr triPtr = m_bvhData.getCudaPtr();
3278  Vec2i triOfsA = Vec2i(0, (S32)m_bvhData.getSize());
3279  Buffer& indexBuf = m_bvhData;
3280 #endif
3281 
3282  // Stop the timer for this copy as it is stopped in other algorithms as well
3283  m_timer.end();
3284  KernelInput& in = *(KernelInput*)m_module->getGlobal("c_in").getMutablePtr();
3285  m_timer.start();
3286  in.numRays = rays.getSize();
3287  in.anyHit = (rays.getNeedClosestHit() == false);
3288  memcpy(&in.bmin, &m_bbox.min, sizeof(float3));
3289  memcpy(&in.bmax, &m_bbox.max, sizeof(float3));
3290  in.nodesA = nodePtr + nodeOfsA.x;
3291  in.trisA = triPtr + triOfsA.x;
3292  in.rays = rays.getRayBuffer().getCudaPtr();
3293  in.results = rays.getResultBuffer().getMutableCudaPtr();
3294  in.triIndices = indexBuf.getCudaPtr();
3295 
3296  // Set texture references.
3297  m_module->setTexRef("t_rays", rays.getRayBuffer(), CU_AD_FORMAT_FLOAT, 4);
3298  m_module->setTexRef("t_nodesI", nodePtr + nodeOfsA.x, nodeOfsA.y, CU_AD_FORMAT_FLOAT, 4);
3299  //m_module->setTexRef("t_trisA", triPtr + triOfsA.x, triOfsA.y, CU_AD_FORMAT_FLOAT, 4);
3300  //m_module->setTexRef("t_triIndices", indexBuf, CU_AD_FORMAT_SIGNED_INT32, 1);
3301 
3302  if(rebuild)
3303  {
3304  int baseOffset = setDynamicMemory();
3305  inBVH = *(KernelInputBVH*)m_module->getGlobal("c_bvh_in").getMutablePtr();
3306 
3307 #if SPLIT_TYPE == 3
3308  m_splitData.clearRange(0, 0, sizeof(SplitInfoTri)); // Set first split to zeros
3309 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3310 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
3311  SplitRed split;
3312  for(int i = 0; i < 2; i++)
3313  {
3314  split.children[i].bbox.m_mn = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
3315  split.children[i].bbox.m_mx = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
3316  split.children[i].cnt = 0;
3317  }
3318 
3319  SplitArray sArray;
3320  for(int i = 0; i < NUM_WARPS; i++)
3321  {
3322  for(int j = 0; j < PLANE_COUNT; j++)
3323  sArray.splits[i][j] = split;
3324  }
3325 #else
3326  //SplitRed split;
3327  //for(int i = 0; i < 2; i++)
3328  //{
3329  // //split.children[i].bbox.m_mn = make_float3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
3330  // //split.children[i].bbox.m_mx = make_float3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
3331  // split.children[i].bbox.m_mn = make_int3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
3332  // split.children[i].bbox.m_mx = make_int3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
3333  // split.children[i].cnt = 0;
3334  //}
3335 
3336  //SplitArray sArray;
3337  //for(int j = 0; j < PLANE_COUNT; j++)
3338  // sArray.splits[j] = split;
3339 
3340  //m_splitData.setRange(0, &sArray, sizeof(SplitArray)); // Set the first splits
3341  m_splitData.clearRange(0, 0, sizeof(SplitInfoTri)); // Set first split to zeros
3342 #endif
3343 
3344  //m_splitData.setRange(TASK_SIZE * sizeof(SplitArray), &sArray, sizeof(SplitArray)); // Set the last splits for copy
3345  // Prepare split stack
3346  //SplitArray* &splits = *(SplitArray**)m_module->getGlobal("g_redSplits").getMutablePtr();
3347  //splits = (SplitArray*)m_splitData.getMutableCudaPtr();
3348 
3349  SplitInfoTri* &splits = *(SplitInfoTri**)m_module->getGlobal("g_splitStack").getMutablePtr();
3350  splits = (SplitInfoTri*)m_splitData.getMutableCudaPtr();
3351 #endif
3352 
3353  m_bvhData.clearRange32(0, UNBUILD_FLAG, sizeof(CudaKdtreeNode)); // Set the root as unbuild
3354 
3355  CudaAABB bbox;
3356  memcpy(&bbox.m_mn, &m_bbox.min, sizeof(float3));
3357  memcpy(&bbox.m_mx, &m_bbox.max, sizeof(float3));
3358 
3359  // Set parent task containing all the work
3360  TaskBVH all;
3361  all.triStart = 0;
3362  all.triLeft = 0;
3363  all.triRight = 0;
3364  all.triEnd = m_numTris;
3365  all.bbox = bbox;
3366  all.step = 0;
3367  all.lock = LockType_Free;
3368  all.bestCost = 1e38f;
3369  all.depth = 0;
3370  all.dynamicMemory= baseOffset;
3371 #ifdef MALLOC_SCRATCHPAD
3372  all.subFailureCounter = 0;
3373 #endif
3374  all.parentIdx = -1;
3375  all.nodeIdx = 0;
3376  all.taskID = 0;
3377  Vector3 size = m_bbox.Diagonal();
3378  all.axis = size.MajorAxis();
3379  all.pivot = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.subtreeLimit");
3380  all.terminatedBy = TerminatedBy_None;
3381 #ifdef DEBUG_INFO
3382  all.sync = 0;
3383  all.parent = -1;
3384  all.clockStart = 0;
3385  all.clockEnd = 0;
3386 #endif
3387  all.cached = LockType_None; // Mark this task as cached
3388 
3389 #if SPLIT_TYPE == 0
3390 #if SCAN_TYPE == 0
3391  all.type = TaskType_Sort_PPS1;
3392 #elif SCAN_TYPE == 1
3393  all.type = TaskType_Sort_PPS1_Up;
3394 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
3395  all.type = TaskType_Sort_SORT1;
3396 #endif
3397  all.unfinished = warpSubtasks(m_numTris);
3398  float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
3399  if(all.axis == 0)
3400  all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
3401  else if(all.axis == 1)
3402  all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
3403  else
3404  all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
3405 #elif SPLIT_TYPE == 1
3406  all.type = TaskType_Split;
3407 #if 0 // SQRT candidates
3408  int evaluatedCandidates = (int)sqrtf(m_numTris);
3409  int evaluatedCandidates = 1;
3410  int numPlanes = 0.5f * m_numTris/evaluatedCandidates;
3411 #elif 0 // Fixed candidates
3412  int numPlanes = 32768;
3413 #else // All candidates
3414  int numPlanes = m_numTris*6; // Number of warp sized subtasks
3415 #endif
3416  all.unfinished = warpSubtasks(numPlanes); // This must be the same as in the GPU code
3417 #elif SPLIT_TYPE == 2
3418  all.type = TaskType_Split;
3419  all.unfinished = 1;
3420 #elif SPLIT_TYPE == 3
3421  all.type = TaskType_SplitParallel;
3422  int evaluatedRays = warpSubtasks((int)sqrtf(m_numRays));
3423  int evaluatedTris = warpSubtasks((int)sqrtf(m_numTris));
3424  all.unfinished = PLANE_COUNT*(evaluatedRays+evaluatedTris); // Each WARP_SIZE rays and tris add their result to one plane
3425 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3426 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
3427  all.type = TaskType_InitMemory;
3428  all.unfinished = warpSubtasks(sizeof(SplitArray)/sizeof(int));
3429 #else
3430  all.type = TaskType_BinTriangles;
3431  all.unfinished = (warpSubtasks(m_numTris)+BIN_MULTIPLIER-1)/BIN_MULTIPLIER;
3432 #endif
3433 #endif
3434  all.origSize = all.unfinished;
3435 
3436  m_taskData.setRange(TASK_SIZE * sizeof(int), &all, sizeof(TaskBVH)); // Set the first task
3437 
3438  // Set parent task header
3439  m_taskData.setRange(0, &all.unfinished, sizeof(int)); // Set the first task
3440  }
3441 
3442  // Prepare the task stack
3443  TaskStackBVH& tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getMutablePtr();
3444 
3445  tasks.header = (int*)m_taskData.getMutableCudaPtr();
3446  tasks.tasks = (TaskBVH*)m_taskData.getMutableCudaPtr(TASK_SIZE * sizeof(int));
3447  tasks.launchFlag = 0;
3448 
3449  if(rebuild)
3450  {
3451 #ifndef INTERLEAVED_LAYOUT
3452  tasks.nodeTop = 1;
3453 #else
3454  tasks.nodeTop = sizeof(CudaKdtreeNode);
3455 #endif
3456  tasks.triTop = 0;
3457  tasks.top = 0;
3458  tasks.bottom = 0;
3459  //memset(tasks.active, 0, sizeof(int)*(ACTIVE_MAX+1));
3460  memset(tasks.active, -1, sizeof(int)*(ACTIVE_MAX+1));
3461  tasks.active[0] = 0;
3462  /*for(int i = 0; i < ACTIVE_MAX+1; i++)
3463  tasks.active[i] = i;*/
3464  tasks.activeTop = 1;
3465  //tasks.empty[0] = 0;
3466  //int j = 1;
3467  //for(int i = EMPTY_MAX; i > 0; i--, j++)
3468  // tasks.empty[i] = j;
3469  memset(tasks.empty, 0, sizeof(int)*(EMPTY_MAX+1));
3470  tasks.emptyTop = 0;
3471  tasks.emptyBottom = 0;
3472  tasks.numSortedTris = 0;
3473  tasks.numNodes = 0;
3474  tasks.numLeaves = 0;
3475  tasks.numEmptyLeaves = 0;
3476  tasks.sizePool = TASK_SIZE;
3477  tasks.sizeNodes = m_bvhData.getSize()/sizeof(CudaKdtreeNode);
3478  tasks.sizeTris = m_trisIndexOut.getSize()/sizeof(S32);
3479  memset(tasks.leafHist, 0, sizeof(tasks.leafHist));
3480  }
3481  /*else
3482  {
3483  tasks.emptyTop = 3;
3484  }*/
3485 
3486  tasks.warpCounter = rays.getSize();
3487 #ifndef ONDEMAND_FULL_BUILD
3488  tasks.unfinished = -NUM_WARPS; // We are waiting for all trace warps to finish
3489 #else
3490  tasks.unfinished = -1; // We are waiting for one task to finish = task all
3491 #endif
3492 
3493  CudaKdtreeNode* &kdtree = *(CudaKdtreeNode**)m_module->getGlobal("g_kdtree").getMutablePtr();
3494  kdtree = (CudaKdtreeNode*)m_bvhData.getMutableCudaPtr();
3495 
3496  // Determine block and grid sizes.
3497 #ifdef ONE_WARP_RUN
3498  Vec2i blockSize(WARP_SIZE, 1); // threadIdx.x must equal the thread lane in warp
3499  Vec2i gridSize(1, 1); // Number of SMs * Number of blocks?
3500  int numWarps = 1;
3501 #else
3502  int numWarpsPerBlock = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numWarpsPerBlock");
3503  int numBlocksPerSM = Environment::GetSingleton()->GetInt("SubdivisionRayCaster.numBlockPerSM");
3504  Vec2i blockSize(WARP_SIZE, numWarpsPerBlock); // threadIdx.x must equal the thread lane in warp
3505  int gridSizeX = NUM_SM*numBlocksPerSM;
3506  int numWarps = numWarpsPerBlock*gridSizeX;
3507  Vec2i gridSize(gridSizeX, 1); // Number of SMs * Number of blocks?
3508 
3509  if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
3510  printf("\aNUM_WARPS constant does not match the launch parameters\n");
3511 #endif
3512 
3513  m_debug.resizeDiscard(blockSize.y*gridSize.x*sizeof(float4));
3514  m_debug.clear();
3515  inBVH.debug = m_debug.getMutableCudaPtr();
3516 
3517  // Launch.
3518  //cuFuncSetSharedSize(kernel, 0); // Set shared memory to force some launch configurations
3519  float tKernel = 0.f;
3520 #ifndef DUPLICATE_REFERENCES
3521  if(rebuild)
3522  tKernel += convertWoop();
3523 #endif
3524  tKernel += m_module->launchKernelTimed(kernel, blockSize, gridSize);
3525 
3526 /*#ifdef MALLOC_SCRATCHPAD
3527  CUfunction kernelDealloc = m_module->getKernel("deallocFreeableMemory", 0);
3528  if (!kernelDealloc)
3529  fail("Memory allocation kernel not found!");
3530 
3531  F32 deallocTime = m_module->launchKernelTimed(kernelDealloc, Vec2i(1,1), Vec2i(1, 1));
3532 
3533  printf("Memory freed in %f\n", deallocTime);
3534 #endif*/
3535 
3536 #ifndef BENCHMARK
3537  cuCtxSynchronize(); // Flushes printfs
3538 #endif
3539 
3540  tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getPtr();
3541 #ifndef INTERLEAVED_LAYOUT
3542  if(tasks.unfinished != 0 || tasks.top > tasks.sizePool || tasks.nodeTop > m_bvhData.getSize() / sizeof(CudaKdtreeNode) || tasks.triTop > m_trisIndexOut.getSize() / sizeof(S32)) // Something went fishy
3543 #else
3544  if(tasks.unfinished != 0 || tasks.nodeTop > m_bvhData.getSize()) // Something went fishy
3545 #endif
3546  tKernel = 1e38f;
3547 
3548  //Debug << "\nBuild in " << tKernel << "\n\n";
3549 
3550 #ifndef BENCHMARK
3551  printPool(tasks, numWarps);
3552 
3553  /*Debug << "\n\nBVH" << "\n";
3554  CudaBVHNode* nodes = (CudaBVHNode*)m_bvhData.getPtr();
3555 
3556  for(int i = 0; i < tasks.nodeTop; i++)
3557  {
3558  Debug << "Node " << i << "\n";
3559  Debug << "BoxLeft: (" << nodes[i].c0xy.x << ", " << nodes[i].c0xy.z << ", " << nodes[i].c01z.x << ") - ("
3560  << nodes[i].c0xy.y << ", " << nodes[i].c0xy.w << ", " << nodes[i].c01z.y << ")\n";
3561  Debug << "BoxRight: (" << nodes[i].c1xy.x << ", " << nodes[i].c1xy.z << ", " << nodes[i].c01z.z << ") - ("
3562  << nodes[i].c1xy.y << ", " << nodes[i].c1xy.w << ", " << nodes[i].c01z.w << ")\n";
3563  Debug << "Children: " << nodes[i].children.x << ", " << nodes[i].children.y << "\n\n";
3564  }*/
3565 #endif
3566 
3567  return tKernel;
3568 }
3569 
3570 F32 CudaNoStructTracer::traceCpuRayBuffer(RayBuffer& rb)
3571 {
3572  const Ray* rays = (const Ray*)rb.getRayBuffer().getPtr();
3573  RayResult* results = (RayResult*)rb.getResultBuffer().getMutablePtr();
3574  for(int rid=0; rid < rb.getSize(); rid++)
3575  {
3576  if(rid % 10000 == 0) printf("rid: %d\n",rid);
3577  traceCpuRay(rays[rid], results[rid], !rb.getNeedClosestHit());
3578  }
3579 
3580  return 0;
3581 }
3582 
3583 void CudaNoStructTracer::traceCpuRay(const Ray& r, RayResult& result, bool anyHit)
3584 {
3585  const Vec4f *t_trisA = (Vec4f*)(m_trisCompact.getPtr());
3586  const S32 *t_trisIndices = (S32*)(m_trisIndex.getPtr());
3587 
3588  int hitIndex;
3589  float hitT;
3590  float hitU;
3591  float hitV;
3592  float tmin;
3593 
3594  hitIndex = -1;
3595  hitT = r.tmax;
3596  hitU = 0;
3597  hitV = 0;
3598  tmin = 0;
3599 
3600  // naive over all triangles
3601  for (int triAddr = 0; triAddr < m_numTris * 3 ; triAddr += 3)
3602  {
3603  const Vec3f &v00 = t_trisA[triAddr + 0].getXYZ();
3604  const Vec3f &v11 = t_trisA[triAddr + 1].getXYZ();
3605  const Vec3f &v22 = t_trisA[triAddr + 2].getXYZ();
3606 
3607  Vec3f nrmN = cross(v11-v00,v22-v00);
3608  const float den = dot(nrmN,r.direction);
3609 
3610  if(den >= 0.0f)
3611  continue;
3612 
3613  const float deni = 1.0f / den;
3614  const Vec3f org0 = v00-r.origin;
3615  float t = dot(nrmN,org0)*deni;
3616 
3617  if (t > tmin && t < hitT)
3618  {
3619  const Vec3f crossProd = cross(r.direction,org0);
3620  const float v = dot(v00-v22,crossProd)*deni;
3621  if (v >= 0.0f && v <= 1.0f)
3622  {
3623  const float u = -dot(v00-v11,crossProd)*deni;
3624  if (u >= 0.0f && u + v <= 1.0f)
3625  {
3626  hitT = t;
3627  hitU = u;
3628  hitV = v;
3629  hitIndex = triAddr;
3630  }
3631  }
3632  }
3633  }
3634 
3635  if(hitIndex != -1)
3636  hitIndex = hitIndex / 3;
3637 
3638  result.id = hitIndex;
3639  result.t = hitT;
3640  result.u = hitU;
3641  result.v = hitV;
3642 }
3643 
3644 void CudaNoStructTracer::saveBufferSizes(bool ads, bool aux)
3645 {
3646  float MB = (float)(1024*1024);
3647 
3648  if(ads)
3649  {
3650  m_sizeADS = m_bvhData.getSize()/MB;
3651 #ifndef COMPACT_LAYOUT
3652  m_sizeTri = m_trisCompact.getSize()/MB;
3653  m_sizeTriIdx = m_trisIndex.getSize()/MB;
3654 #else
3655  m_sizeTri = m_trisCompactOut.getSize()/MB;
3656  m_sizeTriIdx = m_trisIndexOut.getSize()/MB;
3657 #endif
3658  }
3659 
3660  if(aux)
3661  {
3662  m_sizeTask = m_taskData.getSize()/MB;
3663  m_sizeSplit = m_splitData.getSize()/MB;
3664 #ifdef MALLOC_SCRATCHPAD
3665 #if !defined(ATOMIC_MALLOC) && !defined(SCATTER_ALLOC) && !defined(CIRCULAR_MALLOC)
3666  size_t heapSize;
3667  cuCtxGetLimit(&heapSize, CU_LIMIT_MALLOC_HEAP_SIZE);
3668  m_heap = heapSize/MB;
3669 #else
3670  m_heap = (m_mallocData.getSize()+m_mallocData2.getSize())/MB;
3671 #endif
3672 #else
3673  m_heap = 0.f;
3674 #endif
3675  }
3676 }
3677 
3678 void CudaNoStructTracer::prepareDynamicMemory()
3679 {
3680  // Set the memory limit according to triangle count
3681  //U64 allocSize = (U64)m_trisIndex.getSize()*15ULL;
3682  //U64 allocSize = (U64)m_trisIndex.getSize()*20ULL;
3683  U64 allocSize = (U64)m_trisIndex.getSize()*150ULL;
3684  //U64 allocSize = (U64)m_trisIndex.getSize()*200ULL;
3685 
3686 #if defined(SCATTER_ALLOC) || defined(FDG_ALLOC)
3687  U64 allocSize = max(allocSize, 8ULL*1024ULL*1024ULL);
3688 #endif
3689 
3690 #if !defined(ATOMIC_MALLOC) && !defined(SCATTER_ALLOC) && !defined(CIRCULAR_MALLOC)
3691  cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, allocSize);
3692 #elif defined(ATOMIC_MALLOC) || defined(CIRCULAR_MALLOC)
3693  m_mallocData.resizeDiscard(allocSize);
3694 #ifdef WITH_SCATTER_ALLOC
3695  m_mallocData2.resizeDiscard(allocSize);
3696 #endif
3697 #elif defined(SCATTER_ALLOC)
3698  m_mallocData.resizeDiscard(allocSize);
3699 #endif
3700 
3701 #if defined(SCATTER_ALLOC) || defined(WITH_SCATTER_ALLOC)
3702  // CUDA Driver API cannot deal with templates -> use C++ mangled name
3703  CUfunction initHeap = m_module->getKernel("_ZN8GPUTools8initHeapILj4096ELj8ELj16ELj2ELb0ELb1EEEvPNS_10DeviceHeapIXT_EXT0_EXT1_EXT2_EXT3_EXT4_EEEPvj", 2*sizeof(CUdeviceptr)+sizeof(int));
3704  if (!initHeap)
3705  fail("Scatter alloc initialization kernel not found!");
3706 
3707  int offset = 0;
3708  offset += m_module->setParamPtr(initHeap, offset, m_module->getGlobal("theHeap").getMutableCudaPtr());
3709 #ifdef WITH_SCATTER_ALLOC
3710  offset += m_module->setParamPtr(initHeap, offset, m_mallocData2.getMutableCudaPtr());
3711 #else
3712  offset += m_module->setParamPtr(initHeap, offset, m_mallocData.getMutableCudaPtr());
3713 #endif
3714  offset += m_module->setParami(initHeap, offset, allocSize);
3715  F32 initTime = m_module->launchKernelTimed(initHeap, Vec2i(256,1), Vec2i(1, 1));
3716 
3717  printf("Scatter alloc initialized in %f\n", initTime);
3718 #endif
3719 }
3720 
3721 int CudaNoStructTracer::setDynamicMemory()
3722 {
3723  int baseOffset = 0;
3724 #if !defined(ATOMIC_MALLOC) && !defined(CIRCULAR_MALLOC)
3725  CUfunction kernelAlloc = m_module->getKernel("allocFreeableMemory", 2*sizeof(int));
3726  if (!kernelAlloc)
3727  fail("Memory allocation kernel not found!");
3728 
3729  int offset = 0;
3730  offset += m_module->setParami(kernelAlloc, offset, m_numTris);
3731  offset += m_module->setParami(kernelAlloc, offset, 0);
3732  F32 allocTime = m_module->launchKernelTimed(kernelAlloc, Vec2i(1,1), Vec2i(1, 1));
3733 
3734 #ifndef BENCHMARK
3735  printf("Memory allocated in %f\n", allocTime);
3736 #endif
3737 #else
3738  // Set the heapBase, heapOffset and heapSize
3739  char*& heapBase = *(char**)m_module->getGlobal("g_heapBase").getMutablePtr();
3740  heapBase = (char*)m_mallocData.getMutableCudaPtr();
3741  int& heapOffset = *(int*)m_module->getGlobal("g_heapOffset").getMutablePtr();
3742 #if SCAN_TYPE < 2
3743  heapOffset = 4*m_numTris*sizeof(int);
3744 #else
3745  heapOffset = m_numTris*sizeof(int);
3746 #endif
3747 
3748  int& heapSize = *(int*)m_module->getGlobal("g_heapSize").getMutablePtr();
3749  heapSize = m_mallocData.getSize();
3750 
3751 #if defined(CIRCULAR_MALLOC)
3752 #ifndef DOUBLY_LINKED
3753  int headerSize = 2*sizeof(int);
3754 #else
3755  int headerSize = 3*sizeof(int);
3756 #endif
3757  heapOffset += headerSize;
3758 
3759  int& heapLock = *(int*)m_module->getGlobal("g_heapLock").getMutablePtr();
3760  heapLock = 0;
3761 
3762 #ifndef DOUBLY_LINKED
3763  Vec2i first(LockType_Set, heapOffset); // Locked, allocated memory for parent
3764  m_mallocData.setRange(0, &first, sizeof(Vec2i)); // Set the first header
3765 #else
3766  Vec3i first(LockType_Set, heapSize-headerSize, heapOffset); // Locked, allocated memory for parent
3767  m_mallocData.setRange(0, &first, sizeof(Vec3i)); // Set the first header
3768 #endif
3769 
3770 #ifdef GLOBAL_HEAP_LOCK
3771 #ifndef DOUBLY_LINKED
3772  Vec2i second(LockType_Free, heapSize-headerSize); // Unlocked, next at the end of allocated memory
3773  m_mallocData.setRange(heapOffset, &second, sizeof(Vec2i)); // Set the second header
3774 #else
3775  Vec3i second(LockType_Free, 0, heapSize-headerSize); // Unlocked, next at the end of allocated memory
3776  m_mallocData.setRange(heapOffset, &second, sizeof(Vec3i)); // Set the second header
3777 #endif
3778 #else
3779 #if 0
3780  // Create regular chunks
3781  int numChunks = m_mallocData.getSize()/heapOffset;
3782  for(int i = 1; i < numChunks; i++)
3783  {
3784  #ifndef DOUBLY_LINKED
3785  Vec2i next(0, (i+1)*heapOffset); // Unlocked, next at the multiple of heapOffset
3786  m_mallocData.setRange(i*heapOffset, &next, sizeof(Vec2i)); // Set the next header
3787  #else
3788  Vec3i next(0, (i-1)*heapOffset, (i+1)*heapOffset); // Unlocked, next at the multiple of heapOffset
3789  m_mallocData.setRange(i*heapOffset, &next, sizeof(Vec3i)); // Set the next header
3790  #endif
3791  }
3792 
3793 #else
3794  // Create hierarchical chunks
3795  int delta = ((int)(heapOffset)+headerSize+3) & -4;
3796  int prevOfs = 0;
3797  int ofs;
3798  int i = 0;
3799  int lvl = 2;
3800  for(ofs = heapOffset; true; ofs += delta, i++)
3801  {
3802  if(i == lvl) // New level in BFS order
3803  {
3804  delta = ((int)(delta * 0.8f)+headerSize+3) & -4;
3805  i = 0;
3806  lvl *= 2;
3807  }
3808 
3809  if(ofs+delta >= heapSize-2*headerSize) // We cannot make another chunk
3810  break;
3811 
3812 #ifndef DOUBLY_LINKED
3813  Vec2i next(LockType_Free, ofs+delta); // Unlocked, next at the multiple of heapOffset
3814  m_mallocData.setRange(ofs, &next, sizeof(Vec2i)); // Set the next header
3815 #else
3816  Vec3i next(LockType_Free, prevOfs, ofs+delta); // Unlocked, next at the multiple of heapOffset
3817  m_mallocData.setRange(ofs, &next, sizeof(Vec3i)); // Set the next header
3818 #endif
3819 
3820  prevOfs = ofs;
3821  }
3822 #endif
3823 
3824 #ifndef DOUBLY_LINKED
3825  Vec2i last(LockType_Free, heapSize-headerSize); // Unlocked, next at the end of allocated memory
3826  m_mallocData.setRange(ofs, &last, sizeof(Vec2i)); // Set the last header
3827 #else
3828  Vec3i last(LockType_Free, prevOfs, heapSize-headerSize); // Unlocked, next at the end of allocated memory
3829  m_mallocData.setRange(ofs, &last, sizeof(Vec3i)); // Set the last header
3830 #endif
3831 #endif
3832 
3833 #ifndef DOUBLY_LINKED
3834  Vec2i tail(LockType_Set, 0); // Locked, next at the start of heap
3835  m_mallocData.setRange(heapSize-headerSize, &tail, sizeof(Vec2i)); // Set the last header
3836 #else
3837  Vec3i tail(LockType_Set, ofs, 0); // Locked, next at the start of heap
3838  m_mallocData.setRange(heapSize-headerSize, &tail, sizeof(Vec3i)); // Set the last header
3839 #endif
3840 
3841  // Offset of the memory allocation
3842  baseOffset = headerSize;
3843 
3844 #ifdef WITH_SCATTER_ALLOC
3845  // With scatter alloc
3846  char*& heapBase2 = *(char**)m_module->getGlobal("g_heapBase2").getMutablePtr();
3847  heapBase2 = (char*)m_mallocData2.getMutableCudaPtr();
3848 #endif
3849 #endif
3850 
3851  int offset;
3852 #endif
3853 
3854  CUfunction kernelMemCpyIndex = m_module->getKernel("MemCpyIndex", sizeof(CUdeviceptr)+sizeof(int));
3855  if (!kernelMemCpyIndex)
3856  fail("Memory copy kernel not found!");
3857 
3858  int memSize = m_trisIndex.getSize()/sizeof(int);
3859  offset = 0;
3860  offset += m_module->setParamPtr(kernelMemCpyIndex, offset, m_trisIndex.getCudaPtr());
3861  offset += m_module->setParami(kernelMemCpyIndex, offset, memSize);
3862  F32 memcpyTime = m_module->launchKernelTimed(kernelMemCpyIndex, Vec2i(256,1), Vec2i((memSize-1+256)/256, 1));
3863 
3864 #ifndef BENCHMARK
3865  printf("Triangle indices copied in %f\n", memcpyTime);
3866 #endif
3867 
3868 #ifdef SCATTER_ALLOC
3869  CUdeviceptr& heap = *(CUdeviceptr*)m_module->getGlobal("g_heapBase").getMutablePtr();
3870  CUdeviceptr base = m_mallocData.getMutableCudaPtr();
3871  baseOffset = heap - base;
3872  heap = base;
3873  //if(heap != m_mallocData.getCudaPtr())
3874  // printf("Wrong base address!\n");
3875 #endif
3876 
3877  return baseOffset;
3878 }
3879 
3880 
3882 {
3883  // Convert woop triangles
3884  CUfunction kernelCreateWoop = m_module->getKernel("createWoop", 2*sizeof(CUdeviceptr)+sizeof(int));
3885  if (!kernelCreateWoop)
3886  fail("Regular triangle to Woop triangle conversion kernel not found!");
3887 
3888  int offset = 0;
3889  offset += m_module->setParamPtr(kernelCreateWoop, offset, m_trisCompact.getCudaPtr());
3890  offset += m_module->setParamPtr(kernelCreateWoop, offset, m_trisCompactOut.getMutableCudaPtr());
3891  offset += m_module->setParami(kernelCreateWoop, offset, m_numTris);
3892  F32 woopTime = m_module->launchKernelTimed(kernelCreateWoop, Vec2i(256,1), Vec2i((m_numTris-1+256)/256, 1));
3893 
3894 #ifndef BENCHMARK
3895  printf("Woop triangles created in %f\n", woopTime);
3896 #endif
3897 
3898  return woopTime;
3899 }
3900 
3901 void CudaNoStructTracer::resetBuffers(bool resetADSBuffers)
3902 {
3903  // Reset buffers so that reuse of space does not cause timing disturbs
3904  if(resetADSBuffers)
3905  {
3906  m_bvhData.reset();
3907  m_trisCompactOut.reset();
3908  m_trisIndexOut.reset();
3909  }
3910 
3911  m_mallocData.reset();
3912  m_mallocData2.reset();
3913  m_taskData.reset();
3914  m_splitData.reset();
3915 
3916  m_raysIndex.reset();
3917 
3918  m_ppsTris.reset();
3919  m_ppsTrisIndex.reset();
3920  m_sortTris.reset();
3921  m_ppsRays.reset();
3922  m_ppsRaysIndex.reset();
3923  m_sortRays.reset();
3924 }
3925 
3927 {
3928  // Save sizes of auxiliary buffers so that they can be printed
3929  saveBufferSizes(false, true);
3930  // Free auxiliary buffers
3931  resetBuffers(false);
3932 
3933  // Resize to exact memory
3934  U32 nN, nL, eL, sT, bT, tT, sTr;
3935  getStats(nN, nL, eL, sT, bT, tT, sTr);
3936 #ifdef COMPACT_LAYOUT
3937  m_bvhData.resize(nN * sizeof(CudaBVHNode));
3938  m_trisCompactOut.resize(tT*3*sizeof(float4) + nL*sizeof(float4));
3939  m_trisIndexOut.resize(tT*3*sizeof(int) + nL*sizeof(int));
3940 #else
3941  m_bvhData.resize((nN + nL) * sizeof(CudaBVHNode));
3942 #endif
3943 
3944  // Save sizes of ads buffers so that they can be printed
3945  saveBufferSizes(true, false);
3946 }
3947 
3949 {
3950  // Save sizes of auxiliary buffers so that they can be printed
3951  saveBufferSizes(false, true);
3952  // Free auxiliary buffers
3953  resetBuffers(false);
3954 
3955  // Resize to exact memory
3956  U32 nN, nL, eL, sT, nT, tT, sTr;
3957  getStats(nN, nL, eL, sT, nT, tT, sTr);
3958 #ifndef INTERLEAVED_LAYOUT
3959 #ifndef COMPACT_LAYOUT
3960  getStats(nN, nL, eL, sT, nT, tT, sTr, false);
3961  m_bvhData.resize((nN + nL) * sizeof(CudaKdtreeNode));
3962  m_trisCompactOut.resize(tT*3*sizeof(float4));
3963  m_trisIndexOut.resize(tT*3*sizeof(int));
3964 #else
3965 #ifdef DUPLICATE_REFERENCES
3966  m_bvhData.resize(nN * sizeof(CudaKdtreeNode));
3967  m_trisCompactOut.resize(tT*3*sizeof(float4) + nL*sizeof(float4));
3968  m_trisIndexOut.resize(tT*3*sizeof(int) + nL*sizeof(int));
3969 #else
3970  m_bvhData.resize(nN * sizeof(CudaKdtreeNode));
3971  m_trisIndexOut.resize(tT*sizeof(int) + nL*sizeof(int));
3972 #endif
3973 #endif
3974 #else
3975  //m_bvhData.resize((nN + nL) * sizeof(CudaKdtreeNode) + tT*3*sizeof(float4) + tT*3*sizeof(int));
3976  m_bvhData.resize(nT);
3977 #endif
3978 
3979  // Save sizes of ads buffers so that they can be printed
3980  saveBufferSizes(true, false);
3981 }
3982 
3983 void CudaNoStructTracer::getStats(U32& nodes, U32& leaves, U32& emptyLeaves, U32& stackTop, U32& nodeTop, U32& tris, U32& sortedTris, bool sub)
3984 {
3985  TaskStackBVH tasks = *(TaskStackBVH*)m_module->getGlobal("g_taskStackBVH").getPtr();
3986 
3987 #ifndef INTERLEAVED_LAYOUT
3988 #ifndef BVH_COUNT_NODES
3989 #ifndef COMPACT_LAYOUT
3990  nodes = tasks.nodeTop / 2;
3991  leaves = tasks.nodeTop - nodes;
3992 #else
3993  nodes = tasks.nodeTop;
3994  leaves = tasks.triTop;
3995  emptyLeaves = 0;
3996 #endif
3997 #else // BVH_COUNT_NODES
3998  nodes = tasks.numNodes;
3999  leaves = tasks.numLeaves;
4000  emptyLeaves = tasks.numEmptyLeaves;
4001 #endif // BVH_COUNT_NODES
4002 
4003 #ifdef COMPACT_LAYOUT
4004  tris = tasks.triTop;
4005  if(sub)
4006  tris -= (leaves-emptyLeaves);
4007 #ifdef DUPLICATE_REFERENCES
4008  tris /= 3;
4009 #endif
4010 #else
4011  if(sub)
4012  {
4013  tris = m_numTris;
4014  }
4015  else
4016  {
4017  tris = tasks.triTop;
4018  tris /= 3;
4019  }
4020 #endif
4021 #else
4022 #ifndef BVH_COUNT_NODES
4023  nodes = tasks.nodeTop / 2;
4024  leaves = tasks.nodeTop - nodes;
4025  emptyLeaves = 0;
4026 #else // BVH_COUNT_NODES
4027  nodes = tasks.numNodes;
4028  leaves = tasks.numLeaves;
4029  emptyLeaves = tasks.numEmptyLeaves;
4030 #endif // BVH_COUNT_NODES
4031 
4032  tris = tasks.nodeTop - (nodes+leaves)*sizeof(CudaKdtreeNode); // Subtract node memory
4033  tris /= 3*sizeof(float4)+sizeof(int); // Only approximate because of padding
4034 #endif
4035 
4036  nodeTop = tasks.nodeTop;
4037  sortedTris = tasks.numSortedTris;
4038  stackTop = tasks.top;
4039 }
4040 
4041 void CudaNoStructTracer::getSizes(F32& task, F32& split, F32& ads, F32& tri, F32& triIdx, F32& heap)
4042 {
4043  task = m_sizeTask;
4044  split = m_sizeSplit;
4045  ads = m_sizeADS;
4046  tri = m_sizeTri;
4047  triIdx = m_sizeTriIdx;
4048  heap = m_heap;
4049 }
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int offset
Definition: DLLImports.inl:84
S32 getSize() const
Gets size of the buffer (number of rays).
Definition: RayBuffer.hpp:52
#define NULL
Definition: Defs.hpp:39
bool endsWith(const String &str) const
Definition: String.cpp:273
void setRange(S64 dstOfs, const void *src, S64 size, bool async=false, CUstream cudaStream=NULL)
Definition: Buffer.cpp:149
void traceOnDemandTrace(RayBuffer &rays, F32 &GPUmegakernel, F32 &CPUmegakernel, F32 &GPUtravKernel, F32 &CPUtravKernel, int &buildNodes, RayStats *stats=NULL)
float GetFloat(const char *name, const bool isFatal=false) const
void clear(int value=0)
Definition: Buffer.hpp:100
CudaModule * compile(bool enablePrints=true, bool autoFail=true)
int GetInt(const char *name, const bool isFatal=false) const
F32 traceBatchKdtree(RayBuffer &rays, RayStats *stats=NULL)
float t
Definition: Util.hpp:84
void unstart(void)
Definition: Timer.hpp:43
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N
Definition: DLLImports.inl:132
#define TASK_SIZE
CUdeviceptr getCudaPtr(S64 ofs=0)
Definition: Buffer.hpp:108
unsigned __int64 U64
Definition: Defs.hpp:97
U32 toABGR(void) const
Definition: Math.cpp:45
S32 numTriangleTests
Total number of ray-triangle tests.
Definition: BVH.hpp:63
S32 numRays
Total number of rays.
Definition: BVH.hpp:62
void start(void)
Definition: Timer.hpp:42
CudaKernel getKernel(const String &name)
Definition: CudaModule.cpp:80
FW_CUDA_FUNC Vec3f getXYZ(void) const
Definition: Math.hpp:365
FW_CUDA_FUNC F32 sqrt(F32 a)
Definition: Math.hpp:39
int setParami(CUfunction kernel, int offset, S32 value)
Definition: CudaModule.cpp:90
Definition: Util.hpp:62
S64 getSize(void) const
Definition: Buffer.hpp:69
Structure holding ray statistics. Also provides print to the console. These statistics are used in a ...
Definition: BVH.hpp:45
F32 traceBatch(RayBuffer &rays)
const U8 * getPtr(S64 ofs=0)
Definition: Buffer.hpp:106
void clearOptions(void)
void setOwner(Module module, bool modify, bool async=false, CUstream cudaStream=NULL, S64 validSize=-1)
Definition: Buffer.cpp:220
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx cuCtxSynchronize
Definition: DLLImports.inl:58
FW_CUDA_FUNC T dot(const VectorBase< T, L, S > &a, const VectorBase< T, L, V > &b)
Definition: Math.hpp:477
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei GLint GLenum GLenum const GLvoid pixels GLint GLsizei const GLfloat value GLint GLfloat GLfloat v1 GLint GLfloat GLfloat GLfloat v2 GLint GLsizei const GLfloat value GLint GLsizei GLboolean const GLfloat value GLuint program GLuint GLfloat x
Definition: DLLImports.inl:363
Buffer & getRayBuffer()
Gets ray buffer.
Definition: RayBuffer.hpp:167
bool getNeedClosestHit() const
Returns whether the closest hit is needed.
Definition: RayBuffer.hpp:150
void setTexRef(const String &name, Buffer &buf, CUarray_format format, int numComponents)
Definition: CudaModule.cpp:193
void define(const String &key, const String &value="")
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei GLint GLenum GLenum const GLvoid pixels GLint GLsizei const GLfloat value GLint GLfloat GLfloat v1 GLint GLfloat GLfloat GLfloat v2 GLint GLsizei const GLfloat value GLint GLsizei GLboolean const GLfloat value GLuint program GLuint GLfloat GLfloat GLfloat z
Definition: DLLImports.inl:363
static int getComputeCapability(void)
Definition: CudaModule.cpp:508
S32 getS32(void)
Definition: Random.hpp:54
Vec3f origin
Definition: Util.hpp:67
FW_CUDA_FUNC T sum(const VectorBase< T, L, S > &v)
Definition: Math.hpp:463
float F32
Definition: Defs.hpp:89
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei GLint GLenum GLenum const GLvoid pixels GLint GLsizei const GLfloat value GLint GLfloat GLfloat v1 GLint GLfloat GLfloat GLfloat v2 GLint GLsizei const GLfloat value GLint GLsizei GLboolean const GLfloat value GLuint program GLuint GLfloat GLfloat y
Definition: DLLImports.inl:363
static Environment * GetSingleton()
F32 traceOnDemandBVH(RayBuffer &rays, bool rebuild, int numRays=0)
CUdeviceptr getMutableCudaPtr(S64 ofs=0)
Definition: Buffer.hpp:112
F32 log2(F32 a)
Definition: Math.hpp:90
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v
Definition: DLLImports.inl:329
Buffer & getResultBuffer()
Gets ray result buffer.
Definition: RayBuffer.hpp:173
F32 traceBatchBVH(RayBuffer &rays, RayStats *stats=NULL)
Ray buffer class. Stores rays.
Definition: RayBuffer.hpp:38
U8 * getMutablePtr(S64 ofs=0)
Definition: Buffer.hpp:110
FW_CUDA_FUNC T min(const VectorBase< T, L, S > &v)
Definition: Math.hpp:461
Vec3f direction
Definition: Util.hpp:69
FW_CUDA_FUNC T max(const VectorBase< T, L, S > &v)
Definition: Math.hpp:462
CudaNoStructTracer(MiniMax::Scene &scene, F32 epsilon)
float tmax
Definition: Util.hpp:70
signed int S32
Definition: Defs.hpp:88
F32 end(void)
Definition: Timer.hpp:69
String sprintf(const char *fmt,...)
Definition: Defs.cpp:241
void getStats(U32 &nodes, U32 &leaves, U32 &emptyLeaves, U32 &stackTop, U32 &nodeTop, U32 &tris, U32 &sortedTris, bool sub=true)
bool GetFloatValue(const char *name, float &value, const bool isFatal=false) const
S32 numNodeTests
Total number of ray-node tests.
Definition: BVH.hpp:64
signed __int64 S64
Definition: Defs.hpp:98
unsigned int U32
Definition: Defs.hpp:85
Class holding information about a split of a BVH node.
Definition: BVHNode.hpp:58
Buffer & getGlobal(const String &name)
Definition: CudaModule.cpp:117
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction f
Definition: DLLImports.inl:88
FW_CUDA_FUNC F32 cross(const Vec2f &a, const Vec2f &b)
Definition: Math.hpp:481
void clearRange(S64 dstOfs, int value, S64 size, bool async=false, CUstream cudaStream=NULL)
Definition: Buffer.cpp:202
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei n
Definition: DLLImports.inl:325
void printf(const char *fmt,...)
Definition: Defs.cpp:225
String & appendf(const char *fmt,...)
Definition: String.cpp:207
F32 traceOnDemandKdtree(RayBuffer &rays, bool rebuild, int numRays=0)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei width
Definition: DLLImports.inl:347
void resetBuffers(bool resetADSBuffers)
FW_CUDA_FUNC F64 log(F64 a)
Definition: Math.hpp:47
void getSizes(F32 &task, F32 &split, F32 &ads, F32 &tri, F32 &triIdx, F32 &heap)
void failIfError(void)
Definition: Defs.cpp:361
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei GLsizei height
Definition: DLLImports.inl:347
FW_CUDA_FUNC S normalized(T len=(T) 1) const
Definition: Math.hpp:144
U32 getU32(void)
Definition: Random.hpp:51
int setParamPtr(CUfunction kernel, int offset, CUdeviceptr value)
Definition: CudaModule.cpp:108
const float MB
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei depth
Definition: DLLImports.inl:349
bool GetIntValue(const char *name, int &value, const bool isFatal=false) const
static void staticInit(void)
Definition: CudaModule.cpp:311
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type
Definition: DLLImports.inl:323
void fail(const char *fmt,...)
Definition: Defs.cpp:304
void addOptions(const String &options)
void resizeDiscard(S64 size)
Definition: Buffer.hpp:83
void clearDefines(void)
void setSourceFile(const String &path)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr size
Definition: DLLImports.inl:319
void resize(S64 size)
Definition: Buffer.hpp:82
void setCachePath(const String &path)
void reset(U32 hints, int align)
Definition: Buffer.hpp:76