NTrace
GPU ray tracing framework
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
HLBVHBuilder.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2009-2010 NVIDIA Corporation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
18 #include "gpu/CudaCompiler.hpp"
19 #include "base/Math.hpp"
20 #include "radixSort.hpp"
21 
22 #include "bvh/HLBVH/emitTreeKernel.cuh"
23 
24 #include <cuda_runtime_api.h>
25 
26 using namespace FW;
27 #define BENCHMARK
28 const float MB = (float)(1024*1024);
29 
30 //------------------------------------------------------------------------
31 
33  : CudaBVH(BVHLayout_Compact), m_scene(scene),m_platform(platform),m_params(params)
34 {
35  //m_params.epsilon = 0.f;
36  //cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 1024*1024*500);
37 
38  m_sizeTask = 0;
39  m_sizeSplit = 0;
40  m_sizeADS = 0;
41  m_sizeTri = 0;
42  m_sizeTriIdx = 0;
43 
44  if (!params.hlbvh || params.hlbvhBits == 10)
45  buildLBVH();
46  else
47  buildHLBVH();
48 }
49 
50 //------------------------------------------------------------------------
51 
53 {
54 }
55 
56 //------------------------------------------------------------------------
57 
58 void HLBVHBuilder::getStats(U32& nodes, U32& leaves, U32& nodeTop)
59 {
60  nodes = m_nodes;
61  leaves = m_leafs;
62  nodeTop = m_nodes;
63 }
64 
65 //------------------------------------------------------------------------
66 
68 {
69  CudaKernel kernelMorton = module->getKernel("calcMorton");
70 
71  // scene AABB
72  Vec3f sceneMin, sceneMax;
73  m_scene->getBBox(sceneMin, sceneMax);
74 
76  const float k2 = 1024.0f; // 2^n (n = 10)
77  //const U32 k2 = (1 << n);
78  //const U32 k2 = pow(2,n);
79  Vec3f step = (sceneMax - sceneMin) / k2;
80 
81  kernelMorton.setParams(triCnt, sceneMin.x, sceneMin.y, sceneMin.z, step.x, step.y, step.z);
82  F32 cudaTime = kernelMorton.launchTimed(triCnt, Vec2i(BLOCK_SIZE,1));
83  cudaTotalTime += cudaTime;
84 #ifndef BENCHMARK
85  printf("? Morton codes: %f [%f]\n", cudaTime, cudaTotalTime);
86  printf("! Morton codes: %f [%f]\n", m_progressTimer.end(), m_progressTimer.getTotal());
87 #endif
88 
90  cudaTime = radixSortCuda(triMorton.getMutableCudaPtr(), triIdx.getMutableCudaPtr(), triCnt);
91  cudaTotalTime += cudaTime;
92 #ifndef BENCHMARK
93  printf("? Radix sort: %f [%f]\n", cudaTime, cudaTotalTime);
94  printf("! Radix sort: %f [%f]\n", m_progressTimer.end(), m_progressTimer.getTotal());
95 #endif
96 }
97 
98 void HLBVHBuilder::createClustersC(Buffer &triMorton, S32 d, Buffer &clusters)
99 {
100  CudaKernel kernelClusterAABB = module->getKernel("clusterAABB");
101 
103  clusters.resize((triCnt+1) * sizeof(U32));
104 
105  F32 cudaTime = createClusters(triMorton.getMutableCudaPtr(), triCnt, d, clusters.getMutableCudaPtr(), cluster_cnt);
106  cudaTotalTime += cudaTime;
107 
108  clusters.resize((cluster_cnt+1) * sizeof(U32));
109  *(CUdeviceptr*)module->getGlobal("g_clsStart").getMutablePtr() = clusters.getCudaPtr();
110 
111 #ifndef BENCHMARK
112  printf("Clusters: %d\n", cluster_cnt);
113  printf("? Cluster create: %f [%f]\n", cudaTime, cudaTotalTime);
114  printf("! Cluster create: %f [%f]\n", m_progressTimer.end(), m_progressTimer.getTotal());
115 #endif
116 
118  cluster_bb.resize(cluster_cnt*sizeof(AABB));
119  cluster_bin_id.resize(cluster_cnt*sizeof(S32)*3);
120  cluster_split_id.resize(cluster_cnt*sizeof(S32));
121 
122  *(CUdeviceptr*)module->getGlobal("g_clsAABB").getMutablePtr() = cluster_bb.getMutableCudaPtr(); // cluster AABB
123  *(CUdeviceptr*)module->getGlobal("g_clsBinId").getMutablePtr() = cluster_bin_id.getMutableCudaPtr(); // cluster bin ID
124  *(CUdeviceptr*)module->getGlobal("g_clsSplitId").getMutablePtr() = cluster_split_id.getMutableCudaPtr(); // cluster node ID
125 
126  cluster_split_id.clear();
127 
128 #if CLUSTER_AABB == 3
129  CudaKernel kernelInitBins = module->getKernel("initClusterAABB");
130 
131  kernelInitBins.setParams(cluster_cnt);
132  cudaTime = kernelInitBins.launchTimed(cluster_cnt, Vec2i(BLOCK_SIZE,1));
133 #endif
134 
135  kernelClusterAABB.setParams(cluster_cnt, triCnt);
136 #if CLUSTER_AABB == 0
137  cudaTime += kernelClusterAABB.launchTimed(cluster_cnt, Vec2i(BLOCK_SIZE,1));
138 #elif CLUSTER_AABB == 1
139  int warpsPerBlock = BLOCK_SIZE/WARP_SIZE;
140  kernelClusterAABB.setGridExact(Vec2i(WARP_SIZE, warpsPerBlock), Vec2i((cluster_cnt-1+warpsPerBlock)/warpsPerBlock, 1));
141  cudaTime += kernelClusterAABB.launchTimed();
142 #elif CLUSTER_AABB == 2
143  kernelClusterAABB.setGridExact(Vec2i(BLOCK_SIZE,1), Vec2i(cluster_cnt, 1));
144  cudaTime += kernelClusterAABB.launchTimed();
145 #elif CLUSTER_AABB == 3
146  kernelClusterAABB.setGridExact(Vec2i(BLOCK_SIZE,1), Vec2i(NUM_BLOCKS, 1));
147  cudaTime += kernelClusterAABB.launchTimed();
148 #endif
149  cudaTotalTime += cudaTime;
150 #ifndef BENCHMARK
151  printf("? Cluster AABB: %f [%f]\n", cudaTime, cudaTotalTime);
152  printf("! Cluster AABB: %f [%f]\n", m_progressTimer.end(), m_progressTimer.getTotal());
153 #endif
154 }
155 
156 void HLBVHBuilder::buildTopLevel(Buffer *ooq, U32 &nodeWritten, U32 &nodeCreated, Buffer &clusters)
157 {
158  CudaKernel kernelSAHInitBins = module->getKernel("initBins");
159 
160  CudaKernel kernelSAHFillBins = module->getKernel("fillBins");
161 
162  CudaKernel kernelSAHSplit = module->getKernel("findSplit");
163 
164  CudaKernel kernelSAHDistribute = module->getKernel("distribute");
165 
166  // scene AABB
167  Vec3f sceneMin, sceneMax;
168  m_scene->getBBox(sceneMin, sceneMax);
169 
171  U32 sahCreated = 1;
172  S64 bufferSize = 2*cluster_cnt;
173 
174  Buffer qs0_bb, qs0_cls, qs0_id, qs0_plane, qs0_child;
175  Buffer qs1_bb, qs1_cls, qs1_id, qs1_plane, qs1_child;
176  Buffer *qsi_bb, *qsi_cls, *qsi_id, *qsi_plane, *qsi_child;
177  Buffer *qso_bb, *qso_cls, *qso_id, *qso_plane, *qso_child;
178  Buffer *qst_bb, *qst_cls, *qst_id, *qst_plane, *qst_child;
179 
180  qso_bb = &qs1_bb;
181  qso_cls = &qs1_cls;
182  qso_id = &qs1_id;
183  qso_plane = &qs1_plane;
184  qso_child = &qs1_child;
185 
186  qsi_bb = &qs0_bb;
187  qsi_cls = &qs0_cls;
188  qsi_id = &qs0_id;
189  qsi_plane = &qs0_plane;
190  qsi_child = &qs0_child;
191 
192  // resize input queue
193  qsi_bb->resize(bufferSize * sizeof(AABB));
194  qsi_cls->resize(bufferSize * sizeof(S32));
195  qsi_id->resize(bufferSize * sizeof(S32));
196  qsi_plane->resize(bufferSize * sizeof(S32));
197  qsi_child->resize(bufferSize * sizeof(S32));
198 
199  // resize output queue
200  qso_bb->resizeDiscard(bufferSize * sizeof(AABB));
201  qso_cls->resizeDiscard(bufferSize * sizeof(S32));
202  qso_id->resizeDiscard(bufferSize * sizeof(S32));
203  qso_plane->resizeDiscard(bufferSize * sizeof(S32));
204  qso_child->resizeDiscard(bufferSize * sizeof(S32));
205 
206  m_sizeTask += (qs0_bb.getSize() + qs1_bb.getSize()
207  + qs0_cls.getSize() + qs1_cls.getSize()
208  + qs0_id.getSize() + qs1_id.getSize()
209  + qs0_plane.getSize() + qs1_plane.getSize()
210  + qs0_child.getSize() + qs1_child.getSize()) / MB;
211 
212  // insert first split task
213  *(S32*)qsi_id->getMutablePtr() = 0;
214  *(S32*)qsi_cls->getMutablePtr() = cluster_cnt;
215  *(S32*)qsi_child->getMutablePtr() = -1;
216 
217  memcpy((void*)qsi_bb->getMutablePtr(), &sceneMin, sizeof(sceneMin));
218  memcpy((void*)qsi_bb->getMutablePtr(sizeof(sceneMin)), &sceneMax, sizeof(sceneMax));
219 
220  *(CUdeviceptr*)module->getGlobal("g_ooq").getMutablePtr() = ooq->getMutableCudaPtr();
221  module->getGlobal("g_oofs").clear();
222 
223  // bins
224  Buffer bin_bb, bin_cnt;
225  // init bins
226  bin_bb.resizeDiscard(sizeof(AABB)*BIN_CNT*3*bufferSize);
227  bin_cnt.resizeDiscard(sizeof(S32)*BIN_CNT*3*bufferSize); //x,y,z * BIN_CNT
228 
229  // Save sizes
230  m_sizeSplit = (bin_bb.getSize() + bin_cnt.getSize()) / MB;
231 
232  *(CUdeviceptr*)module->getGlobal("g_binAABB").getMutablePtr() = bin_bb.getMutableCudaPtr();
233  *(CUdeviceptr*)module->getGlobal("g_binCnt").getMutablePtr() = bin_cnt.getMutableCudaPtr();
234 
235  U32 sahLvl = 0;
236  U32 sahTerminated = 0;
237  U32 oldTerminated = 0;
238  U32 sahWritten = 1;
239 
240  //Array<U32> lvlNodes;
241  //lvlNodes.add(sahCreated - sahSingles);
242 
243 #ifndef BENCHMARK
244  printf("! top-level SAH prepare: %f [%f]\n", m_progressTimer.end(), m_progressTimer.getTotal());
245 #endif
246 
247  F32 cudaTime = 0.0f;
248  while (sahCreated > 0) {
249  // fill task split input queue
250  *(CUdeviceptr*)module->getGlobal("g_qsiAABB").getMutablePtr() = qsi_bb->getMutableCudaPtr();
251  *(CUdeviceptr*)module->getGlobal("g_qsiCnt").getMutablePtr() = qsi_cls->getMutableCudaPtr();
252  *(CUdeviceptr*)module->getGlobal("g_qsiId").getMutablePtr() = qsi_id->getMutableCudaPtr();
253  *(CUdeviceptr*)module->getGlobal("g_qsiPlane").getMutablePtr() = qsi_plane->getMutableCudaPtr();
254  *(CUdeviceptr*)module->getGlobal("g_qsiChildId").getMutablePtr() = qsi_child->getMutableCudaPtr();
255 
256  // fill task split output queue
257  *(CUdeviceptr*)module->getGlobal("g_qsoAABB").getMutablePtr() = qso_bb->getMutableCudaPtr();
258  *(CUdeviceptr*)module->getGlobal("g_qsoCnt").getMutablePtr() = qso_cls->getMutableCudaPtr();
259  *(CUdeviceptr*)module->getGlobal("g_qsoId").getMutablePtr() = qso_id->getMutableCudaPtr();
260  *(CUdeviceptr*)module->getGlobal("g_qsoPlane").getMutablePtr() = qso_plane->getMutableCudaPtr();
261  *(CUdeviceptr*)module->getGlobal("g_qsoChildId").getMutablePtr() = qso_child->getMutableCudaPtr();
262 
263  //module->setParami(kernelSAHInitBins.getHandle(), 0, sahCreated*BIN_CNT*3);
264  kernelSAHInitBins.setParams(sahCreated*BIN_CNT*3);
265  //cudaTime += kernelSAHInitBins.launchTimed(Vec2i(BLOCK_SIZE,1), Vec2i((sahCreated*BIN_CNT*3-1+BLOCK_SIZE)/BLOCK_SIZE, 1));
266  cudaTime += kernelSAHInitBins.launchTimed(sahCreated*BIN_CNT*3, Vec2i(BLOCK_SIZE,1));
267 
268  // fill bins
269  kernelSAHFillBins.setParams(cluster_cnt);
270  cudaTime += kernelSAHFillBins.launchTimed(cluster_cnt, Vec2i(BLOCK_SIZE,1));
271 
272  // find SAH split
273  module->getGlobal("g_sahCreated").clear();
274 
275  kernelSAHSplit.setParams(sahCreated, sahWritten);
276  cudaTime += kernelSAHSplit.launchTimed(sahCreated, Vec2i(BLOCK_SIZE,1));
277 
278  // cluster distribution
279  kernelSAHDistribute.setParams(cluster_cnt, sahWritten);
280  cudaTime += kernelSAHDistribute.launchTimed(cluster_cnt, Vec2i(BLOCK_SIZE,1));
281 
282  sahTerminated = *(U32*)module->getGlobal("g_oofs").getPtr(); // terminated total
283  sahCreated = *(U32*)module->getGlobal("g_sahCreated").getPtr(); // created + terminated - leafs
284 
285  S32 terminated = sahTerminated - oldTerminated;
286  oldTerminated = sahTerminated;
287 
288  if (sahCreated != 0) // old sahCreated
289  lvlNodes.add(sahCreated); // - new sahSingles
290 
291  sahWritten += sahCreated;
292  sahCreated -= terminated;
293 
294  //printf("%2d: nodes %d, written %d[%d], ", sahLvl, sahCreated - sahSingles + terminated, sahCreated - sahSingles + terminated, sahWritten);
295  //printf("created %d, terminated %d, leafs %d, offset %d[%d]\n", sahCreated, terminated, *(U32*)(module->getGlobal("g_leafsPtr").getPtr(0)),
296  // sahCreated - sahSingles + terminated, sahWritten);
297 
298  qst_bb = qsi_bb; qst_cls = qsi_cls; qst_id = qsi_id; qst_plane = qsi_plane; qst_child = qsi_child;
299  qsi_bb = qso_bb; qsi_cls = qso_cls; qsi_id = qso_id; qsi_plane = qso_plane; qsi_child = qso_child;
300  qso_bb = qst_bb; qso_cls = qst_cls; qso_id = qst_id; qso_plane = qst_plane; qso_child = qst_child;
301 
302  sahLvl++;
303  //cout << "SAHLevel " << sahLvl << " nodes " << sahCreated << "\n";
304  //getchar();
305  }
306  cudaTotalTime += cudaTime;
307 
308 #ifndef BENCHMARK
309  //printf("SAH: written %d, leafs %d\n", sahWritten, sahTerminated);
310  //getNodeBuffer().resize(sahWritten*64);
311  printf("? top-level SAH: %f [%f]\n", cudaTime, cudaTotalTime);
312  printf("! top-level SAH: %f [%f]\n", m_progressTimer.end(), m_progressTimer.getTotal());
313 #endif
314 
315  nodeWritten = sahWritten;
316  nodeCreated = *(S32*)module->getGlobal("g_oofs").getPtr();
317 }
318 
319 void HLBVHBuilder::buildBottomLevel(Buffer *q_in, Buffer *q_out, U32 &nodeWritten, U32 &nodeCreated, U32 bOfs, U32 n_bits)
320 {
321  CudaKernel kernel = module->getKernel("emitTreeKernel");
322 
324  // LBVH
326 
327  Buffer *q_tmp;
328  S32 bit_ofs = bOfs;
329 
330 #ifndef BENCHMARK
331  printf("! LBVH prepare: %f [%f]\n", m_progressTimer.end(), m_progressTimer.getTotal());
332  //printf("Building LBVH... in %d, ofs %d\n", nodeCreated, nodeWritten);
333 #endif
334 
335  F32 cudaTime = 0.0f;
336  S32 level = 0;
337  while((level < (n_bits-bit_ofs)) && nodeCreated > 0) {
338  *(CUdeviceptr*)module->getGlobal("g_inQueueMem").getMutablePtr() = q_in->getCudaPtr();
339  *(CUdeviceptr*)module->getGlobal("g_outQueueMem").getMutablePtr() = q_out->getMutableCudaPtr();
340 
341  module->getGlobal("g_inQueuePtr").clear();
342  module->getGlobal("g_outQueuePtr").clear();
343 
344  kernel.setParams(n_bits - (level+1 + bit_ofs), nodeCreated, nodeWritten);
345  cudaTime += kernel.launchTimed(nodeCreated, Vec2i(BLOCK_SIZE, 1));
346 
347  nodeCreated = *(U32*)(module->getGlobal("g_outQueuePtr").getPtr(0));
348  lvlNodes.add(nodeCreated);
349 
350  nodeWritten += nodeCreated;
351  if (lvlNodes.getLast() == 0)
352  lvlNodes.removeLast();
353 
354  q_tmp = q_in;
355  q_in = q_out;
356  q_out = q_tmp;
357 
358  level++;
359  //printf("lvl %d: created %d, written %d, leafs %d\n", level, nodeCreated, nodeWritten, *(S32*)module->getGlobal("g_leafsPtr").getPtr());
360  //getchar();
361  }
362  cudaTotalTime += cudaTime;
363 
364 #ifndef BENCHMARK
365  //printf("LBVH: written %d, leafs %d [%d]\n", nodeWritten, leafs, nodeWritten-sahWritten);
366  printf("? bottom-level LBVH: %f [%f]\n", cudaTime, cudaTotalTime);
367  printf("! bottom-level LBVH: %f [%f]\n", m_progressTimer.end(), m_progressTimer.getTotal());
368  //printf("? LBVH build done: %f\n", cudaTime);
369  //printf("! LBVH build done: %f [%f], levels %d, nodes %d, leafs %d\n", m_progressTimer.end(), m_progressTimer.getTotal(), level, nodeWritten, leafs);
370 
371  //printf("Last level: %d of %d\n", level-1, n_bits-1);
372 #endif
373 
375  U32 leafs = *(U32*)(module->getGlobal("g_leafsPtr").getPtr(0));
376  //printf("Resizing node buffer %d -> %d [%d]\n", getNodeBuffer().getSize(), nodeWritten*64, getNodeBuffer().getSize() - nodeWritten*64);
377  getNodeBuffer().resize(nodeWritten*64);
378  //printf("Resizing woop buffer %d -> %d [%d]\n", getTriWoopBuffer().getSize(), triCnt*4*4*3+leafs*4*4, getTriWoopBuffer().getSize() - (triCnt*4*4*3+leafs*4*4));
379  getTriWoopBuffer().resize(triCnt*4*4*3+leafs*4*4);
380  //printf("Resizing index buffer %d -> %d [%d]\n", getTriIndexBuffer().getSize(), triCnt*4*3+leafs*4, getTriIndexBuffer().getSize() - (triCnt*4*3+leafs*4));
381  getTriIndexBuffer().resize(triCnt*4*3+leafs*4);
382 
383  m_sizeADS = getNodeBuffer().getSize() / MB;
384  m_sizeTri = getTriWoopBuffer().getSize() / MB;
385  m_sizeTriIdx = getTriIndexBuffer().getSize() / MB;
387 
388 #ifdef LEAF_HISTOGRAM
389  U32 *histogram = (U32*)(module->getGlobal("g_leafHist").getPtr());
390  printf("Leaf histogram\n");
391  U32 leafSum = 0;
392  U32 triSum = 0;
393  for(S32 i = 0; i <= m_params.leafSize; i++)
394  {
395  printf("%d: %d\n", i, histogram[i]);
396  leafSum += histogram[i];
397  triSum += i*histogram[i];
398  }
399  printf("Leafs total %d, average leaf %.2f\n", leafSum, (float)triSum/(float)leafSum);
400 #endif
401 }
402 
403 void HLBVHBuilder::calcAABB(U32 nodeWritten)
404 {
405  CudaKernel kernelAABB = module->getKernel("calcAABB");
406 
407  *(CUdeviceptr*)module->getGlobal("g_outNodes").getMutablePtr() = getNodeBuffer().getMutableCudaPtr();
408 
409 #ifdef MEASURE_STATS
410  module->getGlobal("g_ga").clear();
411  module->getGlobal("g_gb").clear();
412  module->getGlobal("g_gc").clear();
413  *(S32*)module->getGlobal("g_gd").getMutablePtr() = -1;
414 #endif
415 
416 #ifndef BENCHMARK
417  printf("! Refit nodes: %f [%f]\n", m_progressTimer.end(), m_progressTimer.getTotal());
418 #endif
419 
420  F32 cudaTime = 0.0f;
421  //S32 aa = 0, bb = 0;
422  for (S32 lvl = lvlNodes.getSize()-1; lvl >= 0; lvl--) { // > 0 dont recalculated top level AABBs? already done in SAH?
423  nodeWritten -= lvlNodes[lvl];
424 
425  kernelAABB.setParams(nodeWritten, lvlNodes[lvl]);
426  //printf("IN: %d %d\n", nodeWritten, lvlNodes[lvl]);
427  cudaTime += kernelAABB.launchTimed(lvlNodes[lvl], Vec2i(BLOCK_SIZE, 1));
428 
429  //printf("Level %d: time %f [nodes %d - %d] start %d, cnt %d", lvl, cudaTime, nodeWritten, nodeWritten+lvlNodes[lvl], nodeWritten, lvlNodes[lvl]);
430  //printf(", nodes %d, leafs %d\n", *(S32*)module->getGlobal("g_ga").getPtr() - aa, *(S32*)module->getGlobal("g_gb").getPtr() - bb);
431  //aa = *(S32*)module->getGlobal("g_ga").getPtr();
432  //bb = *(S32*)module->getGlobal("g_gb").getPtr();
433  //getchar();
434  }
435  cudaTotalTime += cudaTime;
436 #ifndef BENCHMARK
437  printf("? calcAABB GPU: %f [%f]\n", cudaTime, cudaTotalTime);
438  printf("! calcAABB GPU: %f [%f]\n", m_progressTimer.end(), m_progressTimer.getTotal());
439 #endif
440  //printf("calcAABB GPU: nodes %d, leafs %d, tris %d, biggest leaf: %d\n", *(S32*)module->getGlobal("g_ga").getPtr(),
441  //*(S32*)module->getGlobal("g_gb").getPtr(),
442  //*(S32*)module->getGlobal("g_gc").getPtr(),
443  //*(S32*)module->getGlobal("g_gd").getPtr());
444 }
445 
447 {
448  F32 cudaTime = 0.0f;
449  cudaTotalTime = 0.0f;
450 
451  // morton codes of order n => resulting in 3n bit grid
452  S32 n = 10;
453  S32 n_bits = 3 * n;
454 
455  // compile CUDA kernels
456  CudaCompiler m_compiler;
457  m_compiler.addOptions("-use_fast_math -Xptxas=\"-v\"");
458  m_compiler.setSourceFile("src/rt/bvh/HLBVH/emitTreeKernel.cu");
459  m_compiler.clearDefines();
460 
462  m_compiler.define("FERMI");
463 
464  module = m_compiler.compile();
465  failIfError();
466 
467  // Set leaf size and scene epsilon
468  *(int*)module->getGlobal("c_leafSize").getMutablePtr() = m_params.leafSize;
469  *(float*)module->getGlobal("c_epsilon").getMutablePtr() = m_params.epsilon;
470 
471 #ifdef LEAF_HISTOGRAM
472  module->getGlobal("g_leafHist").clear();
473 #endif
474 
475  CudaKernel kernelWoop = module->getKernel("calcWoopKernel");
476 
478 #ifndef BENCHMARK
479  printf("HLBVHBuilder LBVH: Build start\n");
480 #endif
482 
483  triCnt = m_scene->getNumTriangles();
484 
485  // upload scene triangles and vertices
486  *(CUdeviceptr*)module->getGlobal("g_tris").getMutablePtr() = m_scene->getTriVtxIndexBuffer().getCudaPtr();
487  *(CUdeviceptr*)module->getGlobal("g_verts").getMutablePtr() = m_scene->getVtxPosBuffer().getCudaPtr();
488 #ifndef BENCHMARK
489  printf("! Upload tris and verts: %f [total %f]\n", m_progressTimer.end(), m_progressTimer.getTotal());
490 #endif
491 
492  // morton
493  Buffer triMorton, triIdx;
494  triMorton.resize(triCnt * sizeof(U32));
495  triIdx.resize(triCnt * sizeof(S32));
496 
497  *(CUdeviceptr*)module->getGlobal("g_inTriMem").getMutablePtr() = triMorton.getMutableCudaPtr();
498  *(CUdeviceptr*)module->getGlobal("g_inTriIdxMem").getMutablePtr() = triIdx.getMutableCudaPtr();
499 #ifndef BENCHMARK
500  printf("! Alloc morton and index: %f [total %f]\n", m_progressTimer.end(), m_progressTimer.getTotal());
501 #endif
502 
503  calcMortonAndSort(triMorton, triIdx);
504 
505  m_sizeTask += triMorton.getSize() / MB;
506 
508 #ifdef WOOP_TRIANGLES
509  Buffer inWoop;
510  inWoop.resizeDiscard(triCnt*3*sizeof(Vec4i));
511  *(CUdeviceptr*)module->getGlobal("g_inWoopMem").getMutablePtr() = inWoop.getMutableCudaPtr();
512 
513  kernelWoop.setParams(triCnt);
514  cudaTime = kernelWoop.launchTimed(triCnt, Vec2i(BLOCK_SIZE,1));
515  cudaTotalTime += cudaTime;
516 #ifndef BENCHMARK
517  printf("? Woop data: %f [%f]\n", cudaTime, cudaTotalTime);
518  printf("! Woop data: %f [%f]\n", m_progressTimer.end(), m_progressTimer.getTotal());
519 #endif
520 
521  m_sizeTask += inWoop.getSize() / MB;
522 #endif
523 
524  // alloc out woop and idx buffers
525 #ifdef COMPACT_LAYOUT
526 #ifdef WOOP_TRIANGLES
527  getTriWoopBuffer().resizeDiscard(triCnt*(3+1)*sizeof(Vec4i)); // just to be sure
528  *(CUdeviceptr*)module->getGlobal("g_outWoopMem").getMutablePtr() = getTriWoopBuffer().getMutableCudaPtr();
529 #else
530  getTriBuffer().resizeDiscard(triCnt*(3+1)*sizeof(Vec4i)); // just to be sure
531  *(CUdeviceptr*)module->getGlobal("g_outWoopMem").getMutablePtr() = getTriBuffer().getMutableCudaPtr();
532 #endif
533  getTriIndexBuffer().resizeDiscard(triCnt*(3+1)*sizeof(S32));
534 #else
535 #ifdef WOOP_TRIANGLES
536  getTriWoopBuffer().resizeDiscard(triCnt*3*sizeof(Vec4i)); // just to be sure
537 #else
538  getTriBuffer().resizeDiscard(triCnt*3*sizeof(Vec4i)); // just to be sure
539  *(CUdeviceptr*)module->getGlobal("g_outWoopMem").getMutablePtr() = getTriBuffer().getMutableCudaPtr();
540 #endif
541  getTriIndexBuffer().resizeDiscard(triCnt*3*sizeof(S32));
542 #endif
543  *(CUdeviceptr*)module->getGlobal("g_outIdxMem").getMutablePtr() = getTriIndexBuffer().getMutableCudaPtr();
544 
545  module->getGlobal("g_leafsPtr").clear();
546 
547  lvlNodes.clear();
548  lvlNodes.add(1); // there is always 1 top root node
549 
550  U32 nodeWritten = 1;
551  U32 nodeCreated = 1;
552 
553  // insert top node
554  Buffer q0,q1;
555  initMemory(q0, q1, min(2, m_params.leafSize));
556  ((S32*)q0.getMutablePtr())[0] = 0;
557  ((S32*)q0.getMutablePtr())[1] = 0;
558  ((S32*)q0.getMutablePtr())[2] = triCnt;
559 
560  m_sizeTask += (q0.getSize() + q1.getSize()) / MB;
561 
562  buildBottomLevel(&q0, &q1, nodeWritten, nodeCreated, 0, n_bits);
563 
564  calcAABB(nodeWritten);
565 
566  m_gpuTime = cudaTotalTime;
568  m_cpuTime = m_progressTimer.getTotal();
569 #ifndef BENCHMARK
570  printf("? Build finished: %f\n", m_gpuTime);
571  printf("! Build finished: %f\n", m_cpuTime);
572 #endif
573 
574  F32* root = (F32*)getNodeBuffer().getPtr();
575  U32 leafs = *(U32*)(module->getGlobal("g_leafsPtr").getPtr(0));
576  /*printf("=== BVH stats: nodes %d, leafs %d\n", nodeWritten, leafs);
577  printf("=== AABB: (%.1f %.1f %.1f) - (%.1f %.1f %.1f)\n", min(root[0],root[4]),
578  min(root[2],root[6]),
579  min(root[8],root[10]),
580  max(root[1],root[5]),
581  max(root[3],root[7]),
582  max(root[9],root[11]));*/
583 
584  /*Debug << "BVH Top = " << nodeWritten << " => number of inner nodes (number of tasks) = " << nodeWritten << " + number of leaves = " << leafs << "\n";
585  Debug << "Sorted tris = " << triCnt << "\n\n";*/
586  m_nodes = nodeWritten;
587  m_leafs = leafs;
588 }
589 
591 {
592  F32 cudaTime = 0.0f;
593  cudaTotalTime = 0.0f;
594 
595  // morton codes of order n => resulting in 3n bit grid
596  S32 n = 10;
597  // sorting into coarse 3m bit grid according to m bit morton code
598  S32 m = (n - m_params.hlbvhBits);
599  // then sorting rest of primitives in each grid cell according to remaining 3(n-m) bits
600  S32 d = 3 * (n - m); // top level SAH (clusters)
601 
602  S32 n_bits = 3 * n;
603 
604  // compile CUDA kernels
605  CudaCompiler m_compiler;
606  m_compiler.addOptions("-use_fast_math -Xptxas=\"-v\"");
607  m_compiler.setSourceFile("src/rt/bvh/HLBVH/emitTreeKernel.cu");
608  m_compiler.clearDefines();
609 
611  m_compiler.define("FERMI");
612 
613  module = m_compiler.compile();
614  failIfError();
615 
616  // Set leaf size and scene epsilon
617  *(int*)module->getGlobal("c_leafSize").getMutablePtr() = m_params.leafSize;
618  *(float*)module->getGlobal("c_epsilon").getMutablePtr() = m_params.epsilon;
619 
620 #ifdef LEAF_HISTOGRAM
621  module->getGlobal("g_leafHist").clear();
622 #endif
623 
624  CudaKernel kernelWoop = module->getKernel("calcWoopKernel");
625 
627 #ifndef BENCHMARK
628  printf("HLBVHBuilder HLBVH: Build start\n");
629 #endif
631 
632  triCnt = m_scene->getNumTriangles();
633 
634  // upload scene triangles and vertices
635  *(CUdeviceptr*)module->getGlobal("g_tris").getMutablePtr() = m_scene->getTriVtxIndexBuffer().getCudaPtr();
636  *(CUdeviceptr*)module->getGlobal("g_verts").getMutablePtr() = m_scene->getVtxPosBuffer().getCudaPtr();
637 #ifndef BENCHMARK
638  printf("! Upload tris and verts: %f [total %f]\n", m_progressTimer.end(), m_progressTimer.getTotal());
639 #endif
640 
641  // morton
642  Buffer triMorton, triIdx;
643  triMorton.resize(triCnt * sizeof(U32));
644  triIdx.resize(triCnt * sizeof(S32));
645 
646  *(CUdeviceptr*)module->getGlobal("g_inTriMem").getMutablePtr() = triMorton.getMutableCudaPtr();
647  *(CUdeviceptr*)module->getGlobal("g_inTriIdxMem").getMutablePtr() = triIdx.getMutableCudaPtr();
648 #ifndef BENCHMARK
649  printf("! Alloc morton and index: %f [total %f]\n", m_progressTimer.end(), m_progressTimer.getTotal());
650 #endif
651 
652  calcMortonAndSort(triMorton, triIdx);
653 
654  m_sizeTask += triMorton.getSize() / MB;
655 
656  Buffer clusters;
657  createClustersC(triMorton, d, clusters);
658 
659  m_sizeTask += clusters.getSize() / MB;
660  m_sizeTask += cluster_bb.getSize() / MB;
661  m_sizeTask += cluster_bin_id.getSize() / MB;
662  m_sizeTask += cluster_split_id.getSize() / MB;
663 
665 #ifdef WOOP_TRIANGLES
666  Buffer inWoop;
667  inWoop.resizeDiscard(triCnt*3*sizeof(Vec4i));
668  *(CUdeviceptr*)module->getGlobal("g_inWoopMem").getMutablePtr() = inWoop.getMutableCudaPtr();
669 
670  kernelWoop.setParams(triCnt);
671  cudaTime = kernelWoop.launchTimed(triCnt, Vec2i(BLOCK_SIZE,1));
672  cudaTotalTime += cudaTime;
673 #ifndef BENCHMARK
674  printf("? Woop data: %f [%f]\n", cudaTime, cudaTotalTime);
675  printf("! Woop data: %f [%f]\n", m_progressTimer.end(), m_progressTimer.getTotal());
676 #endif
677 
678  m_sizeTask += inWoop.getSize() / MB;
679 #endif
680 
681  // alloc out woop and idx buffers
682 #ifdef COMPACT_LAYOUT
683 #ifdef WOOP_TRIANGLES
684  getTriWoopBuffer().resizeDiscard(triCnt*(3+1)*sizeof(Vec4i)); // just to be sure
685  *(CUdeviceptr*)module->getGlobal("g_outWoopMem").getMutablePtr() = getTriWoopBuffer().getMutableCudaPtr();
686 #else
687  getTriBuffer().resizeDiscard(triCnt*(3+1)*sizeof(Vec4i)); // just to be sure
688  *(CUdeviceptr*)module->getGlobal("g_outWoopMem").getMutablePtr() = getTriBuffer().getMutableCudaPtr();
689 #endif
690  getTriIndexBuffer().resizeDiscard(triCnt*(3+1)*sizeof(S32));
691 #else
692 #ifdef WOOP_TRIANGLES
693  getTriWoopBuffer().resizeDiscard(triCnt*3*sizeof(Vec4i)); // just to be sure
694 #else
695  getTriBuffer().resizeDiscard(triCnt*3*sizeof(Vec4i)); // just to be sure
696  *(CUdeviceptr*)module->getGlobal("g_outWoopMem").getMutablePtr() = getTriBuffer().getMutableCudaPtr();
697 #endif
698  getTriIndexBuffer().resizeDiscard(triCnt*3*sizeof(S32));
699 #endif
700  *(CUdeviceptr*)module->getGlobal("g_outIdxMem").getMutablePtr() = getTriIndexBuffer().getMutableCudaPtr();
701 
702  module->getGlobal("g_leafsPtr").clear();
703 
704  lvlNodes.clear();
705  lvlNodes.add(1); // there is always 1 top root node
706 
707  U32 nodeWritten, nodeCreated;
708  Buffer q0, q1;
709  initMemory(q0, q1, (d == 0) ? 1 : min(2, m_params.leafSize));
710 
711  m_sizeTask += (q0.getSize() + q1.getSize()) / MB;
712 
713  buildTopLevel(&q0, nodeWritten, nodeCreated, clusters);
714  if(d != 0)
715  buildBottomLevel(&q0, &q1, nodeWritten, nodeCreated, 3*m, n_bits);
716  calcAABB(nodeWritten);
717 
718  //q0.free(Buffer::Module::CPU);
719 
720  //clusters.clear();
721  //cluster_bb.clear();
722  //cluster_bin_id.clear();
723  //cluster_split_id.clear();
724 
725  m_gpuTime = cudaTotalTime;
727  m_cpuTime = m_progressTimer.getTotal();
728 
729 #ifndef BENCHMARK
730  printf("? Build finished: %f\n", m_gpuTime);
731  printf("! Build finished: %f\n", m_cpuTime);
732 #endif
733 
734  F32* root = (F32*)getNodeBuffer().getPtr();
735  U32 leafs = *(U32*)(module->getGlobal("g_leafsPtr").getPtr());
736  /*printf("=== BVH stats: nodes %d, leafs %d\n", nodeWritten, leafs);
737  printf("=== AABB: (%.1f %.1f %.1f) - (%.1f %.1f %.1f)\n", min(root[0],root[4]),
738  min(root[2],root[6]),
739  min(root[8],root[10]),
740  max(root[1],root[5]),
741  max(root[3],root[7]),
742  max(root[9],root[11])); */
743 
744  /*Debug << "BVH Top = " << nodeWritten << " => number of inner nodes (number of tasks) = " << nodeWritten << " + number of leaves = " << leafs << "\n";
745  Debug << "Sorted tris = " << triCnt << "\n\n";*/
746  m_nodes = nodeWritten;
747  m_leafs = leafs;
748 }
749 
751  CudaCompiler m_compiler;
752  m_compiler.addOptions("-use_fast_math -Xptxas=\"-v\"");
753  m_compiler.setSourceFile("src/rt/bvh/HLBVH/emitTreeKernel.cu");
754  m_compiler.clearDefines();
755 
757  m_compiler.define("FERMI");
758 
759  CudaModule* module = m_compiler.compile();
760  failIfError();
761  CudaKernel kernel = module->getKernel("calcSAH");
762 
763  *(CUdeviceptr*)module->getGlobal("g_outWoopMem").getMutablePtr() = getTriWoopBuffer().getCudaPtr();
764  module->getGlobal("g_sahCost").clear();
765  kernel.launch(1, 1);
766 
767  return *(F32*)module->getGlobal("g_sahCost").getPtr();
768 }
769 
770 void HLBVHBuilder::initMemory(Buffer& q_in, Buffer& q_out, int leafSize) {
771  S64 size = 2*(triCnt/leafSize);
772  getNodeBuffer().resize(size * 64);
773  *(CUdeviceptr*)module->getGlobal("g_outNodes").getMutablePtr() = getNodeBuffer().getMutableCudaPtr();
774 
775  q_in.resize(3*sizeof(S32) * size);
776  q_out.resizeDiscard(3*sizeof(S32) * size);
777 }
778 
779 void HLBVHBuilder::getSizes(F32& task, F32& split, F32& ads, F32& tri, F32& triIdx)
780 {
781  task = m_sizeTask;
782  split = m_sizeSplit;
783  ads = m_sizeADS;
784  tri = m_sizeTri;
785  triIdx = m_sizeTriIdx;
786 }
HLBVHBuilder(Scene *scene, const Platform &platform, HLBVHParams params)
F32 getTotal(void) const
Definition: Timer.hpp:47
void createClustersC(Buffer &triMorton, S32 d, Buffer &clusters)
void getSizes(F32 &task, F32 &split, F32 &ads, F32 &tri, F32 &triIdx)
Cuda BVH class.
Definition: CudaBVH.hpp:93
Buffer & getTriVtxIndexBuffer(void)
Returns the buffer of the triangles' vertex indices.
Definition: Scene.hpp:75
void buildTopLevel(Buffer *ooq, U32 &nodeWritten, U32 &nodeCreated, Buffer &clusters)
void clear(int value=0)
Definition: Buffer.hpp:100
CudaModule * compile(bool enablePrints=true, bool autoFail=true)
void initMemory(Buffer &q_in, Buffer &q_out, int leafSize=1)
HLBVHParams m_params
virtual ~HLBVHBuilder(void)
void buildBottomLevel(Buffer *q_in, Buffer *q_out, U32 &nodeWritten, U32 &nodeCreated, U32 bOfs, U32 n_bits)
Buffer & getVtxPosBuffer(void)
Returns vertex position buffer.
Definition: Scene.hpp:103
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei 
GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei GLint GLenum GLenum const GLvoid pixels GLint GLsizei const GLfloat value GLint GLfloat GLfloat v1 GLint GLfloat GLfloat GLfloat v2 GLint GLsizei const GLfloat value GLint GLsizei GLboolean const GLfloat value GLuint program GLuint GLfloat GLfloat GLfloat z GLuint GLint GLenum GLboolean GLsizei const GLvoid pointer GLuint GLuint const GLchar name GLenum GLsizei GLenum GLsizei GLsizei height GLenum GLuint renderbuffer GLenum GLenum GLint * params
Definition: DLLImports.inl:373
void unstart(void)
Definition: Timer.hpp:43
CUdeviceptr getCudaPtr(S64 ofs=0)
Definition: Buffer.hpp:108
float radixSortCuda(CUdeviceptr keys, CUdeviceptr values, int n)
void start(void)
Definition: Timer.hpp:42
CudaKernel getKernel(const String &name)
Definition: CudaModule.cpp:80
S64 getSize(void) const
Definition: Buffer.hpp:69
void clear(void)
Definition: Array.hpp:359
const U8 * getPtr(S64 ofs=0)
Definition: Buffer.hpp:106
float createClusters(CUdeviceptr values, int n, int d, CUdeviceptr out, int &out_cnt)
void define(const String &key, const String &value="")
static int getComputeCapability(void)
Definition: CudaModule.cpp:508
void getStats(U32 &nodes, U32 &leaves, U32 &nodeTop)
void getBBox(Vec3f &lo, Vec3f &hi) const
Gets scene AABB's minimum and maximum vector.
Definition: Scene.hpp:163
float F32
Definition: Defs.hpp:89
const T & getLast(void) const
Definition: Array.hpp:272
CudaKernel & setGridExact(const Vec2i &blockSize, const Vec2i &gridSize)
Definition: CudaKernel.hpp:114
CUdeviceptr getMutableCudaPtr(S64 ofs=0)
Definition: Buffer.hpp:112
void calcMortonAndSort(Buffer &triMorton, Buffer &triIdx)
Buffer & getTriWoopBuffer(void)
Definition: CudaBVH.hpp:145
U8 * getMutablePtr(S64 ofs=0)
Definition: Buffer.hpp:110
FW_CUDA_FUNC T min(const VectorBase< T, L, S > &v)
Definition: Math.hpp:461
int getNumTriangles(void) const
Definition: Scene.hpp:61
signed int S32
Definition: Defs.hpp:88
F32 end(void)
Definition: Timer.hpp:69
T & add(void)
Definition: Array.hpp:384
signed __int64 S64
Definition: Defs.hpp:98
unsigned int U32
Definition: Defs.hpp:85
Class holding 3d scene.
Definition: Scene.hpp:44
Buffer & getGlobal(const String &name)
Definition: CudaModule.cpp:117
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei n
Definition: DLLImports.inl:325
Buffer & getTriIndexBuffer(void)
Definition: CudaBVH.hpp:150
void printf(const char *fmt,...)
Definition: Defs.cpp:225
CudaKernel & setParams(const void *ptr, int size)
Definition: CudaKernel.hpp:83
CudaKernel & launch(void)
Definition: CudaKernel.cpp:179
Class holding various SAH and batch processing parameters.
Definition: Platform.hpp:46
void failIfError(void)
Definition: Defs.cpp:361
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level
Definition: DLLImports.inl:333
T & removeLast(void)
Definition: Array.hpp:465
const float MB
Buffer & getNodeBuffer(void)
Definition: CudaBVH.hpp:140
F32 launchTimed(bool yield=true)
Definition: CudaKernel.cpp:188
void addOptions(const String &options)
void resizeDiscard(S64 size)
Definition: Buffer.hpp:83
void clearDefines(void)
void calcAABB(U32 nodeWritten)
void setSourceFile(const String &path)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr size
Definition: DLLImports.inl:319
void resize(S64 size)
Definition: Buffer.hpp:82
S getSize(void) const
Definition: Array.hpp:188