22 #include "bvh/HLBVH/emitTreeKernel.cuh"
24 #include <cuda_runtime_api.h>
28 const float MB = (float)(1024*1024);
72 Vec3f sceneMin, sceneMax;
73 m_scene->
getBBox(sceneMin, sceneMax);
76 const float k2 = 1024.0f;
79 Vec3f step = (sceneMax - sceneMin) / k2;
81 kernelMorton.
setParams(triCnt, sceneMin.x, sceneMin.y, sceneMin.z, step.x, step.y, step.z);
83 cudaTotalTime += cudaTime;
85 printf(
"? Morton codes: %f [%f]\n", cudaTime, cudaTotalTime);
91 cudaTotalTime += cudaTime;
93 printf(
"? Radix sort: %f [%f]\n", cudaTime, cudaTotalTime);
103 clusters.
resize((triCnt+1) *
sizeof(
U32));
106 cudaTotalTime += cudaTime;
108 clusters.
resize((cluster_cnt+1) *
sizeof(
U32));
112 printf(
"Clusters: %d\n", cluster_cnt);
113 printf(
"? Cluster create: %f [%f]\n", cudaTime, cudaTotalTime);
119 cluster_bin_id.
resize(cluster_cnt*
sizeof(
S32)*3);
120 cluster_split_id.
resize(cluster_cnt*
sizeof(
S32));
126 cluster_split_id.
clear();
128 #if CLUSTER_AABB == 3
135 kernelClusterAABB.
setParams(cluster_cnt, triCnt);
136 #if CLUSTER_AABB == 0
137 cudaTime += kernelClusterAABB.
launchTimed(cluster_cnt,
Vec2i(BLOCK_SIZE,1));
138 #elif CLUSTER_AABB == 1
139 int warpsPerBlock = BLOCK_SIZE/WARP_SIZE;
140 kernelClusterAABB.
setGridExact(
Vec2i(WARP_SIZE, warpsPerBlock),
Vec2i((cluster_cnt-1+warpsPerBlock)/warpsPerBlock, 1));
142 #elif CLUSTER_AABB == 2
145 #elif CLUSTER_AABB == 3
149 cudaTotalTime += cudaTime;
151 printf(
"? Cluster AABB: %f [%f]\n", cudaTime, cudaTotalTime);
167 Vec3f sceneMin, sceneMax;
168 m_scene->
getBBox(sceneMin, sceneMax);
172 S64 bufferSize = 2*cluster_cnt;
174 Buffer qs0_bb, qs0_cls, qs0_id, qs0_plane, qs0_child;
175 Buffer qs1_bb, qs1_cls, qs1_id, qs1_plane, qs1_child;
176 Buffer *qsi_bb, *qsi_cls, *qsi_id, *qsi_plane, *qsi_child;
177 Buffer *qso_bb, *qso_cls, *qso_id, *qso_plane, *qso_child;
178 Buffer *qst_bb, *qst_cls, *qst_id, *qst_plane, *qst_child;
183 qso_plane = &qs1_plane;
184 qso_child = &qs1_child;
189 qsi_plane = &qs0_plane;
190 qsi_child = &qs0_child;
194 qsi_cls->
resize(bufferSize *
sizeof(
S32));
195 qsi_id->
resize(bufferSize *
sizeof(
S32));
196 qsi_plane->
resize(bufferSize *
sizeof(
S32));
197 qsi_child->
resize(bufferSize *
sizeof(
S32));
217 memcpy((
void*)qsi_bb->
getMutablePtr(), &sceneMin,
sizeof(sceneMin));
218 memcpy((
void*)qsi_bb->
getMutablePtr(
sizeof(sceneMin)), &sceneMax,
sizeof(sceneMax));
236 U32 sahTerminated = 0;
237 U32 oldTerminated = 0;
248 while (sahCreated > 0) {
264 kernelSAHInitBins.
setParams(sahCreated*BIN_CNT*3);
266 cudaTime += kernelSAHInitBins.
launchTimed(sahCreated*BIN_CNT*3,
Vec2i(BLOCK_SIZE,1));
269 kernelSAHFillBins.
setParams(cluster_cnt);
270 cudaTime += kernelSAHFillBins.
launchTimed(cluster_cnt,
Vec2i(BLOCK_SIZE,1));
275 kernelSAHSplit.
setParams(sahCreated, sahWritten);
279 kernelSAHDistribute.
setParams(cluster_cnt, sahWritten);
280 cudaTime += kernelSAHDistribute.
launchTimed(cluster_cnt,
Vec2i(BLOCK_SIZE,1));
285 S32 terminated = sahTerminated - oldTerminated;
286 oldTerminated = sahTerminated;
289 lvlNodes.
add(sahCreated);
291 sahWritten += sahCreated;
292 sahCreated -= terminated;
298 qst_bb = qsi_bb; qst_cls = qsi_cls; qst_id = qsi_id; qst_plane = qsi_plane; qst_child = qsi_child;
299 qsi_bb = qso_bb; qsi_cls = qso_cls; qsi_id = qso_id; qsi_plane = qso_plane; qsi_child = qso_child;
300 qso_bb = qst_bb; qso_cls = qst_cls; qso_id = qst_id; qso_plane = qst_plane; qso_child = qst_child;
306 cudaTotalTime += cudaTime;
311 printf(
"? top-level SAH: %f [%f]\n", cudaTime, cudaTotalTime);
315 nodeWritten = sahWritten;
337 while((level < (n_bits-bit_ofs)) && nodeCreated > 0) {
344 kernel.
setParams(n_bits - (level+1 + bit_ofs), nodeCreated, nodeWritten);
348 lvlNodes.
add(nodeCreated);
350 nodeWritten += nodeCreated;
362 cudaTotalTime += cudaTime;
366 printf(
"? bottom-level LBVH: %f [%f]\n", cudaTime, cudaTotalTime);
388 #ifdef LEAF_HISTOGRAM
390 printf(
"Leaf histogram\n");
395 printf(
"%d: %d\n", i, histogram[i]);
396 leafSum += histogram[i];
397 triSum += i*histogram[i];
399 printf(
"Leafs total %d, average leaf %.2f\n", leafSum, (
float)triSum/(
float)leafSum);
422 for (
S32 lvl = lvlNodes.
getSize()-1; lvl >= 0; lvl--) {
423 nodeWritten -= lvlNodes[lvl];
425 kernelAABB.
setParams(nodeWritten, lvlNodes[lvl]);
435 cudaTotalTime += cudaTime;
437 printf(
"? calcAABB GPU: %f [%f]\n", cudaTime, cudaTotalTime);
449 cudaTotalTime = 0.0f;
457 m_compiler.
addOptions(
"-use_fast_math -Xptxas=\"-v\"");
458 m_compiler.
setSourceFile(
"src/rt/bvh/HLBVH/emitTreeKernel.cu");
462 m_compiler.
define(
"FERMI");
471 #ifdef LEAF_HISTOGRAM
479 printf(
"HLBVHBuilder LBVH: Build start\n");
495 triIdx.resize(triCnt *
sizeof(
S32));
505 m_sizeTask += triMorton.getSize() /
MB;
508 #ifdef WOOP_TRIANGLES
515 cudaTotalTime += cudaTime;
517 printf(
"? Woop data: %f [%f]\n", cudaTime, cudaTotalTime);
525 #ifdef COMPACT_LAYOUT
526 #ifdef WOOP_TRIANGLES
530 getTriBuffer().resizeDiscard(triCnt*(3+1)*
sizeof(
Vec4i));
535 #ifdef WOOP_TRIANGLES
538 getTriBuffer().resizeDiscard(triCnt*3*
sizeof(
Vec4i));
566 m_gpuTime = cudaTotalTime;
570 printf(
"? Build finished: %f\n", m_gpuTime);
571 printf(
"! Build finished: %f\n", m_cpuTime);
586 m_nodes = nodeWritten;
593 cudaTotalTime = 0.0f;
606 m_compiler.
addOptions(
"-use_fast_math -Xptxas=\"-v\"");
607 m_compiler.
setSourceFile(
"src/rt/bvh/HLBVH/emitTreeKernel.cu");
611 m_compiler.
define(
"FERMI");
620 #ifdef LEAF_HISTOGRAM
628 printf(
"HLBVHBuilder HLBVH: Build start\n");
644 triIdx.resize(triCnt *
sizeof(
S32));
654 m_sizeTask += triMorton.getSize() /
MB;
661 m_sizeTask += cluster_bin_id.
getSize() /
MB;
662 m_sizeTask += cluster_split_id.
getSize() /
MB;
665 #ifdef WOOP_TRIANGLES
672 cudaTotalTime += cudaTime;
674 printf(
"? Woop data: %f [%f]\n", cudaTime, cudaTotalTime);
682 #ifdef COMPACT_LAYOUT
683 #ifdef WOOP_TRIANGLES
687 getTriBuffer().resizeDiscard(triCnt*(3+1)*
sizeof(
Vec4i));
692 #ifdef WOOP_TRIANGLES
695 getTriBuffer().resizeDiscard(triCnt*3*
sizeof(
Vec4i));
707 U32 nodeWritten, nodeCreated;
725 m_gpuTime = cudaTotalTime;
730 printf(
"? Build finished: %f\n", m_gpuTime);
731 printf(
"! Build finished: %f\n", m_cpuTime);
746 m_nodes = nodeWritten;
752 m_compiler.
addOptions(
"-use_fast_math -Xptxas=\"-v\"");
753 m_compiler.
setSourceFile(
"src/rt/bvh/HLBVH/emitTreeKernel.cu");
757 m_compiler.
define(
"FERMI");
771 S64 size = 2*(triCnt/leafSize);
785 triIdx = m_sizeTriIdx;
HLBVHBuilder(Scene *scene, const Platform &platform, HLBVHParams params)
void createClustersC(Buffer &triMorton, S32 d, Buffer &clusters)
void getSizes(F32 &task, F32 &split, F32 &ads, F32 &tri, F32 &triIdx)
Buffer & getTriVtxIndexBuffer(void)
Returns buffer of triangle's vertex indices.
void buildTopLevel(Buffer *ooq, U32 &nodeWritten, U32 &nodeCreated, Buffer &clusters)
CudaModule * compile(bool enablePrints=true, bool autoFail=true)
void initMemory(Buffer &q_in, Buffer &q_out, int leafSize=1)
virtual ~HLBVHBuilder(void)
void buildBottomLevel(Buffer *q_in, Buffer *q_out, U32 &nodeWritten, U32 &nodeCreated, U32 bOfs, U32 n_bits)
Buffer & getVtxPosBuffer(void)
Returns vertex position buffer.
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei 
GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei GLint GLenum GLenum const GLvoid pixels GLint GLsizei const GLfloat value GLint GLfloat GLfloat v1 GLint GLfloat GLfloat GLfloat v2 GLint GLsizei const GLfloat value GLint GLsizei GLboolean const GLfloat value GLuint program GLuint GLfloat GLfloat GLfloat z GLuint GLint GLenum GLboolean GLsizei const GLvoid pointer GLuint GLuint const GLchar name GLenum GLsizei GLenum GLsizei GLsizei height GLenum GLuint renderbuffer GLenum GLenum GLint * params
CUdeviceptr getCudaPtr(S64 ofs=0)
float radixSortCuda(CUdeviceptr keys, CUdeviceptr values, int n)
CudaKernel getKernel(const String &name)
const U8 * getPtr(S64 ofs=0)
float createClusters(CUdeviceptr values, int n, int d, CUdeviceptr out, int &out_cnt)
void define(const String &key, const String &value="")
static int getComputeCapability(void)
void getStats(U32 &nodes, U32 &leaves, U32 &nodeTop)
void getBBox(Vec3f &lo, Vec3f &hi) const
Gets the scene AABB's minimum and maximum vectors.
const T & getLast(void) const
CudaKernel & setGridExact(const Vec2i &blockSize, const Vec2i &gridSize)
CUdeviceptr getMutableCudaPtr(S64 ofs=0)
void calcMortonAndSort(Buffer &triMorton, Buffer &triIdx)
Buffer & getTriWoopBuffer(void)
U8 * getMutablePtr(S64 ofs=0)
FW_CUDA_FUNC T min(const VectorBase< T, L, S > &v)
int getNumTriangles(void) const
Buffer & getGlobal(const String &name)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei n
Buffer & getTriIndexBuffer(void)
void printf(const char *fmt,...)
CudaKernel & setParams(const void *ptr, int size)
CudaKernel & launch(void)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level
Buffer & getNodeBuffer(void)
F32 launchTimed(bool yield=true)
void addOptions(const String &options)
void resizeDiscard(S64 size)
void calcAABB(U32 nodeWritten)
void setSourceFile(const String &path)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr size