8 #define TASK_SIZE 430000
18 #include "kernels/thrustTest.hpp"
28 m_compiler.
addOptions(
"-use_fast_math -Xptxas -dlcm=cg");
31 m_compiler.
define(
"FERMI");
37 m_numVerts = m_numTris * 3;
39 m_numShadingNormals = m_numVerts;
40 m_numTextureCoords = m_numVerts;
42 scene.
getBBox(m_bboxMin, m_bboxMax);
73 for (
int i = 0; i < m_numTris; i++)
75 for (
int j = 0; j < 3; j++)
77 vout[i * 3 + j] =
Vec3f(verts[tris[i][j]].
x, verts[tris[i][j]].
y, verts[tris[i][j]].
z);
78 *tcout =
Vec4f(verts[tris[i][j]].x, verts[tris[i][j]].y, verts[tris[i][j]].z,0);
85 Vec3f minV =
min(vout[i*3+0], vout[i*3+1], vout[i*3+2]);
86 bout[i * 2 + 0] = make_float3(minV.x, minV.y, minV.z);
87 Vec3f maxV =
max(vout[i*3+0], vout[i*3+1], vout[i*3+2]);
88 bout[i* 2 + 1] = make_float3(maxV.x, maxV.y, maxV.z);
120 for(
int i=0,j=0;i<m_numTris;i++,j+=3)
122 tout[i] =
Vec3i(j, j + 1, j + 2);
123 nout[i] = ((vout[j + 1] - vout[j]).
cross(vout[j + 2] - vout[j]));
162 m_kernelFile =
"src/rt/kernels/persistent_nostruct.cu";
164 m_module = m_compiler.
compile();
172 m_numRays = rand.
getU32(1, 1000000);
173 m_numTris = rand.
getU32(1, 1000000);
184 for(
int i=0;i<m_numTris;i++)
191 int rnd = rand.
getS32(-1, 2);
209 for(
int i = 0; i < m_numRays; i++)
220 memcpy(&mRay.origin, &ray.origin,
sizeof(
Vec3f));
221 memcpy(&mRay.direction, &ray.direction,
sizeof(
Vec3f));
223 bool intersects = m_bbox.ComputeMinMaxT(mRay,
227 if (ray.tmin < 1e-3
f)
232 int rnd = rand.
getS32(-1, 3);
257 m_gpuTime = traceCudaRayBuffer(rays);
258 m_cpuTime = m_timer.
end();
270 #ifdef MALLOC_SCRATCHPAD
273 printf(
"Setting dynamic memory limit to %fMB\n", (
float)(m_trisIndex.
getSize()*5*3)/(
float)(1024*1024));
275 cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, m_trisIndex.
getSize()*5*3);
280 m_kernelFile =
"src/rt/kernels/persistent_bvh.cu";
282 m_kernelFile =
"src/rt/kernels/persistent_sbvh.cu";
285 m_module = m_compiler.
compile();
290 m_numTris = rand.
getU32(1, 1000000);
300 for(
int i=0;i<m_numTris;i++)
307 int rnd = rand.
getU32(0, 2);
325 S64 bvhSize = ((m_numTris/2 *
sizeof(CudaBVHNode)) + 4096 - 1) & -4096;
329 #ifdef COMPACT_LAYOUT
342 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
347 m_gpuTime = buildCudaBVH();
348 m_cpuTime = m_timer.
end();
368 #ifdef COMPACT_LAYOUT
369 #ifdef WOOP_TRIANGLES
370 String kernelName(
"src/rt/kernels/fermi_speculative_while_while");
372 String kernelName(
"src/rt/kernels/fermi_speculative_while_while_inter");
378 String kernelName(
"src/rt/kernels/fermi_persistent_speculative_while_while_inter");
380 m_compiler.
addOptions(
"-use_fast_math -maxrregcount 40");
385 kernelName +=
"_statistics";
390 m_module = m_compiler.
compile();
412 kernel = m_module->
getKernel(
"trace_stats");
426 #ifdef COMPACT_LAYOUT
427 CUdeviceptr triPtr = m_trisCompactOut.
getCudaPtr();
429 Buffer& indexBuf = m_trisIndexOut;
431 CUdeviceptr triPtr = m_trisCompact.
getCudaPtr();
433 Buffer& indexBuf = m_trisIndex;
440 in.nodesA = nodePtr + nodeOfsA.x;
441 in.trisA = triPtr + triOfsA.x;
444 in.triIndices = indexBuf.getCudaPtr();
448 m_module->
setTexRef(
"t_nodesA", nodePtr + nodeOfsA.x, nodeOfsA.y, CU_AD_FORMAT_FLOAT, 4);
449 m_module->
setTexRef(
"t_trisA", triPtr + triOfsA.x, triOfsA.y, CU_AD_FORMAT_FLOAT, 4);
450 m_module->
setTexRef(
"t_triIndices", indexBuf, CU_AD_FORMAT_SIGNED_INT32, 1);
453 int desiredWarps = (rays.
getSize() + 31) / 32;
461 int blockWarps = (blockSize.x * blockSize.y + 31) / 32;
462 Vec2i gridSize((desiredWarps + blockWarps - 1) / blockWarps, 1);
475 F32 launchTime = m_module->launchKernelTimed(kernel, blockSize, gridSize);
481 stats->numEmptyLeavesVisited += *(
U32*)m_module->
getGlobal(
"g_NumEmptyLeaves").
getPtr();
483 stats->numFailedTriangleTests += *(
U32*)m_module->
getGlobal(
"g_NumFailedTris").
getPtr();
484 stats->numSuccessTriangleTestsOutside += *(
U32*)m_module->
getGlobal(
"g_NumHitTrisOutside").
getPtr();
488 m_gpuTime = launchTime;
489 m_cpuTime = m_timer.
end();
494 m_compiler.
addOptions(
"-use_fast_math -Xptxas -dlcm=cg");
503 m_kernelFile =
"src/rt/kernels/persistent_kdtree.cu";
505 m_module = m_compiler.
compile();
508 prepareDynamicMemory();
512 m_numTris = rand.
getU32(1, 1000000);
522 for(
int i=0;i<m_numTris;i++)
529 int rnd = rand.
getU32(0, 2);
554 #ifndef INTERLEAVED_LAYOUT
558 S64 kdtreeSize = ((m_numTris*20 *
sizeof(CudaKdtreeNode)) + 4096 - 1) & -4096;
562 #ifndef COMPACT_LAYOUT
566 #ifdef DUPLICATE_REFERENCES
578 S64 kdtreeSize = ((m_numTris*5 *
sizeof(CudaKdtreeNode) + m_numTris*10 * 3 * (
sizeof(
Vec4f)+
sizeof(
S32))) + 4096 - 1) & -4096;
582 m_bvhData.clearRange32(0, 0, kdtreeSize);
585 m_gpuTime = buildCudaKdtree();
586 m_cpuTime = m_timer.
end();
606 #ifdef COMPACT_LAYOUT
607 #ifdef WOOP_TRIANGLES
608 #ifdef DUPLICATE_REFERENCES
609 String kernelName(
"src/rt/kernels/fermi_kdtree_while_while_childPtr");
612 String kernelName(
"src/rt/kernels/fermi_kdtree_while_while_leafRef");
615 #error Undefined kernel
618 String kernelName(
"src/rt/kernels/fermi_kdtree_while_while");
627 kernelName +=
"_statistics";
632 m_module = m_compiler.
compile();
635 CUfunction queryKernel = m_module->
getKernel(
"queryConfig");
637 fail(
"Config query kernel not found!");
648 m_module->launchKernel(queryKernel, 1, 1);
653 kernel = m_module->
getKernel(
"trace_stats");
657 fail(
"Trace kernel not found!");
667 #ifndef INTERLEAVED_LAYOUT
668 CUdeviceptr triPtr = m_trisCompactOut.
getCudaPtr();
670 Buffer& indexBuf = m_trisIndexOut;
674 Buffer& indexBuf = m_bvhData;
681 memcpy(&in.bmin, &m_bbox.min,
sizeof(float3));
682 memcpy(&in.bmax, &m_bbox.max,
sizeof(float3));
683 in.nodesA = nodePtr + nodeOfsA.x;
684 in.trisA = triPtr + triOfsA.x;
687 in.triIndices = indexBuf.getCudaPtr();
691 m_module->
setTexRef(
"t_nodesI", nodePtr + nodeOfsA.x, nodeOfsA.y, CU_AD_FORMAT_FLOAT, 4);
692 m_module->
setTexRef(
"t_trisA", triPtr + triOfsA.x, triOfsA.y, CU_AD_FORMAT_FLOAT, 4);
693 m_module->
setTexRef(
"t_triIndices", indexBuf, CU_AD_FORMAT_SIGNED_INT32, 1);
696 int desiredWarps = (rays.
getSize() + 31) / 32;
704 int blockWarps = (blockSize.x * blockSize.y + 31) / 32;
705 Vec2i gridSize((desiredWarps + blockWarps - 1) / blockWarps, 1);
718 F32 launchTime = m_module->launchKernelTimed(kernel, blockSize, gridSize);
724 stats->numEmptyLeavesVisited += *(
U32*)m_module->
getGlobal(
"g_NumEmptyLeaves").
getPtr();
726 stats->numFailedTriangleTests += *(
U32*)m_module->
getGlobal(
"g_NumFailedTris").
getPtr();
727 stats->numSuccessTriangleTestsOutside += *(
U32*)m_module->
getGlobal(
"g_NumHitTrisOutside").
getPtr();
731 m_gpuTime = launchTime;
732 m_cpuTime = m_timer.
end();
737 m_compiler.
addOptions(
"-use_fast_math -Xptxas -dlcm=cg");
750 m_kernelFile =
"src/rt/kernels/persistent_ondemand.cu";
752 m_module = m_compiler.
compile();
757 for(
int i=0;i<m_numTris;i++)
774 S64 bvhSize = ((m_numTris *
sizeof(CudaBVHNode)) + 4096 - 1) & -4096;
779 #ifdef COMPACT_LAYOUT
784 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
791 m_gpuTime = traceOnDemandBVHRayBuffer(rays, rebuild);
792 m_cpuTime = m_timer.
end();
808 m_kernelFile =
"src/rt/kernels/persistent_ondemand_kdtree.cu";
810 m_module = m_compiler.
compile();
813 prepareDynamicMemory();
817 for(
int i=0;i<m_numTris;i++)
842 #ifndef INTERLEAVED_LAYOUT
843 S64 kdtreeSize = ((m_numTris *
sizeof(CudaKdtreeNode)) + 4096 - 1) & -4096;
847 #ifndef COMPACT_LAYOUT
851 #ifdef DUPLICATE_REFERENCES
861 S64 kdtreeSize = ((m_numTris*5 *
sizeof(CudaKdtreeNode) + m_numTris*10 * 3 * (
sizeof(
Vec4f)+
sizeof(
S32))) + 4096 - 1) & -4096;
865 m_bvhData.clearRange32(0, 0, kdtreeSize);
870 m_gpuTime = traceOnDemandKdtreeRayBuffer(rays, rebuild);
871 m_cpuTime = m_timer.
end();
884 m_module = m_compiler.
compile();
890 fail(
"Build kernel not found!");
892 F32 tTrace, tTraceCPU;
893 #ifndef ONDEMAND_FULL_BUILD
898 inBVH.numTris = m_numTris;
905 #ifdef COMPACT_LAYOUT
912 CUdeviceptr triPtr = m_trisCompact.
getCudaPtr();
913 Buffer& indexBuf = m_trisIndex;
921 in.nodesA = nodePtr + nodeOfsA.x;
922 in.trisA = triPtr + triOfsA.x;
928 m_module->
setTexRef(
"t_nodesA", m_bvhData, CU_AD_FORMAT_FLOAT, 4);
929 m_module->
setTexRef(
"t_trisA", m_trisCompact, CU_AD_FORMAT_FLOAT, 4);
930 m_module->
setTexRef(
"t_triIndices", m_trisIndex, CU_AD_FORMAT_SIGNED_INT32, 1);
935 Vec2i blockSize(WARP_SIZE, numWarpsPerBlock);
936 int gridSizeX = NUM_SM*numBlocksPerSM;
937 Vec2i gridSize(gridSizeX, 1);
945 oldNodes = tasks.numNodes;
957 tasks.warpCounter = rays.
getSize();
958 tasks.unfinished = -NUM_WARPS;
959 tasks.launchFlag = 1;
962 tTrace = m_module->launchKernelTimed(kernel, blockSize, gridSize);
963 tTraceCPU = m_timer.
end();
965 buildNodes += tasks.numNodes - oldNodes;
969 }
while(oldNodes != tasks.numNodes);
977 GPUmegakernel += tTrace;
978 CPUmegakernel += tTraceCPU;
1002 if(m_kernelFile.
endsWith(
"kdtree.cu"))
1010 m_module = m_compiler.
compile();
1014 GPUtravKernel += tTrace;
1015 CPUtravKernel += tTraceCPU;
1020 float minTime = FLT_MAX;
1021 float sumTime = 0.f;
1030 cout <<
"Prefix scan for problem size of " << width <<
"x" << height <<
" = " << width*height <<
"\n";
1033 cout <<
"Testing task pool" <<
"\n";
1035 cout <<
"Testing thrust" <<
"\n";
1038 for(
int i = 0; i < numRepeats; i++)
1041 float t = testSort(width*height);
1047 printf(
"Run %d sort in %fs\n", i, t);
1048 minTime =
min(t, minTime);
1052 printf(
"Minimum time from %d runs = %fs\n", numRepeats, minTime);
1053 printf(
"Average time from %d runs = %fs\n", numRepeats, sumTime/numRepeats);
// Exchanges tuning values between this tracer and the `cudaEnv` settings object.
// NOTE(review): only a few statements of this function are visible in this excerpt;
// the declarations of `cudaEnv` and `siblingLimit` are not shown - confirm against the full body.
1059 void CudaPersistentBVHTracer::updateConstants()
// Cache the configured cut-off depth; printPool() compares task depths against m_cutOffDepth.
1082 m_cutOffDepth = cudaEnv.optCutOffDepth;
// The sibling limit is stored divided by WARP_SIZE.
1099 cudaEnv.siblingLimit = siblingLimit / WARP_SIZE;
// Threshold = (bounding-box surface area / number of rays) * (optCt / 10).
// NOTE(review): divides by m_numRays - confirm it cannot be 0 when this runs.
1105 cudaEnv.subdivThreshold = (m_bbox.SurfaceArea() / (float)m_numRays) * ((float)cudaEnv.optCt/10.0f);
// Forward the tracer epsilon unchanged.
1107 cudaEnv.epsilon = m_epsilon;
// Returns how many WARP_SIZE-sized chunks are needed to cover `threads` items
// (rounding up), clamped so that the result is never smaller than 1.
1113 int CudaPersistentBVHTracer::warpSubtasks(
int threads)
// Round-up integer division; max(..., 1) makes a count of 0 still yield one subtask.
1116 return max((threads + WARP_SIZE - 1) / WARP_SIZE, 1);
// Maps the bit pattern of `floatVal` to an int.
// Bit patterns that are non-negative as an int are returned unchanged; negative ones
// have their low 31 bits inverted (the sign bit is kept).
// NOTE(review): presumably intended so that signed integer comparison of the results
// orders values like the original floats - confirm how callers use it (NaN, -0.0f).
1121 int CudaPersistentBVHTracer::floatToOrderedInt(
float floatVal)
// Reinterpret the float storage as an int through a pointer cast.
// NOTE(review): this kind of type punning violates strict aliasing rules in standard C++;
// a memcpy into an int is the portable form - confirm the compiler flags tolerate this.
1123 int intVal = *((
int*)&floatVal);
1124 return (intVal >= 0) ? intVal : intVal ^ 0x7FFFFFFF;
// Prepares `snapData` for snapshot recording; which statements are compiled depends on
// whether SNAPSHOT_POOL or SNAPSHOT_WARP is defined.
// NOTE(review): the matching #else / #endif lines are not visible in this excerpt.
1136 void CudaPersistentBVHTracer::allocateSnapshots(
Buffer &snapData)
1139 #ifdef SNAPSHOT_POOL
// Clear a range sized for SNAPSHOT_POOL PoolInfo records with the value 0.
// NOTE(review): clearRange32 arguments are presumably (offset, value, byte count) - confirm.
// printSnapshots() tests the `pool` field of these records against 0.
1143 snapData.clearRange32(0, 0, SNAPSHOT_POOL*
sizeof(PoolInfo));
1145 #ifdef SNAPSHOT_WARP
// Size for SNAPSHOT_WARP WarpInfo records for each of the NUM_WARPS warps.
1146 snapData.
resizeDiscard(
sizeof(WarpInfo)*SNAPSHOT_WARP*NUM_WARPS);
// Clear exactly the number of bytes requested by the resize above.
1149 snapData.clearRange32(0, 0, SNAPSHOT_WARP*NUM_WARPS*
sizeof(WarpInfo));
// Writes the snapshot records stored in `snapData` to tab-separated text files.
// With SNAPSHOT_POOL the records are PoolInfo entries written to two files under plots\pool;
// with SNAPSHOT_WARP they are WarpInfo entries written to one file per warp under plots\warps.
// NOTE(review): braces, break statements and #endif lines are not visible in this excerpt,
// so the exact loop exits cannot be confirmed from here.
1155 void CudaPersistentBVHTracer::printSnapshots(
Buffer &snapData)
1157 #ifdef SNAPSHOT_POOL
1158 PoolInfo* snapshots = (PoolInfo*)snapData.
getPtr();
// A non-zero `pool` field in the very last record is reported as a full snapshot buffer.
1160 if(snapshots[SNAPSHOT_POOL-1].pool != 0)
1161 printf(
"\aSnapshot memory full!\n");
// Clock range used below to normalise timestamps: start of record 0 up to the end of the
// record that precedes the first record whose `pool` field is 0.
1163 long long int clockMin = snapshots[0].clockStart;
1164 long long int clockMax = 0;
1165 for(
int i = 0; i < SNAPSHOT_POOL; i++)
1167 if(snapshots[i].pool == 0)
// NOTE(review): if record 0 already has pool == 0 this reads snapshots[-1] - confirm a guard exists.
// NOTE(review): if no record has pool == 0, clockMax appears to stay 0 - confirm.
1169 clockMax = snapshots[i-1].clockEnd;
// First file: one row per record with the raw clock difference (clockEnd - clockStart).
1174 ofstream snapfile(
"plots\\pool\\activity.dat");
1175 snapfile <<
"Snap#\tpool\t#tasks\t#active\t#chunks\tdepth\tclocks" <<
"\n";
1176 for(
int i = 0; i < SNAPSHOT_POOL; i++)
1178 if(snapshots[i].pool == 0)
1181 snapfile << i <<
"\t" << snapshots[i].pool <<
"\t" << snapshots[i].tasks <<
"\t" << snapshots[i].active <<
"\t" << snapshots[i].chunks <<
"\t" << snapshots[i].depth
1182 <<
"\t" << snapshots[i].clockEnd - snapshots[i].clockStart <<
"\n";
// Second file: the first column is (clockEnd - clockMin) / (clockMax - clockMin) instead of the index.
// NOTE(review): the same ofstream object is opened again; a close() call is presumably on an elided line.
1186 snapfile.open(
"plots\\pool\\activity_clockCor.dat");
1187 snapfile <<
"Snap#\tpool\t#tasks\t#active\t#chunks\tdepth\tclocks" <<
"\n";
1188 for(
int i = 0; i < SNAPSHOT_POOL; i++)
1190 if(snapshots[i].pool == 0)
1193 snapfile << (float)((
long double)(snapshots[i].clockEnd - clockMin) / (
long double)(clockMax - clockMin)) <<
"\t" << snapshots[i].pool <<
"\t" << snapshots[i].tasks <<
"\t"
1194 << snapshots[i].active <<
"\t" << snapshots[i].chunks <<
"\t" << snapshots[i].depth <<
"\t" << snapshots[i].clockEnd - snapshots[i].clockStart <<
"\n";
1199 #ifdef SNAPSHOT_WARP
1200 WarpInfo* snapshots = (WarpInfo*)snapData.
getPtr();
// `snapshots` is advanced by SNAPSHOT_WARP records at the end of each warp iteration,
// so inside the loop it always points at the records of warp `w`.
1202 for(
int w = 0; w < NUM_WARPS; w++)
1204 if(snapshots[SNAPSHOT_WARP-1].reads != 0)
1205 printf(
"\aSnapshot memory full for warp %d!\n", w);
1207 ostringstream filename;
// NOTE(review): setw(3) pads with the stream fill character; no fill change is visible here,
// so the file name may contain spaces - confirm against the elided line.
1209 filename <<
"plots\\warps\\warp" << setw(3) << w <<
".dat";
1211 ofstream snapfile(filename.str());
1213 snapfile <<
"Snap#\t#reads\t#rays\t#tris\ttype(leaf=8)\t#chunks\tpopCount\tdepth\tcDequeue\tcCompute\tstackTop\ttaskIdx" <<
"\n";
1214 for(
int i = 0; i < SNAPSHOT_WARP; i++)
1216 if(snapshots[i].reads == 0)
// Sanity check on the recorded clocks: search <= dequeue <= finished is expected.
1219 if(snapshots[i].clockDequeue < snapshots[i].clockSearch || snapshots[i].clockFinished < snapshots[i].clockDequeue)
1220 cout <<
"Error timer for warp " << w <<
"\n";
// cDequeue column = clockDequeue - clockSearch, cCompute column = clockFinished - clockDequeue.
1222 snapfile << i <<
"\t" << snapshots[i].reads <<
"\t" << snapshots[i].rays <<
"\t" << snapshots[i].tris <<
"\t" << snapshots[i].type <<
"\t"
1223 << snapshots[i].chunks <<
"\t" << snapshots[i].popCount <<
"\t" << snapshots[i].depth <<
"\t" << (snapshots[i].clockDequeue - snapshots[i].clockSearch) <<
"\t"
1224 << (snapshots[i].clockFinished - snapshots[i].clockDequeue) <<
"\t" << snapshots[i].stackTop <<
"\t" << snapshots[i].idx <<
"\n";
// Move on to the records of the next warp.
1228 snapshots += SNAPSHOT_WARP;
// Initialises the task pool buffer and binds the texture references used by the kernels.
// rayBuffer  - bound to the texture reference t_rays when it is not NULL.
// nodeBuffer - bound to the texture reference t_nodesA when it is not NULL.
// NOTE(review): `numRays` is not used by the statements visible in this excerpt, and the
// #else / #endif lines of the conditionals below are not shown.
1235 void CudaPersistentBVHTracer::initPool(
int numRays,
Buffer* rayBuffer,
Buffer* nodeBuffer)
1239 #if PARALLELISM_TEST >= 0
1244 #ifndef MALLOC_SCRATCHPAD
1258 #if defined(SNAPSHOT_POOL) || defined(SNAPSHOT_WARP)
1261 allocateSnapshots(snapData);
// Two alternative clears of m_taskData with TaskHeader_Empty: the first covers only the
// TASK_SIZE int headers, the second covers the headers plus TASK_SIZE Task records.
// NOTE(review): which one is compiled is presumably selected by an elided #if - confirm.
1268 m_taskData.clearRange32(0, TaskHeader_Empty,
TASK_SIZE *
sizeof(
int));
1270 m_taskData.clearRange32(0, TaskHeader_Empty,
TASK_SIZE * (
sizeof(
int)+
sizeof(Task)));
// Optional inputs are only bound when the caller supplied them.
1285 if(rayBuffer !=
NULL)
1287 m_module->
setTexRef(
"t_rays", *rayBuffer, CU_AD_FORMAT_FLOAT, 4);
1289 if(nodeBuffer !=
NULL)
1291 m_module->
setTexRef(
"t_nodesA", *nodeBuffer, CU_AD_FORMAT_FLOAT, 4);
// Triangle data is bound as 4-component float, triangle indices as 1-component signed 32-bit int.
1293 m_module->
setTexRef(
"t_trisA", m_trisCompact, CU_AD_FORMAT_FLOAT, 4);
1294 m_module->
setTexRef(
"t_triIndices", m_trisIndex, CU_AD_FORMAT_SIGNED_INT32, 1);
// Calls reset() on the m_ppsTrisIndex and m_ppsRaysIndex buffers.
// NOTE(review): `numRays` is not used by the statements visible in this excerpt;
// reset() presumably releases the buffer storage - confirm against the Buffer class.
1307 void CudaPersistentBVHTracer::deinitPool(
int numRays)
1310 m_ppsTrisIndex.
reset();
1316 m_ppsRaysIndex.
reset();
// Writes pool diagnostics to the Debug stream.
// tasks    - pool bookkeeping; its top/bottom/unfinished/sizePool, active[] and empty[] fields are printed.
// header   - array of task header words that is scanned against TaskHeader_Empty.
// numWarps - number of per-warp float4 entries read from m_debug; also the divisor of the averages.
// state    - extra text printed after the pool cursors.
// NOTE(review): several loop headers and counter declarations (countDead, emptyItems) are on
// lines that are not visible in this excerpt.
1323 void CudaPersistentBVHTracer::printPoolHeader(TaskStackBase* tasks,
int* header,
int numWarps,
FW::String state)
1325 #if PARALLELISM_TEST >= 0
1327 printf(
"Active: %d\n", numActive);
1331 #if defined(SNAPSHOT_POOL) || defined(SNAPSHOT_WARP)
1332 printSnapshots(snapData);
1336 Debug <<
"\nPRINTING DEBUG_INFO STATISTICS" <<
"\n\n";
1338 Debug <<
"\nPRINTING STATISTICS" <<
"\n\n";
// One float4 of debug counters per warp; min, max and sum are accumulated per component
// over the absolute values.
1341 float4* debugData = (float4*)m_debug.
getPtr();
1342 float minAll[4] = {MAX_FLOAT, MAX_FLOAT, MAX_FLOAT, MAX_FLOAT};
1343 float maxAll[4] = {0, 0, 0, 0};
1344 float sumAll[4] = {0, 0, 0, 0};
1346 Debug <<
"Warp No. cnt_task_queues Avg. #Reads Max #Reads #Restarts" <<
"\n";
1347 for(
int i = 0; i < numWarps; i++)
1349 Debug <<
"Warp " << i <<
": (" << debugData[i].x <<
", " << debugData[i].y <<
", " << debugData[i].z <<
", " << debugData[i].w <<
")" <<
"\n";
1352 minAll[0] =
min(fabs(debugData[i].
x), minAll[0]);
1353 minAll[1] =
min(fabs(debugData[i].
y), minAll[1]);
1354 minAll[2] =
min(fabs(debugData[i].
z), minAll[2]);
1355 minAll[3] =
min(fabs(debugData[i].w), minAll[3]);
1357 maxAll[0] =
max(fabs(debugData[i].
x), maxAll[0]);
1358 maxAll[1] =
max(fabs(debugData[i].
y), maxAll[1]);
1359 maxAll[2] =
max(fabs(debugData[i].
z), maxAll[2]);
1360 maxAll[3] =
max(fabs(debugData[i].w), maxAll[3]);
1362 sumAll[0] += fabs(debugData[i].
x);
1363 sumAll[1] += fabs(debugData[i].
y);
1364 sumAll[2] += fabs(debugData[i].
z);
1365 sumAll[3] += fabs(debugData[i].w);
// A negative x component is tested here; the dead-warp counter printed below is
// presumably incremented on the elided line that follows - confirm.
1367 if(debugData[i].
x < 0)
// NOTE(review): the string literals on original lines 1370 and 1375 are broken across lines
// in this excerpt - verify against the original file before relying on this text.
1370 Debug <<
"Dead=" << countDead <<
" / All=" << numWarps <<
" = " << (float)countDead/(
float)numWarps << "\
n";
1371 Debug << "Min: " << minAll[0] << ", " << minAll[1] << ", " << minAll[2] << ", " << minAll[3] << "\n";
1372 Debug << "Max: " << maxAll[0] << ", " << maxAll[1] << ", " << maxAll[2] << ", " << maxAll[3] << "\n";
1373 Debug << "Sum: " << sumAll[0] << ", " << sumAll[1] << ", " << sumAll[2] << ", " << sumAll[3] << "\n";
1374 Debug << "Avg: " << sumAll[0]/numWarps << ", " << sumAll[1]/numWarps << ", " << sumAll[2]/numWarps << ", " << sumAll[3]/numWarps << "\n\n" << "\n";
1375 Debug << "cnt_task_queues per
object = " << sumAll[0]/(
float)m_numTris << "\n";
1377 Debug << "Pool" << "\n";
1378 Debug << "Top = " << tasks->top << "; Bottom = " << tasks->bottom << "; Unfinished = " << tasks->unfinished << "; Size = " << tasks->sizePool << "; ";
1379 Debug << state.getPtr() << "\n";
1380 Debug << "ActiveTop = " << tasks->activeTop << "; Active = ";
// Prints all ACTIVE_MAX+1 entries of the active array, then the EMPTY_MAX+1 entries of the empty array.
1381 for(
int i = 0; i < ACTIVE_MAX+1; i++)
1382 Debug << tasks->active[i] << " ";
1383 Debug << "\n" << "\n";
1384 Debug << "EmptyTop = " << tasks->emptyTop << "; EmptyBottom = " << tasks->emptyBottom << "\nEmpty\n";
1385 for(
int i = 0; i < EMPTY_MAX+1; i++)
1391 Debug << tasks->empty[i];
1396 Debug <<
"\n" <<
"\n";
// Header scan: entries are compared against TaskHeader_Empty (different from it, below it).
// NOTE(review): the loop header and the counter updates are on elided lines.
1399 int bellowEmpty = 0;
1400 Debug <<
"Header" <<
"\n";
1407 if(header[i] != TaskHeader_Empty)
1413 Debug << TaskHeader_Active;
1418 if(header[i] < TaskHeader_Empty)
1422 Debug <<
"\n\nEmptyItems = " << emptyItems <<
"\n";
1423 Debug <<
"BellowEmpty = " << bellowEmpty <<
"\n";
// Dumps a BVH-build task pool to the Debug stream and prints summary statistics.
// tasks    - pool bookkeeping; nodeTop, triTop and warpCounter are put into the state text.
// numWarps - forwarded unchanged to printPoolHeader().
// NOTE(review): the task loop header and most counter declarations (stackMax, sortTasks,
// maxDepth, syncCount, subFailed, maxTaskId) are on lines not visible in this excerpt.
1428 void CudaPersistentBVHTracer::printPool(TaskStackBVH &tasks,
int numWarps)
1430 #ifdef LEAF_HISTOGRAM
// Optional histogram: leafSum totals the counts, triSum weights each count by its index i,
// and the reported average is triSum / leafSum.
1431 printf(
"Leaf histogram\n");
1432 unsigned int leafSum = 0;
1433 unsigned int triSum = 0;
1436 printf(
"%d: %d\n", i, tasks.leafHist[i]);
1437 leafSum += tasks.leafHist[i];
1438 triSum += i*tasks.leafHist[i];
1440 printf(
"Leafs total %d, average leaf %.2f\n", leafSum, (
float)triSum/(
float)leafSum);
// m_taskData begins with the int task headers.
1443 int* header = (
int*)m_taskData.
getPtr();
1444 FW::String state =
sprintf(
"BVH Top = %d; Tri Top = %d; Warp counter = %d; ", tasks.nodeTop, tasks.triTop, tasks.warpCounter);
1445 #ifdef BVH_COUNT_NODES
1446 state.
appendf(
"Number of inner nodes = %d; Number of leaves = %d; Sorted tris = %d; ", tasks.numNodes, tasks.numLeaves, tasks.numSortedTris);
1448 printPoolHeader(&tasks, header, numWarps, state);
1450 Debug <<
"\n\nTasks" <<
"\n";
// The TaskBVH records are read from the same buffer at byte offset TASK_SIZE*sizeof(int),
// that is, directly after TASK_SIZE header words.
1451 TaskBVH* task = (TaskBVH*)m_taskData.
getPtr(TASK_SIZE*
sizeof(
int));
1456 long double sumTris = 0;
1457 long double maxTris = 0;
1460 long double cntSortTris = 0;
// Display names for the termination histogram, indexed by the terminatedBy value.
1465 char terminatedNames[TerminatedBy_Max][255] = {
1466 "None",
"Depth",
"TotalLimit",
"OverheadLimit",
"Cost",
"FailureCounter"
1469 int terminatedBy[TerminatedBy_Max];
1470 memset(&terminatedBy,0,
sizeof(
int)*TerminatedBy_Max);
// Only tasks whose nodeIdx or parentIdx differs from TaskHeader_Empty are examined.
1475 if(task[i].nodeIdx != TaskHeader_Empty || task[i].parentIdx != TaskHeader_Empty)
// Guard the histogram index before counting this termination reason.
1478 _ASSERT(task[i].terminatedBy >= 0 && task[i].terminatedBy < TerminatedBy_Max);
1479 terminatedBy[ task[i].terminatedBy ]++;
1482 Debug <<
"Task " << i <<
"\n";
1483 Debug <<
"Header: " << header[i] <<
"\n";
1484 Debug <<
"Unfinished: " << task[i].unfinished <<
"\n";
1485 Debug <<
"Type: " << task[i].type <<
"\n";
1486 Debug <<
"TriStart: " << task[i].triStart <<
"\n";
1487 Debug <<
"TriLeft: " << task[i].triLeft <<
"\n";
1488 Debug <<
"TriRight: " << task[i].triRight <<
"\n";
1489 Debug <<
"TriEnd: " << task[i].triEnd <<
"\n";
1490 Debug <<
"ParentIdx: " << task[i].parentIdx <<
"\n";
1491 Debug <<
"NodeIdx: " << task[i].nodeIdx <<
"\n";
1492 Debug <<
"TaskID: " << task[i].taskID <<
"\n";
1493 Debug <<
"Split: (" << task[i].splitPlane.x <<
", " << task[i].splitPlane.y <<
", " << task[i].splitPlane.z <<
", " << task[i].splitPlane.w <<
")\n";
1494 Debug <<
"Box: (" << task[i].bbox.m_mn.x <<
", " << task[i].bbox.m_mn.y <<
", " << task[i].bbox.m_mn.z <<
") - ("
1495 << task[i].bbox.m_mx.x <<
", " << task[i].bbox.m_mx.y <<
", " << task[i].bbox.m_mx.z <<
")\n";
1500 Debug <<
"Axis: " << task[i].axis <<
"\n";
1501 Debug <<
"Depth: " << task[i].depth <<
"\n";
1502 Debug <<
"Step: " << task[i].step <<
"\n";
1506 #ifdef MALLOC_SCRATCHPAD
1507 Debug <<
"SubFailure: " << task[i].subFailureCounter <<
"\n";
1509 Debug <<
"GMEMSync: " << task[i].sync <<
"\n";
1510 Debug <<
"Parent: " << task[i].parent <<
"\n";
1514 Debug <<
"TerminatedBy: " << task[i].terminatedBy <<
"\n";
// The triangle count (triEnd - triStart) is printed only for tasks that recorded a termination reason.
1516 if(task[i].terminatedBy != TerminatedBy_None)
1517 Debug <<
"Triangles: " << task[i].triEnd - task[i].triStart <<
"\n";
// (int)0xFF800000 is -8388608; what this header threshold means is not visible in this excerpt.
1522 if(header[i] > (
int)0xFF800000)
1525 if(task[i].
depth == m_cutOffDepth)
1528 long double tris = task[i].triEnd - task[i].triStart;
1529 if(task[i].terminatedBy != TerminatedBy_None)
1539 cntSortTris += tris;
1545 maxDepth =
max(task[i].
depth, maxDepth);
1546 syncCount += task[i].sync;
// Reaching the last pool slot is reported as an incomplete result.
1552 if(stackMax == TASK_SIZE-1)
1553 printf(
"\aIncomplete result!\n");
1555 Debug <<
"\n\nStatistics for cutoff depth " << m_cutOffDepth <<
"\n\n";
// NOTE(review): the average divides by sortTasks - confirm it cannot be 0 here.
1561 Debug <<
"Avg naive task height (tris) = " << sumTris/(
long double)sortTasks <<
"\n";
1562 Debug <<
"Max naive task height (tris) = " << maxTris <<
", taskId: " << maxTaskId <<
"\n";
1563 Debug <<
"Cnt sorted operations = " << sortTasks <<
"\n";
// N * log2(N) with N = m_numTris; used to normalise the sorted-triangle count below.
1564 double cntTrisLog2Tris = (double(m_numTris) * (double)(logf(m_numTris)/logf(2.0
f)));
1565 Debug <<
"Cnt sorted triangles = " << cntSortTris <<
"\n";
1566 Debug <<
"Cnt sorted triangles/(N log N), N=#tris = " << cntSortTris/cntTrisLog2Tris <<
"\n";
1568 Debug <<
"Max task depth = " << maxDepth <<
"\n";
1569 Debug <<
"Cnt gmem synchronizations: " << syncCount <<
"\n";
1570 Debug <<
"Leafs failed to subdivide = " << subFailed <<
" (*3) => total useless tasks " << subFailed * 3 <<
"\n";
1571 Debug <<
"Terminated by:" <<
"\n";
1572 for(
int i = 0; i < TerminatedBy_Max; i++)
1574 Debug << terminatedNames[i] <<
": " << terminatedBy[i] <<
"\n";
1578 Debug <<
"max_queue_length = " << stackMax <<
"\n\n" <<
"\n";
// Dumps a ray-tracing task pool (Task records with ray and triangle ranges) to the Debug
// stream and prints summary statistics such as intersection counts and sort volumes.
// tasks    - pool bookkeeping, forwarded to printPoolHeader().
// numWarps - forwarded unchanged to printPoolHeader().
// NOTE(review): the task loop header, several #if / #else / #endif lines and most counter
// declarations (isectTasks, sortTasks, stackMax, maxDepth, syncCount, rayIssues, triIssues,
// subFailed, failureCount, maxTaskId) are on lines not visible in this excerpt.
1583 void CudaPersistentBVHTracer::printPool(TaskStack &tasks,
int numWarps)
// m_taskData begins with the int task headers; no extra state text is passed for this pool type.
1586 int* header = (
int*)m_taskData.
getPtr();
1587 printPoolHeader(&tasks, header, numWarps,
FW::sprintf(
""));
1589 Debug <<
"\n\nTasks" <<
"\n";
// The Task records are read from the same buffer at byte offset TASK_SIZE*sizeof(int).
1590 Task* task = (Task*)m_taskData.
getPtr(TASK_SIZE*
sizeof(
int));
1597 long double sumRays = 0;
1598 long double maxRays = 0;
1599 long double sumTris = 0;
1600 long double maxTris = 0;
1603 long double cntIsect = 0;
1604 long double maxIsect = 0;
1605 long double clippedIsect = 0;
1608 long double cntSortRays = 0;
1609 long double cntClippedRays = 0;
1610 long double cntSortTris = 0;
// Display names for the termination histogram, indexed by the terminatedBy value.
1616 char terminatedNames[TerminatedBy_Max][255] = {
1617 "None",
"Depth",
"TotalLimit",
"OverheadLimit",
"Cost",
"FailureCounter"
1620 int terminatedBy[TerminatedBy_Max];
1621 memset(&terminatedBy,0,
sizeof(
int)*TerminatedBy_Max);
// Only tasks whose depend1 or depend2 differs from TaskHeader_Empty are examined.
1626 if(task[i].depend1 != TaskHeader_Empty || task[i].depend2 != TaskHeader_Empty)
// Guard the histogram index before counting this termination reason.
1629 _ASSERT(task[i].terminatedBy >= 0 && task[i].terminatedBy < TerminatedBy_Max);
1630 terminatedBy[ task[i].terminatedBy ]++;
1633 Debug <<
"Task " << i <<
"\n";
1634 Debug <<
"Header: " << header[i] <<
"\n";
1635 Debug <<
"Unfinished: " << task[i].unfinished <<
"\n";
1636 Debug <<
"Type: " << task[i].type <<
"\n";
1637 Debug <<
"RayStart: " << task[i].rayStart <<
"\n";
1638 Debug <<
"RayEnd: " << task[i].rayEnd <<
"\n";
1639 if(task[i].
type != TaskType_Intersect)
1641 Debug <<
"RayLeft: " << task[i].rayLeft <<
"\n";
1642 Debug <<
"RayRight: " << task[i].rayRight <<
"\n";
1643 Debug <<
"RayActive: " << task[i].rayActive <<
"\n";
1645 #ifdef CLIP_INTERSECT
1646 if(task[i].
type == TaskType_Intersect)
1647 Debug <<
"RayActive: " << task[i].rayActive <<
"\n";
1649 Debug <<
"TriStart: " << task[i].triStart <<
"\n";
1650 Debug <<
"TriEnd: " << task[i].triEnd <<
"\n";
1651 if(task[i].
type != TaskType_Intersect)
1654 Debug <<
"TriLeft: " << task[i].triLeft <<
"\n";
1655 Debug <<
"TriRight: " << task[i].triRight <<
"\n";
1657 Debug <<
"Depend1: " << task[i].depend1 <<
"\n";
1658 Debug <<
"Depend2: " << task[i].depend2 <<
"\n";
1659 if(task[i].
type != TaskType_Intersect)
1661 Debug <<
"Split: (" << task[i].splitPlane.x <<
", " << task[i].splitPlane.y <<
", " << task[i].splitPlane.z <<
", " << task[i].splitPlane.w <<
")\n";
1663 Debug <<
"Box: (" << task[i].bbox.m_mn.x <<
", " << task[i].bbox.m_mn.y <<
", " << task[i].bbox.m_mn.z <<
") - ("
1664 << task[i].bbox.m_mx.x <<
", " << task[i].bbox.m_mx.y <<
", " << task[i].bbox.m_mx.z <<
")\n";
1671 Debug <<
"Depth: " << task[i].depth <<
"\n";
1675 Debug <<
"SubFailure: " << task[i].subFailureCounter <<
"\n";
1676 Debug <<
"GMEMSync: " << task[i].sync <<
"\n";
1677 Debug <<
"TaskID: " << task[i].taskID <<
"\n";
1678 Debug <<
"Parent: " << task[i].parent <<
"\n";
1680 if(task[i].
type == TaskType_AABB_Max)
1681 #elif AABB_TYPE == 3
1682 if(task[i].
type == TaskType_AABB)
1685 Debug <<
"SubtaskIdx: " << task[i].subtaskIdx <<
"\n";
1686 Debug <<
"Clipped rays: " << task[i].rayEnd-task[i].rayActive <<
"\n";
1691 if(task[i].
depth == m_cutOffDepth)
1693 if(task[i].
type == TaskType_Intersect)
// Ray count of an intersect task: with CLIP_INTERSECT only the range up to rayActive is counted.
1695 #ifdef CLIP_INTERSECT
1696 long double locRays = task[i].rayActive - task[i].rayStart;
1698 long double locRays = task[i].rayEnd - task[i].rayStart;
1700 long double locTris = task[i].triEnd - task[i].triStart;
// Work estimate of the task = rays * triangles.
1701 Debug <<
"Intersections: " << locRays * locTris <<
"\n";
1704 if( locRays <
sqrt((
double)locTris) )
1706 if( locTris <
sqrt((
double)locRays) )
1710 Debug <<
"ClippedIntersections: " << task[i].clippedRays * locTris <<
"\n";
1711 clippedIsect += task[i].clippedRays * locTris;
1716 Debug <<
"Clock: " << task[i].clockEnd <<
"\n";
1719 Debug <<
"TerminatedBy: " << task[i].terminatedBy <<
"\n";
1726 if(task[i].
depth == m_cutOffDepth)
1730 #ifdef CLIP_INTERSECT
1731 long double rays = task[i].rayActive - task[i].rayStart;
1733 long double rays = task[i].rayEnd - task[i].rayStart;
1736 long double tris = task[i].triEnd - task[i].triStart;
1737 if(task[i].
type == TaskType_Intersect)
// Accumulate the total and the maximum rays*tris product; maxTaskId follows the current maximum.
1740 cntIsect += rays*tris;
1741 maxIsect = max<long double>(rays*tris, maxIsect);
1742 if(maxIsect==(rays*tris)) maxTaskId = i;
1744 maxRays = max<long double>(rays, maxRays);
1746 maxTris = max<long double>(tris, maxTris);
1747 if(task[i].subFailureCounter > failureCount)
1751 if(task[i].
type == TaskType_AABB_Max)
1752 #elif AABB_TYPE == 3
1753 if(task[i].
type == TaskType_AABB)
1757 cntSortRays += rays;
1758 cntClippedRays += task[i].rayEnd-task[i].rayActive;
1759 cntSortTris += tris;
1766 maxDepth =
max(task[i].
depth, maxDepth);
1767 syncCount += task[i].sync;
// Reaching the last pool slot is reported as an incomplete result.
1772 if(stackMax == TASK_SIZE-1)
1773 printf(
"\aIncomplete result!\n");
1775 Debug <<
"\n\nStatistics for cutoff depth " << m_cutOffDepth <<
"\n\n";
// NOTE(review): the ratios below divide by m_numRays, m_numTris and isectTasks - confirm none can be 0.
// NOTE(review): several string literals from original line 1789 onward are broken across lines
// in this excerpt - verify against the original file.
1781 Debug <<
"ray_obj_intersections per ray = " << cntIsect/m_numRays <<
"\n";
1782 Debug <<
"cnt_leaves = " << isectTasks <<
"\n";
1783 Debug <<
"cnt_leaves per obj = " << (float)isectTasks/(
float)m_numTris << "\n";
1784 Debug << "ray_obj_intersections = " << cntIsect << "\n";
1785 Debug << "Useless ray_obj_intersections = " << clippedIsect << "\n";
1786 Debug << "Avg ray_obj_intersections per leaf = " << cntIsect/(
long double)isectTasks << "\n";
1787 Debug << "Max ray_obj_intersections per leaf = " << maxIsect << ", taskId: " << maxTaskId << "\n";
1788 Debug << "reduction [%] = " << 100.0
f * (cntIsect/((
long double)m_numRays*(
long double)m_numTris)) << "\n";
1789 Debug << "Avg naive task
width (rays) = " << sumRays/(
long double)isectTasks << "\n";
1790 Debug << "Max naive task
width (rays) = " << maxRays << "\n";
1791 Debug << "Avg naive task
height (tris) = " << sumTris/(
long double)isectTasks << "\n";
1792 Debug << "Max naive task
height (tris) = " << maxTris << "\n";
1793 Debug << "Cnt sorted operations = " << sortTasks << "\n";
1794 double cntTrisLog2Tris = (
double(m_numTris) * (
double)(logf(m_numTris)/logf(2.0
f)));
1795 double cntRaysLog2Tris = (
double(m_numRays) * (
double)(logf(m_numTris)/logf(2.0f)));
1796 Debug << "Cnt sorted triangles = " << cntSortTris << "\n";
1797 Debug << "Cnt sorted triangles/(
N log N), N=
#tris = " << cntSortTris/cntTrisLog2Tris << "\n";
1798 Debug <<
"Cnt sorted rays = " << cntSortRays <<
" BEFORE CLIPPING\n";
1799 Debug <<
"Cnt sorted rays/(log N)/R, N=#tris,R=#rays = " << cntSortRays/cntRaysLog2Tris <<
" BEFORE CLIPPING\n";
1800 Debug <<
"Cnt clipped rays = " << cntClippedRays <<
"\n";
1802 Debug <<
"Max task depth = " << maxDepth <<
"\n";
1803 Debug <<
"Cnt gmem synchronizations: " << syncCount <<
"\n";
1804 Debug <<
"Ray issues = " << rayIssues <<
", tris issues = " << triIssues <<
"\n";
1805 Debug <<
"Leafs failed to subdivide = " << subFailed <<
" (*3) => total useless tasks " << subFailed * 3 <<
"\n";
1807 Debug <<
"Terminated by:" <<
"\n";
1808 for(
int i = 0; i < TerminatedBy_Max; i++)
1810 Debug << terminatedNames[i] <<
": " << terminatedBy[i] <<
"\n";
1814 Debug <<
"max_queue_length = " << stackMax <<
"\n\n" <<
"\n";
// Sets up the root task and the task-pool bookkeeping, launches the trace kernel and checks
// prefix-scan results against values computed on the CPU.
// The caller stores the returned F32 in m_gpuTime; the return statement itself is not
// visible in this excerpt.
// NOTE(review): `rb` is not used by the statements visible here, and many declarations
// (in, all, tasks, kernel, bbox, numWarpsPerBlock, numBlocksPerSM) and the #if / #else /
// #endif lines that select between the alternatives below are on elided lines.
1819 F32 CudaPersistentBVHTracer::traceCudaRayBuffer(
RayBuffer& rb)
1825 fail(
"Trace kernel not found!");
1833 in.numRays = m_numRays;
1834 in.numTris = m_numTris;
// Reference run: the __naive kernel is launched with 1024 threads per block and enough
// blocks to cover m_numRays (round-up division).
1850 kernel = m_module->
getKernel(
"__naive");
1852 fail(
"Trace kernel not found!");
1854 Vec2i blockSizeN(1024, 1);
1855 Vec2i gridSizeN((m_numRays+1023)/1024, 1);
1857 float tNaive = m_module->launchKernelTimed(kernel, blockSizeN, gridSizeN);
1859 printf(
"Verifying GPU trace\n");
// Copy the scene bounds into the task bounding box as raw float3 values.
1878 memcpy(&bbox.m_mn, &m_bbox.min,
sizeof(float3));
1879 memcpy(&bbox.m_mx, &m_bbox.max,
sizeof(float3));
// Root task `all`: its ray and triangle ranges end at m_numRays and m_numTris.
1885 all.rayRight = m_numRays;
1886 all.rayEnd = m_numRays;
1889 all.triRight = m_numTris;
1890 all.triEnd = m_numTris;
1893 all.depend1 = DependType_Root;
1894 all.depend2 = DependType_None;
1895 all.lock = LockType_Free;
1896 all.bestCost = 1e38f;
1898 all.subFailureCounter = 0;
// The split axis of the root task is the major axis of the scene bounding-box diagonal.
1899 Vector3
size = m_bbox.Diagonal();
1900 all.axis = size.MajorAxis();
1901 all.terminatedBy = TerminatedBy_None;
1906 all.clippedRays = 0;
// Initial task type depends on SCAN_TYPE / SPLIT_TYPE (first condition not visible).
1913 all.type = TaskType_Sort_PPS1;
1914 #elif SCAN_TYPE == 1
1915 all.type = TaskType_Sort_PPS1_Up;
1916 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
1917 all.type = TaskType_Sort_SORT1;
// One subtask per warp-sized chunk of rays plus one per warp-sized chunk of triangles.
1920 all.unfinished = warpSubtasks(m_numRays) + warpSubtasks(m_numTris);
1921 all.bestOrder = warpSubtasks(m_numRays);
// Split plane through the middle of the box along all.axis, stored as (normal, -position).
1922 float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
1924 all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
1925 else if(all.axis == 1)
1926 all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
1928 all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
1930 all.type = TaskType_Split;
// NOTE(review): numPlanes divides by evaluatedCandidates, which is 0 when both counts are 0 - confirm.
1932 int evaluatedCandidates = (int)sqrtf(m_numRays) + (int)sqrtf(m_numTris);
1933 int numPlanes = 0.5f * (m_numRays + m_numTris)/evaluatedCandidates;
1934 all.unfinished = warpSubtasks(numPlanes);
1935 #elif SPLIT_TYPE == 2
1937 #elif SPLIT_TYPE == 3
1938 all.type = TaskType_SplitParallel;
1939 int evaluatedRays = warpSubtasks((
int)sqrtf(m_numRays));
1940 int evaluatedTris = warpSubtasks((
int)sqrtf(m_numTris));
1941 all.unfinished = PLANE_COUNT*(evaluatedRays+evaluatedTris);
1946 all.type = TaskType_Sort_PPS1_Up;
1947 int pRays = warpSubtasks(m_numRays);
1948 all.bestOrder = pRays;
1949 int pTris = warpSubtasks(m_numTris);
1950 all.unfinished = pRays+pTris;
1953 all.origSize = all.unfinished;
// Upload the root task right after the TASK_SIZE header words, and its unfinished count
// into header slot 0.
1955 m_taskData.
setRange(TASK_SIZE *
sizeof(
int), &all,
sizeof(Task));
1958 m_taskData.
setRange(0, &all.unfinished,
sizeof(
int));
// Byte value -1 sets every int of the active array to -1; entry 0 then refers to task 0.
1967 memset(tasks.active, -1,
sizeof(
int)*(ACTIVE_MAX+1));
1968 tasks.active[0] = 0;
1971 tasks.activeTop = 1;
1976 memset(tasks.empty, 0,
sizeof(
int)*(EMPTY_MAX+1));
1978 tasks.emptyBottom = 0;
1979 tasks.unfinished = -1;
// Capacities are derived from the buffer sizes in bytes.
1981 tasks.sizeNodes = m_bvhData.
getSize()/
sizeof(CudaKdtreeNode);
1982 tasks.sizeTris = m_trisIndexOut.
getSize()/
sizeof(
S32);
// Two alternative launch configurations: a single warp, or numWarpsPerBlock warps per block
// on NUM_SM*numBlocksPerSM blocks. NOTE(review): the selecting condition is not visible.
1986 Vec2i blockSize(WARP_SIZE, 1);
1987 Vec2i gridSize(1, 1);
1992 Vec2i blockSize(WARP_SIZE, numWarpsPerBlock);
1993 int gridSizeX = NUM_SM*numBlocksPerSM;
1994 int numWarps = numWarpsPerBlock*gridSizeX;
1995 Vec2i gridSize(gridSizeX, 1);
// Warn when the compile-time NUM_WARPS differs from the number of warps actually launched.
1997 if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
1998 printf(
"\aNUM_WARPS constant does not match the launch parameters\n");
// One float4 debug record per launched warp (blockSize.y warps per block times gridSize.x blocks).
2001 m_debug.
resizeDiscard(blockSize.y*gridSize.x*
sizeof(float4));
2006 float tKernel = m_module->launchKernelTimed(kernel, blockSize, gridSize);
// Verification of the prefix-scan output for triangles, then for rays; mismatches and
// out-of-range values are reported on cout.
2021 for(
int i=0;i<m_numTris;i++)
2026 cout <<
"PPS error for item " << i <<
", CPU=" << sum <<
", GPU=" << *ptout <<
" for " << m_numTris <<
" triangles!" <<
"\n";
// Triangle values outside [-1, 1] are reported.
2032 if(*stout < -1 || *stout > 1)
2034 cout <<
"\nWTF " << i <<
" of " << m_numTris <<
": " << *stout <<
"!\n" <<
"\n";
2043 for(
int i=0;i<m_numRays;i++)
2048 cout <<
"PPS error for item " << i <<
", CPU=" << sum <<
", GPU=" << *prout <<
" for " << m_numRays <<
" rays!" <<
"\n";
// Ray values outside [-1, 2] are reported.
2054 if(*srout < -1 || *srout > 2)
2056 cout <<
"\nWTF " << i <<
" of " << m_numRays <<
": " << *srout <<
"!\n" <<
"\n";
2065 cout <<
"PPS correct for " << m_numTris <<
" triangles and " << m_numRays <<
" rays!" <<
"\n";
2110 printPool(tasks, numWarps);
// Host-side driver that prepares the buffers and the root task for the persistent BVH
// build kernel, launches it with timing, and sanity-checks the task stack afterwards.
// NOTE(review): this listing is sparse - source lines are missing between the numbered
// lines - so the comments only describe statements that are actually visible here.
// Returns F32; the return statement is not visible (presumably the measured time - confirm).
2131 F32 CudaPersistentBVHTracer::buildCudaBVH()
2136 fail(
"Build kernel not found!");
2138 #ifdef MALLOC_SCRATCHPAD
2140 in.numTris = m_numTris;
2143 #ifdef COMPACT_LAYOUT
2152 #ifndef MALLOC_SCRATCHPAD
2155 in.numTris = m_numTris;
2162 #ifdef COMPACT_LAYOUT
// Launch "allocFreeableMemory" with a 1x1 block on a 1x1 grid; its two int parameters
// are m_numTris and 0.
2167 CUfunction kernelAlloc = m_module->
getKernel(
"allocFreeableMemory", 2*
sizeof(
int));
2169 fail(
"Memory allocation kernel not found!");
2172 offset += m_module->
setParami(kernelAlloc, offset, m_numTris);
2173 offset += m_module->
setParami(kernelAlloc, offset, 0);
2174 F32 allocTime = m_module->launchKernelTimed(kernelAlloc,
Vec2i(1,1),
Vec2i(1, 1));
2177 printf(
"Memory allocated in %f\n", allocTime);
// Launch "MemCpyIndex" over memSize ints (byte size of m_trisIndex / sizeof(int)):
// 256-thread blocks, ceil(memSize / 256) blocks.
2180 CUfunction kernelMemCpyIndex = m_module->
getKernel(
"MemCpyIndex",
sizeof(CUdeviceptr)+
sizeof(
int));
2181 if (!kernelMemCpyIndex)
2182 fail(
"Memory copy kernel not found!");
2184 int memSize = m_trisIndex.
getSize()/
sizeof(int);
2187 offset += m_module->
setParami(kernelMemCpyIndex, offset, memSize);
2188 F32 memcpyTime = m_module->launchKernelTimed(kernelMemCpyIndex,
Vec2i(256,1),
Vec2i((memSize-1+256)/256, 1));
2191 printf(
"Triangle indices copied in %f\n", memcpyTime);
2196 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2197 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
// Reset value for one split: both children start with an inverted bbox
// (min = +FLT_MAX, max = -FLT_MAX) and a zero count.
2199 for(
int i = 0; i < 2; i++)
2201 split.children[i].bbox.m_mn = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
2202 split.children[i].bbox.m_mx = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
2203 split.children[i].cnt = 0;
// Replicate the reset split into every [warp][plane] slot of the split array.
2207 for(
int i = 0; i < NUM_WARPS; i++)
2209 for(
int j = 0; j < PLANE_COUNT; j++)
2210 sArray.splits[i][j] = split;
// Alternative layout (selecting preprocessor line not visible): the bbox bounds are stored
// as ints produced by floatToOrderedInt and there is one split per plane.
2214 for(
int i = 0; i < 2; i++)
2218 split.children[i].bbox.m_mn = make_int3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
2219 split.children[i].bbox.m_mx = make_int3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
2220 split.children[i].cnt = 0;
2224 for(
int j = 0; j < PLANE_COUNT; j++)
2225 sArray.splits[j] = split;
// Upload the initialised split array to the start of m_splitData ...
2227 m_splitData.
setRange(0, &sArray,
sizeof(SplitArray));
// ... and again at byte offset TASK_SIZE * sizeof(SplitArray) (purpose of the second copy
// is not visible in this listing).
2230 m_splitData.
setRange(TASK_SIZE *
sizeof(SplitArray), &sArray,
sizeof(SplitArray));
// Raw copy of the scene bounds into the float3-based bbox (sizeof(float3) bytes each).
2234 memcpy(&bbox.m_mn, &m_bbox.min,
sizeof(float3));
2235 memcpy(&bbox.m_mx, &m_bbox.max,
sizeof(float3));
// Root task ("all"): covers all m_numTris triangles, starts unlocked with a very large
// best cost.
2241 #ifndef MALLOC_SCRATCHPAD
2242 all.triRight = m_numTris;
2246 all.triEnd = m_numTris;
2249 all.lock = LockType_Free;
2250 all.bestCost = 1e38f;
2252 all.dynamicMemory= 0;
2253 #ifndef MALLOC_SCRATCHPAD
// Initial split axis is the major axis of the scene bbox diagonal.
2259 Vector3 size = m_bbox.Diagonal();
2260 all.axis = size.MajorAxis();
2261 all.terminatedBy = TerminatedBy_None;
// The first task type depends on SCAN_TYPE / SPLIT_TYPE / BINNING_TYPE.
2271 all.type = TaskType_Sort_PPS1;
2272 #elif SCAN_TYPE == 1
2273 all.type = TaskType_Sort_PPS1_Up;
2274 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
2275 all.type = TaskType_Sort_SORT1;
2277 all.unfinished = warpSubtasks(m_numTris);
// Split position is the midpoint of the bbox along the chosen axis; the plane is stored
// as (axis unit normal, -pos).
2278 float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
2280 all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
2281 else if(all.axis == 1)
2282 all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
2284 all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
2285 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2286 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
2287 all.type = TaskType_InitMemory;
2288 all.unfinished = warpSubtasks(
sizeof(SplitArray)/
sizeof(
int));
2290 all.type = TaskType_BinTriangles;
// Ceil-divide the warp subtask count by BIN_MULTIPLIER.
2291 all.unfinished = (warpSubtasks(m_numTris)+BIN_MULTIPLIER-1)/BIN_MULTIPLIER;
2296 all.origSize = all.unfinished;
// Upload the root task descriptor at byte offset TASK_SIZE * sizeof(int) of m_taskData ...
2298 m_taskData.
setRange(TASK_SIZE *
sizeof(
int), &all,
sizeof(TaskBVH));
// ... and its unfinished counter into the first int of the same buffer.
2301 m_taskData.
setRange(0, &all.unfinished,
sizeof(
int));
// Task stack setup: memset with -1 makes every int of the active list -1; entry 0 is then
// pointed at task 0 and activeTop set to 1.
2312 memset(tasks.active, -1,
sizeof(
int)*(ACTIVE_MAX+1));
2313 tasks.active[0] = 0;
2316 tasks.activeTop = 1;
2321 memset(tasks.empty, 0,
sizeof(
int)*(EMPTY_MAX+1));
2323 tasks.emptyBottom = 0;
2324 tasks.unfinished = -1;
2325 tasks.numSortedTris = 0;
2327 tasks.numLeaves = 0;
2328 tasks.numEmptyLeaves = 0;
// NOTE(review): the node capacity here divides by sizeof(CudaKdtreeNode) while the
// post-launch check below divides by sizeof(CudaBVHNode) - confirm both sizes match.
2330 tasks.sizeNodes = m_bvhData.
getSize()/
sizeof(CudaKdtreeNode);
2331 tasks.sizeTris = m_trisIndexOut.
getSize()/
sizeof(
S32);
2332 memset(tasks.leafHist, 0,
sizeof(tasks.leafHist));
2334 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
// Two alternative launch configurations (the selecting preprocessor lines are not visible):
// a single warp in a single block ...
2345 Vec2i blockSize(WARP_SIZE, 1);
2346 Vec2i gridSize(1, 1);
// ... or numWarpsPerBlock warps per block on NUM_SM * numBlocksPerSM blocks.
2351 Vec2i blockSize(WARP_SIZE, numWarpsPerBlock);
2352 int gridSizeX = NUM_SM*numBlocksPerSM;
2353 int numWarps = numWarpsPerBlock*gridSizeX;
2354 Vec2i gridSize(gridSizeX, 1);
// Only warns (with a terminal bell) when the compile-time NUM_WARPS disagrees with the launch.
2356 if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
2357 printf(
"\aNUM_WARPS constant does not match the launch parameters\n");
// One float4 of debug storage per launched warp (blockSize.y * gridSize.x).
2360 m_debug.
resizeDiscard(blockSize.y*gridSize.x*
sizeof(float4));
2365 float tKernel = m_module->launchKernelTimed(kernel, blockSize, gridSize);
// Verification output: reports a mismatch between the CPU value (sum) and the GPU value
// for an item, and any value that is neither 0 nor 1.
// The surrounding comparison logic is not visible in this listing.
2387 for(
int i=0;i<m_numTris;i++)
2392 cout <<
"PPS error for item " << i <<
", CPU=" << sum <<
", GPU=" << *pout <<
" for " << m_numTris <<
" triangles!" <<
"\n";
2398 if(*sout != 0 && *sout != 1)
2400 cout <<
"\nWTF " << i <<
" of " << m_numTris <<
": " << *sout <<
"!\n" <<
"\n";
2409 cout <<
"PPS correct for " << m_numTris <<
" triangles!" <<
"\n";
// Read the task stack back from the module global "g_taskStackBVH".
2413 tasks = *(TaskStackBVH*)m_module->
getGlobal(
"g_taskStackBVH").
getPtr();
// True when work remains or any of the pool / node / triangle tops exceeds its capacity;
// the handling branch is not visible in this listing.
2414 if(tasks.unfinished != 0 || tasks.top > tasks.sizePool || tasks.nodeTop > m_bvhData.
getSize() /
sizeof(CudaBVHNode) || tasks.triTop > m_trisIndexOut.
getSize() /
sizeof(
S32))
2421 printPool(tasks, numWarps);
// Host-side driver for the persistent kd-tree build kernel: derives the depth / failure
// limits, sets up dynamic memory, the split data and the root task, launches the kernel
// with timing and checks the task stack read back from the device.
// NOTE(review): this listing is sparse - source lines are missing between the numbered
// lines - so the comments only describe statements that are actually visible here.
// Returns F32; the return statement is not visible (presumably the measured time - confirm).
2443 F32 CudaPersistentBVHTracer::buildCudaKdtree()
2448 fail(
"Build kernel not found!");
2451 in.numTris = m_numTris;
2455 #ifndef INTERLEAVED_LAYOUT
// Depth limit is linear in log2(triangle count); the failure count is linear in that depth.
// k1, k2, f1, f2 are defined on lines not visible here.
2468 cudaEnv.optMaxDepth = k1 *
log2((
F32)m_numTris) + k2;
2469 cudaEnv.failureCount = f1 * cudaEnv.optMaxDepth + f2;
2471 printf(
"Maximum depth = %d\n", cudaEnv.optMaxDepth);
2472 printf(
"Failure count = %d\n", cudaEnv.failureCount);
// setDynamicMemory() returns the base offset later stored in the root task.
2475 int baseOffset = setDynamicMemory();
2479 m_splitData.
clearRange(0, 0,
sizeof(SplitInfoTri));
2480 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2481 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
// Reset value for one split: both children start with an inverted bbox
// (min = +FLT_MAX, max = -FLT_MAX) and a zero count.
2483 for(
int i = 0; i < 2; i++)
2485 split.children[i].bbox.m_mn = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
2486 split.children[i].bbox.m_mx = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
2487 split.children[i].cnt = 0;
// Replicate the reset split into every [warp][plane] slot of the split array.
2491 for(
int i = 0; i < NUM_WARPS; i++)
2493 for(
int j = 0; j < PLANE_COUNT; j++)
2494 sArray.splits[i][j] = split;
2512 m_splitData.
clearRange(0, 0,
sizeof(SplitInfoTri));
// Raw copy of the scene bounds into the float3-based bbox (sizeof(float3) bytes each).
2525 memcpy(&bbox.m_mn, &m_bbox.min,
sizeof(float3));
2526 memcpy(&bbox.m_mx, &m_bbox.max,
sizeof(float3));
// Root task ("all"): covers all m_numTris triangles, starts unlocked with a very large
// best cost and remembers the dynamic-memory base offset.
2539 all.triEnd = m_numTris;
2542 all.lock = LockType_Free;
2543 all.bestCost = 1e38f;
2545 all.dynamicMemory= baseOffset;
2546 #ifdef MALLOC_SCRATCHPAD
2547 all.subFailureCounter = 0;
// Initial split axis is the major axis of the scene bbox diagonal.
2552 Vector3 size = m_bbox.Diagonal();
2553 all.axis = size.MajorAxis();
2554 all.terminatedBy = TerminatedBy_None;
// The first task type depends on SCAN_TYPE / SPLIT_TYPE / BINNING_TYPE.
2564 all.type = TaskType_Sort_PPS1;
2565 #elif SCAN_TYPE == 1
2566 all.type = TaskType_Sort_PPS1_Up;
2567 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
2568 all.type = TaskType_Sort_SORT1;
2570 all.unfinished = warpSubtasks(m_numTris);
// Split position is the midpoint of the bbox along the chosen axis; the plane is stored
// as (axis unit normal, -pos).
2571 float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
2573 all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
2574 else if(all.axis == 1)
2575 all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
2577 all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
2578 #elif SPLIT_TYPE == 1
2579 all.type = TaskType_Split;
// Only the #else branch below is compiled (#if 0 / #elif 0): six candidate planes per triangle.
2580 #if 0 // SQRT candidates
2581 int evaluatedCandidates = (int)sqrtf(m_numTris);
2582 int evaluatedCandidates = 1;
2583 int numPlanes = 0.5f * m_numTris/evaluatedCandidates;
2584 #elif 0 // Fixed candidates
2585 int numPlanes = 32768;
2586 #else // All candidates
2587 int numPlanes = m_numTris*6;
2589 all.unfinished = warpSubtasks(numPlanes);
2590 #elif SPLIT_TYPE == 2
2591 all.type = TaskType_Split;
2593 #elif SPLIT_TYPE == 3
2594 all.type = TaskType_SplitParallel;
// Work size: PLANE_COUNT times the warp subtasks for sqrt(#rays) plus sqrt(#tris) items.
2595 int evaluatedRays = warpSubtasks((
int)sqrtf(m_numRays));
2596 int evaluatedTris = warpSubtasks((
int)sqrtf(m_numTris));
2597 all.unfinished = PLANE_COUNT*(evaluatedRays+evaluatedTris);
2598 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2599 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
2600 all.type = TaskType_InitMemory;
2601 all.unfinished = warpSubtasks(
sizeof(SplitArray)/
sizeof(
int));
2603 all.type = TaskType_BinTriangles;
// Ceil-divide the warp subtask count by BIN_MULTIPLIER.
2604 all.unfinished = (warpSubtasks(m_numTris)+BIN_MULTIPLIER-1)/BIN_MULTIPLIER;
2609 all.origSize = all.unfinished;
// Upload the root task descriptor at byte offset TASK_SIZE * sizeof(int) of m_taskData ...
2611 m_taskData.
setRange(TASK_SIZE *
sizeof(
int), &all,
sizeof(TaskBVH));
// ... and its unfinished counter into the first int of the same buffer.
2614 m_taskData.
setRange(0, &all.unfinished,
sizeof(
int));
2620 #ifndef INTERLEAVED_LAYOUT
// Here nodeTop starts at sizeof(CudaKdtreeNode), i.e. it is a byte offset past one node
// (which preprocessor branch this belongs to is not visible).
2623 tasks.nodeTop =
sizeof(CudaKdtreeNode);
// Task stack setup: memset with -1 makes every int of the active list -1; entry 0 is then
// pointed at task 0 and activeTop set to 1.
2629 memset(tasks.active, -1,
sizeof(
int)*(ACTIVE_MAX+1));
2630 tasks.active[0] = 0;
2633 tasks.activeTop = 1;
2638 memset(tasks.empty, 0,
sizeof(
int)*(EMPTY_MAX+1));
2640 tasks.emptyBottom = 0;
2641 tasks.unfinished = -1;
2642 tasks.numSortedTris = 0;
2644 tasks.numEmptyLeaves = 0;
2645 tasks.numLeaves = 0;
// Capacities expressed in elements: nodes of CudaKdtreeNode and S32 triangle indices.
2647 tasks.sizeNodes = m_bvhData.
getSize()/
sizeof(CudaKdtreeNode);
2648 tasks.sizeTris = m_trisIndexOut.
getSize()/
sizeof(
S32);
2649 memset(tasks.leafHist, 0,
sizeof(tasks.leafHist));
// Two alternative launch configurations (the selecting preprocessor lines are not visible):
// a single warp in a single block ...
2656 Vec2i blockSize(WARP_SIZE, 1);
2657 Vec2i gridSize(1, 1);
// ... or numWarpsPerBlock warps per block on NUM_SM * numBlocksPerSM blocks.
2662 Vec2i blockSize(WARP_SIZE, numWarpsPerBlock);
2663 int gridSizeX = NUM_SM*numBlocksPerSM;
2664 int numWarps = numWarpsPerBlock*gridSizeX;
2665 Vec2i gridSize(gridSizeX, 1);
// Only warns (with a terminal bell) when the compile-time NUM_WARPS disagrees with the launch.
2667 if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
2668 printf(
"\aNUM_WARPS constant does not match the launch parameters\n");
// One float4 of debug storage per launched warp (blockSize.y * gridSize.x).
2671 m_debug.
resizeDiscard(blockSize.y*gridSize.x*
sizeof(float4));
// Kernel time is accumulated with +=, so more than one timed launch may contribute
// (other launches are not visible here).
2676 float tKernel = 0.f;
2677 #ifndef DUPLICATE_REFERENCES
2680 tKernel += m_module->launchKernelTimed(kernel, blockSize, gridSize);
// Verification output: reports a mismatch between the CPU value (sum) and the GPU value
// for an item, and any value that is neither 0 nor 1.
// The surrounding comparison logic is not visible in this listing.
2702 for(
int i=0;i<m_numTris;i++)
2707 cout <<
"PPS error for item " << i <<
", CPU=" << sum <<
", GPU=" << *pout <<
" for " << m_numTris <<
" triangles!" <<
"\n";
2713 if(*sout != 0 && *sout != 1)
2715 cout <<
"\nWTF " << i <<
" of " << m_numTris <<
": " << *sout <<
"!\n" <<
"\n";
2724 cout <<
"PPS correct for " << m_numTris <<
" triangles!" <<
"\n";
// Read the task stack back from the module global "g_taskStackBVH".
2728 tasks = *(TaskStackBVH*)m_module->
getGlobal(
"g_taskStackBVH").
getPtr();
// Two variants of the post-launch check: the first compares nodeTop against a node count,
// the second compares it against the raw byte size of m_bvhData.
2729 #ifndef INTERLEAVED_LAYOUT
2730 if(tasks.unfinished != 0 || tasks.top > tasks.sizePool || tasks.nodeTop > m_bvhData.
getSize() /
sizeof(CudaKdtreeNode) || tasks.triTop > m_trisIndexOut.
getSize() /
sizeof(
S32))
2732 if(tasks.unfinished != 0 || tasks.nodeTop > m_bvhData.
getSize())
2740 printPool(tasks, numWarps);
// Test harness for the "testKeplerSort" kernel from persistent_test.cu: fills an index
// array of arraySize entries in descending order, launches the kernel through the task
// pool, reports sort errors and dumps per-task statistics to the Debug stream.
// arraySize: number of elements to sort.
// NOTE(review): this listing is sparse - source lines are missing between the numbered
// lines - so the comments only describe statements that are actually visible here.
// Returns F32; the return statement is not visible (presumably the kernel time - confirm).
2762 F32 CudaPersistentBVHTracer::testSort(
S32 arraySize)
2764 m_compiler.
setSourceFile(
"src/rt/kernels/persistent_test.cu");
2765 m_module = m_compiler.
compile();
2771 kernel = m_module->
getKernel(
"testKeplerSort");
2773 fail(
"Sort kernel not found!");
// Input is written in descending order: element i gets the value (arraySize-1) - i.
2793 for(
int i=0; i < arraySize; i++)
2796 *tiout = (arraySize-1) - i;
2802 in.numTris = arraySize;
// Root task covers the whole array, with the pivot at the middle value.
2811 all.triEnd = arraySize;
2815 all.bestCost = 1e38f;
2820 all.pivot = arraySize / 2;
2828 all.type = TaskType_Sort_PPS1;
2829 all.unfinished = warpSubtasks(arraySize);
2830 all.origSize = all.unfinished;
// Upload the root task descriptor at byte offset TASK_SIZE * sizeof(int) of m_taskData ...
2832 m_taskData.
setRange(TASK_SIZE *
sizeof(
int), &all,
sizeof(TaskBVH));
// ... and its unfinished counter into the first int of the same buffer.
2835 m_taskData.
setRange(0, &all.unfinished,
sizeof(
int));
// NOTE(review): the active list is zero-filled here, whereas the build functions in this
// file fill it with -1 and then set entry 0 - confirm the difference is intentional.
2844 memset(tasks.active, 0,
sizeof(
int)*(ACTIVE_MAX+1));
2845 tasks.activeTop = 1;
2851 tasks.emptyBottom = 0;
2852 tasks.unfinished = -1;
2854 tasks.sizeNodes = m_bvhData.
getSize()/
sizeof(CudaKdtreeNode);
2855 tasks.sizeTris = m_trisIndexOut.
getSize()/
sizeof(
S32);
// Two alternative launch configurations (the selecting preprocessor lines are not visible):
// a single warp in a single block ...
2859 Vec2i blockSize(WARP_SIZE, 1);
2860 Vec2i gridSize(1, 1);
// ... or numWarpsPerBlock warps per block on NUM_SM * numBlocksPerSM blocks.
2864 Vec2i blockSize(WARP_SIZE, numWarpsPerBlock);
2865 int gridSizeX = NUM_SM*numBlocksPerSM;
2866 Vec2i gridSize(gridSizeX, 1);
// Only warns (with a terminal bell) when the compile-time NUM_WARPS disagrees with the launch.
2868 if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
2869 printf(
"\aNUM_WARPS constant does not match the launch parameters\n");
// One float4 of debug storage per launched warp (blockSize.y * gridSize.x).
2872 m_debug.
resizeDiscard(blockSize.y*gridSize.x*
sizeof(float4));
// The three trailing arguments (false, 0, false) are passed to launchKernelTimed; their
// meaning is defined by that method and is not visible here.
2877 float tKernel = m_module->launchKernelTimed(kernel, blockSize, gridSize,
false, 0,
false);
// Result check: prints the found value next to the expected index i for each mismatch
// (the comparison itself is not visible in this listing).
2885 for(
int i=0; i < arraySize; i++)
2889 printf(
"Sort error %d instead of %d\n", *tsort, i);
2895 Debug <<
"\nSort in " << tKernel <<
"\n\n";
// Read the task stack back from the module global "g_taskStackBVH"; the pool header ints
// live at the start of m_taskData.
2897 tasks = *(TaskStackBVH*)m_module->
getGlobal(
"g_taskStackBVH").
getPtr();
2898 int* header = (
int*)m_taskData.
getPtr();
2899 printPoolHeader(&tasks, header, blockSize.y*gridSize.x,
sprintf(
""));
2901 Debug <<
"\n\nTasks" <<
"\n";
// Task descriptors start at byte offset TASK_SIZE * sizeof(int), after the header ints.
2902 TaskBVH* task = (TaskBVH*)m_taskData.
getPtr(TASK_SIZE*
sizeof(
int));
2907 long double sumTris = 0;
2908 long double maxTris = 0;
2911 long double cntSortTris = 0;
// Dump every task whose nodeIdx or parentIdx is not TaskHeader_Empty.
2917 if(task[i].nodeIdx != TaskHeader_Empty || task[i].parentIdx != TaskHeader_Empty)
2919 Debug <<
"Task " << i <<
"\n";
2920 Debug <<
"Header: " << header[i] <<
"\n";
2921 Debug <<
"Unfinished: " << task[i].unfinished <<
"\n";
2922 Debug <<
"Type: " << task[i].type <<
"\n";
2923 Debug <<
"TriStart: " << task[i].triStart <<
"\n";
2924 Debug <<
"TriEnd: " << task[i].triEnd <<
"\n";
2925 Debug <<
"TriRight: " << task[i].triRight <<
"\n";
2926 Debug <<
"ParentIdx: " << task[i].parentIdx <<
"\n";
2927 Debug <<
"NodeIdx: " << task[i].nodeIdx <<
"\n";
2928 Debug <<
"TaskID: " << task[i].taskID <<
"\n";
2929 Debug <<
"Depth: " << task[i].depth <<
"\n";
2934 Debug <<
"GMEMSync: " << task[i].sync <<
"\n";
2935 Debug <<
"Parent: " << task[i].parent <<
"\n";
2937 Debug <<
"Triangles: " << task[i].triEnd - task[i].triStart <<
"\n";
2938 Debug <<
"Pivot: " << task[i].pivot <<
"\n";
// Statistics: tasks at the cut-off depth are singled out; the element count of a task is
// triEnd - triStart.
2944 if(task[i].
depth == m_cutOffDepth)
2947 long double tris = task[i].triEnd - task[i].triStart;
2955 cntSortTris += tris;
2961 maxDepth =
max(task[i].
depth, maxDepth);
2962 syncCount += task[i].sync;
// Warn when the highest used task slot is the last one in the pool.
2967 if(stackMax == TASK_SIZE-1)
2968 printf(
"\aIncomplete result!\n");
2970 Debug <<
"\n\nStatistics for cutoff depth " << m_cutOffDepth <<
"\n\n";
2976 Debug <<
"Avg naive task height (tris) = " << sumTris/(
long double)sortTasks <<
"\n";
2977 Debug <<
"Max naive task height (tris) = " << maxTris <<
", taskId: " << maxTaskId <<
"\n";
2978 Debug <<
"Cnt sorted operations = " << sortTasks <<
"\n";
// N * log2(N) for N = arraySize, used to normalise the number of sorted elements.
2979 double cntTrisLog2Tris = (double(arraySize) * (double)(logf(arraySize)/logf(2.0f)));
2980 Debug <<
"Cnt sorted triangles = " << cntSortTris <<
"\n";
2981 Debug <<
"Cnt sorted triangles/(N log N), N=#tris = " << cntSortTris/cntTrisLog2Tris <<
"\n";
2983 Debug <<
"Max task depth = " << maxDepth <<
"\n";
2984 Debug <<
"Cnt gmem synchronizations: " << syncCount <<
"\n";
2985 Debug <<
"Leafs failed to subdivide = " << subFailed <<
" (*3) => total useless tasks " << subFailed * 3 <<
"\n";
2988 Debug <<
"max_queue_length = " << stackMax <<
"\n\n" <<
"\n";
// Host-side driver for the on-demand BVH kernel: sets the subdivision threshold, the
// trace inputs, the split data and the root build task, launches the kernel with timing
// and checks the task stack read back from the device.
// rays: ray buffer; its size seeds tasks.warpCounter.
// rebuild: use is not visible in this listing.
// NOTE(review): this listing is sparse - source lines are missing between the numbered
// lines - so the comments only describe statements that are actually visible here.
// Returns F32; the return statement is not visible (presumably the measured time - confirm).
2993 F32 CudaPersistentBVHTracer::traceOnDemandBVHRayBuffer(
RayBuffer& rays,
bool rebuild)
2998 fail(
"Build kernel not found!");
// Threshold = (scene bbox surface area / ray count) * (optCt / 10).
3007 cudaEnv.subdivThreshold = (m_bbox.SurfaceArea() / (float)m_numRays) * ((float)cudaEnv.optCt/10.0f);
3011 inBVH.numTris = m_numTris;
3018 #ifdef COMPACT_LAYOUT
// Device pointers of the node and triangle buffers; nodeOfsA / triOfsA (defined on lines
// not visible here) supply the .x offsets added below.
3024 CUdeviceptr nodePtr = m_bvhData.
getCudaPtr();
3025 CUdeviceptr triPtr = m_trisCompact.
getCudaPtr();
3026 Buffer& indexBuf = m_trisIndex;
3036 in.nodesA = nodePtr + nodeOfsA.x;
3037 in.trisA = triPtr + triOfsA.x;
3044 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3045 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
// Reset value for one split: both children start with an inverted bbox
// (min = +FLT_MAX, max = -FLT_MAX) and a zero count.
3047 for(
int i = 0; i < 2; i++)
3049 split.children[i].bbox.m_mn = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
3050 split.children[i].bbox.m_mx = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
3051 split.children[i].cnt = 0;
// Replicate the reset split into every [warp][plane] slot of the split array.
3055 for(
int i = 0; i < NUM_WARPS; i++)
3057 for(
int j = 0; j < PLANE_COUNT; j++)
3058 sArray.splits[i][j] = split;
// Alternative layout (selecting preprocessor line not visible): the bbox bounds are stored
// as ints produced by floatToOrderedInt and there is one split per plane.
3062 for(
int i = 0; i < 2; i++)
3066 split.children[i].bbox.m_mn = make_int3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
3067 split.children[i].bbox.m_mx = make_int3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
3068 split.children[i].cnt = 0;
3072 for(
int j = 0; j < PLANE_COUNT; j++)
3073 sArray.splits[j] = split;
// Upload the initialised split array to the start of m_splitData ...
3075 m_splitData.
setRange(0, &sArray,
sizeof(SplitArray));
// ... and again at byte offset TASK_SIZE * sizeof(SplitArray) (purpose of the second copy
// is not visible in this listing).
3078 m_splitData.
setRange(TASK_SIZE *
sizeof(SplitArray), &sArray,
sizeof(SplitArray));
// Fill the first sizeof(CudaBVHNode) bytes of the node buffer with UNBUILD_FLAG
// (presumably marking the root node as not yet built - confirm against the kernel).
3081 m_bvhData.clearRange32(0, UNBUILD_FLAG,
sizeof(CudaBVHNode));
// Raw copy of the scene bounds into the float3-based bbox (sizeof(float3) bytes each).
3084 memcpy(&bbox.m_mn, &m_bbox.min,
sizeof(float3));
3085 memcpy(&bbox.m_mx, &m_bbox.max,
sizeof(float3));
// Root task ("all"): covers all m_numTris triangles, starts unlocked with a very large
// best cost.
3091 all.triRight = m_numTris;
3092 all.triEnd = m_numTris;
3095 all.lock = LockType_Free;
3096 all.bestCost = 1e38f;
3098 #ifndef MALLOC_SCRATCHPAD
// Initial split axis is the major axis of the scene bbox diagonal.
3104 Vector3 size = m_bbox.Diagonal();
3105 all.axis = size.MajorAxis();
3107 all.terminatedBy = TerminatedBy_None;
3114 all.cached = LockType_None;
// The first task type depends on SCAN_TYPE / SPLIT_TYPE / BINNING_TYPE.
3118 all.type = TaskType_Sort_PPS1;
3119 #elif SCAN_TYPE == 1
3120 all.type = TaskType_Sort_PPS1_Up;
3121 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
3122 all.type = TaskType_Sort_SORT1;
3124 all.unfinished = warpSubtasks(m_numTris);
// Split position is the midpoint of the bbox along the chosen axis; the plane is stored
// as (axis unit normal, -pos).
3125 float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
3127 all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
3128 else if(all.axis == 1)
3129 all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
3131 all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
3132 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3133 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
3134 all.type = TaskType_InitMemory;
3135 all.unfinished = warpSubtasks(
sizeof(SplitArray)/
sizeof(
int));
3137 all.type = TaskType_BinTriangles;
// Ceil-divide the warp subtask count by BIN_MULTIPLIER.
3138 all.unfinished = (warpSubtasks(m_numTris)+BIN_MULTIPLIER-1)/BIN_MULTIPLIER;
3141 all.origSize = all.unfinished;
// Upload the root task descriptor at byte offset TASK_SIZE * sizeof(int) of m_taskData ...
3143 m_taskData.
setRange(TASK_SIZE *
sizeof(
int), &all,
sizeof(TaskBVH));
// ... and its unfinished counter into the first int of the same buffer.
3146 m_taskData.
setRange(0, &all.unfinished,
sizeof(
int));
3154 tasks.launchFlag = 0;
// Task stack setup: memset with -1 makes every int of the active list -1; entry 0 is then
// pointed at task 0 and activeTop set to 1.
3163 memset(tasks.active, -1,
sizeof(
int)*(ACTIVE_MAX+1));
3164 tasks.active[0] = 0;
3167 tasks.activeTop = 1;
3172 memset(tasks.empty, 0,
sizeof(
int)*(EMPTY_MAX+1));
3174 tasks.emptyBottom = 0;
3175 tasks.numSortedTris = 0;
3177 tasks.numLeaves = 0;
3178 tasks.numEmptyLeaves = 0;
// NOTE(review): the node capacity here divides by sizeof(CudaKdtreeNode) while the
// post-launch check below divides by sizeof(CudaBVHNode) - confirm both sizes match.
3180 tasks.sizeNodes = m_bvhData.
getSize()/
sizeof(CudaKdtreeNode);
3181 tasks.sizeTris = m_trisIndexOut.
getSize()/
sizeof(
S32);
3182 memset(tasks.leafHist, 0,
sizeof(tasks.leafHist));
// The warp counter starts at the number of rays and the unfinished counter at -NUM_WARPS
// (the build-only functions in this file start it at -1).
3189 tasks.warpCounter = rays.
getSize();
3190 tasks.unfinished = -NUM_WARPS;
3192 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
// Two alternative launch configurations (the selecting preprocessor lines are not visible):
// a single warp in a single block ...
3203 Vec2i blockSize(WARP_SIZE, 1);
3204 Vec2i gridSize(1, 1);
// ... or numWarpsPerBlock warps per block on NUM_SM * numBlocksPerSM blocks.
3209 Vec2i blockSize(WARP_SIZE, numWarpsPerBlock);
3210 int gridSizeX = NUM_SM*numBlocksPerSM;
3211 int numWarps = numWarpsPerBlock*gridSizeX;
3212 Vec2i gridSize(gridSizeX, 1);
// Only warns (with a terminal bell) when the compile-time NUM_WARPS disagrees with the launch.
3214 if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
3215 printf(
"\aNUM_WARPS constant does not match the launch parameters\n");
// One float4 of debug storage per launched warp (blockSize.y * gridSize.x).
3218 m_debug.
resizeDiscard(blockSize.y*gridSize.x*
sizeof(float4));
3224 float tKernel = m_module->launchKernelTimed(kernel, blockSize, gridSize);
// Read the task stack back from the module global "g_taskStackBVH".
3230 tasks = *(TaskStackBVH*)m_module->
getGlobal(
"g_taskStackBVH").
getPtr();
// True when work remains or any of the pool / node / triangle tops exceeds its capacity;
// the handling branch is not visible in this listing.
3231 if(tasks.unfinished != 0 || tasks.top > tasks.sizePool || tasks.nodeTop > m_bvhData.
getSize() /
sizeof(CudaBVHNode) || tasks.triTop > m_trisIndexOut.
getSize() /
sizeof(
S32))
3237 printPool(tasks, numWarps);
// Host-side driver for the on-demand kd-tree kernel: sets the subdivision threshold and
// depth limit, the trace inputs, dynamic memory, split data and the root build task,
// launches the kernel with timing and checks the task stack read back from the device.
// rays: ray buffer; its size seeds tasks.warpCounter.
// rebuild: use is not visible in this listing.
// NOTE(review): this listing is sparse - source lines are missing between the numbered
// lines - so the comments only describe statements that are actually visible here.
// Returns F32; the return statement is not visible (presumably the measured time - confirm).
3256 F32 CudaPersistentBVHTracer::traceOnDemandKdtreeRayBuffer(
RayBuffer& rays,
bool rebuild)
3261 fail(
"Build kernel not found!");
// Threshold = (scene bbox surface area / ray count) * (optCt / 10).
3270 cudaEnv.subdivThreshold = (m_bbox.SurfaceArea() / (float)m_numRays) * ((float)cudaEnv.optCt/10.0f);
// Depth limit is linear in log2(triangle count); k1 and k2 are defined on lines not
// visible here.
3273 cudaEnv.optMaxDepth = k1 *
log2((
F32)m_numTris) + k2;
3278 printf(
"Maximum depth = %d\n", cudaEnv.optMaxDepth);
3279 printf(
"Failure count = %d\n", cudaEnv.failureCount);
3285 inBVH.numTris = m_numTris;
3288 #ifndef INTERLEAVED_LAYOUT
3294 CUdeviceptr nodePtr = m_bvhData.
getCudaPtr();
// The index buffer is either m_trisIndexOut or m_bvhData itself, depending on the layout
// macros (the #else line is not visible).
3296 #ifndef INTERLEAVED_LAYOUT
3297 CUdeviceptr triPtr = m_trisCompactOut.
getCudaPtr();
3299 Buffer& indexBuf = m_trisIndexOut;
3303 Buffer& indexBuf = m_bvhData;
// Raw copy of the scene bounds into the trace input (sizeof(float3) bytes each).
3312 memcpy(&in.bmin, &m_bbox.min,
sizeof(float3));
3313 memcpy(&in.bmax, &m_bbox.max,
sizeof(float3));
3314 in.nodesA = nodePtr + nodeOfsA.x;
3315 in.trisA = triPtr + triOfsA.x;
3318 in.triIndices = indexBuf.getCudaPtr();
// Bind the node range to the texture reference "t_nodesI" as 4-component float data.
3322 m_module->
setTexRef(
"t_nodesI", nodePtr + nodeOfsA.x, nodeOfsA.y, CU_AD_FORMAT_FLOAT, 4);
// setDynamicMemory() returns the base offset later stored in the root task.
3328 int baseOffset = setDynamicMemory();
3332 m_splitData.
clearRange(0, 0,
sizeof(SplitInfoTri));
3333 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3334 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
// Reset value for one split: both children start with an inverted bbox
// (min = +FLT_MAX, max = -FLT_MAX) and a zero count.
3336 for(
int i = 0; i < 2; i++)
3338 split.children[i].bbox.m_mn = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
3339 split.children[i].bbox.m_mx = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
3340 split.children[i].cnt = 0;
// Replicate the reset split into every [warp][plane] slot of the split array.
3344 for(
int i = 0; i < NUM_WARPS; i++)
3346 for(
int j = 0; j < PLANE_COUNT; j++)
3347 sArray.splits[i][j] = split;
3365 m_splitData.
clearRange(0, 0,
sizeof(SplitInfoTri));
// Fill the first sizeof(CudaKdtreeNode) bytes of the node buffer with UNBUILD_FLAG
// (presumably marking the root node as not yet built - confirm against the kernel).
3377 m_bvhData.clearRange32(0, UNBUILD_FLAG,
sizeof(CudaKdtreeNode));
// Raw copy of the scene bounds into the float3-based bbox (sizeof(float3) bytes each).
3380 memcpy(&bbox.m_mn, &m_bbox.min,
sizeof(float3));
3381 memcpy(&bbox.m_mx, &m_bbox.max,
sizeof(float3));
// Root task ("all"): covers all m_numTris triangles, starts unlocked with a very large
// best cost and remembers the dynamic-memory base offset.
3388 all.triEnd = m_numTris;
3391 all.lock = LockType_Free;
3392 all.bestCost = 1e38f;
3394 all.dynamicMemory= baseOffset;
3395 #ifdef MALLOC_SCRATCHPAD
3396 all.subFailureCounter = 0;
// Initial split axis is the major axis of the scene bbox diagonal.
3401 Vector3 size = m_bbox.Diagonal();
3402 all.axis = size.MajorAxis();
3404 all.terminatedBy = TerminatedBy_None;
3411 all.cached = LockType_None;
// The first task type depends on SCAN_TYPE / SPLIT_TYPE / BINNING_TYPE.
3415 all.type = TaskType_Sort_PPS1;
3416 #elif SCAN_TYPE == 1
3417 all.type = TaskType_Sort_PPS1_Up;
3418 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
3419 all.type = TaskType_Sort_SORT1;
3421 all.unfinished = warpSubtasks(m_numTris);
// Split position is the midpoint of the bbox along the chosen axis; the plane is stored
// as (axis unit normal, -pos).
3422 float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
3424 all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
3425 else if(all.axis == 1)
3426 all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
3428 all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
3429 #elif SPLIT_TYPE == 1
3430 all.type = TaskType_Split;
// Only the #else branch below is compiled (#if 0 / #elif 0): six candidate planes per triangle.
3431 #if 0 // SQRT candidates
3432 int evaluatedCandidates = (int)sqrtf(m_numTris);
3433 int evaluatedCandidates = 1;
3434 int numPlanes = 0.5f * m_numTris/evaluatedCandidates;
3435 #elif 0 // Fixed candidates
3436 int numPlanes = 32768;
3437 #else // All candidates
3438 int numPlanes = m_numTris*6;
3440 all.unfinished = warpSubtasks(numPlanes);
3441 #elif SPLIT_TYPE == 2
3442 all.type = TaskType_Split;
3444 #elif SPLIT_TYPE == 3
3445 all.type = TaskType_SplitParallel;
// Work size: PLANE_COUNT times the warp subtasks for sqrt(#rays) plus sqrt(#tris) items.
3446 int evaluatedRays = warpSubtasks((
int)sqrtf(m_numRays));
3447 int evaluatedTris = warpSubtasks((
int)sqrtf(m_numTris));
3448 all.unfinished = PLANE_COUNT*(evaluatedRays+evaluatedTris);
3449 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3450 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
3451 all.type = TaskType_InitMemory;
3452 all.unfinished = warpSubtasks(
sizeof(SplitArray)/
sizeof(
int));
3454 all.type = TaskType_BinTriangles;
// Ceil-divide the warp subtask count by BIN_MULTIPLIER.
3455 all.unfinished = (warpSubtasks(m_numTris)+BIN_MULTIPLIER-1)/BIN_MULTIPLIER;
3458 all.origSize = all.unfinished;
// Upload the root task descriptor at byte offset TASK_SIZE * sizeof(int) of m_taskData ...
3460 m_taskData.
setRange(TASK_SIZE *
sizeof(
int), &all,
sizeof(TaskBVH));
// ... and its unfinished counter into the first int of the same buffer.
3463 m_taskData.
setRange(0, &all.unfinished,
sizeof(
int));
3471 tasks.launchFlag = 0;
3475 #ifndef INTERLEAVED_LAYOUT
// Here nodeTop starts at sizeof(CudaKdtreeNode), i.e. it is a byte offset past one node
// (which preprocessor branch this belongs to is not visible).
3478 tasks.nodeTop =
sizeof(CudaKdtreeNode);
// Task stack setup: memset with -1 makes every int of the active list -1; entry 0 is then
// pointed at task 0 and activeTop set to 1.
3484 memset(tasks.active, -1,
sizeof(
int)*(ACTIVE_MAX+1));
3485 tasks.active[0] = 0;
3488 tasks.activeTop = 1;
3493 memset(tasks.empty, 0,
sizeof(
int)*(EMPTY_MAX+1));
3495 tasks.emptyBottom = 0;
3496 tasks.numSortedTris = 0;
3498 tasks.numLeaves = 0;
3499 tasks.numEmptyLeaves = 0;
// Capacities expressed in elements: nodes of CudaKdtreeNode and S32 triangle indices.
3501 tasks.sizeNodes = m_bvhData.
getSize()/
sizeof(CudaKdtreeNode);
3502 tasks.sizeTris = m_trisIndexOut.
getSize()/
sizeof(
S32);
3503 memset(tasks.leafHist, 0,
sizeof(tasks.leafHist));
// The warp counter starts at the number of rays; the unfinished counter starts at
// -NUM_WARPS unless ONDEMAND_FULL_BUILD is defined, in which case it starts at -1.
3510 tasks.warpCounter = rays.
getSize();
3511 #ifndef ONDEMAND_FULL_BUILD
3512 tasks.unfinished = -NUM_WARPS;
3514 tasks.unfinished = -1;
// Two alternative launch configurations (the selecting preprocessor lines are not visible):
// a single warp in a single block ...
3522 Vec2i blockSize(WARP_SIZE, 1);
3523 Vec2i gridSize(1, 1);
// ... or numWarpsPerBlock warps per block on NUM_SM * numBlocksPerSM blocks.
3528 Vec2i blockSize(WARP_SIZE, numWarpsPerBlock);
3529 int gridSizeX = NUM_SM*numBlocksPerSM;
3530 int numWarps = numWarpsPerBlock*gridSizeX;
3531 Vec2i gridSize(gridSizeX, 1);
// Only warns (with a terminal bell) when the compile-time NUM_WARPS disagrees with the launch.
3533 if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
3534 printf(
"\aNUM_WARPS constant does not match the launch parameters\n");
// One float4 of debug storage per launched warp (blockSize.y * gridSize.x).
3537 m_debug.
resizeDiscard(blockSize.y*gridSize.x*
sizeof(float4));
// Kernel time is accumulated with +=, so more than one timed launch may contribute
// (other launches are not visible here).
3543 float tKernel = 0.f;
3544 #ifndef DUPLICATE_REFERENCES
3548 tKernel += m_module->launchKernelTimed(kernel, blockSize, gridSize);
// Read the task stack back from the module global "g_taskStackBVH".
3564 tasks = *(TaskStackBVH*)m_module->
getGlobal(
"g_taskStackBVH").
getPtr();
// Two variants of the post-launch check: the first compares nodeTop against a node count,
// the second compares it against the raw byte size of m_bvhData.
3565 #ifndef INTERLEAVED_LAYOUT
3566 if(tasks.unfinished != 0 || tasks.top > tasks.sizePool || tasks.nodeTop > m_bvhData.
getSize() /
sizeof(CudaKdtreeNode) || tasks.triTop > m_trisIndexOut.
getSize() /
sizeof(
S32))
3568 if(tasks.unfinished != 0 || tasks.nodeTop > m_bvhData.
getSize())
3575 printPool(tasks, numWarps);
// Iterates over every ray in rb on the CPU, printing a progress line every 10000 rays.
// rb: ray buffer to process; the per-ray work and the F32 return value are on lines not
// visible in this sparse listing.
3594 F32 CudaPersistentBVHTracer::traceCpuRayBuffer(
RayBuffer& rb)
3598 for(
int rid=0; rid < rb.
getSize(); rid++)
// Progress output for ray 0, 10000, 20000, ...
3600 if(rid % 10000 == 0)
printf(
"rid: %d\n",rid);
// CPU reference intersection of a single ray against every triangle (no acceleration
// structure is consulted in the visible lines).
// r: ray to test. result: receives the hit (result.id is set to the hit triangle index).
// anyHit: use is not visible in this sparse listing.
// NOTE(review): source lines are missing between the numbered lines, so the setup of
// den, nrmN, org0, crossProd, v00/v11/v22, tmin and hitT is not shown here.
3607 void CudaPersistentBVHTracer::traceCpuRay(
const Ray& r,
RayResult& result,
bool anyHit)
3610 const S32 *t_trisIndices = (
S32*)(m_trisIndex.
getPtr());
// triAddr advances by 3 per triangle and covers m_numTris triangles.
3625 for (
int triAddr = 0; triAddr < m_numTris * 3 ; triAddr += 3)
3637 const float deni = 1.0f / den;
3639 float t =
dot(nrmN,org0)*deni;
// Accept only distances strictly between tmin and the current hitT.
3641 if (t > tmin && t < hitT)
// v must lie in [0, 1], and u must be >= 0 with u + v <= 1.
3644 const float v =
dot(v00-v22,crossProd)*deni;
3645 if (v >= 0.0f && v <= 1.0f)
3647 const float u = -
dot(v00-v11,crossProd)*deni;
3648 if (u >= 0.0f && u + v <= 1.0f)
// Integer division by 3 (presumably converting a triAddr-style address into a triangle
// index - the assignment of hitIndex is not visible).
3660 hitIndex = hitIndex / 3;
3662 result.
id = hitIndex;
// Records buffer sizes in megabytes (bytes / (1024*1024)) into the m_size* / m_heap members.
// ads, aux: select what is recorded; how they are tested is on lines not visible in this
// sparse listing.
3668 void CudaPersistentBVHTracer::saveBufferSizes(
bool ads,
bool aux)
3670 float MB = (float)(1024*1024);
// Which triangle / index buffers are measured depends on COMPACT_LAYOUT.
3675 #ifndef COMPACT_LAYOUT
3677 m_sizeTriIdx = m_trisIndex.
getSize()/
MB;
3679 m_sizeTri = m_trisCompactOut.
getSize()/
MB;
3680 m_sizeTriIdx = m_trisIndexOut.
getSize()/
MB;
// With the scratchpad allocator and none of the custom allocators defined, the heap size
// is queried from the CUDA context limit CU_LIMIT_MALLOC_HEAP_SIZE.
3688 #ifdef MALLOC_SCRATCHPAD
3689 #if !defined(ATOMIC_MALLOC) && !defined(SCATTER_ALLOC) && !defined(CIRCULAR_MALLOC)
3691 cuCtxGetLimit(&heapSize, CU_LIMIT_MALLOC_HEAP_SIZE);
3692 m_heap = heapSize/
MB;
// Prepares the device-side dynamic memory for the selected allocator: either raises the
// CUDA malloc heap limit or (for the scatter allocator) launches its heap-init kernel.
// NOTE(review): this listing is sparse - source lines are missing between the numbered
// lines - so the comments only describe statements that are actually visible here.
3702 void CudaPersistentBVHTracer::prepareDynamicMemory()
3710 #if defined(SCATTER_ALLOC) || defined(FDG_ALLOC)
// Enforces a minimum of 8 MiB.
// NOTE(review): as written this line declares allocSize and reads it in its own
// initializer; if an earlier declaration exists on a line not shown here this is a
// redeclaration, otherwise it reads an uninitialized value - confirm against the full file.
3711 U64 allocSize =
max(allocSize, 8ULL*1024ULL*1024ULL);
3714 #if !defined(ATOMIC_MALLOC) && !defined(SCATTER_ALLOC) && !defined(CIRCULAR_MALLOC)
// Default allocator: set the size of the device malloc heap for this context.
3715 cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, allocSize);
3716 #elif defined(ATOMIC_MALLOC) || defined(CIRCULAR_MALLOC)
3718 #ifdef WITH_SCATTER_ALLOC
3721 #elif defined(SCATTER_ALLOC)
3725 #if defined(SCATTER_ALLOC) || defined(WITH_SCATTER_ALLOC)
// The kernel is looked up by its mangled name, which encodes
// GPUTools::initHeap<4096, 8, 16, 2, false, true>; parameters are two device pointers and an int.
3727 CUfunction initHeap = m_module->
getKernel(
"_ZN8GPUTools8initHeapILj4096ELj8ELj16ELj2ELb0ELb1EEEvPNS_10DeviceHeapIXT_EXT0_EXT1_EXT2_EXT3_EXT4_EEEPvj", 2*
sizeof(CUdeviceptr)+
sizeof(
int));
3729 fail(
"Scatter alloc initialization kernel not found!");
3733 #ifdef WITH_SCATTER_ALLOC
// NOTE(review): allocSize is a U64 passed through setParami (an int parameter per the
// sizeof(int) in the lookup above) - confirm it cannot exceed the int range.
3738 offset += m_module->
setParami(initHeap, offset, allocSize);
// Launched as a single block of 256 threads.
3739 F32 initTime = m_module->launchKernelTimed(initHeap,
Vec2i(256,1),
Vec2i(1, 1));
3741 printf(
"Scatter alloc initialized in %f\n", initTime);
// Initialises the dynamic-memory heap for the selected allocator and copies the triangle
// indices into it. Returns int; baseOffset is the value computed below (the return
// statement itself is not visible).
// NOTE(review): this listing is sparse - source lines are missing between the numbered
// lines (including most #else / #endif lines) - so the comments only describe statements
// that are actually visible here.
3745 int CudaPersistentBVHTracer::setDynamicMemory()
3748 #if !defined(ATOMIC_MALLOC) && !defined(CIRCULAR_MALLOC)
// Launch "allocFreeableMemory" with a 1x1 block on a 1x1 grid; its two int parameters
// are m_numTris and 0.
3749 CUfunction kernelAlloc = m_module->
getKernel(
"allocFreeableMemory", 2*
sizeof(
int));
3751 fail(
"Memory allocation kernel not found!");
3754 offset += m_module->
setParami(kernelAlloc, offset, m_numTris);
3755 offset += m_module->
setParami(kernelAlloc, offset, 0);
3756 F32 allocTime = m_module->launchKernelTimed(kernelAlloc,
Vec2i(1,1),
Vec2i(1, 1));
3759 printf(
"Memory allocated in %f\n", allocTime);
// Two alternative initial heap offsets in bytes: four ints or one int per triangle
// (the selecting preprocessor lines are not visible).
3767 heapOffset = 4*m_numTris*
sizeof(int);
3769 heapOffset = m_numTris*
sizeof(int);
3773 heapSize = m_mallocData.
getSize();
// Circular allocator: each chunk header is 2 ints, or 3 ints when DOUBLY_LINKED.
3775 #if defined(CIRCULAR_MALLOC)
3776 #ifndef DOUBLY_LINKED
3777 int headerSize = 2*
sizeof(int);
3779 int headerSize = 3*
sizeof(int);
3781 heapOffset += headerSize;
// First header: marked LockType_Set, linking to heapOffset (the doubly linked variant also
// stores heapSize-headerSize as its middle field).
3786 #ifndef DOUBLY_LINKED
3787 Vec2i first(LockType_Set, heapOffset);
3790 Vec3i first(LockType_Set, heapSize-headerSize, heapOffset);
3794 #ifdef GLOBAL_HEAP_LOCK
3795 #ifndef DOUBLY_LINKED
3796 Vec2i second(LockType_Free, heapSize-headerSize);
3799 Vec3i second(LockType_Free, 0, heapSize-headerSize);
// Variant with equally sized chunks: numChunks chunks of heapOffset bytes, each header
// linking to the next chunk (and the previous one when DOUBLY_LINKED).
3805 int numChunks = m_mallocData.
getSize()/heapOffset;
3806 for(
int i = 1; i < numChunks; i++)
3808 #ifndef DOUBLY_LINKED
3809 Vec2i next(0, (i+1)*heapOffset);
3812 Vec3i next(0, (i-1)*heapOffset, (i+1)*heapOffset);
// Variant with shrinking chunks: the step is heapOffset + headerSize rounded up to a
// multiple of 4 ((x + 3) & -4) ...
3819 int delta = ((int)(heapOffset)+headerSize+3) & -4;
3824 for(ofs = heapOffset;
true; ofs += delta, i++)
// ... and each following step is 0.8 times the previous one plus a header, again rounded
// up to a multiple of 4.
3828 delta = ((int)(delta * 0.8f)+headerSize+3) & -4;
// Condition for when the next chunk would reach the space reserved for the last two
// headers (the action taken is not visible).
3833 if(ofs+delta >= heapSize-2*headerSize)
3836 #ifndef DOUBLY_LINKED
3837 Vec2i next(LockType_Free, ofs+delta);
3840 Vec3i next(LockType_Free, prevOfs, ofs+delta);
// Last free chunk links to the tail header at heapSize - headerSize.
3848 #ifndef DOUBLY_LINKED
3849 Vec2i last(LockType_Free, heapSize-headerSize);
3852 Vec3i last(LockType_Free, prevOfs, heapSize-headerSize);
// Tail header at the very end of the heap: marked LockType_Set and linking back to
// offset 0.
3857 #ifndef DOUBLY_LINKED
3858 Vec2i tail(LockType_Set, 0);
3859 m_mallocData.
setRange(heapSize-headerSize, &tail,
sizeof(
Vec2i));
3861 Vec3i tail(LockType_Set, ofs, 0);
3862 m_mallocData.
setRange(heapSize-headerSize, &tail,
sizeof(
Vec3i));
// For this allocator the usable data starts right after the first header.
3866 baseOffset = headerSize;
3868 #ifdef WITH_SCATTER_ALLOC
// Launch "MemCpyIndex" over memSize ints (byte size of m_trisIndex / sizeof(int)):
// 256-thread blocks, ceil(memSize / 256) blocks.
3878 CUfunction kernelMemCpyIndex = m_module->
getKernel(
"MemCpyIndex",
sizeof(CUdeviceptr)+
sizeof(
int));
3879 if (!kernelMemCpyIndex)
3880 fail(
"Memory copy kernel not found!");
3882 int memSize = m_trisIndex.
getSize()/
sizeof(int);
3885 offset += m_module->
setParami(kernelMemCpyIndex, offset, memSize);
3886 F32 memcpyTime = m_module->launchKernelTimed(kernelMemCpyIndex,
Vec2i(256,1),
Vec2i((memSize-1+256)/256, 1));
3889 printf(
"Triangle indices copied in %f\n", memcpyTime);
// Scatter allocator: the base offset is the distance between heap and base (both defined
// on lines not visible here).
3892 #ifdef SCATTER_ALLOC
3895 baseOffset = heap - base;
3908 CUfunction kernelCreateWoop = m_module->
getKernel(
"createWoop", 2*
sizeof(CUdeviceptr)+
sizeof(
int));
3909 if (!kernelCreateWoop)
3910 fail(
"Regular triangle to Woop triangle conversion kernel not found!");
3915 offset += m_module->
setParami(kernelCreateWoop, offset, m_numTris);
3916 F32 woopTime = m_module->launchKernelTimed(kernelCreateWoop,
Vec2i(256,1),
Vec2i((m_numTris-1+256)/256, 1));
3919 printf(
"Woop triangles created in %f\n", woopTime);
3931 m_trisCompactOut.
reset();
3932 m_trisIndexOut.
reset();
3935 m_mallocData.
reset();
3936 m_mallocData2.
reset();
3938 m_splitData.
reset();
3940 m_raysIndex.
reset();
3943 m_ppsTrisIndex.
reset();
3946 m_ppsRaysIndex.
reset();
3953 saveBufferSizes(
false,
true);
3958 U32 nN, nL, eL, sT, bT, tT, sTr;
3959 getStats(nN, nL, eL, sT, bT, tT, sTr);
3960 #ifdef COMPACT_LAYOUT
3961 m_bvhData.
resize(nN *
sizeof(CudaBVHNode));
3962 m_trisCompactOut.
resize(tT*3*
sizeof(float4) + nL*
sizeof(float4));
3963 m_trisIndexOut.
resize(tT*3*
sizeof(
int) + nL*
sizeof(
int));
3965 m_bvhData.
resize((nN + nL) *
sizeof(CudaBVHNode));
3969 saveBufferSizes(
true,
false);
3975 saveBufferSizes(
false,
true);
3980 U32 nN, nL, eL, sT, nT, tT, sTr;
3981 getStats(nN, nL, eL, sT, nT, tT, sTr);
3982 #ifndef INTERLEAVED_LAYOUT
3983 #ifndef COMPACT_LAYOUT
3984 getStats(nN, nL, eL, sT, nT, tT, sTr,
false);
3985 m_bvhData.
resize((nN + nL) *
sizeof(CudaKdtreeNode));
3986 m_trisCompactOut.
resize(tT*3*
sizeof(float4));
3987 m_trisIndexOut.
resize(tT*3*
sizeof(
int));
3989 #ifdef DUPLICATE_REFERENCES
3990 m_bvhData.
resize(nN *
sizeof(CudaKdtreeNode));
3991 m_trisCompactOut.
resize(tT*3*
sizeof(float4) + nL*
sizeof(float4));
3992 m_trisIndexOut.
resize(tT*3*
sizeof(
int) + nL*
sizeof(
int));
3994 m_bvhData.
resize(nN *
sizeof(CudaKdtreeNode));
3995 m_trisIndexOut.
resize(tT*
sizeof(
int) + nL*
sizeof(
int));
4004 saveBufferSizes(
true,
false);
4009 TaskStackBVH tasks = *(TaskStackBVH*)m_module->
getGlobal(
"g_taskStackBVH").
getPtr();
4011 #ifndef INTERLEAVED_LAYOUT
4012 #ifndef BVH_COUNT_NODES
4013 #ifndef COMPACT_LAYOUT
4014 nodes = tasks.nodeTop / 2;
4015 leaves = tasks.nodeTop - nodes;
4017 nodes = tasks.nodeTop;
4018 leaves = tasks.triTop;
4021 #else // BVH_COUNT_NODES
4022 nodes = tasks.numNodes;
4023 leaves = tasks.numLeaves;
4024 emptyLeaves = tasks.numEmptyLeaves;
4025 #endif // BVH_COUNT_NODES
4027 #ifdef COMPACT_LAYOUT
4028 tris = tasks.triTop;
4030 tris -= (leaves-emptyLeaves);
4031 #ifdef DUPLICATE_REFERENCES
4041 tris = tasks.triTop;
4046 #ifndef BVH_COUNT_NODES
4047 nodes = tasks.nodeTop / 2;
4048 leaves = tasks.nodeTop - nodes;
4050 #else // BVH_COUNT_NODES
4051 nodes = tasks.numNodes;
4052 leaves = tasks.numLeaves;
4053 emptyLeaves = tasks.numEmptyLeaves;
4054 #endif // BVH_COUNT_NODES
4056 tris = tasks.nodeTop - (nodes+leaves)*
sizeof(CudaKdtreeNode);
4057 tris /= 3*
sizeof(float4)+
sizeof(
int);
4060 nodeTop = tasks.nodeTop;
4061 sortedTris = tasks.numSortedTris;
4062 stackTop = tasks.top;
4068 split = m_sizeSplit;
4071 triIdx = m_sizeTriIdx;
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int offset
void resetBuffers(bool resetADSBuffers)
S32 getSize() const
Gets size of the buffer (number of rays).
bool endsWith(const String &str) const
void setRange(S64 dstOfs, const void *src, S64 size, bool async=false, CUstream cudaStream=NULL)
Buffer & getTriVtxIndexBuffer(void)
Returns buffer of triangle's vertex indieces.
float GetFloat(const char *name, const bool isFatal=false) const
F32 traceBatch(RayBuffer &rays)
CudaModule * compile(bool enablePrints=true, bool autoFail=true)
int GetInt(const char *name, const bool isFatal=false) const
void getStats(U32 &nodes, U32 &leaves, U32 &emptyLeaves, U32 &stackTop, U32 &nodeTop, U32 &tris, U32 &sortedTris, bool sub=true)
Buffer & getVtxPosBuffer(void)
Returns vertex position buffer.
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N
CUdeviceptr getCudaPtr(S64 ofs=0)
S32 numTriangleTests
Total number of ray-triangle tests.
S32 numRays
Total number of rays.
F32 traceBatchBVH(RayBuffer &rays, RayStats *stats=NULL)
CudaKernel getKernel(const String &name)
FW_CUDA_FUNC Vec3f getXYZ(void) const
FW_CUDA_FUNC F32 sqrt(F32 a)
int setParami(CUfunction kernel, int offset, S32 value)
Structure holding ray statistics. Also provides print to the console. These statistics are used in a ...
const U8 * getPtr(S64 ofs=0)
void setOwner(Module module, bool modify, bool async=false, CUstream cudaStream=NULL, S64 validSize=-1)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx cuCtxSynchronize
FW_CUDA_FUNC T dot(const VectorBase< T, L, S > &a, const VectorBase< T, L, V > &b)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei 
GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei GLint GLenum GLenum const GLvoid pixels GLint GLsizei const GLfloat value GLint GLfloat GLfloat v1 GLint GLfloat GLfloat GLfloat v2 GLint GLsizei const GLfloat value GLint GLsizei GLboolean const GLfloat value GLuint program GLuint GLfloat x
Buffer & getRayBuffer()
Gets ray buffer.
bool getNeedClosestHit() const
Returns whether the closest hit is needed.
void setTexRef(const String &name, Buffer &buf, CUarray_format format, int numComponents)
void define(const String &key, const String &value="")
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei 
GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei GLint GLenum GLenum const GLvoid pixels GLint GLsizei const GLfloat value GLint GLfloat GLfloat v1 GLint GLfloat GLfloat GLfloat v2 GLint GLsizei const GLfloat value GLint GLsizei GLboolean const GLfloat value GLuint program GLuint GLfloat GLfloat GLfloat z
static int getComputeCapability(void)
void getBBox(Vec3f &lo, Vec3f &hi) const
Gets scene AABB's minimum and maximum vector.
FW_CUDA_FUNC T sum(const VectorBase< T, L, S > &v)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei 
GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei GLint GLenum GLenum const GLvoid pixels GLint GLsizei const GLfloat value GLint GLfloat GLfloat v1 GLint GLfloat GLfloat GLfloat v2 GLint GLsizei const GLfloat value GLint GLsizei GLboolean const GLfloat value GLuint program GLuint GLfloat GLfloat y
static Environment * GetSingleton()
CUdeviceptr getMutableCudaPtr(S64 ofs=0)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v
F32 traceOnDemandKdtree(RayBuffer &rays, bool rebuild, int numRays=0)
Buffer & getResultBuffer()
Gets ray result buffer.
F32 traceOnDemandBVH(RayBuffer &rays, bool rebuild, int numRays=0)
Ray buffer class. Stores rays.
U8 * getMutablePtr(S64 ofs=0)
FW_CUDA_FUNC T min(const VectorBase< T, L, S > &v)
FW_CUDA_FUNC T max(const VectorBase< T, L, S > &v)
int getNumTriangles(void) const
void getSizes(F32 &task, F32 &split, F32 &ads, F32 &tri, F32 &triIdx, F32 &heap)
String sprintf(const char *fmt,...)
bool GetFloatValue(const char *name, float &value, const bool isFatal=false) const
S32 numNodeTests
Total number of ray-node tests.
Class holding information about a split of a BVH node.
FW_CUDA_FUNC void normalize(T len=(T) 1)
Buffer & getGlobal(const String &name)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction f
FW_CUDA_FUNC F32 cross(const Vec2f &a, const Vec2f &b)
void clearRange(S64 dstOfs, int value, S64 size, bool async=false, CUstream cudaStream=NULL)
F32 traceBatchKdtree(RayBuffer &rays, RayStats *stats=NULL)
CudaPersistentBVHTracer(Scene &scene, F32 epsilon)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei n
void printf(const char *fmt,...)
void traceOnDemandTrace(RayBuffer &rays, F32 &GPUmegakernel, F32 &CPUmegakernel, F32 &GPUtravKernel, F32 &CPUtravKernel, int &buildNodes, RayStats *stats=NULL)
String & appendf(const char *fmt,...)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei 
width
FW_CUDA_FUNC F64 log(F64 a)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei 
GLsizei height
FW_CUDA_FUNC S normalized(T len=(T) 1) const
int setParamPtr(CUfunction kernel, int offset, CUdeviceptr value)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei 
GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei depth
bool GetIntValue(const char *name, int &value, const bool isFatal=false) const
F32 launchTimed(bool yield=true)
static void staticInit(void)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type
void fail(const char *fmt,...)
void addOptions(const String &options)
void resizeDiscard(S64 size)
void setSourceFile(const String &path)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr size
void setCachePath(const String &path)
void reset(U32 hints, int align)