4 #include "CudaNoStructTracer.hpp"
6 #include "../../../../AppEnvironment.h"
8 #define TASK_SIZE 430000
18 #include "kernels/thrustTest.hpp"
27 m_compiler.
addOptions(
"-use_fast_math -Xptxas -dlcm=cg");
30 m_compiler.
define(
"FERMI");
35 m_numTris = scene.triangles.size();
36 m_numVerts = m_numTris * 3;
38 m_numShadingNormals = m_numVerts;
39 m_numTextureCoords = m_numVerts;
69 for (
int i = 0; i < m_numTris; i++)
71 Triangle& tris = *scene.triangles[i];
72 for(
int j = 0; j < 3; j++)
75 snout[i*3+j] =
Vec3f(tris.normals[j].x,tris.normals[j].y,tris.normals[j].z);
78 *tcout =
Vec4f(tris.vertices[j].x,tris.vertices[j].y,tris.vertices[j].z,0);
92 m.type = MeshBase::Material::MaterialType_Phong;
94 m.gloss_alpha =
Vec2f(0.0
f, 0.
f);
97 unsigned int matid = 1;
101 for(
int i=0,j=0;i<m_numTris;i++,j+=3)
104 tout->vertices =
Vec3i(j,j+1,j+2);
105 Triangle& tris = *scene.triangles[i];
106 Vector3 normalVec = tris.GetNormal();
107 tout->normal =
Vec3f(normalVec.x,normalVec.y,normalVec.z);
108 *nout = tout->normal;
116 tout->materialColor = diffuseColor.
toABGR();
117 tout->shadedColor =
Vec4f( diffuseColor.
getXYZ() * (
dot(tout->normal, light) * 0.5f + 0.5f), 1.0
f).
toABGR();
118 tout->materialId = matid;
120 *scout = tout->shadedColor;
121 *mcout = tout->materialColor;
139 m_kernelFile =
"src/rt/kernels/persistent_nostruct.cu";
141 m_module = m_compiler.
compile();
149 m_numRays = rand.
getU32(1, 1000000);
150 m_numTris = rand.
getU32(1, 1000000);
161 for(
int i=0;i<m_numTris;i++)
168 int rnd = rand.
getS32(-1, 2);
186 for(
int i = 0; i < m_numRays; i++)
197 memcpy(&mRay.origin, &ray.origin,
sizeof(
Vec3f));
198 memcpy(&mRay.direction, &ray.direction,
sizeof(
Vec3f));
200 bool intersects = m_bbox.ComputeMinMaxT(mRay,
204 if (ray.tmin < 1e-3
f)
209 int rnd = rand.
getS32(-1, 3);
234 m_gpuTime = traceCudaRayBuffer(rays);
235 m_cpuTime = m_timer.
end();
247 #ifdef MALLOC_SCRATCHPAD
250 printf(
"Setting dynamic memory limit to %fMB\n", (
float)(m_trisIndex.
getSize()*5*3)/(
float)(1024*1024));
252 cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, m_trisIndex.
getSize()*5*3);
257 m_kernelFile =
"src/rt/kernels/persistent_bvh.cu";
259 m_kernelFile =
"src/rt/kernels/persistent_sbvh.cu";
262 m_module = m_compiler.
compile();
267 m_numTris = rand.
getU32(1, 1000000);
277 for(
int i=0;i<m_numTris;i++)
284 int rnd = rand.
getU32(0, 2);
302 S64 bvhSize = ((m_numTris/2 *
sizeof(CudaBVHNode)) + 4096 - 1) & -4096;
306 #ifdef COMPACT_LAYOUT
319 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
324 m_gpuTime = buildCudaBVH();
325 m_cpuTime = m_timer.
end();
345 #ifdef COMPACT_LAYOUT
346 #ifdef WOOP_TRIANGLES
347 String kernelName(
"src/rt/kernels/fermi_speculative_while_while");
349 String kernelName(
"src/rt/kernels/fermi_speculative_while_while_inter");
355 String kernelName(
"src/rt/kernels/fermi_persistent_speculative_while_while_inter");
357 m_compiler.
addOptions(
"-use_fast_math -maxrregcount 40");
362 kernelName +=
"_statistics";
367 m_module = m_compiler.
compile();
370 CUfunction queryKernel = m_module->
getKernel(
"queryConfig");
372 fail(
"Config query kernel not found!");
383 m_module->launchKernel(queryKernel, 1, 1);
388 kernel = m_module->
getKernel(
"trace_stats");
392 fail(
"Trace kernel not found!");
402 #ifdef COMPACT_LAYOUT
403 CUdeviceptr triPtr = m_trisCompactOut.
getCudaPtr();
405 Buffer& indexBuf = m_trisIndexOut;
407 CUdeviceptr triPtr = m_trisCompact.
getCudaPtr();
409 Buffer& indexBuf = m_trisIndex;
416 in.nodesA = nodePtr + nodeOfsA.x;
417 in.trisA = triPtr + triOfsA.x;
420 in.triIndices = indexBuf.getCudaPtr();
424 m_module->
setTexRef(
"t_nodesA", nodePtr + nodeOfsA.x, nodeOfsA.y, CU_AD_FORMAT_FLOAT, 4);
425 m_module->
setTexRef(
"t_trisA", triPtr + triOfsA.x, triOfsA.y, CU_AD_FORMAT_FLOAT, 4);
426 m_module->
setTexRef(
"t_triIndices", indexBuf, CU_AD_FORMAT_SIGNED_INT32, 1);
429 int desiredWarps = (rays.
getSize() + 31) / 32;
437 int blockWarps = (blockSize.x * blockSize.y + 31) / 32;
438 Vec2i gridSize((desiredWarps + blockWarps - 1) / blockWarps, 1);
451 F32 launchTime = m_module->launchKernelTimed(kernel, blockSize, gridSize);
457 stats->numEmptyLeavesVisited += *(
U32*)m_module->
getGlobal(
"g_NumEmptyLeaves").
getPtr();
459 stats->numFailedTriangleTests += *(
U32*)m_module->
getGlobal(
"g_NumFailedTris").
getPtr();
460 stats->numSuccessTriangleTestsOutside += *(
U32*)m_module->
getGlobal(
"g_NumHitTrisOutside").
getPtr();
464 m_gpuTime = launchTime;
465 m_cpuTime = m_timer.
end();
470 m_compiler.
addOptions(
"-use_fast_math -Xptxas -dlcm=cg");
479 m_kernelFile =
"src/rt/kernels/persistent_kdtree.cu";
481 m_module = m_compiler.
compile();
484 prepareDynamicMemory();
488 m_numTris = rand.
getU32(1, 1000000);
498 for(
int i=0;i<m_numTris;i++)
505 int rnd = rand.
getU32(0, 2);
530 #ifndef INTERLEAVED_LAYOUT
534 S64 kdtreeSize = ((m_numTris*20 *
sizeof(CudaKdtreeNode)) + 4096 - 1) & -4096;
538 #ifndef COMPACT_LAYOUT
542 #ifdef DUPLICATE_REFERENCES
554 S64 kdtreeSize = ((m_numTris*5 *
sizeof(CudaKdtreeNode) + m_numTris*10 * 3 * (
sizeof(
Vec4f)+
sizeof(
S32))) + 4096 - 1) & -4096;
558 m_bvhData.clearRange32(0, 0, kdtreeSize);
561 m_gpuTime = buildCudaKdtree();
562 m_cpuTime = m_timer.
end();
582 #ifdef COMPACT_LAYOUT
583 #ifdef WOOP_TRIANGLES
584 #ifdef DUPLICATE_REFERENCES
585 String kernelName(
"src/rt/kernels/fermi_kdtree_while_while_childPtr");
588 String kernelName(
"src/rt/kernels/fermi_kdtree_while_while_leafRef");
591 #error Undefined kernel
594 String kernelName(
"src/rt/kernels/fermi_kdtree_while_while");
603 kernelName +=
"_statistics";
608 m_module = m_compiler.
compile();
611 CUfunction queryKernel = m_module->
getKernel(
"queryConfig");
613 fail(
"Config query kernel not found!");
624 m_module->launchKernel(queryKernel, 1, 1);
629 kernel = m_module->
getKernel(
"trace_stats");
633 fail(
"Trace kernel not found!");
643 #ifndef INTERLEAVED_LAYOUT
644 CUdeviceptr triPtr = m_trisCompactOut.
getCudaPtr();
646 Buffer& indexBuf = m_trisIndexOut;
650 Buffer& indexBuf = m_bvhData;
657 memcpy(&in.bmin, &m_bbox.min,
sizeof(float3));
658 memcpy(&in.bmax, &m_bbox.max,
sizeof(float3));
659 in.nodesA = nodePtr + nodeOfsA.x;
660 in.trisA = triPtr + triOfsA.x;
663 in.triIndices = indexBuf.getCudaPtr();
667 m_module->
setTexRef(
"t_nodesI", nodePtr + nodeOfsA.x, nodeOfsA.y, CU_AD_FORMAT_FLOAT, 4);
668 m_module->
setTexRef(
"t_trisA", triPtr + triOfsA.x, triOfsA.y, CU_AD_FORMAT_FLOAT, 4);
669 m_module->
setTexRef(
"t_triIndices", indexBuf, CU_AD_FORMAT_SIGNED_INT32, 1);
672 int desiredWarps = (rays.
getSize() + 31) / 32;
680 int blockWarps = (blockSize.x * blockSize.y + 31) / 32;
681 Vec2i gridSize((desiredWarps + blockWarps - 1) / blockWarps, 1);
694 F32 launchTime = m_module->launchKernelTimed(kernel, blockSize, gridSize);
700 stats->numEmptyLeavesVisited += *(
U32*)m_module->
getGlobal(
"g_NumEmptyLeaves").
getPtr();
702 stats->numFailedTriangleTests += *(
U32*)m_module->
getGlobal(
"g_NumFailedTris").
getPtr();
703 stats->numSuccessTriangleTestsOutside += *(
U32*)m_module->
getGlobal(
"g_NumHitTrisOutside").
getPtr();
707 m_gpuTime = launchTime;
708 m_cpuTime = m_timer.
end();
713 m_compiler.
addOptions(
"-use_fast_math -Xptxas -dlcm=cg");
726 m_kernelFile =
"src/rt/kernels/persistent_ondemand.cu";
728 m_module = m_compiler.
compile();
733 for(
int i=0;i<m_numTris;i++)
750 S64 bvhSize = ((m_numTris *
sizeof(CudaBVHNode)) + 4096 - 1) & -4096;
755 #ifdef COMPACT_LAYOUT
760 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
767 m_gpuTime = traceOnDemandBVHRayBuffer(rays, rebuild);
768 m_cpuTime = m_timer.
end();
784 m_kernelFile =
"src/rt/kernels/persistent_ondemand_kdtree.cu";
786 m_module = m_compiler.
compile();
789 prepareDynamicMemory();
793 for(
int i=0;i<m_numTris;i++)
818 #ifndef INTERLEAVED_LAYOUT
819 S64 kdtreeSize = ((m_numTris *
sizeof(CudaKdtreeNode)) + 4096 - 1) & -4096;
823 #ifndef COMPACT_LAYOUT
827 #ifdef DUPLICATE_REFERENCES
837 S64 kdtreeSize = ((m_numTris*5 *
sizeof(CudaKdtreeNode) + m_numTris*10 * 3 * (
sizeof(
Vec4f)+
sizeof(
S32))) + 4096 - 1) & -4096;
841 m_bvhData.clearRange32(0, 0, kdtreeSize);
846 m_gpuTime = traceOnDemandKdtreeRayBuffer(rays, rebuild);
847 m_cpuTime = m_timer.
end();
860 m_module = m_compiler.
compile();
866 fail(
"Build kernel not found!");
868 F32 tTrace, tTraceCPU;
869 #ifndef ONDEMAND_FULL_BUILD
874 inBVH.numTris = m_numTris;
881 #ifdef COMPACT_LAYOUT
888 CUdeviceptr triPtr = m_trisCompact.
getCudaPtr();
889 Buffer& indexBuf = m_trisIndex;
897 in.nodesA = nodePtr + nodeOfsA.x;
898 in.trisA = triPtr + triOfsA.x;
904 m_module->
setTexRef(
"t_nodesA", m_bvhData, CU_AD_FORMAT_FLOAT, 4);
905 m_module->
setTexRef(
"t_trisA", m_trisCompact, CU_AD_FORMAT_FLOAT, 4);
906 m_module->
setTexRef(
"t_triIndices", m_trisIndex, CU_AD_FORMAT_SIGNED_INT32, 1);
911 Vec2i blockSize(WARP_SIZE, numWarpsPerBlock);
912 int gridSizeX = NUM_SM*numBlocksPerSM;
913 Vec2i gridSize(gridSizeX, 1);
921 oldNodes = tasks.numNodes;
933 tasks.warpCounter = rays.
getSize();
934 tasks.unfinished = -NUM_WARPS;
935 tasks.launchFlag = 1;
938 tTrace = m_module->launchKernelTimed(kernel, blockSize, gridSize);
939 tTraceCPU = m_timer.
end();
941 buildNodes += tasks.numNodes - oldNodes;
945 }
while(oldNodes != tasks.numNodes);
953 GPUmegakernel += tTrace;
954 CPUmegakernel += tTraceCPU;
978 if(m_kernelFile.
endsWith(
"kdtree.cu"))
986 m_module = m_compiler.
compile();
990 GPUtravKernel += tTrace;
991 CPUtravKernel += tTraceCPU;
996 float minTime = FLT_MAX;
1006 cout <<
"Prefix scan for problem size of " << width <<
"x" << height <<
" = " << width*height <<
"\n";
1009 cout <<
"Testing task pool" <<
"\n";
1011 cout <<
"Testing thrust" <<
"\n";
1014 for(
int i = 0; i < numRepeats; i++)
1017 float t = testSort(width*height);
1023 printf(
"Run %d sort in %fs\n", i, t);
1024 minTime =
min(t, minTime);
1028 printf(
"Minimum time from %d runs = %fs\n", numRepeats, minTime);
1029 printf(
"Average time from %d runs = %fs\n", numRepeats, sumTime/numRepeats);
// Refreshes derived tracing/build parameters from the current configuration.
// Only the assignments visible here are described; where cudaEnv and
// siblingLimit are obtained, and where cudaEnv is sent afterwards, is not
// visible in this part of the file.
1035 void CudaNoStructTracer::updateConstants()
// Cache the configured cut-off depth on the tracer; printPool() reads
// m_cutOffDepth when it gathers per-depth statistics.
1058 m_cutOffDepth = cudaEnv.optCutOffDepth;
// The sibling limit is stored divided by WARP_SIZE
// (presumably converting a thread count to a warp count — TODO confirm).
1075 cudaEnv.siblingLimit = siblingLimit / WARP_SIZE;
// Threshold = scene bounding-box surface area per ray, scaled by optCt/10.
// NOTE(review): divides by m_numRays — confirm it cannot be 0 when this runs.
1081 cudaEnv.subdivThreshold = (m_bbox.SurfaceArea() / (float)m_numRays) * ((float)cudaEnv.optCt/10.0f);
// Forward the tracer's epsilon to the environment block.
1083 cudaEnv.epsilon = m_epsilon;
1089 int CudaNoStructTracer::warpSubtasks(
int threads)
1092 return max((threads + WARP_SIZE - 1) / WARP_SIZE, 1);
1097 int CudaNoStructTracer::floatToOrderedInt(
float floatVal)
1099 int intVal = *((
int*)&floatVal);
1100 return (intVal >= 0) ? intVal : intVal ^ 0x7FFFFFFF;
// Prepares the buffer that receives profiling snapshots.
//
// snapData: buffer to clear (and, in the per-warp case, resize).
//
// With SNAPSHOT_POOL defined, SNAPSHOT_POOL PoolInfo records are cleared.
// With SNAPSHOT_WARP defined, the buffer is resized to hold SNAPSHOT_WARP
// WarpInfo records for each of NUM_WARPS warps and then cleared.
// printSnapshots() later tests `pool == 0` / `reads == 0` on these records,
// presumably as the "no more data" marker — so the clear value of 0 matters.
1112 void CudaNoStructTracer::allocateSnapshots(
Buffer &snapData)
// Pool-level snapshots. Any sizing of the buffer for this case is not
// visible here.
1115 #ifdef SNAPSHOT_POOL
1119 snapData.clearRange32(0, 0, SNAPSHOT_POOL*
sizeof(PoolInfo));
// Per-warp snapshots: one contiguous run of SNAPSHOT_WARP records per warp.
1121 #ifdef SNAPSHOT_WARP
1122 snapData.
resizeDiscard(
sizeof(WarpInfo)*SNAPSHOT_WARP*NUM_WARPS);
1125 snapData.clearRange32(0, 0, SNAPSHOT_WARP*NUM_WARPS*
sizeof(WarpInfo));
// Writes the profiling snapshots stored in snapData to tab-separated .dat
// files under the relative "plots" directory (Windows-style separators).
//
// snapData: buffer previously prepared by allocateSnapshots() and filled by
//           the kernel; read here through getPtr().
//
// SNAPSHOT_POOL: writes plots\pool\activity.dat (indexed by snapshot number)
//                and plots\pool\activity_clockCor.dat (indexed by normalized
//                end clock).
// SNAPSHOT_WARP: writes one plots\warps\warp<N>.dat file per warp.
//
// NOTE(review): ofstream does not create missing directories and no stream
// state check is visible — confirm the output directories exist.
1131 void CudaNoStructTracer::printSnapshots(
Buffer &snapData)
1133 #ifdef SNAPSHOT_POOL
1134 PoolInfo* snapshots = (PoolInfo*)snapData.
getPtr();
// A used last slot means the snapshot buffer filled up, so later samples may
// have been lost.
1136 if(snapshots[SNAPSHOT_POOL-1].pool != 0)
1137 printf(
"\aSnapshot memory full!\n");
// Clock range of the recorded data, used for normalization below.
1139 long long int clockMin = snapshots[0].clockStart;
1140 long long int clockMax = 0;
1141 for(
int i = 0; i < SNAPSHOT_POOL; i++)
// First record with pool == 0: take the end clock of the record before it.
// NOTE(review): if the very first record is empty this reads snapshots[-1];
// confirm a guard exists in the lines not visible here.
1143 if(snapshots[i].pool == 0)
1145 clockMax = snapshots[i-1].clockEnd;
// Raw dump: one row per snapshot, keyed by snapshot index.
1150 ofstream snapfile(
"plots\\pool\\activity.dat");
1151 snapfile <<
"Snap#\tpool\t#tasks\t#active\t#chunks\tdepth\tclocks" <<
"\n";
1152 for(
int i = 0; i < SNAPSHOT_POOL; i++)
1154 if(snapshots[i].pool == 0)
1157 snapfile << i <<
"\t" << snapshots[i].pool <<
"\t" << snapshots[i].tasks <<
"\t" << snapshots[i].active <<
"\t" << snapshots[i].chunks <<
"\t" << snapshots[i].depth
1158 <<
"\t" << snapshots[i].clockEnd - snapshots[i].clockStart <<
"\n";
// Second file: same columns, but the first column is the end clock mapped to
// the [clockMin, clockMax] range.
// NOTE(review): open() is called on the same stream object; whether it was
// closed first is not visible. Also confirm clockMax != clockMin, otherwise
// the normalization divides by zero.
1162 snapfile.open(
"plots\\pool\\activity_clockCor.dat");
1163 snapfile <<
"Snap#\tpool\t#tasks\t#active\t#chunks\tdepth\tclocks" <<
"\n";
1164 for(
int i = 0; i < SNAPSHOT_POOL; i++)
1166 if(snapshots[i].pool == 0)
1169 snapfile << (float)((
long double)(snapshots[i].clockEnd - clockMin) / (
long double)(clockMax - clockMin)) <<
"\t" << snapshots[i].pool <<
"\t" << snapshots[i].tasks <<
"\t"
1170 << snapshots[i].active <<
"\t" << snapshots[i].chunks <<
"\t" << snapshots[i].depth <<
"\t" << snapshots[i].clockEnd - snapshots[i].clockStart <<
"\n";
// Per-warp snapshots: `snapshots` points at the current warp's run of
// SNAPSHOT_WARP records and is advanced by SNAPSHOT_WARP at the bottom
// (presumably once per iteration of the warp loop).
1175 #ifdef SNAPSHOT_WARP
1176 WarpInfo* snapshots = (WarpInfo*)snapData.
getPtr();
1178 for(
int w = 0; w < NUM_WARPS; w++)
// A used last slot means this warp's snapshot run filled up.
1180 if(snapshots[SNAPSHOT_WARP-1].reads != 0)
1181 printf(
"\aSnapshot memory full for warp %d!\n", w);
1183 ostringstream filename;
// setw(3) pads the warp number to three characters; no fill character is set
// in the visible lines, so the padding would be spaces — TODO confirm.
1185 filename <<
"plots\\warps\\warp" << setw(3) << w <<
".dat";
1187 ofstream snapfile(filename.str());
1189 snapfile <<
"Snap#\t#reads\t#rays\t#tris\ttype(leaf=8)\t#chunks\tpopCount\tdepth\tcDequeue\tcCompute\tstackTop\ttaskIdx" <<
"\n";
1190 for(
int i = 0; i < SNAPSHOT_WARP; i++)
1192 if(snapshots[i].reads == 0)
// Sanity check: the clocks must be ordered search <= dequeue <= finished.
1195 if(snapshots[i].clockDequeue < snapshots[i].clockSearch || snapshots[i].clockFinished < snapshots[i].clockDequeue)
1196 cout <<
"Error timer for warp " << w <<
"\n";
// cDequeue = dequeue - search clocks, cCompute = finished - dequeue clocks.
1198 snapfile << i <<
"\t" << snapshots[i].reads <<
"\t" << snapshots[i].rays <<
"\t" << snapshots[i].tris <<
"\t" << snapshots[i].type <<
"\t"
1199 << snapshots[i].chunks <<
"\t" << snapshots[i].popCount <<
"\t" << snapshots[i].depth <<
"\t" << (snapshots[i].clockDequeue - snapshots[i].clockSearch) <<
"\t"
1200 << (snapshots[i].clockFinished - snapshots[i].clockDequeue) <<
"\t" << snapshots[i].stackTop <<
"\t" << snapshots[i].idx <<
"\n";
// Step to the next warp's records.
1204 snapshots += SNAPSHOT_WARP;
// Prepares the task pool and texture bindings before a kernel launch.
//
// numRays:    not referenced in the lines visible here.
// rayBuffer:  optional; when non-NULL it is bound to the "t_rays" texture.
// nodeBuffer: optional; when non-NULL it is bound to the "t_nodesA" texture.
//
// The triangle data (m_trisCompact) and triangle indices (m_trisIndex) are
// bound in the visible lines without a NULL check. Several preprocessor
// branches (PARALLELISM_TEST, MALLOC_SCRATCHPAD, SNAPSHOT_*) guard code that
// is not visible in this view.
1211 void CudaNoStructTracer::initPool(
int numRays,
Buffer* rayBuffer,
Buffer* nodeBuffer)
1215 #if PARALLELISM_TEST >= 0
1220 #ifndef MALLOC_SCRATCHPAD
1234 #if defined(SNAPSHOT_POOL) || defined(SNAPSHOT_WARP)
1237 allocateSnapshots(snapData);
// Mark task slots as empty. m_taskData holds TASK_SIZE int headers followed
// by the task records (other code reads the records at byte offset
// TASK_SIZE*sizeof(int)). The two calls below clear either the headers only
// or headers plus Task records — presumably alternatives selected by a
// preprocessor condition that is not visible here.
1244 m_taskData.clearRange32(0, TaskHeader_Empty,
TASK_SIZE *
sizeof(
int));
1246 m_taskData.clearRange32(0, TaskHeader_Empty,
TASK_SIZE * (
sizeof(
int)+
sizeof(Task)));
// Optional inputs: bind only what the caller supplied.
1261 if(rayBuffer !=
NULL)
1263 m_module->
setTexRef(
"t_rays", *rayBuffer, CU_AD_FORMAT_FLOAT, 4);
1265 if(nodeBuffer !=
NULL)
1267 m_module->
setTexRef(
"t_nodesA", *nodeBuffer, CU_AD_FORMAT_FLOAT, 4);
// Triangle data as 4-component float texels, indices as 1-component signed
// 32-bit integers.
1269 m_module->
setTexRef(
"t_trisA", m_trisCompact, CU_AD_FORMAT_FLOAT, 4);
1270 m_module->
setTexRef(
"t_triIndices", m_trisIndex, CU_AD_FORMAT_SIGNED_INT32, 1);
// Releases per-launch pool resources. The visible lines reset the
// m_ppsTrisIndex and m_ppsRaysIndex buffers; any other cleanup, and any use
// of numRays, is not visible in this view.
1283 void CudaNoStructTracer::deinitPool(
int numRays)
1286 m_ppsTrisIndex.
reset();
1292 m_ppsRaysIndex.
reset();
// Writes the part of the pool report that is shared by both printPool()
// overloads to the Debug stream.
//
// tasks:    pool bookkeeping; top, bottom, unfinished, sizePool, activeTop,
//           active[], emptyTop, emptyBottom and empty[] are printed.
// header:   per-task header array; entries are compared against
//           TaskHeader_Empty to produce the "EmptyItems" / "BellowEmpty"
//           counters.
// numWarps: number of per-warp float4 records read from m_debug.
// state:    extra caller-supplied text printed after the pool summary line.
//
// Per-warp records are summarized as min / max / sum / average of the
// absolute values of each component; a negative x component is tested
// separately (presumably counting "dead" warps — TODO confirm). When the
// snapshot macros are defined, printSnapshots() is also invoked.
// NOTE(review): the averages divide by numWarps and one ratio divides by
// m_numTris — confirm neither can be 0.
1299 void CudaNoStructTracer::printPoolHeader(TaskStackBase* tasks,
int* header,
int numWarps,
FW::String state)
1301 #if PARALLELISM_TEST >= 0
1303 printf(
"Active: %d\n", numActive);
1307 #if defined(SNAPSHOT_POOL) || defined(SNAPSHOT_WARP)
1308 printSnapshots(snapData);
1312 Debug <<
"\nPRINTING DEBUG_INFO STATISTICS" <<
"\n\n";
1314 Debug <<
"\nPRINTING STATISTICS" <<
"\n\n";
1317 float4* debugData = (float4*)m_debug.
getPtr();
1318 float minAll[4] = {MAX_FLOAT, MAX_FLOAT, MAX_FLOAT, MAX_FLOAT};
1319 float maxAll[4] = {0, 0, 0, 0};
1320 float sumAll[4] = {0, 0, 0, 0};
1322 Debug <<
"Warp No. cnt_task_queues Avg. #Reads Max #Reads #Restarts" <<
"\n";
1323 for(
int i = 0; i < numWarps; i++)
1325 Debug <<
"Warp " << i <<
": (" << debugData[i].x <<
", " << debugData[i].y <<
", " << debugData[i].z <<
", " << debugData[i].w <<
")" <<
"\n";
1328 minAll[0] =
min(fabs(debugData[i].
x), minAll[0]);
1329 minAll[1] =
min(fabs(debugData[i].
y), minAll[1]);
1330 minAll[2] =
min(fabs(debugData[i].
z), minAll[2]);
1331 minAll[3] =
min(fabs(debugData[i].w), minAll[3]);
1333 maxAll[0] =
max(fabs(debugData[i].
x), maxAll[0]);
1334 maxAll[1] =
max(fabs(debugData[i].
y), maxAll[1]);
1335 maxAll[2] =
max(fabs(debugData[i].
z), maxAll[2]);
1336 maxAll[3] =
max(fabs(debugData[i].w), maxAll[3]);
1338 sumAll[0] += fabs(debugData[i].
x);
1339 sumAll[1] += fabs(debugData[i].
y);
1340 sumAll[2] += fabs(debugData[i].
z);
1341 sumAll[3] += fabs(debugData[i].w);
1343 if(debugData[i].
x < 0)
1346 Debug <<
"Dead=" << countDead <<
" / All=" << numWarps <<
" = " << (float)countDead/(
float)numWarps << "\
n";
1347 Debug << "Min: " << minAll[0] << ", " << minAll[1] << ", " << minAll[2] << ", " << minAll[3] << "\n";
1348 Debug << "Max: " << maxAll[0] << ", " << maxAll[1] << ", " << maxAll[2] << ", " << maxAll[3] << "\n";
1349 Debug << "Sum: " << sumAll[0] << ", " << sumAll[1] << ", " << sumAll[2] << ", " << sumAll[3] << "\n";
1350 Debug << "Avg: " << sumAll[0]/numWarps << ", " << sumAll[1]/numWarps << ", " << sumAll[2]/numWarps << ", " << sumAll[3]/numWarps << "\n\n" << "\n";
1351 Debug << "cnt_task_queues per
object = " << sumAll[0]/(
float)m_numTris << "\n";
1353 Debug << "Pool" << "\n";
1354 Debug << "Top = " << tasks->top << "; Bottom = " << tasks->bottom << "; Unfinished = " << tasks->unfinished << "; Size = " << tasks->sizePool << "; ";
1355 Debug << state.getPtr() << "\n";
1356 Debug << "ActiveTop = " << tasks->activeTop << "; Active = ";
1357 for(
int i = 0; i < ACTIVE_MAX+1; i++)
1358 Debug << tasks->active[i] << " ";
1359 Debug << "\n" << "\n";
1360 Debug << "EmptyTop = " << tasks->emptyTop << "; EmptyBottom = " << tasks->emptyBottom << "\nEmpty\n";
1361 for(
int i = 0; i < EMPTY_MAX+1; i++)
1367 Debug << tasks->empty[i];
1372 Debug <<
"\n" <<
"\n";
1375 int bellowEmpty = 0;
1376 Debug <<
"Header" <<
"\n";
1383 if(header[i] != TaskHeader_Empty)
1389 Debug << TaskHeader_Active;
1394 if(header[i] < TaskHeader_Empty)
1398 Debug <<
"\n\nEmptyItems = " << emptyItems <<
"\n";
1399 Debug <<
"BellowEmpty = " << bellowEmpty <<
"\n";
// Dumps the BVH-build task pool to the Debug stream and prints summary
// statistics.
//
// tasks:    BVH task-stack bookkeeping (nodeTop, triTop, warpCounter and,
//           with BVH_COUNT_NODES, node/leaf/sorted-triangle counters).
// numWarps: forwarded to printPoolHeader().
//
// Task headers are read from the start of m_taskData and TaskBVH records from
// byte offset TASK_SIZE*sizeof(int). With LEAF_HISTOGRAM defined a leaf-size
// histogram is printed first. Statistics are reported for m_cutOffDepth, and
// a "TerminatedBy" histogram is kept per termination reason.
// "Incomplete result!" is printed when stackMax reaches TASK_SIZE-1.
1404 void CudaNoStructTracer::printPool(TaskStackBVH &tasks,
int numWarps)
1406 #ifdef LEAF_HISTOGRAM
1407 printf(
"Leaf histogram\n");
1408 unsigned int leafSum = 0;
1409 unsigned int triSum = 0;
1412 printf(
"%d: %d\n", i, tasks.leafHist[i]);
1413 leafSum += tasks.leafHist[i];
1414 triSum += i*tasks.leafHist[i];
1416 printf(
"Leafs total %d, average leaf %.2f\n", leafSum, (
float)triSum/(
float)leafSum);
1419 int* header = (
int*)m_taskData.
getPtr();
1420 FW::String state =
sprintf(
"BVH Top = %d; Tri Top = %d; Warp counter = %d; ", tasks.nodeTop, tasks.triTop, tasks.warpCounter);
1421 #ifdef BVH_COUNT_NODES
1422 state.
appendf(
"Number of inner nodes = %d; Number of leaves = %d; Sorted tris = %d; ", tasks.numNodes, tasks.numLeaves, tasks.numSortedTris);
1424 printPoolHeader(&tasks, header, numWarps, state);
1426 Debug <<
"\n\nTasks" <<
"\n";
1427 TaskBVH* task = (TaskBVH*)m_taskData.
getPtr(TASK_SIZE*
sizeof(
int));
1432 long double sumTris = 0;
1433 long double maxTris = 0;
1436 long double cntSortTris = 0;
1441 char terminatedNames[TerminatedBy_Max][255] = {
1442 "None",
"Depth",
"TotalLimit",
"OverheadLimit",
"Cost",
"FailureCounter"
1445 int terminatedBy[TerminatedBy_Max];
1446 memset(&terminatedBy,0,
sizeof(
int)*TerminatedBy_Max);
1451 if(task[i].nodeIdx != TaskHeader_Empty || task[i].parentIdx != TaskHeader_Empty)
1454 _ASSERT(task[i].terminatedBy >= 0 && task[i].terminatedBy < TerminatedBy_Max);
1455 terminatedBy[ task[i].terminatedBy ]++;
1458 Debug <<
"Task " << i <<
"\n";
1459 Debug <<
"Header: " << header[i] <<
"\n";
1460 Debug <<
"Unfinished: " << task[i].unfinished <<
"\n";
1461 Debug <<
"Type: " << task[i].type <<
"\n";
1462 Debug <<
"TriStart: " << task[i].triStart <<
"\n";
1463 Debug <<
"TriLeft: " << task[i].triLeft <<
"\n";
1464 Debug <<
"TriRight: " << task[i].triRight <<
"\n";
1465 Debug <<
"TriEnd: " << task[i].triEnd <<
"\n";
1466 Debug <<
"ParentIdx: " << task[i].parentIdx <<
"\n";
1467 Debug <<
"NodeIdx: " << task[i].nodeIdx <<
"\n";
1468 Debug <<
"TaskID: " << task[i].taskID <<
"\n";
1469 Debug <<
"Split: (" << task[i].splitPlane.x <<
", " << task[i].splitPlane.y <<
", " << task[i].splitPlane.z <<
", " << task[i].splitPlane.w <<
")\n";
1470 Debug <<
"Box: (" << task[i].bbox.m_mn.x <<
", " << task[i].bbox.m_mn.y <<
", " << task[i].bbox.m_mn.z <<
") - ("
1471 << task[i].bbox.m_mx.x <<
", " << task[i].bbox.m_mx.y <<
", " << task[i].bbox.m_mx.z <<
")\n";
1476 Debug <<
"Axis: " << task[i].axis <<
"\n";
1477 Debug <<
"Depth: " << task[i].depth <<
"\n";
1478 Debug <<
"Step: " << task[i].step <<
"\n";
1482 #ifdef MALLOC_SCRATCHPAD
1483 Debug <<
"SubFailure: " << task[i].subFailureCounter <<
"\n";
1485 Debug <<
"GMEMSync: " << task[i].sync <<
"\n";
1486 Debug <<
"Parent: " << task[i].parent <<
"\n";
1490 Debug <<
"TerminatedBy: " << task[i].terminatedBy <<
"\n";
1492 if(task[i].terminatedBy != TerminatedBy_None)
1493 Debug <<
"Triangles: " << task[i].triEnd - task[i].triStart <<
"\n";
1498 if(header[i] > (
int)0xFF800000)
1501 if(task[i].
depth == m_cutOffDepth)
1504 long double tris = task[i].triEnd - task[i].triStart;
1505 if(task[i].terminatedBy != TerminatedBy_None)
1515 cntSortTris += tris;
1521 maxDepth =
max(task[i].
depth, maxDepth);
1522 syncCount += task[i].sync;
1528 if(stackMax == TASK_SIZE-1)
1529 printf(
"\aIncomplete result!\n");
1531 Debug <<
"\n\nStatistics for cutoff depth " << m_cutOffDepth <<
"\n\n";
1537 Debug <<
"Avg naive task height (tris) = " << sumTris/(
long double)sortTasks <<
"\n";
1538 Debug <<
"Max naive task height (tris) = " << maxTris <<
", taskId: " << maxTaskId <<
"\n";
1539 Debug <<
"Cnt sorted operations = " << sortTasks <<
"\n";
1540 double cntTrisLog2Tris = (double(m_numTris) * (double)(logf(m_numTris)/logf(2.0
f)));
1541 Debug <<
"Cnt sorted triangles = " << cntSortTris <<
"\n";
1542 Debug <<
"Cnt sorted triangles/(N log N), N=#tris = " << cntSortTris/cntTrisLog2Tris <<
"\n";
1544 Debug <<
"Max task depth = " << maxDepth <<
"\n";
1545 Debug <<
"Cnt gmem synchronizations: " << syncCount <<
"\n";
1546 Debug <<
"Leafs failed to subdivide = " << subFailed <<
" (*3) => total useless tasks " << subFailed * 3 <<
"\n";
1547 Debug <<
"Terminated by:" <<
"\n";
1548 for(
int i = 0; i < TerminatedBy_Max; i++)
1550 Debug << terminatedNames[i] <<
": " << terminatedBy[i] <<
"\n";
1554 Debug <<
"max_queue_length = " << stackMax <<
"\n\n" <<
"\n";
// Dumps the ray/triangle task pool to the Debug stream and prints summary
// statistics.
//
// tasks:    task-stack bookkeeping, forwarded to printPoolHeader() with an
//           empty state string.
// numWarps: forwarded to printPoolHeader().
//
// Task headers are read from the start of m_taskData and Task records from
// byte offset TASK_SIZE*sizeof(int). For tasks at m_cutOffDepth the routine
// accumulates ray/triangle intersection counts, sorted ray/triangle counts
// and clipped rays, and keeps a "TerminatedBy" histogram.
// "Incomplete result!" is printed when stackMax reaches TASK_SIZE-1.
// NOTE(review): several reported ratios divide by m_numRays, m_numTris or
// isectTasks — confirm these cannot be 0 when the report is produced.
1559 void CudaNoStructTracer::printPool(TaskStack &tasks,
int numWarps)
1562 int* header = (
int*)m_taskData.
getPtr();
1563 printPoolHeader(&tasks, header, numWarps,
FW::sprintf(
""));
1565 Debug <<
"\n\nTasks" <<
"\n";
1566 Task* task = (Task*)m_taskData.
getPtr(TASK_SIZE*
sizeof(
int));
1573 long double sumRays = 0;
1574 long double maxRays = 0;
1575 long double sumTris = 0;
1576 long double maxTris = 0;
1579 long double cntIsect = 0;
1580 long double maxIsect = 0;
1581 long double clippedIsect = 0;
1584 long double cntSortRays = 0;
1585 long double cntClippedRays = 0;
1586 long double cntSortTris = 0;
1592 char terminatedNames[TerminatedBy_Max][255] = {
1593 "None",
"Depth",
"TotalLimit",
"OverheadLimit",
"Cost",
"FailureCounter"
1596 int terminatedBy[TerminatedBy_Max];
1597 memset(&terminatedBy,0,
sizeof(
int)*TerminatedBy_Max);
1602 if(task[i].depend1 != TaskHeader_Empty || task[i].depend2 != TaskHeader_Empty)
1605 _ASSERT(task[i].terminatedBy >= 0 && task[i].terminatedBy < TerminatedBy_Max);
1606 terminatedBy[ task[i].terminatedBy ]++;
1609 Debug <<
"Task " << i <<
"\n";
1610 Debug <<
"Header: " << header[i] <<
"\n";
1611 Debug <<
"Unfinished: " << task[i].unfinished <<
"\n";
1612 Debug <<
"Type: " << task[i].type <<
"\n";
1613 Debug <<
"RayStart: " << task[i].rayStart <<
"\n";
1614 Debug <<
"RayEnd: " << task[i].rayEnd <<
"\n";
1615 if(task[i].
type != TaskType_Intersect)
1617 Debug <<
"RayLeft: " << task[i].rayLeft <<
"\n";
1618 Debug <<
"RayRight: " << task[i].rayRight <<
"\n";
1619 Debug <<
"RayActive: " << task[i].rayActive <<
"\n";
1621 #ifdef CLIP_INTERSECT
1622 if(task[i].
type == TaskType_Intersect)
1623 Debug <<
"RayActive: " << task[i].rayActive <<
"\n";
1625 Debug <<
"TriStart: " << task[i].triStart <<
"\n";
1626 Debug <<
"TriEnd: " << task[i].triEnd <<
"\n";
1627 if(task[i].
type != TaskType_Intersect)
1630 Debug <<
"TriLeft: " << task[i].triLeft <<
"\n";
1631 Debug <<
"TriRight: " << task[i].triRight <<
"\n";
1633 Debug <<
"Depend1: " << task[i].depend1 <<
"\n";
1634 Debug <<
"Depend2: " << task[i].depend2 <<
"\n";
1635 if(task[i].
type != TaskType_Intersect)
1637 Debug <<
"Split: (" << task[i].splitPlane.x <<
", " << task[i].splitPlane.y <<
", " << task[i].splitPlane.z <<
", " << task[i].splitPlane.w <<
")\n";
1639 Debug <<
"Box: (" << task[i].bbox.m_mn.x <<
", " << task[i].bbox.m_mn.y <<
", " << task[i].bbox.m_mn.z <<
") - ("
1640 << task[i].bbox.m_mx.x <<
", " << task[i].bbox.m_mx.y <<
", " << task[i].bbox.m_mx.z <<
")\n";
1647 Debug <<
"Depth: " << task[i].depth <<
"\n";
1651 Debug <<
"SubFailure: " << task[i].subFailureCounter <<
"\n";
1652 Debug <<
"GMEMSync: " << task[i].sync <<
"\n";
1653 Debug <<
"TaskID: " << task[i].taskID <<
"\n";
1654 Debug <<
"Parent: " << task[i].parent <<
"\n";
1656 if(task[i].
type == TaskType_AABB_Max)
1657 #elif AABB_TYPE == 3
1658 if(task[i].
type == TaskType_AABB)
1661 Debug <<
"SubtaskIdx: " << task[i].subtaskIdx <<
"\n";
1662 Debug <<
"Clipped rays: " << task[i].rayEnd-task[i].rayActive <<
"\n";
1667 if(task[i].
depth == m_cutOffDepth)
1669 if(task[i].
type == TaskType_Intersect)
1671 #ifdef CLIP_INTERSECT
1672 long double locRays = task[i].rayActive - task[i].rayStart;
1674 long double locRays = task[i].rayEnd - task[i].rayStart;
1676 long double locTris = task[i].triEnd - task[i].triStart;
1677 Debug <<
"Intersections: " << locRays * locTris <<
"\n";
1680 if( locRays <
sqrt((
double)locTris) )
1682 if( locTris <
sqrt((
double)locRays) )
1686 Debug <<
"ClippedIntersections: " << task[i].clippedRays * locTris <<
"\n";
1687 clippedIsect += task[i].clippedRays * locTris;
1692 Debug <<
"Clock: " << task[i].clockEnd <<
"\n";
1695 Debug <<
"TerminatedBy: " << task[i].terminatedBy <<
"\n";
1702 if(task[i].
depth == m_cutOffDepth)
1706 #ifdef CLIP_INTERSECT
1707 long double rays = task[i].rayActive - task[i].rayStart;
1709 long double rays = task[i].rayEnd - task[i].rayStart;
1712 long double tris = task[i].triEnd - task[i].triStart;
1713 if(task[i].
type == TaskType_Intersect)
1716 cntIsect += rays*tris;
1717 maxIsect = max<long double>(rays*tris, maxIsect);
1718 if(maxIsect==(rays*tris)) maxTaskId = i;
1720 maxRays = max<long double>(rays, maxRays);
1722 maxTris = max<long double>(tris, maxTris);
1723 if(task[i].subFailureCounter > failureCount)
1727 if(task[i].
type == TaskType_AABB_Max)
1728 #elif AABB_TYPE == 3
1729 if(task[i].
type == TaskType_AABB)
1733 cntSortRays += rays;
1734 cntClippedRays += task[i].rayEnd-task[i].rayActive;
1735 cntSortTris += tris;
1742 maxDepth =
max(task[i].
depth, maxDepth);
1743 syncCount += task[i].sync;
1748 if(stackMax == TASK_SIZE-1)
1749 printf(
"\aIncomplete result!\n");
1751 Debug <<
"\n\nStatistics for cutoff depth " << m_cutOffDepth <<
"\n\n";
1757 Debug <<
"ray_obj_intersections per ray = " << cntIsect/m_numRays <<
"\n";
1758 Debug <<
"cnt_leaves = " << isectTasks <<
"\n";
1759 Debug <<
"cnt_leaves per obj = " << (float)isectTasks/(
float)m_numTris << "\n";
1760 Debug << "ray_obj_intersections = " << cntIsect << "\n";
1761 Debug << "Useless ray_obj_intersections = " << clippedIsect << "\n";
1762 Debug << "Avg ray_obj_intersections per leaf = " << cntIsect/(
long double)isectTasks << "\n";
1763 Debug << "Max ray_obj_intersections per leaf = " << maxIsect << ", taskId: " << maxTaskId << "\n";
1764 Debug << "reduction [%] = " << 100.0
f * (cntIsect/((
long double)m_numRays*(
long double)m_numTris)) << "\n";
1765 Debug << "Avg naive task
width (rays) = " << sumRays/(
long double)isectTasks << "\n";
1766 Debug << "Max naive task
width (rays) = " << maxRays << "\n";
1767 Debug << "Avg naive task
height (tris) = " << sumTris/(
long double)isectTasks << "\n";
1768 Debug << "Max naive task
height (tris) = " << maxTris << "\n";
1769 Debug << "Cnt sorted operations = " << sortTasks << "\n";
1770 double cntTrisLog2Tris = (
double(m_numTris) * (
double)(logf(m_numTris)/logf(2.0
f)));
1771 double cntRaysLog2Tris = (
double(m_numRays) * (
double)(logf(m_numTris)/logf(2.0f)));
1772 Debug << "Cnt sorted triangles = " << cntSortTris << "\n";
1773 Debug << "Cnt sorted triangles/(
N log N), N=
#tris = " << cntSortTris/cntTrisLog2Tris << "\n";
1774 Debug <<
"Cnt sorted rays = " << cntSortRays <<
" BEFORE CLIPPING\n";
1775 Debug <<
"Cnt sorted rays/(log N)/R, N=#tris,R=#rays = " << cntSortRays/cntRaysLog2Tris <<
" BEFORE CLIPPING\n";
1776 Debug <<
"Cnt clipped rays = " << cntClippedRays <<
"\n";
1778 Debug <<
"Max task depth = " << maxDepth <<
"\n";
1779 Debug <<
"Cnt gmem synchronizations: " << syncCount <<
"\n";
1780 Debug <<
"Ray issues = " << rayIssues <<
", tris issues = " << triIssues <<
"\n";
1781 Debug <<
"Leafs failed to subdivide = " << subFailed <<
" (*3) => total useless tasks " << subFailed * 3 <<
"\n";
1783 Debug <<
"Terminated by:" <<
"\n";
1784 for(
int i = 0; i < TerminatedBy_Max; i++)
1786 Debug << terminatedNames[i] <<
": " << terminatedBy[i] <<
"\n";
1790 Debug <<
"max_queue_length = " << stackMax <<
"\n\n" <<
"\n";
// Traces a ray buffer on the GPU with the task-pool kernel (no prebuilt
// acceleration structure).
//
// rb: ray buffer to trace; how it is bound is not visible in this view.
// Returns an F32 — presumably the measured kernel time, since the caller
// stores the result in m_gpuTime; the return statement itself is not visible.
//
// Visible steps: fill the kernel input with m_numRays / m_numTris; build a
// root task `all` spanning every ray and triangle, bounded by m_bbox and
// split on its major axis; store it at byte offset TASK_SIZE*sizeof(int) in
// m_taskData and its `unfinished` counter at offset 0; initialize the task
// stack; launch the kernel timed. Other visible lines launch a "__naive"
// kernel and compare prefix sums against the CPU, presumably under
// verification macros, and printPool() is called for reporting.
// A warning is printed when the launch geometry disagrees with NUM_WARPS.
1795 F32 CudaNoStructTracer::traceCudaRayBuffer(
RayBuffer& rb)
1801 fail(
"Trace kernel not found!");
1809 in.numRays = m_numRays;
1810 in.numTris = m_numTris;
1826 kernel = m_module->
getKernel(
"__naive");
1828 fail(
"Trace kernel not found!");
1830 Vec2i blockSizeN(1024, 1);
1831 Vec2i gridSizeN((m_numRays+1023)/1024, 1);
1833 float tNaive = m_module->launchKernelTimed(kernel, blockSizeN, gridSizeN);
1835 printf(
"Verifying GPU trace\n");
1854 memcpy(&bbox.m_mn, &m_bbox.min,
sizeof(float3));
1855 memcpy(&bbox.m_mx, &m_bbox.max,
sizeof(float3));
1861 all.rayRight = m_numRays;
1862 all.rayEnd = m_numRays;
1865 all.triRight = m_numTris;
1866 all.triEnd = m_numTris;
1869 all.depend1 = DependType_Root;
1870 all.depend2 = DependType_None;
1871 all.lock = LockType_Free;
1872 all.bestCost = 1e38f;
1874 all.subFailureCounter = 0;
1875 Vector3
size = m_bbox.Diagonal();
1876 all.axis = size.MajorAxis();
1877 all.terminatedBy = TerminatedBy_None;
1882 all.clippedRays = 0;
1889 all.type = TaskType_Sort_PPS1;
1890 #elif SCAN_TYPE == 1
1891 all.type = TaskType_Sort_PPS1_Up;
1892 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
1893 all.type = TaskType_Sort_SORT1;
1896 all.unfinished = warpSubtasks(m_numRays) + warpSubtasks(m_numTris);
1897 all.bestOrder = warpSubtasks(m_numRays);
1898 float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
1900 all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
1901 else if(all.axis == 1)
1902 all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
1904 all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
1906 all.type = TaskType_Split;
1908 int evaluatedCandidates = (int)sqrtf(m_numRays) + (int)sqrtf(m_numTris);
1909 int numPlanes = 0.5f * (m_numRays + m_numTris)/evaluatedCandidates;
1910 all.unfinished = warpSubtasks(numPlanes);
1911 #elif SPLIT_TYPE == 2
1913 #elif SPLIT_TYPE == 3
1914 all.type = TaskType_SplitParallel;
1915 int evaluatedRays = warpSubtasks((
int)sqrtf(m_numRays));
1916 int evaluatedTris = warpSubtasks((
int)sqrtf(m_numTris));
1917 all.unfinished = PLANE_COUNT*(evaluatedRays+evaluatedTris);
1922 all.type = TaskType_Sort_PPS1_Up;
1923 int pRays = warpSubtasks(m_numRays);
1924 all.bestOrder = pRays;
1925 int pTris = warpSubtasks(m_numTris);
1926 all.unfinished = pRays+pTris;
1929 all.origSize = all.unfinished;
1931 m_taskData.
setRange(TASK_SIZE *
sizeof(
int), &all,
sizeof(Task));
1934 m_taskData.
setRange(0, &all.unfinished,
sizeof(
int));
1943 memset(tasks.active, -1,
sizeof(
int)*(ACTIVE_MAX+1));
1944 tasks.active[0] = 0;
1947 tasks.activeTop = 1;
1952 memset(tasks.empty, 0,
sizeof(
int)*(EMPTY_MAX+1));
1954 tasks.emptyBottom = 0;
1955 tasks.unfinished = -1;
1957 tasks.sizeNodes = m_bvhData.
getSize()/
sizeof(CudaKdtreeNode);
1958 tasks.sizeTris = m_trisIndexOut.
getSize()/
sizeof(
S32);
1962 Vec2i blockSize(WARP_SIZE, 1);
1963 Vec2i gridSize(1, 1);
1968 Vec2i blockSize(WARP_SIZE, numWarpsPerBlock);
1969 int gridSizeX = NUM_SM*numBlocksPerSM;
1970 int numWarps = numWarpsPerBlock*gridSizeX;
1971 Vec2i gridSize(gridSizeX, 1);
1973 if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
1974 printf(
"\aNUM_WARPS constant does not match the launch parameters\n");
1977 m_debug.
resizeDiscard(blockSize.y*gridSize.x*
sizeof(float4));
1982 float tKernel = m_module->launchKernelTimed(kernel, blockSize, gridSize);
1997 for(
int i=0;i<m_numTris;i++)
2002 cout <<
"PPS error for item " << i <<
", CPU=" << sum <<
", GPU=" << *ptout <<
" for " << m_numTris <<
" triangles!" <<
"\n";
2008 if(*stout < -1 || *stout > 1)
2010 cout <<
"\nWTF " << i <<
" of " << m_numTris <<
": " << *stout <<
"!\n" <<
"\n";
2019 for(
int i=0;i<m_numRays;i++)
2024 cout <<
"PPS error for item " << i <<
", CPU=" << sum <<
", GPU=" << *prout <<
" for " << m_numRays <<
" rays!" <<
"\n";
2030 if(*srout < -1 || *srout > 2)
2032 cout <<
"\nWTF " << i <<
" of " << m_numRays <<
": " << *srout <<
"!\n" <<
"\n";
2041 cout <<
"PPS correct for " << m_numTris <<
" triangles and " << m_numRays <<
" rays!" <<
"\n";
2086 printPool(tasks, numWarps);
// Host-side setup and launch of the persistent GPU BVH build kernel.
// Prepares the split scratch data, the root task `all` and the task stack `tasks`,
// launches the kernel with timing, and then inspects the resulting task stack.
// NOTE(review): this listing omits many original lines (braces, conditions,
// #else/#endif), so the comments below describe only the statements that are visible.
2107 F32 CudaNoStructTracer::buildCudaBVH()
// Failure path for a missing build kernel (the guarding condition is not shown here).
2112 fail(
"Build kernel not found!");
2114 #ifdef MALLOC_SCRATCHPAD
2116 in.numTris = m_numTris;
2119 #ifdef COMPACT_LAYOUT
2128 #ifndef MALLOC_SCRATCHPAD
2131 in.numTris = m_numTris;
2138 #ifdef COMPACT_LAYOUT
// Fetch the "allocFreeableMemory" kernel (parameter block of two ints), pass it
// m_numTris and 0, and run it as a single thread (1x1 block, 1x1 grid).
2143 CUfunction kernelAlloc = m_module->
getKernel(
"allocFreeableMemory", 2*
sizeof(
int));
2145 fail(
"Memory allocation kernel not found!");
2148 offset += m_module->
setParami(kernelAlloc, offset, m_numTris);
2149 offset += m_module->
setParami(kernelAlloc, offset, 0);
2150 F32 allocTime = m_module->launchKernelTimed(kernelAlloc,
Vec2i(1,1),
Vec2i(1, 1));
2153 printf(
"Memory allocated in %f\n", allocTime);
// Fetch the "MemCpyIndex" kernel (one device pointer plus one int parameter).
2156 CUfunction kernelMemCpyIndex = m_module->
getKernel(
"MemCpyIndex",
sizeof(CUdeviceptr)+
sizeof(
int));
2157 if (!kernelMemCpyIndex)
2158 fail(
"Memory copy kernel not found!");
// memSize is the number of ints held in m_trisIndex.
2160 int memSize = m_trisIndex.
getSize()/
sizeof(int);
2163 offset += m_module->
setParami(kernelMemCpyIndex, offset, memSize);
// 256-thread blocks; the grid is the ceiling of memSize/256 so every index is covered.
2164 F32 memcpyTime = m_module->launchKernelTimed(kernelMemCpyIndex,
Vec2i(256,1),
Vec2i((memSize-1+256)/256, 1));
2167 printf(
"Triangle indices copied in %f\n", memcpyTime);
// Binning split types (4..6): build a template split whose two children have an
// inverted bounding box (min = +FLT_MAX, max = -FLT_MAX) and a zero count.
2172 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2173 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
2175 for(
int i = 0; i < 2; i++)
2177 split.children[i].bbox.m_mn = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
2178 split.children[i].bbox.m_mx = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
2179 split.children[i].cnt = 0;
// Replicate the template into every [warp][plane] slot of the split array.
2183 for(
int i = 0; i < NUM_WARPS; i++)
2185 for(
int j = 0; j < PLANE_COUNT; j++)
2186 sArray.splits[i][j] = split;
// Alternative variant (its selecting directive is not shown): bounds are stored as
// ints produced by floatToOrderedInt, and the split array is indexed by plane only.
2190 for(
int i = 0; i < 2; i++)
2194 split.children[i].bbox.m_mn = make_int3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
2195 split.children[i].bbox.m_mx = make_int3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
2196 split.children[i].cnt = 0;
2200 for(
int j = 0; j < PLANE_COUNT; j++)
2201 sArray.splits[j] = split;
// Write the initialised split array to m_splitData at offset 0 ...
2203 m_splitData.
setRange(0, &sArray,
sizeof(SplitArray));
// ... and at byte offset TASK_SIZE * sizeof(SplitArray).
// NOTE(review): presumably a pristine copy stored after the per-task entries - confirm.
2206 m_splitData.
setRange(TASK_SIZE *
sizeof(SplitArray), &sArray,
sizeof(SplitArray));
// Copy the bounds m_bbox.min / m_bbox.max into the local `bbox` as raw float3 data.
2210 memcpy(&bbox.m_mn, &m_bbox.min,
sizeof(float3));
2211 memcpy(&bbox.m_mx, &m_bbox.max,
sizeof(float3));
// Root task `all`: triangle range ends at m_numTris, lock is free, and the best cost
// starts at a very large value (1e38f).
2217 #ifndef MALLOC_SCRATCHPAD
2218 all.triRight = m_numTris;
2222 all.triEnd = m_numTris;
2225 all.lock = LockType_Free;
2226 all.bestCost = 1e38f;
2228 all.dynamicMemory= 0;
2229 #ifndef MALLOC_SCRATCHPAD
// The split axis is the major axis of the bounding-box diagonal.
2235 Vector3 size = m_bbox.Diagonal();
2236 all.axis = size.MajorAxis();
2237 all.terminatedBy = TerminatedBy_None;
// The first task type depends on SCAN_TYPE (the opening #if is not shown).
2247 all.type = TaskType_Sort_PPS1;
2248 #elif SCAN_TYPE == 1
2249 all.type = TaskType_Sort_PPS1_Up;
2250 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
2251 all.type = TaskType_Sort_SORT1;
2253 all.unfinished = warpSubtasks(m_numTris);
// Split position is the midpoint of m_bbox along the chosen axis; the plane is stored
// as (axis unit vector, -pos).
2254 float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
2256 all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
2257 else if(all.axis == 1)
2258 all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
2260 all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
// Binning variants start with either a memory-initialisation task sized by the number
// of ints in a SplitArray, or a triangle-binning task sized by the warp subtasks of
// m_numTris divided (rounding up) by BIN_MULTIPLIER.
2261 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2262 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
2263 all.type = TaskType_InitMemory;
2264 all.unfinished = warpSubtasks(
sizeof(SplitArray)/
sizeof(
int));
2266 all.type = TaskType_BinTriangles;
2267 all.unfinished = (warpSubtasks(m_numTris)+BIN_MULTIPLIER-1)/BIN_MULTIPLIER;
// Remember the initial amount of work for this task.
2272 all.origSize = all.unfinished;
// Store the root task in m_taskData at byte offset TASK_SIZE * sizeof(int) ...
2274 m_taskData.
setRange(TASK_SIZE *
sizeof(
int), &all,
sizeof(TaskBVH));
// ... and its unfinished counter at offset 0.
// NOTE(review): presumably m_taskData holds TASK_SIZE int headers followed by the
// task records - confirm against the buffer allocation.
2277 m_taskData.
setRange(0, &all.unfinished,
sizeof(
int));
// Active list: every slot is -1 except slot 0, which refers to task 0; one entry is active.
2288 memset(tasks.active, -1,
sizeof(
int)*(ACTIVE_MAX+1));
2289 tasks.active[0] = 0;
2292 tasks.activeTop = 1;
// Empty list is zero-filled and starts at the bottom.
2297 memset(tasks.empty, 0,
sizeof(
int)*(EMPTY_MAX+1));
2299 tasks.emptyBottom = 0;
2300 tasks.unfinished = -1;
// Reset the build statistics counters.
2301 tasks.numSortedTris = 0;
2303 tasks.numLeaves = 0;
2304 tasks.numEmptyLeaves = 0;
// Output capacities expressed in elements.
// NOTE(review): sizeNodes divides by sizeof(CudaKdtreeNode), while the post-launch
// check in this function divides by sizeof(CudaBVHNode) - confirm both sizes match.
2306 tasks.sizeNodes = m_bvhData.
getSize()/
sizeof(CudaKdtreeNode);
2307 tasks.sizeTris = m_trisIndexOut.
getSize()/
sizeof(
S32);
2308 memset(tasks.leafHist, 0,
sizeof(tasks.leafHist));
2310 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
// Two launch configurations are visible (the selecting directive is not shown):
// a single block of one warp, or NUM_SM*numBlocksPerSM blocks of numWarpsPerBlock warps.
2321 Vec2i blockSize(WARP_SIZE, 1);
2322 Vec2i gridSize(1, 1);
2327 Vec2i blockSize(WARP_SIZE, numWarpsPerBlock);
2328 int gridSizeX = NUM_SM*numBlocksPerSM;
2329 int numWarps = numWarpsPerBlock*gridSizeX;
2330 Vec2i gridSize(gridSizeX, 1);
// Warn (with a terminal bell) when the compile-time NUM_WARPS disagrees with the launch.
2332 if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
2333 printf(
"\aNUM_WARPS constant does not match the launch parameters\n");
// Debug buffer: one float4 for each of the blockSize.y * gridSize.x launched warps.
2336 m_debug.
resizeDiscard(blockSize.y*gridSize.x*
sizeof(float4));
// Launch the build kernel and record the time reported by launchKernelTimed.
2341 float tKernel = m_module->launchKernelTimed(kernel, blockSize, gridSize);
// Diagnostic pass over all triangles: reports prefix-sum mismatches between the CPU and
// GPU values and flags values other than 0 or 1 (the surrounding conditions are only
// partly shown).
2363 for(
int i=0;i<m_numTris;i++)
2368 cout <<
"PPS error for item " << i <<
", CPU=" << sum <<
", GPU=" << *pout <<
" for " << m_numTris <<
" triangles!" <<
"\n";
2374 if(*sout != 0 && *sout != 1)
2376 cout <<
"\nWTF " << i <<
" of " << m_numTris <<
": " << *sout <<
"!\n" <<
"\n";
2385 cout <<
"PPS correct for " << m_numTris <<
" triangles!" <<
"\n";
// Read the task stack back from the module global "g_taskStackBVH".
2389 tasks = *(TaskStackBVH*)m_module->
getGlobal(
"g_taskStackBVH").
getPtr();
// Error condition: work left unfinished, or the task pool / node buffer / triangle
// index buffer tops exceed their capacities.
2390 if(tasks.unfinished != 0 || tasks.top > tasks.sizePool || tasks.nodeTop > m_bvhData.
getSize() /
sizeof(CudaBVHNode) || tasks.triTop > m_trisIndexOut.
getSize() /
sizeof(
S32))
2397 printPool(tasks, numWarps);
// Host-side setup and launch of the persistent GPU kd-tree build kernel.
// Computes the depth/failure limits, prepares dynamic memory, the split scratch data,
// the root task `all` and the task stack `tasks`, then launches the kernel and inspects
// the resulting task stack.
// NOTE(review): this listing omits many original lines (braces, conditions,
// #else/#endif), so the comments below describe only the statements that are visible.
2419 F32 CudaNoStructTracer::buildCudaKdtree()
// Failure path for a missing build kernel (the guarding condition is not shown here).
2424 fail(
"Build kernel not found!");
2427 in.numTris = m_numTris;
2431 #ifndef INTERLEAVED_LAYOUT
// Depth limit: k1 * log2(number of triangles) + k2; the failure count is a linear
// function (f1, f2) of that depth. The constants are defined outside the visible lines.
2444 cudaEnv.optMaxDepth = k1 *
log2((
F32)m_numTris) + k2;
2445 cudaEnv.failureCount = f1 * cudaEnv.optMaxDepth + f2;
2447 printf(
"Maximum depth = %d\n", cudaEnv.optMaxDepth);
2448 printf(
"Failure count = %d\n", cudaEnv.failureCount);
// Prepare the dynamic memory; the returned offset is stored in the root task below.
2451 int baseOffset = setDynamicMemory();
2455 m_splitData.
clearRange(0, 0,
sizeof(SplitInfoTri));
// Binning split types (4..6): build a template split whose two children have an
// inverted bounding box (min = +FLT_MAX, max = -FLT_MAX) and a zero count.
2456 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2457 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
2459 for(
int i = 0; i < 2; i++)
2461 split.children[i].bbox.m_mn = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
2462 split.children[i].bbox.m_mx = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
2463 split.children[i].cnt = 0;
// Replicate the template into every [warp][plane] slot of the split array.
2467 for(
int i = 0; i < NUM_WARPS; i++)
2469 for(
int j = 0; j < PLANE_COUNT; j++)
2470 sArray.splits[i][j] = split;
2488 m_splitData.
clearRange(0, 0,
sizeof(SplitInfoTri));
// Copy the bounds m_bbox.min / m_bbox.max into the local `bbox` as raw float3 data.
2501 memcpy(&bbox.m_mn, &m_bbox.min,
sizeof(float3));
2502 memcpy(&bbox.m_mx, &m_bbox.max,
sizeof(float3));
// Root task `all`: triangle range ends at m_numTris, lock is free, the best cost starts
// at a very large value, and dynamicMemory is the offset from setDynamicMemory().
2515 all.triEnd = m_numTris;
2518 all.lock = LockType_Free;
2519 all.bestCost = 1e38f;
2521 all.dynamicMemory= baseOffset;
2522 #ifdef MALLOC_SCRATCHPAD
2523 all.subFailureCounter = 0;
// The split axis is the major axis of the bounding-box diagonal.
2528 Vector3 size = m_bbox.Diagonal();
2529 all.axis = size.MajorAxis();
2530 all.terminatedBy = TerminatedBy_None;
// The first task type depends on SCAN_TYPE (the opening #if is not shown).
2540 all.type = TaskType_Sort_PPS1;
2541 #elif SCAN_TYPE == 1
2542 all.type = TaskType_Sort_PPS1_Up;
2543 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
2544 all.type = TaskType_Sort_SORT1;
2546 all.unfinished = warpSubtasks(m_numTris);
// Split position is the midpoint of m_bbox along the chosen axis; the plane is stored
// as (axis unit vector, -pos).
2547 float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
2549 all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
2550 else if(all.axis == 1)
2551 all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
2553 all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
// SPLIT_TYPE 1: a split task whose work size is derived from the number of candidate
// planes. The two "#if 0" / "#elif 0" alternatives are disabled; the active choice is
// six candidates per triangle.
2554 #elif SPLIT_TYPE == 1
2555 all.type = TaskType_Split;
2556 #if 0 // SQRT candidates
2557 int evaluatedCandidates = (int)sqrtf(m_numTris);
2558 int evaluatedCandidates = 1;
2559 int numPlanes = 0.5f * m_numTris/evaluatedCandidates;
2560 #elif 0 // Fixed candidates
2561 int numPlanes = 32768;
2562 #else // All candidates
2563 int numPlanes = m_numTris*6;
2565 all.unfinished = warpSubtasks(numPlanes);
2566 #elif SPLIT_TYPE == 2
2567 all.type = TaskType_Split;
// SPLIT_TYPE 3: parallel split over PLANE_COUNT planes, each evaluated with
// sqrt(#rays) rays and sqrt(#triangles) triangles (counted in warp subtasks).
2569 #elif SPLIT_TYPE == 3
2570 all.type = TaskType_SplitParallel;
2571 int evaluatedRays = warpSubtasks((
int)sqrtf(m_numRays));
2572 int evaluatedTris = warpSubtasks((
int)sqrtf(m_numTris));
2573 all.unfinished = PLANE_COUNT*(evaluatedRays+evaluatedTris);
// Binning variants start with either a memory-initialisation task or a
// triangle-binning task (see the work-size expressions below).
2574 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
2575 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
2576 all.type = TaskType_InitMemory;
2577 all.unfinished = warpSubtasks(
sizeof(SplitArray)/
sizeof(
int));
2579 all.type = TaskType_BinTriangles;
2580 all.unfinished = (warpSubtasks(m_numTris)+BIN_MULTIPLIER-1)/BIN_MULTIPLIER;
// Remember the initial amount of work for this task.
2585 all.origSize = all.unfinished;
// Store the root task in m_taskData at byte offset TASK_SIZE * sizeof(int) ...
2587 m_taskData.
setRange(TASK_SIZE *
sizeof(
int), &all,
sizeof(TaskBVH));
// ... and its unfinished counter at offset 0.
// NOTE(review): presumably m_taskData holds TASK_SIZE int headers followed by the
// task records - confirm against the buffer allocation.
2590 m_taskData.
setRange(0, &all.unfinished,
sizeof(
int));
// Without the interleaved layout, nodeTop starts at sizeof(CudaKdtreeNode).
// NOTE(review): presumably this reserves the first node slot for the root - confirm.
2596 #ifndef INTERLEAVED_LAYOUT
2599 tasks.nodeTop =
sizeof(CudaKdtreeNode);
// Active list: every slot is -1 except slot 0, which refers to task 0; one entry is active.
2605 memset(tasks.active, -1,
sizeof(
int)*(ACTIVE_MAX+1));
2606 tasks.active[0] = 0;
2609 tasks.activeTop = 1;
// Empty list is zero-filled and starts at the bottom.
2614 memset(tasks.empty, 0,
sizeof(
int)*(EMPTY_MAX+1));
2616 tasks.emptyBottom = 0;
2617 tasks.unfinished = -1;
// Reset the build statistics counters.
2618 tasks.numSortedTris = 0;
2620 tasks.numEmptyLeaves = 0;
2621 tasks.numLeaves = 0;
// Output capacities expressed in elements (nodes and S32 triangle indices).
2623 tasks.sizeNodes = m_bvhData.
getSize()/
sizeof(CudaKdtreeNode);
2624 tasks.sizeTris = m_trisIndexOut.
getSize()/
sizeof(
S32);
2625 memset(tasks.leafHist, 0,
sizeof(tasks.leafHist));
// Two launch configurations are visible (the selecting directive is not shown):
// a single block of one warp, or NUM_SM*numBlocksPerSM blocks of numWarpsPerBlock warps.
2632 Vec2i blockSize(WARP_SIZE, 1);
2633 Vec2i gridSize(1, 1);
2638 Vec2i blockSize(WARP_SIZE, numWarpsPerBlock);
2639 int gridSizeX = NUM_SM*numBlocksPerSM;
2640 int numWarps = numWarpsPerBlock*gridSizeX;
2641 Vec2i gridSize(gridSizeX, 1);
// Warn (with a terminal bell) when the compile-time NUM_WARPS disagrees with the launch.
2643 if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
2644 printf(
"\aNUM_WARPS constant does not match the launch parameters\n");
// Debug buffer: one float4 for each of the blockSize.y * gridSize.x launched warps.
2647 m_debug.
resizeDiscard(blockSize.y*gridSize.x*
sizeof(float4));
// Kernel time is accumulated, so more than one launch may contribute to it.
2652 float tKernel = 0.f;
2653 #ifndef DUPLICATE_REFERENCES
2656 tKernel += m_module->launchKernelTimed(kernel, blockSize, gridSize);
// Diagnostic pass over all triangles: reports prefix-sum mismatches between the CPU and
// GPU values and flags values other than 0 or 1 (the surrounding conditions are only
// partly shown).
2678 for(
int i=0;i<m_numTris;i++)
2683 cout <<
"PPS error for item " << i <<
", CPU=" << sum <<
", GPU=" << *pout <<
" for " << m_numTris <<
" triangles!" <<
"\n";
2689 if(*sout != 0 && *sout != 1)
2691 cout <<
"\nWTF " << i <<
" of " << m_numTris <<
": " << *sout <<
"!\n" <<
"\n";
2700 cout <<
"PPS correct for " << m_numTris <<
" triangles!" <<
"\n";
// Read the task stack back from the module global "g_taskStackBVH".
2704 tasks = *(TaskStackBVH*)m_module->
getGlobal(
"g_taskStackBVH").
getPtr();
// Error condition. Non-interleaved layout: unfinished work, or the pool / node /
// triangle tops exceed their element capacities. Other layout: unfinished work, or
// nodeTop exceeds the byte size of m_bvhData.
2705 #ifndef INTERLEAVED_LAYOUT
2706 if(tasks.unfinished != 0 || tasks.top > tasks.sizePool || tasks.nodeTop > m_bvhData.
getSize() /
sizeof(CudaKdtreeNode) || tasks.triTop > m_trisIndexOut.
getSize() /
sizeof(
S32))
2708 if(tasks.unfinished != 0 || tasks.nodeTop > m_bvhData.
getSize())
2716 printPool(tasks, numWarps);
// Test driver for the GPU sort kernel "testKeplerSort".
// Compiles src/rt/kernels/persistent_test.cu, fills the input with descending values,
// sets up a single sort task covering `arraySize` elements, launches the kernel, checks
// the output and writes per-task details and summary statistics to the Debug stream.
// arraySize: number of elements to sort.
// NOTE(review): this listing omits many original lines (braces, conditions, the return
// statement), so the comments below describe only the statements that are visible.
2738 F32 CudaNoStructTracer::testSort(
S32 arraySize)
2740 m_compiler.
setSourceFile(
"src/rt/kernels/persistent_test.cu");
2741 m_module = m_compiler.
compile();
2747 kernel = m_module->
getKernel(
"testKeplerSort");
// Failure path for a missing sort kernel (the guarding condition is not shown here).
2749 fail(
"Sort kernel not found!");
// Input values are written in descending order: arraySize-1, arraySize-2, ..., 0.
2769 for(
int i=0; i < arraySize; i++)
2772 *tiout = (arraySize-1) - i;
2778 in.numTris = arraySize;
// Root task: range ends at arraySize, pivot at the midpoint, large initial best cost.
2787 all.triEnd = arraySize;
2791 all.bestCost = 1e38f;
2796 all.pivot = arraySize / 2;
2804 all.type = TaskType_Sort_PPS1;
2805 all.unfinished = warpSubtasks(arraySize);
2806 all.origSize = all.unfinished;
// Store the root task in m_taskData at byte offset TASK_SIZE * sizeof(int) and its
// unfinished counter at offset 0 (the same buffer is read back below as an int header
// array plus a TaskBVH array starting at that offset).
2808 m_taskData.
setRange(TASK_SIZE *
sizeof(
int), &all,
sizeof(TaskBVH));
2811 m_taskData.
setRange(0, &all.unfinished,
sizeof(
int));
// Active list is zero-filled here, so slot 0 already refers to task 0.
2820 memset(tasks.active, 0,
sizeof(
int)*(ACTIVE_MAX+1));
2821 tasks.activeTop = 1;
2827 tasks.emptyBottom = 0;
2828 tasks.unfinished = -1;
// Output capacities expressed in elements.
2830 tasks.sizeNodes = m_bvhData.
getSize()/
sizeof(CudaKdtreeNode);
2831 tasks.sizeTris = m_trisIndexOut.
getSize()/
sizeof(
S32);
// Two launch configurations are visible (the selecting directive is not shown):
// a single block of one warp, or NUM_SM*numBlocksPerSM blocks of numWarpsPerBlock warps.
2835 Vec2i blockSize(WARP_SIZE, 1);
2836 Vec2i gridSize(1, 1);
2840 Vec2i blockSize(WARP_SIZE, numWarpsPerBlock);
2841 int gridSizeX = NUM_SM*numBlocksPerSM;
2842 Vec2i gridSize(gridSizeX, 1);
// Warn (with a terminal bell) when the compile-time NUM_WARPS disagrees with the launch.
2844 if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
2845 printf(
"\aNUM_WARPS constant does not match the launch parameters\n");
// Debug buffer: one float4 for each of the blockSize.y * gridSize.x launched warps.
2848 m_debug.
resizeDiscard(blockSize.y*gridSize.x*
sizeof(float4));
// Timed launch; the three trailing arguments are extra launchKernelTimed options whose
// meaning is defined by the module API, not by this listing.
2853 float tKernel = m_module->launchKernelTimed(kernel, blockSize, gridSize,
false, 0,
false);
// Verification: element i of the output is reported against the expected value i
// (the comparison itself is not shown here).
2861 for(
int i=0; i < arraySize; i++)
2865 printf(
"Sort error %d instead of %d\n", *tsort, i);
2871 Debug <<
"\nSort in " << tKernel <<
"\n\n";
// Read the task stack back from the module global "g_taskStackBVH" and view m_taskData
// as an int header array followed by the task records.
2873 tasks = *(TaskStackBVH*)m_module->
getGlobal(
"g_taskStackBVH").
getPtr();
2874 int* header = (
int*)m_taskData.
getPtr();
// NOTE(review): sprintf("") is presumably a framework helper returning an empty
// string object rather than the C library function - confirm.
2875 printPoolHeader(&tasks, header, blockSize.y*gridSize.x,
sprintf(
""));
2877 Debug <<
"\n\nTasks" <<
"\n";
2878 TaskBVH* task = (TaskBVH*)m_taskData.
getPtr(TASK_SIZE*
sizeof(
int));
// Accumulators for the statistics printed at the end.
2883 long double sumTris = 0;
2884 long double maxTris = 0;
2887 long double cntSortTris = 0;
// Dump every task whose nodeIdx or parentIdx is not TaskHeader_Empty.
2893 if(task[i].nodeIdx != TaskHeader_Empty || task[i].parentIdx != TaskHeader_Empty)
2895 Debug <<
"Task " << i <<
"\n";
2896 Debug <<
"Header: " << header[i] <<
"\n";
2897 Debug <<
"Unfinished: " << task[i].unfinished <<
"\n";
2898 Debug <<
"Type: " << task[i].type <<
"\n";
2899 Debug <<
"TriStart: " << task[i].triStart <<
"\n";
2900 Debug <<
"TriEnd: " << task[i].triEnd <<
"\n";
2901 Debug <<
"TriRight: " << task[i].triRight <<
"\n";
2902 Debug <<
"ParentIdx: " << task[i].parentIdx <<
"\n";
2903 Debug <<
"NodeIdx: " << task[i].nodeIdx <<
"\n";
2904 Debug <<
"TaskID: " << task[i].taskID <<
"\n";
2905 Debug <<
"Depth: " << task[i].depth <<
"\n";
2910 Debug <<
"GMEMSync: " << task[i].sync <<
"\n";
2911 Debug <<
"Parent: " << task[i].parent <<
"\n";
2913 Debug <<
"Triangles: " << task[i].triEnd - task[i].triStart <<
"\n";
2914 Debug <<
"Pivot: " << task[i].pivot <<
"\n";
// Statistics are gathered for tasks at the cut-off depth m_cutOffDepth; `tris` is the
// size of the task's range.
2920 if(task[i].
depth == m_cutOffDepth)
2923 long double tris = task[i].triEnd - task[i].triStart;
2931 cntSortTris += tris;
2937 maxDepth =
max(task[i].
depth, maxDepth);
2938 syncCount += task[i].sync;
// Reaching the last pool slot means the pool may have been exhausted.
2943 if(stackMax == TASK_SIZE-1)
2944 printf(
"\aIncomplete result!\n");
2946 Debug <<
"\n\nStatistics for cutoff depth " << m_cutOffDepth <<
"\n\n";
2952 Debug <<
"Avg naive task height (tris) = " << sumTris/(
long double)sortTasks <<
"\n";
2953 Debug <<
"Max naive task height (tris) = " << maxTris <<
", taskId: " << maxTaskId <<
"\n";
2954 Debug <<
"Cnt sorted operations = " << sortTasks <<
"\n";
// Reference work estimate N * log2(N), with log2 computed as logf(N)/logf(2).
2955 double cntTrisLog2Tris = (double(arraySize) * (double)(logf(arraySize)/logf(2.0f)));
2956 Debug <<
"Cnt sorted triangles = " << cntSortTris <<
"\n";
2957 Debug <<
"Cnt sorted triangles/(N log N), N=#tris = " << cntSortTris/cntTrisLog2Tris <<
"\n";
2959 Debug <<
"Max task depth = " << maxDepth <<
"\n";
2960 Debug <<
"Cnt gmem synchronizations: " << syncCount <<
"\n";
2961 Debug <<
"Leafs failed to subdivide = " << subFailed <<
" (*3) => total useless tasks " << subFailed * 3 <<
"\n";
2964 Debug <<
"max_queue_length = " << stackMax <<
"\n\n" <<
"\n";
// Host-side setup and launch of the kernel that traces a ray buffer while building the
// BVH on demand.
// rays:    ray buffer to trace; its size seeds tasks.warpCounter.
// rebuild: not referenced in the lines visible here.
// NOTE(review): this listing omits many original lines (braces, conditions,
// #else/#endif, the return statement), so the comments below describe only the
// statements that are visible.
2969 F32 CudaNoStructTracer::traceOnDemandBVHRayBuffer(
RayBuffer& rays,
bool rebuild)
// Failure path for a missing kernel (the guarding condition is not shown here).
2974 fail(
"Build kernel not found!");
// Subdivision threshold: scene surface area per ray, scaled by optCt/10.
// NOTE(review): divides by m_numRays - confirm it cannot be zero on this path.
2983 cudaEnv.subdivThreshold = (m_bbox.SurfaceArea() / (float)m_numRays) * ((float)cudaEnv.optCt/10.0f);
2987 inBVH.numTris = m_numTris;
2994 #ifdef COMPACT_LAYOUT
// Device addresses of the node and triangle buffers, offset below by the layout
// offsets nodeOfsA.x / triOfsA.x.
3000 CUdeviceptr nodePtr = m_bvhData.
getCudaPtr();
3001 CUdeviceptr triPtr = m_trisCompact.
getCudaPtr();
3002 Buffer& indexBuf = m_trisIndex;
3012 in.nodesA = nodePtr + nodeOfsA.x;
3013 in.trisA = triPtr + triOfsA.x;
// Binning split types (4..6): build a template split whose two children have an
// inverted bounding box (min = +FLT_MAX, max = -FLT_MAX) and a zero count.
3020 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3021 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
3023 for(
int i = 0; i < 2; i++)
3025 split.children[i].bbox.m_mn = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
3026 split.children[i].bbox.m_mx = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
3027 split.children[i].cnt = 0;
// Replicate the template into every [warp][plane] slot of the split array.
3031 for(
int i = 0; i < NUM_WARPS; i++)
3033 for(
int j = 0; j < PLANE_COUNT; j++)
3034 sArray.splits[i][j] = split;
// Alternative variant (its selecting directive is not shown): bounds are stored as
// ints produced by floatToOrderedInt, and the split array is indexed by plane only.
3038 for(
int i = 0; i < 2; i++)
3042 split.children[i].bbox.m_mn = make_int3(floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX), floatToOrderedInt(FLT_MAX));
3043 split.children[i].bbox.m_mx = make_int3(floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX), floatToOrderedInt(-FLT_MAX));
3044 split.children[i].cnt = 0;
3048 for(
int j = 0; j < PLANE_COUNT; j++)
3049 sArray.splits[j] = split;
// Write the initialised split array to m_splitData at offset 0 and again at byte
// offset TASK_SIZE * sizeof(SplitArray).
3051 m_splitData.
setRange(0, &sArray,
sizeof(SplitArray));
3054 m_splitData.
setRange(TASK_SIZE *
sizeof(SplitArray), &sArray,
sizeof(SplitArray));
// Fill the first sizeof(CudaBVHNode) bytes of the node buffer with UNBUILD_FLAG.
// NOTE(review): presumably this marks the root node as not yet built - confirm.
3057 m_bvhData.clearRange32(0, UNBUILD_FLAG,
sizeof(CudaBVHNode));
// Copy the bounds m_bbox.min / m_bbox.max into the local `bbox` as raw float3 data.
3060 memcpy(&bbox.m_mn, &m_bbox.min,
sizeof(float3));
3061 memcpy(&bbox.m_mx, &m_bbox.max,
sizeof(float3));
// Root task `all`: triangle range ends at m_numTris, lock is free, and the best cost
// starts at a very large value.
3067 all.triRight = m_numTris;
3068 all.triEnd = m_numTris;
3071 all.lock = LockType_Free;
3072 all.bestCost = 1e38f;
3074 #ifndef MALLOC_SCRATCHPAD
// The split axis is the major axis of the bounding-box diagonal.
3080 Vector3 size = m_bbox.Diagonal();
3081 all.axis = size.MajorAxis();
3083 all.terminatedBy = TerminatedBy_None;
3090 all.cached = LockType_None;
// The first task type depends on SCAN_TYPE (the opening #if is not shown).
3094 all.type = TaskType_Sort_PPS1;
3095 #elif SCAN_TYPE == 1
3096 all.type = TaskType_Sort_PPS1_Up;
3097 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
3098 all.type = TaskType_Sort_SORT1;
3100 all.unfinished = warpSubtasks(m_numTris);
// Split position is the midpoint of m_bbox along the chosen axis; the plane is stored
// as (axis unit vector, -pos).
3101 float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
3103 all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
3104 else if(all.axis == 1)
3105 all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
3107 all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
// Binning variants start with either a memory-initialisation task or a
// triangle-binning task (see the work-size expressions below).
3108 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3109 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
3110 all.type = TaskType_InitMemory;
3111 all.unfinished = warpSubtasks(
sizeof(SplitArray)/
sizeof(
int));
3113 all.type = TaskType_BinTriangles;
3114 all.unfinished = (warpSubtasks(m_numTris)+BIN_MULTIPLIER-1)/BIN_MULTIPLIER;
// Remember the initial amount of work for this task.
3117 all.origSize = all.unfinished;
// Store the root task in m_taskData at byte offset TASK_SIZE * sizeof(int) and its
// unfinished counter at offset 0.
3119 m_taskData.
setRange(TASK_SIZE *
sizeof(
int), &all,
sizeof(TaskBVH));
3122 m_taskData.
setRange(0, &all.unfinished,
sizeof(
int));
3130 tasks.launchFlag = 0;
// Active list: every slot is -1 except slot 0, which refers to task 0; one entry is active.
3139 memset(tasks.active, -1,
sizeof(
int)*(ACTIVE_MAX+1));
3140 tasks.active[0] = 0;
3143 tasks.activeTop = 1;
// Empty list is zero-filled and starts at the bottom; statistics counters are reset.
3148 memset(tasks.empty, 0,
sizeof(
int)*(EMPTY_MAX+1));
3150 tasks.emptyBottom = 0;
3151 tasks.numSortedTris = 0;
3153 tasks.numLeaves = 0;
3154 tasks.numEmptyLeaves = 0;
// Output capacities expressed in elements.
// NOTE(review): sizeNodes divides by sizeof(CudaKdtreeNode), while the post-launch
// check in this function divides by sizeof(CudaBVHNode) - confirm both sizes match.
3156 tasks.sizeNodes = m_bvhData.
getSize()/
sizeof(CudaKdtreeNode);
3157 tasks.sizeTris = m_trisIndexOut.
getSize()/
sizeof(
S32);
3158 memset(tasks.leafHist, 0,
sizeof(tasks.leafHist));
// warpCounter starts at the number of rays; unfinished starts at -NUM_WARPS.
// NOTE(review): presumably each warp contributes +1 when it runs out of rays, so the
// counter reaches 0 when all warps are done - confirm in the kernel.
3165 tasks.warpCounter = rays.
getSize();
3166 tasks.unfinished = -NUM_WARPS;
3168 #if SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
// Two launch configurations are visible (the selecting directive is not shown):
// a single block of one warp, or NUM_SM*numBlocksPerSM blocks of numWarpsPerBlock warps.
3179 Vec2i blockSize(WARP_SIZE, 1);
3180 Vec2i gridSize(1, 1);
3185 Vec2i blockSize(WARP_SIZE, numWarpsPerBlock);
3186 int gridSizeX = NUM_SM*numBlocksPerSM;
3187 int numWarps = numWarpsPerBlock*gridSizeX;
3188 Vec2i gridSize(gridSizeX, 1);
// Warn (with a terminal bell) when the compile-time NUM_WARPS disagrees with the launch.
3190 if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
3191 printf(
"\aNUM_WARPS constant does not match the launch parameters\n");
// Debug buffer: one float4 for each of the blockSize.y * gridSize.x launched warps.
3194 m_debug.
resizeDiscard(blockSize.y*gridSize.x*
sizeof(float4));
// Launch the kernel and record the time reported by launchKernelTimed.
3200 float tKernel = m_module->launchKernelTimed(kernel, blockSize, gridSize);
// Read the task stack back from the module global "g_taskStackBVH".
3206 tasks = *(TaskStackBVH*)m_module->
getGlobal(
"g_taskStackBVH").
getPtr();
// Error condition: work left unfinished, or the task pool / node buffer / triangle
// index buffer tops exceed their capacities.
3207 if(tasks.unfinished != 0 || tasks.top > tasks.sizePool || tasks.nodeTop > m_bvhData.
getSize() /
sizeof(CudaBVHNode) || tasks.triTop > m_trisIndexOut.
getSize() /
sizeof(
S32))
3213 printPool(tasks, numWarps);
// Host-side setup and launch of the kernel that traces a ray buffer while building the
// kd-tree on demand.
// rays:    ray buffer to trace; its size seeds tasks.warpCounter.
// rebuild: not referenced in the lines visible here.
// NOTE(review): this listing omits many original lines (braces, conditions,
// #else/#endif, the return statement), so the comments below describe only the
// statements that are visible.
3232 F32 CudaNoStructTracer::traceOnDemandKdtreeRayBuffer(
RayBuffer& rays,
bool rebuild)
// Failure path for a missing kernel (the guarding condition is not shown here).
3237 fail(
"Build kernel not found!");
// Subdivision threshold: scene surface area per ray, scaled by optCt/10.
// NOTE(review): divides by m_numRays - confirm it cannot be zero on this path.
3246 cudaEnv.subdivThreshold = (m_bbox.SurfaceArea() / (float)m_numRays) * ((float)cudaEnv.optCt/10.0f);
// Depth limit: k1 * log2(number of triangles) + k2 (constants defined outside the
// visible lines).
3249 cudaEnv.optMaxDepth = k1 *
log2((
F32)m_numTris) + k2;
3254 printf(
"Maximum depth = %d\n", cudaEnv.optMaxDepth);
3255 printf(
"Failure count = %d\n", cudaEnv.failureCount);
3261 inBVH.numTris = m_numTris;
3264 #ifndef INTERLEAVED_LAYOUT
// Device addresses of the output buffers. Without the interleaved layout, triangles
// and indices come from m_trisCompactOut / m_trisIndexOut; the other visible variant
// uses m_bvhData as the index buffer.
3270 CUdeviceptr nodePtr = m_bvhData.
getCudaPtr();
3272 #ifndef INTERLEAVED_LAYOUT
3273 CUdeviceptr triPtr = m_trisCompactOut.
getCudaPtr();
3275 Buffer& indexBuf = m_trisIndexOut;
3279 Buffer& indexBuf = m_bvhData;
// Kernel input: scene bounds plus node, triangle and triangle-index device pointers.
3288 memcpy(&in.bmin, &m_bbox.min,
sizeof(float3));
3289 memcpy(&in.bmax, &m_bbox.max,
sizeof(float3));
3290 in.nodesA = nodePtr + nodeOfsA.x;
3291 in.trisA = triPtr + triOfsA.x;
3294 in.triIndices = indexBuf.getCudaPtr();
// Bind the node range to the texture reference "t_nodesI" (float format, 4 components).
3298 m_module->
setTexRef(
"t_nodesI", nodePtr + nodeOfsA.x, nodeOfsA.y, CU_AD_FORMAT_FLOAT, 4);
// Prepare the dynamic memory; the returned offset is stored in the root task below.
3304 int baseOffset = setDynamicMemory();
3308 m_splitData.
clearRange(0, 0,
sizeof(SplitInfoTri));
// Binning split types (4..6): build a template split whose two children have an
// inverted bounding box (min = +FLT_MAX, max = -FLT_MAX) and a zero count.
3309 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3310 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
3312 for(
int i = 0; i < 2; i++)
3314 split.children[i].bbox.m_mn = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
3315 split.children[i].bbox.m_mx = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
3316 split.children[i].cnt = 0;
// Replicate the template into every [warp][plane] slot of the split array.
3320 for(
int i = 0; i < NUM_WARPS; i++)
3322 for(
int j = 0; j < PLANE_COUNT; j++)
3323 sArray.splits[i][j] = split;
3341 m_splitData.
clearRange(0, 0,
sizeof(SplitInfoTri));
// Fill the first sizeof(CudaKdtreeNode) bytes of the node buffer with UNBUILD_FLAG.
// NOTE(review): presumably this marks the root node as not yet built - confirm.
3353 m_bvhData.clearRange32(0, UNBUILD_FLAG,
sizeof(CudaKdtreeNode));
// Copy the bounds m_bbox.min / m_bbox.max into the local `bbox` as raw float3 data.
3356 memcpy(&bbox.m_mn, &m_bbox.min,
sizeof(float3));
3357 memcpy(&bbox.m_mx, &m_bbox.max,
sizeof(float3));
// Root task `all`: triangle range ends at m_numTris, lock is free, the best cost starts
// at a very large value, and dynamicMemory is the offset from setDynamicMemory().
3364 all.triEnd = m_numTris;
3367 all.lock = LockType_Free;
3368 all.bestCost = 1e38f;
3370 all.dynamicMemory= baseOffset;
3371 #ifdef MALLOC_SCRATCHPAD
3372 all.subFailureCounter = 0;
// The split axis is the major axis of the bounding-box diagonal.
3377 Vector3 size = m_bbox.Diagonal();
3378 all.axis = size.MajorAxis();
3380 all.terminatedBy = TerminatedBy_None;
3387 all.cached = LockType_None;
// The first task type depends on SCAN_TYPE (the opening #if is not shown).
3391 all.type = TaskType_Sort_PPS1;
3392 #elif SCAN_TYPE == 1
3393 all.type = TaskType_Sort_PPS1_Up;
3394 #elif SCAN_TYPE == 2 || SCAN_TYPE == 3
3395 all.type = TaskType_Sort_SORT1;
3397 all.unfinished = warpSubtasks(m_numTris);
// Split position is the midpoint of m_bbox along the chosen axis; the plane is stored
// as (axis unit vector, -pos).
3398 float pos = m_bbox.min[all.axis] + m_bbox.Size(all.axis)/2.0f;
3400 all.splitPlane = make_float4(1.f, 0.f, 0.f, -pos);
3401 else if(all.axis == 1)
3402 all.splitPlane = make_float4(0.f, 1.f, 0.f, -pos);
3404 all.splitPlane = make_float4(0.f, 0.f, 1.f, -pos);
// SPLIT_TYPE 1: a split task whose work size is derived from the number of candidate
// planes. The two "#if 0" / "#elif 0" alternatives are disabled; the active choice is
// six candidates per triangle.
3405 #elif SPLIT_TYPE == 1
3406 all.type = TaskType_Split;
3407 #if 0 // SQRT candidates
3408 int evaluatedCandidates = (int)sqrtf(m_numTris);
3409 int evaluatedCandidates = 1;
3410 int numPlanes = 0.5f * m_numTris/evaluatedCandidates;
3411 #elif 0 // Fixed candidates
3412 int numPlanes = 32768;
3413 #else // All candidates
3414 int numPlanes = m_numTris*6;
3416 all.unfinished = warpSubtasks(numPlanes);
3417 #elif SPLIT_TYPE == 2
3418 all.type = TaskType_Split;
// SPLIT_TYPE 3: parallel split over PLANE_COUNT planes, each evaluated with
// sqrt(#rays) rays and sqrt(#triangles) triangles (counted in warp subtasks).
3420 #elif SPLIT_TYPE == 3
3421 all.type = TaskType_SplitParallel;
3422 int evaluatedRays = warpSubtasks((
int)sqrtf(m_numRays));
3423 int evaluatedTris = warpSubtasks((
int)sqrtf(m_numTris));
3424 all.unfinished = PLANE_COUNT*(evaluatedRays+evaluatedTris);
// Binning variants start with either a memory-initialisation task or a
// triangle-binning task (see the work-size expressions below).
3425 #elif SPLIT_TYPE >= 4 && SPLIT_TYPE <= 6
3426 #if BINNING_TYPE == 0 || BINNING_TYPE == 1
3427 all.type = TaskType_InitMemory;
3428 all.unfinished = warpSubtasks(
sizeof(SplitArray)/
sizeof(
int));
3430 all.type = TaskType_BinTriangles;
3431 all.unfinished = (warpSubtasks(m_numTris)+BIN_MULTIPLIER-1)/BIN_MULTIPLIER;
// Remember the initial amount of work for this task.
3434 all.origSize = all.unfinished;
// Store the root task in m_taskData at byte offset TASK_SIZE * sizeof(int) and its
// unfinished counter at offset 0.
3436 m_taskData.
setRange(TASK_SIZE *
sizeof(
int), &all,
sizeof(TaskBVH));
3439 m_taskData.
setRange(0, &all.unfinished,
sizeof(
int));
3447 tasks.launchFlag = 0;
// Without the interleaved layout, nodeTop starts at sizeof(CudaKdtreeNode).
// NOTE(review): presumably this reserves the first node slot for the root - confirm.
3451 #ifndef INTERLEAVED_LAYOUT
3454 tasks.nodeTop =
sizeof(CudaKdtreeNode);
// Active list: every slot is -1 except slot 0, which refers to task 0; one entry is active.
3460 memset(tasks.active, -1,
sizeof(
int)*(ACTIVE_MAX+1));
3461 tasks.active[0] = 0;
3464 tasks.activeTop = 1;
// Empty list is zero-filled and starts at the bottom; statistics counters are reset.
3469 memset(tasks.empty, 0,
sizeof(
int)*(EMPTY_MAX+1));
3471 tasks.emptyBottom = 0;
3472 tasks.numSortedTris = 0;
3474 tasks.numLeaves = 0;
3475 tasks.numEmptyLeaves = 0;
// Output capacities expressed in elements (nodes and S32 triangle indices).
3477 tasks.sizeNodes = m_bvhData.
getSize()/
sizeof(CudaKdtreeNode);
3478 tasks.sizeTris = m_trisIndexOut.
getSize()/
sizeof(
S32);
3479 memset(tasks.leafHist, 0,
sizeof(tasks.leafHist));
// warpCounter starts at the number of rays. The initial unfinished count is
// -NUM_WARPS unless ONDEMAND_FULL_BUILD is defined, in which case the visible
// alternative is -1.
3486 tasks.warpCounter = rays.
getSize();
3487 #ifndef ONDEMAND_FULL_BUILD
3488 tasks.unfinished = -NUM_WARPS;
3490 tasks.unfinished = -1;
// Two launch configurations are visible (the selecting directive is not shown):
// a single block of one warp, or NUM_SM*numBlocksPerSM blocks of numWarpsPerBlock warps.
3498 Vec2i blockSize(WARP_SIZE, 1);
3499 Vec2i gridSize(1, 1);
3504 Vec2i blockSize(WARP_SIZE, numWarpsPerBlock);
3505 int gridSizeX = NUM_SM*numBlocksPerSM;
3506 int numWarps = numWarpsPerBlock*gridSizeX;
3507 Vec2i gridSize(gridSizeX, 1);
// Warn (with a terminal bell) when the compile-time NUM_WARPS disagrees with the launch.
3509 if(gridSizeX*numWarpsPerBlock != NUM_WARPS)
3510 printf(
"\aNUM_WARPS constant does not match the launch parameters\n");
// Debug buffer: one float4 for each of the blockSize.y * gridSize.x launched warps.
3513 m_debug.
resizeDiscard(blockSize.y*gridSize.x*
sizeof(float4));
// Kernel time is accumulated, so more than one launch may contribute to it.
3519 float tKernel = 0.f;
3520 #ifndef DUPLICATE_REFERENCES
3524 tKernel += m_module->launchKernelTimed(kernel, blockSize, gridSize);
// Read the task stack back from the module global "g_taskStackBVH".
3540 tasks = *(TaskStackBVH*)m_module->
getGlobal(
"g_taskStackBVH").
getPtr();
// Error condition. Non-interleaved layout: unfinished work, or the pool / node /
// triangle tops exceed their element capacities. Other layout: unfinished work, or
// nodeTop exceeds the byte size of m_bvhData.
3541 #ifndef INTERLEAVED_LAYOUT
3542 if(tasks.unfinished != 0 || tasks.top > tasks.sizePool || tasks.nodeTop > m_bvhData.
getSize() /
sizeof(CudaKdtreeNode) || tasks.triTop > m_trisIndexOut.
getSize() /
sizeof(
S32))
3544 if(tasks.unfinished != 0 || tasks.nodeTop > m_bvhData.
getSize())
3551 printPool(tasks, numWarps);
// CPU path over a whole ray buffer: iterates over every ray index in `rb`.
// rb: ray buffer to process; rb.getSize() gives the number of rays.
// NOTE(review): the per-ray work and the return statement are not shown in this listing.
3570 F32 CudaNoStructTracer::traceCpuRayBuffer(
RayBuffer& rb)
3574 for(
int rid=0; rid < rb.
getSize(); rid++)
// Progress output every 10000 rays.
3576 if(rid % 10000 == 0)
printf(
"rid: %d\n",rid);
// CPU reference intersection of one ray against every triangle (no acceleration
// structure is consulted in the visible lines).
// r:      ray to test.
// result: receives the hit; result.id is set to the index of the hit triangle.
// anyHit: not referenced in the lines visible here.
// NOTE(review): this listing omits the lines that compute nrmN, org0, den, crossProd
// and the vertex values, so only the visible tests are described.
3583 void CudaNoStructTracer::traceCpuRay(
const Ray& r,
RayResult& result,
bool anyHit)
3586 const S32 *t_trisIndices = (
S32*)(m_trisIndex.
getPtr());
// Visit all triangles; triAddr advances by 3 per triangle.
3601 for (
int triAddr = 0; triAddr < m_numTris * 3 ; triAddr += 3)
// Reciprocal of the denominator, reused for t, u and v.
// NOTE(review): no visible guard against den == 0 - confirm whether one exists in the
// omitted lines.
3613 const float deni = 1.0f / den;
3615 float t =
dot(nrmN,org0)*deni;
// Accept only hits strictly inside (tmin, hitT), i.e. closer than the current best.
3617 if (t > tmin && t < hitT)
// Inside test: v in [0, 1], u >= 0 and u + v <= 1.
3620 const float v =
dot(v00-v22,crossProd)*deni;
3621 if (v >= 0.0f && v <= 1.0f)
3623 const float u = -
dot(v00-v11,crossProd)*deni;
3624 if (u >= 0.0f && u + v <= 1.0f)
// Convert the vertex-address based hit index into a triangle index.
3636 hitIndex = hitIndex / 3;
3638 result.
id = hitIndex;
// Records buffer sizes, converted from bytes to megabytes, in the m_size* members.
// ads, aux: flags selecting which groups of sizes are recorded; the conditions that
//           use them are not shown in this listing.
3644 void CudaNoStructTracer::saveBufferSizes(
bool ads,
bool aux)
// Bytes per megabyte, as a float so the divisions below are floating-point.
3646 float MB = (float)(1024*1024);
// Without COMPACT_LAYOUT the index size comes from m_trisIndex; the other visible
// variant uses the output buffers m_trisCompactOut / m_trisIndexOut.
3651 #ifndef COMPACT_LAYOUT
3653 m_sizeTriIdx = m_trisIndex.
getSize()/
MB;
3655 m_sizeTri = m_trisCompactOut.
getSize()/
MB;
3656 m_sizeTriIdx = m_trisIndexOut.
getSize()/
MB;
// With the CUDA malloc heap (no custom allocator defined), query the heap limit from
// the driver and store it in megabytes.
// NOTE(review): the return code of cuCtxGetLimit is not checked in the visible line.
3664 #ifdef MALLOC_SCRATCHPAD
3665 #if !defined(ATOMIC_MALLOC) && !defined(SCATTER_ALLOC) && !defined(CIRCULAR_MALLOC)
3667 cuCtxGetLimit(&heapSize, CU_LIMIT_MALLOC_HEAP_SIZE);
3668 m_heap = heapSize/
MB;
// Prepares the device-side dynamic memory used by the build kernels, depending on the
// allocator selected at compile time (CUDA malloc heap, atomic/circular malloc, or
// scatter alloc).
// NOTE(review): this listing omits many original lines, so only the visible
// statements are described.
3678 void CudaNoStructTracer::prepareDynamicMemory()
// For SCATTER_ALLOC / FDG_ALLOC the allocation size is raised to at least 8 MiB.
// NOTE(review): as listed, allocSize appears on both sides of its own initialisation;
// the original declaration line is probably missing from this listing - confirm
// against the full source.
3686 #if defined(SCATTER_ALLOC) || defined(FDG_ALLOC)
3687 U64 allocSize =
max(allocSize, 8ULL*1024ULL*1024ULL);
// Default allocator: set the CUDA malloc heap limit to allocSize.
// NOTE(review): the return code of cuCtxSetLimit is not checked in the visible line.
3690 #if !defined(ATOMIC_MALLOC) && !defined(SCATTER_ALLOC) && !defined(CIRCULAR_MALLOC)
3691 cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, allocSize);
3692 #elif defined(ATOMIC_MALLOC) || defined(CIRCULAR_MALLOC)
3694 #ifdef WITH_SCATTER_ALLOC
3697 #elif defined(SCATTER_ALLOC)
// Scatter alloc: look up the heap-initialisation kernel by its mangled name
// (parameter block of two device pointers plus one int).
3701 #if defined(SCATTER_ALLOC) || defined(WITH_SCATTER_ALLOC)
3703 CUfunction initHeap = m_module->
getKernel(
"_ZN8GPUTools8initHeapILj4096ELj8ELj16ELj2ELb0ELb1EEEvPNS_10DeviceHeapIXT_EXT0_EXT1_EXT2_EXT3_EXT4_EEEPvj", 2*
sizeof(CUdeviceptr)+
sizeof(
int));
3705 fail(
"Scatter alloc initialization kernel not found!");
3709 #ifdef WITH_SCATTER_ALLOC
// NOTE(review): allocSize is a U64 passed through setParami - confirm it cannot
// exceed the range of the kernel's integer parameter.
3714 offset += m_module->
setParami(initHeap, offset, allocSize);
// Run the initialisation with one block of 256 threads.
3715 F32 initTime = m_module->launchKernelTimed(initHeap,
Vec2i(256,1),
Vec2i(1, 1));
3717 printf(
"Scatter alloc initialized in %f\n", initTime);
// Initialises the dynamic memory for a build and returns an int offset; callers store
// the result in the root task's dynamicMemory field.
// Depending on the allocator selected at compile time this either runs the
// "allocFreeableMemory" kernel, or lays out chunk headers in m_mallocData and copies the
// triangle indices with the "MemCpyIndex" kernel.
// NOTE(review): this listing omits many original lines (braces, conditions,
// #else/#endif, the return statement), so only the visible statements are described.
3721 int CudaNoStructTracer::setDynamicMemory()
// Default / scatter allocators: run "allocFreeableMemory" (two int parameters,
// m_numTris and 0) as a single thread.
3724 #if !defined(ATOMIC_MALLOC) && !defined(CIRCULAR_MALLOC)
3725 CUfunction kernelAlloc = m_module->
getKernel(
"allocFreeableMemory", 2*
sizeof(
int));
3727 fail(
"Memory allocation kernel not found!");
3730 offset += m_module->
setParami(kernelAlloc, offset, m_numTris);
3731 offset += m_module->
setParami(kernelAlloc, offset, 0);
3732 F32 allocTime = m_module->launchKernelTimed(kernelAlloc,
Vec2i(1,1),
Vec2i(1, 1));
3735 printf(
"Memory allocated in %f\n", allocTime);
// Size in bytes of the initial region: either four ints or one int per triangle
// (the selecting directive is not shown).
3743 heapOffset = 4*m_numTris*
sizeof(int);
3745 heapOffset = m_numTris*
sizeof(int);
3749 heapSize = m_mallocData.
getSize();
// Circular malloc: each chunk starts with a header of two ints, or three ints when
// DOUBLY_LINKED is defined; the initial region grows by one header.
3751 #if defined(CIRCULAR_MALLOC)
3752 #ifndef DOUBLY_LINKED
3753 int headerSize = 2*
sizeof(int);
3755 int headerSize = 3*
sizeof(int);
3757 heapOffset += headerSize;
// First header: locked (LockType_Set), with the next offset at heapOffset; the
// doubly-linked form also carries heapSize-headerSize as its middle field.
3762 #ifndef DOUBLY_LINKED
3763 Vec2i first(LockType_Set, heapOffset);
3766 Vec3i first(LockType_Set, heapSize-headerSize, heapOffset);
// GLOBAL_HEAP_LOCK variant: a single free chunk pointing at heapSize-headerSize.
3770 #ifdef GLOBAL_HEAP_LOCK
3771 #ifndef DOUBLY_LINKED
3772 Vec2i second(LockType_Free, heapSize-headerSize);
3775 Vec3i second(LockType_Free, 0, heapSize-headerSize);
// Equal-sized chunk variant: headers at multiples of heapOffset, each pointing at the
// following chunk (and at the preceding one when doubly linked).
3781 int numChunks = m_mallocData.
getSize()/heapOffset;
3782 for(
int i = 1; i < numChunks; i++)
3784 #ifndef DOUBLY_LINKED
3785 Vec2i next(0, (i+1)*heapOffset);
3788 Vec3i next(0, (i-1)*heapOffset, (i+1)*heapOffset);
// Shrinking chunk variant: delta is rounded up to a multiple of 4 bytes and reduced to
// 80% (plus a header) on each step; the loop ends via the size check below.
3795 int delta = ((int)(heapOffset)+headerSize+3) & -4;
3800 for(ofs = heapOffset;
true; ofs += delta, i++)
3804 delta = ((int)(delta * 0.8f)+headerSize+3) & -4;
3809 if(ofs+delta >= heapSize-2*headerSize)
3812 #ifndef DOUBLY_LINKED
3813 Vec2i next(LockType_Free, ofs+delta);
3816 Vec3i next(LockType_Free, prevOfs, ofs+delta);
// Last free chunk points at the tail header located at heapSize-headerSize.
3824 #ifndef DOUBLY_LINKED
3825 Vec2i last(LockType_Free, heapSize-headerSize);
3828 Vec3i last(LockType_Free, prevOfs, heapSize-headerSize);
// Tail header: locked, with next offset 0; written at the end of m_mallocData.
// NOTE(review): presumably the zero offset links back to the start of the heap,
// making the list circular - confirm.
3833 #ifndef DOUBLY_LINKED
3834 Vec2i tail(LockType_Set, 0);
3835 m_mallocData.
setRange(heapSize-headerSize, &tail,
sizeof(
Vec2i));
3837 Vec3i tail(LockType_Set, ofs, 0);
3838 m_mallocData.
setRange(heapSize-headerSize, &tail,
sizeof(
Vec3i));
// For this allocator the returned base offset is the header size.
3842 baseOffset = headerSize;
3844 #ifdef WITH_SCATTER_ALLOC
// Copy the triangle indices with the "MemCpyIndex" kernel (one device pointer plus one
// int parameter).
3854 CUfunction kernelMemCpyIndex = m_module->
getKernel(
"MemCpyIndex",
sizeof(CUdeviceptr)+
sizeof(
int));
3855 if (!kernelMemCpyIndex)
3856 fail(
"Memory copy kernel not found!");
// memSize is the number of ints held in m_trisIndex.
3858 int memSize = m_trisIndex.
getSize()/
sizeof(int);
3861 offset += m_module->
setParami(kernelMemCpyIndex, offset, memSize);
// 256-thread blocks; the grid is the ceiling of memSize/256 so every index is covered.
3862 F32 memcpyTime = m_module->launchKernelTimed(kernelMemCpyIndex,
Vec2i(256,1),
Vec2i((memSize-1+256)/256, 1));
3865 printf(
"Triangle indices copied in %f\n", memcpyTime);
// Scatter alloc: the base offset is the distance between `heap` and `base`
// (both defined in lines not shown here).
3868 #ifdef SCATTER_ALLOC
3871 baseOffset = heap - base;
3884 CUfunction kernelCreateWoop = m_module->
getKernel(
"createWoop", 2*
sizeof(CUdeviceptr)+
sizeof(
int));
3885 if (!kernelCreateWoop)
3886 fail(
"Regular triangle to Woop triangle conversion kernel not found!");
3891 offset += m_module->
setParami(kernelCreateWoop, offset, m_numTris);
3892 F32 woopTime = m_module->launchKernelTimed(kernelCreateWoop,
Vec2i(256,1),
Vec2i((m_numTris-1+256)/256, 1));
3895 printf(
"Woop triangles created in %f\n", woopTime);
3907 m_trisCompactOut.
reset();
3908 m_trisIndexOut.
reset();
3911 m_mallocData.
reset();
3912 m_mallocData2.
reset();
3914 m_splitData.
reset();
3916 m_raysIndex.
reset();
3919 m_ppsTrisIndex.
reset();
3922 m_ppsRaysIndex.
reset();
3929 saveBufferSizes(
false,
true);
3934 U32 nN, nL, eL, sT, bT, tT, sTr;
3935 getStats(nN, nL, eL, sT, bT, tT, sTr);
3936 #ifdef COMPACT_LAYOUT
3937 m_bvhData.
resize(nN *
sizeof(CudaBVHNode));
3938 m_trisCompactOut.
resize(tT*3*
sizeof(float4) + nL*
sizeof(float4));
3939 m_trisIndexOut.
resize(tT*3*
sizeof(
int) + nL*
sizeof(
int));
3941 m_bvhData.
resize((nN + nL) *
sizeof(CudaBVHNode));
3945 saveBufferSizes(
true,
false);
3951 saveBufferSizes(
false,
true);
3956 U32 nN, nL, eL, sT, nT, tT, sTr;
3957 getStats(nN, nL, eL, sT, nT, tT, sTr);
3958 #ifndef INTERLEAVED_LAYOUT
3959 #ifndef COMPACT_LAYOUT
3960 getStats(nN, nL, eL, sT, nT, tT, sTr,
false);
3961 m_bvhData.
resize((nN + nL) *
sizeof(CudaKdtreeNode));
3962 m_trisCompactOut.
resize(tT*3*
sizeof(float4));
3963 m_trisIndexOut.
resize(tT*3*
sizeof(
int));
3965 #ifdef DUPLICATE_REFERENCES
3966 m_bvhData.
resize(nN *
sizeof(CudaKdtreeNode));
3967 m_trisCompactOut.
resize(tT*3*
sizeof(float4) + nL*
sizeof(float4));
3968 m_trisIndexOut.
resize(tT*3*
sizeof(
int) + nL*
sizeof(
int));
3970 m_bvhData.
resize(nN *
sizeof(CudaKdtreeNode));
3971 m_trisIndexOut.
resize(tT*
sizeof(
int) + nL*
sizeof(
int));
3980 saveBufferSizes(
true,
false);
3985 TaskStackBVH tasks = *(TaskStackBVH*)m_module->
getGlobal(
"g_taskStackBVH").
getPtr();
3987 #ifndef INTERLEAVED_LAYOUT
3988 #ifndef BVH_COUNT_NODES
3989 #ifndef COMPACT_LAYOUT
3990 nodes = tasks.nodeTop / 2;
3991 leaves = tasks.nodeTop - nodes;
3993 nodes = tasks.nodeTop;
3994 leaves = tasks.triTop;
3997 #else // BVH_COUNT_NODES
3998 nodes = tasks.numNodes;
3999 leaves = tasks.numLeaves;
4000 emptyLeaves = tasks.numEmptyLeaves;
4001 #endif // BVH_COUNT_NODES
4003 #ifdef COMPACT_LAYOUT
4004 tris = tasks.triTop;
4006 tris -= (leaves-emptyLeaves);
4007 #ifdef DUPLICATE_REFERENCES
4017 tris = tasks.triTop;
4022 #ifndef BVH_COUNT_NODES
4023 nodes = tasks.nodeTop / 2;
4024 leaves = tasks.nodeTop - nodes;
4026 #else // BVH_COUNT_NODES
4027 nodes = tasks.numNodes;
4028 leaves = tasks.numLeaves;
4029 emptyLeaves = tasks.numEmptyLeaves;
4030 #endif // BVH_COUNT_NODES
4032 tris = tasks.nodeTop - (nodes+leaves)*
sizeof(CudaKdtreeNode);
4033 tris /= 3*
sizeof(float4)+
sizeof(
int);
4036 nodeTop = tasks.nodeTop;
4037 sortedTris = tasks.numSortedTris;
4038 stackTop = tasks.top;
4044 split = m_sizeSplit;
4047 triIdx = m_sizeTriIdx;
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int offset
S32 getSize() const
Gets size of the buffer (number of rays).
bool endsWith(const String &str) const
void setRange(S64 dstOfs, const void *src, S64 size, bool async=false, CUstream cudaStream=NULL)
void traceOnDemandTrace(RayBuffer &rays, F32 &GPUmegakernel, F32 &CPUmegakernel, F32 &GPUtravKernel, F32 &CPUtravKernel, int &buildNodes, RayStats *stats=NULL)
float GetFloat(const char *name, const bool isFatal=false) const
CudaModule * compile(bool enablePrints=true, bool autoFail=true)
int GetInt(const char *name, const bool isFatal=false) const
F32 traceBatchKdtree(RayBuffer &rays, RayStats *stats=NULL)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N
CUdeviceptr getCudaPtr(S64 ofs=0)
S32 numTriangleTests
Total number of ray-triangle tests.
S32 numRays
Total number of rays.
CudaKernel getKernel(const String &name)
FW_CUDA_FUNC Vec3f getXYZ(void) const
FW_CUDA_FUNC F32 sqrt(F32 a)
int setParami(CUfunction kernel, int offset, S32 value)
Structure holding ray statistics. Also provides print to the console. These statistics are used in a ...
F32 traceBatch(RayBuffer &rays)
const U8 * getPtr(S64 ofs=0)
void setOwner(Module module, bool modify, bool async=false, CUstream cudaStream=NULL, S64 validSize=-1)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx cuCtxSynchronize
FW_CUDA_FUNC T dot(const VectorBase< T, L, S > &a, const VectorBase< T, L, V > &b)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei 
GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei GLint GLenum GLenum const GLvoid pixels GLint GLsizei const GLfloat value GLint GLfloat GLfloat v1 GLint GLfloat GLfloat GLfloat v2 GLint GLsizei const GLfloat value GLint GLsizei GLboolean const GLfloat value GLuint program GLuint GLfloat x
Buffer & getRayBuffer()
Gets ray buffer.
bool getNeedClosestHit() const
Returns whether the closest hit is needed.
void setTexRef(const String &name, Buffer &buf, CUarray_format format, int numComponents)
void define(const String &key, const String &value="")
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei 
GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei GLint GLenum GLenum const GLvoid pixels GLint GLsizei const GLfloat value GLint GLfloat GLfloat v1 GLint GLfloat GLfloat GLfloat v2 GLint GLsizei const GLfloat value GLint GLsizei GLboolean const GLfloat value GLuint program GLuint GLfloat GLfloat GLfloat z
static int getComputeCapability(void)
FW_CUDA_FUNC T sum(const VectorBase< T, L, S > &v)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei 
GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei GLint GLenum GLenum const GLvoid pixels GLint GLsizei const GLfloat value GLint GLfloat GLfloat v1 GLint GLfloat GLfloat GLfloat v2 GLint GLsizei const GLfloat value GLint GLsizei GLboolean const GLfloat value GLuint program GLuint GLfloat GLfloat y
static Environment * GetSingleton()
F32 traceOnDemandBVH(RayBuffer &rays, bool rebuild, int numRays=0)
CUdeviceptr getMutableCudaPtr(S64 ofs=0)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v
Buffer & getResultBuffer()
Gets ray result buffer.
F32 traceBatchBVH(RayBuffer &rays, RayStats *stats=NULL)
Ray buffer class. Stores rays.
U8 * getMutablePtr(S64 ofs=0)
FW_CUDA_FUNC T min(const VectorBase< T, L, S > &v)
FW_CUDA_FUNC T max(const VectorBase< T, L, S > &v)
CudaNoStructTracer(MiniMax::Scene &scene, F32 epsilon)
String sprintf(const char *fmt,...)
void getStats(U32 &nodes, U32 &leaves, U32 &emptyLeaves, U32 &stackTop, U32 &nodeTop, U32 &tris, U32 &sortedTris, bool sub=true)
bool GetFloatValue(const char *name, float &value, const bool isFatal=false) const
S32 numNodeTests
Total number of ray-node tests.
Class holding information about a split of a BVH node.
Buffer & getGlobal(const String &name)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction f
FW_CUDA_FUNC F32 cross(const Vec2f &a, const Vec2f &b)
void clearRange(S64 dstOfs, int value, S64 size, bool async=false, CUstream cudaStream=NULL)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei n
void printf(const char *fmt,...)
String & appendf(const char *fmt,...)
F32 traceOnDemandKdtree(RayBuffer &rays, bool rebuild, int numRays=0)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei 
width
void resetBuffers(bool resetADSBuffers)
FW_CUDA_FUNC F64 log(F64 a)
void getSizes(F32 &task, F32 &split, F32 &ads, F32 &tri, F32 &triIdx, F32 &heap)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei 
GLsizei height
FW_CUDA_FUNC S normalized(T len=(T) 1) const
int setParamPtr(CUfunction kernel, int offset, CUdeviceptr value)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei 
GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei depth
bool GetIntValue(const char *name, int &value, const bool isFatal=false) const
static void staticInit(void)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type
void fail(const char *fmt,...)
void addOptions(const String &options)
void resizeDiscard(S64 size)
void setSourceFile(const String &path)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr size
void setCachePath(const String &path)
void reset(U32 hints, int align)