NTrace
GPU ray tracing framework
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
CudaKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2009-2011, NVIDIA Corporation
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of NVIDIA Corporation nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include "gpu/CudaKernel.hpp"
29 #include "gpu/CudaModule.hpp"
30 #include "base/Timer.hpp"
31 
32 using namespace FW;
33 
34 //------------------------------------------------------------------------
35 
37 : m_module (module),
38  m_function (function),
39  m_preferL1 (true),
40  m_sharedBankSize (4),
41  m_async (false),
42  m_stream (NULL),
43  m_gridSize (1, 1),
44  m_blockSize (1, 1)
45 {
46 }
47 
48 //------------------------------------------------------------------------
49 
51 {
52 }
53 
54 //------------------------------------------------------------------------
55 
56 int CudaKernel::getAttribute(CUfunction_attribute attrib) const
57 {
58  int value = 0;
59 #if (!FW_USE_CUDA)
60  FW_UNREF(attrib);
61 #elif (CUDA_VERSION >= 2020)
62  if (m_function && CudaModule::getDriverVersion() >= 22)
63  CudaModule::checkError("cuFuncGetAttribute", cuFuncGetAttribute(&value, attrib, m_function));
64 #endif
65  return value;
66 }
67 
68 //------------------------------------------------------------------------
69 
70 CudaKernel& CudaKernel::setParams(const Param* const* params, int numParams)
71 {
72  FW_ASSERT(numParams == 0 || params);
73  FW_ASSERT(numParams >= 0);
74 
75  int size = 0;
76  for (int i = 0; i < numParams; i++)
77  {
78  size = (size + params[i]->align - 1) & -params[i]->align;
79  size += params[i]->size;
80  }
81  m_params.reset(size);
82 
83  int ofs = 0;
84  for (int i = 0; i < numParams; i++)
85  {
86  ofs = (ofs + params[i]->align - 1) & -params[i]->align;
87  memcpy(m_params.getPtr(ofs), params[i]->value, params[i]->size);
88  ofs += params[i]->size;
89  }
90  return *this;
91 }
92 
93 //------------------------------------------------------------------------
94 
96 {
98 #if (CUDA_VERSION >= 2020)
99  int driver = CudaModule::getDriverVersion();
100 #endif
101 
102  // Details available => choose smallest block that reaches maximal occupancy.
103 
104 #if (CUDA_VERSION >= 2020)
105  if (m_function && driver >= 22)
106  {
107  int warpSize = max(CudaModule::getDeviceAttribute(CU_DEVICE_ATTRIBUTE_WARP_SIZE), 1);
108  int warpRounding = 2;
109  int maxBlocksPerSM = 8;
110  int maxSharedPerSM = CudaModule::getDeviceAttribute(CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK);
111  int maxThreadsPerBlock = getAttribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
112  int maxThreadsPerSM = (arch >= 20 && arch < 30) ? 1536 : maxThreadsPerBlock;
113  int sharedPerBlock = getAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES);
114 
115 #if (CUDA_VERSION >= 4000)
116  if (driver >= 40)
117  maxThreadsPerSM = CudaModule::getDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
118 #endif
119 
120  if (arch >= 20 && m_preferL1 && sharedPerBlock <= maxSharedPerSM / 3)
121  maxSharedPerSM /= 3;
122 
123  int numBlocks = maxBlocksPerSM;
124  if (sharedPerBlock > 0)
125  numBlocks = min(numBlocks, maxSharedPerSM / sharedPerBlock);
126  numBlocks = min(numBlocks, maxThreadsPerSM / warpSize / warpRounding);
127  numBlocks = max(numBlocks, 1);
128 
129  int numWarps = maxThreadsPerBlock / warpSize;
130  numWarps = min(numWarps, maxThreadsPerSM / numBlocks / warpSize);
131  numWarps -= numWarps % warpRounding;
132  numWarps = max(numWarps, 1);
133  return Vec2i(warpSize, numWarps);
134  }
135 #endif
136 
137  // Otherwise => guess based on GPU architecture.
138 
139  return Vec2i(32, (arch < 20) ? 2 : 4);
140 }
141 
142 //------------------------------------------------------------------------
143 
144 CudaKernel& CudaKernel::setGrid(int numThreads, const Vec2i& blockSize)
145 {
146  FW_ASSERT(numThreads >= 0);
147  m_blockSize = (min(blockSize) > 0) ? blockSize : getDefaultBlockSize();
148 
149  int maxGridWidth = 65536;
150 #if FW_USE_CUDA
151  int tmp = CudaModule::getDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X);
152  if (tmp != 0)
153  maxGridWidth = tmp;
154 #endif
155 
156  int threadsPerBlock = m_blockSize.x * m_blockSize.y;
157  m_gridSize = Vec2i((numThreads + threadsPerBlock - 1) / threadsPerBlock, 1);
158  while (m_gridSize.x > maxGridWidth)
159  {
160  m_gridSize.x = (m_gridSize.x + 1) >> 1;
161  m_gridSize.y <<= 1;
162  }
163 
164  return *this;
165 }
166 
167 //------------------------------------------------------------------------
168 
169 CudaKernel& CudaKernel::setGrid(const Vec2i& sizeThreads, const Vec2i& blockSize)
170 {
171  FW_ASSERT(min(sizeThreads) >= 0);
172  m_blockSize = (min(blockSize) > 0) ? blockSize : getDefaultBlockSize();
173  m_gridSize = (sizeThreads + m_blockSize - 1) / m_blockSize;
174  return *this;
175 }
176 
177 //------------------------------------------------------------------------
178 
180 {
181  if (prepareLaunch())
182  performLaunch();
183  return *this;
184 }
185 
186 //------------------------------------------------------------------------
187 
189 {
190  // Prepare and sync before timing.
191 
192  if (!prepareLaunch())
193  return 0.0f;
194  sync(false); // wait is short => spin
195 
196  // Events not supported => use CPU-based timer.
197 
198  CUevent startEvent = CudaModule::getStartEvent();
199  CUevent endEvent = CudaModule::getEndEvent();
200 
201  if (!startEvent)
202  {
203  Timer timer(true);
204  performLaunch();
205  sync(false); // need accurate timing => spin
206  return timer.getElapsed();
207  }
208 
209  // Launch and record events.
210 
211  CudaModule::checkError("cuEventRecord", cuEventRecord(startEvent, NULL));
212  performLaunch();
213  CudaModule::checkError("cuEventRecord", cuEventRecord(endEvent, NULL));
214  sync(yield);
215 
216  // Query GPU time between the events.
217 
218  F32 time = 0.0f;
219  CudaModule::checkError("cuEventElapsedTime", cuEventElapsedTime(&time, startEvent, endEvent));
220  return time * 1.0e-3f;
221 }
222 
223 //------------------------------------------------------------------------
224 
226 {
227  CudaModule::sync(yield);
228  return *this;
229 }
230 
231 //------------------------------------------------------------------------
232 
233 bool CudaKernel::prepareLaunch(void)
234 {
235  // Nothing to do => skip.
236 
237  if (!m_module || !m_function || min(m_gridSize) == 0)
238  return false;
239 
240  // Set parameters.
241 
242  CudaModule::checkError("cuParamSetSize", cuParamSetSize(m_function, m_params.getSize()));
243  if (m_params.getSize())
244  CudaModule::checkError("cuParamSetv", cuParamSetv(m_function, 0, m_params.getPtr(), m_params.getSize()));
245 
246  // Set L1 and shared memory configuration.
247 
248 #if (CUDA_VERSION >= 3000)
249  if (isAvailable_cuFuncSetCacheConfig())
250  CudaModule::checkError("cuFuncSetCacheConfig", cuFuncSetCacheConfig(m_function,
251  (m_preferL1) ? CU_FUNC_CACHE_PREFER_L1 : CU_FUNC_CACHE_PREFER_SHARED));
252 #endif
253 
254 #if (CUDA_VERSION >= 4020)
255  if (isAvailable_cuFuncSetSharedMemConfig())
256  CudaModule::checkError("cuFuncSetSharedMemConfig", cuFuncSetSharedMemConfig(m_function,
257  (m_sharedBankSize == 4) ? CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE : CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE));
258 #endif
259 
260  // Set block size.
261 
262  CudaModule::checkError("cuFuncSetBlockShape", cuFuncSetBlockShape(m_function, m_blockSize.x, m_blockSize.y, 1));
263 
264  // Update globals.
265 
266  m_module->updateGlobals();
267  m_module->updateTexRefs(m_function);
268  return true;
269 }
270 
271 //------------------------------------------------------------------------
272 
273 void CudaKernel::performLaunch(void)
274 {
275  if (m_async && isAvailable_cuLaunchGridAsync())
276  CudaModule::checkError("cuLaunchGridAsync", cuLaunchGridAsync(m_function, m_gridSize.x, m_gridSize.y, m_stream));
277  else
278  CudaModule::checkError("cuLaunchGrid", cuLaunchGrid(m_function, m_gridSize.x, m_gridSize.y));
279 }
280 
281 //------------------------------------------------------------------------
282 
284 {
285  m_module = other.m_module;
286  m_function = other.m_function;
287  m_params = other.m_params;
288  m_preferL1 = other.m_preferL1;
289  m_async = other.m_async;
290  m_stream = other.m_stream;
291  m_gridSize = other.m_gridSize;
292  m_blockSize = other.m_blockSize;
293  return *this;
294 }
295 
296 //------------------------------------------------------------------------
#define FW_UNREF(X)
Definition: Defs.hpp:78
static CUevent getEndEvent(void)
Definition: CudaModule.hpp:79
#define NULL
Definition: Defs.hpp:39
static int getDriverVersion(void)
Definition: CudaModule.cpp:495
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule * module
Definition: DLLImports.inl:60
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei GLint GLenum GLenum const GLvoid pixels GLint GLsizei const GLfloat value GLint GLfloat GLfloat v1 GLint GLfloat GLfloat GLfloat v2 GLint GLsizei const GLfloat value GLint GLsizei GLboolean const GLfloat value GLuint program GLuint GLfloat GLfloat GLfloat z GLuint GLint GLenum GLboolean GLsizei const GLvoid pointer GLuint GLuint const GLchar name GLenum GLsizei GLenum GLsizei GLsizei height GLenum GLuint renderbuffer GLenum GLenum GLint * params
Definition: DLLImports.inl:373
CudaKernel & sync(bool yield=true)
Definition: CudaKernel.cpp:225
static void checkError(const char *funcName, CUresult res)
Definition: CudaModule.cpp:487
static int getComputeCapability(void)
Definition: CudaModule.cpp:508
void reset(S size=0)
Definition: Array.hpp:317
float F32
Definition: Defs.hpp:89
CudaKernel(CudaModule *module=NULL, CUfunction function=NULL)
Definition: CudaKernel.cpp:36
FW_CUDA_FUNC T min(const VectorBase< T, L, S > &v)
Definition: Math.hpp:461
FW_CUDA_FUNC T max(const VectorBase< T, L, S > &v)
Definition: Math.hpp:462
#define FW_ASSERT(X)
Definition: Defs.hpp:67
const void * value
Definition: CudaKernel.hpp:55
CudaKernel & operator=(const CudaKernel &other)
Definition: CudaKernel.cpp:283
Vec2i getDefaultBlockSize(void) const
Definition: CudaKernel.cpp:95
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction f
Definition: DLLImports.inl:88
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value
Definition: DLLImports.inl:84
CudaKernel & setParams(void)
Definition: CudaKernel.hpp:86
void updateGlobals(bool async=false, CUstream stream=NULL)
Definition: CudaModule.cpp:137
CudaKernel & launch(void)
Definition: CudaKernel.cpp:179
static void sync(bool yield=true)
Definition: CudaModule.cpp:398
static CUevent getStartEvent(void)
Definition: CudaModule.hpp:78
int getAttribute(CUfunction_attribute attrib) const
Definition: CudaKernel.cpp:56
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef cuLaunchGrid
Definition: DLLImports.inl:88
static int getDeviceAttribute(CUdevice_attribute attrib)
Definition: CudaModule.cpp:522
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef cuParamSetSize
Definition: DLLImports.inl:82
F32 launchTimed(bool yield=true)
Definition: CudaKernel.cpp:188
const T * getPtr(S idx=0) const
Definition: Array.hpp:202
CudaKernel & setGrid(int numThreads, const Vec2i &blockSize=0)
Definition: CudaKernel.cpp:144
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr size
Definition: DLLImports.inl:319
F32 getElapsed(void)
Definition: Timer.hpp:44
S getSize(void) const
Definition: Array.hpp:188
void updateTexRefs(CUfunction kernel)
Definition: CudaModule.cpp:264