38 m_function (function),
61 #elif (CUDA_VERSION >= 2020)
76 for (
int i = 0; i < numParams; i++)
78 size = (size + params[i]->
align - 1) & -params[i]->align;
79 size += params[i]->
size;
84 for (
int i = 0; i < numParams; i++)
86 ofs = (ofs + params[i]->
align - 1) & -params[i]->align;
88 ofs += params[i]->
size;
98 #if (CUDA_VERSION >= 2020)
104 #if (CUDA_VERSION >= 2020)
105 if (m_function && driver >= 22)
108 int warpRounding = 2;
109 int maxBlocksPerSM = 8;
111 int maxThreadsPerBlock =
getAttribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
112 int maxThreadsPerSM = (arch >= 20 && arch < 30) ? 1536 : maxThreadsPerBlock;
113 int sharedPerBlock =
getAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES);
115 #if (CUDA_VERSION >= 4000)
120 if (arch >= 20 && m_preferL1 && sharedPerBlock <= maxSharedPerSM / 3)
123 int numBlocks = maxBlocksPerSM;
124 if (sharedPerBlock > 0)
125 numBlocks =
min(numBlocks, maxSharedPerSM / sharedPerBlock);
126 numBlocks =
min(numBlocks, maxThreadsPerSM / warpSize / warpRounding);
127 numBlocks =
max(numBlocks, 1);
129 int numWarps = maxThreadsPerBlock / warpSize;
130 numWarps =
min(numWarps, maxThreadsPerSM / numBlocks / warpSize);
131 numWarps -= numWarps % warpRounding;
132 numWarps =
max(numWarps, 1);
133 return Vec2i(warpSize, numWarps);
139 return Vec2i(32, (arch < 20) ? 2 : 4);
149 int maxGridWidth = 65536;
156 int threadsPerBlock = m_blockSize.x * m_blockSize.y;
157 m_gridSize =
Vec2i((numThreads + threadsPerBlock - 1) / threadsPerBlock, 1);
158 while (m_gridSize.x > maxGridWidth)
160 m_gridSize.x = (m_gridSize.x + 1) >> 1;
173 m_gridSize = (sizeThreads + m_blockSize - 1) / m_blockSize;
192 if (!prepareLaunch())
220 return time * 1.0e-3
f;
233 bool CudaKernel::prepareLaunch(
void)
237 if (!m_module || !m_function ||
min(m_gridSize) == 0)
248 #if (CUDA_VERSION >= 3000)
249 if (isAvailable_cuFuncSetCacheConfig())
251 (m_preferL1) ? CU_FUNC_CACHE_PREFER_L1 : CU_FUNC_CACHE_PREFER_SHARED));
254 #if (CUDA_VERSION >= 4020)
255 if (isAvailable_cuFuncSetSharedMemConfig())
257 (m_sharedBankSize == 4) ? CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE : CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE));
262 CudaModule::checkError(
"cuFuncSetBlockShape", cuFuncSetBlockShape(m_function, m_blockSize.x, m_blockSize.y, 1));
273 void CudaKernel::performLaunch(
void)
275 if (m_async && isAvailable_cuLaunchGridAsync())
276 CudaModule::checkError(
"cuLaunchGridAsync", cuLaunchGridAsync(m_function, m_gridSize.x, m_gridSize.y, m_stream));
285 m_module = other.m_module;
286 m_function = other.m_function;
287 m_params = other.m_params;
288 m_preferL1 = other.m_preferL1;
289 m_async = other.m_async;
290 m_stream = other.m_stream;
291 m_gridSize = other.m_gridSize;
292 m_blockSize = other.m_blockSize;
static CUevent getEndEvent(void)
static int getDriverVersion(void)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule * module
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei GLint GLenum GLenum const GLvoid pixels GLint GLsizei const GLfloat value GLint GLfloat GLfloat v1 GLint GLfloat GLfloat GLfloat v2 GLint GLsizei const GLfloat value GLint GLsizei GLboolean const GLfloat value GLuint program GLuint GLfloat GLfloat GLfloat z GLuint GLint GLenum GLboolean GLsizei const GLvoid pointer GLuint GLuint const GLchar name GLenum GLsizei GLenum GLsizei GLsizei height GLenum GLuint renderbuffer GLenum GLenum GLint * params
CudaKernel & sync(bool yield=true)
static void checkError(const char *funcName, CUresult res)
static int getComputeCapability(void)
CudaKernel(CudaModule *module=NULL, CUfunction function=NULL)
FW_CUDA_FUNC T min(const VectorBase< T, L, S > &v)
FW_CUDA_FUNC T max(const VectorBase< T, L, S > &v)
CudaKernel & operator=(const CudaKernel &other)
Vec2i getDefaultBlockSize(void) const
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction f
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value
CudaKernel & setParams(void)
void updateGlobals(bool async=false, CUstream stream=NULL)
CudaKernel & launch(void)
static void sync(bool yield=true)
static CUevent getStartEvent(void)
int getAttribute(CUfunction_attribute attrib) const
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef cuLaunchGrid
static int getDeviceAttribute(CUdevice_attribute attrib)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef cuParamSetSize
F32 launchTimed(bool yield=true)
const T * getPtr(S idx=0) const
CudaKernel & setGrid(int numThreads, const Vec2i &blockSize=0)
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr size
void updateTexRefs(CUfunction kernel)