NTrace/_cuda_kernel_8cpp_source.html

 /*

  *  Copyright (c) 2009-2011, NVIDIA Corporation

  *  All rights reserved.

  *

  *  Redistribution and use in source and binary forms, with or without

  *  modification, are permitted provided that the following conditions are met:

  *      * Redistributions of source code must retain the above copyright

  *        notice, this list of conditions and the following disclaimer.

  *      * Redistributions in binary form must reproduce the above copyright

  *        notice, this list of conditions and the following disclaimer in the

  *        documentation and/or other materials provided with the distribution.

  *      * Neither the name of NVIDIA Corporation nor the

  *        names of its contributors may be used to endorse or promote products

  *        derived from this software without specific prior written permission.

  *

  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

  *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

  *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

  *  DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY

  *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

  *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

  *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

  *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

  *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  */


 #include "gpu/CudaKernel.hpp"

 #include "gpu/CudaModule.hpp"

 #include "base/Timer.hpp"


 using namespace FW;


 //------------------------------------------------------------------------


 // Constructs a kernel wrapper for the given module/function pair.
 // Defaults installed here: L1 preferred, shared bank size 4, launches are
 // not asynchronous, no stream, and both grid and block are 1x1.
 CudaKernel::CudaKernel(CudaModule* module, CUfunction function)
     : m_module(module)
     , m_function(function)
     , m_preferL1(true)
     , m_sharedBankSize(4)
     , m_async(false)
     , m_stream(NULL)
     , m_gridSize(1, 1)
     , m_blockSize(1, 1)
 {
     // Nothing else to do; all state is set up by the initializer list.
 }


 //------------------------------------------------------------------------


 // Destructor. Performs no explicit work in its body.
 CudaKernel::~CudaKernel(void)
 {
     // Intentionally empty.
 }


 //------------------------------------------------------------------------


 // Queries a function attribute through cuFuncGetAttribute().
 // Returns 0 when CUDA support is compiled out, when the toolkit headers are
 // older than 2.2 (CUDA_VERSION < 2020), when no function is bound, or when
 // the driver version reported by CudaModule is below 22.
 int CudaKernel::getAttribute(CUfunction_attribute attrib) const
 {
     int result = 0;

 #if (!FW_USE_CUDA)
     // No CUDA available at build time; just silence the unused parameter.
     FW_UNREF(attrib);
 #elif (CUDA_VERSION >= 2020)
     // Without a bound function or a sufficiently new driver the default stays.
     if (!m_function || CudaModule::getDriverVersion() < 22)
         return result;

     CudaModule::checkError("cuFuncGetAttribute", cuFuncGetAttribute(&result, attrib, m_function));
 #endif

     return result;
 }


 //------------------------------------------------------------------------


 // Packs the given parameters into the internal parameter buffer.
 //
 // Each parameter is placed at the next offset rounded up to its own
 // alignment (the rounding uses a bit mask, so it presumes power-of-two
 // alignments), followed by `size` bytes copied from `value`.
 //
 // params     Array of numParams pointers to parameter descriptors;
 //            may be NULL only when numParams is 0.
 // numParams  Number of descriptors; must be non-negative.
 // Returns *this to allow call chaining.
 CudaKernel& CudaKernel::setParams(const Param* const* params, int numParams)
 {
     FW_ASSERT(numParams == 0 || params);
     FW_ASSERT(numParams >= 0);

     // Pass 1: determine the total buffer size including alignment padding.
     int totalBytes = 0;
     for (int idx = 0; idx < numParams; idx++)
     {
         const Param* p = params[idx];
         totalBytes = ((totalBytes + p->align - 1) & -p->align) + p->size;
     }
     m_params.reset(totalBytes);

     // Pass 2: copy every value to its aligned position.
     int cursor = 0;
     for (int idx = 0; idx < numParams; idx++)
     {
         const Param* p = params[idx];
         cursor = (cursor + p->align - 1) & -p->align;
         memcpy(m_params.getPtr(cursor), p->value, p->size);
         cursor += p->size;
     }

     return *this;
 }


 //------------------------------------------------------------------------


 // Chooses a default block size for this kernel, returned as
 // Vec2i(threads in x, threads in y).
 //
 // When the toolkit headers are at least CUDA 2.2 (CUDA_VERSION >= 2020), a
 // function is bound, and the reported driver version is >= 22, the result is
 // (warpSize, numWarps), with numWarps derived from the per-function and
 // per-device limits queried below. Otherwise the result is a fixed guess
 // that depends only on the compute capability value.
 Vec2i CudaKernel::getDefaultBlockSize(void) const

 {

     // NOTE(review): compared against 20 and 30 below, so this presumably
     // encodes compute capability as major * 10 + minor -- confirm in CudaModule.
     int arch = CudaModule::getComputeCapability();

 #if (CUDA_VERSION >= 2020)

     int driver = CudaModule::getDriverVersion();

 #endif


     // Details available => choose smallest block that reaches maximal occupancy.


 #if (CUDA_VERSION >= 2020)

     if (m_function && driver >= 22)

     {

         // Clamped to at least 1 because warpSize is used as a divisor below.
         int warpSize            = max(CudaModule::getDeviceAttribute(CU_DEVICE_ATTRIBUTE_WARP_SIZE), 1);

         // numWarps is rounded down to a multiple of this value.
         int warpRounding        = 2;

         // Hard-coded upper bound on the number of blocks considered per SM.
         int maxBlocksPerSM      = 8;

         // The per-block shared memory limit of the device is used as the per-SM budget.
         int maxSharedPerSM      = CudaModule::getDeviceAttribute(CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK);

         int maxThreadsPerBlock  = getAttribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);

         // Initial estimate: 1536 for arch in [20, 30), otherwise the per-block limit.
         // Replaced by the queried device attribute below when available.
         int maxThreadsPerSM     = (arch >= 20 && arch < 30) ? 1536 : maxThreadsPerBlock;

         int sharedPerBlock      = getAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES);


 #if (CUDA_VERSION >= 4000)

         // Driver version >= 40 => use the actual per-SM thread limit.
         if (driver >= 40)

             maxThreadsPerSM = CudaModule::getDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);

 #endif


         // With L1 preferred on arch >= 20, only a third of the shared memory
         // budget is assumed usable, provided one block still fits in it.
         // NOTE(review): presumably models the smaller shared-memory partition
         // of the L1-preferred cache configuration -- confirm.
         if (arch >= 20 && m_preferL1 && sharedPerBlock <= maxSharedPerSM / 3)

             maxSharedPerSM /= 3;


         // Blocks per SM: limited by the fixed cap, by shared memory usage, and
         // by requiring at least warpRounding warps per block; never below 1.
         int numBlocks = maxBlocksPerSM;

         if (sharedPerBlock > 0)

             numBlocks = min(numBlocks, maxSharedPerSM / sharedPerBlock);

         numBlocks = min(numBlocks, maxThreadsPerSM / warpSize / warpRounding);

         numBlocks = max(numBlocks, 1);


         // Warps per block: limited by the per-block thread limit and by an even
         // share of the SM's threads across numBlocks blocks, rounded down to a
         // multiple of warpRounding, and never below 1.
         int numWarps = maxThreadsPerBlock / warpSize;

         numWarps = min(numWarps, maxThreadsPerSM / numBlocks / warpSize);

         numWarps -= numWarps % warpRounding;

         numWarps = max(numWarps, 1);

         return Vec2i(warpSize, numWarps);

     }

 #endif


     // Otherwise => guess based on GPU architecture.
     // 32x2 threads when arch < 20, 32x4 threads otherwise.


     return Vec2i(32, (arch < 20) ? 2 : 4);

 }


 //------------------------------------------------------------------------


 // Sets block and grid dimensions so that the launch covers at least
 // numThreads threads.
 //
 // numThreads  Total number of threads requested; must be non-negative.
 // blockSize   Block dimensions to use. If either component is not positive,
 //             getDefaultBlockSize() is used instead.
 // Returns *this to allow call chaining.
 //
 // The grid starts out one-dimensional. While its width exceeds the maximum
 // grid width, the width is halved (rounding up) and the height doubled, so
 // the resulting grid may contain more blocks than strictly required.
 CudaKernel& CudaKernel::setGrid(int numThreads, const Vec2i& blockSize)
 {
     FW_ASSERT(numThreads >= 0);
     m_blockSize = (min(blockSize) > 0) ? blockSize : getDefaultBlockSize();

     // Fallback width, replaced by the device limit when it can be queried.
     int maxGridWidth = 65536;
 #if FW_USE_CUDA
     int tmp = CudaModule::getDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X);
     if (tmp != 0)
         maxGridWidth = tmp;
 #endif

     // Ceiling division written as quotient plus remainder check, so that
     // numThreads close to INT_MAX cannot overflow the way
     // (numThreads + threadsPerBlock - 1) would.
     int threadsPerBlock = m_blockSize.x * m_blockSize.y;
     int numBlocks = numThreads / threadsPerBlock;
     if (numThreads % threadsPerBlock > 0)
         numBlocks++;

     m_gridSize = Vec2i(numBlocks, 1);
     while (m_gridSize.x > maxGridWidth)
     {
         m_gridSize.x = (m_gridSize.x + 1) >> 1;
         m_gridSize.y <<= 1;
     }

     return *this;
 }


 //------------------------------------------------------------------------


 // Sets block and grid dimensions from a two-dimensional thread count.
 //
 // sizeThreads  Requested number of threads; both components must be
 //              non-negative.
 // blockSize    Block dimensions to use. If either component is not positive,
 //              getDefaultBlockSize() is used instead.
 // Returns *this to allow call chaining.
 CudaKernel& CudaKernel::setGrid(const Vec2i& sizeThreads, const Vec2i& blockSize)
 {
     FW_ASSERT(min(sizeThreads) >= 0);

     if (min(blockSize) > 0)
         m_blockSize = blockSize;
     else
         m_blockSize = getDefaultBlockSize();

     // Round the thread count up to whole blocks.
     Vec2i roundedUp = sizeThreads + m_blockSize - 1;
     m_gridSize = roundedUp / m_blockSize;
     return *this;
 }


 //------------------------------------------------------------------------


 // Launches the kernel with the current configuration.
 // The launch is skipped when prepareLaunch() reports there is nothing to do.
 // Returns *this to allow call chaining.
 CudaKernel& CudaKernel::launch(void)
 {
     if (!prepareLaunch())
         return *this;

     performLaunch();
     return *this;
 }


 //------------------------------------------------------------------------


 // Launches the kernel and returns the measured execution time.
 //
 // yield  Passed to sync() when waiting for the event-timed launch.
 // Returns 0.0f when prepareLaunch() reports there is nothing to launch.
 //
 // When CudaModule provides a start event, the time between two recorded
 // events is returned, scaled by 1.0e-3f. Otherwise a CPU-side Timer around
 // the launch and a spinning sync is used.
 F32 CudaKernel::launchTimed(bool yield)
 {
     // Set everything up and drain pending work so it is not included in the timing.
     if (!prepareLaunch())
         return 0.0f;
     sync(false); // expected to be a short wait, hence no yielding

     CUevent evBegin = CudaModule::getStartEvent();
     CUevent evFinish = CudaModule::getEndEvent();

     if (evBegin)
     {
         // Bracket the launch with event records.
         CudaModule::checkError("cuEventRecord", cuEventRecord(evBegin, NULL));
         performLaunch();
         CudaModule::checkError("cuEventRecord", cuEventRecord(evFinish, NULL));
         sync(yield);

         // cuEventElapsedTime reports milliseconds; scale to seconds.
         F32 elapsed = 0.0f;
         CudaModule::checkError("cuEventElapsedTime", cuEventElapsedTime(&elapsed, evBegin, evFinish));
         return elapsed * 1.0e-3f;
     }

     // No events available: fall back to timing on the CPU.
     Timer cpuTimer(true);
     performLaunch();
     sync(false); // spin for accurate timing
     return cpuTimer.getElapsed();
 }


 //------------------------------------------------------------------------


 // Forwards to CudaModule::sync() with the given yield flag.
 // Returns *this to allow call chaining.
 CudaKernel& CudaKernel::sync(bool yield)
 {
     CudaModule::sync(yield);

     return *this;
 }


 //------------------------------------------------------------------------


 // Applies all launch state to the bound function: parameter buffer, cache
 // and shared memory configuration (when the corresponding driver entry
 // points are available), block shape, and the module's globals and texture
 // references.
 //
 // Returns false without touching the driver when no module or function is
 // bound or when either grid dimension is zero; true otherwise.
 bool CudaKernel::prepareLaunch(void)
 {
     // Only proceed when there is something to launch.
     if (!(m_module && m_function && min(m_gridSize) != 0))
     {
         return false;
     }

     // Upload the packed parameter buffer.
     CudaModule::checkError("cuParamSetSize", cuParamSetSize(m_function, m_params.getSize()));
     if (m_params.getSize())
     {
         CudaModule::checkError("cuParamSetv", cuParamSetv(m_function, 0, m_params.getPtr(), m_params.getSize()));
     }

     // Cache preference: L1 or shared memory.
 #if (CUDA_VERSION >= 3000)
     if (isAvailable_cuFuncSetCacheConfig())
     {
         CudaModule::checkError("cuFuncSetCacheConfig", cuFuncSetCacheConfig(m_function,
             (m_preferL1) ? CU_FUNC_CACHE_PREFER_L1 : CU_FUNC_CACHE_PREFER_SHARED));
     }
 #endif

     // Shared memory bank size: four bytes when m_sharedBankSize is 4, eight bytes otherwise.
 #if (CUDA_VERSION >= 4020)
     if (isAvailable_cuFuncSetSharedMemConfig())
     {
         CudaModule::checkError("cuFuncSetSharedMemConfig", cuFuncSetSharedMemConfig(m_function,
             (m_sharedBankSize == 4) ? CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE : CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE));
     }
 #endif

     // Block dimensions; the z extent is always 1.
     CudaModule::checkError("cuFuncSetBlockShape", cuFuncSetBlockShape(m_function, m_blockSize.x, m_blockSize.y, 1));

     // Let the module refresh its globals and the function's texture references.
     m_module->updateGlobals();
     m_module->updateTexRefs(m_function);

     return true;
 }


 //------------------------------------------------------------------------


 // Issues the actual grid launch for the bound function.
 // Uses cuLaunchGridAsync on m_stream when asynchronous launches are enabled
 // and that entry point is available; otherwise uses cuLaunchGrid.
 void CudaKernel::performLaunch(void)
 {
     const bool launchAsync = (m_async && isAvailable_cuLaunchGridAsync());

     if (!launchAsync)
     {
         CudaModule::checkError("cuLaunchGrid", cuLaunchGrid(m_function, m_gridSize.x, m_gridSize.y));
         return;
     }

     CudaModule::checkError("cuLaunchGridAsync", cuLaunchGridAsync(m_function, m_gridSize.x, m_gridSize.y, m_stream));
 }


 //------------------------------------------------------------------------


 // Copies the complete launch state from another kernel: module, function,
 // packed parameters, cache preference, shared memory bank size, async flag,
 // stream, and grid/block dimensions.
 // Returns *this to allow call chaining.
 CudaKernel& CudaKernel::operator=(const CudaKernel& other)
 {
     m_module         = other.m_module;
     m_function       = other.m_function;
     m_params         = other.m_params;
     m_preferL1       = other.m_preferL1;
     // Previously omitted, which left the target with its old bank-size
     // setting even though prepareLaunch() applies it to the function.
     m_sharedBankSize = other.m_sharedBankSize;
     m_async          = other.m_async;
     m_stream         = other.m_stream;
     m_gridSize       = other.m_gridSize;
     m_blockSize      = other.m_blockSize;
     return *this;
 }


 //------------------------------------------------------------------------

FW_UNREF
#define FW_UNREF(X)
Definition: Defs.hpp:78

FW::CudaModule::getEndEvent
static CUevent getEndEvent(void)
Definition: CudaModule.hpp:79

NULL
#define NULL
Definition: Defs.hpp:39

FW::CudaKernel::Param::align
S32 align
Definition: CudaKernel.hpp:54

FW::CudaModule::getDriverVersion
static int getDriverVersion(void)
Definition: CudaModule.cpp:495

module
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule * module
Definition: DLLImports.inl:60

params
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr const GLvoid GLenum usage GLuint shader GLenum type GLsizei const GLuint framebuffers GLsizei const GLuint renderbuffers GLuint v GLuint v GLenum GLenum GLenum GLuint GLint level GLsizei GLuint framebuffers GLuint const GLchar name GLenum GLintptr GLsizeiptr GLvoid data GLuint GLenum GLint param GLuint GLenum GLint param GLhandleARB programObj GLenum GLenum GLsizei 
GLsizei height GLenum GLint GLint GLsizei GLsizei GLsizei GLint GLenum GLenum const GLvoid pixels GLint GLsizei const GLfloat value GLint GLfloat GLfloat v1 GLint GLfloat GLfloat GLfloat v2 GLint GLsizei const GLfloat value GLint GLsizei GLboolean const GLfloat value GLuint program GLuint GLfloat GLfloat GLfloat z GLuint GLint GLenum GLboolean GLsizei const GLvoid pointer GLuint GLuint const GLchar name GLenum GLsizei GLenum GLsizei GLsizei height GLenum GLuint renderbuffer GLenum GLenum GLint * params
Definition: DLLImports.inl:373

FW::CudaKernel::sync
CudaKernel & sync(bool yield=true)
Definition: CudaKernel.cpp:225

FW::CudaModule
Definition: CudaModule.hpp:36

FW::CudaModule::checkError
static void checkError(const char *funcName, CUresult res)
Definition: CudaModule.cpp:487

FW::CudaModule::getComputeCapability
static int getComputeCapability(void)
Definition: CudaModule.cpp:508

FW::ArrayBase::reset
void reset(S size=0)
Definition: Array.hpp:317

CudaKernel.hpp

FW::F32
float F32
Definition: Defs.hpp:89

CudaModule.hpp

FW::CudaKernel::CudaKernel
CudaKernel(CudaModule *module=NULL, CUfunction function=NULL)
Definition: CudaKernel.cpp:36

FW::min
FW_CUDA_FUNC T min(const VectorBase< T, L, S > &v)
Definition: Math.hpp:461

FW::max
FW_CUDA_FUNC T max(const VectorBase< T, L, S > &v)
Definition: Math.hpp:462

FW::CudaKernel::Param
Definition: CudaKernel.hpp:51

FW_ASSERT
#define FW_ASSERT(X)
Definition: Defs.hpp:67

FW::CudaKernel::Param::value
const void * value
Definition: CudaKernel.hpp:55

FW::CudaKernel::operator=
CudaKernel & operator=(const CudaKernel &other)
Definition: CudaKernel.cpp:283

FW::CudaKernel::~CudaKernel
~CudaKernel(void)
Definition: CudaKernel.cpp:50

FW::CudaKernel::Param::size
S32 size
Definition: CudaKernel.hpp:53

FW::CudaKernel::getDefaultBlockSize
Vec2i getDefaultBlockSize(void) const
Definition: CudaKernel.cpp:95

FW::Timer
Definition: Timer.hpp:35

f
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction f
Definition: DLLImports.inl:88

value
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value
Definition: DLLImports.inl:84

FW::CudaKernel::setParams
CudaKernel & setParams(void)
Definition: CudaKernel.hpp:86

FW::CudaKernel
Definition: CudaKernel.hpp:48

FW::CudaModule::updateGlobals
void updateGlobals(bool async=false, CUstream stream=NULL)
Definition: CudaModule.cpp:137

FW::CudaKernel::launch
CudaKernel & launch(void)
Definition: CudaKernel.cpp:179

FW::CudaModule::sync
static void sync(bool yield=true)
Definition: CudaModule.cpp:398

FW::CudaModule::getStartEvent
static CUevent getStartEvent(void)
Definition: CudaModule.hpp:78

FW::CudaKernel::getAttribute
int getAttribute(CUfunction_attribute attrib) const
Definition: CudaKernel.cpp:56

cuLaunchGrid
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef cuLaunchGrid
Definition: DLLImports.inl:88

Timer.hpp

FW::CudaModule::getDeviceAttribute
static int getDeviceAttribute(CUdevice_attribute attrib)
Definition: CudaModule.cpp:522

cuParamSetSize
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef cuParamSetSize
Definition: DLLImports.inl:82

FW::CudaKernel::launchTimed
F32 launchTimed(bool yield=true)
Definition: CudaKernel.cpp:188

FW::ArrayBase::getPtr
const T * getPtr(S idx=0) const
Definition: Array.hpp:202

FW::Vec2i
Definition: Math.hpp:238

FW::CudaKernel::setGrid
CudaKernel & setGrid(int numThreads, const Vec2i &blockSize=0)
Definition: CudaKernel.cpp:144

size
CUdevice int ordinal char int CUdevice dev CUdevprop CUdevice dev CUcontext ctx CUcontext ctx CUcontext pctx CUmodule const void image CUmodule const void fatCubin CUfunction CUmodule const char name void p CUfunction unsigned int bytes CUtexref pTexRef CUtexref CUarray unsigned int Flags CUtexref int CUaddress_mode am CUtexref unsigned int Flags CUaddress_mode CUtexref int dim CUarray_format int CUtexref hTexRef CUfunction unsigned int numbytes CUfunction int float value CUfunction int CUtexref hTexRef CUfunction int int grid_height CUevent unsigned int Flags CUevent hEvent CUevent hEvent CUstream unsigned int Flags CUstream hStream GLuint bufferobj unsigned int CUdevice dev CUdeviceptr unsigned int CUmodule const char name CUdeviceptr unsigned int bytesize CUdeviceptr dptr void unsigned int bytesize void CUdeviceptr unsigned int ByteCount CUarray unsigned int CUdeviceptr unsigned int ByteCount CUarray unsigned int const void unsigned int ByteCount CUarray unsigned int CUarray unsigned int unsigned int ByteCount void CUarray unsigned int unsigned int CUstream hStream const CUDA_MEMCPY2D pCopy CUdeviceptr const void unsigned int CUstream hStream const CUDA_MEMCPY2D CUstream hStream CUdeviceptr unsigned char unsigned int N CUdeviceptr unsigned int unsigned int N CUdeviceptr unsigned int unsigned short unsigned int unsigned int Height CUarray const CUDA_ARRAY_DESCRIPTOR pAllocateArray CUarray const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray unsigned int CUtexref CUdeviceptr unsigned int bytes CUcontext unsigned int CUdevice device GLenum texture GLenum GLuint buffer GLenum GLuint renderbuffer GLenum GLsizeiptr size
Definition: DLLImports.inl:319

FW::Timer::getElapsed
F32 getElapsed(void)
Definition: Timer.hpp:44

FW::ArrayBase::getSize
S getSize(void) const
Definition: Array.hpp:188

FW::CudaModule::updateTexRefs
void updateTexRefs(CUfunction kernel)
Definition: CudaModule.cpp:264