 Kashif Rasul
my name is Kashif
objective: Deeper

#include <cutil_inline.h>

// Host-side outline of a vector add: select a device, stage the input on the
// GPU, launch the kernel, copy the result back and release the device.
// Lines marked "..." are parts that the slide itself elided.
int main( void )
{
    int N = 50000;
    size_t size = N * sizeof(float);

    cudaSetDevice( cutGetMaxGflopsDeviceId() );
    // ...
    cutilSafeCall( cudaMalloc((void**)&d_A, size) );
    cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );
    // ...

    // Ceiling division: N is not a multiple of the block size, so the last
    // block is only partially used.
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    add<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // ...
    cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) );

    // Release device memory and tear down the device before exiting.
    cudaFree(d_A);
    // ...
    cutilDeviceReset();
}

                                                 blocks & threads
// Kernel excerpt showing block-level cooperation through shared memory.
// Each thread stores its value in cache[] and then waits at a barrier so that
// every slot is written before any thread reads another thread's slot.
// Lines marked "..." are parts that the slide itself elided.
__global__ void dot( float *a, float *b, float *c )
{
  // one slot per thread; threadsPerBlock must be a compile-time constant
  __shared__ float cache[threadsPerBlock];
  int cacheIndex = threadIdx.x;
  // ...

  // set the cache values
  cache[cacheIndex] = temp;
  // synchronize threads in this block
  __syncthreads();
  // ...
}

// Host side of the dot-product example; setup and teardown were elided on the
// slide ("...").
int main( void )
{
  // ...
  dot<<<blocksPerGrid,threadsPerBlock>>>( d_a, d_b, d_c );
  // ...
}

                                             shared memory
•   thread coop. & shared mem. useful
    for reduction algorithms

•   avoid race conditions by using
    __syncthreads()

•   avoid bank conflicts

•   every thread in the block needs to
    call __syncthreads()

                                 keep in mind

// Scalar living in constant memory; written from the host with
// cudaMemcpyToSymbol and read-only inside kernels.
__constant__ float constFloat;

// Device-side accessor for the constant.
__device__ float getConstFloat() { return constFloat; }

// Adds the constant to each of the first N elements of vec.
// Expects a 1-D launch; threads whose index falls past N do nothing.
__global__ void addConstant(float *vec, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i<N)
        vec[i] += getConstFloat();
}

#include <cutil_inline.h>

// Host side of the constant-memory example: uploads one float into the
// constFloat symbol. The rest of main was elided on the slide ("...").
int main( int argc, char** argv)
{
    float constValue = 4.0f;

    // Arguments: destination symbol, host source, byte count, offset into the
    // symbol, copy direction.
    cutilSafeCall( cudaMemcpyToSymbol(constFloat, &constValue,
                                      sizeof(float), 0,
                                      cudaMemcpyHostToDevice) );
    // ...
}

                                                 constant mem.
•   read-only, but conserves mem.
    bandwidth

•   a single read can be broadcasted and
    cached for additional reads

•   painfully slow when each thread
    reads a different address from
    constant memory

                                 keep in mind
// textures containing look-up tables
texture<uint> edgeTex;
texture<uint, 2> edge2dTex;

int main(int argc, char** argv)
    cutilSafeCall( cudaMalloc((void**) d_edgeTable, 256*sizeof(uint)) );
    cutilSafeCall( cudaMemcpy((void *)*d_edgeTable, (void *)edgeTable,
                              256*sizeof(uint), cudaMemcpyHostToDevice) );

    cutilSafeCall( cudaBindTexture(0, edgeTex, *d_edgeTable,
                                   256*sizeof(uint)) );

    // run kernel
    kernel<<<blocks, threads>>>(...)

    cutilSafeCall( cudaUnbindTexture(edgeTex) );

__global__ void kernel(...)
    uint edge = tex1Dfetch(edgeTex, index*16 + i);
                                                        texture mem.
•   read-only, like for const. mem.

•   great when memory access exhibits
    spatial locality, i.e. each thread
    reads a loc. near where the next or
    previous thread reads

•   comes in 1-D, 2-D and 3-D versions
    & typically used in finite diff. apps

                                   keep in mind
surface<void, 2> output_surface;

// Copies a width x height float image from linear global memory into the CUDA
// array bound to output_surface.
// Expects a 2-D launch with one thread per pixel; threads outside the image
// return without touching memory.
__global__ void surfaceWrite(float* g_idata, int width, int height) {
    // calculate surface coordinates
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

    // Guard the grid tail: without it an overshooting thread would read past
    // g_idata and trip cudaBoundaryModeTrap on the write.
    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    // read from global memory and write to cuarray (via surface reference);
    // the surface x coordinate is a byte offset, hence x*4 for 4-byte floats
    surf2Dwrite(g_idata[y*width+x], output_surface, x*4, y, cudaBoundaryModeTrap);
}

// Host side of the surface-write example: creates a CUDA array usable as a
// surface, binds it to output_surface, runs the kernel and frees everything.
// The setup of width/height/size, h_data, d_data and the launch dimensions is
// not shown in this excerpt.
int main( int argc, char** argv) {
    // single 32-bit float channel, matching the float values the kernel writes
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0,
                                            cudaChannelFormatKindFloat);
    cudaArray* cu_array;
    // cudaArraySurfaceLoadStore is required for arrays bound to a surface
    cutilSafeCall( cudaMallocArray(&cu_array, &channelDesc, width, height,
                                   cudaArraySurfaceLoadStore) );
    cutilSafeCall( cudaMemcpy( d_data, h_data, size, cudaMemcpyHostToDevice) );
    cutilSafeCall( cudaBindSurfaceToArray(output_surface, cu_array) );

    surfaceWrite<<<dimGrid, dimBlock>>>(d_data, width, height);
    cutilSafeCall( cudaFree(d_data) );
    cutilSafeCall( cudaFreeArray(cu_array) );
}

                                                                  surface mem.

// OpenGL Graphics includes
#include <GL/glew.h>
#if defined (__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#include <GL/freeglut.h>

int main(int argc, char **argv) {
    // Initialize GL
    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB);
    glutInitWindowSize(1000, 1000);

    // Create a window with rendering context and all else we need
    glutCreateWindow("CUDA Interop.");

    // initialize necessary OpenGL extensions

    // Select CUDA device with OpenGL interoperability
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "device")) {
        cutilGLDeviceInit(argc, argv);
    else {
        cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
                                                         set device
// vbo variables
GLuint vbo;
struct cudaGraphicsResource *cuda_vbo_resource;
void *d_vbo_buffer = NULL;

// create buffer object (vbo is a plain GLuint, so pass its address to
// glGenBuffers and its value to glBindBuffer)
glGenBuffers(1, &vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);

// initialize buffer object: four floats per mesh vertex, no initial data
unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);
glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);

glBindBuffer(GL_ARRAY_BUFFER, 0);

// register this buffer object with CUDA
// NOTE(review): the flags argument was cut off in this excerpt; write-discard
// matches the "writing from CUDA" use in the map step, so confirm it.
cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo,
                                           cudaGraphicsMapFlagsWriteDiscard));

                                       register data with CUDA
// map OpenGL buffer object for writing from CUDA
// (the map/unmap calls take an array of resource handles, hence the address
// of the single cuda_vbo_resource pointer)
float4 *dptr;
cutilSafeCall( cudaGraphicsMapResources(1, &cuda_vbo_resource, 0) );

// fetch the device pointer and the size of the mapped buffer
size_t num_bytes;
cutilSafeCall( cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes,
                                                    cuda_vbo_resource) );

// run kernel
// ... (launch on dptr not shown in this excerpt)

// unmap buffer object so that OpenGL may use it again
cutilSafeCall( cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0) );

                             pass data via shared buffers
•   need to tell the CUDA runtime the
    device we intend to use for CUDA
    and OpenGL

•   initialize OpenGL first and then use
    the cudaGLSetGLDevice() method

•   DirectX interop. is nearly identical

                                   keep in mind
Pro Tip

➜ git clone
Cloning into cuda-workshop...

➜ cd cuda-workshop

➜ cmake CMakeLists.txt
-- The C compiler identification is GNU

➜ make
Scanning dependencies of target cutil
[ 5%] Building CXX object cutil/CMakeFiles/cutil.dir/src/bank_checker.cpp.o
[100%] Built target matrixMul

➜ ./bin/matrixMul
[ matrixMul ]
bin/matrixMul Starting (CUDA and CUBLAS tests)...

Device 0: "GeForce GTX 480" with Compute 2.0 capability

                                   install CMake, glut & glew
➜ ls src/matrixMul
CMakeLists.txt        matrixMul.h

➜ cat src/matrixMul/CMakeLists.txt


➜ cmake -G "Visual Studio 10 Win64" CMakeLists.txt

                            great for experimenting
Events &
// Times NUM_REPS kernel launches with CUDA events, i.e. GPU-side timestamps
// recorded in stream 0.
cudaEvent_t start, stop;
float time;   // elapsed time in milliseconds, set by cudaEventElapsedTime

// initialize events
cutilSafeCall( cudaEventCreate(&start) );
cutilSafeCall( cudaEventCreate(&stop) );

// warmup to avoid timing startup
kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y, 1);

// take measurements for loop over kernel launches
cutilSafeCall( cudaEventRecord(start, 0) );
for (int i=0; i < NUM_REPS; i++) {
    kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y, 1);
    // Ensure no launch failure
    cutilSafeCall( cudaGetLastError() );
}
cutilSafeCall( cudaEventRecord(stop, 0) );
// block the host until the stop event has actually been reached on the GPU
cutilSafeCall( cudaEventSynchronize(stop) );
cutilSafeCall( cudaEventElapsedTime(&time, start, stop) );

// report effective bandwidth in GB/s (2.0f due to read + write);
// time is in milliseconds, so the factor 1000.0f converts to per-second
float bandwidth = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(time/NUM_REPS);

cutilSafeCall( cudaEventDestroy(stop) );
cutilSafeCall( cudaEventDestroy(start) );
                                   events: GPU timestamp
#include <cutil_inline.h>
// Times kernel work with a host-side (OS) timer from cutil.
unsigned int timer_matrixMul = 0;

// create the timer before using its handle
cutilCheckError( cutCreateTimer(&timer_matrixMul) );

// start timing
cutilCheckError( cutStartTimer(timer_matrixMul) );

// do some work
kernel<<<grid, threads, mem_size>>>(d_idata, d_odata);

// kernel launches return immediately; wait for the GPU so that the host timer
// covers the execution and not just the launch
cutilSafeCall( cudaDeviceSynchronize() );

// stop timer
cutilCheckError( cutStopTimer(timer_matrixMul) );

// timer value is in milliseconds; average over nIter runs, in seconds
double dSeconds = cutGetTimerValue(timer_matrixMul)/((double)nIter * 1000.0);
// 2 * WA * HA * WB floating-point operations per matrix product
double dNumOps = 2.0 * (double)uiWA * (double)uiHA * (double)uiWB;
double gflops = 1.0e-9 * dNumOps/dSeconds;

// destroy timer
cutilCheckError( cutDeleteTimer(timer_matrixMul) );

                                                               os timers
•   creating and recording events is
    tricky since some CUDA calls are
    asynch.

•   all kernel launches are asynch.

•   instruct the CPU to synch. on an
    event via cudaEventSynchronize()

                                 keep in mind

➜ cat
import pycuda.driver as drv
import pycuda.autoinit
import numpy
import numpy.linalg as la
from pycuda.compiler import SourceModule

# Compile the CUDA C kernel at run time; each thread multiplies one element.
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")

multiply_them = mod.get_function("multiply_them")

a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)

# drv.In / drv.Out copy the arrays to the device before the launch and back
# afterwards; a single block of 400 threads covers the 400 elements.
dest = numpy.zeros_like(a)
multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b),
        block=(400,1,1), grid=(1,1))

# all zeros when the GPU result matches the NumPy reference
print(dest-a*b)
➜ python
[ 0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0. 0. 0.    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0. 0.    0. 0.]
// Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice();       cuDeviceGet(device, 0);
CUcontext context = new CUcontext();    cuCtxCreate(context, 0, device);

// Create the PTX file by calling the NVCC and load it
String ptxFileName = preparePtxFile("JCudaVectorAddKernel.cu");
CUmodule module = new CUmodule();       cuModuleLoad(module, ptxFileName);

// Obtain a function pointer to the "add" function.
CUfunction function = new CUfunction(); cuModuleGetFunction(function, module, "add");

// Allocate the device input data
float hostInputA[] = new float[numElements];
CUdeviceptr deviceInputA = new CUdeviceptr();
cuMemAlloc(deviceInputA, numElements * Sizeof.FLOAT);
cuMemcpyHtoD(deviceInputA, Pointer.to(hostInputA), numElements * Sizeof.FLOAT);

// Set up the kernel parameters: a pointer to an array of pointers, one per
// kernel argument, in the order of the kernel signature.
Pointer kernelParameters = Pointer.to(
    Pointer.to(deviceInputA) /* , ... remaining arguments (b, sum, n) */);

// Call the kernel function
int blockSizeX = 256;
int gridSizeX = (int)Math.ceil((double)numElements / blockSizeX);
cuLaunchKernel(function,
               gridSizeX, 1, 1,         // Grid dimension
               blockSizeX, 1, 1,        // Block dimension
               0, null,                 // Shared memory size and stream
               kernelParameters, null); // Kernel- and extra parameters
➜ ls
License.txt                          jcuda-0.4.0-beta1.jar
jcurand-0.4.0-beta1.jar              libJCublas-apple-x86_64.dylib
libJCudaRuntime-apple-x86_64.dylib   libJCurand-apple-x86_64.dylib
jcublas-0.4.0-beta1.jar              jcufft-0.4.0-beta1.jar
jcusparse-0.4.0-beta1.jar            libJCudaDriver-apple-x86_64.dylib
libJCufft-apple-x86_64.dylib         libJCusparse-apple-x86_64.dylib        

➜ cat
extern "C"
__global__ void add(float *a, float *b, float *sum, int n)
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i<n)
        sum[i] = a[i] + b[i];

➜ javac -classpath jcuda-0.4.0-beta1.jar

➜ java -classpath jcuda-0.4.0-beta1.jar:. JCudaVectorAdd
nvcc -m64 -ptx -o JCudaVectorAddKernel.ptx
Finished creating PTX file
➜ gem install sgc-ruby-cuda
Successfully installed sgc-ruby-cuda-0.1.1
1 gem installed

➜ cat vector_add.rb
# Prepare and load vadd kernel.
kernel_lib_file = compile(vadd_kernel_src)

# Copy input buffers from host memory to device memory.
memcpy_htod(da, ha, nbytes)
memcpy_htod(db, hb, nbytes)

# Invoke vadd kernel.
# The grid size is rounded up so that all N elements are covered.
nthreads_per_block = 256
block_dim = Dim3.new(nthreads_per_block, 1, 1)
grid_dim = Dim3.new((N + nthreads_per_block - 1) / nthreads_per_block, 1, 1)
CudaFunction.configure(block_dim, grid_dim)
CudaFunction.setup(da, db, dc, N)
f = CudaFunction.new("vadd")
# NOTE(review): the actual launch call on f is not shown in this excerpt.

# Copy output buffer from device memory to host memory.
memcpy_dtoh(hc, dc, nbytes)

➜ ruby vector_add.rb
Vector Addition
Verification completed. All matches? YES                        ruby-cuda

cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);

float* h_A = (float*)malloc(N * N * sizeof(h_A[0]));
/* Fill the matrices with test data */
/* Allocate device memory for the matrices */
cudaMalloc((void**)&d_A, N * N * sizeof(d_A[0]));
/* Initialize the device matrices with the host matrices */
status = cublasSetVector(N * N, sizeof(h_A[0]), h_A, 1, d_A, 1);
/* Performs Sgemm: C <- alphaAB + betaC */
status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                     &alpha, d_A, N, d_B, N, &beta, d_C, N);

/* Allocate host mem & read back the result from device mem */
h_C = (float*)malloc(N * N * sizeof(h_C[0]));
status = cublasGetVector(N * N, sizeof(h_C[0]), d_C, 1, h_C, 1);

/* Memory clean up */
/* Shutdown */
status = cublasDestroy(handle);
cudaSetDevice( cutGetMaxGflopsDeviceId() );

// Allocate & init. host memory for the signal
Complex* h_signal = (Complex*)malloc(sizeof(Complex) * SIGNAL_SIZE);
// Pad signal
Complex* h_padded_signal;
// Allocate device memory for signal
Complex* d_signal;
cutilSafeCall( cudaMalloc((void**)&d_signal, mem_size) );
// Copy host memory to device
cutilSafeCall( cudaMemcpy(d_signal, h_padded_signal, mem_size,
                          cudaMemcpyHostToDevice) );

// CUFFT plan
cufftHandle plan;
cufftSafeCall( cufftPlan1d(&plan, new_size, CUFFT_C2C, 1) );

// Transform signal
cufftSafeCall( cufftExecC2C(plan, (cufftComplex *)d_signal,
                            (cufftComplex *)d_signal, CUFFT_FORWARD) );

// Destroy CUFFT context
cufftSafeCall( cufftDestroy(plan) );

// Cleanup memory
cutilSafeCall( cudaFree(d_signal) );
cutilDeviceReset();                                                    cufft
cusparseHandle_t handle = 0;
cusparseStatus_t status = cusparseCreate(&handle);

// create a matrix description for the matrix M
cusparseMatDescr_t descrM = 0; status = cusparseCreateMatDescr(&descrM);
cusparseSetMatIndexBase ( descrM, CUSPARSE_INDEX_BASE_ZERO );
cusparseSetMatDiagType ( descrM, CUSPARSE_DIAG_TYPE_NON_UNIT );
cusparseSetMatFillMode ( descrM, CUSPARSE_FILL_MODE_LOWER );

// create & perform analysis info for the non-trans & trans case
cusparseSolveAnalysisInfo_t info = 0, infoTrans = 0;

cusparseScsrsv_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, descrM,
                         d_valsICP, d_rowPtrsICP, d_colIndsICP, info);
cusparseScsrsv_analysis(handle, CUSPARSE_OPERATION_TRANSPOSE, N, descrM,
                         d_valsICP, d_rowPtrsICP, d_colIndsICP, infoTrans);
// Solve M z = H H^T z = r by first doing a forward solve: H y = r
cusparseScsrsv_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, 1.0, descrM,
                     d_valsICP, d_rowPtrsICP, d_colIndsICP, info, d_r, d_y);
// and then a back substitution: H^T z = y
cusparseScsrsv_solve(handle, CUSPARSE_OPERATION_TRANSPOSE, N, 1.0, descrM,
                     d_valsICP, d_rowPtrsICP, d_colIndsICP, infoTrans, d_y, d_z);
cudaError_t cudaResult = cudaSuccess;

// Allocate memory for points
float *d_points = 0;
cudaResult = cudaMalloc((void **)&d_points, 2 * m_numSims * sizeof(float));

// Generate random points in unit square
curandStatus_t curandResult;
curandGenerator_t qrng;

curandResult   =   curandCreateGenerator(&qrng, CURAND_RNG_QUASI_SOBOL32);
curandResult   =   curandSetQuasiRandomGeneratorDimensions(qrng, 2);
curandResult   =   curandSetGeneratorOrdering(qrng, CURAND_ORDERING_QUASI_DEFAULT);
curandResult   =   curandGenerateUniform(qrng, (float *)d_points, 2 * m_numSims);

// Cleanup
curandResult = curandDestroyGenerator(qrng);

// Box-filters an 8-bit grayscale image on the GPU with NPP and copies the
// result back to the host.

// declare a host image object for an 8-bit grayscale image
npp::ImageCPU_8u_C1 oHostSrc;
// load gray-scale image from disk
npp::loadImage(sFilename, oHostSrc);
// declare a device image and copy from the host image to the device
npp::ImageNPP_8u_C1 oDeviceSrc(oHostSrc);

// create struct with box-filter mask size
NppiSize oMaskSize = {5, 5};
// create struct with ROI size given the current mask
// (only positions where the whole mask fits inside the source are computed)
NppiSize oSizeROI = {oDeviceSrc.width() - oMaskSize.width + 1,
                     oDeviceSrc.height() - oMaskSize.height + 1};

// allocate device image of appropriately reduced size
npp::ImageNPP_8u_C1 oDeviceDst(oSizeROI.width, oSizeROI.height);

// set anchor point inside the mask to (0, 0)
NppiPoint oAnchor = {0, 0};
// run box filter: source pointer + pitch, destination pointer + pitch,
// then ROI, mask size and anchor
nppiFilterBox_8u_C1R(oDeviceSrc.data(), oDeviceSrc.pitch(),
                     oDeviceDst.data(), oDeviceDst.pitch(),
                     oSizeROI, oMaskSize, oAnchor);

// declare a host image for the result
npp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size());
// and copy the device result data into it
oDeviceDst.copyTo(oHostDst.data(), oHostDst.pitch());

// Asynchronous copy / kernel / copy-back sequence on one stream, using
// page-locked (pinned) host memory.
cudaStream_t stream;

cutilSafeCall( cudaStreamCreate(&stream) );

// allocate page locked memory
cutilSafeCall( cudaMallocHost((void**)&a, nbytes, cudaHostAllocDefault) );

// allocate device memory
cutilSafeCall( cudaMalloc((void**)&d_a, nbytes) );
cutilSafeCall( cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice,
                               stream) );

// run kernel and copy result back
cutilSafeCall( cudaEventRecord(start, stream) );
// pass the device pointer itself; &d_a would be a host address that the
// kernel cannot dereference
kernel<<<N,M,0,stream>>>(d_a, ... );
cutilSafeCall( cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost,
                               stream) );

// free (only after everything queued on the stream has completed)
cutilSafeCall( cudaStreamSynchronize(stream) );
cutilSafeCall( cudaFreeHost(a) );
cutilSafeCall( cudaFree(d_a) );
cutilSafeCall( cudaStreamDestroy(stream) );

                                                      pinned memory
// loop over full data, in bite-sized chunks
// Every operation is queued on the same stream, so each chunk's copies and
// kernel run in issue order while the host keeps enqueuing further chunks.
for (int i=0; i<FULL_DATA_SIZE; i+= N) {
    // copy the locked memory to the device, async
    cutilSafeCall( cudaMemcpyAsync(dev_a, host_a+i,
                                   N * sizeof(int),
                                   cudaMemcpyHostToDevice,
                                   stream) );
    cutilSafeCall( cudaMemcpyAsync(dev_b, host_b+i,
                                   N * sizeof(int),
                                   cudaMemcpyHostToDevice,
                                   stream) );

    kernel<<<N/256,256,0,stream>>>(dev_a, dev_b, dev_c);

    // copy the data from device to locked memory
    cutilSafeCall( cudaMemcpyAsync(host_c+i, dev_c,
                                   N * sizeof(int),
                                   cudaMemcpyDeviceToHost,
                                   stream) );
}

// wait for all operations to finish
cutilSafeCall( cudaStreamSynchronize(stream) );

                                  chunked computation
// Batched SGEMM: one CUDA stream per matrix product, so that the cuBLAS calls
// are issued to different streams. "..." marks arguments elided on the slide.
cudaStream_t *streamArray = 0;
// the array holds N stream handles, so size it by the handle type
streamArray = (cudaStream_t *)malloc(N * sizeof (cudaStream_t));

// create the streams before they are handed to cuBLAS
for ( int i = 0; i < N ; i++) {
    cudaStreamCreate(&streamArray[i]);
}

for ( int i = 0; i < N ; i++) {
    cublasSetMatrix (..., devPtrA[i], ...);
}

for ( int i = 0; i < N ; i++) {
    // route the following cuBLAS call to stream i
    cublasSetStream(handle, streamArray[i]);
    cublasSgemm(handle, ..., devPtrA[i], devPtrB[i], devPtrC[i], ...);
}

                                         batched computation
•   use it to specify in which order
    operations get executed async.

•   idea is to use more than 1 stream

•   requires a new kind of mem. copy
    which in turn requires pinned, i.e.
    page-locked, mem.

•   free pinned mem. when not needed

                                 keep in mind
// Allocate resources
for( int i =0; i<STREAM_COUNT; ++i ) {
    cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault);
    cudaMalloc(&d_data_in[i], memsize);

int current_stream = 0;
// Do processing in a loop...
    int next_stream = (current_stream + 1 ) % STREAM_COUNT;
    // Ensure that processing and copying of the last cycle has finished

    // Process current frame
    kernel<<<grid, block, 0, stream[current_stream]>>>(d_data_out[current_stream],
                                                       N, ...);
    // Upload next frame
    cudaMemcpyAsync(d_data_in[next_stream], ..., cudaMemcpyHostToDevice,

    // Download current frame
    cudaMemcpyAsync(h_data_out[current_stream], ..., cudaMemcpyDeviceToHost,

    cudaEventRecord(cycleDone[current_stream], stream[current_stream]);
    current_stream = next_stream;
                               overlap kernel exec. & memcpy
•   devices with CC 1.1 and above can
    overlap a kernel exec & memcpy as
    long as they are issued from
    different streams

•   kernels are serialized

•   queue in a way that independent
    streams can execute in parallel

                                keep in mind

float *a, *d_a;

/* Allocate mapped CPU memory. */
cutilSafeCall( cudaHostAlloc((void **)&a, bytes, cudaHostAllocMapped) );

/* Initialize the vectors. */
for(n = 0; n < nelem; n++) { a[n] = rand() / (float)RAND_MAX; ... }

/* Get the device pointers for the pinned CPU memory mapped into the GPU
    memory space. */
cutilSafeCall( cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0) );

/* Call the GPU kernel using the device pointers for the mapped memory. */
kernel<<<grid, block>>>(d_a, d_b, d_c, nelem);

/* Memory clean up */
cutilSafeCall( cudaFreeHost(a) );

                                     zero-copy host memory
//Create streams for issuing GPU command asynchronously and allocate memory
for(int i = 0; i < GPU_N; i++) {
    // Streams and device allocations belong to the device that is current
    // when they are created, so select GPU i first.
    cutilSafeCall( cudaSetDevice(i) );
    cutilSafeCall( cudaStreamCreate(&stream[i]) );
    cutilSafeCall( cudaMalloc((void**)&d_Data[i], dataN * sizeof(float)) );
    cutilSafeCall( cudaMallocHost((void**)&h_Data[i], dataN * sizeof(float)) );
    //init h_Data
}

//Copy data to GPU, launch the kernel and copy data back. All asynchronously
for(int i = 0; i < GPU_N; i++) {
    //Set device
    cutilSafeCall( cudaSetDevice(i) );

    // Copy input data from CPU
    cutilSafeCall( cudaMemcpyAsync(d_Data[i], h_Data[i], dataN * sizeof(float),
                                   cudaMemcpyHostToDevice, stream[i]) );

    // Perform GPU computations
    kernel<<<blocks, threads, 0, stream[i]>>>(...)

    // Copy back the result
    cutilSafeCall( cudaMemcpyAsync(h_Sum_from_device[i], d_Sum[i],
                                   ACCUM_N * sizeof(float),
                                   cudaMemcpyDeviceToHost, stream[i]) );
}

// Process GPU results
for(int i = 0; i < GPU_N; i++) {
    // Set device
    cutilSafeCall( cudaSetDevice(i) );

    // Wait for all operations to finish
    cutilSafeCall( cudaStreamSynchronize(stream[i]) );

    // Shut down this GPU
    cutilSafeCall( cudaFreeHost(h_Data[i]) );
    cutilSafeCall( cudaFree(d_Data[i]) );
    cutilSafeCall( cudaStreamDestroy(stream[i]) );
}

// shutdown
for(int i = 0; i < GPU_N; i++) {
    cutilSafeCall( cudaSetDevice(i) );
    cutilDeviceReset();
}

                                  process the result
•   can also control each GPU by a
    separate CPU thread

•   need to assign portable pinned
    memory if a different thread needs
    access to one thread’s memory

•   use the flag cudaHostAllocPortable
    to cudaHostAlloc()

                                keep in mind
// Initialize MPI state
MPI_CHECK( MPI_Init(&argc, &argv) );

// Get our MPI node number and node count
int commSize, commRank;
MPI_CHECK( MPI_Comm_size(MPI_COMM_WORLD, &commSize) );
MPI_CHECK( MPI_Comm_rank(MPI_COMM_WORLD, &commRank) );

if(commRank == 0) {// Are we the root node?
    //initialize dataRoot...

// Allocate a buffer on each node
float * dataNode = new float[dataSizePerNode];

// Dispatch a portion of the input data to each node
MPI_CHECK( MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode,
                       dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD) );

// if commRank == 0 then free dataRoot...

kernel<<<gridSize, blockSize>>>(dataNode, ...);

// Reduction to the root node
float sumNode = sum(dataNode, dataSizePerNode);
float sumRoot;
MPI_CHECK( MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0,
                      MPI_COMM_WORLD) );

MPI_CHECK( MPI_Finalize() );                                mpi + cuda
// Enable peer access
cutilSafeCall(cudaDeviceEnablePeerAccess(gpuid_tesla[1], gpuid_tesla[0]));

// Allocate buffers
cudaSetDevice(gpuid_tesla[0]); cudaMalloc(&g0, buf_size);
cudaSetDevice(gpuid_tesla[1]); cudaMalloc(&g1, buf_size);

// Ping-pong copy between GPUs
cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);

// Prepare host buffer and copy to GPU 0
cudaSetDevice(gpuid_tesla[0]); cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault);

// Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
// output to the GPU 1 buffer: dst[idx] = src[idx] * 2.0f
cudaSetDevice(gpuid_tesla[1]); kernel<<<blocks, threads>>>(g0, g1);

// Disable peer access (also unregisters memory for non-UVA cases)
cudaSetDevice(gpuid_tesla[0]); cudaDeviceDisablePeerAccess(gpuid_tesla[1]);
cudaSetDevice(gpuid_tesla[1]); cudaDeviceDisablePeerAccess(gpuid_tesla[0]);

cudaFree(g0);
...

                   P2P & unified virtual address space
Thank you
CUDA Deep Dive

  • 1. CUDA Deep Dive Kashif Rasul @krasul
  • 5. #include <cutil_inline.h> int main( void ) { int N = 50000; size_t size = N * sizeof(float); cudaSetDevice( cutGetMaxGflopsDeviceId() ); ... cutilSafeCall( cudaMalloc((void**)&d_A, size) ); cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) ); ... int threadsPerBlock = 256; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; add<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N); ... cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) ); cudaFree(d_A); ... cutilDeviceReset(); } blocks & threads
  • 6. __global__ void dot( float *a, float *b, float *c ) { __shared__ float cache[threadsPerBlock]; int cacheIndex = threadIdx.x; ... // set the cache values cache[cacheIndex] = temp; // synchronize threads in this block __syncthreads(); ... } int main( void ) { ... dot<<<blocksPerGrid,threadsPerBlock>>>( d_a, d_b, d_c ); ... } shared memory
  • 7. thread coop. & shared mem. useful for reduction algorithms • avoid race conditions by using __syncthreads() • avoid bank conflicts • every thread in the block needs to call __syncthreads() keep in mind
  • 8. Memory 2
  • 9. __constant__ float constFloat; __device__ float getConstFloat() { return constFloat; } __global__ void addConstant(float *vec, int N) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i<N) vec[i] += getConstFloat(); } #include <cutil_inline.h> int main( int argc, char** argv) { float constValue = 4.0f; cutilSafeCall( cudaMemcpyToSymbol(constFloat, &constValue, sizeof(float), 0, cudaMemcpyHostToDevice) ); ... } constant mem.
  • 10. read-only, but conserves mem. bandwidth • a single read can be broadcasted and cached for additional reads • painfully slow when each thread reads a different address from constant memory keep in mind
  • 11. // textures containing look-up tables texture<uint> edgeTex; texture<uint, 2> edge2dTex; int main(int argc, char** argv) { ... cutilSafeCall( cudaMalloc((void**) d_edgeTable, 256*sizeof(uint)) ); cutilSafeCall( cudaMemcpy((void *)*d_edgeTable, (void *)edgeTable, 256*sizeof(uint), cudaMemcpyHostToDevice) ); cutilSafeCall( cudaBindTexture(0, edgeTex, *d_edgeTable, 256*sizeof(uint)) ); // run kernel kernel<<<blocks, threads>>>(...) //cleanup cutilSafeCall( cudaUnbindTexture(edgeTex) ); } __global__ void kernel(...) { ... uint edge = tex1Dfetch(edgeTex, index*16 + i); ... } texture mem.
  • 12. read-only, like for const. mem. • great when memory access exhibits spatial locality, i.e. each thread reads a loc. near where the next or previous thread reads • comes in 1-D, 2-D and 3-D versions & typically used in finite diff. apps keep in mind
  • 13. surface<void, 2> output_surface; __global__ void surfaceWrite(float* g_idata, int width, int height) { // calculate surface coordinates unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; // read from global memory and write to cuarray (via surface reference) surf2Dwrite(g_idata[y*width+x], output_surface, x*4, y, cudaBoundaryModeTrap); } int main( int argc, char** argv) { ... cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); cudaArray* cu_array; cutilSafeCall( cudaMallocArray(&cu_array, &channelDesc, width, height, cudaArraySurfaceLoadStore) ); cutilSafeCall( cudaMemcpy( d_data, h_data, size, cudaMemcpyHostToDevice) ); cutilSafeCall( cudaBindSurfaceToArray(output_surface, cu_array) ); surfaceWrite<<<dimGrid, dimBlock>>>(d_data, width, height); ... cutilSafeCall( cudaFree(d_data) ); cutilSafeCall( cudaFreeArray(cu_array) ); } surface mem.
  • 14. InterOp. 3
  • 15. // OpenGL Graphics includes #include <GL/glew.h> #if defined (__APPLE__) || defined(MACOSX) #include <GLUT/glut.h> #else #include <GL/freeglut.h> #endif int main(int argc, char **argv) { // Initialize GL glutInit(&argc, argv); glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB); glutInitWindowSize(1000, 1000); // Create a window with rendering context and all else we need glutCreateWindow("CUDA Interop."); // initialize necessary OpenGL extensions glewInit(); // Select CUDA device with OpenGL interoperability if (cutCheckCmdLineFlag(argc, (const char**)argv, "device")) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); } } set device
  • 16. // vbo variables GLuint vbo; struct cudaGraphicsResource *cuda_vbo_resource; void *d_vbo_buffer = NULL; // create buffer object glGenBuffers(1, vbo); glBindBuffer(GL_ARRAY_BUFFER, *vbo); // initialize buffer object unsigned int size = mesh_width * mesh_height * 4 * sizeof(float); glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW); glBindBuffer(GL_ARRAY_BUFFER, 0); // register this buffer object with CUDA cutilSafeCall(cudaGraphicsGLRegisterBuffer(cuda_vbo_resource, *vbo, cudaGraphicsMapFlagsWriteDiscard)); register data with CUDA
  • 17. // map OpenGL buffer object for writing from CUDA float4 *dptr; cutilSafeCall( cudaGraphicsMapResources(1, cuda_vbo_resource, 0) ); size_t num_bytes; cutilSafeCall( cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, *cuda_vbo_resource) ); // run kernel kernel<<<blocks,threads>>>(dptr,...); // unmap buffer object cutilSafeCall( cudaGraphicsUnmapResources(1, cuda_vbo_resource, 0) ); pass data via shared buffers
  • 18. need to tell the CUDA runtime the device we intend to use for CUDA and OpenGL • initialize OpenGL first and then use the cudaGLSetGLDevice() method • DirectX interop. is nearly identical keep in mind
  • 19. Pro Tip 4
  • 20. ➜ git clone Cloning into cuda-workshop... ... ➜ cd cuda-workshop ➜ cmake CMakeLists.txt -- The C compiler identification is GNU ... ➜ make Scanning dependencies of target cutil [ 5%] Building CXX object cutil/CMakeFiles/cutil.dir/src/bank_checker.cpp.o ... [100%] Built target matrixMul ➜ ./bin/matrixMul [ matrixMul ] bin/matrixMul Starting (CUDA and CUBLAS tests)... Device 0: "GeForce GTX 480" with Compute 2.0 capability ... install CMake, glut & glew
  • 21. ➜ ls src/matrixMul CMakeLists.txt matrixMul.h matrixMul_gold.cpp ➜ cat src/matrixMul/CMakeLists.txt CUDA_ADD_EXECUTABLE( matrixMul matrixMul_gold.cpp ) TARGET_LINK_LIBRARIES( matrixMul cutil shrutil ${CUDA_CUBLAS_LIBRARIES} ) ➜ cmake -G "Visual Studio 10 Win64" CMakeLists.txt ... great for experimenting
  • 23. cudaEvent_t start, stop; float time; // initialize events cutilSafeCall( cudaEventCreate(&start) ); cutilSafeCall( cudaEventCreate(&stop) ); // warmup to avoid timing startup kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y, 1); // take measurements for loop over kernel launches cutilSafeCall( cudaEventRecord(start, 0) ); for (int i=0; i < NUM_REPS; i++) { kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y, 1); // Ensure no launch failure cutilSafeCall( cudaGetLastError() ); } cutilSafeCall( cudaEventRecord(stop, 0) ); cutilSafeCall( cudaEventSynchronize(stop) ); cutilSafeCall( cudaEventElapsedTime(&time, start, stop) ); // report effective bandwidth in GB/s (2.0f due to read + write) float bandwidth = 2.0f * mem_size/(1024*1024*1024)/(time/NUM_REPS); cutilSafeCall( cudaEventDestroy(stop) ); cutilSafeCall( cudaEventDestroy(start) ); events: GPU timestamp
  • 24. #include <cutil_inline.h> ... unsigned int timer_matrixMul = 0; // start timing cutilCheckError( cutStartTimer(timer_matrixMul) ); // do some work kernel<<<grid, threads, mem_size>>>(d_idata, d_odata); cutilDeviceSynchronize(); // stop timer cutilCheckError( cutStopTimer(timer_matrixMul) ); double dSeconds = cutGetTimerValue(timer_matrixMul)/((double)nIter * 1000.0); double dNumOps = 2.0 * (double)uiWA * (double)uiHA * (double)uiWB; double gflops = 1.0e-9 * dNumOps/dSeconds; // destroy timer cutilCheckError( cutDeleteTimer(timer_matrixMul) ); os timers
  • 25. creating and recording events is tricky since some CUDA calls are asynch. • all kernel launches are asynch. • instruct the CPU to synch. on an event via cudaEventSynchronize() keep in mind
  • 26. Bindings 6
  • 27. ➜ cat import pycuda.driver as drv import pycuda.tools import pycuda.autoinit import numpy import numpy.linalg as la from pycuda.compiler import SourceModule mod = SourceModule(""" __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x; dest[i] = a[i] * b[i]; } """) multiply_them = mod.get_function("multiply_them") a = numpy.random.randn(400).astype(numpy.float32) b = numpy.random.randn(400).astype(numpy.float32) dest = numpy.zeros_like(a) multiply_them( drv.Out(dest), drv.In(a), drv.In(b), block=(400,1,1)) print dest-a*b pycuda
  • 28. ➜ python [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  • 29. // Initialize the driver and create a context for the first device. cuInit(0); CUdevice device = new CUdevice(); cuDeviceGet(device, 0); CUcontext context = new CUcontext(); cuCtxCreate(context, 0, device); // Create the PTX file by calling the NVCC and load it String ptxFileName = preparePtxFile(""); CUmodule module = new CUmodule(); cuModuleLoad(module, ptxFileName); // Obtain a function pointer to the "add" function. CUfunction function = new CUfunction(); cuModuleGetFunction(function, module, "add"); // Allocate the device input data float hostInputA[] = new float[numElements]; CUdeviceptr deviceInputA = new CUdeviceptr(); cuMemAlloc(deviceInputA, numElements * Sizeof.FLOAT); cuMemcpyHtoD(deviceInputA,, numElements * Sizeof.FLOAT); ... // Set up the kernel parameters Pointer kernelParameters =,...); // Call the kernel function int blockSizeX = 256; int gridSizeX = (int)Math.ceil((double)numElements / blockSizeX); cuLaunchKernel(function, gridSizeX, 1, 1, // Grid dimension blockSizeX, 1, 1, // Block dimension 0, null, // Shared memory size and stream kernelParameters, null); // Kernel- and extra parameters cuCtxSynchronize(); jcuda
  • 30. ➜ ls License.txt jcuda-0.4.0-beta1.jar jcurand-0.4.0-beta1.jar libJCublas-apple-x86_64.dylib libJCudaRuntime-apple-x86_64.dylib libJCurand-apple-x86_64.dylib jcublas-0.4.0-beta1.jar jcufft-0.4.0-beta1.jar jcusparse-0.4.0-beta1.jar libJCudaDriver-apple-x86_64.dylib libJCufft-apple-x86_64.dylib libJCusparse-apple-x86_64.dylib ➜ cat extern "C" __global__ void add(float *a, float *b, float *sum, int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i<n) { sum[i] = a[i] + b[i]; } } ➜ javac -classpath jcuda-0.4.0-beta1.jar ➜ java -classpath jcuda-0.4.0-beta1.jar:. JCudaVectorAdd Executing nvcc -m64 -ptx -o JCudaVectorAddKernel.ptx Finished creating PTX file Test PASSED
  • 31. ➜ gem install sgc-ruby-cuda Successfully installed sgc-ruby-cuda-0.1.1 1 gem installed ➜ cat vector_add.rb ... # Prepare and load vadd kernel. kernel_lib_file = compile(vadd_kernel_src) CudaFunction.load_lib_file(kernel_lib_file.path) # Copy input buffers from host memory to device memory. memcpy_htod(da, ha, nbytes) memcpy_htod(db, hb, nbytes) # Invoke vadd kernel. nthreads_per_block = 256 block_dim = Dim3.new(nthreads_per_block, 1, 1) grid_dim = Dim3.new((N + nthreads_per_block - 1) / nthreads_per_block, 1, 1) CudaFunction.configure(block_dim, grid_dim) CudaFunction.setup(da, db, dc, N) f = CudaFunction.new("vadd") f.launch # Copy output buffer from device memory to host memory. memcpy_dtoh(hc, dc, nbytes) ... ➜ ruby vector_add.rb Vector Addition Verification completed. All matches? YES ruby-cuda
  • 33. cublasHandle_t handle; cublasStatus_t status = cublasCreate(&handle); float* h_A = (float*)malloc(N * N * sizeof(h_A[0])); ... /* Fill the matrices with test data */ ... /* Allocate device memory for the matrices */ cudaMalloc((void**)&d_A, N * N * sizeof(d_A[0])); ... /* Initialize the device matrices with the host matrices */ status = cublasSetVector(N * N, sizeof(h_A[0]), h_A, 1, d_A, 1); ... /* Performs Sgemm: C <- alphaAB + betaC */ status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N); /* Allocate host mem & read back the result from device mem */ h_C = (float*)malloc(N * N * sizeof(h_C[0])); status = cublasGetVector(N * N, sizeof(h_C[0]), d_C, 1, h_C, 1); /* Memory clean up */ cudaFree(d_A); ... /* Shutdown */ status = cublasDestroy(handle); cublas
  • 34. cudaSetDevice( cutGetMaxGflopsDeviceId() ); // Allocate & init. host memory for the signal Complex* h_signal = (Complex*)malloc(sizeof(Complex) * SIGNAL_SIZE); ... // Pad signal Complex* h_padded_signal; ... // Allocate device memory for signal Complex* d_signal; cutilSafeCall( cudaMalloc((void**)&d_signal, mem_size) ); // Copy host memory to device cutilSafeCall( cudaMemcpy(d_signal, h_padded_signal, mem_size, cudaMemcpyHostToDevice) ); // CUFFT plan cufftHandle plan; cufftSafeCall( cufftPlan1d(&plan, new_size, CUFFT_C2C, 1) ); // Transform signal cufftSafeCall( cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD) ); // Destroy CUFFT context cufftSafeCall( cufftDestroy(plan) ); // Cleanup memory cutilSafeCall( cudaFree(d_signal) ); ... cutilDeviceReset(); cufft
  • 35. cusparseHandle_t handle = 0; cusparseStatus_t status = cusparseCreate(&handle); // create a matrix description for the matrix M cusparseMatDescr_t descrM = 0; status = cusparseCreateMatDescr(&descrM); cusparseSetMatType ( descrM, CUSPARSE_MATRIX_TYPE_TRIANGULAR ); cusparseSetMatIndexBase ( descrM, CUSPARSE_INDEX_BASE_ZERO ); cusparseSetMatDiagType ( descrM, CUSPARSE_DIAG_TYPE_NON_UNIT ); cusparseSetMatFillMode ( descrM, CUSPARSE_FILL_MODE_LOWER ); // create & perform analysis info for the non-trans & trans case cusparseSolveAnalysisInfo_t info = 0, infoTrans = 0; cusparseCreateSolveAnalysisInfo(&info); cusparseCreateSolveAnalysisInfo(&infoTrans); cusparseScsrsv_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, descrM, d_valsICP, d_rowPtrsICP, d_colIndsICP, info); cusparseScsrsv_analysis(handle, CUSPARSE_OPERATION_TRANSPOSE, N, descrM, d_valsICP, d_rowPtrsICP, d_colIndsICP, infoTrans); ... // Solve M z = H H^T z = r by first doing a forward solve: H y = r cusparseScsrsv_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, 1.0, descrM, d_valsICP, d_rowPtrsICP, d_colIndsICP, info, d_r, d_y); // and then a back substitution: H^T z = y cusparseScsrsv_solve(handle, CUSPARSE_OPERATION_TRANSPOSE, N, 1.0, descrM, d_valsICP, d_rowPtrsICP, d_colIndsICP, infoTrans, d_y, d_z); ... cusparseDestroy(handle); cusparse
  • 36. cudaError_t cudaResult = cudaSuccess; // Allocate memory for points float *d_points = 0; cudaResult = cudaMalloc((void **)&d_points, 2 * m_numSims * sizeof(float)); // Generate random points in unit square curandStatus_t curandResult; curandGenerator_t qrng; curandResult = curandCreateGenerator(&qrng, CURAND_RNG_QUASI_SOBOL32); curandResult = curandSetQuasiRandomGeneratorDimensions(qrng, 2); curandResult = curandSetGeneratorOrdering(qrng, CURAND_ORDERING_QUASI_DEFAULT); curandResult = curandGenerateUniform(qrng, (float *)d_points, 2 * m_numSims); // Cleanup curandResult = curandDestroyGenerator(qrng); cudaFree(d_points); curand
  • 37. // declare a host image object for an 8-bit grayscale image npp::ImageCPU_8u_C1 oHostSrc; // load gray-scale image from disk npp::loadImage(sFilename, oHostSrc); // declare a device image and copy from the host image to the device npp::ImageNPP_8u_C1 oDeviceSrc(oHostSrc); // create struct with box-filter mask size NppiSize oMaskSize = {5, 5}; // create struct with ROI size given the current mask NppiSize oSizeROI = {oDeviceSrc.width() - oMaskSize.width + 1, oDeviceSrc.height() - oMaskSize.height + 1}; // allocate device image of appropriately reduced size npp::ImageNPP_8u_C1 oDeviceDst(oSizeROI.width, oSizeROI.height); // set anchor point inside the mask to (0, 0) NppiPoint oAnchor = {0, 0}; // run box filter nppiFilterBox_8u_C1R(oDeviceSrc.data(), oDeviceSrc.pitch(), oDeviceDst.data(), oDeviceDst.pitch(), oSizeROI, oMaskSize, oAnchor); // declare a host image for the result npp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size()); // and copy the device result data into it oDeviceDst.copyTo(oHostDst.data(), oHostDst.pitch()); npp
  • 38. Streams 8
  • 39. cudaStream_t stream; cutilSafeCall( cudaStreamCreate(&stream) ); // allocate page locked memory cutilSafeCall( cudaMallocHost((void**)&a, nbytes, cudaHostAllocDefault) ); // allocate device memory cutilSafeCall( cudaMalloc((void**)&d_a, nbytes) ); cutilSafeCall( cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, stream) ); // run kernel and copy result back cutilSafeCall( cudaEventRecord(start, stream) ); kernel<<<N,M,0,stream>>>(&d_a, ... ); cutilSafeCall( cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, stream) ); // free cudaStreamDestroy(stream); cudaFreeHost(a); cudaFree(d_a); pinned memory
  • 40. // loop over full data, in bite-sized chunks for (int i=0; i<FULL_DATA_SIZE; i+= N) { // copy the locked memory to the device, async cutilSafeCall( cudaMemcpyAsync(dev_a, host_a+i, N * sizeof(int), cudaMemcpyHostToDevice, stream) ); cutilSafeCall( cudaMemcpyAsync(dev_b, host_b+i, N * sizeof(int), cudaMemcpyHostToDevice, stream) ); kernel<<<N/256,256,0,stream>>>(dev_a, dev_b, dev_c); // copy the data from device to locked memory cutilSafeCall( cudaMemcpyAsync(host_c+i, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost, stream) ); } // wait for all operations to finish cutilSafeCall( cudaStreamSynchronize(stream) ); chunked computation
  • 41. cudaStream_t *streamArray = 0; streamArray = (cudaStream_t *)malloc(N * sizeof (cudaStream_t *)); ... for ( int i = 0; i < N ; i++) { cudaStreamCreate(&streamArray[i]); ... } ... for ( int i = 0; i < N ; i++) { cublasSetMatrix (..., devPtrA[i], ...); ... } ... for ( int i = 0; i < N ; i++) { cublasSetStream(handle, streamArray[i]); cublasSgemm(handle, ..., devPtrA[i], devPtrB[i], devPtrC[i], ...); } cudaThreadSynchronize(); batched computation
  • 42. use it to specify in which order operations get executed async. • idea is to use more than 1 stream • requires a new kind of mem. copy which in turn requires pinned (page-locked) mem. • free pinned mem. when not needed keep in mind
  • 43. // Allocate resources for( int i =0; i<STREAM_COUNT; ++i ) { cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault); cudaMalloc(&d_data_in[i], memsize); ... } int current_stream = 0; // Do processing in a loop... { int next_stream = (current_stream + 1 ) % STREAM_COUNT; // Ensure that processing and copying of the last cycle has finished cudaEventSynchronize(cycleDone[next_stream]); // Process current frame kernel<<<grid, block, 0, stream[current_stream]>>>(d_data_out[current_stream], d_data_in[current_stream], N, ...); // Upload next frame cudaMemcpyAsync(d_data_in[next_stream], ..., cudaMemcpyHostToDevice, stream[next_stream]); // Download current frame cudaMemcpyAsync(h_data_out[current_stream], ..., cudaMemcpyDeviceToHost, stream[current_stream]); cudaEventRecord(cycleDone[current_stream], stream[current_stream]); current_stream = next_stream; } overlap kernel exec. & memcpy
  • 44. devices with CC 1.1 and above can overlap a kernel exec & memcpy as long as they are issued from different streams • kernels are serialized • queue in a way that independent streams can execute in parallel keep in mind
  • 46. float *a, *d_a; ... /* Allocate mapped CPU memory. */ cutilSafeCall( cudaHostAlloc((void **)&a, bytes, cudaHostAllocMapped) ); ... /* Initialize the vectors. */ for(n = 0; n < nelem; n++) { a[n] = rand() / (float)RAND_MAX; ... } /* Get the device pointers for the pinned CPU memory mapped into the GPU memory space. */ cutilSafeCall( cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0) ); ... /* Call the GPU kernel using the device pointers for the mapped memory. */ ... kernel<<<grid, block>>>(d_a, d_b, d_c, nelem); ... /* Memory clean up */ cutilSafeCall( cudaFreeHost(a) ); ... zero-copy host memory
  • 47. //Create streams for issuing GPU command asynchronously and allocate memory for(int i = 0; i < GPU_N; i++) { cutilSafeCall( cudaStreamCreate(&stream[i]) ); cutilSafeCall( cudaMalloc((void**)&d_Data[i], dataN * sizeof(float)) ); cutilSafeCall( cudaMallocHost((void**)&h_Data[i], dataN * sizeof(float)) ); //init h_Data } //Copy data to GPU, launch the kernel and copy data back. All asynchronously for(int i = 0; i < GPU_N; i++) { //Set device cutilSafeCall( cudaSetDevice(i) ); // Copy input data from CPU cutilSafeCall( cudaMemcpyAsync(d_Data[i], h_Data[i], dataN * sizeof(float), cudaMemcpyHostToDevice, stream[i]) ); // Perform GPU computations kernel<<<blocks, threads, 0, stream[i]>>>(...) // Copy back the result cutilSafeCall( cudaMemcpyAsync(h_Sum_from_device[i], d_Sum[i], ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost, stream[i]) ); } streams
  • 48. // Process GPU results for(i = 0; i < GPU_N; i++) { // Set device cutilSafeCall( cudaSetDevice(i) ); // Wait for all operations to finish cudaStreamSynchronize(stream[i]); // Shut down this GPU cutilSafeCall( cudaFreeHost(h_Data[i]) ); cutilSafeCall( cudaFree(d_Data[i]) ); cutilSafeCall( cudaStreamDestroy(stream[i]) ); } // shutdown for(int i = 0; i < GPU_N; i++) { cutilSafeCall( cudaSetDevice(i) ); cutilDeviceReset(); } process the result
  • 49. can also control each GPU by a separate CPU thread • need to assign portable pinned memory if a different thread needs access to one thread’s memory • use the flag cudaHostAllocPortable to cudaHostAlloc() keep in mind
  • 50. // Initialize MPI state MPI_CHECK( MPI_Init(&argc, &argv) ); // Get our MPI node number and node count int commSize, commRank; MPI_CHECK( MPI_Comm_size(MPI_COMM_WORLD, &commSize) ); MPI_CHECK( MPI_Comm_rank(MPI_COMM_WORLD, &commRank) ); if(commRank == 0) {// Are we the root node? //initialize dataRoot... } // Allocate a buffer on each node float * dataNode = new float[dataSizePerNode]; // Dispatch a portion of the input data to each node MPI_CHECK( MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode, dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD) ); // if commRank == 0 then free dataRoot... kernel<<<gridSize, blockSize>>>(dataNode, ...); // Reduction to the root node float sumNode = sum(dataNode, dataSizePerNode); float sumRoot; MPI_CHECK( MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD) ); MPI_CHECK( MPI_Finalize() ); mpi + cuda
  • 51. // Enable peer access cutilSafeCall(cudaSetDevice(gpuid_tesla[0])); cutilSafeCall(cudaDeviceEnablePeerAccess(gpuid_tesla[1], gpuid_tesla[0])); ... // Allocate buffers cudaSetDevice(gpuid_tesla[0]); cudaMalloc(&g0, buf_size); cudaSetDevice(gpuid_tesla[1]); cudaMalloc(&g1, buf_size); // Ping-pong copy between GPUs cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault); // Prepare host buffer and copy to GPU 0 cudaSetDevice(gpuid_tesla[0]); cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault); // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing // output to the GPU 1 buffer: dst[idx] = src[idx] * 2.0f cudaSetDevice(gpuid_tesla[1]); kernel<<<blocks, threads>>>(g0, g1); cutilDeviceSynchronize(); // Disable peer access (also unregisters memory for non-UVA cases) cudaSetDevice(gpuid_tesla[0]); cudaDeviceDisablePeerAccess(gpuid_tesla[1]); cudaSetDevice(gpuid_tesla[1]); cudaDeviceDisablePeerAccess(gpuid_tesla[0]); cudaFree(g0); ... P2P & unified virtual address space
  • 53.
  • 54. Thank you download slides (2MB pdf) from