6. __global__ void dot( float *a, float *b, float *c )
{
__shared__ float cache[threadsPerBlock];
int cacheIndex = threadIdx.x;
...
// set the cache values
cache[cacheIndex] = temp;
// synchronize threads in this block
__syncthreads();
...
}
int main( void )
{
...
dot<<<blocksPerGrid,threadsPerBlock>>>( d_a, d_b, d_c );
...
}
shared memory
7. • thread coop. & shared mem. useful
for reduction algorithms (see the
reduction sketch after this slide)
• avoid race conditions by using
__syncthreads()
• avoid bank conflicts
• every thread in the block needs to
call __syncthreads()
keep in mind
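A minimal sketch of the block-level reduction these bullets describe, in the style of the dot-product slide; it assumes threadsPerBlock is the compile-time constant from that example and is a power of two, and the kernel name and the partialSums output are illustrative, not from the original deck:

__global__ void blockSum( const float *in, float *partialSums, int n )
{
__shared__ float cache[threadsPerBlock];
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int cacheIndex = threadIdx.x;
// each thread loads one element (0 if out of range)
cache[cacheIndex] = (tid < n) ? in[tid] : 0.0f;
__syncthreads(); // every thread in the block must reach this
// tree-style reduction in shared memory; the stride halves each step,
// and the sequential addressing pattern avoids bank conflicts
for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
if (cacheIndex < stride)
cache[cacheIndex] += cache[cacheIndex + stride];
__syncthreads(); // called by all threads, outside the if
}
// thread 0 writes this block's partial sum
if (cacheIndex == 0)
partialSums[blockIdx.x] = cache[0];
}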
9. __constant__ float constFloat;
__device__ float getConstFloat() { return constFloat; }
__global__ void addConstant(float *vec, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i<N)
vec[i] += getConstFloat();
}
#include <cutil_inline.h>
int main( int argc, char** argv)
{
float constValue = 4.0f;
cutilSafeCall( cudaMemcpyToSymbol(constFloat,
&constValue,
sizeof(float), 0,
cudaMemcpyHostToDevice) );
...
}
constant mem.
10. • read-only, but conserves mem.
bandwidth
• a single read can be broadcast to
the other threads of a half-warp and
cached for additional reads
• painfully slow when each thread
reads a different address from
constant memory (see sketch below)
keep in mind
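A small sketch contrasting the two access patterns from the bullets above; constTable and both kernel names are illustrative and not part of the original example:

__constant__ float constTable[256];
// good: every thread of a warp reads the same address, so the value
// is broadcast and subsequent reads are served from the constant cache
__global__ void uniformRead(float *out, int k, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N) out[i] += constTable[k];
}
// bad: each thread reads a different address, so the reads are
// serialized instead of broadcast
__global__ void divergentRead(float *out, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N) out[i] += constTable[threadIdx.x];
}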
12. • read-only, like const. mem.
• great when memory access exhibits
spatial locality, i.e. each thread
reads a loc. near where the next or
previous thread reads
• comes in 1-D, 2-D and 3-D versions
& typically used in finite diff. apps
(see the texture sketch below)
keep in mind
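The deck shows no texture code, so here is a minimal sketch using the legacy texture-reference API that matches the era of the other examples; the names tex and texRead, the image size, and the launch configuration are illustrative assumptions:

texture<float, 2, cudaReadModeElementType> tex; // 2-D texture reference
__global__ void texRead(float *out, int width, int height)
{
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x < width && y < height)
out[y*width + x] = tex2D(tex, x, y); // cached, spatially local read
}
int main( int argc, char** argv)
{
const int width = 256, height = 256;
const size_t size = width * height * sizeof(float);
float *h_data = (float*)malloc(size); // fill with input data...
float *d_out; cudaMalloc((void**)&d_out, size);
// put the input into a cudaArray and bind it to the texture reference
cudaChannelFormatDesc channelDesc =
cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray* cu_array;
cudaMallocArray(&cu_array, &channelDesc, width, height);
cudaMemcpyToArray(cu_array, 0, 0, h_data, size, cudaMemcpyHostToDevice);
cudaBindTextureToArray(tex, cu_array, channelDesc);
dim3 dimBlock(16, 16), dimGrid(width/16, height/16);
texRead<<<dimGrid, dimBlock>>>(d_out, width, height);
// unbind and clean up
cudaUnbindTexture(tex);
cudaFreeArray(cu_array); cudaFree(d_out); free(h_data);
}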
13. surface<void, 2> output_surface;
__global__ void surfaceWrite(float* g_idata, int width, int height) {
// calculate surface coordinates
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
// read from global memory and write to cuarray (via surface reference)
surf2Dwrite(g_idata[y*width+x], output_surface, x*4, y, cudaBoundaryModeTrap);
}
int main( int argc, char** argv) {
...
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0,
cudaChannelFormatKindFloat);
cudaArray* cu_array;
cutilSafeCall( cudaMallocArray(&cu_array, &channelDesc, width, height,
cudaArraySurfaceLoadStore) );
cutilSafeCall( cudaMemcpy( d_data, h_data, size, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaBindSurfaceToArray(output_surface, cu_array) );
surfaceWrite<<<dimGrid, dimBlock>>>(d_data, width, height);
...
cutilSafeCall( cudaFree(d_data) );
cutilSafeCall( cudaFreeArray(cu_array) );
}
surface mem.
15. // OpenGL Graphics includes
#include <GL/glew.h>
#if defined (__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/freeglut.h>
#endif
int main(int argc, char **argv) {
// Initialize GL
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB);
glutInitWindowSize(1000, 1000);
// Create a window with rendering context and all else we need
glutCreateWindow("CUDA Interop.");
// initialize necessary OpenGL extensions
glewInit();
// Select CUDA device with OpenGL interoperability
if (cutCheckCmdLineFlag(argc, (const char**)argv, "device")) {
cutilGLDeviceInit(argc, argv);
}
else {
cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
}
}
set device
16. // vbo variables
GLuint vbo;
struct cudaGraphicsResource *cuda_vbo_resource;
void *d_vbo_buffer = NULL;
// create buffer object
glGenBuffers(1, &vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
// initialize buffer object
unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);
glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, 0);
// register this buffer object with CUDA
cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo,
cudaGraphicsMapFlagsWriteDiscard));
register data with CUDA
17. // map OpenGL buffer object for writing from CUDA
float4 *dptr;
cutilSafeCall( cudaGraphicsMapResources(1, &cuda_vbo_resource, 0) );
size_t num_bytes;
cutilSafeCall( cudaGraphicsResourceGetMappedPointer((void **)&dptr,
&num_bytes,
cuda_vbo_resource) );
// run kernel
kernel<<<blocks,threads>>>(dptr,...);
// unmap buffer object
cutilSafeCall( cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0) );
pass data via shared buffers
18. • need to tell the CUDA runtime the
device we intend to use for CUDA
and OpenGL
• initialize OpenGL first and then use
the cudaGLSetGLDevice() method
• DirectX interop. is nearly identical
keep in mind
25. • creating and recording events is
tricky since some CUDA calls are
asynch.
• all kernel launches are asynch.
• instruct the CPU to synch. on an
event via cudaEventSynchronize()
(see the timing sketch below)
keep in mind
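Since the event slides themselves are not reproduced here, this is a minimal timing sketch with the event API the bullets refer to; the kernel, its launch configuration and its arguments are placeholders:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// record 'start', issue the (asynchronous) work, then record 'stop'
cudaEventRecord(start, 0);
kernel<<<blocks, threads>>>(d_a, d_b, d_c);
cudaEventRecord(stop, 0);
// block the CPU until 'stop' has actually been reached on the GPU
cudaEventSynchronize(stop);
float elapsedMs = 0.0f;
cudaEventElapsedTime(&elapsedMs, start, stop); // milliseconds between events
cudaEventDestroy(start);
cudaEventDestroy(stop);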
29. // Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice(); cuDeviceGet(device, 0);
CUcontext context = new CUcontext(); cuCtxCreate(context, 0, device);
// Create the PTX file by calling the NVCC and load it
String ptxFileName = preparePtxFile("JCudaVectorAddKernel.cu");
CUmodule module = new CUmodule(); cuModuleLoad(module, ptxFileName);
// Obtain a function pointer to the "add" function.
CUfunction function = new CUfunction(); cuModuleGetFunction(function, module, "add");
// Allocate host and device input data, and copy the host input to the device
float hostInputA[] = new float[numElements]; CUdeviceptr deviceInputA = new CUdeviceptr();
cuMemAlloc(deviceInputA, numElements * Sizeof.FLOAT);
cuMemcpyHtoD(deviceInputA, Pointer.to(hostInputA), numElements * Sizeof.FLOAT);
...
// Set up the kernel parameters
Pointer kernelParameters = Pointer.to(Pointer.to(deviceInputA),...);
// Call the kernel function
int blockSizeX = 256; int gridSizeX = (int)Math.ceil((double)numElements / blockSizeX);
cuLaunchKernel(function,
gridSizeX, 1, 1, // Grid dimension
blockSizeX, 1, 1, // Block dimension
0, null, // Shared memory size and stream
kernelParameters, null); // Kernel- and extra parameters
cuCtxSynchronize();
jcuda
33. cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
float* h_A = (float*)malloc(N * N * sizeof(h_A[0]));
...
/* Fill the matrices with test data */
...
/* Allocate device memory for the matrices */
cudaMalloc((void**)&d_A, N * N * sizeof(d_A[0]));
...
/* Initialize the device matrices with the host matrices */
status = cublasSetVector(N * N, sizeof(h_A[0]), h_A, 1, d_A, 1);
...
/* Perform Sgemm: C <- alpha*A*B + beta*C */
status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
&alpha, d_A, N, d_B, N, &beta, d_C, N);
/* Allocate host mem & read back the result from device mem */
h_C = (float*)malloc(N * N * sizeof(h_C[0]));
status = cublasGetVector(N * N, sizeof(h_C[0]), d_C, 1, h_C, 1);
/* Memory clean up */
cudaFree(d_A);
...
/* Shutdown */
status = cublasDestroy(handle);
cublas
34. cudaSetDevice( cutGetMaxGflopsDeviceId() );
// Allocate & init. host memory for the signal
Complex* h_signal = (Complex*)malloc(sizeof(Complex) * SIGNAL_SIZE);
...
// Pad signal
Complex* h_padded_signal;
...
// Allocate device memory for signal
Complex* d_signal;
cutilSafeCall( cudaMalloc((void**)&d_signal, mem_size) );
// Copy host memory to device
cutilSafeCall( cudaMemcpy(d_signal, h_padded_signal, mem_size,
cudaMemcpyHostToDevice) );
// CUFFT plan
cufftHandle plan;
cufftSafeCall( cufftPlan1d(&plan, new_size, CUFFT_C2C, 1) );
// Transform signal
cufftSafeCall( cufftExecC2C(plan, (cufftComplex *)d_signal,
(cufftComplex *)d_signal, CUFFT_FORWARD) );
// Destroy CUFFT context
cufftSafeCall( cufftDestroy(plan) );
// Cleanup memory
cutilSafeCall( cudaFree(d_signal) );
...
cutilDeviceReset();
cufft
35. cusparseHandle_t handle = 0;
cusparseStatus_t status = cusparseCreate(&handle);
// create a matrix description for the matrix M
cusparseMatDescr_t descrM = 0; status = cusparseCreateMatDescr(&descrM);
cusparseSetMatType ( descrM, CUSPARSE_MATRIX_TYPE_TRIANGULAR );
cusparseSetMatIndexBase ( descrM, CUSPARSE_INDEX_BASE_ZERO );
cusparseSetMatDiagType ( descrM, CUSPARSE_DIAG_TYPE_NON_UNIT );
cusparseSetMatFillMode ( descrM, CUSPARSE_FILL_MODE_LOWER );
// create analysis info & perform the analysis for the non-transpose & transpose cases
cusparseSolveAnalysisInfo_t info = 0, infoTrans = 0;
cusparseCreateSolveAnalysisInfo(&info);
cusparseCreateSolveAnalysisInfo(&infoTrans);
cusparseScsrsv_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, descrM,
d_valsICP, d_rowPtrsICP, d_colIndsICP, info);
cusparseScsrsv_analysis(handle, CUSPARSE_OPERATION_TRANSPOSE, N, descrM,
d_valsICP, d_rowPtrsICP, d_colIndsICP, infoTrans);
...
// Solve M z = H H^T z = r by first doing a forward solve: H y = r
cusparseScsrsv_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, 1.0, descrM,
d_valsICP, d_rowPtrsICP, d_colIndsICP, info, d_r, d_y);
// and then a back substitution: H^T z = y
cusparseScsrsv_solve(handle, CUSPARSE_OPERATION_TRANSPOSE, N, 1.0, descrM,
d_valsICP, d_rowPtrsICP, d_colIndsICP, infoTrans, d_y, d_z);
...
cusparseDestroy(handle);
cusparse
37. // declare a host image object for an 8-bit grayscale image
npp::ImageCPU_8u_C1 oHostSrc;
// load gray-scale image from disk
npp::loadImage(sFilename, oHostSrc);
// declare a device image and copy from the host image to the device
npp::ImageNPP_8u_C1 oDeviceSrc(oHostSrc);
// create struct with box-filter mask size
NppiSize oMaskSize = {5, 5};
// create struct with ROI size given the current mask
NppiSize oSizeROI = {oDeviceSrc.width() - oMaskSize.width + 1,
oDeviceSrc.height() - oMaskSize.height + 1};
// allocate device image of appropriately reduced size
npp::ImageNPP_8u_C1 oDeviceDst(oSizeROI.width, oSizeROI.height);
// set anchor point inside the mask to (0, 0)
NppiPoint oAnchor = {0, 0};
// run box filter
nppiFilterBox_8u_C1R(oDeviceSrc.data(), oDeviceSrc.pitch(),
oDeviceDst.data(), oDeviceDst.pitch(),
oSizeROI, oMaskSize, oAnchor);
// declare a host image for the result
npp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size());
// and copy the device result data into it
oDeviceDst.copyTo(oHostDst.data(), oHostDst.pitch());
npp
40. // loop over full data, in bite-sized chunks
for (int i=0; i<FULL_DATA_SIZE; i+= N) {
// copy the locked memory to the device, async
cutilSafeCall( cudaMemcpyAsync(dev_a, host_a+i,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream) );
cutilSafeCall( cudaMemcpyAsync(dev_b, host_b+i,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream) );
kernel<<<N/256,256,0,stream>>>(dev_a, dev_b, dev_c);
// copy the data from device to locked memory
cutilSafeCall( cudaMemcpyAsync(host_c+i, dev_c,
N * sizeof(int),
cudaMemcpyDeviceToHost,
stream) );
}
// wait for all operations to finish
cutilSafeCall( cudaStreamSynchronize(stream) );
chunked computation
41. cudaStream_t *streamArray = 0;
streamArray = (cudaStream_t *)malloc(N * sizeof(cudaStream_t));
...
for ( int i = 0; i < N ; i++) {
cudaStreamCreate(&streamArray[i]);
...
}
...
for ( int i = 0; i < N ; i++) {
cublasSetMatrix (..., devPtrA[i], ...);
...
}
...
for ( int i = 0; i < N ; i++) {
cublasSetStream(handle, streamArray[i]);
cublasSgemm(handle, ..., devPtrA[i], devPtrB[i], devPtrC[i], ...);
}
cudaThreadSynchronize();
batched computation
42. • use it to specify in which order
operations get executed async.
• idea is to use more than 1 stream
• requires a new kind of mem. copy,
which in turn requires pinned
(page-locked) mem.
• free pinned mem. when not needed
(see the sketch after this slide)
keep in mind
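A minimal sketch of the setup these bullets assume (pinned allocation, asynchronous copy in a stream, clean-up); the buffer names, N, and the kernel are placeholders and error checking is omitted:

cudaStream_t stream;
cudaStreamCreate(&stream);
// pinned (page-locked) host memory is required for cudaMemcpyAsync
int *host_a, *dev_a;
cudaHostAlloc((void**)&host_a, N * sizeof(int), cudaHostAllocDefault);
cudaMalloc((void**)&dev_a, N * sizeof(int));
// the copy returns immediately; it is ordered within 'stream'
cudaMemcpyAsync(dev_a, host_a, N * sizeof(int),
cudaMemcpyHostToDevice, stream);
kernel<<<N/256, 256, 0, stream>>>(dev_a);
cudaMemcpyAsync(host_a, dev_a, N * sizeof(int),
cudaMemcpyDeviceToHost, stream);
// wait for the stream, then free the pinned memory when no longer needed
cudaStreamSynchronize(stream);
cudaFreeHost(host_a);
cudaFree(dev_a);
cudaStreamDestroy(stream);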
43. // Allocate resources
for( int i =0; i<STREAM_COUNT; ++i ) {
cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault);
cudaMalloc(&d_data_in[i], memsize);
...
}
int current_stream = 0;
// Do processing in a loop...
{
int next_stream = (current_stream + 1 ) % STREAM_COUNT;
// Ensure that processing and copying of the last cycle has finished
cudaEventSynchronize(cycleDone[next_stream]);
// Process current frame
kernel<<<grid, block, 0, stream[current_stream]>>>(d_data_out[current_stream],
d_data_in[current_stream],
N, ...);
// Upload next frame
cudaMemcpyAsync(d_data_in[next_stream], ..., cudaMemcpyHostToDevice,
stream[next_stream]);
// Download current frame
cudaMemcpyAsync(h_data_out[current_stream], ..., cudaMemcpyDeviceToHost,
stream[current_stream]);
cudaEventRecord(cycleDone[current_stream], stream[current_stream]);
current_stream = next_stream;
}
overlap kernel exec. & memcpy
44. • devices with CC 1.1 and above can
overlap a kernel exec & memcpy as
long as they are issued from
different streams
• kernels are serialized
• queue in a way that independent
streams can execute in parallel
keep in mind
46. float *a, *d_a;
...
/* Allocate mapped CPU memory. */
cutilSafeCall( cudaHostAlloc((void **)&a, bytes, cudaHostAllocMapped) );
...
/* Initialize the vectors. */
for(n = 0; n < nelem; n++) { a[n] = rand() / (float)RAND_MAX; ... }
/* Get the device pointers for the pinned CPU memory mapped into the GPU
memory space. */
cutilSafeCall( cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0) );
...
/* Call the GPU kernel using the device pointers for the mapped memory. */
...
kernel<<<grid, block>>>(d_a, d_b, d_c, nelem);
...
/* Memory clean up */
cutilSafeCall( cudaFreeHost(a) );
...
zero-copy host memory
//Create streams for issuing GPU commands asynchronously and allocate memory
for(int i = 0; i < GPU_N; i++) {
//Select GPU i so its stream and buffers are created on that device
cutilSafeCall( cudaSetDevice(i) );
cutilSafeCall( cudaStreamCreate(&stream[i]) );
cutilSafeCall( cudaMalloc((void**)&d_Data[i], dataN * sizeof(float)) );
cutilSafeCall( cudaMalloc((void**)&d_Data[i], dataN * sizeof(float)) );
cutilSafeCall( cudaMallocHost((void**)&h_Data[i], dataN * sizeof(float)) );
//init h_Data
}
//Copy data to GPU, launch the kernel and copy data back. All asynchronously
for(int i = 0; i < GPU_N; i++) {
//Set device
cutilSafeCall( cudaSetDevice(i) );
// Copy input data from CPU
cutilSafeCall( cudaMemcpyAsync(d_Data[i], h_Data[i], dataN * sizeof(float),
cudaMemcpyHostToDevice, stream[i]) );
// Perform GPU computations
kernel<<<blocks, threads, 0, stream[i]>>>(...);
// Copy back the result
cutilSafeCall( cudaMemcpyAsync(h_Sum_from_device[i], d_Sum[i],
ACCUM_N * sizeof(float),
cudaMemcpyDeviceToHost, stream[i]) );
}
streams
48. // Process GPU results
for(int i = 0; i < GPU_N; i++) {
// Set device
cutilSafeCall( cudaSetDevice(i) );
// Wait for all operations to finish
cudaStreamSynchronize(stream[i]);
// Shut down this GPU
cutilSafeCall( cudaFreeHost(h_Data[i]) );
cutilSafeCall( cudaFree(d_Data[i]) );
cutilSafeCall( cudaStreamDestroy(stream[i]) );
}
// shutdown
for(int i = 0; i < GPU_N; i++) {
cutilSafeCall( cudaSetDevice(i) );
cutilDeviceReset();
}
process the result
49. • can also control each GPU by a
separate CPU thread
• need to allocate portable pinned
memory if a different thread needs
access to one thread’s memory
• pass the flag cudaHostAllocPortable
to cudaHostAlloc() (see sketch below)
keep in mind
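A minimal sketch of allocating portable pinned memory as the last bullet describes, reusing the names from the multi-GPU slides (dataN, d_Data, stream); h_buffer is an illustrative name:

float *h_buffer;
// portable pinned memory: page-locked for every CUDA context,
// so any host thread (and thus any GPU) can use it for async copies
cutilSafeCall( cudaHostAlloc((void**)&h_buffer,
dataN * sizeof(float),
cudaHostAllocPortable) );
...
// each CPU thread selects its own GPU and streams from the same buffer
cutilSafeCall( cudaSetDevice(i) );
cutilSafeCall( cudaMemcpyAsync(d_Data[i], h_buffer, dataN * sizeof(float),
cudaMemcpyHostToDevice, stream[i]) );
...
cutilSafeCall( cudaFreeHost(h_buffer) );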
50. // Initialize MPI state
MPI_CHECK( MPI_Init(&argc, &argv) );
// Get our MPI node number and node count
int commSize, commRank;
MPI_CHECK( MPI_Comm_size(MPI_COMM_WORLD, &commSize) );
MPI_CHECK( MPI_Comm_rank(MPI_COMM_WORLD, &commRank) );
if(commRank == 0) {// Are we the root node?
//initialize dataRoot...
}
// Allocate a buffer on each node
float * dataNode = new float[dataSizePerNode];
// Dispatch a portion of the input data to each node
MPI_CHECK( MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode,
dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD) );
// if commRank == 0 then free dataRoot...
// copy dataNode into a device buffer, run the kernel, copy the result back
// (device allocation and host<->device copies are elided here)
kernel<<<gridSize, blockSize>>>(d_dataNode, ...);
// Reduction to the root node
float sumNode = sum(dataNode, dataSizePerNode);
float sumRoot;
MPI_CHECK( MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0,
MPI_COMM_WORLD) );
MPI_CHECK( MPI_Finalize() );
mpi + cuda
51. // Enable peer access
cutilSafeCall(cudaSetDevice(gpuid_tesla[0]));
cutilSafeCall(cudaDeviceEnablePeerAccess(gpuid_tesla[1], 0));
...
// Allocate buffers
cudaSetDevice(gpuid_tesla[0]); cudaMalloc(&g0, buf_size);
cudaSetDevice(gpuid_tesla[1]); cudaMalloc(&g1, buf_size);
// Ping-pong copy between GPUs
cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);
// Prepare host buffer and copy to GPU 0
cudaSetDevice(gpuid_tesla[0]); cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault);
// Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
// output to the GPU 1 buffer: dst[idx] = src[idx] * 2.0f
cudaSetDevice(gpuid_tesla[1]); kernel<<<blocks, threads>>>(g0, g1);
cutilDeviceSynchronize();
// Disable peer access (also unregisters memory for non-UVA cases)
cudaSetDevice(gpuid_tesla[0]); cudaDeviceDisablePeerAccess(gpuid_tesla[1]);
cudaSetDevice(gpuid_tesla[1]); cudaDeviceDisablePeerAccess(gpuid_tesla[0]);
cudaFree(g0);
...
P2P & unified virtual address space