CUDA Deep Dive
Kashif Rasul
@krasul
Hello, my name is Kashif.
Objective: a deeper understanding.
Prerequisites

         1
#include <cutil_inline.h>

int main( void )
{
    int N = 50000;
    size_t size = N * sizeof(float);

    // pick the fastest device and allocate/copy the input vectors
    cudaSetDevice( cutGetMaxGflopsDeviceId() );
    ...
    cutilSafeCall( cudaMalloc((void**)&d_A, size) );
    cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );
    ...

    // launch enough blocks to cover all N elements
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    add<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    ...

    // copy the result back and clean up
    cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) );
    cudaFree(d_A);
    ...
    cutilDeviceReset();
}


                                                 blocks & threads
__global__ void dot( float *a, float *b, float *c )
{
  __shared__ float cache[threadsPerBlock];
  int cacheIndex = threadIdx.x;
  ...
  // set the cache values
  cache[cacheIndex] = temp;
  // synchronize threads in this block
  __syncthreads();
  ...
}

int main( void )
{
  ...
  dot<<<blocksPerGrid,threadsPerBlock>>>( d_a, d_b, d_c );
  ...
}




                                             shared memory
•   thread coop. & shared mem. are
    useful for reduction algorithms
    (see the sketch below)

•   avoid race conditions by using
    __syncthreads()

•   avoid bank conflicts

•   every thread in the block needs to
    call __syncthreads()


                                 keep in mind
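
The reduction elided from the dot kernel above might look like the following
sketch (assuming a file-scope const int N for the vector length, and that
threadsPerBlock is a compile-time constant and a power of two; each block
leaves one partial sum in c):

__global__ void dot( float *a, float *b, float *c )
{
  __shared__ float cache[threadsPerBlock];
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  int cacheIndex = threadIdx.x;

  // each thread accumulates a partial dot product over a grid-sized stride
  float temp = 0;
  while (tid < N) {
    temp += a[tid] * b[tid];
    tid += blockDim.x * gridDim.x;
  }

  // set the cache values
  cache[cacheIndex] = temp;
  __syncthreads();

  // tree reduction in shared memory; the barrier stays outside the if
  // so that every thread in the block reaches it
  for (int i = blockDim.x/2; i > 0; i /= 2) {
    if (cacheIndex < i)
      cache[cacheIndex] += cache[cacheIndex + i];
    __syncthreads();
  }

  if (cacheIndex == 0)
    c[blockIdx.x] = cache[0];
}
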
Memory

     2
__constant__ float constFloat;

__device__ float getConstFloat() { return constFloat; }

__global__ void addConstant(float *vec, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i<N)
        vec[i] += getConstFloat();
}

#include <cutil_inline.h>

int main( int argc, char** argv)
{
    float constValue = 4.0f;

    cutilSafeCall( cudaMemcpyToSymbol(constFloat,
                                      &constValue,
                                      sizeof(float), 0,
                                      cudaMemcpyHostToDevice) );
    ...
}

                                                 constant mem.
•   read-only, but conserves mem.
    bandwidth

•   a single read can be broadcasted and
    cached for additional reads

•   painfully slow when each thread
    reads a different address from
    constant memory (contrast the two
    kernels below)


                                 keep in mind
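
A hypothetical pair of kernels illustrating the broadcast vs. per-thread
access patterns (coeffs is made up for illustration):

__constant__ float coeffs[256];

// fast: every thread in a warp reads the same address, so one fetch
// is broadcast to the whole warp and cached for further reads
__global__ void uniformRead(float *out, int k)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    out[i] = coeffs[k];
}

// slow: each thread reads a different address, so the reads within a
// warp get serialized
__global__ void divergentRead(float *out)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    out[i] = coeffs[i % 256];
}
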
// textures containing look-up tables
texture<uint> edgeTex;
texture<uint, 2> edge2dTex;

int main(int argc, char** argv)
{
    ...
    // d_edgeTable is a uint** here (its declaration is elided above)
    cutilSafeCall( cudaMalloc((void**) d_edgeTable, 256*sizeof(uint)) );
    cutilSafeCall( cudaMemcpy((void *)*d_edgeTable, (void *)edgeTable,
                              256*sizeof(uint), cudaMemcpyHostToDevice) );

    cutilSafeCall( cudaBindTexture(0, edgeTex, *d_edgeTable,
                                   256*sizeof(uint)) );

    // run kernel
    kernel<<<blocks, threads>>>(...);

    // cleanup
    cutilSafeCall( cudaUnbindTexture(edgeTex) );
}

__global__ void kernel(...)
{
    ...
    uint edge = tex1Dfetch(edgeTex, index*16 + i);
    ...
}
                                                        texture mem.
•   read-only, like for const. mem.

•   great when memory access exhibits
    spatial locality, i.e. each thread
    reads a loc. near where the next or
    previous thread reads

•   comes in 1-D, 2-D and 3-D versions
    & typically used in finite diff.
    apps (see the 2-D sketch below)


                                   keep in mind
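
For the finite-difference case, a 2-D fetch sketch (fieldTex is assumed to
be bound to a cudaArray holding the input field, analogous to the binding
code above):

texture<float, 2> fieldTex;

__global__ void laplacian(float *out, int width, int height)
{
    int x = blockIdx.x*blockDim.x + threadIdx.x;
    int y = blockIdx.y*blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    // neighboring threads fetch neighboring (x, y) locations, which is
    // exactly the 2-D spatial locality the texture cache is built for
    float center = tex2D(fieldTex, x, y);
    float up     = tex2D(fieldTex, x, y - 1);
    float down   = tex2D(fieldTex, x, y + 1);
    float left   = tex2D(fieldTex, x - 1, y);
    float right  = tex2D(fieldTex, x + 1, y);

    out[y*width + x] = up + down + left + right - 4.0f*center;
}
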
surface<void, 2> output_surface;

__global__ void surfaceWrite(float* g_idata, int width, int height) {
    // calculate surface coordinates
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

    // read from global memory and write to cuarray (via surface reference)
    surf2Dwrite(g_idata[y*width+x], output_surface, x*4, y, cudaBoundaryModeTrap);
}

int main( int argc, char** argv) {
    ...
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0,
                                                              cudaChannelFormatKindFloat);
    cudaArray* cu_array;
    cutilSafeCall( cudaMallocArray(&cu_array, &channelDesc, width, height,
                                   cudaArraySurfaceLoadStore) );
    cutilSafeCall( cudaMemcpy( d_data, h_data, size, cudaMemcpyHostToDevice) );
    cutilSafeCall( cudaBindSurfaceToArray(output_surface, cu_array) );

    surfaceWrite<<<dimGrid, dimBlock>>>(d_data, width, height);
    ...
    cutilSafeCall( cudaFree(d_data) );
    cutilSafeCall( cudaFreeArray(cu_array) );
}

                                                                  surface mem.
InterOp.

       3
// OpenGL Graphics includes
#include <GL/glew.h>
#if defined (__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/freeglut.h>
#endif

int main(int argc, char **argv) {
    // Initialize GL
    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB);
    glutInitWindowSize(1000, 1000);

    // Create a window with rendering context and all else we need
    glutCreateWindow("CUDA Interop.");

    // initialize necessary OpenGL extensions
    glewInit();

    // Select CUDA device with OpenGL interoperability
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "device")) {
        cutilGLDeviceInit(argc, argv);
    }
    else {
        cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
    }
}
                                                         set device
// vbo variables
GLuint vbo;
struct cudaGraphicsResource *cuda_vbo_resource;
void *d_vbo_buffer = NULL;

// create buffer object
glGenBuffers(1, &vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);

// initialize buffer object
unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);
glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);

glBindBuffer(GL_ARRAY_BUFFER, 0);

// register this buffer object with CUDA
cutilSafeCall( cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo,
                                            cudaGraphicsMapFlagsWriteDiscard) );




                                       register data with CUDA
// map OpenGL buffer object for writing from CUDA
float4 *dptr;
cutilSafeCall( cudaGraphicsMapResources(1, &cuda_vbo_resource, 0) );

size_t num_bytes;
cutilSafeCall( cudaGraphicsResourceGetMappedPointer((void **)&dptr,
                                                    &num_bytes,
                                                    cuda_vbo_resource) );

// run kernel
kernel<<<blocks,threads>>>(dptr,...);

// unmap buffer object
cutilSafeCall( cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0) );




                             pass data via shared buffers
•   need to tell the CUDA runtime the
    device we intend to use for CUDA
    and OpenGL

•   initialize OpenGL first and then use
    the cudaGLSetGLDevice() method

•   DirectX interop. is nearly identical


                                   keep in mind
Pro Tip

      4
➜ git clone https://github.com/kashif/cuda-workshop.git
Cloning into cuda-workshop...
...

➜ cd cuda-workshop

➜ cmake CMakeLists.txt
-- The C compiler identification is GNU
...

➜ make
Scanning dependencies of target cutil
[ 5%] Building CXX object cutil/CMakeFiles/cutil.dir/src/bank_checker.cpp.o
...
[100%] Built target matrixMul

➜ ./bin/matrixMul
[ matrixMul ]
bin/matrixMul Starting (CUDA and CUBLAS tests)...

Device 0: "GeForce GTX 480" with Compute 2.0 capability
...


                                   install CMake, glut & glew
➜ ls src/matrixMul
CMakeLists.txt     matrixMul.cu        matrixMul.h
matrixMul_gold.cpp matrixMul_kernel.cu

➜ cat src/matrixMul/CMakeLists.txt
CUDA_ADD_EXECUTABLE( matrixMul
  matrixMul.cu
  matrixMul_gold.cpp
)

TARGET_LINK_LIBRARIES( matrixMul
  cutil
  shrutil
  ${CUDA_CUBLAS_LIBRARIES}
)

➜ cmake -G "Visual Studio 10 Win64" CMakeLists.txt
...




                            great for experimenting
Events &
 Timers
       5
cudaEvent_t start, stop;
float time;

// initialize events
cutilSafeCall( cudaEventCreate(&start) );
cutilSafeCall( cudaEventCreate(&stop) );

// warmup to avoid timing startup
kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y, 1);

// take measurements for loop over kernel launches
cutilSafeCall( cudaEventRecord(start, 0) );
for (int i=0; i < NUM_REPS; i++) {
    kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y, 1);
    // Ensure no launch failure
    cutilSafeCall( cudaGetLastError() );
}
cutilSafeCall( cudaEventRecord(stop, 0) );
cutilSafeCall( cudaEventSynchronize(stop) );
cutilSafeCall( cudaEventElapsedTime(&time, start, stop) );

// report effective bandwidth in GB/s (2.0f due to read + write)
float bandwidth = 2.0f * mem_size/(1024*1024*1024)/(time/NUM_REPS);

cutilSafeCall( cudaEventDestroy(stop) );
cutilSafeCall( cudaEventDestroy(start) );
                                   events: GPU timestamp
#include <cutil_inline.h>
...
unsigned int timer_matrixMul = 0;

// create and start timer
cutilCheckError( cutCreateTimer(&timer_matrixMul) );
cutilCheckError( cutStartTimer(timer_matrixMul) );

// do some work: launch the kernel nIter times
for (int i = 0; i < nIter; i++) {
    kernel<<<grid, threads, mem_size>>>(d_idata, d_odata);
}
cutilDeviceSynchronize();

// stop timer
cutilCheckError( cutStopTimer(timer_matrixMul) );

double dSeconds = cutGetTimerValue(timer_matrixMul)/((double)nIter * 1000.0);
double dNumOps = 2.0 * (double)uiWA * (double)uiHA * (double)uiWB;
double gflops = 1.0e-9 * dNumOps/dSeconds;

// destroy timer
cutilCheckError( cutDeleteTimer(timer_matrixMul) );



                                                               os timers
•   creating and recording events is
    tricky since some CUDA calls are
    asynch.

•   all kernel launches are asynch.

•   instruct the CPU to synch. on an
    event via cudaEventSynchronize(), or
    on the whole device via
    cudaDeviceSynchronize() (see the
    sketch below)


                                 keep in mind
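
Because kernel launches return immediately, a host-side timer is only
meaningful if the CPU synchronizes before reading it. A minimal sketch using
std::chrono rather than the cutil timers (grid, threads and the kernel
arguments as in the event example above):

#include <chrono>

auto t0 = std::chrono::high_resolution_clock::now();

kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y, 1); // returns immediately

// without this, t1 - t0 measures only the launch overhead
cutilSafeCall( cudaDeviceSynchronize() );

auto t1 = std::chrono::high_resolution_clock::now();
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
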
Bindings

       6
➜ cat hello_gpu.py
import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
import numpy
import numpy.linalg as la
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")

multiply_them = mod.get_function("multiply_them")

a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)

dest = numpy.zeros_like(a)
multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b),
        block=(400,1,1))

print dest-a*b
                                                             pycuda
➜ python hello_gpu.py
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  ...
  0.  0.  0.  0.]
// Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice();       cuDeviceGet(device, 0);
CUcontext context = new CUcontext();    cuCtxCreate(context, 0, device);

// Create the PTX file by calling the NVCC and load it
String ptxFileName = preparePtxFile("JCudaVectorAddKernel.cu");
CUmodule module = new CUmodule();       cuModuleLoad(module, ptxFileName);

// Obtain a function pointer to the "add" function.
CUfunction function = new CUfunction(); cuModuleGetFunction(function, module, "add");

// Allocate the device input data
float hostInputA[] = new float[numElements]; CUdeviceptr deviceInputA = new CUdeviceptr();
cuMemAlloc(deviceInputA, numElements * Sizeof.FLOAT);
cuMemcpyHtoD(deviceInputA, Pointer.to(hostInputA), numElements * Sizeof.FLOAT);
...
// Set up the kernel parameters
Pointer kernelParameters = Pointer.to(Pointer.to(deviceInputA),...);

// Call the kernel function
int blockSizeX = 256; int gridSizeX = (int)Math.ceil((double)numElements / blockSizeX);
cuLaunchKernel(function,
               gridSizeX, 1, 1,         // Grid dimension
               blockSizeX, 1, 1,        // Block dimension
               0, null,                 // Shared memory size and stream
               kernelParameters, null); // Kernel- and extra parameters
cuCtxSynchronize();
                                                                                 jcuda
➜ ls
License.txt                          jcuda-0.4.0-beta1.jar
jcurand-0.4.0-beta1.jar              libJCublas-apple-x86_64.dylib
libJCudaRuntime-apple-x86_64.dylib   libJCurand-apple-x86_64.dylib
jcublas-0.4.0-beta1.jar              jcufft-0.4.0-beta1.jar
jcusparse-0.4.0-beta1.jar            libJCudaDriver-apple-x86_64.dylib
libJCufft-apple-x86_64.dylib         libJCusparse-apple-x86_64.dylib
JCudaVectorAdd.java                  JCudaVectorAddKernel.cu

➜ cat JCudaVectorAddKernel.cu
extern "C"
__global__ void add(float *a, float *b, float *sum, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i<n)
    {
        sum[i] = a[i] + b[i];
    }
}

➜ javac -classpath jcuda-0.4.0-beta1.jar JCudaVectorAdd.java

➜ java -classpath jcuda-0.4.0-beta1.jar:. JCudaVectorAdd
Executing
nvcc -m64 -ptx JCudaVectorAddKernel.cu -o JCudaVectorAddKernel.ptx
Finished creating PTX file
Test PASSED
➜ gem install sgc-ruby-cuda
Successfully installed sgc-ruby-cuda-0.1.1
1 gem installed

➜ cat vector_add.rb
...
# Prepare and load vadd kernel.
kernel_lib_file = compile(vadd_kernel_src)
CudaFunction.load_lib_file(kernel_lib_file.path)

# Copy input buffers from host memory to device memory.
memcpy_htod(da, ha, nbytes)
memcpy_htod(db, hb, nbytes)

# Invoke vadd kernel.
nthreads_per_block = 256
block_dim = Dim3.new(nthreads_per_block, 1, 1)
grid_dim = Dim3.new((N + nthreads_per_block - 1) / nthreads_per_block, 1, 1)
CudaFunction.configure(block_dim, grid_dim)
CudaFunction.setup(da, db, dc, N)
f = CudaFunction.new("vadd")
f.launch

# Copy output buffer from device memory to host memory.
memcpy_dtoh(hc, dc, nbytes)
...

➜ ruby vector_add.rb
Vector Addition
Verification completed. All matches? YES
                                                                ruby-cuda
Libraries

       7
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);

float* h_A = (float*)malloc(N * N * sizeof(h_A[0]));
...
/* Fill the matrices with test data */
...
/* Allocate device memory for the matrices */
cudaMalloc((void**)&d_A, N * N * sizeof(d_A[0]));
...
/* Initialize the device matrices with the host matrices */
status = cublasSetVector(N * N, sizeof(h_A[0]), h_A, 1, d_A, 1);
...
/* Perform SGEMM: C = alpha*A*B + beta*C */
status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                     &alpha, d_A, N, d_B, N, &beta, d_C, N);

/* Allocate host mem & read back the result from device mem */
h_C = (float*)malloc(N * N * sizeof(h_C[0]));
status = cublasGetVector(N * N, sizeof(h_C[0]), d_C, 1, h_C, 1);

/* Memory clean up */
cudaFree(d_A);
...
/* Shutdown */
status = cublasDestroy(handle);
                                                              cublas
cudaSetDevice( cutGetMaxGflopsDeviceId() );

// Allocate & init. host memory for the signal
Complex* h_signal = (Complex*)malloc(sizeof(Complex) * SIGNAL_SIZE);
...
// Pad signal
Complex* h_padded_signal;
...
// Allocate device memory for signal
Complex* d_signal;
cutilSafeCall( cudaMalloc((void**)&d_signal, mem_size) );
// Copy host memory to device
cutilSafeCall( cudaMemcpy(d_signal, h_padded_signal, mem_size,
                          cudaMemcpyHostToDevice) );

// CUFFT plan
cufftHandle plan;
cufftSafeCall( cufftPlan1d(&plan, new_size, CUFFT_C2C, 1) );

// Transform signal
cufftSafeCall( cufftExecC2C(plan, (cufftComplex *)d_signal,
                            (cufftComplex *)d_signal, CUFFT_FORWARD) );

// Destroy CUFFT context
cufftSafeCall( cufftDestroy(plan) );

// Cleanup memory
cutilSafeCall( cudaFree(d_signal) );
...
cutilDeviceReset();
                                                                       cufft
cusparseHandle_t handle = 0;
cusparseStatus_t status = cusparseCreate(&handle);

// create a matrix description for the matrix M
cusparseMatDescr_t descrM = 0; status = cusparseCreateMatDescr(&descrM);
cusparseSetMatType ( descrM, CUSPARSE_MATRIX_TYPE_TRIANGULAR );
cusparseSetMatIndexBase ( descrM, CUSPARSE_INDEX_BASE_ZERO );
cusparseSetMatDiagType ( descrM, CUSPARSE_DIAG_TYPE_NON_UNIT );
cusparseSetMatFillMode ( descrM, CUSPARSE_FILL_MODE_LOWER );

// create & perform analysis info for the non-trans & trans case
cusparseSolveAnalysisInfo_t info = 0, infoTrans = 0;
cusparseCreateSolveAnalysisInfo(&info);
cusparseCreateSolveAnalysisInfo(&infoTrans);

cusparseScsrsv_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, descrM,
                         d_valsICP, d_rowPtrsICP, d_colIndsICP, info);
cusparseScsrsv_analysis(handle, CUSPARSE_OPERATION_TRANSPOSE, N, descrM,
                         d_valsICP, d_rowPtrsICP, d_colIndsICP, infoTrans);
...
// Solve M z = H H^T z = r by first doing a forward solve: H y = r
cusparseScsrsv_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, 1.0, descrM,
                     d_valsICP, d_rowPtrsICP, d_colIndsICP, info, d_r, d_y);
// and then a back substitution: H^T z = y
cusparseScsrsv_solve(handle, CUSPARSE_OPERATION_TRANSPOSE, N, 1.0, descrM,
                     d_valsICP, d_rowPtrsICP, d_colIndsICP, infoTrans, d_y, d_z);
...
cusparseDestroy(handle);
                                                                     cusparse
cudaError_t cudaResult = cudaSuccess;

// Allocate memory for points
float *d_points = 0;
cudaResult = cudaMalloc((void **)&d_points, 2 * m_numSims * sizeof(float));

// Generate random points in unit square
curandStatus_t curandResult;
curandGenerator_t qrng;

curandResult   =   curandCreateGenerator(&qrng, CURAND_RNG_QUASI_SOBOL32);
curandResult   =   curandSetQuasiRandomGeneratorDimensions(qrng, 2);
curandResult   =   curandSetGeneratorOrdering(qrng, CURAND_ORDERING_QUASI_DEFAULT);
curandResult   =   curandGenerateUniform(qrng, (float *)d_points, 2 * m_numSims);

// Cleanup
curandResult = curandDestroyGenerator(qrng);
cudaFree(d_points);




                                                                          curand
// declare a host image object for an 8-bit grayscale image
npp::ImageCPU_8u_C1 oHostSrc;
// load gray-scale image from disk
npp::loadImage(sFilename, oHostSrc);
// declare a device image and copy from the host image to the device
npp::ImageNPP_8u_C1 oDeviceSrc(oHostSrc);

// create struct with box-filter mask size
NppiSize oMaskSize = {5, 5};
// create struct with ROI size given the current mask
NppiSize oSizeROI = {oDeviceSrc.width() - oMaskSize.width + 1,
                     oDeviceSrc.height() - oMaskSize.height + 1};

// allocate device image of appropriately reduced size
npp::ImageNPP_8u_C1 oDeviceDst(oSizeROI.width, oSizeROI.height);

// set anchor point inside the mask to (0, 0)
NppiPoint oAnchor = {0, 0};
// run box filter
nppiFilterBox_8u_C1R(oDeviceSrc.data(), oDeviceSrc.pitch(),
                     oDeviceDst.data(), oDeviceDst.pitch(),
                     oSizeROI, oMaskSize, oAnchor);

// declare a host image for the result
npp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size());
// and copy the device result data into it
oDeviceDst.copyTo(oHostDst.data(), oHostDst.pitch());
                                                                       npp
Streams

      8
cudaStream_t stream;

cutilSafeCall( cudaStreamCreate(&stream) );

// allocate page-locked (pinned) host memory
cutilSafeCall( cudaHostAlloc((void**)&a, nbytes, cudaHostAllocDefault) );

// allocate device memory
cutilSafeCall( cudaMalloc((void**)&d_a, nbytes) );
cutilSafeCall( cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice,
                               stream) );

// run kernel and copy result back
cutilSafeCall( cudaEventRecord(start, stream) );
kernel<<<N,M,0,stream>>>(d_a, ... );
cutilSafeCall( cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost,
                               stream) );

// free
cudaStreamDestroy(stream);
cudaFreeHost(a);
cudaFree(d_a);


                                                      pinned memory
// loop over full data, in bite-sized chunks
for (int i=0; i<FULL_DATA_SIZE; i+= N) {
    // copy the locked memory to the device, async
    cutilSafeCall( cudaMemcpyAsync(dev_a, host_a+i,
                                   N * sizeof(int),
                                   cudaMemcpyHostToDevice,
                                   stream) );
    cutilSafeCall( cudaMemcpyAsync(dev_b, host_b+i,
                                   N * sizeof(int),
                                   cudaMemcpyHostToDevice,
                                   stream) );

    kernel<<<N/256,256,0,stream>>>(dev_a, dev_b, dev_c);

    // copy the data from device to locked memory
    cutilSafeCall( cudaMemcpyAsync(host_c+i, dev_c,
                                   N * sizeof(int),
                                   cudaMemcpyDeviceToHost,
                                   stream) );
}

// wait for all operations to finish
cutilSafeCall( cudaStreamSynchronize(stream) );

                                  chunked computation
cudaStream_t *streamArray = 0;
streamArray = (cudaStream_t *)malloc(N * sizeof(cudaStream_t));

...
for ( int i = 0; i < N ; i++) {
    cudaStreamCreate(&streamArray[i]);
    ...
}

...
for ( int i = 0; i < N ; i++) {
    cublasSetMatrix (..., devPtrA[i], ...);
    ...
}

...
for ( int i = 0; i < N ; i++) {
    cublasSetStream(handle, streamArray[i]);
    cublasSgemm(handle, ..., devPtrA[i], devPtrB[i], devPtrC[i], ...);
}
cudaThreadSynchronize();


                                         batched computation
•   use it to specify in which order
    operations get executed async.

•   idea is to use more than 1 stream

•   requires a new kind of mem. copy
    which in turn requires pinned
    (page-locked) mem.

•   free pinned mem. when not needed

                                 keep in mind
// Allocate resources
for( int i =0; i<STREAM_COUNT; ++i ) {
    cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault);
    cudaMalloc(&d_data_in[i], memsize);
    ...
}

int current_stream = 0;
// Do processing in a loop...
{
    int next_stream = (current_stream + 1 ) % STREAM_COUNT;
    // Ensure that processing and copying of the last cycle has finished
    cudaEventSynchronize(cycleDone[next_stream]);

    // Process current frame
    kernel<<<grid, block, 0, stream[current_stream]>>>(d_data_out[current_stream],
                                                       d_data_in[current_stream],
                                                       N, ...);
    // Upload next frame
    cudaMemcpyAsync(d_data_in[next_stream], ..., cudaMemcpyHostToDevice,
                    stream[next_stream]);

    // Download current frame
    cudaMemcpyAsync(h_data_out[current_stream], ..., cudaMemcpyDeviceToHost,
                    stream[current_stream]);

    cudaEventRecord(cycleDone[current_stream], stream[current_stream]);
    current_stream = next_stream;
}
                               overlap kernel exec. & memcpy
•   devices with CC 1.1 and above can
    overlap a kernel exec & memcpy as
    long as they are issued from
    different streams

•   kernels are serialized

•   queue work in a way that independent
    streams can execute in parallel (see
    the sketch below)


                                keep in mind
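
One way to queue like that is breadth-first issue order: enqueue each stage
for all streams before moving to the next stage, so the copy and compute
engines always see independent work back to back. A sketch of the chunked
loop above split across two streams (stream0/stream1 and the per-stream
dev_*0/dev_*1 buffers are assumed to be set up as before):

for (int i=0; i<FULL_DATA_SIZE; i+= 2*N) {
    // enqueue the uploads for both streams first...
    cudaMemcpyAsync(dev_a0, host_a+i,   N*sizeof(int), cudaMemcpyHostToDevice, stream0);
    cudaMemcpyAsync(dev_a1, host_a+i+N, N*sizeof(int), cudaMemcpyHostToDevice, stream1);
    cudaMemcpyAsync(dev_b0, host_b+i,   N*sizeof(int), cudaMemcpyHostToDevice, stream0);
    cudaMemcpyAsync(dev_b1, host_b+i+N, N*sizeof(int), cudaMemcpyHostToDevice, stream1);

    // ...then both kernels...
    kernel<<<N/256,256,0,stream0>>>(dev_a0, dev_b0, dev_c0);
    kernel<<<N/256,256,0,stream1>>>(dev_a1, dev_b1, dev_c1);

    // ...then both downloads, rather than finishing stream0 entirely first
    cudaMemcpyAsync(host_c+i,   dev_c0, N*sizeof(int), cudaMemcpyDeviceToHost, stream0);
    cudaMemcpyAsync(host_c+i+N, dev_c1, N*sizeof(int), cudaMemcpyDeviceToHost, stream1);
}

cudaStreamSynchronize(stream0);
cudaStreamSynchronize(stream1);
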
Multi-GPU

       9
float *a, *d_a;
...

/* Allocate mapped CPU memory. */
cutilSafeCall( cudaHostAlloc((void **)&a, bytes, cudaHostAllocMapped) );
...

/* Initialize the vectors. */
for(n = 0; n < nelem; n++) { a[n] = rand() / (float)RAND_MAX; ... }

/* Get the device pointers for the pinned CPU memory mapped into the GPU
    memory space. */
cutilSafeCall( cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0) );
...

/* Call the GPU kernel using the device pointers for the mapped memory. */
...
kernel<<<grid, block>>>(d_a, d_b, d_c, nelem);
...

/* Memory clean up */
cutilSafeCall( cudaFreeHost(a) );
...

                                     zero-copy host memory
//Create streams for issuing GPU commands asynchronously and allocate memory
for(int i = 0; i < GPU_N; i++) {
    cutilSafeCall( cudaSetDevice(i) );
    cutilSafeCall( cudaStreamCreate(&stream[i]) );
    cutilSafeCall( cudaMalloc((void**)&d_Data[i], dataN * sizeof(float)) );
    cutilSafeCall( cudaMallocHost((void**)&h_Data[i], dataN * sizeof(float)) );
    //init h_Data
}

//Copy data to GPU, launch the kernel and copy data back. All asynchronously
for(int i = 0; i < GPU_N; i++) {
    //Set device
    cutilSafeCall( cudaSetDevice(i) );

    // Copy input data from CPU
    cutilSafeCall( cudaMemcpyAsync(d_Data[i], h_Data[i], dataN * sizeof(float),
                                   cudaMemcpyHostToDevice, stream[i]) );

    // Perform GPU computations
    kernel<<<blocks, threads, 0, stream[i]>>>(...)

    // Copy back the result
    cutilSafeCall( cudaMemcpyAsync(h_Sum_from_device[i], d_Sum[i],
                                   ACCUM_N * sizeof(float),
                                   cudaMemcpyDeviceToHost, stream[i]) );
}


                                                                      streams
// Process GPU results
for(i = 0; i < GPU_N; i++) {
    // Set device
    cutilSafeCall( cudaSetDevice(i) );

    // Wait for all operations to finish
    cudaStreamSynchronize(stream[i]);

    // Shut down this GPU
    cutilSafeCall( cudaFreeHost(h_Data[i]) );
    cutilSafeCall( cudaFree(d_Data[i]) );
    cutilSafeCall( cudaStreamDestroy(stream[i]) );
}

// shutdown
for(int i = 0; i < GPU_N; i++) {
    cutilSafeCall( cudaSetDevice(i) );
    cutilDeviceReset();
}




                                  process the result
•   can also control each GPU by a
    separate CPU thread

•   need to assign portable pinned
    memory if a different thread needs
    access to one thread’s memory

•   use the flag cudaHostAllocPortable
    to cudaHostAlloc() (sketched below)


                                keep in mind
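
A minimal sketch of that last point (h_buf and nbytes are placeholder
names): memory allocated with cudaHostAllocPortable is treated as pinned by
all CUDA contexts, so any CPU thread driving any GPU can use it for
asynchronous copies.

float *h_buf;
cutilSafeCall( cudaHostAlloc((void **)&h_buf, nbytes,
                             cudaHostAllocPortable) );

// ... any host thread, on any device, may now pass h_buf to cudaMemcpyAsync ...

cutilSafeCall( cudaFreeHost(h_buf) );
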
// Initialize MPI state
MPI_CHECK( MPI_Init(&argc, &argv) );

// Get our MPI node number and node count
int commSize, commRank;
MPI_CHECK( MPI_Comm_size(MPI_COMM_WORLD, &commSize) );
MPI_CHECK( MPI_Comm_rank(MPI_COMM_WORLD, &commRank) );

if(commRank == 0) {// Are we the root node?
    //initialize dataRoot...
}

// Allocate a buffer on each node
float * dataNode = new float[dataSizePerNode];

// Dispatch a portion of the input data to each node
MPI_CHECK( MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode,
                       dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD) );

// if commRank == 0 then free dataRoot...

kernel<<<gridSize, blockSize>>>(dataNode, ...);

// Reduction to the root node
float sumNode = sum(dataNode, dataSizePerNode);
float sumRoot;
MPI_CHECK( MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0,
                      MPI_COMM_WORLD) );

MPI_CHECK( MPI_Finalize() );                                mpi + cuda
// Enable peer access
cutilSafeCall(cudaSetDevice(gpuid_tesla[0]));
cutilSafeCall(cudaDeviceEnablePeerAccess(gpuid_tesla[1], gpuid_tesla[0]));
...

// Allocate buffers
cudaSetDevice(gpuid_tesla[0]); cudaMalloc(&g0, buf_size);
cudaSetDevice(gpuid_tesla[1]); cudaMalloc(&g1, buf_size);

// Ping-pong copy between GPUs
cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);

// Prepare host buffer and copy to GPU 0
cudaSetDevice(gpuid_tesla[0]); cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault);

// Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
// output to the GPU 1 buffer: dst[idx] = src[idx] * 2.0f
cudaSetDevice(gpuid_tesla[1]); kernel<<<blocks, threads>>>(g0, g1);
cutilDeviceSynchronize();

// Disable peer access (also unregisters memory for non-UVA cases)
cudaSetDevice(gpuid_tesla[0]); cudaDeviceDisablePeerAccess(gpuid_tesla[1]);
cudaSetDevice(gpuid_tesla[1]); cudaDeviceDisablePeerAccess(gpuid_tesla[0]);

cudaFree(g0);
...

                   P2P & unified virtual address space
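
Before enabling peer access it is worth confirming that both GPUs can
actually reach each other; a sketch using the same gpuid_tesla pair as
above:

int can_access_0_1 = 0, can_access_1_0 = 0;
cutilSafeCall( cudaDeviceCanAccessPeer(&can_access_0_1,
                                       gpuid_tesla[0], gpuid_tesla[1]) );
cutilSafeCall( cudaDeviceCanAccessPeer(&can_access_1_0,
                                       gpuid_tesla[1], gpuid_tesla[0]) );

if (can_access_0_1 && can_access_1_0) {
    // safe to call cudaDeviceEnablePeerAccess() as above
}
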
References
Thank you
download slides (2MB pdf) from
  http://bit.ly/cuda-deep-dive

Más contenido relacionado

La actualidad más candente

Nvidia® cuda™ 5 sample evaluationresult_2
Nvidia® cuda™ 5 sample evaluationresult_2Nvidia® cuda™ 5 sample evaluationresult_2
Nvidia® cuda™ 5 sample evaluationresult_2Yukio Saito
 
Writing a Space Shooter with HTML5 Canvas
Writing a Space Shooter with HTML5 CanvasWriting a Space Shooter with HTML5 Canvas
Writing a Space Shooter with HTML5 CanvasSteve Purkis
 
Kato Mivule: An Overview of CUDA for High Performance Computing
Kato Mivule: An Overview of CUDA for High Performance ComputingKato Mivule: An Overview of CUDA for High Performance Computing
Kato Mivule: An Overview of CUDA for High Performance ComputingKato Mivule
 
2011.02.18 marco parenzan - modelli di programmazione per le gpu
2011.02.18   marco parenzan - modelli di programmazione per le gpu2011.02.18   marco parenzan - modelli di programmazione per le gpu
2011.02.18 marco parenzan - modelli di programmazione per le gpuMarco Parenzan
 
OOP for Hardware Verification--Demystified!
OOP for Hardware Verification--Demystified! OOP for Hardware Verification--Demystified!
OOP for Hardware Verification--Demystified! DVClub
 
Clojure ♥ cassandra
Clojure ♥ cassandra Clojure ♥ cassandra
Clojure ♥ cassandra Max Penet
 
Java и Linux — особенности эксплуатации / Алексей Рагозин (Дойче Банк)
Java и Linux — особенности эксплуатации / Алексей Рагозин (Дойче Банк)Java и Linux — особенности эксплуатации / Алексей Рагозин (Дойче Банк)
Java и Linux — особенности эксплуатации / Алексей Рагозин (Дойче Банк)Ontico
 
자바스크립트 비동기 코드(Javascript asyncronous code)
자바스크립트 비동기 코드(Javascript asyncronous code)자바스크립트 비동기 코드(Javascript asyncronous code)
자바스크립트 비동기 코드(Javascript asyncronous code)Kongson Park
 
Encryption Boot Camp on the JVM
Encryption Boot Camp on the JVMEncryption Boot Camp on the JVM
Encryption Boot Camp on the JVMMatthew McCullough
 
MySQL flexible schema and JSON for Internet of Things
MySQL flexible schema and JSON for Internet of ThingsMySQL flexible schema and JSON for Internet of Things
MySQL flexible schema and JSON for Internet of ThingsAlexander Rubin
 
Engineering fast indexes (Deepdive)
Engineering fast indexes (Deepdive)Engineering fast indexes (Deepdive)
Engineering fast indexes (Deepdive)Daniel Lemire
 
Is HTML5 Ready? (workshop)
Is HTML5 Ready? (workshop)Is HTML5 Ready? (workshop)
Is HTML5 Ready? (workshop)Remy Sharp
 
C++ game development with oxygine
C++ game development with oxygineC++ game development with oxygine
C++ game development with oxyginecorehard_by
 
sizeof(Object): how much memory objects take on JVMs and when this may matter
sizeof(Object): how much memory objects take on JVMs and when this may mattersizeof(Object): how much memory objects take on JVMs and when this may matter
sizeof(Object): how much memory objects take on JVMs and when this may matterDawid Weiss
 

La actualidad más candente (19)

Nvidia® cuda™ 5 sample evaluationresult_2
Nvidia® cuda™ 5 sample evaluationresult_2Nvidia® cuda™ 5 sample evaluationresult_2
Nvidia® cuda™ 5 sample evaluationresult_2
 
Writing a Space Shooter with HTML5 Canvas
Writing a Space Shooter with HTML5 CanvasWriting a Space Shooter with HTML5 Canvas
Writing a Space Shooter with HTML5 Canvas
 
3
33
3
 
Kato Mivule: An Overview of CUDA for High Performance Computing
Kato Mivule: An Overview of CUDA for High Performance ComputingKato Mivule: An Overview of CUDA for High Performance Computing
Kato Mivule: An Overview of CUDA for High Performance Computing
 
2011.02.18 marco parenzan - modelli di programmazione per le gpu
2011.02.18   marco parenzan - modelli di programmazione per le gpu2011.02.18   marco parenzan - modelli di programmazione per le gpu
2011.02.18 marco parenzan - modelli di programmazione per le gpu
 
OOP for Hardware Verification--Demystified!
OOP for Hardware Verification--Demystified! OOP for Hardware Verification--Demystified!
OOP for Hardware Verification--Demystified!
 
Clojure ♥ cassandra
Clojure ♥ cassandra Clojure ♥ cassandra
Clojure ♥ cassandra
 
Npc14
Npc14Npc14
Npc14
 
Disruptor
DisruptorDisruptor
Disruptor
 
Ac2
Ac2Ac2
Ac2
 
Java и Linux — особенности эксплуатации / Алексей Рагозин (Дойче Банк)
Java и Linux — особенности эксплуатации / Алексей Рагозин (Дойче Банк)Java и Linux — особенности эксплуатации / Алексей Рагозин (Дойче Банк)
Java и Linux — особенности эксплуатации / Алексей Рагозин (Дойче Банк)
 
자바스크립트 비동기 코드(Javascript asyncronous code)
자바스크립트 비동기 코드(Javascript asyncronous code)자바스크립트 비동기 코드(Javascript asyncronous code)
자바스크립트 비동기 코드(Javascript asyncronous code)
 
The State of JavaScript
The State of JavaScriptThe State of JavaScript
The State of JavaScript
 
Encryption Boot Camp on the JVM
Encryption Boot Camp on the JVMEncryption Boot Camp on the JVM
Encryption Boot Camp on the JVM
 
MySQL flexible schema and JSON for Internet of Things
MySQL flexible schema and JSON for Internet of ThingsMySQL flexible schema and JSON for Internet of Things
MySQL flexible schema and JSON for Internet of Things
 
Engineering fast indexes (Deepdive)
Engineering fast indexes (Deepdive)Engineering fast indexes (Deepdive)
Engineering fast indexes (Deepdive)
 
Is HTML5 Ready? (workshop)
Is HTML5 Ready? (workshop)Is HTML5 Ready? (workshop)
Is HTML5 Ready? (workshop)
 
C++ game development with oxygine
C++ game development with oxygineC++ game development with oxygine
C++ game development with oxygine
 
sizeof(Object): how much memory objects take on JVMs and when this may matter
sizeof(Object): how much memory objects take on JVMs and when this may mattersizeof(Object): how much memory objects take on JVMs and when this may matter
sizeof(Object): how much memory objects take on JVMs and when this may matter
 

Destacado

GPUdb: A Distributed Database for Many-Core Devices
GPUdb: A Distributed Database for Many-Core DevicesGPUdb: A Distributed Database for Many-Core Devices
GPUdb: A Distributed Database for Many-Core Devicesinside-BigData.com
 
FOSDEM 2016: The State of XMPP and Instant Messaging, The Awakening
FOSDEM 2016: The State of XMPP and Instant Messaging, The AwakeningFOSDEM 2016: The State of XMPP and Instant Messaging, The Awakening
FOSDEM 2016: The State of XMPP and Instant Messaging, The AwakeningNyco
 
Nvidia cuda tutorial_no_nda_apr08
Nvidia cuda tutorial_no_nda_apr08Nvidia cuda tutorial_no_nda_apr08
Nvidia cuda tutorial_no_nda_apr08Angela Mendoza M.
 
NVidia CUDA Tutorial - June 15, 2009
NVidia CUDA Tutorial - June 15, 2009NVidia CUDA Tutorial - June 15, 2009
NVidia CUDA Tutorial - June 15, 2009Randall Hand
 
GPU, CUDA, OpenCL and OpenACC for Parallel Applications
GPU, CUDA, OpenCL and OpenACC for Parallel ApplicationsGPU, CUDA, OpenCL and OpenACC for Parallel Applications
GPU, CUDA, OpenCL and OpenACC for Parallel ApplicationsMarcos Gonzalez
 
Gayle Laakmann McDowell - Talent42 2015
Gayle Laakmann McDowell - Talent42 2015Gayle Laakmann McDowell - Talent42 2015
Gayle Laakmann McDowell - Talent42 2015Talent42
 

Destacado (7)

Bootcamp
BootcampBootcamp
Bootcamp
 
GPUdb: A Distributed Database for Many-Core Devices
GPUdb: A Distributed Database for Many-Core DevicesGPUdb: A Distributed Database for Many-Core Devices
GPUdb: A Distributed Database for Many-Core Devices
 
FOSDEM 2016: The State of XMPP and Instant Messaging, The Awakening
FOSDEM 2016: The State of XMPP and Instant Messaging, The AwakeningFOSDEM 2016: The State of XMPP and Instant Messaging, The Awakening
FOSDEM 2016: The State of XMPP and Instant Messaging, The Awakening
 
Nvidia cuda tutorial_no_nda_apr08
Nvidia cuda tutorial_no_nda_apr08Nvidia cuda tutorial_no_nda_apr08
Nvidia cuda tutorial_no_nda_apr08
 
NVidia CUDA Tutorial - June 15, 2009
NVidia CUDA Tutorial - June 15, 2009NVidia CUDA Tutorial - June 15, 2009
NVidia CUDA Tutorial - June 15, 2009
 
GPU, CUDA, OpenCL and OpenACC for Parallel Applications
GPU, CUDA, OpenCL and OpenACC for Parallel ApplicationsGPU, CUDA, OpenCL and OpenACC for Parallel Applications
GPU, CUDA, OpenCL and OpenACC for Parallel Applications
 
Gayle Laakmann McDowell - Talent42 2015
Gayle Laakmann McDowell - Talent42 2015Gayle Laakmann McDowell - Talent42 2015
Gayle Laakmann McDowell - Talent42 2015
 

Similar a CUDA Deep Dive

Intro2 Cuda Moayad
Intro2 Cuda MoayadIntro2 Cuda Moayad
Intro2 Cuda MoayadMoayadhn
 
Etude éducatif sur les GPUs & CPUs et les architectures paralleles -Programmi...
Etude éducatif sur les GPUs & CPUs et les architectures paralleles -Programmi...Etude éducatif sur les GPUs & CPUs et les architectures paralleles -Programmi...
Etude éducatif sur les GPUs & CPUs et les architectures paralleles -Programmi...mouhouioui
 
Tema3_Introduction_to_CUDA_C.pdf
Tema3_Introduction_to_CUDA_C.pdfTema3_Introduction_to_CUDA_C.pdf
Tema3_Introduction_to_CUDA_C.pdfpepe464163
 
An Introduction to CUDA-OpenCL - University.pptx
An Introduction to CUDA-OpenCL - University.pptxAn Introduction to CUDA-OpenCL - University.pptx
An Introduction to CUDA-OpenCL - University.pptxAnirudhGarg35
 
Lecture 6 Kernel Debugging + Ports Development
Lecture 6 Kernel Debugging + Ports DevelopmentLecture 6 Kernel Debugging + Ports Development
Lecture 6 Kernel Debugging + Ports DevelopmentMohammed Farrag
 
ISCA Final Presentaiton - Compilations
ISCA Final Presentaiton -  CompilationsISCA Final Presentaiton -  Compilations
ISCA Final Presentaiton - CompilationsHSA Foundation
 
Conflux: gpgpu for .net (en)
Conflux: gpgpu for .net (en)Conflux: gpgpu for .net (en)
Conflux: gpgpu for .net (en)Andrei Varanovich
 
Linux seccomp(2) vs OpenBSD pledge(2)
Linux seccomp(2) vs OpenBSD pledge(2)Linux seccomp(2) vs OpenBSD pledge(2)
Linux seccomp(2) vs OpenBSD pledge(2)Giovanni Bechis
 
finalprojtemplatev5finalprojtemplate.gitignore# Ignore the b
finalprojtemplatev5finalprojtemplate.gitignore# Ignore the bfinalprojtemplatev5finalprojtemplate.gitignore# Ignore the b
finalprojtemplatev5finalprojtemplate.gitignore# Ignore the bChereCheek752
 
Roll your own toy unix clone os
Roll your own toy unix clone osRoll your own toy unix clone os
Roll your own toy unix clone oseramax
 
introduction to CUDA_C.pptx it is widely used
introduction to CUDA_C.pptx it is widely usedintroduction to CUDA_C.pptx it is widely used
introduction to CUDA_C.pptx it is widely usedHimanshu577858
 
C++ amp on linux
C++ amp on linuxC++ amp on linux
C++ amp on linuxMiller Lee
 
GPU Programming on CPU - Using C++AMP
GPU Programming on CPU - Using C++AMPGPU Programming on CPU - Using C++AMP
GPU Programming on CPU - Using C++AMPMiller Lee
 
Vpu technology &gpgpu computing
Vpu technology &gpgpu computingVpu technology &gpgpu computing
Vpu technology &gpgpu computingArka Ghosh
 
Vpu technology &gpgpu computing
Vpu technology &gpgpu computingVpu technology &gpgpu computing
Vpu technology &gpgpu computingArka Ghosh
 
Vpu technology &gpgpu computing
Vpu technology &gpgpu computingVpu technology &gpgpu computing
Vpu technology &gpgpu computingArka Ghosh
 
Vpu technology &gpgpu computing
Vpu technology &gpgpu computingVpu technology &gpgpu computing
Vpu technology &gpgpu computingArka Ghosh
 

Similar a CUDA Deep Dive (20)

Intro2 Cuda Moayad
Intro2 Cuda MoayadIntro2 Cuda Moayad
Intro2 Cuda Moayad
 
Etude éducatif sur les GPUs & CPUs et les architectures paralleles -Programmi...
Etude éducatif sur les GPUs & CPUs et les architectures paralleles -Programmi...Etude éducatif sur les GPUs & CPUs et les architectures paralleles -Programmi...
Etude éducatif sur les GPUs & CPUs et les architectures paralleles -Programmi...
 
Tema3_Introduction_to_CUDA_C.pdf
Tema3_Introduction_to_CUDA_C.pdfTema3_Introduction_to_CUDA_C.pdf
Tema3_Introduction_to_CUDA_C.pdf
 
An Introduction to CUDA-OpenCL - University.pptx
An Introduction to CUDA-OpenCL - University.pptxAn Introduction to CUDA-OpenCL - University.pptx
An Introduction to CUDA-OpenCL - University.pptx
 
Lecture 6 Kernel Debugging + Ports Development
Lecture 6 Kernel Debugging + Ports DevelopmentLecture 6 Kernel Debugging + Ports Development
Lecture 6 Kernel Debugging + Ports Development
 
ISCA Final Presentaiton - Compilations
ISCA Final Presentaiton -  CompilationsISCA Final Presentaiton -  Compilations
ISCA Final Presentaiton - Compilations
 
Conflux:gpgpu for .net (en)
Conflux:gpgpu for .net (en)Conflux:gpgpu for .net (en)
Conflux:gpgpu for .net (en)
 
Conflux: gpgpu for .net (en)
Conflux: gpgpu for .net (en)Conflux: gpgpu for .net (en)
Conflux: gpgpu for .net (en)
 
Linux seccomp(2) vs OpenBSD pledge(2)
Linux seccomp(2) vs OpenBSD pledge(2)Linux seccomp(2) vs OpenBSD pledge(2)
Linux seccomp(2) vs OpenBSD pledge(2)
 
finalprojtemplatev5finalprojtemplate.gitignore# Ignore the b
finalprojtemplatev5finalprojtemplate.gitignore# Ignore the bfinalprojtemplatev5finalprojtemplate.gitignore# Ignore the b
finalprojtemplatev5finalprojtemplate.gitignore# Ignore the b
 
Lecture 04
Lecture 04Lecture 04
Lecture 04
 
Roll your own toy unix clone os
Roll your own toy unix clone osRoll your own toy unix clone os
Roll your own toy unix clone os
 
introduction to CUDA_C.pptx it is widely used
introduction to CUDA_C.pptx it is widely usedintroduction to CUDA_C.pptx it is widely used
introduction to CUDA_C.pptx it is widely used
 
C++ amp on linux
C++ amp on linuxC++ amp on linux
C++ amp on linux
 
GPU Programming on CPU - Using C++AMP
GPU Programming on CPU - Using C++AMPGPU Programming on CPU - Using C++AMP
GPU Programming on CPU - Using C++AMP
 
Cuda 3
Cuda 3Cuda 3
Cuda 3
 
Vpu technology &gpgpu computing
Vpu technology &gpgpu computingVpu technology &gpgpu computing
Vpu technology &gpgpu computing
 
Vpu technology &gpgpu computing
Vpu technology &gpgpu computingVpu technology &gpgpu computing
Vpu technology &gpgpu computing
 
Vpu technology &gpgpu computing
Vpu technology &gpgpu computingVpu technology &gpgpu computing
Vpu technology &gpgpu computing
 
Vpu technology &gpgpu computing
Vpu technology &gpgpu computingVpu technology &gpgpu computing
Vpu technology &gpgpu computing
 

Último

How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerThousandEyes
 
The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024Rafal Los
 
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot TakeoffStrategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoffsammart93
 
Understanding Discord NSFW Servers A Guide for Responsible Users.pdf
Understanding Discord NSFW Servers A Guide for Responsible Users.pdfUnderstanding Discord NSFW Servers A Guide for Responsible Users.pdf
Understanding Discord NSFW Servers A Guide for Responsible Users.pdfUK Journal
 
Bajaj Allianz Life Insurance Company - Insurer Innovation Award 2024
Bajaj Allianz Life Insurance Company - Insurer Innovation Award 2024Bajaj Allianz Life Insurance Company - Insurer Innovation Award 2024
Bajaj Allianz Life Insurance Company - Insurer Innovation Award 2024The Digital Insurer
 
Developing An App To Navigate The Roads of Brazil
Developing An App To Navigate The Roads of BrazilDeveloping An App To Navigate The Roads of Brazil
Developing An App To Navigate The Roads of BrazilV3cube
 
A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)Gabriella Davis
 
Advantages of Hiring UIUX Design Service Providers for Your Business
Advantages of Hiring UIUX Design Service Providers for Your BusinessAdvantages of Hiring UIUX Design Service Providers for Your Business
Advantages of Hiring UIUX Design Service Providers for Your BusinessPixlogix Infotech
 
Tech Trends Report 2024 Future Today Institute.pdf
Tech Trends Report 2024 Future Today Institute.pdfTech Trends Report 2024 Future Today Institute.pdf
Tech Trends Report 2024 Future Today Institute.pdfhans926745
 
TrustArc Webinar - Unlock the Power of AI-Driven Data Discovery
TrustArc Webinar - Unlock the Power of AI-Driven Data DiscoveryTrustArc Webinar - Unlock the Power of AI-Driven Data Discovery
TrustArc Webinar - Unlock the Power of AI-Driven Data DiscoveryTrustArc
 
Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024The Digital Insurer
 
Apidays New York 2024 - The value of a flexible API Management solution for O...
Apidays New York 2024 - The value of a flexible API Management solution for O...Apidays New York 2024 - The value of a flexible API Management solution for O...
Apidays New York 2024 - The value of a flexible API Management solution for O...apidays
 
Boost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivityBoost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivityPrincipled Technologies
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsMaria Levchenko
 
Histor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slideHistor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slidevu2urc
 
Powerful Google developer tools for immediate impact! (2023-24 C)
Powerful Google developer tools for immediate impact! (2023-24 C)Powerful Google developer tools for immediate impact! (2023-24 C)
Powerful Google developer tools for immediate impact! (2023-24 C)wesley chun
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerThousandEyes
 
2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...Martijn de Jong
 
Artificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsArtificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsJoaquim Jorge
 

Último (20)

How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected Worker
 
The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024
 
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot TakeoffStrategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
 
Understanding Discord NSFW Servers A Guide for Responsible Users.pdf
Understanding Discord NSFW Servers A Guide for Responsible Users.pdfUnderstanding Discord NSFW Servers A Guide for Responsible Users.pdf
Understanding Discord NSFW Servers A Guide for Responsible Users.pdf
 
Bajaj Allianz Life Insurance Company - Insurer Innovation Award 2024
Bajaj Allianz Life Insurance Company - Insurer Innovation Award 2024Bajaj Allianz Life Insurance Company - Insurer Innovation Award 2024
Bajaj Allianz Life Insurance Company - Insurer Innovation Award 2024
 
Developing An App To Navigate The Roads of Brazil
Developing An App To Navigate The Roads of BrazilDeveloping An App To Navigate The Roads of Brazil
Developing An App To Navigate The Roads of Brazil
 
A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)
 
Advantages of Hiring UIUX Design Service Providers for Your Business
Advantages of Hiring UIUX Design Service Providers for Your BusinessAdvantages of Hiring UIUX Design Service Providers for Your Business
Advantages of Hiring UIUX Design Service Providers for Your Business
 
Tech Trends Report 2024 Future Today Institute.pdf
Tech Trends Report 2024 Future Today Institute.pdfTech Trends Report 2024 Future Today Institute.pdf
Tech Trends Report 2024 Future Today Institute.pdf
 
TrustArc Webinar - Unlock the Power of AI-Driven Data Discovery
TrustArc Webinar - Unlock the Power of AI-Driven Data DiscoveryTrustArc Webinar - Unlock the Power of AI-Driven Data Discovery
TrustArc Webinar - Unlock the Power of AI-Driven Data Discovery
 
Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024
 
Apidays New York 2024 - The value of a flexible API Management solution for O...
Apidays New York 2024 - The value of a flexible API Management solution for O...Apidays New York 2024 - The value of a flexible API Management solution for O...
Apidays New York 2024 - The value of a flexible API Management solution for O...
 
Boost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivityBoost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivity
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed texts
 
Histor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slideHistor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slide
 
+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
 
Powerful Google developer tools for immediate impact! (2023-24 C)
Powerful Google developer tools for immediate impact! (2023-24 C)Powerful Google developer tools for immediate impact! (2023-24 C)
Powerful Google developer tools for immediate impact! (2023-24 C)
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected Worker
 
2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...
 
Artificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsArtificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and Myths
 

CUDA Deep Dive

  • 1. CUDA Deep Dive Kashif Rasul @krasul
  • 5. #include <cutil_inline.h> int main( void ) { int N = 50000; size_t size = N * sizeof(float); cudaSetDevice( cutGetMaxGflopsDeviceId() ); ... cutilSafeCall( cudaMalloc((void**)&d_A, size) ); cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) ); ... int threadsPerBlock = 256; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; add<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N); ... cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) ); cudaFree(d_A); ... cutilDeviceReset(); } blocks & threads
  • 6. __global__ void dot( float *a, float *b, float *c ) { __shared__ float cache[threadsPerBlock]; int cacheIndex = threadIdx.x; ... // set the cache values cache[cacheIndex] = temp; // synchronize threads in this block __syncthreads(); ... } int main( void ) { ... dot<<<blocksPerGrid,threadsPerBlock>>>( d_a, d_b, d_c ); ... } shared memory
  • 7. thread coop. & shared mem. useful for reduction algorithms • avoid race conditions by using __syncthreads() • avoid bank conflicts • every thread in the block needs to call __syncthreads() keep in mind
  • 8. Memory 2
  • 9. __constant__ float constFloat; __device__ float getConstFloat() { return constFloat; } __global__ void addConstant(float *vec, int N) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i<N) vec[i] += getConstFloat(); } #include <cutil_inline.h> int main( int argc, char** argv) { float constValue = 4.0f; cutilSafeCall( cudaMemcpyToSymbol(constFloat, &constValue, sizeof(float), 0, cudaMemcpyHostToDevice) ); ... } constant mem.
  • 10. read-only, but conserves mem. bandwidth • a single read can be broadcasted and cached for additional reads • painfully slow when each thread reads a different address from constant memory keep in mind
  • 11. // textures containing look-up tables texture<uint> edgeTex; texture<uint, 2> edge2dTex; int main(int argc, char** argv) { ... cutilSafeCall( cudaMalloc((void**) d_edgeTable, 256*sizeof(uint)) ); cutilSafeCall( cudaMemcpy((void *)*d_edgeTable, (void *)edgeTable, 256*sizeof(uint), cudaMemcpyHostToDevice) ); cutilSafeCall( cudaBindTexture(0, edgeTex, *d_edgeTable, 256*sizeof(uint)) ); // run kernel kernel<<<blocks, threads>>>(...) //cleanup cutilSafeCall( cudaUnbindTexture(edgeTex) ); } __global__ void kernel(...) { ... uint edge = tex1Dfetch(edgeTex, index*16 + i); ... } texture mem.
  • 12. read-only, like for const. mem. • great when memory access exhibits spatial locality, i.e. each thread reads a loc. near where the next or previous thread reads • comes in 1-D, 2-D and 3-D versions & typically used in finite diff. apps keep in mind
  • 13. surface<void, 2> output_surface; __global__ void surfaceWrite(float* g_idata, int width, int height) { // calculate surface coordinates unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; // read from global memory and write to cuarray (via surface reference) surf2Dwrite(g_idata[y*width+x], output_surface, x*4, y, cudaBoundaryModeTrap); } int main( int argc, char** argv) { ... cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); cudaArray* cu_array; cutilSafeCall( cudaMallocArray(&cu_array, &channelDesc, width, height, cudaArraySurfaceLoadStore) ); cutilSafeCall( cudaMemcpy( d_data, h_data, size, cudaMemcpyHostToDevice) ); cutilSafeCall( cudaBindSurfaceToArray(output_surface, cu_array) ); surfaceWrite<<<dimGrid, dimBlock>>>(d_data, width, height); ... cutilSafeCall( cudaFree(d_data) ); cutilSafeCall( cudaFreeArray(cu_array) ); } surface mem.
InterOp.

       3
// OpenGL Graphics includes
#include <GL/glew.h>
#if defined (__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/freeglut.h>
#endif

int main(int argc, char **argv)
{
    // Initialize GL
    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB);
    glutInitWindowSize(1000, 1000);

    // Create a window with rendering context and all else we need
    glutCreateWindow("CUDA Interop.");

    // initialize necessary OpenGL extensions
    glewInit();

    // Select CUDA device with OpenGL interoperability
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "device")) {
        cutilGLDeviceInit(argc, argv);
    } else {
        cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
    }
}

                                 set device
// vbo variables
GLuint vbo;
struct cudaGraphicsResource *cuda_vbo_resource;
void *d_vbo_buffer = NULL;

// create buffer object
glGenBuffers(1, &vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);

// initialize buffer object
unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);
glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, 0);

// register this buffer object with CUDA
cutilSafeCall( cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo,
                                            cudaGraphicsMapFlagsWriteDiscard) );

                                 register data with CUDA
// map OpenGL buffer object for writing from CUDA
float4 *dptr;
cutilSafeCall( cudaGraphicsMapResources(1, &cuda_vbo_resource, 0) );
size_t num_bytes;
cutilSafeCall( cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes,
                                                    cuda_vbo_resource) );

// run kernel
kernel<<<blocks,threads>>>(dptr, ...);

// unmap buffer object
cutilSafeCall( cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0) );

                                 pass data via shared buffers
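For completeness, one possible kernel behind the launch above, modelled loosely on the SDK's simpleGL sine wave (the time parameter and the 2-D launch layout are assumptions):

// hypothetical kernel: writes an animated height field into the mapped VBO
__global__ void kernel(float4 *pos, unsigned int width, unsigned int height,
                       float time)
{
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

    // normalized coordinates in [0,1]
    float u = x / (float)width;
    float v = y / (float)height;

    // a simple travelling wave as the vertex height
    float w = sinf(u*10.0f + time) * cosf(v*10.0f + time) * 0.5f;

    // write the vertex directly into the OpenGL buffer
    pos[y*width + x] = make_float4(u*2.0f - 1.0f, w, v*2.0f - 1.0f, 1.0f);
}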
• need to tell the CUDA runtime the device we intend to use for CUDA and OpenGL

• initialize OpenGL first and then use the cudaGLSetGLDevice() method

• DirectX interop. is nearly identical

                                 keep in mind
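After initialization, the per-frame work ties the previous two slides together. A minimal sketch of a GLUT display callback, assuming the globals above plus a hypothetical g_time animation variable: map the VBO, run the kernel, unmap, then let OpenGL draw from the same buffer:

// a minimal sketch of the per-frame loop, assuming the globals above
void display()
{
    // map, compute, unmap: the VBO must be unmapped before GL uses it
    float4 *dptr;
    cutilSafeCall( cudaGraphicsMapResources(1, &cuda_vbo_resource, 0) );
    size_t num_bytes;
    cutilSafeCall( cudaGraphicsResourceGetMappedPointer((void **)&dptr,
                                                        &num_bytes,
                                                        cuda_vbo_resource) );
    dim3 block(8, 8, 1);
    dim3 grid(mesh_width/block.x, mesh_height/block.y, 1);
    kernel<<<grid, block>>>(dptr, mesh_width, mesh_height, g_time);
    cutilSafeCall( cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0) );

    // render the VBO contents as points
    glClear(GL_COLOR_BUFFER_BIT);
    glBindBuffer(GL_ARRAY_BUFFER, vbo);
    glVertexPointer(4, GL_FLOAT, 0, 0);
    glEnableClientState(GL_VERTEX_ARRAY);
    glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height);
    glDisableClientState(GL_VERTEX_ARRAY);
    glutSwapBuffers();

    // advance the animation and ask GLUT for the next frame
    g_time += 0.01f;
    glutPostRedisplay();
}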
Pro Tip

       4
➜ git clone https://github.com/kashif/cuda-workshop.git
Cloning into cuda-workshop...
...
➜ cd cuda-workshop
➜ cmake CMakeLists.txt
-- The C compiler identification is GNU
...
➜ make
Scanning dependencies of target cutil
[  5%] Building CXX object cutil/CMakeFiles/cutil.dir/src/bank_checker.cpp.o
...
[100%] Built target matrixMul
➜ ./bin/matrixMul
[ matrixMul ] bin/matrixMul Starting (CUDA and CUBLAS tests)...

Device 0: "GeForce GTX 480" with Compute 2.0 capability
...

                                 install CMake, glut & glew
➜ ls src/matrixMul
CMakeLists.txt  matrixMul.cu  matrixMul.h
matrixMul_gold.cpp  matrixMul_kernel.cu
➜ cat src/matrixMul/CMakeLists.txt
CUDA_ADD_EXECUTABLE( matrixMul
  matrixMul.cu
  matrixMul_gold.cpp
)
TARGET_LINK_LIBRARIES( matrixMul
  cutil
  shrutil
  ${CUDA_CUBLAS_LIBRARIES}
)
➜ cmake -G "Visual Studio 10 Win64" CMakeLists.txt
...

                                 great for experimenting
cudaEvent_t start, stop;
float time;

// initialize events
cutilSafeCall( cudaEventCreate(&start) );
cutilSafeCall( cudaEventCreate(&stop) );

// warmup to avoid timing startup
kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y, 1);

// take measurements for loop over kernel launches
cutilSafeCall( cudaEventRecord(start, 0) );
for (int i=0; i < NUM_REPS; i++)
{
    kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y, 1);
    // Ensure no launch failure
    cutilSafeCall( cudaGetLastError() );
}
cutilSafeCall( cudaEventRecord(stop, 0) );
cutilSafeCall( cudaEventSynchronize(stop) );
cutilSafeCall( cudaEventElapsedTime(&time, start, stop) );

// report effective bandwidth in GB/s
// (2.0f due to read + write; 1000.0f since elapsed time is in ms)
float bandwidth = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(time/NUM_REPS);

cutilSafeCall( cudaEventDestroy(stop) );
cutilSafeCall( cudaEventDestroy(start) );

                                 events: GPU timestamp
#include <cutil_inline.h>
...
unsigned int timer_matrixMul = 0;
cutilCheckError( cutCreateTimer(&timer_matrixMul) );

// start timing
cutilCheckError( cutStartTimer(timer_matrixMul) );

// do some work
kernel<<<grid, threads, mem_size>>>(d_idata, d_odata);
cutilDeviceSynchronize();

// stop timer
cutilCheckError( cutStopTimer(timer_matrixMul) );

double dSeconds = cutGetTimerValue(timer_matrixMul)/((double)nIter * 1000.0);
double dNumOps = 2.0 * (double)uiWA * (double)uiHA * (double)uiWB;
double gflops = 1.0e-9 * dNumOps/dSeconds;

// destroy timer
cutilCheckError( cutDeleteTimer(timer_matrixMul) );

                                 os timers
• creating and recording events is tricky since some CUDA calls are asynch.

• all kernel launches are asynch.

• instruct the CPU to synch. on an event via cudaEventSynchronize() (see the sketch below)

                                 keep in mind
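A small sketch of the pattern (kernel and launch configuration are placeholders): a launch returns immediately, so the CPU must block on a recorded event before trusting timers or results:

// a minimal sketch: why async launches break naive CPU timing
kernel<<<grid, threads>>>(d_data);  // returns immediately -- launch is async
// ...a CPU timer read here would only measure launch overhead...

// correct: record an event after the work and block the CPU on it
cudaEvent_t done;
cutilSafeCall( cudaEventCreate(&done) );
cutilSafeCall( cudaEventRecord(done, 0) );
cutilSafeCall( cudaEventSynchronize(done) );  // CPU waits until 'done' fires
// now the kernel has really finished; safe to read CPU timers or results
cutilSafeCall( cudaEventDestroy(done) );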
Bindings

       6
➜ cat hello_gpu.py
import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
import numpy
import numpy.linalg as la
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")

multiply_them = mod.get_function("multiply_them")

a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
dest = numpy.zeros_like(a)

multiply_them(
    drv.Out(dest), drv.In(a), drv.In(b),
    block=(400,1,1))

print dest-a*b

                                 pycuda
➜ python hello_gpu.py
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  ...
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
// Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);

// Create the PTX file by calling the NVCC and load it
String ptxFileName = preparePtxFile("JCudaVectorAddKernel.cu");
CUmodule module = new CUmodule();
cuModuleLoad(module, ptxFileName);

// Obtain a function pointer to the "add" function.
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "add");

// Allocate the device input data
float hostInputA[] = new float[numElements];
CUdeviceptr deviceInputA = new CUdeviceptr();
cuMemAlloc(deviceInputA, numElements * Sizeof.FLOAT);
cuMemcpyHtoD(deviceInputA, Pointer.to(hostInputA),
             numElements * Sizeof.FLOAT);
...

// Set up the kernel parameters
Pointer kernelParameters = Pointer.to(Pointer.to(deviceInputA), ...);

// Call the kernel function
int blockSizeX = 256;
int gridSizeX = (int)Math.ceil((double)numElements / blockSizeX);
cuLaunchKernel(function,
    gridSizeX, 1, 1,          // Grid dimension
    blockSizeX, 1, 1,         // Block dimension
    0, null,                  // Shared memory size and stream
    kernelParameters, null);  // Kernel- and extra parameters
cuCtxSynchronize();

                                 jcuda
➜ ls
License.txt                         jcuda-0.4.0-beta1.jar
jcurand-0.4.0-beta1.jar             libJCublas-apple-x86_64.dylib
libJCudaRuntime-apple-x86_64.dylib  libJCurand-apple-x86_64.dylib
jcublas-0.4.0-beta1.jar             jcufft-0.4.0-beta1.jar
jcusparse-0.4.0-beta1.jar           libJCudaDriver-apple-x86_64.dylib
libJCufft-apple-x86_64.dylib        libJCusparse-apple-x86_64.dylib
JCudaVectorAdd.java                 JCudaVectorAddKernel.cu
➜ cat JCudaVectorAddKernel.cu
extern "C"
__global__ void add(float *a, float *b, float *sum, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i<n)
    {
        sum[i] = a[i] + b[i];
    }
}
➜ javac -classpath jcuda-0.4.0-beta1.jar JCudaVectorAdd.java
➜ java -classpath jcuda-0.4.0-beta1.jar:. JCudaVectorAdd
Executing
nvcc -m64 -ptx JCudaVectorAddKernel.cu -o JCudaVectorAddKernel.ptx
Finished creating PTX file
Test PASSED
➜ gem install sgc-ruby-cuda
Successfully installed sgc-ruby-cuda-0.1.1
1 gem installed
➜ cat vector_add.rb
...
# Prepare and load vadd kernel.
kernel_lib_file = compile(vadd_kernel_src)
CudaFunction.load_lib_file(kernel_lib_file.path)

# Copy input buffers from host memory to device memory.
memcpy_htod(da, ha, nbytes)
memcpy_htod(db, hb, nbytes)

# Invoke vadd kernel.
nthreads_per_block = 256
block_dim = Dim3.new(nthreads_per_block, 1, 1)
grid_dim = Dim3.new((N + nthreads_per_block - 1) / nthreads_per_block, 1, 1)
CudaFunction.configure(block_dim, grid_dim)
CudaFunction.setup(da, db, dc, N)
f = CudaFunction.new("vadd")
f.launch

# Copy output buffer from device memory to host memory.
memcpy_dtoh(hc, dc, nbytes)
...
➜ ruby vector_add.rb
Vector Addition
Verification completed. All matches? YES

                                 ruby-cuda
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);

float* h_A = (float*)malloc(N * N * sizeof(h_A[0]));
...
/* Fill the matrices with test data */
...
/* Allocate device memory for the matrices */
cudaMalloc((void**)&d_A, N * N * sizeof(d_A[0]));
...
/* Initialize the device matrices with the host matrices */
status = cublasSetVector(N * N, sizeof(h_A[0]), h_A, 1, d_A, 1);
...
/* Performs Sgemm: C <- alpha*A*B + beta*C */
status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                     &alpha, d_A, N, d_B, N, &beta, d_C, N);

/* Allocate host mem & read back the result from device mem */
h_C = (float*)malloc(N * N * sizeof(h_C[0]));
status = cublasGetVector(N * N, sizeof(h_C[0]), d_C, 1, h_C, 1);

/* Memory clean up */
cudaFree(d_A);
...
/* Shutdown */
status = cublasDestroy(handle);

                                 cublas
cudaSetDevice( cutGetMaxGflopsDeviceId() );

// Allocate & init. host memory for the signal
Complex* h_signal = (Complex*)malloc(sizeof(Complex) * SIGNAL_SIZE);
...
// Pad signal
Complex* h_padded_signal;
...
// Allocate device memory for signal
Complex* d_signal;
cutilSafeCall( cudaMalloc((void**)&d_signal, mem_size) );

// Copy host memory to device
cutilSafeCall( cudaMemcpy(d_signal, h_padded_signal, mem_size,
                          cudaMemcpyHostToDevice) );

// CUFFT plan
cufftHandle plan;
cufftSafeCall( cufftPlan1d(&plan, new_size, CUFFT_C2C, 1) );

// Transform signal
cufftSafeCall( cufftExecC2C(plan, (cufftComplex *)d_signal,
                            (cufftComplex *)d_signal, CUFFT_FORWARD) );

// Destroy CUFFT context
cufftSafeCall( cufftDestroy(plan) );

// Cleanup memory
cutilSafeCall( cudaFree(d_signal) );
...
cutilDeviceReset();

                                 cufft
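Worth noting when experimenting with CUFFT: transforms are unnormalized, so a forward followed by an inverse transform scales the data by its length. A sketch of the round trip, reusing plan and d_signal from above (scale is a placeholder kernel you would supply):

// round trip: forward FFT followed by inverse FFT, in place
cufftSafeCall( cufftExecC2C(plan, (cufftComplex *)d_signal,
                            (cufftComplex *)d_signal, CUFFT_FORWARD) );
cufftSafeCall( cufftExecC2C(plan, (cufftComplex *)d_signal,
                            (cufftComplex *)d_signal, CUFFT_INVERSE) );

// CUFFT is unnormalized: the buffer now holds new_size * original signal,
// so divide every element by new_size, e.g. with a small scaling kernel
scale<<<blocks, threads>>>(d_signal, new_size, 1.0f / new_size);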
cusparseHandle_t handle = 0;
cusparseStatus_t status = cusparseCreate(&handle);

// create a matrix description for the matrix M
cusparseMatDescr_t descrM = 0;
status = cusparseCreateMatDescr(&descrM);
cusparseSetMatType      ( descrM, CUSPARSE_MATRIX_TYPE_TRIANGULAR );
cusparseSetMatIndexBase ( descrM, CUSPARSE_INDEX_BASE_ZERO );
cusparseSetMatDiagType  ( descrM, CUSPARSE_DIAG_TYPE_NON_UNIT );
cusparseSetMatFillMode  ( descrM, CUSPARSE_FILL_MODE_LOWER );

// create & perform analysis info for the non-trans & trans case
cusparseSolveAnalysisInfo_t info = 0, infoTrans = 0;
cusparseCreateSolveAnalysisInfo(&info);
cusparseCreateSolveAnalysisInfo(&infoTrans);
cusparseScsrsv_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, descrM,
                        d_valsICP, d_rowPtrsICP, d_colIndsICP, info);
cusparseScsrsv_analysis(handle, CUSPARSE_OPERATION_TRANSPOSE, N, descrM,
                        d_valsICP, d_rowPtrsICP, d_colIndsICP, infoTrans);
...
// Solve M z = H H^T z = r by first doing a forward solve: H y = r
cusparseScsrsv_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, 1.0, descrM,
                     d_valsICP, d_rowPtrsICP, d_colIndsICP, info, d_r, d_y);

// and then a back substitution: H^T z = y
cusparseScsrsv_solve(handle, CUSPARSE_OPERATION_TRANSPOSE, N, 1.0, descrM,
                     d_valsICP, d_rowPtrsICP, d_colIndsICP, infoTrans,
                     d_y, d_z);
...
cusparseDestroy(handle);

                                 cusparse
cudaError_t cudaResult = cudaSuccess;

// Allocate memory for points
float *d_points = 0;
cudaResult = cudaMalloc((void **)&d_points, 2 * m_numSims * sizeof(float));

// Generate random points in unit square
curandStatus_t curandResult;
curandGenerator_t qrng;
curandResult = curandCreateGenerator(&qrng, CURAND_RNG_QUASI_SOBOL32);
curandResult = curandSetQuasiRandomGeneratorDimensions(qrng, 2);
curandResult = curandSetGeneratorOrdering(qrng, CURAND_ORDERING_QUASI_DEFAULT);
curandResult = curandGenerateUniform(qrng, (float *)d_points, 2 * m_numSims);

// Cleanup
curandResult = curandDestroyGenerator(qrng);
cudaFree(d_points);

                                 curand
// declare a host image object for an 8-bit grayscale image
npp::ImageCPU_8u_C1 oHostSrc;
// load gray-scale image from disk
npp::loadImage(sFilename, oHostSrc);
// declare a device image and copy from the host image to the device
npp::ImageNPP_8u_C1 oDeviceSrc(oHostSrc);

// create struct with box-filter mask size
NppiSize oMaskSize = {5, 5};
// create struct with ROI size given the current mask
NppiSize oSizeROI = {oDeviceSrc.width()  - oMaskSize.width  + 1,
                     oDeviceSrc.height() - oMaskSize.height + 1};
// allocate device image of appropriately reduced size
npp::ImageNPP_8u_C1 oDeviceDst(oSizeROI.width, oSizeROI.height);
// set anchor point inside the mask to (0, 0)
NppiPoint oAnchor = {0, 0};

// run box filter
nppiFilterBox_8u_C1R(oDeviceSrc.data(), oDeviceSrc.pitch(),
                     oDeviceDst.data(), oDeviceDst.pitch(),
                     oSizeROI, oMaskSize, oAnchor);

// declare a host image for the result
npp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size());
// and copy the device result data into it
oDeviceDst.copyTo(oHostDst.data(), oHostDst.pitch());

                                 npp
Streams

       8
cudaStream_t stream;
cutilSafeCall( cudaStreamCreate(&stream) );

// allocate page-locked memory
cutilSafeCall( cudaHostAlloc((void**)&a, nbytes, cudaHostAllocDefault) );

// allocate device memory
cutilSafeCall( cudaMalloc((void**)&d_a, nbytes) );

cutilSafeCall( cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice,
                               stream) );

// run kernel and copy result back
cutilSafeCall( cudaEventRecord(start, stream) );
kernel<<<N,M,0,stream>>>(d_a, ...);
cutilSafeCall( cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost,
                               stream) );

// free
cudaStreamDestroy(stream);
cudaFreeHost(a);
cudaFree(d_a);

                                 pinned memory
// loop over full data, in bite-sized chunks
for (int i=0; i<FULL_DATA_SIZE; i+= N) {
    // copy the locked memory to the device, async
    cutilSafeCall( cudaMemcpyAsync(dev_a, host_a+i, N * sizeof(int),
                                   cudaMemcpyHostToDevice, stream) );
    cutilSafeCall( cudaMemcpyAsync(dev_b, host_b+i, N * sizeof(int),
                                   cudaMemcpyHostToDevice, stream) );

    kernel<<<N/256,256,0,stream>>>(dev_a, dev_b, dev_c);

    // copy the data from device to locked memory
    cutilSafeCall( cudaMemcpyAsync(host_c+i, dev_c, N * sizeof(int),
                                   cudaMemcpyDeviceToHost, stream) );
}

// wait for all operations to finish
cutilSafeCall( cudaStreamSynchronize(stream) );

                                 chunked computation
cudaStream_t *streamArray = 0;
streamArray = (cudaStream_t *)malloc(N * sizeof(cudaStream_t));
...
for (int i = 0; i < N; i++)
{
    cudaStreamCreate(&streamArray[i]);
    ...
}
...
for (int i = 0; i < N; i++)
{
    cublasSetMatrix(..., devPtrA[i], ...);
    ...
}
...
for (int i = 0; i < N; i++)
{
    cublasSetStream(handle, streamArray[i]);
    cublasSgemm(handle, ..., devPtrA[i], devPtrB[i], devPtrC[i], ...);
}
cudaThreadSynchronize();

                                 batched computation
• use streams to specify the order in which async. operations get executed

• the idea is to use more than one stream (see the two-stream sketch below)

• requires a new kind of memory copy, which in turn requires pinned, i.e. page-locked, host memory

• free pinned memory when it is no longer needed

                                 keep in mind
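A minimal two-stream sketch of that idea, assuming pinned host arrays host_a/host_c and per-stream device buffers dev_a[2]/dev_c[2] (all placeholder names); each chunk's copies and kernel are issued to alternating streams so copies and computation of different chunks can overlap:

// a minimal sketch: alternate chunks between two streams
cudaStream_t s[2];
for (int i = 0; i < 2; i++)
    cutilSafeCall( cudaStreamCreate(&s[i]) );

for (int i = 0; i < FULL_DATA_SIZE; i += N) {
    int j = (i / N) % 2;   // which stream handles this chunk
    cutilSafeCall( cudaMemcpyAsync(dev_a[j], host_a + i, N * sizeof(int),
                                   cudaMemcpyHostToDevice, s[j]) );
    kernel<<<N/256, 256, 0, s[j]>>>(dev_a[j], dev_c[j]);
    cutilSafeCall( cudaMemcpyAsync(host_c + i, dev_c[j], N * sizeof(int),
                                   cudaMemcpyDeviceToHost, s[j]) );
}

// drain both streams before using the results
for (int i = 0; i < 2; i++) {
    cutilSafeCall( cudaStreamSynchronize(s[i]) );
    cudaStreamDestroy(s[i]);
}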
// Allocate resources
for (int i = 0; i < STREAM_COUNT; ++i)
{
    cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault);
    cudaMalloc(&d_data_in[i], memsize);
    ...
}

int current_stream = 0;

// Do processing in a loop...
{
    int next_stream = (current_stream + 1) % STREAM_COUNT;

    // Ensure that processing and copying of the last cycle has finished
    cudaEventSynchronize(cycleDone[next_stream]);

    // Process current frame
    kernel<<<grid, block, 0, stream[current_stream]>>>(
        d_data_out[current_stream], d_data_in[current_stream], N, ...);

    // Upload next frame
    cudaMemcpyAsync(d_data_in[next_stream], ...,
                    cudaMemcpyHostToDevice, stream[next_stream]);

    // Download current frame
    cudaMemcpyAsync(h_data_out[current_stream], ...,
                    cudaMemcpyDeviceToHost, stream[current_stream]);

    cudaEventRecord(cycleDone[current_stream], stream[current_stream]);

    current_stream = next_stream;
}

                                 overlap kernel exec. & memcpy
• devices with CC 1.1 and above can overlap a kernel execution & memcpy as long as they are issued from different streams

• kernels are serialized

• queue work in a way that independent streams can execute in parallel (see the breadth-first sketch below)

                                 keep in mind
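One way to queue for parallelism is breadth-first issue order: under the same hypothetical two-stream setup as above, enqueue each operation type across all streams before moving to the next, which tends to keep the copy engine busy instead of letting one stream's kernel sit ahead of the other stream's copy in the queue:

// breadth-first issue order: group the same operation across both streams
for (int i = 0; i < FULL_DATA_SIZE; i += 2 * N) {
    // enqueue both uploads first...
    for (int j = 0; j < 2; j++)
        cutilSafeCall( cudaMemcpyAsync(dev_a[j], host_a + i + j * N,
                                       N * sizeof(int),
                                       cudaMemcpyHostToDevice, s[j]) );
    // ...then both kernels...
    for (int j = 0; j < 2; j++)
        kernel<<<N/256, 256, 0, s[j]>>>(dev_a[j], dev_c[j]);
    // ...then both downloads
    for (int j = 0; j < 2; j++)
        cutilSafeCall( cudaMemcpyAsync(host_c + i + j * N, dev_c[j],
                                       N * sizeof(int),
                                       cudaMemcpyDeviceToHost, s[j]) );
}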
float *a, *d_a;
...
/* Allocate mapped CPU memory (the device must first have been put into
   mapped mode with cudaSetDeviceFlags(cudaDeviceMapHost)). */
cutilSafeCall( cudaHostAlloc((void **)&a, bytes, cudaHostAllocMapped) );
...
/* Initialize the vectors. */
for (n = 0; n < nelem; n++)
{
    a[n] = rand() / (float)RAND_MAX;
    ...
}

/* Get the device pointers for the pinned CPU memory mapped into the
   GPU memory space. */
cutilSafeCall( cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0) );
...
/* Call the GPU kernel using the device pointers for the mapped memory. */
...
kernel<<<grid, block>>>(d_a, d_b, d_c, nelem);
...
/* Memory clean up */
cutilSafeCall( cudaFreeHost(a) );
...

                                 zero-copy host memory
// Create streams for issuing GPU commands asynchronously and allocate memory
for (int i = 0; i < GPU_N; i++)
{
    // streams and allocations belong to the current device
    cutilSafeCall( cudaSetDevice(i) );
    cutilSafeCall( cudaStreamCreate(&stream[i]) );
    cutilSafeCall( cudaMalloc((void**)&d_Data[i], dataN * sizeof(float)) );
    cutilSafeCall( cudaMallocHost((void**)&h_Data[i], dataN * sizeof(float)) );
    // init h_Data
}

// Copy data to GPU, launch the kernel and copy data back. All asynchronously
for (int i = 0; i < GPU_N; i++)
{
    // Set device
    cutilSafeCall( cudaSetDevice(i) );

    // Copy input data from CPU
    cutilSafeCall( cudaMemcpyAsync(d_Data[i], h_Data[i],
                                   dataN * sizeof(float),
                                   cudaMemcpyHostToDevice, stream[i]) );

    // Perform GPU computations
    kernel<<<blocks, threads, 0, stream[i]>>>(...);

    // Copy back the result
    cutilSafeCall( cudaMemcpyAsync(h_Sum_from_device[i], d_Sum[i],
                                   ACCUM_N * sizeof(float),
                                   cudaMemcpyDeviceToHost, stream[i]) );
}

                                 streams
// Process GPU results
for (i = 0; i < GPU_N; i++)
{
    // Set device
    cutilSafeCall( cudaSetDevice(i) );

    // Wait for all operations to finish
    cudaStreamSynchronize(stream[i]);

    // Shut down this GPU
    cutilSafeCall( cudaFreeHost(h_Data[i]) );
    cutilSafeCall( cudaFree(d_Data[i]) );
    cutilSafeCall( cudaStreamDestroy(stream[i]) );
}

// shutdown
for (int i = 0; i < GPU_N; i++)
{
    cutilSafeCall( cudaSetDevice(i) );
    cutilDeviceReset();
}

                                 process the result
• can also control each GPU from a separate CPU thread

• need to allocate portable pinned memory if a different thread needs access to one thread's memory

• use the flag cudaHostAllocPortable with cudaHostAlloc() (see the sketch below)

                                 keep in mind
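A sketch of the portable-pinned allocation (nbytes, d_buf and stream are placeholders): the cudaHostAllocPortable flag makes the buffer pinned for all CUDA contexts, not just the allocating thread's:

// portable pinned memory is usable from every CUDA context, so a buffer
// allocated by one CPU thread can be used by another thread's GPU
float *h_shared;
cutilSafeCall( cudaHostAlloc((void **)&h_shared, nbytes,
                             cudaHostAllocPortable) );

// any CPU thread, after its own cudaSetDevice(...), may now use h_shared
// as the source or target of async copies on its device
cutilSafeCall( cudaMemcpyAsync(d_buf, h_shared, nbytes,
                               cudaMemcpyHostToDevice, stream) );

cutilSafeCall( cudaFreeHost(h_shared) );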
// Initialize MPI state
MPI_CHECK( MPI_Init(&argc, &argv) );

// Get our MPI node number and node count
int commSize, commRank;
MPI_CHECK( MPI_Comm_size(MPI_COMM_WORLD, &commSize) );
MPI_CHECK( MPI_Comm_rank(MPI_COMM_WORLD, &commRank) );

if (commRank == 0) { // Are we the root node?
    // initialize dataRoot...
}

// Allocate a buffer on each node
float *dataNode = new float[dataSizePerNode];

// Dispatch a portion of the input data to each node
MPI_CHECK( MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT,
                       dataNode, dataSizePerNode, MPI_FLOAT,
                       0, MPI_COMM_WORLD) );

// if commRank == 0 then free dataRoot...

// (dataNode would first be copied into device memory)
kernel<<<gridSize, blockSize>>>(dataNode, ...);

// Reduction to the root node
float sumNode = sum(dataNode, dataSizePerNode);
float sumRoot;
MPI_CHECK( MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM,
                      0, MPI_COMM_WORLD) );

MPI_CHECK( MPI_Finalize() );

                                 mpi + cuda
// Enable peer access
cutilSafeCall( cudaSetDevice(gpuid_tesla[0]) );
cutilSafeCall( cudaDeviceEnablePeerAccess(gpuid_tesla[1], 0) ); // flags: 0
...
// Allocate buffers
cudaSetDevice(gpuid_tesla[0]);
cudaMalloc(&g0, buf_size);
cudaSetDevice(gpuid_tesla[1]);
cudaMalloc(&g1, buf_size);

// Ping-pong copy between GPUs
cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);

// Prepare host buffer and copy to GPU 0
cudaSetDevice(gpuid_tesla[0]);
cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault);

// Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
// output to the GPU 1 buffer: dst[idx] = src[idx] * 2.0f
cudaSetDevice(gpuid_tesla[1]);
kernel<<<blocks, threads>>>(g0, g1);
cutilDeviceSynchronize();

// Disable peer access (also unregisters memory for non-UVA cases)
cudaSetDevice(gpuid_tesla[0]);
cudaDeviceDisablePeerAccess(gpuid_tesla[1]);
cudaSetDevice(gpuid_tesla[1]);
cudaDeviceDisablePeerAccess(gpuid_tesla[0]);

cudaFree(g0);
...

                                 P2P & unified virtual address space
Thank you

download slides (2MB pdf) from http://bit.ly/cuda-deep-dive