SlideShare una empresa de Scribd logo
1 de 29
HSA COMPILATION
WEN-MEI HWU, CTO, MULTICOREWARE INC
WITH RAY I-JUI SUNG
KEY HSA FEATURES FOR COMPILATION
ALL-PROCESSORS-EQUAL
 GPU and CPU have equal
flexibility to create and
dispatch work items
EQUAL ACCESS TO
ENTIRE SYSTEM MEMORY
 GPU and CPU have
uniform visibility into entire
memory space
Unified Coherent
Memory
GPU CPU
Single Dispatch Path
GPU CPU
© Copyright 2014 HSA Foundation. All Rights Reserved
A QUICK REVIEW OF OPENCL
CURRENT STATE OF PORTABLE HETEROGENEOUS
PARALLEL PROGRAMMING
DEVICE CODE IN OPENCL
SIMPLE MATRIX MULTIPLICATION
/*
 * Naive matrix multiplication kernel: C = A * B (row-major storage).
 *
 * Each work-item computes one element of C: the dot product of row `ty`
 * of A with column `tx` of B.
 *
 *   C  - output matrix, wB columns per row
 *   A  - left operand, wA columns per row
 *   B  - right operand, wB columns per row (and wA rows)
 *   wA - width of A, i.e. the length of the dot product
 *   wB - width of B, which is also the width of C
 *
 * There is no bounds check, so the NDRange must match C's dimensions
 * exactly (dimension 0 = columns, dimension 1 = rows).
 */
__kernel void
matrixMul(__global float* C, __global float* A, __global float* B, int wA, int wB) {
    int tx = get_global_id(0);   /* column of C handled by this work-item */
    int ty = get_global_id(1);   /* row of C handled by this work-item */
    float value = 0;
    for (int k = 0; k < wA; ++k)
    {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }
    /* C has wB columns, so its row stride is wB (not wA). */
    C[ty * wB + tx] = value;
}
Explicit thread index usage.
Reasonably readable.
Portable across CPUs, GPUs, and FPGAs
© Copyright 2014 HSA Foundation. All Rights Reserved
HOST CODE IN OPENCL -
CONCEPTUAL
1. allocate and initialize memory on host side
2. Initialize OpenCL
3. allocate device memory and move the data
4. Load and build device code
5. Launch kernel
a. append arguments
6. move the data back from device
© Copyright 2014 HSA Foundation. All Rights Reserved
int main(int argc, char** argv){
// set seed for rand()
srand(2006);
/****************************************************/
/* Allocate and initialize memory on Host Side */
/****************************************************/
// allocate and initialize host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(mem_size_A);
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(mem_size_B);
randomInit(h_A, size_A);
randomInit(h_B, size_B);
// allocate host memory for the result C
unsigned int size_C = WC * HC;
unsigned int mem_size_C = sizeof(float) * size_C;
float* h_C = (float*) malloc(mem_size_C);
/*****************************************/
/* Initialize OpenCL */
/*****************************************/
// OpenCL specific variables
cl_context clGPUContext;
cl_command_queue clCommandQue;
cl_program clProgram;
size_t dataBytes;
size_t kernelLength;
cl_int errcode;
// OpenCL device memory pointers for matrices
cl_mem d_A;
cl_mem d_B;
cl_mem d_C;
clGPUContext = clCreateContextFromType(0,
CL_DEVICE_TYPE_GPU,
NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS);
// get the list of GPU devices associated with context
errcode = clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, 0, NULL,
&dataBytes);
cl_device_id *clDevices = (cl_device_id *)
malloc(dataBytes);
errcode |= clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, dataBytes,
clDevices, NULL);
shrCheckError(errcode, CL_SUCCESS);
//Create a command-queue
clCommandQue = clCreateCommandQueue(clGPUContext,
clDevices[0], 0, &errcode);
shrCheckError(errcode, CL_SUCCESS);
// 3. Allocate device memory and move data
d_C = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE,
mem_size_A, NULL, &errcode);
d_A = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
mem_size_A, h_A, &errcode);
d_B = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
mem_size_B, h_B, &errcode);
// 4. Load and build OpenCL kernel
char *clMatrixMul = oclLoadProgSource("kernel.cl",
"// My commentn",
&kernelLength);
shrCheckError(clMatrixMul != NULL, shrTRUE);
clProgram = clCreateProgramWithSource(clGPUContext,
1, (const char **)&clMatrixMul,
&kernelLength, &errcode);
shrCheckError(errcode, CL_SUCCESS);
errcode = clBuildProgram(clProgram, 0,
NULL, NULL, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS);
clKernel = clCreateKernel(clProgram,
"matrixMul", &errcode);
shrCheckError(errcode, CL_SUCCESS);
// 5. Launch OpenCL kernel
size_t localWorkSize[2], globalWorkSize[2];
int wA = WA;
int wC = WC;
errcode = clSetKernelArg(clKernel, 0,
sizeof(cl_mem), (void *)&d_C);
errcode |= clSetKernelArg(clKernel, 1,
sizeof(cl_mem), (void *)&d_A);
errcode |= clSetKernelArg(clKernel, 2,
sizeof(cl_mem), (void *)&d_B);
errcode |= clSetKernelArg(clKernel, 3,
sizeof(int), (void *)&wA);
errcode |= clSetKernelArg(clKernel, 4,
sizeof(int), (void *)&wC);
shrCheckError(errcode, CL_SUCCESS);
localWorkSize[0] = 16;
localWorkSize[1] = 16;
globalWorkSize[0] = 1024;
globalWorkSize[1] = 1024;
errcode = clEnqueueNDRangeKernel(clCommandQue,
clKernel, 2, NULL, globalWorkSize,
localWorkSize, 0, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS);
// 6. Retrieve result from device
errcode = clEnqueueReadBuffer(clCommandQue,
d_C, CL_TRUE, 0, mem_size_C,
h_C, 0, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS);
// 7. clean up memory
free(h_A);
free(h_B);
free(h_C);
clReleaseMemObject(d_A);
clReleaseMemObject(d_C);
clReleaseMemObject(d_B);
free(clDevices);
free(clMatrixMul);
clReleaseContext(clGPUContext);
clReleaseKernel(clKernel);
clReleaseProgram(clProgram);
clReleaseCommandQueue(clCommandQue);}
almost 100 lines of code
– tedious and hard to maintain
It does not take advantage of HSA features.
It will likely need to be changed for OpenCL 2.0.
COMPARING SEVERAL HIGH-LEVEL
PROGRAMMING INTERFACES
C++AMP Thrust Bolt OpenACC SYCL
C++ Language
extension
proposed by
Microsoft
library
proposed
by CUDA
library
proposed
by AMD
Annotation
and
Pragmas
proposed
by PGI
C++
wrapper
for
OpenCL
All these proposals aim to reduce tedious boilerplate
code and provide transparent porting to future
systems (future proofing).
© Copyright 2014 HSA Foundation. All Rights Reserved
OPENACC
HSA ENABLES SIMPLER IMPLEMENTATION OR
BETTER OPTIMIZATION
© Copyright 2014 HSA Foundation. All Rights Reserved
OPENACC
- SIMPLE MATRIX MULTIPLICATION EXAMPLE
/*
 * Computes C = A * B for row-major matrices using OpenACC annotations.
 *
 *   C  - output, hA rows by wB columns (copied out of the accelerator)
 *   A  - input,  hA rows by wA columns (copied in)
 *   B  - input,  wA rows by wB columns (copied in)
 *
 * Without an OpenACC compiler the pragmas are ignored and the loops run
 * sequentially with the same result.
 */
void MatrixMulti(float *C, const float *A, const float *B, int hA, int wA, int wB)
{
#pragma acc parallel loop copyin(A[0:hA*wA]) copyin(B[0:wA*wB]) copyout(C[0:hA*wB])
    for (int i=0; i<hA; i++) {
#pragma acc loop
        for (int j=0; j<wB; j++) {
            float sum = 0;
            for (int k=0; k<wA; k++) {
                float a = A[i*wA+k];
                float b = B[k*wB+j];
                sum += a*b;
            }
            /* C has wB columns, matching the copyout(C[0:hA*wB]) clause. */
            C[i*wB+j] = sum;
        }
    }
}
Little Host Code Overhead
Programmer annotation of
kernel computation
Programmer annotation of data movement
© Copyright 2014 HSA Foundation. All Rights Reserved
ADVANTAGE OF HSA FOR OPENACC
 Flexibility in copyin and copyout implementation
 Flexible code generation for nested acc parallel loops
 E.g., inner loop bounds that depend on outer loop iterations
 Compiler data affinity optimization (especially OpenACC kernel regions)
 The compiler does not have to undo programmer managed data transfers
© Copyright 2014 HSA Foundation. All Rights Reserved
C++AMP
HSA ENABLES EFFICIENT COMPILATION OF AN
EVEN HIGHER LEVEL OF PROGRAMMING
INTERFACE
© Copyright 2014 HSA Foundation. All Rights Reserved
C++ AMP
● C++ Accelerated Massive Parallelism
● Designed for data level parallelism
● Extension of C++11 proposed by Microsoft
● An open specification with multiple implementations aiming at standardization
● MS Visual Studio 2013
● MulticoreWare CLAMP
● GPU data modeled as C++14-like containers for multidimensional arrays
● GPU kernels modeled as C++11 lambda
● Minimal extension to C++ for simplicity and future proofing
© Copyright 2014 HSA Foundation. All Rights Reserved
MATRIX MULTIPLICATION IN C++AMP
// Multiplies the ha x hb matrix aMatrix by the hb x hc matrix bMatrix on the
// accelerator and accumulates the result into the ha x hc productMatrix.
// Each product element is updated with +=, so the existing contents of
// productMatrix are added to; pass a zeroed buffer for a plain product.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix,
                     int ha, int hb, int hc) {
    array_view<int, 2> a(ha, hb, aMatrix);
    array_view<int, 2> b(hb, hc, bMatrix);
    array_view<int, 2> product(ha, hc, productMatrix);
    parallel_for_each(
        product.extent,
        [=](index<2> idx) restrict(amp) {
            int row = idx[0];
            int col = idx[1];
            // The shared dimension is hb (columns of a, rows of b),
            // not a fixed 2.
            for (int inner = 0; inner < hb; inner++) {
                product[idx] += a(row, inner) * b(inner, col);
            }
        }
    );
    // Copy the results back to the host memory behind productMatrix.
    product.synchronize();
}
// Excerpt of the OpenCL host setup: create a GPU context, query its device
// list (size first, then contents), and open a command queue on device 0.
clGPUContext = clCreateContextFromType(0,
CL_DEVICE_TYPE_GPU,
NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS);
// get the list of GPU devices associated
// with context
errcode = clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, 0, NULL,
&dataBytes);
cl_device_id *clDevices = (cl_device_id *)
malloc(dataBytes);
errcode |= clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, dataBytes,
clDevices, NULL);
shrCheckError(errcode, CL_SUCCESS);
//Create a command-queue
clCommandQue =
clCreateCommandQueue(clGPUContext,
clDevices[0], 0, &errcode);
shrCheckError(errcode, CL_SUCCESS);
/*
 * Naive matrix multiplication kernel: C = A * B (row-major storage).
 * Each work-item computes one element of C; A has wA columns, B and C
 * have wB columns. No bounds check: the NDRange must match C exactly.
 */
__kernel void
matrixMul(__global float* C, __global float* A,
          __global float* B, int wA, int wB) {
    int tx = get_global_id(0);   /* column of C */
    int ty = get_global_id(1);   /* row of C */
    float value = 0;
    for (int k = 0; k < wA; ++k)
    {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }
    /* C has wB columns, so its row stride is wB (not wA). */
    C[ty * wB + tx] = value;
}
© Copyright 2014 HSA Foundation. All Rights Reserved
C++AMP PROGRAMMING MODEL
// Accumulates the product of a 3x2 matrix (aMatrix) and a 2x3 matrix
// (bMatrix) into the 3x3 matrix behind productMatrix. The array_view
// objects wrap the raw host pointers; the lambda captures them by value.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) {
    array_view<int, 2> a(3, 2, aMatrix);
    array_view<int, 2> b(2, 3, bMatrix);
    array_view<int, 2> product(3, 3, productMatrix);

    parallel_for_each(product.extent, [=](index<2> cell) restrict(amp) {
        const int r = cell[0];
        const int c = cell[1];
        int k = 0;
        while (k < 2) {
            product[cell] += a(r, k) * b(k, c);
            ++k;
        }
    });

    // Make the results visible through productMatrix on the host.
    product.synchronize();
}
GPU data
modeled as
data container
© Copyright 2014 HSA Foundation. All Rights Reserved
C++AMP PROGRAMMING MODEL
// Accumulates the product of a 3x2 matrix (aMatrix) and a 2x3 matrix
// (bMatrix) into the 3x3 matrix behind productMatrix. The array_view
// objects wrap the raw host pointers; the lambda captures them by value.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) {
    array_view<int, 2> a(3, 2, aMatrix);
    array_view<int, 2> b(2, 3, bMatrix);
    array_view<int, 2> product(3, 3, productMatrix);

    parallel_for_each(product.extent, [=](index<2> cell) restrict(amp) {
        const int r = cell[0];
        const int c = cell[1];
        int k = 0;
        while (k < 2) {
            product[cell] += a(r, k) * b(k, c);
            ++k;
        }
    });

    // Make the results visible through productMatrix on the host.
    product.synchronize();
}
Kernels modeled as
lambdas; arguments are
implicitly modeled as
captured variables,
the programmer does not need to
specify copyin and copyout
© Copyright 2014 HSA Foundation. All Rights Reserved
C++AMP PROGRAMMING MODEL
// Accumulates the product of a 3x2 matrix (aMatrix) and a 2x3 matrix
// (bMatrix) into the 3x3 matrix behind productMatrix. The array_view
// objects wrap the raw host pointers; the lambda captures them by value.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) {
    array_view<int, 2> a(3, 2, aMatrix);
    array_view<int, 2> b(2, 3, bMatrix);
    array_view<int, 2> product(3, 3, productMatrix);

    parallel_for_each(product.extent, [=](index<2> cell) restrict(amp) {
        const int r = cell[0];
        const int c = cell[1];
        int k = 0;
        while (k < 2) {
            product[cell] += a(r, k) * b(k, c);
            ++k;
        }
    });

    // Make the results visible through productMatrix on the host.
    product.synchronize();
}
Execution
interface; marking
an implicitly
parallel region for
GPU execution
© Copyright 2014 HSA Foundation. All Rights Reserved
MCW C++AMP (CLAMP)
● Runs on Linux and Mac OS X
● Output code compatible with all major OpenCL stacks: AMD, Apple/Intel (OS X),
NVIDIA and even POCL
● Clang/LLVM-based, open source
o Translate C++AMP code to OpenCL C or OpenCL 1.2 SPIR
o With template helper library
● Runtime: OpenCL 1.1/HSA Runtime and GMAC for non-HSA systems
● One of the two C++ AMP implementations recognized by HSA foundation
© Copyright 2014 HSA Foundation. All Rights Reserved
MCW C++ AMP COMPILER
● Device Path
o generate OpenCL C code and SPIR
o emit kernel function
● Host Path
o preparation to launch the code
C++ AMP
source code
Clang/LLVM 3.3
Device
Code
Host
Code
© Copyright 2014 HSA Foundation. All Rights Reserved
TRANSLATION
// C++ AMP source form: the lambda is applied over product.extent, with
// a, b and product captured by value through [=].
parallel_for_each(product.extent,
[=](index<2> idx) restrict(amp) {
int row = idx[0];
int col = idx[1];
for (int inner = 0; inner < 2; inner++) {
product[idx] += a(row, inner) * b(inner, col);
}
});
/*
 * Naive matrix multiplication kernel: C = A * B (row-major storage).
 * Each work-item computes one element of C; A has wA columns, B and C
 * have wB columns. No bounds check: the NDRange must match C exactly.
 */
__kernel void
matrixMul(__global float* C, __global float* A,
          __global float* B, int wA, int wB){
    int tx = get_global_id(0);   /* column of C */
    int ty = get_global_id(1);   /* row of C */
    float value = 0;
    for (int k = 0; k < wA; ++k)
    {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }
    /* C has wB columns, so its row stride is wB (not wA). */
    C[ty * wB + tx] = value;
}
● Append the arguments
● Set the index
● emit kernel function
● implicit memory management
© Copyright 2014 HSA Foundation. All Rights Reserved
EXECUTION ON NON-HSA OPENCL
PLATFORMS
C++ AMP
source code
Clang/LLVM
3.3
Device Code
C++ AMP
source code
Clang/LLVM
3.3
Host Code
gmac
OpenCL
Our work
Runtime
© Copyright 2014 HSA Foundation. All Rights Reserved
GMAC
● unified virtual address space in
software
● Can have high overhead
sometimes
● In HSA (e.g., AMD Kaveri), GMAC
is no longer needed
Gelado, et al, ASPLOS 2010
© Copyright 2014 HSA Foundation. All Rights Reserved
CASE STUDY:
BINOMIAL OPTION PRICING
 Lines of Code
0
50
100
150
200
250
300
350
C++AMP OpenCL
Lines of Code by Cloc
Host
Kernel
© Copyright 2014 HSA Foundation. All Rights Reserved
PERFORMANCE ON NON-HSA SYSTEMS
BINOMIAL OPTION PRICING
0
0.02
0.04
0.06
0.08
0.1
0.12
Total GPU Time Kernel-only
Time in Seconds
Performance on an NV Tesla C2050
OpenCL
C++AMP
© Copyright 2014 HSA Foundation. All Rights Reserved
EXECUTION ON HSA
C++ AMP
source code
Clang/LLVM
3.3
Device SPIR
C++ AMP
source code
Clang/LLVM
3.3
Host SPIR
HSA Runtime
Compile Time
Runtime
© Copyright 2014 HSA Foundation. All Rights Reserved
WHAT DO WE NEED TO DO?
● Kernel function
o emit the kernel function with required arguments
● On Host side
o a function that recursively traverses the object and appends the arguments to the OpenCL
stack.
● On Device side
o reconstruct the object in the device code for later use.
© Copyright 2014 HSA Foundation. All Rights Reserved
WHY COMPILING C++AMP TO OPENCL IS
NOT TRIVIAL
● C++AMP → LLVM IR → OpenCL C or SPIR
● arguments passing (lambda capture vs function calls)
● explicit vs. implicit memory transfer
● Heavy lifting is done by compiler and runtime
© Copyright 2014 HSA Foundation. All Rights Reserved
EXAMPLE
// B derives from A, and C holds a B by value plus its own int, so an
// object of type C nests data from all three structs.
struct A { int a; };struct B : A { int b; };struct C { B b; int c; };
struct C c;
c.c = 100;
// [=] captures c by value; the lambda body reads only the member c.c.
auto fn = [=] () { int qq = c.c; };
© Copyright 2014 HSA Foundation. All Rights Reserved
TRANSLATION
// C++ AMP source form: the lambda is applied over product.extent, with
// a, b and product captured by value through [=].
parallel_for_each(product.extent,
[=](index<2> idx) restrict(amp) {
int row = idx[0];
int col = idx[1];
for (int inner = 0; inner < 2; inner++) {
product[idx] += a(row, inner) * b(inner, col);
}
});
/*
 * Naive matrix multiplication kernel: C = A * B (row-major storage).
 * Each work-item computes one element of C; A has wA columns, B and C
 * have wB columns. No bounds check: the NDRange must match C exactly.
 */
__kernel void
matrixMul(__global float* C, __global float* A,
          __global float* B, int wA, int wB){
    int tx = get_global_id(0);   /* column of C */
    int ty = get_global_id(1);   /* row of C */
    float value = 0;
    for (int k = 0; k < wA; ++k)
    {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }
    /* C has wB columns, so its row stride is wB (not wA). */
    C[ty * wB + tx] = value;
}
● Compiler
● Turn captured variables into
OpenCL arguments
● Populate the index<N> in OCL
kernel
● Runtime
● Implicit memory management
© Copyright 2014 HSA Foundation. All Rights Reserved
QUESTIONS?
© Copyright 2014 HSA Foundation. All Rights Reserved

Más contenido relacionado

La actualidad más candente

The OpenCL C++ Wrapper 1.2 Reference Card
The OpenCL C++ Wrapper 1.2 Reference CardThe OpenCL C++ Wrapper 1.2 Reference Card
The OpenCL C++ Wrapper 1.2 Reference CardThe Khronos Group Inc.
 
Cluj.py Meetup: Extending Python in C
Cluj.py Meetup: Extending Python in CCluj.py Meetup: Extending Python in C
Cluj.py Meetup: Extending Python in CSteffen Wenz
 
Architecture for Massively Parallel HDL Simulations
Architecture for Massively Parallel HDL Simulations Architecture for Massively Parallel HDL Simulations
Architecture for Massively Parallel HDL Simulations DVClub
 
Vc4c development of opencl compiler for videocore4
Vc4c  development of opencl compiler for videocore4Vc4c  development of opencl compiler for videocore4
Vc4c development of opencl compiler for videocore4nomaddo
 
Самые вкусные баги из игрового кода: как ошибаются наши коллеги-программисты ...
Самые вкусные баги из игрового кода: как ошибаются наши коллеги-программисты ...Самые вкусные баги из игрового кода: как ошибаются наши коллеги-программисты ...
Самые вкусные баги из игрового кода: как ошибаются наши коллеги-программисты ...DevGAMM Conference
 
Ghost Vulnerability CVE-2015-0235
Ghost Vulnerability CVE-2015-0235Ghost Vulnerability CVE-2015-0235
Ghost Vulnerability CVE-2015-0235Rajivarnan (Rajiv)
 
Feldo: Function Event Listing and Dynamic Observing for Detecting and Prevent...
Feldo: Function Event Listing and Dynamic Observing for Detecting and Prevent...Feldo: Function Event Listing and Dynamic Observing for Detecting and Prevent...
Feldo: Function Event Listing and Dynamic Observing for Detecting and Prevent...Tzung-Bi Shih
 

La actualidad más candente (19)

Zone IDA Proc
Zone IDA ProcZone IDA Proc
Zone IDA Proc
 
OpenCL 2.2 Reference Guide
OpenCL 2.2 Reference GuideOpenCL 2.2 Reference Guide
OpenCL 2.2 Reference Guide
 
TVM VTA (TSIM)
TVM VTA (TSIM) TVM VTA (TSIM)
TVM VTA (TSIM)
 
OpenCL 3.0 Reference Guide
OpenCL 3.0 Reference GuideOpenCL 3.0 Reference Guide
OpenCL 3.0 Reference Guide
 
The OpenCL C++ Wrapper 1.2 Reference Card
The OpenCL C++ Wrapper 1.2 Reference CardThe OpenCL C++ Wrapper 1.2 Reference Card
The OpenCL C++ Wrapper 1.2 Reference Card
 
Cluj.py Meetup: Extending Python in C
Cluj.py Meetup: Extending Python in CCluj.py Meetup: Extending Python in C
Cluj.py Meetup: Extending Python in C
 
Architecture for Massively Parallel HDL Simulations
Architecture for Massively Parallel HDL Simulations Architecture for Massively Parallel HDL Simulations
Architecture for Massively Parallel HDL Simulations
 
OpenGL SC 2.0 Quick Reference
OpenGL SC 2.0 Quick ReferenceOpenGL SC 2.0 Quick Reference
OpenGL SC 2.0 Quick Reference
 
Sycl 1.2 Reference Card
Sycl 1.2 Reference CardSycl 1.2 Reference Card
Sycl 1.2 Reference Card
 
Vc4c development of opencl compiler for videocore4
Vc4c  development of opencl compiler for videocore4Vc4c  development of opencl compiler for videocore4
Vc4c development of opencl compiler for videocore4
 
OpenCL 2.1 Reference Guide
OpenCL 2.1 Reference GuideOpenCL 2.1 Reference Guide
OpenCL 2.1 Reference Guide
 
TensorFlow XLA RPC
TensorFlow XLA RPCTensorFlow XLA RPC
TensorFlow XLA RPC
 
Самые вкусные баги из игрового кода: как ошибаются наши коллеги-программисты ...
Самые вкусные баги из игрового кода: как ошибаются наши коллеги-программисты ...Самые вкусные баги из игрового кода: как ошибаются наши коллеги-программисты ...
Самые вкусные баги из игрового кода: как ошибаются наши коллеги-программисты ...
 
Vulkan 1.0 Quick Reference
Vulkan 1.0 Quick ReferenceVulkan 1.0 Quick Reference
Vulkan 1.0 Quick Reference
 
Modern c++
Modern c++Modern c++
Modern c++
 
3
33
3
 
Ghost Vulnerability CVE-2015-0235
Ghost Vulnerability CVE-2015-0235Ghost Vulnerability CVE-2015-0235
Ghost Vulnerability CVE-2015-0235
 
Feldo: Function Event Listing and Dynamic Observing for Detecting and Prevent...
Feldo: Function Event Listing and Dynamic Observing for Detecting and Prevent...Feldo: Function Event Listing and Dynamic Observing for Detecting and Prevent...
Feldo: Function Event Listing and Dynamic Observing for Detecting and Prevent...
 
verilog code
verilog codeverilog code
verilog code
 

Destacado

ISCA final presentation - Queuing Model
ISCA final presentation - Queuing ModelISCA final presentation - Queuing Model
ISCA final presentation - Queuing ModelHSA Foundation
 
HSA From A Software Perspective
HSA From A Software Perspective HSA From A Software Perspective
HSA From A Software Perspective HSA Foundation
 
ISCA Final Presentation - HSAIL
ISCA Final Presentation - HSAILISCA Final Presentation - HSAIL
ISCA Final Presentation - HSAILHSA Foundation
 
ISCA Final Presentation - Applications
ISCA Final Presentation - ApplicationsISCA Final Presentation - Applications
ISCA Final Presentation - ApplicationsHSA Foundation
 
ISCA final presentation - Runtime
ISCA final presentation - RuntimeISCA final presentation - Runtime
ISCA final presentation - RuntimeHSA Foundation
 
ISCA final presentation - Memory Model
ISCA final presentation - Memory ModelISCA final presentation - Memory Model
ISCA final presentation - Memory ModelHSA Foundation
 
HSA Foundation BoF -Siggraph 2013 Flyer
HSA Foundation BoF -Siggraph 2013 Flyer HSA Foundation BoF -Siggraph 2013 Flyer
HSA Foundation BoF -Siggraph 2013 Flyer HSA Foundation
 

Destacado (7)

ISCA final presentation - Queuing Model
ISCA final presentation - Queuing ModelISCA final presentation - Queuing Model
ISCA final presentation - Queuing Model
 
HSA From A Software Perspective
HSA From A Software Perspective HSA From A Software Perspective
HSA From A Software Perspective
 
ISCA Final Presentation - HSAIL
ISCA Final Presentation - HSAILISCA Final Presentation - HSAIL
ISCA Final Presentation - HSAIL
 
ISCA Final Presentation - Applications
ISCA Final Presentation - ApplicationsISCA Final Presentation - Applications
ISCA Final Presentation - Applications
 
ISCA final presentation - Runtime
ISCA final presentation - RuntimeISCA final presentation - Runtime
ISCA final presentation - Runtime
 
ISCA final presentation - Memory Model
ISCA final presentation - Memory ModelISCA final presentation - Memory Model
ISCA final presentation - Memory Model
 
HSA Foundation BoF -Siggraph 2013 Flyer
HSA Foundation BoF -Siggraph 2013 Flyer HSA Foundation BoF -Siggraph 2013 Flyer
HSA Foundation BoF -Siggraph 2013 Flyer
 

Similar a ISCA Final Presentaiton - Compilations

Runtime Code Generation and Data Management for Heterogeneous Computing in Java
Runtime Code Generation and Data Management for Heterogeneous Computing in JavaRuntime Code Generation and Data Management for Heterogeneous Computing in Java
Runtime Code Generation and Data Management for Heterogeneous Computing in JavaJuan Fumero
 
CUDA lab's slides of "parallel programming" course
CUDA lab's slides of "parallel programming" courseCUDA lab's slides of "parallel programming" course
CUDA lab's slides of "parallel programming" courseShuai Yuan
 
2011.02.18 marco parenzan - modelli di programmazione per le gpu
2011.02.18   marco parenzan - modelli di programmazione per le gpu2011.02.18   marco parenzan - modelli di programmazione per le gpu
2011.02.18 marco parenzan - modelli di programmazione per le gpuMarco Parenzan
 
망고100 보드로 놀아보자 15
망고100 보드로 놀아보자 15망고100 보드로 놀아보자 15
망고100 보드로 놀아보자 15종인 전
 
3 Open-Source-SYCL-Intel-Khronos-EVS-Workshop_May19.pdf
3 Open-Source-SYCL-Intel-Khronos-EVS-Workshop_May19.pdf3 Open-Source-SYCL-Intel-Khronos-EVS-Workshop_May19.pdf
3 Open-Source-SYCL-Intel-Khronos-EVS-Workshop_May19.pdfJunZhao68
 
CUDA Deep Dive
CUDA Deep DiveCUDA Deep Dive
CUDA Deep Divekrasul
 
clWrap: Nonsense free control of your GPU
clWrap: Nonsense free control of your GPUclWrap: Nonsense free control of your GPU
clWrap: Nonsense free control of your GPUJohn Colvin
 
Lee 2020 what the clock !
Lee 2020  what the clock !Lee 2020  what the clock !
Lee 2020 what the clock !Neil Armstrong
 
JVM code reading -- C2
JVM code reading -- C2JVM code reading -- C2
JVM code reading -- C2ytoshima
 
C++ AMP 실천 및 적용 전략
C++ AMP 실천 및 적용 전략 C++ AMP 실천 및 적용 전략
C++ AMP 실천 및 적용 전략 명신 김
 
Samsung WebCL Prototype API
Samsung WebCL Prototype APISamsung WebCL Prototype API
Samsung WebCL Prototype APIRyo Jin
 
Tema3_Introduction_to_CUDA_C.pdf
Tema3_Introduction_to_CUDA_C.pdfTema3_Introduction_to_CUDA_C.pdf
Tema3_Introduction_to_CUDA_C.pdfpepe464163
 
Apache Commons - Don\'t re-invent the wheel
Apache Commons - Don\'t re-invent the wheelApache Commons - Don\'t re-invent the wheel
Apache Commons - Don\'t re-invent the wheeltcurdt
 
Intro2 Cuda Moayad
Intro2 Cuda MoayadIntro2 Cuda Moayad
Intro2 Cuda MoayadMoayadhn
 
Windbg랑 친해지기
Windbg랑 친해지기Windbg랑 친해지기
Windbg랑 친해지기Ji Hun Kim
 
JVM Mechanics: When Does the JVM JIT & Deoptimize?
JVM Mechanics: When Does the JVM JIT & Deoptimize?JVM Mechanics: When Does the JVM JIT & Deoptimize?
JVM Mechanics: When Does the JVM JIT & Deoptimize?Doug Hawkins
 
Building Hierarchy
Building HierarchyBuilding Hierarchy
Building HierarchyMohamed Samy
 

Similar a ISCA Final Presentaiton - Compilations (20)

Runtime Code Generation and Data Management for Heterogeneous Computing in Java
Runtime Code Generation and Data Management for Heterogeneous Computing in JavaRuntime Code Generation and Data Management for Heterogeneous Computing in Java
Runtime Code Generation and Data Management for Heterogeneous Computing in Java
 
CUDA lab's slides of "parallel programming" course
CUDA lab's slides of "parallel programming" courseCUDA lab's slides of "parallel programming" course
CUDA lab's slides of "parallel programming" course
 
2011.02.18 marco parenzan - modelli di programmazione per le gpu
2011.02.18   marco parenzan - modelli di programmazione per le gpu2011.02.18   marco parenzan - modelli di programmazione per le gpu
2011.02.18 marco parenzan - modelli di programmazione per le gpu
 
망고100 보드로 놀아보자 15
망고100 보드로 놀아보자 15망고100 보드로 놀아보자 15
망고100 보드로 놀아보자 15
 
3 Open-Source-SYCL-Intel-Khronos-EVS-Workshop_May19.pdf
3 Open-Source-SYCL-Intel-Khronos-EVS-Workshop_May19.pdf3 Open-Source-SYCL-Intel-Khronos-EVS-Workshop_May19.pdf
3 Open-Source-SYCL-Intel-Khronos-EVS-Workshop_May19.pdf
 
Marat-Slides
Marat-SlidesMarat-Slides
Marat-Slides
 
CUDA Deep Dive
CUDA Deep DiveCUDA Deep Dive
CUDA Deep Dive
 
clWrap: Nonsense free control of your GPU
clWrap: Nonsense free control of your GPUclWrap: Nonsense free control of your GPU
clWrap: Nonsense free control of your GPU
 
Lee 2020 what the clock !
Lee 2020  what the clock !Lee 2020  what the clock !
Lee 2020 what the clock !
 
Bluespec @waseda
Bluespec @wasedaBluespec @waseda
Bluespec @waseda
 
Microkernel Development
Microkernel DevelopmentMicrokernel Development
Microkernel Development
 
JVM code reading -- C2
JVM code reading -- C2JVM code reading -- C2
JVM code reading -- C2
 
C++ AMP 실천 및 적용 전략
C++ AMP 실천 및 적용 전략 C++ AMP 실천 및 적용 전략
C++ AMP 실천 및 적용 전략
 
Samsung WebCL Prototype API
Samsung WebCL Prototype APISamsung WebCL Prototype API
Samsung WebCL Prototype API
 
Tema3_Introduction_to_CUDA_C.pdf
Tema3_Introduction_to_CUDA_C.pdfTema3_Introduction_to_CUDA_C.pdf
Tema3_Introduction_to_CUDA_C.pdf
 
Apache Commons - Don\'t re-invent the wheel
Apache Commons - Don\'t re-invent the wheelApache Commons - Don\'t re-invent the wheel
Apache Commons - Don\'t re-invent the wheel
 
Intro2 Cuda Moayad
Intro2 Cuda MoayadIntro2 Cuda Moayad
Intro2 Cuda Moayad
 
Windbg랑 친해지기
Windbg랑 친해지기Windbg랑 친해지기
Windbg랑 친해지기
 
JVM Mechanics: When Does the JVM JIT & Deoptimize?
JVM Mechanics: When Does the JVM JIT & Deoptimize?JVM Mechanics: When Does the JVM JIT & Deoptimize?
JVM Mechanics: When Does the JVM JIT & Deoptimize?
 
Building Hierarchy
Building HierarchyBuilding Hierarchy
Building Hierarchy
 

Más de HSA Foundation

KeynoteTHE HETEROGENEOUS SYSTEM ARCHITECTURE ITS (NOT) ALL ABOUT THE GPU
KeynoteTHE HETEROGENEOUS SYSTEM ARCHITECTURE ITS (NOT) ALL ABOUT THE GPUKeynoteTHE HETEROGENEOUS SYSTEM ARCHITECTURE ITS (NOT) ALL ABOUT THE GPU
KeynoteTHE HETEROGENEOUS SYSTEM ARCHITECTURE ITS (NOT) ALL ABOUT THE GPUHSA Foundation
 
Hsa Runtime version 1.00 Provisional
Hsa Runtime version  1.00  ProvisionalHsa Runtime version  1.00  Provisional
Hsa Runtime version 1.00 ProvisionalHSA Foundation
 
Hsa programmers reference manual (version 1.0 provisional)
Hsa programmers reference manual (version 1.0 provisional)Hsa programmers reference manual (version 1.0 provisional)
Hsa programmers reference manual (version 1.0 provisional)HSA Foundation
 
ISCA Final Presentation - Intro
ISCA Final Presentation - IntroISCA Final Presentation - Intro
ISCA Final Presentation - IntroHSA Foundation
 
ISCA 2014 | Heterogeneous System Architecture (HSA): Architecture and Algorit...
ISCA 2014 | Heterogeneous System Architecture (HSA): Architecture and Algorit...ISCA 2014 | Heterogeneous System Architecture (HSA): Architecture and Algorit...
ISCA 2014 | Heterogeneous System Architecture (HSA): Architecture and Algorit...HSA Foundation
 
Hsa Platform System Architecture Specification Provisional verl 1.0 ratifed
Hsa Platform System Architecture Specification Provisional  verl 1.0 ratifed Hsa Platform System Architecture Specification Provisional  verl 1.0 ratifed
Hsa Platform System Architecture Specification Provisional verl 1.0 ratifed HSA Foundation
 
Apu13 cp lu-keynote-final-slideshare
Apu13 cp lu-keynote-final-slideshareApu13 cp lu-keynote-final-slideshare
Apu13 cp lu-keynote-final-slideshareHSA Foundation
 
HSAemu a Full System Emulator for HSA
HSAemu a Full System Emulator for HSA HSAemu a Full System Emulator for HSA
HSAemu a Full System Emulator for HSA HSA Foundation
 
HSA Queuing Hot Chips 2013
HSA Queuing Hot Chips 2013 HSA Queuing Hot Chips 2013
HSA Queuing Hot Chips 2013 HSA Foundation
 
HSA Memory Model Hot Chips 2013
HSA Memory Model Hot Chips 2013HSA Memory Model Hot Chips 2013
HSA Memory Model Hot Chips 2013HSA Foundation
 
HSA Introduction Hot Chips 2013
HSA Introduction  Hot Chips 2013HSA Introduction  Hot Chips 2013
HSA Introduction Hot Chips 2013HSA Foundation
 
HSA HSAIL Introduction Hot Chips 2013
HSA HSAIL Introduction  Hot Chips 2013 HSA HSAIL Introduction  Hot Chips 2013
HSA HSAIL Introduction Hot Chips 2013 HSA Foundation
 
HSA Programmer’s Reference Manual: HSAIL Virtual ISA and Programming Model, C...
HSA Programmer’s Reference Manual: HSAIL Virtual ISA and Programming Model, C...HSA Programmer’s Reference Manual: HSAIL Virtual ISA and Programming Model, C...
HSA Programmer’s Reference Manual: HSAIL Virtual ISA and Programming Model, C...HSA Foundation
 
ARM Techcon Keynote 2012: Sensor Integration and Improved User Experiences at...
ARM Techcon Keynote 2012: Sensor Integration and Improved User Experiences at...ARM Techcon Keynote 2012: Sensor Integration and Improved User Experiences at...
ARM Techcon Keynote 2012: Sensor Integration and Improved User Experiences at...HSA Foundation
 
Phil Rogers IFA Keynote 2012
Phil Rogers IFA Keynote 2012Phil Rogers IFA Keynote 2012
Phil Rogers IFA Keynote 2012HSA Foundation
 
Deeper Look Into HSAIL And It's Runtime
Deeper Look Into HSAIL And It's Runtime Deeper Look Into HSAIL And It's Runtime
Deeper Look Into HSAIL And It's Runtime HSA Foundation
 
Bolt C++ Standard Template Libary for HSA by Ben Sanders, AMD
Bolt C++ Standard Template Libary for HSA  by Ben Sanders, AMDBolt C++ Standard Template Libary for HSA  by Ben Sanders, AMD
Bolt C++ Standard Template Libary for HSA by Ben Sanders, AMDHSA Foundation
 
Hsa2012 logo guidelines.
Hsa2012 logo guidelines.Hsa2012 logo guidelines.
Hsa2012 logo guidelines.HSA Foundation
 
AFDS 2012 Phil Rogers Keynote: THE PROGRAMMER’S GUIDE TO A UNIVERSE OF POSSIB...
AFDS 2012 Phil Rogers Keynote: THE PROGRAMMER’S GUIDE TO A UNIVERSE OF POSSIB...AFDS 2012 Phil Rogers Keynote: THE PROGRAMMER’S GUIDE TO A UNIVERSE OF POSSIB...
AFDS 2012 Phil Rogers Keynote: THE PROGRAMMER’S GUIDE TO A UNIVERSE OF POSSIB...HSA Foundation
 

Más de HSA Foundation (20)

KeynoteTHE HETEROGENEOUS SYSTEM ARCHITECTURE ITS (NOT) ALL ABOUT THE GPU
KeynoteTHE HETEROGENEOUS SYSTEM ARCHITECTURE ITS (NOT) ALL ABOUT THE GPUKeynoteTHE HETEROGENEOUS SYSTEM ARCHITECTURE ITS (NOT) ALL ABOUT THE GPU
KeynoteTHE HETEROGENEOUS SYSTEM ARCHITECTURE ITS (NOT) ALL ABOUT THE GPU
 
Hsa Runtime version 1.00 Provisional
Hsa Runtime version  1.00  ProvisionalHsa Runtime version  1.00  Provisional
Hsa Runtime version 1.00 Provisional
 
Hsa programmers reference manual (version 1.0 provisional)
Hsa programmers reference manual (version 1.0 provisional)Hsa programmers reference manual (version 1.0 provisional)
Hsa programmers reference manual (version 1.0 provisional)
 
ISCA Final Presentation - Intro
ISCA Final Presentation - IntroISCA Final Presentation - Intro
ISCA Final Presentation - Intro
 
ISCA 2014 | Heterogeneous System Architecture (HSA): Architecture and Algorit...
ISCA 2014 | Heterogeneous System Architecture (HSA): Architecture and Algorit...ISCA 2014 | Heterogeneous System Architecture (HSA): Architecture and Algorit...
ISCA 2014 | Heterogeneous System Architecture (HSA): Architecture and Algorit...
 
Hsa Platform System Architecture Specification Provisional verl 1.0 ratifed
Hsa Platform System Architecture Specification Provisional  verl 1.0 ratifed Hsa Platform System Architecture Specification Provisional  verl 1.0 ratifed
Hsa Platform System Architecture Specification Provisional verl 1.0 ratifed
 
Apu13 cp lu-keynote-final-slideshare
Apu13 cp lu-keynote-final-slideshareApu13 cp lu-keynote-final-slideshare
Apu13 cp lu-keynote-final-slideshare
 
HSAemu a Full System Emulator for HSA
HSAemu a Full System Emulator for HSA HSAemu a Full System Emulator for HSA
HSAemu a Full System Emulator for HSA
 
HSA Queuing Hot Chips 2013
HSA Queuing Hot Chips 2013 HSA Queuing Hot Chips 2013
HSA Queuing Hot Chips 2013
 
HSA Memory Model Hot Chips 2013
HSA Memory Model Hot Chips 2013HSA Memory Model Hot Chips 2013
HSA Memory Model Hot Chips 2013
 
HSA Introduction Hot Chips 2013
HSA Introduction  Hot Chips 2013HSA Introduction  Hot Chips 2013
HSA Introduction Hot Chips 2013
 
HSA HSAIL Introduction Hot Chips 2013
HSA HSAIL Introduction  Hot Chips 2013 HSA HSAIL Introduction  Hot Chips 2013
HSA HSAIL Introduction Hot Chips 2013
 
HSA Programmer’s Reference Manual: HSAIL Virtual ISA and Programming Model, C...
HSA Programmer’s Reference Manual: HSAIL Virtual ISA and Programming Model, C...HSA Programmer’s Reference Manual: HSAIL Virtual ISA and Programming Model, C...
HSA Programmer’s Reference Manual: HSAIL Virtual ISA and Programming Model, C...
 
ARM Techcon Keynote 2012: Sensor Integration and Improved User Experiences at...
ARM Techcon Keynote 2012: Sensor Integration and Improved User Experiences at...ARM Techcon Keynote 2012: Sensor Integration and Improved User Experiences at...
ARM Techcon Keynote 2012: Sensor Integration and Improved User Experiences at...
 
Hsa10 whitepaper
Hsa10 whitepaperHsa10 whitepaper
Hsa10 whitepaper
 
Phil Rogers IFA Keynote 2012
Phil Rogers IFA Keynote 2012Phil Rogers IFA Keynote 2012
Phil Rogers IFA Keynote 2012
 
Deeper Look Into HSAIL And It's Runtime
Deeper Look Into HSAIL And It's Runtime Deeper Look Into HSAIL And It's Runtime
Deeper Look Into HSAIL And It's Runtime
 
Bolt C++ Standard Template Libary for HSA by Ben Sanders, AMD
Bolt C++ Standard Template Libary for HSA  by Ben Sanders, AMDBolt C++ Standard Template Libary for HSA  by Ben Sanders, AMD
Bolt C++ Standard Template Libary for HSA by Ben Sanders, AMD
 
Hsa2012 logo guidelines.
Hsa2012 logo guidelines.Hsa2012 logo guidelines.
Hsa2012 logo guidelines.
 
AFDS 2012 Phil Rogers Keynote: THE PROGRAMMER’S GUIDE TO A UNIVERSE OF POSSIB...
AFDS 2012 Phil Rogers Keynote: THE PROGRAMMER’S GUIDE TO A UNIVERSE OF POSSIB...AFDS 2012 Phil Rogers Keynote: THE PROGRAMMER’S GUIDE TO A UNIVERSE OF POSSIB...
AFDS 2012 Phil Rogers Keynote: THE PROGRAMMER’S GUIDE TO A UNIVERSE OF POSSIB...
 

Último

Unleash Your Potential - Namagunga Girls Coding Club
Unleash Your Potential - Namagunga Girls Coding ClubUnleash Your Potential - Namagunga Girls Coding Club
Unleash Your Potential - Namagunga Girls Coding ClubKalema Edgar
 
CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):comworks
 
"Debugging python applications inside k8s environment", Andrii Soldatenko
"Debugging python applications inside k8s environment", Andrii Soldatenko"Debugging python applications inside k8s environment", Andrii Soldatenko
"Debugging python applications inside k8s environment", Andrii SoldatenkoFwdays
 
Beyond Boundaries: Leveraging No-Code Solutions for Industry Innovation
Beyond Boundaries: Leveraging No-Code Solutions for Industry InnovationBeyond Boundaries: Leveraging No-Code Solutions for Industry Innovation
Beyond Boundaries: Leveraging No-Code Solutions for Industry InnovationSafe Software
 
"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr Bagan"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr BaganFwdays
 
Human Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsHuman Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsMark Billinghurst
 
Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Scott Keck-Warren
 
Training state-of-the-art general text embedding
Training state-of-the-art general text embeddingTraining state-of-the-art general text embedding
Training state-of-the-art general text embeddingZilliz
 
Commit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easyCommit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easyAlfredo García Lavilla
 
Nell’iperspazio con Rocket: il Framework Web di Rust!
Nell’iperspazio con Rocket: il Framework Web di Rust!Nell’iperspazio con Rocket: il Framework Web di Rust!
Nell’iperspazio con Rocket: il Framework Web di Rust!Commit University
 
Vector Databases 101 - An introduction to the world of Vector Databases
Vector Databases 101 - An introduction to the world of Vector DatabasesVector Databases 101 - An introduction to the world of Vector Databases
Vector Databases 101 - An introduction to the world of Vector DatabasesZilliz
 
Dev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebDev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebUiPathCommunity
 
SAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxSAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxNavinnSomaal
 
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...Patryk Bandurski
 
WordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your BrandWordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your Brandgvaughan
 
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks..."LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...Fwdays
 
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024BookNet Canada
 
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr LapshynFwdays
 

Último (20)

Unleash Your Potential - Namagunga Girls Coding Club
Unleash Your Potential - Namagunga Girls Coding ClubUnleash Your Potential - Namagunga Girls Coding Club
Unleash Your Potential - Namagunga Girls Coding Club
 
CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):
 
"Debugging python applications inside k8s environment", Andrii Soldatenko
"Debugging python applications inside k8s environment", Andrii Soldatenko"Debugging python applications inside k8s environment", Andrii Soldatenko
"Debugging python applications inside k8s environment", Andrii Soldatenko
 
Beyond Boundaries: Leveraging No-Code Solutions for Industry Innovation
Beyond Boundaries: Leveraging No-Code Solutions for Industry InnovationBeyond Boundaries: Leveraging No-Code Solutions for Industry Innovation
Beyond Boundaries: Leveraging No-Code Solutions for Industry Innovation
 
"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr Bagan"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr Bagan
 
Human Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsHuman Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR Systems
 
Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024
 
Training state-of-the-art general text embedding
Training state-of-the-art general text embeddingTraining state-of-the-art general text embedding
Training state-of-the-art general text embedding
 
DMCC Future of Trade Web3 - Special Edition
DMCC Future of Trade Web3 - Special EditionDMCC Future of Trade Web3 - Special Edition
DMCC Future of Trade Web3 - Special Edition
 
Commit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easyCommit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easy
 
Nell’iperspazio con Rocket: il Framework Web di Rust!
Nell’iperspazio con Rocket: il Framework Web di Rust!Nell’iperspazio con Rocket: il Framework Web di Rust!
Nell’iperspazio con Rocket: il Framework Web di Rust!
 
Vector Databases 101 - An introduction to the world of Vector Databases
Vector Databases 101 - An introduction to the world of Vector DatabasesVector Databases 101 - An introduction to the world of Vector Databases
Vector Databases 101 - An introduction to the world of Vector Databases
 
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptxE-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
 
Dev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebDev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio Web
 
SAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxSAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptx
 
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
 
WordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your BrandWordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your Brand
 
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks..."LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
 
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
 
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
 

ISCA Final Presentation - Compilations

  • 1. HSA COMPILATION WEN-MEI HWU, CTO, MULTICOREWARE INC WITH RAY I-JUI SUNG
  • 2. KEY HSA FEATURES FOR COMPILATION ALL-PROCESSORS-EQUAL  GPU and CPU have equal flexibility to create and dispatch work items EQUAL ACCESS TO ENTIRE SYSTEM MEMORY  GPU and CPU have uniform visibility into entire memory space Unified Coherent Memory GPUCPU Single Dispatch Path GPUCPU © Copyright 2014 HSA Foundation. All Rights Reserved
  • 3. A QUICK REVIEW OF OPENCL CURRENT STATE OF PORTABLE HETEROGENEOUS PARALLEL PROGRAMMING
  • 4. DEVICE CODE IN OPENCL SIMPLE MATRIX MULTIPLICATION __kernel void matrixMul(__global float* C, __global float* A, __global float* B, int wA, int wB) { int tx = get_global_id(0); int ty = get_global_id(1); float value = 0; for (int k = 0; k < wA; ++k) { float elementA = A[ty * wA + k]; float elementB = B[k * wB + tx]; value += elementA * elementB; } C[ty * wA + tx] = value; } Explicit thread index usage. Reasonably readable. Portable across CPUs, GPUs, and FPGAs © Copyright 2014 HSA Foundation. All Rights Reserved
  • 5. HOST CODE IN OPENCL - CONCEPTUAL 1. allocate and initialize memory on host side 2. Initialize OpenCL 3. allocate device memory and move the data 4. Load and build device code 5. Launch kernel a. append arguments 6. move the data back from device © Copyright 2014 HSA Foundation. All Rights Reserved
  • 6. int main(int argc, char** argv){ // set seed for rand() srand(2006); /****************************************************/ /* Allocate and initialize memory on Host Side */ /****************************************************/ // allocate and initialize host memory for matrices A and B unsigned int size_A = WA * HA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A = (float*) malloc(mem_size_A); unsigned int size_B = WB * HB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B = (float*) malloc(mem_size_B); randomInit(h_A, size_A); randomInit(h_B, size_B); // allocate host memory for the result C unsigned int size_C = WC * HC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*) malloc(mem_size_C); /*****************************************/ /* Initialize OpenCL */ /*****************************************/ // OpenCL specific variables cl_context clGPUContext; cl_command_queue clCommandQue; cl_program clProgram; size_t dataBytes; size_t kernelLength; cl_int errcode; // OpenCL device memory pointers for matrices cl_mem d_A; cl_mem d_B; cl_mem d_C; clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode); shrCheckError(errcode, CL_SUCCESS); // get the list of GPU devices associated with context errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, clDevices, NULL); shrCheckError(errcode, CL_SUCCESS); //Create a command-queue clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], 0, &errcode); shrCheckError(errcode, CL_SUCCESS); // 3. 
Allocate device memory and move data d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, mem_size_A, NULL, &errcode); d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &errcode); d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &errcode); // 4. Load and build OpenCL kernel char *clMatrixMul = oclLoadProgSource("kernel.cl", "// My commentn", &kernelLength); shrCheckError(clMatrixMul != NULL, shrTRUE); clProgram = clCreateProgramWithSource(clGPUContext, 1, (const char **)&clMatrixMul, &kernelLength, &errcode); shrCheckError(errcode, CL_SUCCESS); errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); shrCheckError(errcode, CL_SUCCESS); clKernel = clCreateKernel(clProgram, "matrixMul", &errcode); shrCheckError(errcode, CL_SUCCESS); // 5. Launch OpenCL kernel size_t localWorkSize[2], globalWorkSize[2]; int wA = WA; int wC = WC; errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C); errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A); errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B); errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA); errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC); shrCheckError(errcode, CL_SUCCESS); localWorkSize[0] = 16; localWorkSize[1] = 16; globalWorkSize[0] = 1024; globalWorkSize[1] = 1024; errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); shrCheckError(errcode, CL_SUCCESS); // 6. Retrieve result from device errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL); shrCheckError(errcode, CL_SUCCESS); // 7. 
clean up memory free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_A); clReleaseMemObject(d_C); clReleaseMemObject(d_B); free(clDevices); free(clMatrixMul); clReleaseContext(clGPUContext); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseCommandQueue(clCommandQue);} Almost 100 lines of code – tedious and hard to maintain. It does not take advantage of HSA features. It will likely need to be changed for OpenCL 2.0.
  • 7. COMPARING SEVERAL HIGH-LEVEL PROGRAMMING INTERFACES C++AMP Thrust Bolt OpenACC SYCL C++ Language extension proposed by Microsoft library proposed by CUDA library proposed by AMD Annotation and Pragmas proposed by PGI C++ wrapper for OpenCL All these proposals aim to reduce tedious boiler plate code and provide transparent porting to future systems (future proofing). © Copyright 2014 HSA Foundation. All Rights Reserved
  • 8. OPENACC HSA ENABLES SIMPLER IMPLEMENTATION OR BETTER OPTIMIZATION © Copyright 2014 HSA Foundation. All Rights Reserved
  • 9. OPENACC - SIMPLE MATRIX MULTIPLICATION EXAMPLE 1. void MatrixMulti(float *C, const float *A, const float *B, int hA, int wA, int wB) 2 { 3 #pragma acc parallel loop copyin(A[0:hA*wA]) copyin(B[0:wA*wB]) copyout(C[0:hA*wB]) 4 for (int i=0; i<hA; i++) { 5 #pragma acc loop 6 for (int j=0; j<wB; j++) { 7 float sum = 0; 8 for (int k=0; k<wA; k++) { 9 float a = A[i*wA+k]; 10 float b = B[k*wB+j]; 11 sum += a*b; 12 } 13 C[i*Nw+j] = sum; 14 } 15 } 16 } Little Host Code Overhead Programmer annotation of kernel computation Programmer annotation of data movement © Copyright 2014 HSA Foundation. All Rights Reserved
  • 10. ADVANTAGE OF HSA FOR OPENACC  Flexibility in copyin and copyout implementation  Flexible code generation for nested acc parallel loops  E.g., inner loop bounds that depend on outer loop iterations  Compiler data affinity optimization (especially OpenACC kernel regions)  The compiler does not have to undo programmer managed data transfers © Copyright 2014 HSA Foundation. All Rights Reserved
  • 11. C++AMP HSA ENABLES EFFICIENT COMPILATION OF AN EVEN HIGHER LEVEL OF PROGRAMMING INTERFACE © Copyright 2014 HSA Foundation. All Rights Reserved
  • 12. C++ AMP ● C++ Accelerated Massive Parallelism ● Designed for data level parallelism ● Extension of C++11 proposed by Microsoft ● An open specification with multiple implementations aiming at standardization ● MS Visual Studio 2013 ● MulticoreWare CLAMP ● GPU data modeled as C++14-like containers for multidimensional arrays ● GPU kernels modeled as C++11 lambda ● Minimal extension to C++ for simplicity and future proofing © Copyright 2014 HSA Foundation. All Rights Reserved
  • 13. MATRIX MULTIPLICATION IN C++AMP void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix, int ha, int hb, int hc) { array_view<int, 2> a(ha, hb, aMatrix); array_view<int, 2> b(hb, hc, bMatrix); array_view<int, 2> product(ha, hc, productMatrix); parallel_for_each( product.extent, [=](index<2> idx) restrict(amp) { int row = idx[0]; int col = idx[1]; for (int inner = 0; inner < 2; inner++) { product[idx] += a(row, inner) * b(inner, col); } } ); product.synchronize();} clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode); shrCheckError(errcode, CL_SUCCESS); // get the list of GPU devices associated // with context errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, clDevices, NULL); shrCheckError(errcode, CL_SUCCESS); //Create a command-queue clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], 0, &errcode); shrCheckError(errcode, CL_SUCCESS); __kernel void matrixMul(__global float* C, __global float* A, __global float* B, int wA, int wB) { int tx = get_global_id(0); int ty = get_global_id(1); float value = 0; for (int k = 0; k < wA; ++k) { float elementA = A[ty * wA + k]; float elementB = B[k * wB + tx]; value += elementA * elementB; } C[ty * wA + tx] = value;} © Copyright 2014 HSA Foundation. All Rights Reserved
  • 14. C++AMP PROGRAMMING MODEL void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) { array_view<int, 2> a(3, 2, aMatrix); array_view<int, 2> b(2, 3, bMatrix); array_view<int, 2> product(3, 3, productMatrix); parallel_for_each( product.extent, [=](index<2> idx) restrict(amp) { int row = idx[0]; int col = idx[1]; for (int inner = 0; inner < 2; inner++) { product[idx] += a(row, inner) * b(inner, col); } } ); product.synchronize();} GPU data modeled as data container © Copyright 2014 HSA Foundation. All Rights Reserved
  • 15. C++AMP PROGRAMMING MODEL void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) { array_view<int, 2> a(3, 2, aMatrix); array_view<int, 2> b(2, 3, bMatrix); array_view<int, 2> product(3, 3, productMatrix); parallel_for_each( product.extent, [=](index<2> idx) restrict(amp) { int row = idx[0]; int col = idx[1]; for (int inner = 0; inner < 2; inner++) { product[idx] += a(row, inner) * b(inner, col); } } ); product.synchronize();} Kernels modeled as lambdas; arguments are implicitly modeled as captured variables; programmers do not need to specify copyin and copyout © Copyright 2014 HSA Foundation. All Rights Reserved
  • 16. C++AMP PROGRAMMING MODEL void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) { array_view<int, 2> a(3, 2, aMatrix); array_view<int, 2> b(2, 3, bMatrix); array_view<int, 2> product(3, 3, productMatrix); parallel_for_each( product.extent, [=](index<2> idx) restrict(amp) { int row = idx[0]; int col = idx[1]; for (int inner = 0; inner < 2; inner++) { product[idx] += a(row, inner) * b(inner, col); } } ); product.synchronize(); } Execution interface; marking an implicitly parallel region for GPU execution © Copyright 2014 HSA Foundation. All Rights Reserved
  • 17. MCW C++AMP (CLAMP) ● Runs on Linux and Mac OS X ● Output code compatible with all major OpenCL stacks: AMD, Apple/Intel (OS X), NVIDIA and even POCL ● Clang/LLVM-based, open source o Translate C++AMP code to OpenCL C or OpenCL 1.2 SPIR o With template helper library ● Runtime: OpenCL 1.1/HSA Runtime and GMAC for non-HSA systems ● One of the two C++ AMP implementations recognized by HSA foundation © Copyright 2014 HSA Foundation. All Rights Reserved
  • 18. MCW C++ AMP COMPILER ● Device Path o generate OpenCL C code and SPIR o emit kernel function ● Host Path o preparation to launch the code C++ AMP source code Clang/LLVM 3.3 Device Code Host Code © Copyright 2014 HSA Foundation. All Rights Reserved
  • 19. TRANSLATION parallel_for_each(product.extent, [=](index<2> idx) restrict(amp) { int row = idx[0]; int col = idx[1]; for (int inner = 0; inner < 2; inner++) { product[idx] += a(row, inner) * b(inner, col); } }); __kernel void matrixMul(__global float* C, __global float* A, __global float* B, int wA, int wB){ int tx = get_global_id(0); int ty = get_global_id(1); float value = 0; for (int k = 0; k < wA; ++k) { float elementA = A[ty * wA + k]; float elementB = B[k * wB + tx]; value += elementA * elementB; } C[ty * wA + tx] = value;} ● Append the arguments ● Set the index ● emit kernel function ● implicit memory management © Copyright 2014 HSA Foundation. All Rights Reserved
  • 20. EXECUTION ON NON-HSA OPENCL PLATFORMS C++ AMP source code Clang/LLVM 3.3 Device Code C++ AMP source code Clang/LLVM 3.3 Host Code gmac OpenCL Our work Runtime © Copyright 2014 HSA Foundation. All Rights Reserved
  • 21. GMAC ● unified virtual address space in software ● Can have high overhead sometimes ● In HSA (e.g., AMD Kaveri), GMAC is no longer needed Gelado, et al, ASPLOS 2010 © Copyright 2014 HSA Foundation. All Rights Reserved
  • 22. CASE STUDY: BINOMIAL OPTION PRICING  Lines of Code 0 50 100 150 200 250 300 350 C++AMP OpenCL Lines of Code by Cloc Host Kernel © Copyright 2014 HSA Foundation. All Rights Reserved
  • 23. PERFORMANCE ON NON-HSA SYSTEMS BINOMIAL OPTION PRICING 0 0.02 0.04 0.06 0.08 0.1 0.12 Total GPU Time Kernel-only TimeinSeconds Performance on an NV Tesla C2050 OpenCL C++AMP © Copyright 2014 HSA Foundation. All Rights Reserved
  • 24. EXECUTION ON HSA C++ AMP source code Clang/LLVM 3.3 Device SPIR C++ AMP source code Clang/LLVM 3.3 Host SPIR HSA Runtime Compile Time Runtime © Copyright 2014 HSA Foundation. All Rights Reserved
  • 25. WHAT WE NEED TO DO? ● Kernel function o emit the kernel function with required arguments ● On Host side o a function that recursively traverses the object and appends the arguments to the OpenCL stack. ● On Device side o reconstruct it on the device code for future use. © Copyright 2014 HSA Foundation. All Rights Reserved
  • 26. WHY COMPILING C++AMP TO OPENCL IS NOT TRIVIAL ● C++AMP → LLVM IR → OpenCL C or SPIR ● argument passing (lambda capture vs. function calls) ● explicit vs. implicit memory transfer ● Heavy lifting is done by compiler and runtime © Copyright 2014 HSA Foundation. All Rights Reserved
  • 27. EXAMPLE struct A { int a; };struct B : A { int b; };struct C { B b; int c; }; struct C c; c.c = 100; auto fn = [=] () { int qq = c.c; }; © Copyright 2014 HSA Foundation. All Rights Reserved
  • 28. TRANSLATION parallel_for_each(product.extent, [=](index<2> idx) restrict(amp) { int row = idx[0]; int col = idx[1]; for (int inner = 0; inner < 2; inner++) { product[idx] += a(row, inner) * b(inner, col); } }); __kernel void matrixMul(__global float* C, __global float* A, __global float* B, int wA, int wB){ int tx = get_global_id(0); int ty = get_global_id(1); float value = 0; for (int k = 0; k < wA; ++k) { float elementA = A[ty * wA + k]; float elementB = B[k * wB + tx]; value += elementA * elementB; } C[ty * wA + tx] = value;} ● Compiler ● Turn captured variables into OpenCL arguments ● Populate the index<N> in OCL kernel ● Runtime ● Implicit memory management © Copyright 2014 HSA Foundation. All Rights Reserved
  • 29. QUESTIONS? © Copyright 2014 HSA Foundation. All Rights Reserved