Monday, September 28, 2009

Matrix Multiplication 2 (OpenCL)

I will assume that you have gone through the CUDA Matrix Multiplication 2 example and understand the conceptual changes that we will be making to our OpenCL kernel. All we really need to do is express our kernel from CUDA Matrix Multiplication 2 in terms of OpenCL and slightly modify our main program from our OpenCL Matrix Multiplication 1 example to account for the different work group and NDRange sizes.

So when we launch our kernel we want the GPU resources allocated something like the image below:


As you can see we want a 64 by 64 NDRange of 16 by 16 work groups. This will only require us to change a couple of lines in our main program.

Main program (Listing 1)

// Multiply two matrices A * B = C

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <oclUtils.h>

#define WA 1024
#define HA 1024
#define WB 1024
#define HB WA
#define WC WB
#define HC HA

// Allocates a matrix with random float entries.
// Fill 'size' entries of 'data' with pseudo-random floats in [0, 1].
// Uses rand(), so results are reproducible for a fixed srand() seed.
void randomInit(float* data, int size)
{
    int idx = 0;
    while (idx < size) {
        data[idx] = rand() / (float)RAND_MAX;
        ++idx;
    }
}

/////////////////////////////////////////////////////////
// Program main
/////////////////////////////////////////////////////////

int
main(int argc, char** argv)
{
    // Multiply two 1024x1024 matrices on the GPU: C = A * B.
    // Returns 0 on success, 1 on host allocation failure; OpenCL errors
    // are funneled through shrCheckError() (which aborts on mismatch).

    // set seed for rand() so runs are reproducible
    srand(2006);

    // 1. allocate host memory for matrices A and B
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A = (float*) malloc(mem_size_A);

    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B = (float*) malloc(mem_size_B);

    if (h_A == NULL || h_B == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }

    // 2. initialize host memory
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    // 3. print out A and B (unsigned loop index matches size_A/size_B)
    printf("\n\nMatrix A\n");
    for (unsigned int i = 0; i < size_A; i++) {
        printf("%f ", h_A[i]);
        if (((i + 1) % WA) == 0)
            printf("\n");
    }

    printf("\n\nMatrix B\n");
    for (unsigned int i = 0; i < size_B; i++) {
        printf("%f ", h_B[i]);
        if (((i + 1) % WB) == 0)
            printf("\n");
    }

    // 4. allocate host memory for the result C
    unsigned int size_C = WC * HC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* h_C = (float*) malloc(mem_size_C);
    if (h_C == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }

    // 5. Initialize OpenCL
    // OpenCL specific variables
    cl_context clGPUContext;
    cl_command_queue clCommandQue;
    cl_program clProgram;
    cl_kernel clKernel;

    size_t dataBytes;
    size_t kernelLength;
    cl_int errcode;

    // OpenCL device memory for matrices
    cl_mem d_A;
    cl_mem d_B;
    cl_mem d_C;

    /*****************************************/
    /* Initialize OpenCL                     */
    /*****************************************/
    clGPUContext = clCreateContextFromType(0,
        CL_DEVICE_TYPE_GPU,
        NULL, NULL, &errcode);
    shrCheckError(errcode, CL_SUCCESS);

    // get the list of GPU devices associated with the context
    errcode = clGetContextInfo(clGPUContext,
        CL_CONTEXT_DEVICES, 0, NULL,
        &dataBytes);
    cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes);
    errcode |= clGetContextInfo(clGPUContext,
        CL_CONTEXT_DEVICES, dataBytes,
        clDevices, NULL);
    shrCheckError(errcode, CL_SUCCESS);

    // Create a command-queue on the first device in the context
    clCommandQue = clCreateCommandQueue(clGPUContext,
        clDevices[0], 0, &errcode);
    shrCheckError(errcode, CL_SUCCESS);

    // Setup device memory.
    // BUG FIX: d_C was allocated with mem_size_A; use mem_size_C. The two
    // happen to be equal here (WA == WB == 1024) but differ for
    // non-square configurations. Also check each creation's error code —
    // the original ignored all three.
    d_C = clCreateBuffer(clGPUContext,
        CL_MEM_READ_WRITE,
        mem_size_C, NULL, &errcode);
    shrCheckError(errcode, CL_SUCCESS);
    d_A = clCreateBuffer(clGPUContext,
        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        mem_size_A, h_A, &errcode);
    shrCheckError(errcode, CL_SUCCESS);
    d_B = clCreateBuffer(clGPUContext,
        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        mem_size_B, h_B, &errcode);
    shrCheckError(errcode, CL_SUCCESS);

    // 6. Load and build the OpenCL kernel
    char *clMatrixMul = oclLoadProgSource("kernel.cl",
        "// My comment\n",
        &kernelLength);
    shrCheckError(clMatrixMul != NULL, shrTRUE);

    clProgram = clCreateProgramWithSource(clGPUContext,
        1, (const char **)&clMatrixMul,
        &kernelLength, &errcode);
    shrCheckError(errcode, CL_SUCCESS);

    errcode = clBuildProgram(clProgram, 0,
        NULL, NULL, NULL, NULL);
    shrCheckError(errcode, CL_SUCCESS);

    clKernel = clCreateKernel(clProgram,
        "matrixMul", &errcode);
    shrCheckError(errcode, CL_SUCCESS);

    // 7. Launch OpenCL kernel
    size_t localWorkSize[2], globalWorkSize[2];

    int wA = WA;
    int wC = WC;  // bound to the kernel's wB parameter; valid since WC == WB
    errcode = clSetKernelArg(clKernel, 0,
        sizeof(cl_mem), (void *)&d_C);
    errcode |= clSetKernelArg(clKernel, 1,
        sizeof(cl_mem), (void *)&d_A);
    errcode |= clSetKernelArg(clKernel, 2,
        sizeof(cl_mem), (void *)&d_B);
    errcode |= clSetKernelArg(clKernel, 3,
        sizeof(int), (void *)&wA);
    errcode |= clSetKernelArg(clKernel, 4,
        sizeof(int), (void *)&wC);
    shrCheckError(errcode, CL_SUCCESS);

    // 16x16 work groups over a 1024x1024 global range => a 64x64 NDRange
    // of work groups (OpenCL takes TOTAL work-items, not group counts).
    localWorkSize[0] = 16;
    localWorkSize[1] = 16;
    globalWorkSize[0] = 1024;
    globalWorkSize[1] = 1024;

    errcode = clEnqueueNDRangeKernel(clCommandQue,
        clKernel, 2, NULL, globalWorkSize,
        localWorkSize, 0, NULL, NULL);
    shrCheckError(errcode, CL_SUCCESS);

    // 8. Retrieve result from device (CL_TRUE = blocking read, so the
    // kernel is guaranteed complete when this returns)
    errcode = clEnqueueReadBuffer(clCommandQue,
        d_C, CL_TRUE, 0, mem_size_C,
        h_C, 0, NULL, NULL);
    shrCheckError(errcode, CL_SUCCESS);

    // 9. print out the results
    printf("\n\nMatrix C (Results)\n");
    for (unsigned int i = 0; i < size_C; i++) {
        printf("%f ", h_C[i]);
        if (((i + 1) % WC) == 0)
            printf("\n");
    }
    printf("\n");

    // 10. clean up memory.
    // BUG FIX: release the context LAST — the original released it before
    // the kernel, program, and queue that it owns.
    free(h_A);
    free(h_B);
    free(h_C);
    free(clDevices);
    free(clMatrixMul);

    clReleaseMemObject(d_A);
    clReleaseMemObject(d_B);
    clReleaseMemObject(d_C);
    clReleaseKernel(clKernel);
    clReleaseProgram(clProgram);
    clReleaseCommandQueue(clCommandQue);
    clReleaseContext(clGPUContext);

    return 0;
}



So we set our localWorkSize to 16 by 16 and our globalWorkSize to 1024 by 1024. Why 1024 and not 64? Remember that in OpenCL we need to express the globalWorkSize in terms of the total number of work-items. The underlying OpenCL API will look at the globalWorkSize and divide by the localWorkSize to arrive at a 64 by 64 NDRange of 16 by 16 work groups.

So what are the changes that we need to make to our CUDA Matrix Multiplication 2 kernel code?

OpenCL Kernel (Listing 1)

/* kernel.cl 
* Matrix multiplication: C = A * B.
* Device code.
*/

// OpenCL Kernel
// OpenCL Kernel: one work-item computes one element of C = A * B.
// A is HA x wA, B is wA x wB, C is HA x wB (row-major).
__kernel void
matrixMul(__global float* C,
          __global float* A,
          __global float* B,
          int wA, int wB)
{
    // 2D work-item ID. get_global_id() already folds
    // group-id * group-size + local-id, which is exactly what the
    // old CUDA code computed by hand:
    //   int tx = blockIdx.x * TILE_SIZE + threadIdx.x;
    //   int ty = blockIdx.y * TILE_SIZE + threadIdx.y;
    int tx = get_global_id(0);  // column of C
    int ty = get_global_id(1);  // row of C

    // Accumulate the dot product of row ty of A with column tx of B.
    float value = 0;
    for (int k = 0; k < wA; ++k)
    {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }

    // Each work-item writes one element of C.
    // BUG FIX: C has width wB (== wC), so index rows by wB, not wA.
    // The original's wA only worked because the host uses WA == WB.
    C[ty * wB + tx] = value;
}


We changed the __global__ to __kernel to indicate that the function is to be run on the device. We added a __global to designate that the input matrices are in global memory. If we look at the old CUDA code that is commented out we see that it was taking the block indices, multiplying them by the TILE_SIZE (which was 16), and then adding in the index of the thread within the block. Well as luck would have it that is exactly what the call to get_global_id( ) returns.

Now cut copy and paste your way to a running binary (don't forget to replace "kernel.cl" in the call to oclLoadProgSource( ) with the complete path to your kernel). If you insert timing code you will find that this OpenCL example is about 46X faster than the CPU version.

Before we move on to Matrix Multiplication 3 let's take our OpenCL compiler oclcc out for a spin. First you are going to need to cut / copy / paste the compiler source and build oclcc. Once you have a oclcc binary in your bin tree all you need to do to compile a kernel is:

$ oclcc /path/to/kernel/source/kernel.cl -o kernel.ptx

This will compile your source to Nvidia's ptx assembler. Pass --help on the oclcc command line to get a usage message. Since our kernel is already in ptx assembler we will need to change our main program slightly.

Main program (Listing 2)

// Multiply two matrices A * B = C

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <oclUtils.h>

#define WA 1024
#define HA 1024
#define WB 1024
#define HB WA
#define WC WB
#define HC HA

// Allocates a matrix with random float entries.
// Write 'size' pseudo-random floats in the range [0, 1] into 'data'.
// Deterministic for a given srand() seed.
void randomInit(float* data, int size)
{
    for (int n = 0; n < size; ++n)
    {
        int r = rand();
        data[n] = r / (float)RAND_MAX;
    }
}

/////////////////////////////////////////////////////////
// Program main
/////////////////////////////////////////////////////////

int
main(int argc, char** argv)
{

// set seed for rand()
srand(2006);

// 1. allocate host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(mem_size_A);

unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(mem_size_B);

// 2. initialize host memory
randomInit(h_A, size_A);
randomInit(h_B, size_B);

// 3. print out A and B
printf("\n\nMatrix A\n");
for(int i = 0; i < size_A; i++)
{
printf("%f ", h_A[i]);
if(((i + 1) % WA) == 0)
printf("\n");
}

printf("\n\nMatrix B\n");
for(int i = 0; i < size_B; i++)
{
printf("%f ", h_B[i]);
if(((i + 1) % WB) == 0)
printf("\n");
}

// 4. allocate host memory for the result C
unsigned int size_C = WC * HC;
unsigned int mem_size_C = sizeof(float) * size_C;
float* h_C = (float*) malloc(mem_size_C);

// 5. Initialize OpenCL
// OpenCL specific variables
cl_context clGPUContext;
cl_command_queue clCommandQue;
cl_program clProgram;
cl_kernel clKernel;

size_t dataBytes;
size_t kernelLength;
cl_int errcode;

// OpenCL device memory for matrices
cl_mem d_A;
cl_mem d_B;
cl_mem d_C;

/*****************************************/
/* Initialize OpenCL */
/*****************************************/
clGPUContext = clCreateContextFromType(0,
CL_DEVICE_TYPE_GPU,
NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS);

// get the list of GPU devices associated
// with context
errcode = clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, 0, NULL,
&dataBytes);
cl_device_id *clDevices = (cl_device_id *)
malloc(dataBytes);
errcode |= clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, dataBytes,
clDevices, NULL);
shrCheckError(errcode, CL_SUCCESS);

//Create a command-queue
clCommandQue = clCreateCommandQueue(clGPUContext,
clDevices[0], 0, &errcode);
shrCheckError(errcode, CL_SUCCESS);

// Setup device memory
d_C = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE,
mem_size_A, NULL, &errcode);
d_A = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
mem_size_A, h_A, &errcode);
d_B = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
mem_size_B, h_B, &errcode);


// 6. Load and build OpenCL kernel
//char *clMatrixMul = oclLoadProgSource("kernel.cl",
// "// My comment\n",
// &kernelLength);
//shrCheckError(clMatrixMul != NULL, shrTRUE);

//clProgram = clCreateProgramWithSource(
// clGPUContext,
// 1, (const char **)&clMatrixMul,
// &kernelLength, &errcode);
//shrCheckError(errcode, CL_SUCCESS);


// Open the ptx file and load it
// into a char* buffer
FILE* fp = fopen("kernel.ptx", "r");
fseek (fp , 0 , SEEK_END);
const size_t lSize = ftell(fp);
rewind(fp);
unsigned char* buffer;
buffer = (unsigned char*) malloc (lSize);
fread(buffer, 1, lSize, fp);
fclose(fp);

cl_int status;
clProgram = clCreateProgramWithBinary(clGPUContext,
1, (const cl_device_id *)clDevices,
&lSize, (const unsigned char**)&buffer,
&status, &errcode);
errcode = clBuildProgram(clProgram, 0, NULL, NULL,
NULL, NULL);
shrCheckError(status, CL_SUCCESS);
shrCheckError(errcode, CL_SUCCESS);


errcode = clBuildProgram(clProgram, 0,
NULL, NULL, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS);

clKernel = clCreateKernel(clProgram,
"matrixMul", &errcode);
shrCheckError(errcode, CL_SUCCESS);


// 7. Launch OpenCL kernel
size_t localWorkSize[2], globalWorkSize[2];

int wA = WA;
int wC = WC;
errcode = clSetKernelArg(clKernel, 0,
sizeof(cl_mem), (void *)&d_C);
errcode |= clSetKernelArg(clKernel, 1,
sizeof(cl_mem), (void *)&d_A);
errcode |= clSetKernelArg(clKernel, 2,
sizeof(cl_mem), (void *)&d_B);
errcode |= clSetKernelArg(clKernel, 3,
sizeof(int), (void *)&wA);
errcode |= clSetKernelArg(clKernel, 4,
sizeof(int), (void *)&wC);
shrCheckError(errcode, CL_SUCCESS);

localWorkSize[0] = 16;
localWorkSize[1] = 16;
globalWorkSize[0] = 1024;
globalWorkSize[1] = 1024;

errcode = clEnqueueNDRangeKernel(clCommandQue,
clKernel, 2, NULL, globalWorkSize,
localWorkSize, 0, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS);

// 8. Retrieve result from device
errcode = clEnqueueReadBuffer(clCommandQue,
d_C, CL_TRUE, 0, mem_size_C,
h_C, 0, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS);

// 9. print out the results
printf("\n\nMatrix C (Results)\n");
for(int i = 0; i < size_C; i++)
{
printf("%f ", h_C[i]);
if(((i + 1) % WC) == 0)
printf("\n");
}
printf("\n");

// 10. clean up memory
free(h_A);
free(h_B);
free(h_C);

clReleaseMemObject(d_A);
clReleaseMemObject(d_C);
clReleaseMemObject(d_B);

free(clDevices);
free(clMatrixMul);
clReleaseContext(clGPUContext);
clReleaseKernel(clKernel);
clReleaseProgram(clProgram);
clReleaseCommandQueue(clCommandQue);

}



Since we have already compiled our OpenCL code to ptx assembler we no longer need the calls to open the kernel source file or to create the program from source, so we just comment them out. We replace these calls with code to open up the ptx file and load it into a buffer, and we then call clCreateProgramWithBinary( ). Notice that we still must make a call to clBuildProgram( ). Remember that Nvidia's implementation of OpenCL compiles to an intermediate form (ptx assembler), so we still need to compile the assembler to a binary capable of running on the card.

Now if we cut / copy / paste our way to a binary we will find that the GPU version is now 57X faster than the CPU version of the program. So by precompiling the kernel the speedup over the CPU grows from 46X to 57X. Not bad for changing less than 25 lines of code, but we can still get faster...

On to OpenCL Matrix Multiplication 3