Thread: Chapter 1: Creating and distributing a matrix-vector multiplication kernel

This is the adapted code from listing 1.1 in chapter 1 of the book OpenCL in Action.
The example basically performs this matrix multiplication:
```
|  0.0   2.0   4.0   6.0 |   | 0.0 |   | 84.0  |
|                        |   |     |   |       |
|  8.0  10.0  12.0  14.0 |   | 3.0 |   | 228.0 |
|                        | x |     | = |       |
| 16.0  18.0  20.0  22.0 |   | 6.0 |   | 372.0 |
|                        |   |     |   |       |
| 24.0  26.0  28.0  30.0 |   | 9.0 |   | 516.0 |
```

```
' -- NOTE: Enhanced to list the calculated result and the expected result

Uses "Console", "File"

#INCLUDE "%APP_INCLUDEPATH%/cl/cl.tBasicU"

' File name of the OpenCL C source (looked up next to this script) and the
' name of the kernel function inside it.
Begin Const
  $PROGRAM_FILE = APP_SourcePath+"matvec.cl"
  $KERNEL_FUNC  = "matvec_mult"
End Const

' TBMain - script entry point.
'
' Fills a 16-element matrix array and a 4-element vector, computes the expected
' product on the host, then runs the $KERNEL_FUNC kernel from $PROGRAM_FILE on
' the first GPU device of the first OpenCL platform and compares the result
' read back from the device with the host-side values.
'
' Every OpenCL call that reports an error code is checked; on failure a message
' is printed, the process return code is set to 1 and the function exits after
' a key press. On success the return code is set to 0.
' NOTE(review): the early-exit paths do not release OpenCL objects created so
' far; the script terminates right afterwards.
Function TBMain()

  /* Host/device data structures */
  tcl_platform_id   platform
  tcl_device_id     device
  tcl_context       context
  tcl_command_queue queue
  tcl_int           i, ErrCl

  /* Program/kernel data structures */
  tcl_program program

  String program_buffer, program_log
  tSize program_size, log_size
  tcl_kernel kernel

  /* Data and buffers */
  Single mtx(16), vec(4), result(4)
  Single correct(4) = 0.0, 0.0, 0.0, 0.0
  tcl_mem mat_buff, vec_buff, res_buff
  tSize work_units_per_kernel

  /* Initialize data to be processed by the kernel */
  For i = 1 To 16
    mtx(i) = (i-1) * 2.0
  Next

  /* Fill the vector and accumulate the expected result, one matrix row per element of correct() */
  For i = 1 To 4
    vec(i) = (i-1) * 3.0
    correct(1) += mtx(i)    * vec(i)
    correct(2) += mtx(i+4)  * vec(i)
    correct(3) += mtx(i+8)  * vec(i)
    correct(4) += mtx(i+12) * vec(i)
  Next

  /* Identify a platform */
  ErrCl = clGetPlatformIDs(1, platform, ByVal NULL)
  If (ErrCl < 0) Then
    perror("Couldn't find any platforms")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Access a device */
  ErrCl = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, device, ByVal NULL)
  If (ErrCl < 0) Then
    perror("Couldn't find any devices")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Create the context */
  context = clCreateContext(ByVal NULL, 1, device, ByVal NULL, ByVal NULL, ErrCl)
  If (ErrCl < 0) Then
    perror("Couldn't create a context")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Read program file and place content into buffer */
  program_buffer = FILE_Load($PROGRAM_FILE)
  program_size   = Len(program_buffer)
  If (program_size = 0) Then
    /* An empty buffer means the file is missing, unreadable or empty */
    perror("Couldn't find the program file")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Create program from file */
  program = clCreateProgramWithSource(context, 1, VarPtr(program_buffer), program_size, ErrCl)
  If (ErrCl < 0) Then
    perror("Couldn't create the program")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If
  program_buffer = ""

  /* Build program */
  ErrCl = clBuildProgram(program, 0, ByVal 0, "", 0, ByVal 0)
  If (ErrCl < 0) Then

    /* Find size of log and print to std output */
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                          0, ByVal NULL, log_size)
    /* One extra byte so the log is always NUL-terminated */
    program_log = Repeat$(log_size+1, $NUL)

    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                          log_size + 1, ByVal StrPtr(program_log), ByVal NULL)
    PrintL program_log
    program_log = ""

    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Create kernel for the matvec_mult function */
  kernel = clCreateKernel(program, StrPtr($KERNEL_FUNC), ErrCl)
  If (ErrCl < 0) Then
    perror("Couldn't create the kernel")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Create CL buffers to hold input and output data */
  mat_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                            SizeOf(Single)*16, mtx, ErrCl)
  If (ErrCl < 0) Then
    perror("Couldn't create a buffer object")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  vec_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                            SizeOf(Single)*4, vec, ErrCl)
  If (ErrCl < 0) Then
    perror("Couldn't create a buffer object")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  res_buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                            SizeOf(Single)*4, ByVal NULL, ErrCl)
  If (ErrCl < 0) Then
    perror("Couldn't create a buffer object")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Create kernel arguments from the CL buffers */
  ErrCl = clSetKernelArg(kernel, 0, SizeOf(tcl_mem), mat_buff)
  If (ErrCl < 0) Then
    perror("Couldn't set the kernel argument")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  ErrCl = clSetKernelArg(kernel, 1, SizeOf(tcl_mem), vec_buff)
  If (ErrCl < 0) Then
    perror("Couldn't set the kernel argument")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  ErrCl = clSetKernelArg(kernel, 2, SizeOf(tcl_mem), res_buff)
  If (ErrCl < 0) Then
    perror("Couldn't set the kernel argument")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Create a CL command queue for the device */
  queue = clCreateCommandQueue(context, device, 0, ErrCl)
  If (ErrCl < 0) Then
    perror("Couldn't create the command queue")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Enqueue the command queue to the device */
  work_units_per_kernel = 4 /* 4 work-units per kernel */
  ErrCl = clEnqueueNDRangeKernel(queue, kernel, 1, ByVal NULL, work_units_per_kernel,
                                 ByVal NULL, 0, ByVal NULL, ByVal NULL)
  If (ErrCl < 0) Then
    perror("Couldn't enqueue the kernel execution command")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Blocking read (CL_TRUE) of the 4 result values back into host memory */
  ErrCl = clEnqueueReadBuffer(queue, res_buff, CL_TRUE, 0, SizeOf(Single)*4,
                              result, 0, ByVal NULL, ByVal NULL)

  If (ErrCl < 0) Then
    perror("Couldn't enqueue the read buffer command")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Test the result */
  If((result(1) = correct(1)) And (result(2) = correct(2)) And (result(3) = correct(3)) And (result(4) = correct(4))) Then

    PrintL("Matrix-vector multiplication successful.")

  Else

    PrintL("Matrix-vector multiplication unsuccessful.")

  End If
  PrintL("Calculated result: " + Join$(result, ", "))
  PrintL("Expected result  : " + Join$(correct, ", "))

  /* Deallocate resources */
  clReleaseMemObject(mat_buff)
  clReleaseMemObject(vec_buff)
  clReleaseMemObject(res_buff)
  clReleaseKernel(kernel)
  clReleaseCommandQueue(queue)
  clReleaseProgram(program)
  clReleaseContext(context)

  PrintL "Press any key to continue..."
  WaitKey
  APP_SetReturnCode(0)

End Function

' pError - print an error message to the console in bright red.
'
' sError : text to print (followed by a line break).
'
' After printing, the text attribute is set to red+green+blue so that output
' which follows is no longer drawn in the error colour.
Function pError( sError As String )

  Local errorAttr  As Long = %CONSOLE_FOREGROUND_RED | %CONSOLE_FOREGROUND_INTENSITY
  Local normalAttr As Long = %CONSOLE_FOREGROUND_RED | %CONSOLE_FOREGROUND_GREEN | %CONSOLE_FOREGROUND_BLUE

  ' Switch to the error colour, emit the message, then switch back
  Console_SetTextAttribute(errorAttr)
  PrintL(sError)
  Console_SetTextAttribute(normalAttr)

End Function
```
You will need the latest ThinBASIC and the OpenCL headers to run it, plus, of course, a modern GPU or CPU with OpenCL support.

Enjoy!
Petr

2. Code updated on 18-9-2012, 15:22 CET: there was a typo in the error handling for a failed build; it is now fixed.

3. Thanks for the first of hopefully many examples, Petr. I looked over the code, and using OpenCL looks daunting. Good luck as you make your way through the book and code; it will be a valuable resource for the rest of us, so thanks!

4. The example in the first chapter is meant to give an overview of how a finished minimal example works. There are a lot of odd things to see, and it might even look confusing.
The examples from chapter 2 onwards start right from the basics; I will begin posting them soon. You will see it is nothing to be scared of.

Petr

5. Finally had the chance to try this. I haven't got around to trying the other examples yet but I'm sure they'll work. :-D

6. Hi Matthew,

Petr

7. No, not yet, but this is the only example that I've tried so far. Maybe when I try the other examples later I'll need to make some changes?

8. Hopefully not,

OpenCL C is not device specific. My only worry was about the name of the DLL on ATi/AMD cards: in the first releases it was atiocl.dll, while NVidia and Intel had OpenCL.DLL.
It seems the Radeon guys have changed the naming convention, which is good for us.

Petr

