StarPU Handbook
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
Full source code for the ’Scaling a Vector’ example

Main Application

/*
* This example demonstrates how to use StarPU to scale an array by a factor.
* It shows how to manipulate data with StarPU's data management library.
* 1- how to declare a piece of data to StarPU (starpu_vector_data_register)
* 2- how to describe which data are accessed by a task (task->handles[0])
* 3- how a kernel can manipulate the data (buffers[0].vector.ptr)
*/
#include <starpu.h>
#define NX 2048
extern void scal_cpu_func(void *buffers[], void *_args);
extern void scal_sse_func(void *buffers[], void *_args);
extern void scal_cuda_func(void *buffers[], void *_args);
extern void scal_opencl_func(void *buffers[], void *_args);
static struct starpu_codelet cl = {
/* CPU implementation of the codelet */
.cpu_funcs = { scal_cpu_func, scal_sse_func },
.cpu_funcs_name = { "scal_cpu_func", "scal_sse_func" },
#ifdef STARPU_USE_CUDA
/* CUDA implementation of the codelet */
.cuda_funcs = { scal_cuda_func },
#endif
#ifdef STARPU_USE_OPENCL
/* OpenCL implementation of the codelet */
.opencl_funcs = { scal_opencl_func },
#endif
.nbuffers = 1,
.modes = { STARPU_RW }
};
#ifdef STARPU_USE_OPENCL
#endif
int main(int argc, char **argv)
{
/* We consider a vector of float that is initialized just as any of C
* data */
float vector[NX];
unsigned i;
for (i = 0; i < NX; i++)
vector[i] = 1.0f;
fprintf(stderr, "BEFORE: First element was %f\n", vector[0]);
/* Initialize StarPU with default configuration */
starpu_init(NULL);
#ifdef STARPU_USE_OPENCL
"examples/basic_examples/vector_scal_opencl_kernel.cl", &programs, NULL);
#endif
/* Tell StaPU to associate the "vector" vector with the "vector_handle"
* identifier. When a task needs to access a piece of data, it should
* refer to the handle that is associated to it.
* In the case of the "vector" data interface:
* - the first argument of the registration method is a pointer to the
* handle that should describe the data
* - the second argument is the memory node where the data (ie. "vector")
* resides initially: STARPU_MAIN_RAM stands for an address in main memory, as
* opposed to an adress on a GPU for instance.
* - the third argument is the adress of the vector in RAM
* - the fourth argument is the number of elements in the vector
* - the fifth argument is the size of each element.
*/
starpu_data_handle_t vector_handle;
starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector,
NX, sizeof(vector[0]));
float factor = 3.14;
/* create a synchronous task: any call to starpu_task_submit will block
* until it is terminated */
struct starpu_task *task = starpu_task_create();
task->synchronous = 1;
task->cl = &cl;
/* the codelet manipulates one buffer in RW mode */
task->handles[0] = vector_handle;
/* an argument is passed to the codelet, beware that this is a
* READ-ONLY buffer and that the codelet may be given a pointer to a
* COPY of the argument */
task->cl_arg = &factor;
task->cl_arg_size = sizeof(factor);
/* execute the task on any eligible computational ressource */
/* StarPU does not need to manipulate the array anymore so we can stop
* monitoring it */
starpu_data_unregister(vector_handle);
#ifdef STARPU_USE_OPENCL
#endif
/* terminate StarPU, no task can be submitted after */
fprintf(stderr, "AFTER First element is %f\n", vector[0]);
return 0;
}

CPU Kernel

#include <starpu.h>
#include <xmmintrin.h>
/* This kernel takes a buffer and scales it by a constant factor */
void scal_cpu_func(void *buffers[], void *cl_arg)
{
unsigned i;
float *factor = cl_arg;
/*
* The "buffers" array matches the task->handles array: for instance
* task->handles[0] is a handle that corresponds to a data with
* vector "interface", so that the first entry of the array in the
* codelet is a pointer to a structure describing such a vector (ie.
* struct starpu_vector_interface *). Here, we therefore manipulate
* the buffers[0] element as a vector: nx gives the number of elements
* in the array, ptr gives the location of the array (that was possibly
* migrated/replicated), and elemsize gives the size of each elements.
*/
struct starpu_vector_interface *vector = buffers[0];
/* length of the vector */
unsigned n = STARPU_VECTOR_GET_NX(vector);
/* get a pointer to the local copy of the vector: note that we have to
* cast it in (float *) since a vector could contain any type of
* elements so that the .ptr field is actually a uintptr_t */
float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
/* scale the vector */
for (i = 0; i < n; i++)
val[i] *= *factor;
}
void scal_sse_func(void *buffers[], void *cl_arg)
{
float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
unsigned int n_iterations = n/4;
__m128 *VECTOR = (__m128*) vector;
__m128 FACTOR STARPU_ATTRIBUTE_ALIGNED(16);
float factor = *(float *) cl_arg;
FACTOR = _mm_set1_ps(factor);
unsigned int i;
for (i = 0; i < n_iterations; i++)
VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);
unsigned int remainder = n%4;
if (remainder != 0)
{
unsigned int start = 4 * n_iterations;
for (i = start; i < start+remainder; ++i)
{
vector[i] = factor * vector[i];
}
}
}

CUDA Kernel

#include <starpu.h>
static __global__ void vector_mult_cuda(unsigned n, float *val,
float factor)
{
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
val[i] *= factor;
}
extern "C" void scal_cuda_func(void *buffers[], void *_args)
{
float *factor = (float *)_args;
/* length of the vector */
unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
/* local copy of the vector pointer */
float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
unsigned threads_per_block = 64;
unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>
(n, val, *factor);
cudaStreamSynchronize(starpu_cuda_get_local_stream());
}

OpenCL Kernel

Invoking the Kernel

#include <starpu.h>
void scal_opencl_func(void *buffers[], void *_args)
{
float *factor = _args;
int id, devid, err; /* OpenCL specific code */
cl_kernel kernel; /* OpenCL specific code */
cl_command_queue queue; /* OpenCL specific code */
cl_event event; /* OpenCL specific code */
/* length of the vector */
unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
/* OpenCL copy of the vector pointer */
cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
{ /* OpenCL specific code */
err = starpu_opencl_load_kernel(&kernel, &queue,
"vector_mult_opencl", /* Name of the codelet */
devid);
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
err = clSetKernelArg(kernel, 0, sizeof(n), &n);
err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
}
{ /* OpenCL specific code */
size_t global=n;
size_t local;
size_t s;
cl_device_id device;
starpu_opencl_get_device(devid, &device);
err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
sizeof(local), &local, &s);
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
if (local > global) local=global;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0,
NULL, &event);
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
}
{ /* OpenCL specific code */
clFinish(queue);
clReleaseEvent(event);
}
}

Source of the Kernel

__kernel void vector_mult_opencl(int nx, __global float* val, float factor)
{
const int i = get_global_id(0);
if (i < nx) {
val[i] *= factor;
}
}