--- /dev/null
+/*
+
+ Copyright (c) 2013 Pekka Jääskeläinen and
+ Kalray
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <CL/cl.h>
+#include <poclu.h>
+#include "config.h"
+
+const char source[] =
+"#ifdef DOUBLE_PRECISION\n"
+" #ifdef cl_khr_fp64\n"
+" #pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+" #else\n"
+" #pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
+" #endif\n"
+"#endif\n"
+"\n"
+"__kernel void Sdot_kernel( __global float *_X, __global float *_Y, __global float *scratchBuff,\n"
+" uint N, uint offx, int incx, uint offy, int incy, int doConj )\n"
+"{\n"
+" __global float *X = _X + offx;\n"
+" __global float *Y = _Y + offy;\n"
+" float dotP = (float) 0.0;\n"
+"\n"
+" if ( incx < 0 ) {\n"
+" X = X + (N - 1) * abs(incx);\n"
+" }\n"
+" if ( incy < 0 ) {\n"
+" Y = Y + (N - 1) * abs(incy);\n"
+" }\n"
+"\n"
+" int gOffset;\n"
+" for( gOffset=(get_global_id(0) * 4); (gOffset + 4 - 1)<N; gOffset+=( get_global_size(0) * 4 ) )\n"
+" {\n"
+" float4 vReg1, vReg2, res;\n"
+"\n"
+" #ifdef INCX_NONUNITY\n"
+" vReg1 = (float4)( (X + (gOffset*incx))[0 + ( incx * 0)], (X + (gOffset*incx))[0 + ( incx * 1)], (X + (gOffset*incx))[0 + ( incx * 2)], (X + (gOffset*incx))[0 + ( incx * 3)]);\n"
+" #else\n"
+" vReg1 = vload4( 0, (__global float *) (X + gOffset) );\n"
+" #endif\n"
+"\n"
+" #ifdef INCY_NONUNITY\n"
+" vReg2 = (float4)( (Y + (gOffset*incy))[0 + ( incy * 0)], (Y + (gOffset*incy))[0 + ( incy * 1)], (Y + (gOffset*incy))[0 + ( incy * 2)], (Y + (gOffset*incy))[0 + ( incy * 3)]);\n"
+" #else\n"
+" vReg2 = vload4( 0, (__global float *) (Y + gOffset) );\n"
+" #endif\n"
+"\n"
+" ;\n"
+" res = vReg1 * vReg2 ;\n"
+" dotP += res .S0 + res .S1 + res .S2 + res .S3;\n"
+"; // Add-up elements in the vector to give a scalar\n"
+" }\n"
+"\n"
+" // Loop for the last thread to handle the tail part of the vector\n"
+" // Using the same gOffset used above\n"
+" for( ; gOffset<N; gOffset++ )\n"
+" {\n"
+" float sReg1, sReg2, res;\n"
+" sReg1 = X[gOffset * incx];\n"
+" sReg2 = Y[gOffset * incy];\n"
+"\n"
+" ;\n"
+" res = sReg1 * sReg2 ;\n"
+" dotP = dotP + res ;\n"
+" }\n"
+"\n"
+" // Note: this has to be called outside any if-conditions- because REDUCTION uses barrier\n"
+" // dotP of work-item 0 will have the final reduced item of the work-group\n"
+" __local float p1753 [ 64 ];\n"
+" uint QKiD0 = get_local_id(0);\n"
+" p1753 [ QKiD0 ] = dotP ;\n"
+" barrier(CLK_LOCAL_MEM_FENCE);\n"
+"\n"
+" if( QKiD0 < 32 ) {\n"
+" p1753 [ QKiD0 ] = p1753 [ QKiD0 ] + p1753 [ QKiD0 + 32 ];\n"
+" }\n"
+" barrier(CLK_LOCAL_MEM_FENCE);\n"
+"\n"
+" if( QKiD0 < 16 ) {\n"
+" p1753 [ QKiD0 ] = p1753 [ QKiD0 ] + p1753 [ QKiD0 + 16 ];\n"
+" }\n"
+" barrier(CLK_LOCAL_MEM_FENCE);\n"
+"\n"
+" if( QKiD0 < 8 ) {\n"
+" p1753 [ QKiD0 ] = p1753 [ QKiD0 ] + p1753 [ QKiD0 + 8 ];\n"
+" }\n"
+" barrier(CLK_LOCAL_MEM_FENCE);\n"
+"\n"
+" if( QKiD0 < 4 ) {\n"
+" p1753 [ QKiD0 ] = p1753 [ QKiD0 ] + p1753 [ QKiD0 + 4 ];\n"
+" }\n"
+" barrier(CLK_LOCAL_MEM_FENCE);\n"
+"\n"
+" if( QKiD0 < 2 ) {\n"
+" p1753 [ QKiD0 ] = p1753 [ QKiD0 ] + p1753 [ QKiD0 + 2 ];\n"
+" }\n"
+" barrier(CLK_LOCAL_MEM_FENCE);\n"
+"\n"
+" if( QKiD0 == 0 ) {\n"
+" dotP = p1753 [0] + p1753 [1];\n"
+" }\n"
+"\n"
+" if( (get_local_id(0)) == 0 ) {\n"
+" scratchBuff[ get_group_id(0) ] = dotP;\n"
+" }\n"
+"}\n"
+"\n"
+;
+
+#define MAX_PLATFORMS 32
+#define MAX_DEVICES 32
+#define MAX_BINARIES 32
+
+int main(){
+ cl_int err;
+ cl_platform_id platforms[MAX_PLATFORMS];
+ cl_uint nplatforms;
+ cl_device_id devices[MAX_DEVICES];
+ cl_uint ndevices;
+ cl_program program = NULL;
+ size_t binsizes[MAX_BINARIES];
+ size_t nbinaries;
+
+ CHECK_CL_ERROR(clGetPlatformIDs(MAX_PLATFORMS, platforms, &nplatforms));
+ TEST_ASSERT(nplatforms > 0);
+
+ CHECK_CL_ERROR(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, MAX_DEVICES, devices, &ndevices));
+ TEST_ASSERT(ndevices > 0);
+
+ cl_context context = clCreateContext(NULL, 1, devices, NULL, NULL, &err);
+ CHECK_OPENCL_ERROR_IN("clCreateContext");
+
+ const char * src[] = {source};
+ program = clCreateProgramWithSource(context, 1, src, NULL, &err);
+ CHECK_OPENCL_ERROR_IN("clCreateProgramWithSource");
+
+ CHECK_CL_ERROR(clBuildProgram(program, 1, devices, "-g -DINCX_NONUNITY -DINCY_NONUNITY", NULL, NULL));
+
+ CHECK_CL_ERROR(clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(binsizes), binsizes, &nbinaries));
+ printf("binary size: %zd\n", binsizes[0]);
+
+ CHECK_CL_ERROR(clReleaseProgram(program));
+
+ CHECK_CL_ERROR (clReleaseContext (context));
+
+ printf ("OK\n");
+
+ return EXIT_SUCCESS;
+}