1
// This file is part of BOINC.
2
// http://boinc.berkeley.edu
3
// Copyright (C) 2008 University of California
5
// BOINC is free software; you can redistribute it and/or modify it
6
// under the terms of the GNU Lesser General Public License
7
// as published by the Free Software Foundation,
8
// either version 3 of the License, or (at your option) any later version.
10
// BOINC is distributed in the hope that it will be useful,
11
// but WITHOUT ANY WARRANTY; without even the implied warranty of
12
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13
// See the GNU Lesser General Public License for more details.
15
// You should have received a copy of the GNU Lesser General Public License
16
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
18
// This program serves as both
19
// - An example BOINC-ATIOpenCL application, illustrating the use of the BOINC API
20
// and ATIStream OpenCL API.
21
// - A program for testing various features of BOINC.
23
// The program reads the input nxn matrix from the "input" file, inverts the
24
// matrix NUM_ITERATIONS times and writes the result to the "output" file.
26
// command line options
27
// -run_slow: sleep 1 second after each character
28
// -cpu_time N: use about N CPU seconds after copying files
29
// -early_exit: exit(10) after 30 inversions
30
// -early_crash: crash after 30 inversions
32
// See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues.
33
// Contributor: Tuan Le (tuanle86@berkeley.edu)
35
#include "atiopencl.hpp"
38
// Program entry point.
//
// Visible flow: generate a random input file, parse the test flags, initialize
// BOINC, resolve the input/output/checkpoint file names, resume from the
// checkpoint when its header can be read, run the remaining matrix inversions
// (checkpointing and reporting progress after each one), write the final
// matrix to the output file, release resources and optionally burn CPU time.
//
// NOTE(review): this copy of the file is missing a number of original lines
// (local declarations, closing braces, bodies of the error branches). The
// comments below describe only the statements that are visible.
int main(int argc, char * argv[]) {
39
int i, retval, lastInversion=0, checkpointExists=0, matrixSize=0;
41
char input_path[512], output_path[512], chkpt_path[512], buf[256];
45
generate_random_input_file(MATRIX_SIZE); //call this if you don't want to
46
//construct the input file manually
48
// Scan every argument for the test flags documented in the file header.
for (i=0; i<argc; i++) {
49
if (!strcmp(argv[i], "-early_exit")) early_exit = true;
50
if (!strcmp(argv[i], "-early_crash")) early_crash = true;
51
if (!strcmp(argv[i], "-early_sleep")) early_sleep = true;
52
if (!strcmp(argv[i], "-run_slow")) run_slow = true;
53
if (!strcmp(argv[i], "-cpu_time")) {
54
// NOTE(review): argv[++i] is read without checking that a value follows
// "-cpu_time"; if the flag is the last argument this reads argv[argc].
cpu_time = atof(argv[++i]);
58
retval = boinc_init();
60
fprintf(stderr, "%s boinc_init returned %d\n",
61
boinc_msg_prefix(buf, sizeof(buf)), retval );
65
// open the input file (resolve logical name first)
67
boinc_resolve_filename(INPUT_FILENAME, input_path, sizeof(input_path));
68
infile = boinc_fopen(input_path, "r");
71
"%s Couldn't find input file in boinc\\win_build, resolved name %s.\n",
72
boinc_msg_prefix(buf, sizeof(buf)), input_path
78
boinc_resolve_filename(OUTPUT_FILENAME, output_path, sizeof(output_path));
80
// See if there's a valid checkpoint file.
81
// If so retrieve the current matrix and inversion number
83
boinc_resolve_filename(CHECKPOINT_FILE, chkpt_path, sizeof(chkpt_path));
84
state = boinc_fopen(chkpt_path, "r");
86
printf("Checkpoint file is detected. Read from checkpoint file ... \n");
87
// fscanf returns the number of converted items, so checkpointExists is 1
// only when the inversion number could be read from the checkpoint.
checkpointExists=fscanf(state, "%d", &lastInversion);
88
if (checkpointExists == 1) {
89
// Tells fetch_elements_into_host_memory not to rewind the stream.
isStateFileInUse=true;
90
printf("Last inversion # is : %d\n",lastInversion);
91
// NOTE(review): the result of this fscanf is ignored; if the size is missing
// matrixSize keeps its initial value of 0.
fscanf(state,"%d",&matrixSize);
92
width=height=matrixSize;
93
printf("Initialize host ....\n");
94
// The checkpoint stream is passed on so the matrix is loaded from it.
initialize_host(state);
98
printf("There's no valid checkpoint file!\n");
101
retval = out.open(output_path, "wb");
104
fprintf(stderr, "%s APP: matrix_inversion output open failed:\n",
105
boinc_msg_prefix(buf, sizeof(buf))
107
fprintf(stderr, "%s resolved name %s, retval %d\n",
108
boinc_msg_prefix(buf, sizeof(buf)), output_path, retval
115
// create shared mem segment for graphics, and arrange to update it
117
shmem = (UC_SHMEM*)boinc_graphics_make_shmem("matrix_inversion", sizeof(UC_SHMEM));
119
fprintf(stderr, "%s failed to create shared mem segment\n",
120
boinc_msg_prefix(buf, sizeof(buf))
124
boinc_register_timer_callback(update_shmem);
127
// Fresh start: the matrix size and contents come from the input file.
if (checkpointExists != 1) { //checkpoint file is not found.
128
matrixSize=get_matrix_size(infile);
129
printf("Matrix Size: width = height = %d\n",matrixSize);
130
width=height=matrixSize;
131
// Initialize Host application
132
printf("Initialize host ....\n");
133
if (initialize_host(infile)==1) {
136
out.printf("\n----------------- Before being inversed ----------------\n\n");
137
printf("Computation is running ... Inverse the matrix %d times. Start at inversion #1\n",
140
out.printf("\n----------------- Last checkpointed inversion #%d ----------------\n\n",
142
printf("Computation is resumed ... Inverse the matrix %d more times. Start at inversion #%d\n",
143
NUM_ITERATIONS-lastInversion,lastInversion+1);
146
// Initialize OpenCL resources
147
if (initialize_cl()==1) {
151
// Record the starting matrix under the header written above.
print_to_file(&out,input,matrixSize);
153
// Main work loop: continues with the inversion after the last one recorded
// in the checkpoint (lastInversion is 0 on a fresh start).
for (int i=lastInversion+1;i<=NUM_ITERATIONS;++i) {
154
//the invert function will trigger kernel calls.
155
invert(input,output,matrixSize);
156
printf("Finish inversion #%d\n",i);
157
for (int j=0;j<matrixSize*matrixSize;++j) {
158
input[j]=output[j]; //change the input for the next iteration
164
// Test hooks: each one triggers once the inversion counter exceeds 30.
if (early_exit && i>30) {
168
if (early_crash && i>30) {
172
if (early_sleep && i>30) {
174
// Never returns: sleeps one second at a time forever.
while (1) boinc_sleep(1);
177
if (boinc_time_to_checkpoint()) {
178
printf("Perform checkpointing at inversion # %d\n",i);
179
//we'll need to write the current matrix to the state file.
180
retval = do_checkpoint(out, i, input, matrixSize);
182
fprintf(stderr, "%s APP: matrix_inversion checkpoint failed %d\n",
183
boinc_msg_prefix(buf, sizeof(buf)), retval
187
boinc_checkpoint_completed();
189
// NOTE(review): i is an int; if NUM_ITERATIONS is an integer constant this
// is integer division and fd stays 0 until the last iteration — confirm.
fd = i/NUM_ITERATIONS;
190
// With -cpu_time the inversions count as the first half of the progress;
// the CPU-burn loop below reports the second half.
if (cpu_time) fd /= 2;
191
boinc_fraction_done(fd);
194
out.printf("\n\n----------------- Final inversion #%d ----------------\n\n",
196
print_to_file(&out,output,matrixSize);
198
retval = out.flush(); //force the output file to be closed.
200
fprintf(stderr, "%s APP: matrix_inversion flush failed %d\n",
201
boinc_msg_prefix(buf, sizeof(buf)), retval
206
// Releases OpenCL resources
207
if (cleanup_cl()==1) {
212
// Release host resources
215
// burn up some CPU time if needed
218
printf("\nBurning up some CPU time ... \n");
219
double start = dtime();
220
// Runs until the elapsed wall time since `start` exceeds cpu_time.
for (int i=0; ; i++) {
221
double e = dtime()-start;
222
if (e > cpu_time) break;
223
// Progress moves linearly from 0.5 to 1.0 over the burn phase.
fd = .5 + .5*(e/cpu_time);
224
boinc_fraction_done(fd);
226
if (boinc_time_to_checkpoint()) {
227
// All inversions are finished here, so the checkpoint records NUM_ITERATIONS.
retval = do_checkpoint(out, NUM_ITERATIONS, input, matrixSize);
229
fprintf(stderr, "%s APP: maxtrix_inversion checkpoint failed %d\n",
230
boinc_msg_prefix(buf, sizeof(buf)), retval
234
boinc_checkpoint_completed();
236
comp_result = do_a_giga_flop(i);
239
boinc_fraction_done(1);
245
printf("\nDone! Please press ENTER to exit. ");
251
// Windows GUI entry point: fetches the process command line, splits it into
// argc/argv with parse_command_line, and forwards to main().
// The WinMain parameters are not referenced in the visible lines.
int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR Args, int WinMode) {
256
command_line = GetCommandLine();
257
argc = parse_command_line( command_line, argv );
258
return main(argc, argv);
262
/*** BOINC FUNCTION DEFINITIONS ***/
264
/* Do a billion floating-point ops */
// CPU-burn helper used by main's "-cpu_time" loop, which passes its loop
// counter as `foo` and stores the result in comp_result.
// NOTE(review): the loop body and the return statement are not visible in
// this copy; the "billion" above presumably comes from two floating-point
// operations per each of the 500,000,000 iterations — confirm.
265
static double do_a_giga_flop(int foo) {
266
// Seed depends on the argument.
double x = 3.14159*foo;
268
for (i=0; i<500000000; i++) {
275
/* Save the computation state into checkpoint file */
// Writes the state to a scratch file named "temp" and then renames it onto
// the resolved CHECKPOINT_FILE name, presumably so a half-written file never
// sits under the checkpoint name.
//
//   mf         - output file object (not referenced in the visible lines)
//   n          - number of the last completed inversion
//   input      - current matrix, matrixSize*matrixSize values
//   matrixSize - matrix dimension
//
// Returns 0 on success, otherwise the non-zero retval of the failing step.
276
int do_checkpoint(MFILE& mf, int n, cl_float *input, int matrixSize) {
278
string resolved_name;
280
// NOTE(review): no check of the fopen result is visible before f is used.
FILE* f = fopen("temp", "w");
282
// NOTE(review): no separator is visible between the fields written below.
// main() reads them back with separate "%d" scans, so whitespace has to be
// emitted between them — confirm against the full source.
fprintf(f, "%d", n); //write inversion number
284
fprintf(f, "%d", matrixSize); //write matrixSize
286
for (int i=0;i<matrixSize*matrixSize;++i) {
288
fprintf(f, "%f", input[i]);
292
// retval is set by a statement that is not visible in this copy.
if (retval) return retval;
293
boinc_resolve_filename_s(CHECKPOINT_FILE, resolved_name);
294
retval = boinc_rename("temp", resolved_name.c_str());
295
if (retval) return retval;
296
return 0; //return 0 to indicate success.
299
/*** FUNCTION DEFINITIONS ***/
301
/* Create an input file filled with random data of type cl_float. */
// Builds an n x n matrix in memory and writes it to INPUT_FILENAME, which is
// opened directly rather than through boinc_resolve_filename.
// NOTE(review): neither the fopen nor the malloc result is checked in the
// visible lines.
302
void generate_random_input_file(int n) {
305
infile=fopen(INPUT_FILENAME,"w");
306
cl_float *input = (cl_float *)malloc(sizeof(cl_float)*(n*n));
308
for( int i = 0; i < n; i++ ) {
309
for (int j = 0; j < n; j++) {
310
// rand()%32768 is in [0, 32767], so each entry lies in [-1, 1).
input[i*n+j] = 2.0*(rand()%32768)/32768.0 - 1.0;
312
// Enlarges the diagonal entry by sqrt(n); presumably this keeps the random
// matrix comfortably invertible — confirm.
input[i*n+i] += sqrt((float)n);
315
for (int i=0;i<n*n;++i) {
316
fprintf(infile,"%15f",input[i]);
318
// NOTE(review): the condition guarding this newline is not visible here.
// get_matrix_size() treats '\n' as the end of the first row, so presumably
// one newline is written per matrix row — confirm.
fprintf(infile,"\n");
329
* Parse the input file and determine the size of the matrix.
330
* This is an nxn matrix. Note: if width<> height, the matrix is
333
// Scans `infile` from its beginning, one character at a time.
// The visible logic skips whitespace, consumes runs of digits / '.' / '-'
// (one number token at a time) and stops at EOF or at the first newline.
// NOTE(review): the counter and the return statement are not visible in this
// copy; presumably the function returns how many numbers are on the first
// line. main() uses the result as both the width and the height.
int get_matrix_size(FILE *infile) {
337
// Always start from the beginning of the file.
fseek(infile,0,SEEK_SET);
341
if (c == EOF || c == '\n') {
344
// Tail of a do/while that skips whitespace.
} while (isspace(c));
346
// Start of a number token.
if (isdigit(c) || c=='.' || c=='-') {
352
if (c == EOF || c == '\n') {
355
// Tail of a do/while that consumes the rest of the number token.
} while (isdigit(c) || c=='.' || c=='-');
357
if (c==EOF || c == '\n') {
366
* \brief Host Initialization
367
* Allocate and initialize memory
368
* on the host. Print input array.
370
// Allocates the `input` and `output` host buffers (width*height cl_float
// values each) and fills `input` from `infile`.
// main() passes either the input file or the checkpoint stream.
// NOTE(review): the return statements are not visible in this copy; main()
// compares the result with 1, presumably the failure code — confirm.
int initialize_host(FILE *infile) {
375
// The guarding condition is not visible; presumably taken when width and
// height differ.
printf("Error: non nxn matrix cannot be invertiable.\n");
379
/////////////////////////////////////////////////////////////////
380
// Allocate and initialize memory used by host
381
/////////////////////////////////////////////////////////////////
382
cl_uint sizeInBytes = width * height * sizeof(cl_float);
383
input = (cl_float *) malloc(sizeInBytes);
385
printf("Error: Failed to allocate input memory on host\n");
389
output = (cl_float *) malloc(sizeInBytes);
391
printf("Error: Failed to allocate output memory on host\n");
395
//fillRandom(input,width,height);
396
// Load the matrix values from the stream into `input`.
fetch_elements_into_host_memory(infile,input);
401
* Read the float values from input file into "input" array.
403
// Reads float values from `infile` until fscanf can no longer convert one.
// NOTE(review): the statement storing each value into `input` is not visible
// in this copy.
void fetch_elements_into_host_memory(FILE *infile, cl_float *input) {
406
// Rewind only for the plain input file. When resuming, main() has already
// consumed the inversion number and matrix size from the checkpoint stream,
// so reading must continue from the current position.
if (!isStateFileInUse) {
407
fseek(infile,0,SEEK_SET);
409
while (fscanf(infile,"%f",&num)==1) {
416
* Converts the contents of a file into a string
418
// Loads a text file into a newly malloc'ed string.
// Tries `fileName` first and falls back to KERNELS_FILEPATH when that open
// fails. The file is read in two passes: the first counts the characters,
// the second copies them into a buffer of count+1 bytes.
// initialize_cl() stores the result in `source`, which cleanup_host() frees.
char * convert_to_string(const char *fileName) {
424
// look for "atiopencl_kernels.cl" in "boinc/samples/atiopencl/debug" or
425
// in "boinc/samples/atiopencl/release". Note that "atiopencl_kernels.cl"
426
// is automatically copied to these directories along the building process.
427
FILE *infile=fopen(fileName,"r");
428
if (!infile) { //not found. This typically happens on Linux or Mac.
429
//look for "atiopencl_kernels.cl" in "boinc/sample/atiopencl/" instead.
430
infile = fopen(KERNELS_FILEPATH,"r");
432
printf("File open Error!");
436
// Pass 1: count the characters so the buffer can be sized exactly.
fseek(infile,0,SEEK_SET);
437
while (fgetc(infile)!=EOF) count++;
438
s=(char *) malloc(sizeof(char)*(count+1)); //add 1 for string terminator.
439
// Pass 2: rewind and copy the characters.
fseek(infile,0,SEEK_SET);
440
while ((c=fgetc(infile))!=EOF) {
448
* \brief OpenCL related initialization
449
* Create Context, Device list, Command Queue
450
* Load CL file, compile, link CL source
451
* Build program and kernel objects
454
// Note: OpenCL memory buffer objects will be created in invert
455
// function before kernel calls are made.
456
// Sets up every OpenCL object except the memory buffer: work sizes, platform,
// context, device list, command queue, program and the three kernels
// (GEStep1A, GEStep2, GEStep3).
// NOTE(review): the return statements are not visible in this copy; main()
// compares the result with 1, presumably the failure code — confirm.
int initialize_cl(void) {
458
size_t deviceListSize;
460
localThreads[0] = LOCAL_WORK_SIZE;
461
globalThreads[0] = GLOBAL_WORK_SIZE;
464
* Have a look at the available platforms and pick either
465
* the AMD one if available or a reasonable default.
468
cl_uint numPlatforms;
469
cl_platform_id platform = NULL;
470
// First call only asks how many platforms exist.
status = clGetPlatformIDs(0, NULL, &numPlatforms);
471
if(status != CL_SUCCESS) {
472
printf("Error: Getting Platforms. (clGetPlatformsIDs)\n");
476
if (numPlatforms > 0) {
477
cl_platform_id* platforms = (cl_platform_id *)
478
malloc(sizeof(cl_platform_id)*numPlatforms);
479
// Second call fills the platform list.
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
480
if (status != CL_SUCCESS) {
481
printf("Error: Getting Platform Ids. (clGetPlatformsIDs)\n");
484
for (unsigned int i=0; i < numPlatforms; ++i) {
486
status = clGetPlatformInfo(platforms[i],
491
if (status != CL_SUCCESS) {
492
printf("Error: Getting Platform Info.(clGetPlatformInfo)\n");
495
// Every platform is recorded as the candidate before the vendor check, so a
// platform is still selected when no AMD platform is found.
platform = platforms[i];
496
if (!strcmp(pbuff, "Advanced Micro Devices, Inc.")) {
503
if(NULL == platform) {
504
printf("NULL platform found so Exiting Application.");
509
* If we could find our platform, use it. Otherwise use just available platform.
511
cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM,
512
(cl_context_properties)platform,
516
/////////////////////////////////////////////////////////////////
517
// Create an OpenCL context
518
/////////////////////////////////////////////////////////////////
519
context = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &status);
520
if (status != CL_SUCCESS) {
521
printf("Error: Creating Context. (clCreateContextFromType)\n");
525
/* First, get the size of device list data */
526
status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize);
527
if (status != CL_SUCCESS) {
528
printf("Error: Getting Context Info (device list size, clGetContextInfo)\n");
532
/////////////////////////////////////////////////////////////////
533
// Detect OpenCL devices
534
/////////////////////////////////////////////////////////////////
535
// deviceListSize is a byte count, so it is passed to malloc unchanged.
devices = (cl_device_id *)malloc(deviceListSize);
537
printf("Error: No devices found.\n");
541
/* Now, get the device list data */
542
status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL);
543
if (status != CL_SUCCESS) {
544
printf("Error: Getting Context Info (device list, clGetContextInfo)\n");
548
/////////////////////////////////////////////////////////////////
549
// Create an OpenCL command queue
550
/////////////////////////////////////////////////////////////////
551
// Only the first device of the context is used.
commandQueue = clCreateCommandQueue(context, devices[0], 0, &status);
552
if(status != CL_SUCCESS) {
553
printf("Creating Command Queue. (clCreateCommandQueue)\n");
557
/////////////////////////////////////////////////////////////////
558
// Load CL file, build CL program object, create CL kernel object
559
/////////////////////////////////////////////////////////////////
560
// `source` is heap memory owned by this program; cleanup_host() frees it.
source = convert_to_string(KERNELS_FILENAME);
561
size_t sourceSize[] = { strlen(source) };
562
program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status);
563
if (status != CL_SUCCESS) {
564
// NOTE(review): the message names clCreateProgramWithBinary, but the call
// above is clCreateProgramWithSource.
printf("Error: Loading Binary into cl_program (clCreateProgramWithBinary)\n");
568
/* create a cl program executable for all the devices specified */
569
// Built for one device: the first entry of `devices`.
status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
570
if (status != CL_SUCCESS) {
571
printf("Error: Building Program (clBuildProgram)\n");
575
/* get a kernel object handle for a kernel with the given name */
576
GEStep1A_kernel = clCreateKernel(program, "GEStep1A", &status);
577
if (status != CL_SUCCESS) {
578
printf("Error: clCreateKernel (GEStep1A)\n");
582
GEStep2_kernel = clCreateKernel(program, "GEStep2", &status);
583
if (status != CL_SUCCESS) {
584
printf("Error: clCreateKernel (GEStep2)\n");
588
GEStep3_kernel = clCreateKernel(program, "GEStep3", &status);
589
if (status != CL_SUCCESS) {
590
printf("Error: clCreateKernel (GEStep3)\n");
598
* \brief Release OpenCL resources (Context, Memory etc.)
600
// Releases the OpenCL objects in this order: the three kernels, the program,
// inputBuffer, the command queue and finally the context. Each failure
// prints a message naming the object.
// NOTE(review): the return statements are not visible in this copy; main()
// compares the result with 1, presumably the failure code — confirm.
int cleanup_cl(void) {
603
status = clReleaseKernel(GEStep1A_kernel);
604
if (status != CL_SUCCESS) {
605
printf("Error: In clReleaseKernel (GEStep1A_kernel)\n");
609
status = clReleaseKernel(GEStep2_kernel);
610
if (status != CL_SUCCESS) {
611
printf("Error: In clReleaseKernel (GEStep2_kernel)\n");
615
status = clReleaseKernel(GEStep3_kernel);
616
if (status != CL_SUCCESS) {
617
printf("Error: In clReleaseKernel (GEStep3_kernel)\n");
621
status = clReleaseProgram(program);
622
if (status != CL_SUCCESS) {
623
printf("Error: In clReleaseProgram\n");
627
// inputBuffer is assigned in invert(), so this releases whichever buffer the
// most recent invert() call created.
status = clReleaseMemObject(inputBuffer);
628
if (status != CL_SUCCESS) {
629
printf("Error: In clReleaseMemObject (inputBuffer)\n");
633
status = clReleaseCommandQueue(commandQueue);
634
if (status != CL_SUCCESS) {
635
printf("Error: In clReleaseCommandQueue\n");
639
status = clReleaseContext(context);
640
if (status != CL_SUCCESS) {
641
printf("Error: In clReleaseContext\n");
649
* \brief Releases program's resources
651
// Frees the host-side allocations that are still set. The visible lines
// cover `output`, `devices` and `source`.
// NOTE(review): the free() calls for `output` and `devices`, and any handling
// of `input`, are not visible in this copy.
void cleanup_host(void) {
657
if (output != NULL) {
662
if (devices != NULL) {
667
if (source != NULL) {
668
// `source` was allocated by convert_to_string(); the cast presumably drops a
// const qualifier on its declaration — confirm.
free((char *)source);
674
* Write the result to output file
676
// Writes the n*n values of `h_odata` to `out`, each formatted with "%15f ".
// NOTE(review): the `move` index, the loop-counter update and any row breaks
// are handled by lines not visible in this copy.
void print_to_file(MFILE *out, float *h_odata, int n) {
679
int num_elements=n*n;
680
while (num_elements>0) {
681
out->printf("%15f ",h_odata[move]);
693
* \brief Run OpenCL program
695
* Bind host variables to kernel arguments
698
// Runs the GEStep1A kernel once and waits for it.
// Steps: bind the four kernel arguments (inputBuffer, i, n2, lda2), enqueue
// the kernel, wait on events[0] and release it, enqueue a read-back from
// inputBuffer, wait on events[1] and release it.
//
//   AI   - host copy of the working matrix (presumably the read-back target;
//          the read call's remaining arguments are not visible — confirm)
//   i    - row index of the current elimination step
//   n2   - invertge() passes n*2
//   lda2 - row stride passed by invertge()
//
// NOTE(review): the return statements are not visible in this copy, and
// invertge() ignores the return value.
int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) {
703
* the input array to the kernel. This array will eventually be modified
704
* to the inverted array.
706
status = clSetKernelArg(GEStep1A_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
707
if (status != CL_SUCCESS) {
708
printf("Error: Setting kernel argument. (input)\n");
713
status = clSetKernelArg(GEStep1A_kernel, 1, sizeof(int), (void *)&i);
714
if (status != CL_SUCCESS) {
715
printf("Error: Setting kernel argument. (i)\n");
720
status = clSetKernelArg(GEStep1A_kernel, 2, sizeof(int), (void *)&n2);
721
if (status != CL_SUCCESS) {
722
printf("Error: Setting kernel argument. (n2)\n");
727
status = clSetKernelArg(GEStep1A_kernel, 3, sizeof(int), (void *)&lda2);
728
if (status != CL_SUCCESS) {
729
printf("Error: Setting kernel argument. (lda2)\n");
734
* Enqueue a kernel run call.
736
status = clEnqueueNDRangeKernel(commandQueue,
745
if (status != CL_SUCCESS) {
746
printf("Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)\n");
750
/* wait for the kernel call to finish execution */
751
status = clWaitForEvents(1, &events[0]);
752
if (status != CL_SUCCESS) {
753
printf("Error: Waiting for kernel run to finish. (clWaitForEvents)\n");
757
status = clReleaseEvent(events[0]);
758
if (status != CL_SUCCESS) {
759
printf("Error: Release event object. (clReleaseEvent)\n");
763
/* Enqueue readBuffer*/ //Note: we are reading back from inputBuffer since AI is modified directly in kernel
764
// The read size equals the size invert() passes to clCreateBuffer.
status = clEnqueueReadBuffer(commandQueue,
768
globalThreads[0] * sizeof(cl_float),
774
if(status != CL_SUCCESS) {
775
printf("Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer)\n");
779
/* Wait for the read buffer to finish execution */
780
status = clWaitForEvents(1, &events[1]);
781
if (status != CL_SUCCESS) {
782
printf("Error: Waiting for read buffer call to finish. (clWaitForEvents)\n");
786
status = clReleaseEvent(events[1]);
787
if (status != CL_SUCCESS) {
788
printf("Error: Release event object. (clReleaseEvent)\n");
794
// Runs the GEStep2 kernel once and waits for it.
// Same sequence as run_GEStep1A_kernel, with one extra kernel argument:
// bind inputBuffer, diag, i, n2 and lda2, enqueue the kernel, wait on and
// release events[0], enqueue the read-back, wait on and release events[1].
//
//   AI   - host copy of the working matrix (presumably the read-back target;
//          the read call's remaining arguments are not visible — confirm)
//   diag - element [i][i] of the working matrix, read by invertge()
//   i    - row index of the current step
//   n2   - invertge() passes n*2
//   lda2 - row stride passed by invertge()
//
// NOTE(review): the return statements are not visible in this copy, and
// invertge() ignores the return value.
int run_GEStep2_kernel(cl_float * AI, cl_float diag, int i, int n2, int lda2) {
799
* the input array to the kernel. This array will eventually be modified
800
* to the inverted array.
802
status = clSetKernelArg(GEStep2_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
803
if (status != CL_SUCCESS) {
804
printf("Error: Setting kernel argument. (AI)\n");
809
status = clSetKernelArg(GEStep2_kernel, 1, sizeof(cl_float), (void *)&diag);
810
if (status != CL_SUCCESS) {
811
printf("Error: Setting kernel argument. (diag)\n");
816
status = clSetKernelArg(GEStep2_kernel, 2, sizeof(int), (void *)&i);
817
if (status != CL_SUCCESS) {
818
printf("Error: Setting kernel argument. (i)\n");
823
status = clSetKernelArg(GEStep2_kernel, 3, sizeof(int), (void *)&n2);
824
if (status != CL_SUCCESS) {
825
printf("Error: Setting kernel argument. (n2)\n");
830
status = clSetKernelArg(GEStep2_kernel, 4, sizeof(int), (void *)&lda2);
831
if (status != CL_SUCCESS) {
832
printf("Error: Setting kernel argument. (lda2)\n");
837
* Enqueue a kernel run call.
839
status = clEnqueueNDRangeKernel(commandQueue,
848
if (status != CL_SUCCESS) {
849
printf("Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)\n");
853
/* wait for the kernel call to finish execution */
854
status = clWaitForEvents(1, &events[0]);
855
if (status != CL_SUCCESS) {
856
printf("Error: Waiting for kernel run to finish. (clWaitForEvents)\n");
860
status = clReleaseEvent(events[0]);
861
if (status != CL_SUCCESS) {
862
printf("Error: Release event object. (clReleaseEvent)\n");
866
/* Enqueue readBuffer*/
867
//Note: we are reading back from inputBuffer since AI is modified directly in kernel
868
status = clEnqueueReadBuffer(commandQueue,
872
globalThreads[0] * sizeof(cl_float),
877
if (status != CL_SUCCESS) {
878
printf("Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer)\n");
882
/* Wait for the read buffer to finish execution */
883
status = clWaitForEvents(1, &events[1]);
884
if (status != CL_SUCCESS) {
885
printf("Error: Waiting for read buffer call to finish. (clWaitForEvents)\n");
889
status = clReleaseEvent(events[1]);
890
if (status != CL_SUCCESS) {
891
printf("Error: Release event object. (clReleaseEvent)\n");
897
// Runs the GEStep3 kernel once and waits for it.
// Same sequence as run_GEStep1A_kernel: bind inputBuffer, i, n2 and lda2,
// enqueue the kernel, wait on and release events[0], enqueue the read-back,
// wait on and release events[1].
//
//   AI   - host copy of the working matrix (presumably the read-back target;
//          the read call's remaining arguments are not visible — confirm)
//   i    - row index of the current step
//   n2   - invertge() passes n*2
//   lda2 - row stride passed by invertge()
//
// NOTE(review): the return statements are not visible in this copy, and
// invertge() ignores the return value.
int run_GEStep3_kernel(cl_float * AI, int i, int n2, int lda2) {
902
* The input array to the kernel. This array will eventually be modified
903
* to the inverted array.
905
status = clSetKernelArg(GEStep3_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
906
if (status != CL_SUCCESS) {
907
printf("Error: Setting kernel argument. (input)\n");
912
status = clSetKernelArg(GEStep3_kernel, 1, sizeof(int), (void *)&i);
913
if (status != CL_SUCCESS) {
914
printf("Error: Setting kernel argument. (i)\n");
919
status = clSetKernelArg(GEStep3_kernel, 2, sizeof(int), (void *)&n2);
920
if (status != CL_SUCCESS) {
921
printf("Error: Setting kernel argument. (n2)\n");
926
status = clSetKernelArg(GEStep3_kernel, 3, sizeof(int), (void *)&lda2);
927
if (status != CL_SUCCESS) {
928
printf("Error: Setting kernel argument. (lda2)\n");
933
* Enqueue a kernel run call.
935
status = clEnqueueNDRangeKernel(commandQueue,
944
if (status != CL_SUCCESS) {
945
printf("Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)\n");
949
/* wait for the kernel call to finish execution */
950
status = clWaitForEvents(1, &events[0]);
951
if (status != CL_SUCCESS) {
952
printf("Error: Waiting for kernel run to finish. (clWaitForEvents)\n");
956
status = clReleaseEvent(events[0]);
957
if (status != CL_SUCCESS) {
958
printf("Error: Release event object. (clReleaseEvent)\n");
962
/* Enqueue readBuffer*/
963
//Note: we are reading back from inputBuffer since AI is modified directly in kernel
964
status = clEnqueueReadBuffer(commandQueue,
968
globalThreads[0] * sizeof(cl_float),
973
if (status != CL_SUCCESS) {
974
printf("Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer)\n");
978
/* Wait for the read buffer to finish execution */
979
status = clWaitForEvents(1, &events[1]);
980
if (status != CL_SUCCESS) {
981
printf("Error: Waiting for read buffer call to finish. (clWaitForEvents)\n");
985
status = clReleaseEvent(events[1]);
986
if(status != CL_SUCCESS) {
987
printf("Error: Release event object. (clReleaseEvent)\n");
994
// Drives the elimination kernels over the working matrix AI_d.
// A forward pass calls GEStep1A for rows 0..n-1. A backward pass then walks
// rows n-1..0, reading the diagonal element from the host copy and calling
// GEStep2 followed by GEStep3.
//
//   AI_d - working matrix prepared by invert()
//   lda  - leading dimension chosen by invert()
//   n    - matrix dimension
//
// NOTE(review): `lda2` is declared on a line not visible in this copy
// (presumably lda*2, matching the row stride invert() uses — confirm). The
// status returned by the kernel runners is ignored here.
void invertge(cl_float * AI_d, int lda, int n) {
996
// perform elementary row operations till A in AI becomes identity matrix
997
for (int i = 0; i < n; i++) {
999
run_GEStep1A_kernel(AI_d,i,n*2, lda2);
1002
for (int i = n-1; i >= 0; i--) {
1003
// The initial 1.0 is overwritten immediately by the diagonal element.
cl_float diag = 1.0;
1004
diag=AI_d[i*lda2+i];
1006
run_GEStep2_kernel(AI_d,diag,i,n*2, lda2);
1007
run_GEStep3_kernel(AI_d,i,n*2, lda2);
1011
/* inverts nxn matrix input and stores the result in output */
// Builds a working matrix AI_d whose rows hold 2*lda floats: the first n
// columns of row i are a copy of input row i, and column n+i is set to 1
// (an identity block beside the input). It then creates the OpenCL buffer
// over AI_d, runs invertge(), prints timing and an error figure to stderr,
// and copies columns n..2n-1 of every row into `output`.
// NOTE(review): no free(AI_d) and no release of the previous inputBuffer are
// visible in this copy, while main() calls invert() once per iteration —
// check for a leak against the full source.
1012
void invert(cl_float * input, cl_float *output, int n) {
1013
fprintf(stderr,"starting inversion n = %d ", n);
1014
volatile clock_t gputime;
1017
// `&` binds tighter than `|`: n is rounded up to a multiple of 16 and the
// result is then OR-ed with 16.
int lda = ((n+15)&~15|16);
1018
cl_float * AI_d = (cl_float *)malloc(sizeof(cl_float)*n*lda*2);
1019
memset(AI_d,0,sizeof(cl_float)*n*lda*2);
1020
for (int i = 0; i < n; i++) {
1021
// Left block: row i of the input matrix.
memcpy(&AI_d[lda*i*2], &input[n*i], sizeof(cl_float)*n);
1022
// Right block: 1 on the diagonal, zeros elsewhere from the memset above.
AI_d[lda*i*2+n+i] = 1;
1027
/////////////////////////////////////////////////////////////////
1028
// Create OpenCL memory buffer
1029
/////////////////////////////////////////////////////////////////
// NOTE(review): the buffer holds globalThreads[0] floats, not n*lda*2;
// verify that GLOBAL_WORK_SIZE covers the whole working matrix.
1030
inputBuffer = clCreateBuffer(context,
1031
CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1032
sizeof(cl_float) * globalThreads[0],
1035
if (status != CL_SUCCESS) {
1036
printf("Error: clCreateBuffer (inputBuffer)\n");
1039
// Note: there's no output buffer. In kernel, AI_d is modified directly.
1040
// Thus, we should read the result back to host from inputBuffer as well.
1042
invertge(AI_d, lda, n);
1043
// NOTE(review): the start-time assignment to gputime is not visible in this
// copy. Dividing clock ticks by 1e3 gives milliseconds only if
// CLOCKS_PER_SEC is 1,000,000 — confirm for the target platforms.
gputime=clock()-gputime;fprintf(stderr, " %7.1f ms ",gputime/1.e3f);
1044
fprintf(stderr, " %7.2f Gflops", 1e-3*(3.0)*n*n*n/3.0/gputime);
1047
// let's verify that
1050
// multiply inverse*xcopy, should be Identity matrix
1051
for (int k = 0; k < n; k++) {
1052
for (int j = 0; j < n; j++) {
1054
for (int i = 0; i < n; i++) {
1055
// Dot product of row j of the computed inverse (right block of AI) with
// column k of A. AI and A are declared on lines not visible in this copy.
sum += AI[j*lda*2+n+i]*A[i*n+k];
1060
// Accumulates the squared deviation from 1; the condition selecting which
// entries use this formula is not visible here.
error += (1.0-sum) * (1.0-sum);
1064
fprintf(stderr, " %6.2f SSE", error);
1067
//copy the result to output
1068
for (int i = 0; i < n; i++) {
1069
memcpy(&output[n*i], &AI_d[lda*i*2+n], sizeof(cl_float)*n);
1072
fprintf(stderr," done!\n");