4
#ifndef USE_EXTERNAL_KERNEL
5
#define KERNEL( ... )# __VA_ARGS__ "\n"
6
// Double precision is a default of spreadsheets
7
// cl_khr_fp64: Khronos extension
8
// cl_amd_fp64: AMD extension
9
// use build option outside to define fp_t
10
/////////////////////////////////////////////
11
const char *kernel_src = KERNEL(
12
\n#ifdef KHR_DP_EXTENSION\n
13
\n#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n
14
\n#elif AMD_DP_EXTENSION\n
15
\n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n
18
__kernel void composeRGBPixel(__global uint *tiffdata, int w, int h,int wpl, __global uint *output)
20
int i = get_global_id(1);
21
int j = get_global_id(0);
22
int tiffword,rval,gval,bval;
25
if ((i >= h) || (j >= w))
28
tiffword = tiffdata[i * w + j];
29
rval = ((tiffword) & 0xff);
30
gval = (((tiffword) >> 8) & 0xff);
31
bval = (((tiffword) >> 16) & 0xff);
32
output[i*wpl+j] = (rval << (8 * (sizeof(uint) - 1 - 0))) | (gval << (8 * (sizeof(uint) - 1 - 1))) | (bval << (8 * (sizeof(uint) - 1 - 2)));
37
\n__kernel void pixSubtract_inplace(__global int *dword, __global int *sword,
38
const int wpl, const int h)
40
const unsigned int row = get_global_id(1);
41
const unsigned int col = get_global_id(0);
42
const unsigned int pos = row * wpl + col;
45
if (row >= h || col >= wpl)
48
*(dword + pos) &= ~(*(sword + pos));
53
\n__kernel void pixSubtract(__global int *dword, __global int *sword,
54
const int wpl, const int h, __global int *outword)
56
const unsigned int row = get_global_id(1);
57
const unsigned int col = get_global_id(0);
58
const unsigned int pos = row * wpl + col;
61
if (row >= h || col >= wpl)
64
*(outword + pos) = *(dword + pos) & ~(*(sword + pos));
69
\n__kernel void pixAND(__global int *dword, __global int *sword, __global int *outword,
70
const int wpl, const int h)
72
const unsigned int row = get_global_id(1);
73
const unsigned int col = get_global_id(0);
74
const unsigned int pos = row * wpl + col;
77
if (row >= h || col >= wpl)
80
*(outword + pos) = *(dword + pos) & (*(sword + pos));
85
\n__kernel void pixOR(__global int *dword, __global int *sword, __global int *outword,
86
const int wpl, const int h)
88
const unsigned int row = get_global_id(1);
89
const unsigned int col = get_global_id(0);
90
const unsigned int pos = row * wpl + col;
93
if (row >= h || col >= wpl)
96
*(outword + pos) = *(dword + pos) | (*(sword + pos));
101
\n__kernel void morphoDilateHor_5x5(__global int *sword,__global int *dword,
102
const int wpl, const int h)
104
const unsigned int pos = get_global_id(0);
105
unsigned int prevword, nextword, currword,tempword;
106
unsigned int destword;
107
const int col = pos % wpl;
110
if (pos >= (wpl * h))
114
currword = *(sword + pos);
117
//Handle boundary conditions
121
prevword = *(sword + pos - 1);
126
nextword = *(sword + pos + 1);
130
//1 bit to left and 1 bit to right
131
//Get the max value on LHS of every pixel
132
tempword = (prevword << (31)) | ((currword >> 1));
133
destword |= tempword;
134
//Get max value on RHS of every pixel
135
tempword = (currword << 1) | (nextword >> (31));
136
destword |= tempword;
138
//2 bit to left and 2 bit to right
139
//Get the max value on LHS of every pixel
140
tempword = (prevword << (30)) | ((currword >> 2));
141
destword |= tempword;
142
//Get max value on RHS of every pixel
143
tempword = (currword << 2) | (nextword >> (30));
144
destword |= tempword;
147
*(dword + pos) = destword;
153
\n__kernel void morphoDilateVer_5x5(__global int *sword,__global int *dword,
154
const int wpl, const int h)
156
const int col = get_global_id(0);
157
const int row = get_global_id(1);
158
const unsigned int pos = row * wpl + col;
159
unsigned int tempword;
160
unsigned int destword;
164
if (row >= h || col >= wpl)
167
destword = *(sword + pos);
170
i = (row - 2) < 0 ? row : (row - 2);
171
tempword = *(sword + i*wpl + col);
172
destword |= tempword;
175
i = (row - 1) < 0 ? row : (row - 1);
176
tempword = *(sword + i*wpl + col);
177
destword |= tempword;
180
i = (row >= (h - 1)) ? row : (row + 1);
181
tempword = *(sword + i*wpl + col);
182
destword |= tempword;
185
i = (row >= (h - 2)) ? row : (row + 2);
186
tempword = *(sword + i*wpl + col);
187
destword |= tempword;
189
*(dword + pos) = destword;
194
\n__kernel void morphoDilateHor(__global int *sword,__global int *dword,const int xp, const int xn, const int wpl, const int h)
196
const int col = get_global_id(0);
197
const int row = get_global_id(1);
198
const unsigned int pos = row * wpl + col;
199
unsigned int parbitsxp, parbitsxn, nwords;
200
unsigned int destword, tempword, lastword, currword;
201
unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
202
int i, j, siter, eiter;
205
if (pos >= (wpl*h) || (xn < 1 && xp < 1))
208
currword = *(sword + pos);
220
siter = (col - nwords);
221
eiter = (col + nwords);
227
firstword = *(sword + pos - 1);
230
if (col == (wpl - 1))
233
secondword = *(sword + pos + 1);
235
//Last partial bits on either side
236
for (i = 1; i <= parbitsxp; i++)
238
//Get the max value on LHS of every pixel
239
tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0x0 : (firstword << (32-i)) | ((currword >> i));
241
destword |= tempword;
243
//Get max value on RHS of every pixel
244
tempword = (currword << i) | (secondword >> (32 - i));
245
destword |= tempword;
248
//Return if halfwidth <= 1 word
253
destword |= firstword;
257
destword |= secondword;
260
*(dword + pos) = destword;
267
firstword = *(sword + row*wpl + siter);
272
lastword = *(sword + row*wpl + eiter);
274
for ( i = 1; i < nwords; i++)
280
secondword = *(sword + row*wpl + siter + i);
282
lprevword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
284
firstword = secondword;
286
if ((siter + i + 1) < 0)
289
secondword = *(sword + row*wpl + siter + i + 1);
291
lnextword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
294
if ((eiter - i) >= wpl)
297
firstword = *(sword + row*wpl + eiter - i);
299
rnextword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
301
lastword = firstword;
302
if ((eiter - i - 1) >= wpl)
305
firstword = *(sword + row*wpl + eiter - i - 1);
307
rprevword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
309
for (j = 1; j < 32; j++)
312
tempword = (lprevword << j) | (lnextword >> (32 - j));
313
destword |= tempword;
316
tempword = (rprevword << j) | (rnextword >> (32 - j));
317
destword |= tempword;
320
destword |= lprevword;
321
destword |= lnextword;
322
destword |= rprevword;
323
destword |= rnextword;
325
lastword = firstword;
326
firstword = secondword;
329
*(dword + pos) = destword;
334
\n__kernel void morphoDilateHor_32word(__global int *sword,__global int *dword,
336
const int wpl, const int h,
339
const int col = get_global_id(0);
340
const int row = get_global_id(1);
341
const unsigned int pos = row * wpl + col;
342
unsigned int prevword, nextword, currword,tempword;
343
unsigned int destword;
347
if (pos >= (wpl * h))
350
currword = *(sword + pos);
353
//Handle boundary conditions
357
prevword = *(sword + pos - 1);
362
nextword = *(sword + pos + 1);
364
for (i = 1; i <= halfwidth; i++)
366
//Get the max value on LHS of every pixel
367
if (i == halfwidth && isEven)
373
tempword = (prevword << (32-i)) | ((currword >> i));
376
destword |= tempword;
378
//Get max value on RHS of every pixel
379
tempword = (currword << i) | (nextword >> (32 - i));
381
destword |= tempword;
384
*(dword + pos) = destword;
389
\n__kernel void morphoDilateVer(__global int *sword,__global int *dword,
391
const int wpl, const int h,
394
const int col = get_global_id(0);
395
const int row = get_global_id(1);
396
const unsigned int pos = row * wpl + col;
397
unsigned int tempword;
398
unsigned int destword;
402
if (row >= h || col >= wpl)
405
destword = *(sword + pos);
407
//Set start position and end position considering the boundary conditions
408
siter = (row - yn) < 0 ? 0 : (row - yn);
409
eiter = (row >= (h - yp)) ? (h - 1) : (row + yp);
411
for (i = siter; i <= eiter; i++)
413
tempword = *(sword + i*wpl + col);
415
destword |= tempword;
418
*(dword + pos) = destword;
423
\n__kernel void morphoErodeHor_5x5(__global int *sword,__global int *dword,
424
const int wpl, const int h)
426
const unsigned int pos = get_global_id(0);
427
unsigned int prevword, nextword, currword,tempword;
428
unsigned int destword;
429
const int col = pos % wpl;
432
if (pos >= (wpl * h))
435
currword = *(sword + pos);
438
//Handle boundary conditions
442
prevword = *(sword + pos - 1);
447
nextword = *(sword + pos + 1);
451
//1 bit to left and 1 bit to right
452
//Get the min value on LHS of every pixel
453
tempword = (prevword << (31)) | ((currword >> 1));
454
destword &= tempword;
455
//Get min value on RHS of every pixel
456
tempword = (currword << 1) | (nextword >> (31));
457
destword &= tempword;
459
//2 bit to left and 2 bit to right
460
//Get the min value on LHS of every pixel
461
tempword = (prevword << (30)) | ((currword >> 2));
462
destword &= tempword;
463
//Get min value on RHS of every pixel
464
tempword = (currword << 2) | (nextword >> (30));
465
destword &= tempword;
468
*(dword + pos) = destword;
474
\n__kernel void morphoErodeVer_5x5(__global int *sword,__global int *dword,
475
const int wpl, const int h,
476
const int fwmask, const int lwmask)
478
const int col = get_global_id(0);
479
const int row = get_global_id(1);
480
const unsigned int pos = row * wpl + col;
481
unsigned int tempword;
482
unsigned int destword;
486
if (row >= h || col >= wpl)
489
destword = *(sword + pos);
491
if (row < 2 || row >= (h - 2))
498
//i = (row - 2) < 0 ? row : (row - 2);
500
tempword = *(sword + i*wpl + col);
501
destword &= tempword;
504
//i = (row - 1) < 0 ? row : (row - 1);
506
tempword = *(sword + i*wpl + col);
507
destword &= tempword;
510
//i = (row >= (h - 1)) ? row : (row + 1);
512
tempword = *(sword + i*wpl + col);
513
destword &= tempword;
516
//i = (row >= (h - 2)) ? row : (row + 2);
518
tempword = *(sword + i*wpl + col);
519
destword &= tempword;
525
if (col == (wpl - 1))
532
*(dword + pos) = destword;
537
\n__kernel void morphoErodeHor(__global int *sword,__global int *dword, const int xp, const int xn, const int wpl,
538
const int h, const char isAsymmetric, const int rwmask, const int lwmask)
540
const int col = get_global_id(0);
541
const int row = get_global_id(1);
542
const unsigned int pos = row * wpl + col;
543
unsigned int parbitsxp, parbitsxn, nwords;
544
unsigned int destword, tempword, lastword, currword;
545
unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
546
int i, j, siter, eiter;
549
if (pos >= (wpl*h) || (xn < 1 && xp < 1))
552
currword = *(sword + pos);
564
siter = (col - nwords);
565
eiter = (col + nwords);
569
firstword = 0xffffffff;
571
firstword = *(sword + pos - 1);
574
if (col == (wpl - 1))
575
secondword = 0xffffffff;
577
secondword = *(sword + pos + 1);
579
//Last partial bits on either side
580
for (i = 1; i <= parbitsxp; i++)
582
//Get the max value on LHS of every pixel
583
tempword = (firstword << (32-i)) | ((currword >> i));
584
destword &= tempword;
586
//Get max value on RHS of every pixel
587
tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0xffffffff : (currword << i) | (secondword >> (32 - i));
589
//tempword = (currword << i) | (secondword >> (32 - i));
590
destword &= tempword;
593
//Return if halfwidth <= 1 word
598
destword &= firstword;
602
destword &= secondword;
605
//Clear boundary pixels
610
if (col == (wpl - 1))
614
*(dword + pos) = destword;
619
firstword = 0xffffffff;
621
firstword = *(sword + row*wpl + siter);
624
lastword = 0xffffffff;
626
lastword = *(sword + row*wpl + eiter);
629
for ( i = 1; i < nwords; i++)
633
secondword = 0xffffffff;
635
secondword = *(sword + row*wpl + siter + i);
637
lprevword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
639
firstword = secondword;
641
if ((siter + i + 1) < 0)
642
secondword = 0xffffffff;
644
secondword = *(sword + row*wpl + siter + i + 1);
646
lnextword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
649
if ((eiter - i) >= wpl)
650
firstword = 0xffffffff;
652
firstword = *(sword + row*wpl + eiter - i);
654
rnextword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
656
lastword = firstword;
657
if ((eiter - i - 1) >= wpl)
658
firstword = 0xffffffff;
660
firstword = *(sword + row*wpl + eiter - i - 1);
662
rprevword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
664
for (j = 0; j < 32; j++)
667
tempword = (lprevword << j) | (lnextword >> (32 - j));
668
destword &= tempword;
671
tempword = (rprevword << j) | (rnextword >> (32 - j));
672
destword &= tempword;
675
destword &= lprevword;
676
destword &= lnextword;
677
destword &= rprevword;
678
destword &= rnextword;
680
lastword = firstword;
681
firstword = secondword;
686
//Clear boundary pixels
687
if (col < (nwords - 1))
689
else if (col == (nwords - 1))
691
else if (col > (wpl - nwords))
693
else if (col == (wpl - nwords))
697
*(dword + pos) = destword;
702
\n__kernel void morphoErodeHor_32word(__global int *sword,__global int *dword,
703
const int halfwidth, const int wpl,
704
const int h, const char clearBoundPixH,
705
const int rwmask, const int lwmask,
708
const int col = get_global_id(0);
709
const int row = get_global_id(1);
710
const unsigned int pos = row * wpl + col;
711
unsigned int prevword, nextword, currword,tempword, destword;
715
if (pos >= (wpl * h))
718
currword = *(sword + pos);
721
//Handle boundary conditions
725
prevword = *(sword + pos - 1);
730
nextword = *(sword + pos + 1);
732
for (i = 1; i <= halfwidth; i++)
734
//Get the min value on LHS of every pixel
735
tempword = (prevword << (32-i)) | ((currword >> i));
737
destword &= tempword;
739
//Get min value on RHS of every pixel
740
if (i == halfwidth && isEven)
742
tempword = 0xffffffff;
746
tempword = (currword << i) | (nextword >> (32 - i));
749
destword &= tempword;
758
else if (col == (wpl - 1))
764
*(dword + pos) = destword;
769
\n__kernel void morphoErodeVer(__global int *sword,__global int *dword,
771
const int wpl, const int h,
772
const char clearBoundPixV, const int yn)
774
const int col = get_global_id(0);
775
const int row = get_global_id(1);
776
const unsigned int pos = row * wpl + col;
777
unsigned int tempword, destword;
781
if (row >= h || col >= wpl)
784
destword = *(sword + pos);
786
//Set start position and end position considering the boundary conditions
787
siter = (row - yp) < 0 ? 0 : (row - yp);
788
eiter = (row >= (h - yn)) ? (h - 1) : (row + yn);
790
for (i = siter; i <= eiter; i++)
792
tempword = *(sword + i*wpl + col);
794
destword &= tempword;
797
//Clear boundary pixels
798
if (clearBoundPixV && ((row < yp) || ((h - row) <= yn)))
803
*(dword + pos) = destword;
807
// HistogramRect Kernel: Accumulate
808
// assumes 4 channels, i.e., bytes_per_pixel = 4
809
// assumes number of pixels is multiple of 8
810
// data is layed out as
812
// bin0 bin1 bin2... bin0...
813
// rpt0,1,2...256 rpt0,1,2...
815
\n#define HIST_REDUNDANCY 256\n
816
\n#define GROUP_SIZE 256\n
817
\n#define HIST_SIZE 256\n
818
\n#define NUM_CHANNELS 4\n
819
\n#define HR_UNROLL_SIZE 8 \n
820
\n#define HR_UNROLL_TYPE uchar8 \n
822
__attribute__((reqd_work_group_size(256, 1, 1)))
824
void kernel_HistogramRectAllChannels(
825
__global const uchar8 *data,
827
__global uint *histBuffer) {
831
int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
833
// for each pixel/channel, accumulate in global memory
834
for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS/HR_UNROLL_SIZE; pc += get_global_size(0) ) {
836
// channel bin thread
837
atomic_inc( &histBuffer[ 0*HIST_SIZE*HIST_REDUNDANCY + pixels.s0*HIST_REDUNDANCY + threadOffset ]); // ch0
838
atomic_inc( &histBuffer[ 0*HIST_SIZE*HIST_REDUNDANCY + pixels.s4*HIST_REDUNDANCY + threadOffset ]); // ch0
839
atomic_inc( &histBuffer[ 1*HIST_SIZE*HIST_REDUNDANCY + pixels.s1*HIST_REDUNDANCY + threadOffset ]); // ch1
840
atomic_inc( &histBuffer[ 1*HIST_SIZE*HIST_REDUNDANCY + pixels.s5*HIST_REDUNDANCY + threadOffset ]); // ch1
841
atomic_inc( &histBuffer[ 2*HIST_SIZE*HIST_REDUNDANCY + pixels.s2*HIST_REDUNDANCY + threadOffset ]); // ch2
842
atomic_inc( &histBuffer[ 2*HIST_SIZE*HIST_REDUNDANCY + pixels.s6*HIST_REDUNDANCY + threadOffset ]); // ch2
843
atomic_inc( &histBuffer[ 3*HIST_SIZE*HIST_REDUNDANCY + pixels.s3*HIST_REDUNDANCY + threadOffset ]); // ch3
844
atomic_inc( &histBuffer[ 3*HIST_SIZE*HIST_REDUNDANCY + pixels.s7*HIST_REDUNDANCY + threadOffset ]); // ch3
851
__attribute__((reqd_work_group_size(256, 1, 1)))
853
void kernel_HistogramRectOneChannel(
854
__global const uchar8 *data,
856
__global uint *histBuffer) {
860
int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
862
// for each pixel/channel, accumulate in global memory
863
for ( uint pc = get_global_id(0); pc < numPixels/HR_UNROLL_SIZE; pc += get_global_size(0) ) {
866
atomic_inc( &histBuffer[ pixels.s0*HIST_REDUNDANCY + threadOffset ]);
867
atomic_inc( &histBuffer[ pixels.s1*HIST_REDUNDANCY + threadOffset ]);
868
atomic_inc( &histBuffer[ pixels.s2*HIST_REDUNDANCY + threadOffset ]);
869
atomic_inc( &histBuffer[ pixels.s3*HIST_REDUNDANCY + threadOffset ]);
870
atomic_inc( &histBuffer[ pixels.s4*HIST_REDUNDANCY + threadOffset ]);
871
atomic_inc( &histBuffer[ pixels.s5*HIST_REDUNDANCY + threadOffset ]);
872
atomic_inc( &histBuffer[ pixels.s6*HIST_REDUNDANCY + threadOffset ]);
873
atomic_inc( &histBuffer[ pixels.s7*HIST_REDUNDANCY + threadOffset ]);
881
\n __attribute__((reqd_work_group_size(256, 1, 1)))
883
\n void kernel_HistogramRectAllChannels_Grey(
884
\n __global const uchar* data,
886
\n __global uint *histBuffer) { // each wg will write HIST_SIZE*NUM_CHANNELS into this result; cpu will accumulate across wg's
888
\n /* declare variables */
891
\n size_t groupId = get_group_id(0);
892
\n size_t localId = get_local_id(0); // 0 -> 256-1
893
\n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
894
\n uint numThreads = get_global_size(0);
896
\n /* accumulate in global memory */
897
\n for ( uint pc = get_global_id(0); pc < numPixels; pc += get_global_size(0) ) {
898
\n uchar value = data[ pc ];
899
\n int idx = value * get_global_size(0) + get_global_id(0);
900
\n histBuffer[ idx ]++;
904
\n } // kernel_HistogramRectAllChannels_Grey
908
// HistogramRect Kernel: Reduction
909
// only supports 4 channels
910
// each work group handles a single channel of a single histogram bin
912
__attribute__((reqd_work_group_size(256, 1, 1)))
914
void kernel_HistogramRectAllChannelsReduction(
915
int n, // unused pixel redundancy
916
__global uint *histBuffer,
917
__global int* histResult) {
920
int channel = get_group_id(0)/HIST_SIZE;
921
int bin = get_group_id(0)%HIST_SIZE;
924
// accumulate in register
925
for ( uint i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
926
value += histBuffer[ channel*HIST_SIZE*HIST_REDUNDANCY+bin*HIST_REDUNDANCY+i];
929
// reduction in local memory
930
__local int localHist[GROUP_SIZE];
931
localHist[get_local_id(0)] = value;
932
barrier(CLK_LOCAL_MEM_FENCE);
933
for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
934
if (get_local_id(0) < stride) {
935
value = localHist[ get_local_id(0)+stride];
937
barrier(CLK_LOCAL_MEM_FENCE);
938
if (get_local_id(0) < stride) {
939
localHist[ get_local_id(0)] += value;
941
barrier(CLK_LOCAL_MEM_FENCE);
944
// write reduction to final result
945
if (get_local_id(0) == 0) {
946
histResult[get_group_id(0)] = localHist[0];
948
} // kernel_HistogramRectAllChannels
954
__attribute__((reqd_work_group_size(256, 1, 1)))
956
void kernel_HistogramRectOneChannelReduction(
957
int n, // unused pixel redundancy
958
__global uint *histBuffer,
959
__global int* histResult) {
962
// int channel = get_group_id(0)/HIST_SIZE;
963
int bin = get_group_id(0)%HIST_SIZE;
966
// accumulate in register
967
for ( int i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
968
value += histBuffer[ bin*HIST_REDUNDANCY+i];
971
// reduction in local memory
972
__local int localHist[GROUP_SIZE];
973
localHist[get_local_id(0)] = value;
974
barrier(CLK_LOCAL_MEM_FENCE);
975
for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
976
if (get_local_id(0) < stride) {
977
value = localHist[ get_local_id(0)+stride];
979
barrier(CLK_LOCAL_MEM_FENCE);
980
if (get_local_id(0) < stride) {
981
localHist[ get_local_id(0)] += value;
983
barrier(CLK_LOCAL_MEM_FENCE);
986
// write reduction to final result
987
if (get_local_id(0) == 0) {
988
histResult[get_group_id(0)] = localHist[0];
990
} // kernel_HistogramRectOneChannelReduction
996
// each work group (x256) handles a histogram bin
997
\n __attribute__((reqd_work_group_size(256, 1, 1)))
999
\n void kernel_HistogramRectAllChannelsReduction_Grey(
1000
\n int n, // pixel redundancy that needs to be accumulated
1001
\n __global uint *histBuffer,
1002
\n __global uint* histResult) { // each wg accumulates 1 bin
1004
\n /* declare variables */
1007
\n size_t groupId = get_group_id(0);
1008
\n size_t localId = get_local_id(0); // 0 -> 256-1
1009
\n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
1010
\n uint numThreads = get_global_size(0);
1011
\n unsigned int hist = 0;
1013
\n /* accumulate in global memory */
1014
\n for ( uint p = 0; p < n; p+=GROUP_SIZE) {
1015
\n hist += histBuffer[ (get_group_id(0)*n + p)];
1018
\n /* reduction in local memory */
1019
\n // populate local memory
1020
\n __local unsigned int localHist[GROUP_SIZE];
1022
\n localHist[localId] = hist;
1023
\n barrier(CLK_LOCAL_MEM_FENCE);
1025
\n for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
1026
\n if (localId < stride) {
1027
\n hist = localHist[ (localId+stride)];
1029
\n barrier(CLK_LOCAL_MEM_FENCE);
1030
\n if (localId < stride) {
1031
\n localHist[ localId] += hist;
1033
\n barrier(CLK_LOCAL_MEM_FENCE);
1036
\n if (localId == 0)
1037
\n histResult[get_group_id(0)] = localHist[0];
1039
\n } // kernel_HistogramRectAllChannelsReduction_Grey
1043
// ThresholdRectToPix Kernel
1044
// only supports 4 channels
1045
// imageData is input image (24-bits/pixel)
1046
// pix is output image (1-bit/pixel)
1048
\n#define CHAR_VEC_WIDTH 8 \n
1049
\n#define PIXELS_PER_WORD 32 \n
1050
\n#define PIXELS_PER_BURST 8 \n
1051
\n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
1053
uchar s[PIXELS_PER_BURST*NUM_CHANNELS];
1054
uchar8 v[(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH];
1057
__attribute__((reqd_work_group_size(256, 1, 1)))
1059
void kernel_ThresholdRectToPix(
1060
__global const uchar8 *imageData,
1063
int wpl, // words per line
1064
__global int *thresholds,
1065
__global int *hi_values,
1066
__global int *pix) {
1068
// declare variables
1069
int pThresholds[NUM_CHANNELS];
1070
int pHi_Values[NUM_CHANNELS];
1071
for ( int i = 0; i < NUM_CHANNELS; i++) {
1072
pThresholds[i] = thresholds[i];
1073
pHi_Values[i] = hi_values[i];
1076
// for each word (32 pixels) in output image
1077
for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
1078
unsigned int word = 0; // all bits start at zero
1080
// for each burst in word
1081
for ( int b = 0; b < BURSTS_PER_WORD; b++) {
1085
for ( int i = 0; i < (PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH; i++ ) {
1086
pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + i];
1089
// for each pixel in burst
1090
for ( int p = 0; p < PIXELS_PER_BURST; p++) {
1091
for ( int c = 0; c < NUM_CHANNELS; c++) {
1092
unsigned char pixChan = pixels.s[p*NUM_CHANNELS + c];
1093
if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
1094
word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
1103
// only supports 1 channel
1105
uchar s[PIXELS_PER_BURST];
1106
uchar8 v[(PIXELS_PER_BURST)/CHAR_VEC_WIDTH];
1109
__attribute__((reqd_work_group_size(256, 1, 1)))
1111
void kernel_ThresholdRectToPix_OneChan(
1112
__global const uchar8 *imageData,
1115
int wpl, // words per line
1116
__global int *thresholds,
1117
__global int *hi_values,
1118
__global int *pix) {
1120
// declare variables
1123
for ( int i = 0; i < 1; i++) {
1124
pThresholds[i] = thresholds[i];
1125
pHi_Values[i] = hi_values[i];
1128
// for each word (32 pixels) in output image
1129
for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
1130
unsigned int word = 0; // all bits start at zero
1132
// for each burst in word
1133
for ( int b = 0; b < BURSTS_PER_WORD; b++) {
1137
for ( int i = 0; i < (PIXELS_PER_BURST)/CHAR_VEC_WIDTH; i++ ) {
1138
pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + i];
1141
// for each pixel in burst
1142
for ( int p = 0; p < PIXELS_PER_BURST; p++) {
1143
for ( int c = 0; c < 1; c++) {
1144
unsigned char pixChan = pixels.s[p + c];
1145
if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
1146
word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
1158
#endif // USE_EXTERNAL_KERNEL
1159
#endif //_OCL_KERNEL_H_
1160
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
1162
// Alternative histogram kernel written to use uchar and different global memory scattered write
1163
// was a little better for intel platforms but still not faster then native serial code
1165
/* data layed out as
1167
r,g,b,a,r,g,b,a,r,g,b,a nthreads/4 copies
1169
\n__attribute__((reqd_work_group_size(256, 1, 1)))
1171
\n void kernel_HistogramRectAllChannels_uchar(
1172
\n volatile __global const uchar *data,
1174
\n volatile __global uint *histBuffer) {
1176
\n // for each pixel/channel, accumulate in global memory
1177
\n for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS; pc += get_global_size(0) ) {
1178
\n uchar value = data[pc];
1179
\n int idx = value*get_global_size(0) + get_global_id(0);
1180
\n histBuffer[ idx ]++; // coalesced if same value
1182
\n } // kernel_HistogramRectAllChannels
1184
\n __attribute__((reqd_work_group_size(256, 1, 1)))
1186
\n void kernel_HistogramRectAllChannelsReduction_uchar(
1187
\n int n, // pixel redundancy that needs to be accumulated = nthreads/4
1188
\n __global uint4 *histBuffer,
1189
\n __global uint* histResult) { // each wg accumulates 1 bin (all channels within it
1191
\n // declare variables
1192
\n int binIdx = get_group_id(0);
1193
\n size_t groupId = get_group_id(0);
1194
\n size_t localId = get_local_id(0); // 0 -> 256-1
1195
\n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
1196
\n uint numThreads = get_global_size(0);
1197
\n uint4 hist = {0, 0, 0, 0};
1199
\n // accumulate in register
1200
\n for ( uint p = get_local_id(0); p < n; p+=GROUP_SIZE) {
1201
\n hist += histBuffer[binIdx*n+p];
1204
\n // reduction in local memory
1205
\n __local uint4 localHist[GROUP_SIZE];
1206
\n localHist[localId] = hist;
1207
\n barrier(CLK_LOCAL_MEM_FENCE);
1209
\n for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
1210
\n if (localId < stride) {
1211
\n hist = localHist[ localId+stride];
1213
\n barrier(CLK_LOCAL_MEM_FENCE);
1214
\n if (localId < stride) {
1215
\n localHist[ localId] += hist;
1217
\n barrier(CLK_LOCAL_MEM_FENCE);
1220
\n // write reduction to final result
1221
\n if (localId == 0) {
1222
\n histResult[0*HIST_SIZE+binIdx] = localHist[0].s0;
1223
\n histResult[1*HIST_SIZE+binIdx] = localHist[0].s1;
1224
\n histResult[2*HIST_SIZE+binIdx] = localHist[0].s2;
1225
\n histResult[3*HIST_SIZE+binIdx] = localHist[0].s3;
1228
\n } // kernel_HistogramRectAllChannels