1
function GPUFFTDemo1(fwidth)
2
% GPUFFTDemo1([fwidth=11]) - Demonstrate use of GPGPU computing for 2D-FFT.
4
% This demo makes use of the FOSS GPUmat toolbox to perform a GPU
5
% accelerated 2D FFT + filtering in frequency space + 2D inverse FFT on our
6
% favourite bunnies. GPUmat allows to use NVidia's CUDA gpu computing
7
% framework on supported NVidia gpu's (GeForce-8000 series and later, aka
8
% Direct3D-10 or OpenGL-3 capable).
10
% It shows how a Psychtoolbox floating point texture (with our bunnies
11
% inside) can be efficiently passed to GPUmat as a matrix of GPUsingle data
12
% type, which is stored and processed on the GPU. Then it uses GPUmat's fft
13
% routines for forward/inverse fft's and matrix manipulation. Then it
14
% returns the final image to Psychtoolbox as a new floating point texture
15
% for display. The same steps are carried out with Matlab/Octave's regular
16
% fft routines on the cpu for reference.
18
% Requires the freely downloadable NVidia CUDA-5.0 SDK/Runtime and the free
19
% and open-source GPUmat toolbox as well as a compute capable NVidia
24
% 'fwidth' = Width of low-pass filter kernel in frequency space units.
28
% 5.02.2013 mk Written.
30
% Default filter width is 11 units:
31
if nargin < 1 || isempty(fwidth)
35
% Running under PTB-3 hopefully?
38
% Skip timing tests and calibrations for this demo:
39
oldskip = Screen('Preference','SkipSyncTests', 2);
41
% Open onscreen window with black background on (external) screen,
42
% enable GPGPU computing support:
43
screenid = max(Screen('Screens'));
45
PsychImaging('PrepareConfiguration');
46
% We explicitely request the GPUmat based api:
47
PsychImaging('AddTask', 'General', 'UseGPGPUCompute', 'GPUmat');
48
w = PsychImaging('OpenWindow', screenid, 0);
51
Screen('TextSize', w, 28);
52
DrawFormattedText(w, 'Please be patient throughout the demo.\nSome benchmark steps are time intense...', 'center', 'center', 255);
55
% Read our beloved bunny image from filesystem:
56
bunnyimg = imread([PsychtoolboxRoot 'PsychDemos/konijntjes1024x768.jpg']);
58
% Add a fourth neutral alpha channel, so we can create a 4-channel
59
% RGBA texture. Why? Some GPU compute api's, e.g., NVidia's CUDA,
60
% don't like RGB textures, so we need to do this to avoid errors:
61
bunnyimg(:,:,4) = 255;
63
% Turn it into a double matrix for conversion into a floating point
64
% texture, and normalize/convert its color range from 0-255 to 0-1:
65
dbunny = double(bunnyimg)/255;
67
% Maketexture a texture out of it in float precision with upright
68
% texture orientation, so further processing in teamwork between
69
% GPUmat and Psychtoolbox is simplified. Note: This conversion
70
% would turn a luminance texture into a troublesome RGB texture,
71
% which would only work on Linux and maybe Windows, but not on OSX.
72
% We avoid this by never passing in a luminance texture if it is
73
% intended to be used with GPUmat aka NVidia's CUDA framework.
74
bunnytex = Screen('MakeTexture', w, dbunny, [], [], 2, 1);
76
%% Do the FFT -> manipulate -> IFFT cycle once in Matlab/Octave as a
79
% Extract 2nd layer, the green color channel as an approximation of
81
dbunny = dbunny(:,:,2);
83
% Show it pre-transform:
86
title('Pre-FFT Bunny.');
88
% Perform 2D-FFT in Matlab/Octave on CPU:
91
% Shift DC component to center of spectrum "image":
94
% Define a low-pass filter in frequency space, ie., an amplitude mask
95
% to point-wise multiply with the FFT spectrum of FFT transformed
97
% fwidth controls frequency - lower values = lower cutoff frequency:
100
[x,y] = meshgrid(-hw:hw,-hh:hh);
101
mask = exp(-((x/fwidth).^2)-((y/fwidth).^2));
103
% Quick and dirty trick: Cut off a bit from mask, so size of mask and
104
% frequency spectrum matches -- Don't do this at home (or for real research scripts)!
105
mask = mask(2:end, 2:end);
109
title('Gaussian low pass filter mask for FFT space filtering:');
114
% Low pass filter by point-wise multiply of fs with filter 'mask' in
118
% Invert shift of filtered fs:
121
% Perform inverse 2D-FFT in Matlab/Octave on CPU:
125
tcpu = (GetSecs - tcpu) / trials;
126
fprintf('Measured per trial runtime of CPU FFT: %f msecs [n=%i trials].\n', tcpu * 1000, trials);
128
% Extract real component for display:
133
title('Post-FFT->Filter->IFFT Bunny.');
135
%% Perform FFT->Process->IFFT on GPU via GPUmat / CUDA:
137
% First Show input image:
138
Screen('DrawTexture', w, bunnytex);
139
Screen('TextSize', w, 28);
140
DrawFormattedText(w, 'Original pre-FFT Bunny:\nPress key to continue.', 'center', 40, [255 255 0]);
144
% Create GPUsingle matrix with input image from RGBA float bunnytex:
145
A = GPUTypeFromToGL(0, bunnytex, [], [], 0);
147
% Extract 2nd layer, the green channel from it for further use:
148
% Note: The 1st dimension of a GPUsingle matrix created from a
149
% Psychtoolbox texture always contains the "color channel":
152
% Squeeze it to remove the singleton 1st dimension of A, because
153
% fft2 and ifft2 can only work on 2D input matrices, not 3D
154
% matrices, not even ones with a singleton dimension, ie., a 2D
155
% matrix in disguise:
158
% Perform forward 2D FFT on GPU:
163
% First shift the spectrums origin (DC component aka 0 Hz frequency) to
164
% the center of the FFT image:
167
% Generate the amplitude spectrum, scaled to 1/100th via abs(FS)/100.0,
168
% then converte the image into a texture 'fftmag' and display it:
169
fftmag = GPUTypeFromToGL(1, abs(FS)/100.0, [], [], 0);
170
Screen('DrawTexture', w, fftmag);
171
DrawFormattedText(w, 'Amplitude spectrum post-FFT Bunny:\nPress key to continue.', 'center', 40, [0 255 0]);
175
% Turn our filter 'mask' from cpu version above into a matrix on GPU:
176
% We must also transpose the mask, because 'mask' was generated to
177
% match the shape of the fs matrix on the cpu in Matlab/Octave. As
178
% Matlab and Octave use column-major storage, whereas Psychtoolbox uses
179
% row-major storage, all the GPU variables we got from Psychtoolbox
180
% textures are transposed with respect to the Matlab version. A simple
181
% transpose fixes this for our mask to match GPU format:
182
FM = transpose(GPUsingle(mask));
184
% Measure execution time on GPU. The cudaThreadSynchronize() command
185
% makes sure we are actually measuring GPU timing with the GetSecs():
186
cudaThreadSynchronize;
190
% Filter the amplitude spectrum by point-wise multiply with filter FM:
193
% Shift back DC component to original position to prepare inverse FFT:
196
% Process inverse 2D FFT on GPU:
200
cudaThreadSynchronize;
201
tgpu = (GetSecs - tgpu) / trials;
202
fprintf('Measured per trial runtime of GPU FFT: %f msecs [n=%i trials].\n', tgpu * 1000, trials);
203
fprintf('Speedup GPU vs. CPU: %f\n', tcpu / tgpu);
205
% Extract real component for display:
208
% Convert R back into a floating point luminance texture 'tex' for
209
% processing/display by Screen. Here it is fine to pass a
210
% single-layer matrix for getting a luminance texture, because we
211
% don't use Screen('MakeTexture') or similar and don't perform the
212
% normalization step which would convert into a troublesome RGB
214
tex = GPUTypeFromToGL(1, R, [], [], 0);
217
Screen('DrawTexture', w, tex);
218
DrawFormattedText(w, 'GPU-Processed FFT->Filter->IFFT Bunny:\nPress key to continue.', 'center', 40, [0 255 0]);
222
% Reset interop cache before closing the windows and textures:
225
% Close window, which will also release all textures:
228
% Restore sync test setting:
229
Screen('Preference','SkipSyncTests', oldskip);
234
% Reset interop cache before closing the windows and textures:
240
% Restore sync test setting:
241
Screen('Preference','SkipSyncTests', oldskip);
243
psychrethrow(psychlasterror);
249
% Documentation of another, not yet tested, more complex and probably
250
% higher performance, way of doing GPU accelerated FFT with GPUmat:
252
% fftType = cufftType;
253
% fftDir = cufftTransformDirections;
257
% [status, plan] = cufftPlan2d(plan, size(d_A, 1), size(d_A, 2), fftType.CUFFT_R2C);
258
% cufftCheckStatus(status, 'Error in cufftPlan2D');
261
% [status] = cufftExecR2C(plan, getPtr(d_A), getPtr(d_B), fftDir.CUFFT_FORWARD);
262
% cufftCheckStatus(status, 'Error in cufftExecR2C');
265
% [status] = cufftExecC2R(plan, getPtr(d_B), getPtr(d_A), fftDir.CUFFT_INVERSE);
266
% cufftCheckStatus(status, 'Error in cufftExecC2C');
268
% % results should be scaled by 1/N if compared to CPU
269
% % h_B = 1/N*single(d_A);
271
% [status] = cufftDestroy(plan);
272
% cufftCheckStatus(status, 'Error in cuffDestroyPlan');