3
# TODO: support for GPUBufferedArray?
4
import ensure_failure # import this in order to temporarily disable GPU
7
#import pycuda.autoinit as autoinit
8
import pycuda.driver as drv
10
from pycuda import gpuarray
11
from brian.experimental.cuda.buffering import *
13
from filterbank import Filterbank, RestructureFilterbank
14
from gputools import *
16
__all__ = ['LinearFilterbank']
18
class LinearFilterbank(Filterbank):
    '''
    Generalised parallel linear filterbank

    This filterbank allows you to construct a chain of linear filters in
    a bank so that each channel in the bank has its own filters. You pass
    the (b,a) parameters for the filter in the format specified in the
    function ``apply_linear_filterbank``.

    Note that there are additional GPU specific options:

    ``precision``
        Should be single for older GPUs.
    ``forcesync``
        Should be set to true to force a copy of GPU->CPU after each
        filter step, but not if the output is being used in ongoing
        GPU computations. By default this is ``True`` for compatibility.
    ``pagelocked_mem``
        Allocate faster pagelocked memory for GPU->CPU copies (doubles
        the copy rate — TODO confirm wording against the full file; the
        original sentence is cut off in this chunk).
    ``unroll_filterorder``
        Whether or not to unroll the loop for the filter order, normally
        this should be done for small filter orders. By default, it is done
        if the filter order is less than or equal to 32.
    '''
43
def __init__(self, source, b, a, samplerate=None,
             precision='double', forcesync=True, pagelocked_mem=True, unroll_filterorder=None):
    # Build a GPU-backed bank of cascaded IIR (b, a) filters, one filter
    # chain per channel of ``source``, compiling a specialised CUDA kernel.
    #
    # NOTE(review): this chunk of the file is incomplete — the interleaved
    # original line numbers jump, so several lines are missing (the n/m/p
    # assignments, some if/else branches, and the delimiters of the CUDA
    # source string). Gaps are flagged inline; confirm against the full file.
    # TODO: handle use_gpu = False here
    # self.__class__=nongpu_LinearFilterbank
    # self.__init__(b, a, samplerate=samplerate)
    # Automatically duplicate mono input to fit the desired output shape
    if b.shape[0]!=source.nchannels:
        if source.nchannels!=1:
            raise ValueError('Can only automatically duplicate source channels for mono sources, use RestructureFilterbank.')
        source = RestructureFilterbank(source, b.shape[0])
    Filterbank.__init__(self, source)
    if pycuda.context is None:
        # NOTE(review): the body of this branch is not visible in this
        # chunk — presumably a CUDA context/device is initialised here.
    # Scalar type used for all host and device arrays; 'double' needs a
    # GPU with double-precision support (see class docstring).
    self.precision=precision
    if self.precision=='double':
        self.precision_dtype=float64
    # NOTE(review): an else: line is missing from this chunk; the next
    # statement presumably belongs to it (single-precision fallback).
        self.precision_dtype=float32
    self.forcesync=forcesync
    self.pagelocked_mem=pagelocked_mem
    # NOTE(review): the assignments of n, m and p used below (presumably
    # channel count, filter order and cascade depth — TODO confirm) are
    # missing from this chunk.
    filt_b_gpu=array(b, dtype=self.precision_dtype)
    filt_a_gpu=array(a, dtype=self.precision_dtype)
    filt_state=zeros((n, m-1, p), dtype=self.precision_dtype)
    # NOTE(review): an `if self.pagelocked_mem:` guard appears to be
    # missing here — the next two pairs of assignments read like its two
    # branches (pagelocked vs. plain host memory).
    filt_y=drv.pagelocked_zeros((n,), dtype=self.precision_dtype)
    self.pre_x=drv.pagelocked_zeros((n,), dtype=self.precision_dtype)
    # NOTE(review): else: line (plain host memory fallback) missing here.
    filt_y=zeros(n, dtype=self.precision_dtype)
    self.pre_x=zeros(n, dtype=self.precision_dtype)
    #filt_x=zeros(n, dtype=self.precision_dtype)
    self.filt_b_gpu=gpuarray.to_gpu(filt_b_gpu.T.flatten()) # transform to Fortran order for better GPU mem
    self.filt_a_gpu=gpuarray.to_gpu(filt_a_gpu.T.flatten()) # access speeds
    self.filt_state=gpuarray.to_gpu(filt_state.T.flatten())
    #self.filt_x=gpuarray.to_gpu(filt_x)
    #self.filt_y=GPUBufferedArray(filt_y)
    #self.filt_y=gpuarray.to_gpu(filt_y)
    # Decide the default for loop unrolling when the caller did not choose.
    if unroll_filterorder is None:
        # NOTE(review): the condition (presumably `if m<=32:` per the class
        # docstring) and its matching else: are missing from this chunk.
        unroll_filterorder = True
        unroll_filterorder = False
    # TODO: improve code, check memory access patterns, maybe use local memory
    # NOTE(review): the CUDA kernel below was originally embedded in a
    # Python string (something like code = '''...'''); the string
    # delimiters and a number of kernel lines are missing from this chunk,
    # so the fragment below is not valid Python as it stands.
    #define x(s,i) _x[(s)*n+(i)]
    #define y(s,i) _y[(s)*n+(i)]
    #define a(i,j,k) _a[(i)+(j)*n+(k)*n*m]
    #define b(i,j,k) _b[(i)+(j)*n+(k)*n*m]
    #define zi(i,j,k) _zi[(i)+(j)*n+(k)*n*(m-1)]
    __global__ void filt(SCALAR *_b, SCALAR *_a, SCALAR *_x, SCALAR *_zi, SCALAR *_y, int numsamples)
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    for(int s=0; s<numsamples; s++)
    y(s,j) = b(j,0,k)*x(s,j) + zi(j,0,k);
    if unroll_filterorder:
        loopcode+=re.sub('\\bi\\b', str(i), '''
    zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j);
    for(int i=0;i<m-2;i++)
    zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j);
    zi(j,m-2,k) = b(j,m-1,k)*x(s,j) - a(j,m-1,k)*y(s,j);
    loopcode=re.sub('\\bk\\b', str(k), loopcode)
    # Specialise the kernel source before compiling: bake the scalar type
    # and the constant dimensions p, m, n directly into the CUDA code.
    code=code.replace('SCALAR', self.precision)
    code=re.sub("\\bp\\b", str(p), code) #replace the variable by their values
    code=re.sub("\\bm\\b", str(m), code)
    code=re.sub("\\bn\\b", str(n), code)
    self.gpu_mod=pycuda.compiler.SourceModule(code)
    self.gpu_filt_func=self.gpu_mod.get_function("filt")
    blocksize=512#self.maxblocksize
    # NOTE(review): Python 2 integer division assumed here; under Python 3
    # this would need n//blocksize+1 to keep gridsize an int.
    gridsize=n/blocksize+1
    self.block=(blocksize, 1, 1)
    self.grid=(gridsize, 1)
    # Pre-declare the kernel argument types (five device pointers plus the
    # sample count) so launches below can use the prepared-call fast path.
    self.gpu_filt_func.prepare((intp, intp, intp, intp, intp, int32), self.block)
    self._has_run_once=False
152
def buffer_init(self):
    """Reset buffering via the base class and zero the GPU filter state."""
    Filterbank.buffer_init(self)
    cleared = zeros(self.filt_state.shape, dtype=self.filt_state.dtype)
    self.filt_state.set(cleared)
156
def buffer_apply(self, input):
    # Filter one buffer of samples on the GPU and copy the result back to
    # a host array.
    #
    # NOTE(review): this chunk is incomplete — several original lines are
    # missing (else: branches, the local bindings of filt_y, b, a and zi,
    # and the final return). Gaps are flagged inline; confirm against the
    # full file.
    # TODO: buffer apply to a large input may cause a launch timeout, need to buffer in
    # smaller chunks if this is the case
    # (Re)allocate device/host buffers on first call or when the buffer
    # length changes; a size change also invalidates the prepared kernel
    # arguments, hence the _has_run_once reset.
    if not hasattr(self, 'filt_x_gpu') or input.shape[0]!=self.filt_x_gpu.shape[0]:
        self._has_run_once = False
        self.filt_x_gpu = gpuarray.to_gpu(input)
        self.filt_y_gpu = gpuarray.empty_like(self.filt_x_gpu)
        if self.pagelocked_mem:
            # Pagelocked host memory gives faster GPU->CPU copies.
            self.filt_y = drv.pagelocked_zeros(input.shape, dtype=self.precision_dtype)
        # NOTE(review): else: line missing from this chunk.
            self.filt_y = zeros(input.shape, dtype=self.precision_dtype)
    filt_x_gpu = self.filt_x_gpu
    filt_y_gpu = self.filt_y_gpu
    # NOTE(review): the lines binding filt_y, b, a and zi (and presumably
    # an upload of `input` via filt_x_gpu.set) are missing from this
    # chunk — those names are otherwise undefined below.
    if self._has_run_once:
        # Fast path: kernel arguments already prepared, just relaunch.
        self.gpu_filt_func.launch_grid(*self.grid)
    # NOTE(review): else: line missing from this chunk.
        self.gpu_filt_func.prepared_call(self.grid, intp(b.gpudata),
            intp(a.gpudata), intp(filt_x_gpu.gpudata),
            intp(zi.gpudata), intp(filt_y_gpu.gpudata), int32(input.shape[0]))
    self._has_run_once = True
    # Copy the filtered output back to the (possibly pagelocked) host buffer.
    filt_y_gpu.get(filt_y)
    # NOTE(review): a `return filt_y` presumably follows here but is not
    # visible in this chunk.
184
# For the moment, assume that input is a CPU array and has shape (numsamples, nchannels)
186
# if isinstance(x, GPUBufferedArray):
187
# if not len(x.shape) or x.shape==(1,):
189
# newx=empty(self.N, dtype=b.dtype)
193
# drv.memcpy_dtod(fx.gpudata, x.gpu_dev_alloc, fx.size*self.precision_dtype().nbytes)
195
# if not isinstance(x, ndarray) or not len(x.shape) or x.shape==(1,):
196
# # Current version of pycuda doesn't allow .fill(val) method on float64 gpuarrays
197
# # because it assumed float32 only, so we have to do our own copying here
202
# if self._has_run_once:
203
# self.gpu_filt_func.launch_grid(*self.grid)
205
# self.gpu_filt_func.prepared_call(self.grid, intp(b.gpudata), intp(a.gpudata), intp(fx.gpudata),
206
# intp(zi.gpudata), y.gpu_pointer)
207
# self._has_run_once=True
208
# y.changed_gpu_data()
210
# y.sync_to_cpu()#might need to turn this on although it slows everything down