~ubuntu-branches/ubuntu/precise/brian/precise

« back to all changes in this revision

Viewing changes to brian/hears/newversion/filtering/gpulinearfilterbank.py

  • Committer: Bazaar Package Importer
  • Author(s): Yaroslav Halchenko
  • Date: 2010-11-02 18:19:15 UTC
  • Revision ID: james.westby@ubuntu.com-20101102181915-ivwy29820copccu2
Tags: upstream-1.2.2~svn2229
Import upstream version 1.2.2~svn2229

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
'''
 
2
'''
 
3
# TODO: support for GPUBufferedArray?
 
4
import ensure_failure # import this in order to temporarily disable GPU
 
5
from numpy import *
 
6
import pycuda
 
7
#import pycuda.autoinit as autoinit
 
8
import pycuda.driver as drv
 
9
import pycuda.compiler
 
10
from pycuda import gpuarray
 
11
from brian.experimental.cuda.buffering import *
 
12
import re
 
13
from filterbank import Filterbank, RestructureFilterbank
 
14
from gputools import *
 
15
 
 
16
__all__ = ['LinearFilterbank']
 
17
 
 
18
class LinearFilterbank(Filterbank):
    '''
    Generalised parallel linear filterbank

    This filterbank allows you to construct a chain of linear filters in
    a bank so that each channel in the bank has its own filters. You pass
    the (b,a) parameters for the filter in the format specified in the
    function ``apply_linear_filterbank``.

    Note that there are additional GPU specific options:

    ``precision``
        Should be single for older GPUs.
    ``forcesync``
        Should be set to true to force a copy of GPU->CPU after each
        filter step, but not if the output is being used in ongoing
        GPU computations. By default this is ``True`` for compatibility.
    ``pagelocked_mem``
        Allocate faster pagelocked memory for GPU->CPU copies (doubles
        copy rate).
    ``unroll_filterorder``
        Whether or not to unroll the loop for the filter order, normally
        this should be done for small filter orders. By default, it is done
        if the filter order is less than or equal to 32.
    '''
    def __init__(self, source, b, a, samplerate=None,
                 precision='double', forcesync=True, pagelocked_mem=True, unroll_filterorder=None):
        # b, a are expected to have shape (nchannels, order, ncascades):
        # one (b, a) coefficient set per channel, applied as a cascade of
        # ncascades filters (see the unpacking ``n, m, p = b.shape`` below).
        # TODO: handle use_gpu = False here
#        if not use_gpu:
#            self.__class__=nongpu_LinearFilterbank
#            self.__init__(b, a, samplerate=samplerate)
#            return
        # Automatically duplicate mono input to fit the desired output shape
        if b.shape[0]!=source.nchannels:
            if source.nchannels!=1:
                raise ValueError('Can only automatically duplicate source channels for mono sources, use RestructureFilterbank.')
            source = RestructureFilterbank(source, b.shape[0])
        Filterbank.__init__(self, source)
        # Lazily initialise a GPU context if none exists yet.
        if pycuda.context is None:
            set_gpu_device(0)
        self.precision=precision
        if self.precision=='double':
            self.precision_dtype=float64
        else:
            self.precision_dtype=float32
        self.forcesync=forcesync
        self.pagelocked_mem=pagelocked_mem
        # n = number of channels, m = filter order (number of coefficients),
        # p = number of cascaded filters per channel.
        n, m, p=b.shape
        self.filt_b=b
        self.filt_a=a
        filt_b_gpu=array(b, dtype=self.precision_dtype)
        filt_a_gpu=array(a, dtype=self.precision_dtype)
        # Per-channel filter state, one (m-1)-long delay line per cascade stage.
        filt_state=zeros((n, m-1, p), dtype=self.precision_dtype)
        if pagelocked_mem:
            # Pagelocked (pinned) host memory roughly doubles GPU<->CPU copy rate.
            filt_y=drv.pagelocked_zeros((n,), dtype=self.precision_dtype)
            self.pre_x=drv.pagelocked_zeros((n,), dtype=self.precision_dtype)
        else:
            filt_y=zeros(n, dtype=self.precision_dtype)
            self.pre_x=zeros(n, dtype=self.precision_dtype)
        #filt_x=zeros(n, dtype=self.precision_dtype)
        self.filt_b_gpu=gpuarray.to_gpu(filt_b_gpu.T.flatten()) # transform to Fortran order for better GPU mem
        self.filt_a_gpu=gpuarray.to_gpu(filt_a_gpu.T.flatten()) # access speeds
        self.filt_state=gpuarray.to_gpu(filt_state.T.flatten())
        #self.filt_x=gpuarray.to_gpu(filt_x)
        #self.filt_y=GPUBufferedArray(filt_y)
        #self.filt_y=gpuarray.to_gpu(filt_y)
        # Unrolling the inner filter-order loop pays off for small orders;
        # default threshold of 32 matches the class docstring.
        if unroll_filterorder is None:
            if m<=32:
                unroll_filterorder = True
            else:
                unroll_filterorder = False
        # Generate CUDA source for the filter kernel. Each thread j processes
        # one channel, looping over all samples and over the p cascade stages.
        # The macros map the 2D/3D logical indices onto the flattened
        # Fortran-ordered arrays uploaded above.
        # TODO: improve code, check memory access patterns, maybe use local memory
        code='''
        #define x(s,i) _x[(s)*n+(i)]
        #define y(s,i) _y[(s)*n+(i)]
        #define a(i,j,k) _a[(i)+(j)*n+(k)*n*m]
        #define b(i,j,k) _b[(i)+(j)*n+(k)*n*m]
        #define zi(i,j,k) _zi[(i)+(j)*n+(k)*n*(m-1)]
        __global__ void filt(SCALAR *_b, SCALAR *_a, SCALAR *_x, SCALAR *_zi, SCALAR *_y, int numsamples)
        {
            int j = blockIdx.x * blockDim.x + threadIdx.x;
            if(j>=n) return;
            for(int s=0; s<numsamples; s++)
            {
        '''
        # Emit one copy of the filter update per cascade stage k; the
        # structure below is the direct form II transposed update
        # (output from b[0] and first state element, then shift the states).
        for k in range(p):
            loopcode='''
            y(s,j) = b(j,0,k)*x(s,j) + zi(j,0,k);
            '''
            if unroll_filterorder:
                # Unroll the state-update loop, substituting literal values of i.
                for i in range(m-2):
                    loopcode+=re.sub(r'\bi\b', str(i), '''
                    zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j);
                    ''')
            else:
                loopcode+='''
                for(int i=0;i<m-2;i++)
                    zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j);
                '''
            loopcode+='''
            zi(j,m-2,k) = b(j,m-1,k)*x(s,j) - a(j,m-1,k)*y(s,j);
            '''
            # Each stage's output becomes the next stage's input, except the last.
            if k<p-1:
                loopcode+='''
                x(s,j) = y(s,j);
                '''
            loopcode=re.sub(r'\bk\b', str(k), loopcode)
            code+=loopcode
        code+='''
            }
        }
        '''
        # Bake the precision and the problem-size constants directly into the
        # kernel source so they are compile-time constants on the GPU.
        code=code.replace('SCALAR', self.precision)
        code=re.sub(r"\bp\b", str(p), code) #replace the variable by their values
        code=re.sub(r"\bm\b", str(m), code)
        code=re.sub(r"\bn\b", str(n), code)
        #print code
        self.gpu_mod=pycuda.compiler.SourceModule(code)
        self.gpu_filt_func=self.gpu_mod.get_function("filt")
        blocksize=512#self.maxblocksize
        if n<blocksize:
            blocksize=n
        # Use explicit floor division: under Python 3 ``/`` would produce a
        # float, making self.grid an invalid grid specification for pycuda.
        # (Identical to the original behaviour under Python 2.)
        if n%blocksize==0:
            gridsize=n//blocksize
        else:
            gridsize=n//blocksize+1
        self.block=(blocksize, 1, 1)
        self.grid=(gridsize, 1)
        self.gpu_filt_func.prepare((intp, intp, intp, intp, intp, int32), self.block)
        self._has_run_once=False

    def reset(self):
        # Reinitialise buffers, which also zeroes the GPU filter state.
        self.buffer_init()

    def buffer_init(self):
        Filterbank.buffer_init(self)
        # Zero the per-channel delay-line state on the GPU.
        self.filt_state.set(zeros(self.filt_state.shape, dtype=self.filt_state.dtype))

    def buffer_apply(self, input):
        # Filter a CPU array of shape (numsamples, nchannels) through the GPU
        # kernel and return the filtered block as a CPU array of the same shape.
        # TODO: buffer apply to a large input may cause a launch timeout, need to buffer in
        # smaller chunks if this is the case
        b = self.filt_b_gpu
        a = self.filt_a_gpu
        zi = self.filt_state
        # (Re)allocate the GPU input/output buffers when first called or when
        # the number of samples per buffer changes; a size change also forces
        # re-preparation of the kernel call arguments below.
        if not hasattr(self, 'filt_x_gpu') or input.shape[0]!=self.filt_x_gpu.shape[0]:
            self._has_run_once = False
            self.filt_x_gpu = gpuarray.to_gpu(input)
            self.filt_y_gpu = gpuarray.empty_like(self.filt_x_gpu)
            if self.pagelocked_mem:
                self.filt_y = drv.pagelocked_zeros(input.shape, dtype=self.precision_dtype)
            else:
                self.filt_y = zeros(input.shape, dtype=self.precision_dtype) 
        filt_x_gpu = self.filt_x_gpu
        filt_y_gpu = self.filt_y_gpu
        filt_y = self.filt_y
        # After the first call the argument pointers are already bound to the
        # prepared kernel, so a bare launch_grid re-runs it cheaply.
        if self._has_run_once:
            self.gpu_filt_func.launch_grid(*self.grid)
        else:
            self.gpu_filt_func.prepared_call(self.grid, intp(b.gpudata),
                    intp(a.gpudata), intp(filt_x_gpu.gpudata),
                    intp(zi.gpudata), intp(filt_y_gpu.gpudata), int32(input.shape[0]))
            self._has_run_once = True
        filt_y_gpu.get(filt_y)
        return filt_y
        #y=self.filt_y
        #fx=self.filt_x
        # For the moment, assume that input is a CPU array and has shape (numsamples, nchannels)

#        if isinstance(x, GPUBufferedArray):
#            if not len(x.shape) or x.shape==(1,):
#                x.sync_to_cpu()
#                newx=empty(self.N, dtype=b.dtype)
#                newx[:]=x
#                fx.set(newx)
#            else:
#                drv.memcpy_dtod(fx.gpudata, x.gpu_dev_alloc, fx.size*self.precision_dtype().nbytes)
#        else:
#            if not isinstance(x, ndarray) or not len(x.shape) or x.shape==(1,):
#                # Current version of pycuda doesn't allow .fill(val) method on float64 gpuarrays
#                # because it assumed float32 only, so we have to do our own copying here
#                px=self.pre_x
#                px[:]=x
#                x=px
#            fx.set(x)
#        if self._has_run_once:
#            self.gpu_filt_func.launch_grid(*self.grid)
#        else:
#            self.gpu_filt_func.prepared_call(self.grid, intp(b.gpudata), intp(a.gpudata), intp(fx.gpudata),
#                                             intp(zi.gpudata), y.gpu_pointer)
#            self._has_run_once=True
#        y.changed_gpu_data()
#        if self.forcesync:
#            y.sync_to_cpu()#might need to turn this on although it slows everything down
#        return y