~ubuntu-branches/ubuntu/vivid/freerdp/vivid

« back to all changes in this revision

Viewing changes to libfreerdp/codec/rfx_sse2.c

  • Committer: Package Import Robot
  • Author(s): Iain Lane
  • Date: 2014-11-11 12:20:50 UTC
  • mfrom: (1.1.9) (9.1.17 sid)
  • Revision ID: package-import@ubuntu.com-20141111122050-wyr8hrnwco9fcmum
Tags: 1.1.0~git20140921.1.440916e+dfsg1-2ubuntu1
* Merge with Debian unstable, remaining changes
  - Disable ffmpeg support
* Disable gstreamer support, this relies on gstreamer 0.10 and we don't want
  to add any more deps on that.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/**
 
2
 * FreeRDP: A Remote Desktop Protocol Implementation
 
3
 * RemoteFX Codec Library - SSE2 Optimizations
 
4
 *
 
5
 * Copyright 2011 Stephen Erisman
 
6
 * Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
 
7
 *
 
8
 * Licensed under the Apache License, Version 2.0 (the "License");
 
9
 * you may not use this file except in compliance with the License.
 
10
 * You may obtain a copy of the License at
 
11
 *
 
12
 *     http://www.apache.org/licenses/LICENSE-2.0
 
13
 *
 
14
 * Unless required by applicable law or agreed to in writing, software
 
15
 * distributed under the License is distributed on an "AS IS" BASIS,
 
16
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 
17
 * See the License for the specific language governing permissions and
 
18
 * limitations under the License.
 
19
 */
 
20
 
 
21
#ifdef HAVE_CONFIG_H
 
22
#include "config.h"
 
23
#endif
 
24
 
 
25
#include <stdio.h>
 
26
#include <stdlib.h>
 
27
#include <string.h>
 
28
#include <winpr/sysinfo.h>
 
29
 
 
30
#include <xmmintrin.h>
 
31
#include <emmintrin.h>
 
32
 
 
33
#include "rfx_types.h"
 
34
#include "rfx_sse2.h"
 
35
 
 
36
#ifdef _MSC_VER
 
37
#define __attribute__(...)
 
38
#endif
 
39
 
 
40
#define CACHE_LINE_BYTES        64
 
41
 
 
42
#ifndef __clang__
 
43
#define ATTRIBUTES  __gnu_inline__, __always_inline__, __artificial__
 
44
#else
 
45
#define ATTRIBUTES __gnu_inline__, __always_inline__
 
46
#endif
 
47
 
 
48
/*
 * Clamp every signed 16-bit lane of _val to the range [_min, _max] and
 * store the result back into _val.
 * Note: _val is evaluated twice (once as the source, once as the assignment
 * target), so it must be a side-effect free lvalue.
 */
#define _mm_between_epi16(_val, _min, _max) \
	do { (_val) = _mm_min_epi16((_max), _mm_max_epi16((_val), (_min))); } while (0)
 
50
 
 
51
static __inline void __attribute__((ATTRIBUTES))
 
52
_mm_prefetch_buffer(char * buffer, int num_bytes)
 
53
{
 
54
        __m128i * buf = (__m128i*) buffer;
 
55
        unsigned int i;
 
56
        for (i = 0; i < (num_bytes / sizeof(__m128i)); i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
 
57
        {
 
58
                _mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
 
59
        }
 
60
}
 
61
 
 
62
/* rfx_decode_ycbcr_to_rgb_sse2 code now resides in the primitives library. */
 
63
/* rfx_encode_rgb_to_ycbcr_sse2 code now resides in the primitives library. */
 
64
 
 
65
static __inline void __attribute__((ATTRIBUTES))
 
66
rfx_quantization_decode_block_sse2(INT16* buffer, const int buffer_size, const UINT32 factor)
 
67
{
 
68
        __m128i a;
 
69
        __m128i * ptr = (__m128i*) buffer;
 
70
        __m128i * buf_end = (__m128i*) (buffer + buffer_size);
 
71
 
 
72
        if (factor == 0)
 
73
                return;
 
74
 
 
75
        do
 
76
        {
 
77
                a = _mm_load_si128(ptr);
 
78
                a = _mm_slli_epi16(a, factor);
 
79
                _mm_store_si128(ptr, a);
 
80
 
 
81
                ptr++;
 
82
        } while(ptr < buf_end);
 
83
}
 
84
 
 
85
/*
 * De-quantize one 4096-coefficient tile in place.
 *
 * The whole tile is first shifted left by 5, then every sub-band is shifted
 * left by (its entry of quantization_values) - 6.
 */
static void rfx_quantization_decode_sse2(INT16* buffer, const UINT32* quantization_values)
{
	/* Sub-band position inside the tile and the quantization_values entry it uses. */
	static const struct
	{
		int offset;
		int count;
		int quant_index;
	} subbands[] =
	{
		{    0, 1024, 8 }, /* HL1 */
		{ 1024, 1024, 7 }, /* LH1 */
		{ 2048, 1024, 9 }, /* HH1 */
		{ 3072,  256, 5 }, /* HL2 */
		{ 3328,  256, 4 }, /* LH2 */
		{ 3584,  256, 6 }, /* HH2 */
		{ 3840,   64, 2 }, /* HL3 */
		{ 3904,   64, 1 }, /* LH3 */
		{ 3968,   64, 3 }, /* HH3 */
		{ 4032,   64, 0 }  /* LL3 */
	};
	unsigned int i;

	_mm_prefetch_buffer((char*) buffer, 4096 * sizeof(INT16));

	rfx_quantization_decode_block_sse2(buffer, 4096, 5);

	for (i = 0; i < sizeof(subbands) / sizeof(subbands[0]); i++)
	{
		rfx_quantization_decode_block_sse2(buffer + subbands[i].offset, subbands[i].count,
			quantization_values[subbands[i].quant_index] - 6);
	}
}
 
102
 
 
103
static __inline void __attribute__((ATTRIBUTES))
 
104
rfx_quantization_encode_block_sse2(INT16* buffer, const int buffer_size, const UINT32 factor)
 
105
{
 
106
        __m128i a;
 
107
        __m128i* ptr = (__m128i*) buffer;
 
108
        __m128i* buf_end = (__m128i*) (buffer + buffer_size);
 
109
        __m128i half;
 
110
 
 
111
        if (factor == 0)
 
112
                return;
 
113
 
 
114
        half = _mm_set1_epi16(1 << (factor - 1));
 
115
        do
 
116
        {
 
117
                a = _mm_load_si128(ptr);
 
118
                a = _mm_add_epi16(a, half);
 
119
                a = _mm_srai_epi16(a, factor);
 
120
                _mm_store_si128(ptr, a);
 
121
 
 
122
                ptr++;
 
123
        } while(ptr < buf_end);
 
124
}
 
125
 
 
126
/*
 * Quantize one 4096-coefficient tile in place.
 *
 * Every sub-band is first reduced by (its entry of quantization_values) - 6
 * bits, then the whole tile is reduced by a further 5 bits.
 */
static void rfx_quantization_encode_sse2(INT16* buffer, const UINT32* quantization_values)
{
	/* Sub-band position inside the tile and the quantization_values entry it uses. */
	static const struct
	{
		int offset;
		int count;
		int quant_index;
	} subbands[] =
	{
		{    0, 1024, 8 }, /* HL1 */
		{ 1024, 1024, 7 }, /* LH1 */
		{ 2048, 1024, 9 }, /* HH1 */
		{ 3072,  256, 5 }, /* HL2 */
		{ 3328,  256, 4 }, /* LH2 */
		{ 3584,  256, 6 }, /* HH2 */
		{ 3840,   64, 2 }, /* HL3 */
		{ 3904,   64, 1 }, /* LH3 */
		{ 3968,   64, 3 }, /* HH3 */
		{ 4032,   64, 0 }  /* LL3 */
	};
	unsigned int i;

	_mm_prefetch_buffer((char*) buffer, 4096 * sizeof(INT16));

	for (i = 0; i < sizeof(subbands) / sizeof(subbands[0]); i++)
	{
		rfx_quantization_encode_block_sse2(buffer + subbands[i].offset, subbands[i].count,
			quantization_values[subbands[i].quant_index] - 6);
	}

	rfx_quantization_encode_block_sse2(buffer, 4096, 5);
}
 
143
 
 
144
static __inline void __attribute__((ATTRIBUTES))
 
145
rfx_dwt_2d_decode_block_horiz_sse2(INT16* l, INT16* h, INT16* dst, int subband_width)
 
146
{
 
147
        int y, n;
 
148
        INT16* l_ptr = l;
 
149
        INT16* h_ptr = h;
 
150
        INT16* dst_ptr = dst;
 
151
        int first;
 
152
        int last;
 
153
        __m128i l_n;
 
154
        __m128i h_n;
 
155
        __m128i h_n_m;
 
156
        __m128i tmp_n;
 
157
        __m128i dst_n;
 
158
        __m128i dst_n_p;
 
159
        __m128i dst1;
 
160
        __m128i dst2;
 
161
 
 
162
        for (y = 0; y < subband_width; y++)
 
163
        {
 
164
                /* Even coefficients */
 
165
                for (n = 0; n < subband_width; n += 8)
 
166
                {
 
167
                        /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
 
168
                        
 
169
                        l_n = _mm_load_si128((__m128i*) l_ptr);
 
170
 
 
171
                        h_n = _mm_load_si128((__m128i*) h_ptr);
 
172
                        h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1));
 
173
 
 
174
                        if (n == 0)
 
175
                        {
 
176
                                first = _mm_extract_epi16(h_n_m, 1);
 
177
                                h_n_m = _mm_insert_epi16(h_n_m, first, 0);
 
178
                        }
 
179
                        
 
180
                        tmp_n = _mm_add_epi16(h_n, h_n_m);
 
181
                        tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
 
182
                        tmp_n = _mm_srai_epi16(tmp_n, 1);
 
183
                        
 
184
                        dst_n = _mm_sub_epi16(l_n, tmp_n);
 
185
                        
 
186
                        _mm_store_si128((__m128i*) l_ptr, dst_n);
 
187
                        
 
188
                        l_ptr += 8;
 
189
                        h_ptr += 8;
 
190
                }
 
191
 
 
192
                l_ptr -= subband_width;
 
193
                h_ptr -= subband_width;
 
194
                
 
195
                /* Odd coefficients */
 
196
                for (n = 0; n < subband_width; n += 8)
 
197
                {
 
198
                        /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
 
199
                        
 
200
                        h_n = _mm_load_si128((__m128i*) h_ptr);
 
201
                        
 
202
                        h_n = _mm_slli_epi16(h_n, 1);
 
203
                        
 
204
                        dst_n = _mm_load_si128((__m128i*) (l_ptr));
 
205
                        dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1));
 
206
 
 
207
                        if (n == subband_width - 8)
 
208
                        {
 
209
                                last = _mm_extract_epi16(dst_n_p, 6);
 
210
                                dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
 
211
                        }
 
212
                        
 
213
                        tmp_n = _mm_add_epi16(dst_n_p, dst_n);
 
214
                        tmp_n = _mm_srai_epi16(tmp_n, 1);
 
215
                        
 
216
                        tmp_n = _mm_add_epi16(tmp_n, h_n);
 
217
                        
 
218
                        dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
 
219
                        dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
 
220
                        
 
221
                        _mm_store_si128((__m128i*) dst_ptr, dst1);
 
222
                        _mm_store_si128((__m128i*) (dst_ptr + 8), dst2);
 
223
                        
 
224
                        l_ptr += 8;
 
225
                        h_ptr += 8;
 
226
                        dst_ptr += 16;
 
227
                }
 
228
        }
 
229
}
 
230
 
 
231
static __inline void __attribute__((ATTRIBUTES))
 
232
rfx_dwt_2d_decode_block_vert_sse2(INT16* l, INT16* h, INT16* dst, int subband_width)
 
233
{
 
234
        int x, n;
 
235
        INT16* l_ptr = l;
 
236
        INT16* h_ptr = h;
 
237
        INT16* dst_ptr = dst;
 
238
        __m128i l_n;
 
239
        __m128i h_n;
 
240
        __m128i tmp_n;
 
241
        __m128i h_n_m;
 
242
        __m128i dst_n;
 
243
        __m128i dst_n_m;
 
244
        __m128i dst_n_p;
 
245
        
 
246
        int total_width = subband_width + subband_width;
 
247
 
 
248
        /* Even coefficients */
 
249
        for (n = 0; n < subband_width; n++)
 
250
        {
 
251
                for (x = 0; x < total_width; x+=8)
 
252
                {
 
253
                        /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
 
254
                        
 
255
                        l_n = _mm_load_si128((__m128i*) l_ptr);
 
256
                        h_n = _mm_load_si128((__m128i*) h_ptr);
 
257
                        
 
258
                        tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));;
 
259
                        if (n == 0)
 
260
                                tmp_n = _mm_add_epi16(tmp_n, h_n);
 
261
                        else
 
262
                        {
 
263
                                h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width));
 
264
                                tmp_n = _mm_add_epi16(tmp_n, h_n_m);
 
265
                        }
 
266
                        tmp_n = _mm_srai_epi16(tmp_n, 1);
 
267
                        
 
268
                        dst_n = _mm_sub_epi16(l_n, tmp_n);
 
269
                        _mm_store_si128((__m128i*) dst_ptr, dst_n);
 
270
                        
 
271
                        l_ptr+=8;
 
272
                        h_ptr+=8;
 
273
                        dst_ptr+=8;
 
274
                }
 
275
                dst_ptr+=total_width;
 
276
        }
 
277
        
 
278
        h_ptr = h;
 
279
        dst_ptr = dst + total_width;
 
280
        
 
281
        /* Odd coefficients */
 
282
        for (n = 0; n < subband_width; n++)
 
283
        {
 
284
                for (x = 0; x < total_width; x+=8)
 
285
                {
 
286
                        /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
 
287
                        
 
288
                        h_n = _mm_load_si128((__m128i*) h_ptr);
 
289
                        dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width));
 
290
                        h_n = _mm_slli_epi16(h_n, 1);
 
291
                        
 
292
                        tmp_n = dst_n_m;
 
293
                        if (n == subband_width - 1)
 
294
                                tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
 
295
                        else
 
296
                        {
 
297
                                dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width));
 
298
                                tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
 
299
                        }
 
300
                        tmp_n = _mm_srai_epi16(tmp_n, 1);
 
301
                        
 
302
                        dst_n = _mm_add_epi16(tmp_n, h_n);
 
303
                        _mm_store_si128((__m128i*) dst_ptr, dst_n);
 
304
 
 
305
                        h_ptr+=8;
 
306
                        dst_ptr+=8;
 
307
                }
 
308
                dst_ptr+=total_width;
 
309
        }
 
310
}
 
311
 
 
312
static __inline void __attribute__((ATTRIBUTES))
 
313
rfx_dwt_2d_decode_block_sse2(INT16* buffer, INT16* idwt, int subband_width)
 
314
{
 
315
        INT16 *hl, *lh, *hh, *ll;
 
316
        INT16 *l_dst, *h_dst;
 
317
 
 
318
        _mm_prefetch_buffer((char*) idwt, subband_width * 4 * sizeof(INT16));
 
319
 
 
320
        /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */
 
321
        /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
 
322
        /* The lower part L uses LL(3) and HL(0). */
 
323
        /* The higher part H uses LH(1) and HH(2). */
 
324
 
 
325
        ll = buffer + subband_width * subband_width * 3;
 
326
        hl = buffer;
 
327
        l_dst = idwt;
 
328
 
 
329
        rfx_dwt_2d_decode_block_horiz_sse2(ll, hl, l_dst, subband_width);
 
330
 
 
331
        lh = buffer + subband_width * subband_width;
 
332
        hh = buffer + subband_width * subband_width * 2;
 
333
        h_dst = idwt + subband_width * subband_width * 2;
 
334
        
 
335
        rfx_dwt_2d_decode_block_horiz_sse2(lh, hh, h_dst, subband_width);
 
336
 
 
337
        /* Inverse DWT in vertical direction, results are stored in original buffer. */
 
338
        rfx_dwt_2d_decode_block_vert_sse2(l_dst, h_dst, buffer, subband_width);
 
339
}
 
340
 
 
341
/*
 * Inverse 2D DWT of one 4096-coefficient tile, in place.
 * The three levels are reconstructed from the smallest sub-bands (width 8)
 * up to the largest (width 32); dwt_buffer is used as scratch space.
 */
static void rfx_dwt_2d_decode_sse2(INT16* buffer, INT16* dwt_buffer)
{
	INT16* const level3 = buffer + 3840;
	INT16* const level2 = buffer + 3072;
	INT16* const level1 = buffer;

	_mm_prefetch_buffer((char*) buffer, 4096 * sizeof(INT16));

	rfx_dwt_2d_decode_block_sse2(level3, dwt_buffer, 8);
	rfx_dwt_2d_decode_block_sse2(level2, dwt_buffer, 16);
	rfx_dwt_2d_decode_block_sse2(level1, dwt_buffer, 32);
}
 
349
 
 
350
static __inline void __attribute__((ATTRIBUTES))
 
351
rfx_dwt_2d_encode_block_vert_sse2(INT16* src, INT16* l, INT16* h, int subband_width)
 
352
{
 
353
        int total_width;
 
354
        int x;
 
355
        int n;
 
356
        __m128i src_2n;
 
357
        __m128i src_2n_1;
 
358
        __m128i src_2n_2;
 
359
        __m128i h_n;
 
360
        __m128i h_n_m;
 
361
        __m128i l_n;
 
362
 
 
363
        total_width = subband_width << 1;
 
364
 
 
365
        for (n = 0; n < subband_width; n++)
 
366
        {
 
367
                for (x = 0; x < total_width; x += 8)
 
368
                {
 
369
                        src_2n = _mm_load_si128((__m128i*) src);
 
370
                        src_2n_1 = _mm_load_si128((__m128i*) (src + total_width));
 
371
                        if (n < subband_width - 1)
 
372
                                src_2n_2 = _mm_load_si128((__m128i*) (src + 2 * total_width));
 
373
                        else
 
374
                                src_2n_2 = src_2n;
 
375
 
 
376
                        /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
 
377
 
 
378
                        h_n = _mm_add_epi16(src_2n, src_2n_2);
 
379
                        h_n = _mm_srai_epi16(h_n, 1);
 
380
                        h_n = _mm_sub_epi16(src_2n_1, h_n);
 
381
                        h_n = _mm_srai_epi16(h_n, 1);
 
382
 
 
383
                        _mm_store_si128((__m128i*) h, h_n);
 
384
 
 
385
                        if (n == 0)
 
386
                                h_n_m = h_n;
 
387
                        else
 
388
                                h_n_m = _mm_load_si128((__m128i*) (h - total_width));
 
389
 
 
390
                        /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
 
391
 
 
392
                        l_n = _mm_add_epi16(h_n_m, h_n);
 
393
                        l_n = _mm_srai_epi16(l_n, 1);
 
394
                        l_n = _mm_add_epi16(l_n, src_2n);
 
395
 
 
396
                        _mm_store_si128((__m128i*) l, l_n);
 
397
 
 
398
                        src += 8;
 
399
                        l += 8;
 
400
                        h += 8;
 
401
                }
 
402
                src += total_width;
 
403
        }
 
404
}
 
405
 
 
406
static __inline void __attribute__((ATTRIBUTES))
 
407
rfx_dwt_2d_encode_block_horiz_sse2(INT16* src, INT16* l, INT16* h, int subband_width)
 
408
{
 
409
        int y;
 
410
        int n;
 
411
        int first;
 
412
        __m128i src_2n;
 
413
        __m128i src_2n_1;
 
414
        __m128i src_2n_2;
 
415
        __m128i h_n;
 
416
        __m128i h_n_m;
 
417
        __m128i l_n;
 
418
 
 
419
        for (y = 0; y < subband_width; y++)
 
420
        {
 
421
                for (n = 0; n < subband_width; n += 8)
 
422
                {
 
423
                        /* The following 3 Set operations consumes more than half of the total DWT processing time! */
 
424
                        src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
 
425
                        src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
 
426
                        src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[14] : src[16],
 
427
                                src[14], src[12], src[10], src[8], src[6], src[4], src[2]);
 
428
 
 
429
                        /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
 
430
 
 
431
                        h_n = _mm_add_epi16(src_2n, src_2n_2);
 
432
                        h_n = _mm_srai_epi16(h_n, 1);
 
433
                        h_n = _mm_sub_epi16(src_2n_1, h_n);
 
434
                        h_n = _mm_srai_epi16(h_n, 1);
 
435
 
 
436
                        _mm_store_si128((__m128i*) h, h_n);
 
437
 
 
438
                        h_n_m = _mm_loadu_si128((__m128i*) (h - 1));
 
439
                        if (n == 0)
 
440
                        {
 
441
                                first = _mm_extract_epi16(h_n_m, 1);
 
442
                                h_n_m = _mm_insert_epi16(h_n_m, first, 0);
 
443
                        }
 
444
 
 
445
                        /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
 
446
 
 
447
                        l_n = _mm_add_epi16(h_n_m, h_n);
 
448
                        l_n = _mm_srai_epi16(l_n, 1);
 
449
                        l_n = _mm_add_epi16(l_n, src_2n);
 
450
 
 
451
                        _mm_store_si128((__m128i*) l, l_n);
 
452
 
 
453
                        src += 16;
 
454
                        l += 8;
 
455
                        h += 8;
 
456
                }
 
457
        }
 
458
}
 
459
 
 
460
static __inline void __attribute__((ATTRIBUTES))
 
461
rfx_dwt_2d_encode_block_sse2(INT16* buffer, INT16* dwt, int subband_width)
 
462
{
 
463
        INT16 *hl, *lh, *hh, *ll;
 
464
        INT16 *l_src, *h_src;
 
465
 
 
466
        _mm_prefetch_buffer((char*) dwt, subband_width * 4 * sizeof(INT16));
 
467
 
 
468
        /* DWT in vertical direction, results in 2 sub-bands in L, H order in tmp buffer dwt. */
 
469
 
 
470
        l_src = dwt;
 
471
        h_src = dwt + subband_width * subband_width * 2;
 
472
 
 
473
        rfx_dwt_2d_encode_block_vert_sse2(buffer, l_src, h_src, subband_width);
 
474
 
 
475
        /* DWT in horizontal direction, results in 4 sub-bands in HL(0), LH(1), HH(2), LL(3) order, stored in original buffer. */
 
476
        /* The lower part L generates LL(3) and HL(0). */
 
477
        /* The higher part H generates LH(1) and HH(2). */
 
478
 
 
479
        ll = buffer + subband_width * subband_width * 3;
 
480
        hl = buffer;
 
481
 
 
482
        lh = buffer + subband_width * subband_width;
 
483
        hh = buffer + subband_width * subband_width * 2;
 
484
 
 
485
        rfx_dwt_2d_encode_block_horiz_sse2(l_src, ll, hl, subband_width);
 
486
        rfx_dwt_2d_encode_block_horiz_sse2(h_src, lh, hh, subband_width);
 
487
}
 
488
 
 
489
/*
 * Forward 2D DWT of one 4096-coefficient tile, in place.
 * The three levels are produced from the largest sub-bands (width 32) down
 * to the smallest (width 8); dwt_buffer is used as scratch space.
 */
static void rfx_dwt_2d_encode_sse2(INT16* buffer, INT16* dwt_buffer)
{
	INT16* const level1 = buffer;
	INT16* const level2 = buffer + 3072;
	INT16* const level3 = buffer + 3840;

	_mm_prefetch_buffer((char*) buffer, 4096 * sizeof(INT16));

	rfx_dwt_2d_encode_block_sse2(level1, dwt_buffer, 32);
	rfx_dwt_2d_encode_block_sse2(level2, dwt_buffer, 16);
	rfx_dwt_2d_encode_block_sse2(level3, dwt_buffer, 8);
}
 
497
 
 
498
/*
 * Install the SSE2 implementations of the quantization and DWT routines into
 * the given context. Nothing is changed when the processor feature check
 * fails.
 */
void rfx_init_sse2(RFX_CONTEXT* context)
{
	if (IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE))
	{
		DEBUG_RFX("Using SSE2 optimizations");

		/* Profiler labels for the replaced routines. */
		IF_PROFILER(context->priv->prof_rfx_quantization_decode->name = "rfx_quantization_decode_sse2");
		IF_PROFILER(context->priv->prof_rfx_quantization_encode->name = "rfx_quantization_encode_sse2");
		IF_PROFILER(context->priv->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_sse2");
		IF_PROFILER(context->priv->prof_rfx_dwt_2d_encode->name = "rfx_dwt_2d_encode_sse2");

		/* Function hooks. */
		context->quantization_decode = rfx_quantization_decode_sse2;
		context->quantization_encode = rfx_quantization_encode_sse2;
		context->dwt_2d_decode = rfx_dwt_2d_decode_sse2;
		context->dwt_2d_encode = rfx_dwt_2d_encode_sse2;
	}
}