2
* FreeRDP: A Remote Desktop Protocol client.
3
* RemoteFX Codec Library - SSE2 Optimizations
5
* Copyright 2011 Stephen Erisman
6
* Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
8
* Licensed under the Apache License, Version 2.0 (the "License");
9
* you may not use this file except in compliance with the License.
10
* You may obtain a copy of the License at
12
* http://www.apache.org/licenses/LICENSE-2.0
14
* Unless required by applicable law or agreed to in writing, software
15
* distributed under the License is distributed on an "AS IS" BASIS,
16
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
* See the License for the specific language governing permissions and
18
* limitations under the License.
24
#include <xmmintrin.h>
25
#include <emmintrin.h>
27
#include "rfx_types.h"
31
#define __attribute__(...)
34
#define CACHE_LINE_BYTES 64
36
/* Clamp each signed 16-bit lane of _val into [_min, _max] (inclusive).
 * Wrapped in do/while(0) so the macro behaves as a single statement;
 * every argument is parenthesized to avoid precedence surprises. */
#define _mm_between_epi16(_val, _min, _max) \
	do { (_val) = _mm_min_epi16((_max), _mm_max_epi16((_val), (_min))); } while (0)
39
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
40
_mm_prefetch_buffer(char * buffer, int num_bytes)
42
__m128i * buf = (__m128i*) buffer;
44
for (i = 0; i < (num_bytes / sizeof(__m128i)); i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
46
_mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
50
/* Convert one 64x64 tile (4096 coefficients per plane) from YCbCr to RGB,
 * in place: y_r_buffer holds Y on entry and R on return, cb_g_buffer
 * Cb -> G, cr_b_buffer Cr -> B. Buffers must be 16-byte aligned. */
static void rfx_decode_ycbcr_to_rgb_sse2(sint16* y_r_buffer, sint16* cb_g_buffer, sint16* cr_b_buffer)
{
	__m128i zero = _mm_setzero_si128();
	__m128i max = _mm_set1_epi16(255);

	__m128i* y_r_buf = (__m128i*) y_r_buffer;
	__m128i* cb_g_buf = (__m128i*) cb_g_buffer;
	__m128i* cr_b_buf = (__m128i*) cr_b_buffer;

	__m128i y;
	__m128i cb;
	__m128i cr;
	__m128i r;
	__m128i g;
	__m128i b;

	/* Fixed-point color-matrix factors, scaled by 2^14 (see comment below). */
	__m128i r_cr = _mm_set1_epi16(22986);	/*  1.403 << 14 */
	__m128i g_cb = _mm_set1_epi16(-5636);	/* -0.344 << 14 */
	__m128i g_cr = _mm_set1_epi16(-11698);	/* -0.714 << 14 */
	__m128i b_cb = _mm_set1_epi16(28999);	/*  1.770 << 14 */
	__m128i c4096 = _mm_set1_epi16(4096);

	int i;

	/* Prefetch all three planes, one slot per cache line. */
	for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
	{
		_mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
		_mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
		_mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
	}

	for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
	{
		/*
		 * In order to use SSE2 signed 16-bit integer multiplication we need
		 * to convert the floating point factors to signed int without losing
		 * information. The result of this multiplication is 32 bit and we
		 * have two SSE instructions that return either the hi or lo word.
		 * Thus we will multiply the factors by the highest possible 2^n,
		 * take the upper 16 bits of the signed 32-bit result
		 * (_mm_mulhi_epi16) and correct this result by multiplying it by
		 * 2^(16-n). For the given factors in the conversion matrix the best
		 * possible n is 14.
		 *
		 * Example for calculating r:
		 * r = (y>>5) + 128 + (cr*1.403)>>5                    // base formula
		 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5   // see above
		 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5          // simplification
		 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
		 */

		/* y = (y_r_buf[i] + 4096) >> 2 */
		y = _mm_load_si128(&y_r_buf[i]);
		y = _mm_add_epi16(y, c4096);
		y = _mm_srai_epi16(y, 2);
		/* cb = cb_g_buf[i]; */
		cb = _mm_load_si128(&cb_g_buf[i]);
		/* cr = cr_b_buf[i]; */
		cr = _mm_load_si128(&cr_b_buf[i]);

		/* r = (y + HIWORD(cr*22986)) >> 3 */
		r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
		r = _mm_srai_epi16(r, 3);
		/* y_r_buf[i] = MINMAX(r, 0, 255); */
		_mm_between_epi16(r, zero, max);
		_mm_store_si128(&y_r_buf[i], r);

		/* g = (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
		g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
		g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
		g = _mm_srai_epi16(g, 3);
		/* cb_g_buf[i] = MINMAX(g, 0, 255); */
		_mm_between_epi16(g, zero, max);
		_mm_store_si128(&cb_g_buf[i], g);

		/* b = (y + HIWORD(cb*28999)) >> 3 */
		b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
		b = _mm_srai_epi16(b, 3);
		/* cr_b_buf[i] = MINMAX(b, 0, 255); */
		_mm_between_epi16(b, zero, max);
		_mm_store_si128(&cr_b_buf[i], b);
	}
}
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point numbers. See rfx_encode.c */
133
/* Convert one 64x64 tile (4096 coefficients per plane) from RGB to YCbCr,
 * in place: y_r_buffer holds R on entry and Y on return, cb_g_buffer
 * G -> Cb, cr_b_buffer B -> Cr. Results are 11.5 fixed-point values
 * clamped to [-128<<5, 127<<5]. Buffers must be 16-byte aligned. */
static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer, sint16* cr_b_buffer)
{
	__m128i min = _mm_set1_epi16(-128 << 5);
	__m128i max = _mm_set1_epi16(127 << 5);

	__m128i* y_r_buf = (__m128i*) y_r_buffer;
	__m128i* cb_g_buf = (__m128i*) cb_g_buffer;
	__m128i* cr_b_buf = (__m128i*) cr_b_buffer;

	__m128i y;
	__m128i cb;
	__m128i cr;
	__m128i r;
	__m128i g;
	__m128i b;

	/* Fixed-point color-matrix factors, scaled by 2^15 (see comment below). */
	__m128i y_r  = _mm_set1_epi16(9798);	/*  0.299000 << 15 */
	__m128i y_g  = _mm_set1_epi16(19235);	/*  0.587000 << 15 */
	__m128i y_b  = _mm_set1_epi16(3735);	/*  0.114000 << 15 */
	__m128i cb_r = _mm_set1_epi16(-5535);	/* -0.168935 << 15 */
	__m128i cb_g = _mm_set1_epi16(-10868);	/* -0.331665 << 15 */
	__m128i cb_b = _mm_set1_epi16(16403);	/*  0.500590 << 15 */
	__m128i cr_r = _mm_set1_epi16(16377);	/*  0.499813 << 15 */
	__m128i cr_g = _mm_set1_epi16(-13714);	/* -0.418531 << 15 */
	__m128i cr_b = _mm_set1_epi16(-2663);	/* -0.081282 << 15 */

	int i;

	/* Prefetch all three planes, one slot per cache line. */
	for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
	{
		_mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
		_mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
		_mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
	}

	for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
	{
		/*
		 * In order to use SSE2 signed 16-bit integer multiplication we need
		 * to convert the floating point factors to signed int without losing
		 * information. The result of this multiplication is 32 bit and using
		 * SSE2 we get either the product's hi or lo word. Thus we will
		 * multiply the factors by the highest possible 2^n and take the
		 * upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16).
		 * Since the final result needs to be scaled by << 5 and also in
		 * order to keep the precision within the upper 16 bits we will also
		 * have to scale the RGB values used in the multiplication by
		 * << 5+(16-n).
		 */

		/* r = y_r_buf[i]; */
		r = _mm_load_si128(&y_r_buf[i]);
		/* g = cb_g_buf[i]; */
		g = _mm_load_si128(&cb_g_buf[i]);
		/* b = cr_b_buf[i]; */
		b = _mm_load_si128(&cr_b_buf[i]);

		/* r<<6; g<<6; b<<6 */
		r = _mm_slli_epi16(r, 6);
		g = _mm_slli_epi16(g, 6);
		b = _mm_slli_epi16(b, 6);

		/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
		y = _mm_mulhi_epi16(r, y_r);
		y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
		y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
		y = _mm_add_epi16(y, min);
		/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
		_mm_between_epi16(y, min, max);
		_mm_store_si128(&y_r_buf[i], y);

		/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
		cb = _mm_mulhi_epi16(r, cb_r);
		cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
		cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
		/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
		_mm_between_epi16(cb, min, max);
		_mm_store_si128(&cb_g_buf[i], cb);

		/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
		cr = _mm_mulhi_epi16(r, cr_r);
		cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
		cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
		/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
		_mm_between_epi16(cr, min, max);
		_mm_store_si128(&cr_b_buf[i], cr);
	}
}
222
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
223
rfx_quantization_decode_block_sse2(sint16* buffer, const int buffer_size, const uint32 factor)
226
__m128i * ptr = (__m128i*) buffer;
227
__m128i * buf_end = (__m128i*) (buffer + buffer_size);
234
a = _mm_load_si128(ptr);
235
a = _mm_slli_epi16(a, factor);
236
_mm_store_si128(ptr, a);
239
} while(ptr < buf_end);
242
/* Dequantize all ten sub-bands of a 64x64 tile. The fixed <<5 pass scales
 * the whole buffer first, then each sub-band is shifted by its quantization
 * value minus 6 (values are offset by 6 in the wire format). */
static void rfx_quantization_decode_sse2(sint16* buffer, const uint32* quantization_values)
{
	_mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));

	rfx_quantization_decode_block_sse2(buffer, 4096, 5);

	rfx_quantization_decode_block_sse2(buffer, 1024, quantization_values[8] - 6); /* HL1 */
	rfx_quantization_decode_block_sse2(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
	rfx_quantization_decode_block_sse2(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
	rfx_quantization_decode_block_sse2(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
	rfx_quantization_decode_block_sse2(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
	rfx_quantization_decode_block_sse2(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
	rfx_quantization_decode_block_sse2(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
	rfx_quantization_decode_block_sse2(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
	rfx_quantization_decode_block_sse2(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
	rfx_quantization_decode_block_sse2(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
}
260
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
261
rfx_quantization_encode_block_sse2(sint16* buffer, const int buffer_size, const uint32 factor)
264
__m128i* ptr = (__m128i*) buffer;
265
__m128i* buf_end = (__m128i*) (buffer + buffer_size);
271
half = _mm_set1_epi16(1 << (factor - 1));
274
a = _mm_load_si128(ptr);
275
a = _mm_add_epi16(a, half);
276
a = _mm_srai_epi16(a, factor);
277
_mm_store_si128(ptr, a);
280
} while(ptr < buf_end);
283
/* Quantize all ten sub-bands of a 64x64 tile: per-band shifts first
 * (quantization value minus the wire-format offset of 6), then the fixed
 * >>5 pass over the whole buffer. Mirrors rfx_quantization_decode_sse2. */
static void rfx_quantization_encode_sse2(sint16* buffer, const uint32* quantization_values)
{
	_mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));

	rfx_quantization_encode_block_sse2(buffer, 1024, quantization_values[8] - 6); /* HL1 */
	rfx_quantization_encode_block_sse2(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
	rfx_quantization_encode_block_sse2(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
	rfx_quantization_encode_block_sse2(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
	rfx_quantization_encode_block_sse2(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
	rfx_quantization_encode_block_sse2(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
	rfx_quantization_encode_block_sse2(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
	rfx_quantization_encode_block_sse2(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
	rfx_quantization_encode_block_sse2(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
	rfx_quantization_encode_block_sse2(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */

	rfx_quantization_encode_block_sse2(buffer, 4096, 5);
}
301
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
302
rfx_dwt_2d_decode_block_horiz_sse2(sint16* l, sint16* h, sint16* dst, int subband_width)
307
sint16* dst_ptr = dst;
319
for (y = 0; y < subband_width; y++)
321
/* Even coefficients */
322
for (n = 0; n < subband_width; n+=8)
324
/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
326
l_n = _mm_load_si128((__m128i*) l_ptr);
328
h_n = _mm_load_si128((__m128i*) h_ptr);
329
h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1));
332
first = _mm_extract_epi16(h_n_m, 1);
333
h_n_m = _mm_insert_epi16(h_n_m, first, 0);
336
tmp_n = _mm_add_epi16(h_n, h_n_m);
337
tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
338
tmp_n = _mm_srai_epi16(tmp_n, 1);
340
dst_n = _mm_sub_epi16(l_n, tmp_n);
342
_mm_store_si128((__m128i*) l_ptr, dst_n);
347
l_ptr -= subband_width;
348
h_ptr -= subband_width;
350
/* Odd coefficients */
351
for (n = 0; n < subband_width; n+=8)
353
/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
355
h_n = _mm_load_si128((__m128i*) h_ptr);
357
h_n = _mm_slli_epi16(h_n, 1);
359
dst_n = _mm_load_si128((__m128i*) (l_ptr));
360
dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1));
361
if (n == subband_width - 8)
363
last = _mm_extract_epi16(dst_n_p, 6);
364
dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
367
tmp_n = _mm_add_epi16(dst_n_p, dst_n);
368
tmp_n = _mm_srai_epi16(tmp_n, 1);
370
tmp_n = _mm_add_epi16(tmp_n, h_n);
372
dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
373
dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
375
_mm_store_si128((__m128i*) dst_ptr, dst1);
376
_mm_store_si128((__m128i*) (dst_ptr + 8), dst2);
385
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
386
rfx_dwt_2d_decode_block_vert_sse2(sint16* l, sint16* h, sint16* dst, int subband_width)
391
sint16* dst_ptr = dst;
400
int total_width = subband_width + subband_width;
402
/* Even coefficients */
403
for (n = 0; n < subband_width; n++)
405
for (x = 0; x < total_width; x+=8)
407
/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
409
l_n = _mm_load_si128((__m128i*) l_ptr);
410
h_n = _mm_load_si128((__m128i*) h_ptr);
412
tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));;
414
tmp_n = _mm_add_epi16(tmp_n, h_n);
417
h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width));
418
tmp_n = _mm_add_epi16(tmp_n, h_n_m);
420
tmp_n = _mm_srai_epi16(tmp_n, 1);
422
dst_n = _mm_sub_epi16(l_n, tmp_n);
423
_mm_store_si128((__m128i*) dst_ptr, dst_n);
429
dst_ptr+=total_width;
433
dst_ptr = dst + total_width;
435
/* Odd coefficients */
436
for (n = 0; n < subband_width; n++)
438
for (x = 0; x < total_width; x+=8)
440
/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
442
h_n = _mm_load_si128((__m128i*) h_ptr);
443
dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width));
444
h_n = _mm_slli_epi16(h_n, 1);
447
if (n == subband_width - 1)
448
tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
451
dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width));
452
tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
454
tmp_n = _mm_srai_epi16(tmp_n, 1);
456
dst_n = _mm_add_epi16(tmp_n, h_n);
457
_mm_store_si128((__m128i*) dst_ptr, dst_n);
462
dst_ptr+=total_width;
466
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
467
rfx_dwt_2d_decode_block_sse2(sint16* buffer, sint16* idwt, int subband_width)
469
sint16 *hl, *lh, *hh, *ll;
470
sint16 *l_dst, *h_dst;
472
_mm_prefetch_buffer((char*) idwt, subband_width * 4 * sizeof(sint16));
474
/* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */
475
/* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
476
/* The lower part L uses LL(3) and HL(0). */
477
/* The higher part H uses LH(1) and HH(2). */
479
ll = buffer + subband_width * subband_width * 3;
483
rfx_dwt_2d_decode_block_horiz_sse2(ll, hl, l_dst, subband_width);
485
lh = buffer + subband_width * subband_width;
486
hh = buffer + subband_width * subband_width * 2;
487
h_dst = idwt + subband_width * subband_width * 2;
489
rfx_dwt_2d_decode_block_horiz_sse2(lh, hh, h_dst, subband_width);
491
/* Inverse DWT in vertical direction, results are stored in original buffer. */
492
rfx_dwt_2d_decode_block_vert_sse2(l_dst, h_dst, buffer, subband_width);
495
/* Full 3-level inverse DWT of a 64x64 tile: level 3 (8x8 sub-bands) up to
 * level 1 (32x32 sub-bands), using dwt_buffer as scratch at every level. */
static void rfx_dwt_2d_decode_sse2(sint16* buffer, sint16* dwt_buffer)
{
	_mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));

	rfx_dwt_2d_decode_block_sse2(buffer + 3840, dwt_buffer, 8);
	rfx_dwt_2d_decode_block_sse2(buffer + 3072, dwt_buffer, 16);
	rfx_dwt_2d_decode_block_sse2(buffer, dwt_buffer, 32);
}
504
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
505
rfx_dwt_2d_encode_block_vert_sse2(sint16* src, sint16* l, sint16* h, int subband_width)
517
total_width = subband_width << 1;
519
for (n = 0; n < subband_width; n++)
521
for (x = 0; x < total_width; x += 8)
523
src_2n = _mm_load_si128((__m128i*) src);
524
src_2n_1 = _mm_load_si128((__m128i*) (src + total_width));
525
if (n < subband_width - 1)
526
src_2n_2 = _mm_load_si128((__m128i*) (src + 2 * total_width));
530
/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
532
h_n = _mm_add_epi16(src_2n, src_2n_2);
533
h_n = _mm_srai_epi16(h_n, 1);
534
h_n = _mm_sub_epi16(src_2n_1, h_n);
535
h_n = _mm_srai_epi16(h_n, 1);
537
_mm_store_si128((__m128i*) h, h_n);
542
h_n_m = _mm_load_si128((__m128i*) (h - total_width));
544
/* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
546
l_n = _mm_add_epi16(h_n_m, h_n);
547
l_n = _mm_srai_epi16(l_n, 1);
548
l_n = _mm_add_epi16(l_n, src_2n);
550
_mm_store_si128((__m128i*) l, l_n);
560
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
561
rfx_dwt_2d_encode_block_horiz_sse2(sint16* src, sint16* l, sint16* h, int subband_width)
573
for (y = 0; y < subband_width; y++)
575
for (n = 0; n < subband_width; n += 8)
577
/* The following 3 Set operations consumes more than half of the total DWT processing time! */
578
src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
579
src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
580
src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[14] : src[16],
581
src[14], src[12], src[10], src[8], src[6], src[4], src[2]);
583
/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
585
h_n = _mm_add_epi16(src_2n, src_2n_2);
586
h_n = _mm_srai_epi16(h_n, 1);
587
h_n = _mm_sub_epi16(src_2n_1, h_n);
588
h_n = _mm_srai_epi16(h_n, 1);
590
_mm_store_si128((__m128i*) h, h_n);
592
h_n_m = _mm_loadu_si128((__m128i*) (h - 1));
595
first = _mm_extract_epi16(h_n_m, 1);
596
h_n_m = _mm_insert_epi16(h_n_m, first, 0);
599
/* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
601
l_n = _mm_add_epi16(h_n_m, h_n);
602
l_n = _mm_srai_epi16(l_n, 1);
603
l_n = _mm_add_epi16(l_n, src_2n);
605
_mm_store_si128((__m128i*) l, l_n);
614
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
615
rfx_dwt_2d_encode_block_sse2(sint16* buffer, sint16* dwt, int subband_width)
617
sint16 *hl, *lh, *hh, *ll;
618
sint16 *l_src, *h_src;
620
_mm_prefetch_buffer((char*) dwt, subband_width * 4 * sizeof(sint16));
622
/* DWT in vertical direction, results in 2 sub-bands in L, H order in tmp buffer dwt. */
625
h_src = dwt + subband_width * subband_width * 2;
627
rfx_dwt_2d_encode_block_vert_sse2(buffer, l_src, h_src, subband_width);
629
/* DWT in horizontal direction, results in 4 sub-bands in HL(0), LH(1), HH(2), LL(3) order, stored in original buffer. */
630
/* The lower part L generates LL(3) and HL(0). */
631
/* The higher part H generates LH(1) and HH(2). */
633
ll = buffer + subband_width * subband_width * 3;
636
lh = buffer + subband_width * subband_width;
637
hh = buffer + subband_width * subband_width * 2;
639
rfx_dwt_2d_encode_block_horiz_sse2(l_src, ll, hl, subband_width);
640
rfx_dwt_2d_encode_block_horiz_sse2(h_src, lh, hh, subband_width);
643
/* Full 3-level forward DWT of a 64x64 tile: level 1 (32x32 sub-bands) down
 * to level 3 (8x8 sub-bands) — the reverse order of the decoder. */
static void rfx_dwt_2d_encode_sse2(sint16* buffer, sint16* dwt_buffer)
{
	_mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));

	rfx_dwt_2d_encode_block_sse2(buffer, dwt_buffer, 32);
	rfx_dwt_2d_encode_block_sse2(buffer + 3072, dwt_buffer, 16);
	rfx_dwt_2d_encode_block_sse2(buffer + 3840, dwt_buffer, 8);
}
652
/* Install the SSE2 implementations into the codec context's function
 * pointers and (when profiling is compiled in) rename the profilers so
 * reports show which variant ran. */
void rfx_init_sse2(RFX_CONTEXT* context)
{
	DEBUG_RFX("Using SSE2 optimizations");

	/* Profiler labels — compiled out unless profiling is enabled. */
	IF_PROFILER(context->priv->prof_rfx_decode_ycbcr_to_rgb->name = "rfx_decode_ycbcr_to_rgb_sse2");
	IF_PROFILER(context->priv->prof_rfx_encode_rgb_to_ycbcr->name = "rfx_encode_rgb_to_ycbcr_sse2");
	IF_PROFILER(context->priv->prof_rfx_quantization_decode->name = "rfx_quantization_decode_sse2");
	IF_PROFILER(context->priv->prof_rfx_quantization_encode->name = "rfx_quantization_encode_sse2");
	IF_PROFILER(context->priv->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_sse2");
	IF_PROFILER(context->priv->prof_rfx_dwt_2d_encode->name = "rfx_dwt_2d_encode_sse2");

	/* Hot-path dispatch: point the context at the SSE2 kernels. */
	context->decode_ycbcr_to_rgb = rfx_decode_ycbcr_to_rgb_sse2;
	context->encode_rgb_to_ycbcr = rfx_encode_rgb_to_ycbcr_sse2;
	context->quantization_decode = rfx_quantization_decode_sse2;
	context->quantization_encode = rfx_quantization_encode_sse2;
	context->dwt_2d_decode = rfx_dwt_2d_decode_sse2;
	context->dwt_2d_encode = rfx_dwt_2d_encode_sse2;
}