~ubuntu-branches/ubuntu/vivid/freerdp/vivid

« back to all changes in this revision

Viewing changes to libfreerdp-codec/rfx_sse2.c

  • Committer: Package Import Robot
  • Author(s): Iain Lane
  • Date: 2014-11-11 12:20:50 UTC
  • mfrom: (1.1.9) (9.1.17 sid)
  • Revision ID: package-import@ubuntu.com-20141111122050-wyr8hrnwco9fcmum
Tags: 1.1.0~git20140921.1.440916e+dfsg1-2ubuntu1
* Merge with Debian unstable, remaining changes
  - Disable ffmpeg support
* Disable gstreamer support, this relies on gstreamer 0.10 and we don't want
  to add any more deps on that.

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
/**
2
 
 * FreeRDP: A Remote Desktop Protocol client.
3
 
 * RemoteFX Codec Library - SSE2 Optimizations
4
 
 *
5
 
 * Copyright 2011 Stephen Erisman
6
 
 * Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
7
 
 *
8
 
 * Licensed under the Apache License, Version 2.0 (the "License");
9
 
 * you may not use this file except in compliance with the License.
10
 
 * You may obtain a copy of the License at
11
 
 *
12
 
 *     http://www.apache.org/licenses/LICENSE-2.0
13
 
 *
14
 
 * Unless required by applicable law or agreed to in writing, software
15
 
 * distributed under the License is distributed on an "AS IS" BASIS,
16
 
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
 
 * See the License for the specific language governing permissions and
18
 
 * limitations under the License.
19
 
 */
20
 
 
21
 
#include <stdio.h>
22
 
#include <stdlib.h>
23
 
#include <string.h>
24
 
#include <xmmintrin.h>
25
 
#include <emmintrin.h>
26
 
 
27
 
#include "rfx_types.h"
28
 
#include "rfx_sse2.h"
29
 
 
30
 
#ifdef _MSC_VER
31
 
#define __attribute__(...)
32
 
#endif
33
 
 
34
 
#define CACHE_LINE_BYTES        64
35
 
 
36
 
/*
 * Clamp every signed 16-bit lane of _val to the inclusive range
 * [_min, _max] and assign the clamped vector back to _val.
 *
 * _val must be a modifiable __m128i lvalue; it is evaluated twice, so do not
 * pass an expression with side effects.
 */
#define _mm_between_epi16(_val, _min, _max) \
	do { (_val) = _mm_min_epi16((_max), _mm_max_epi16((_val), (_min))); } while (0)
38
 
 
39
 
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
40
 
_mm_prefetch_buffer(char * buffer, int num_bytes)
41
 
{
42
 
        __m128i * buf = (__m128i*) buffer;
43
 
        int i;
44
 
        for (i = 0; i < (num_bytes / sizeof(__m128i)); i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
45
 
        {
46
 
                _mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
47
 
        }
48
 
}
49
 
 
50
 
/**
 * In-place YCbCr -> RGB conversion of three planes of 4096 signed 16-bit
 * samples each.
 *
 * On entry the buffers hold Y, Cb and Cr; on return they hold R, G and B
 * respectively, each clamped to [0, 255]. The buffers are accessed with
 * aligned 128-bit loads/stores, so they must be 16-byte aligned.
 */
static void rfx_decode_ycbcr_to_rgb_sse2(sint16* y_r_buffer, sint16* cb_g_buffer, sint16* cr_b_buffer)
{
	/* Number of 128-bit vectors per plane and per cache line. */
	const int vec_count = (int) (4096 * sizeof(sint16) / sizeof(__m128i));
	const int vec_per_line = (int) (CACHE_LINE_BYTES / sizeof(__m128i));

	__m128i* const plane_y_r = (__m128i*) y_r_buffer;
	__m128i* const plane_cb_g = (__m128i*) cb_g_buffer;
	__m128i* const plane_cr_b = (__m128i*) cr_b_buffer;

	const __m128i lower = _mm_setzero_si128();
	const __m128i upper = _mm_set1_epi16(255);

	const __m128i coef_r_cr = _mm_set1_epi16(22986);  /*  1.403 << 14 */
	const __m128i coef_g_cb = _mm_set1_epi16(-5636);  /* -0.344 << 14 */
	const __m128i coef_g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
	const __m128i coef_b_cb = _mm_set1_epi16(28999);  /*  1.770 << 14 */
	const __m128i bias = _mm_set1_epi16(4096);

	int idx;

	/* Warm the cache: one prefetch hint per cache line of every plane. */
	for (idx = 0; idx < vec_count; idx += vec_per_line)
	{
		_mm_prefetch((char*) (plane_y_r + idx), _MM_HINT_NTA);
		_mm_prefetch((char*) (plane_cb_g + idx), _MM_HINT_NTA);
		_mm_prefetch((char*) (plane_cr_b + idx), _MM_HINT_NTA);
	}

	/*
	 * SSE2 offers signed 16-bit multiplication whose 32-bit product is
	 * returned as either its high or its low word. To keep the fractional
	 * conversion factors without losing information they are scaled by the
	 * highest possible 2^n, the upper 16 bits of the product are taken
	 * (_mm_mulhi_epi16) and the result is corrected by 2^(16-n).
	 * For the factors of this conversion matrix the best n is 14.
	 *
	 * Example for r:
	 *   r = (y>>5) + 128 + (cr*1.403)>>5                     base formula
	 *   r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5    see above
	 *   r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5           simplification
	 *   r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
	 */
	for (idx = 0; idx < vec_count; idx++)
	{
		__m128i luma;
		__m128i chroma_b;
		__m128i chroma_r;
		__m128i red;
		__m128i green;
		__m128i blue;

		/* luma = (Y + 4096) >> 2 */
		luma = _mm_srai_epi16(_mm_add_epi16(_mm_load_si128(plane_y_r + idx), bias), 2);
		chroma_b = _mm_load_si128(plane_cb_g + idx);
		chroma_r = _mm_load_si128(plane_cr_b + idx);

		/* R = clamp((luma + HIWORD(cr * 22986)) >> 3, 0, 255) */
		red = _mm_add_epi16(luma, _mm_mulhi_epi16(chroma_r, coef_r_cr));
		red = _mm_srai_epi16(red, 3);
		red = _mm_min_epi16(upper, _mm_max_epi16(red, lower));

		/* G = clamp((luma + HIWORD(cb * -5636) + HIWORD(cr * -11698)) >> 3, 0, 255) */
		green = _mm_add_epi16(luma, _mm_mulhi_epi16(chroma_b, coef_g_cb));
		green = _mm_add_epi16(green, _mm_mulhi_epi16(chroma_r, coef_g_cr));
		green = _mm_srai_epi16(green, 3);
		green = _mm_min_epi16(upper, _mm_max_epi16(green, lower));

		/* B = clamp((luma + HIWORD(cb * 28999)) >> 3, 0, 255) */
		blue = _mm_add_epi16(luma, _mm_mulhi_epi16(chroma_b, coef_b_cb));
		blue = _mm_srai_epi16(blue, 3);
		blue = _mm_min_epi16(upper, _mm_max_epi16(blue, lower));

		/* All inputs were loaded above, so the planes can be overwritten now. */
		_mm_store_si128(plane_y_r + idx, red);
		_mm_store_si128(plane_cb_g + idx, green);
		_mm_store_si128(plane_cr_b + idx, blue);
	}
}
131
 
 
132
 
/* The encodec YCbCr coeffectients are represented as 11.5 fixed-point numbers. See rfx_encode.c */
133
 
static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer, sint16* cr_b_buffer)
134
 
{
135
 
        __m128i min = _mm_set1_epi16(-128 << 5);
136
 
        __m128i max = _mm_set1_epi16(127 << 5);
137
 
 
138
 
        __m128i* y_r_buf = (__m128i*) y_r_buffer;
139
 
        __m128i* cb_g_buf = (__m128i*) cb_g_buffer;
140
 
        __m128i* cr_b_buf = (__m128i*) cr_b_buffer;
141
 
 
142
 
        __m128i y;
143
 
        __m128i cr;
144
 
        __m128i cb;
145
 
        __m128i r;
146
 
        __m128i g;
147
 
        __m128i b;
148
 
 
149
 
        __m128i y_r  = _mm_set1_epi16(9798);   //  0.299000 << 15
150
 
        __m128i y_g  = _mm_set1_epi16(19235);  //  0.587000 << 15
151
 
        __m128i y_b  = _mm_set1_epi16(3735);   //  0.114000 << 15
152
 
        __m128i cb_r = _mm_set1_epi16(-5535);  // -0.168935 << 15
153
 
        __m128i cb_g = _mm_set1_epi16(-10868); // -0.331665 << 15
154
 
        __m128i cb_b = _mm_set1_epi16(16403);  //  0.500590 << 15
155
 
        __m128i cr_r = _mm_set1_epi16(16377);  //  0.499813 << 15
156
 
        __m128i cr_g = _mm_set1_epi16(-13714); // -0.418531 << 15
157
 
        __m128i cr_b = _mm_set1_epi16(-2663);  // -0.081282 << 15
158
 
 
159
 
        int i;
160
 
 
161
 
        for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
162
 
        {
163
 
                _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
164
 
                _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
165
 
                _mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
166
 
        }
167
 
        for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
168
 
        {
169
 
                /*
170
 
                In order to use SSE2 signed 16-bit integer multiplication we need to convert
171
 
                the floating point factors to signed int without loosing information.
172
 
                The result of this multiplication is 32 bit and using SSE2 we get either the
173
 
                product's hi or lo word.
174
 
                Thus we will multiply the factors by the highest possible 2^n and take the
175
 
                upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16).
176
 
                Since the final result needs to be scaled by << 5 and also in in order to keep
177
 
                the precision within the upper 16 bits we will also have to scale the RGB
178
 
                values used in the multiplication by << 5+(16-n).
179
 
                */
180
 
 
181
 
                /* r = y_r_buf[i]; */
182
 
                r = _mm_load_si128(&y_r_buf[i]);
183
 
 
184
 
                /* g = cb_g_buf[i]; */
185
 
                g = _mm_load_si128(&cb_g_buf[i]);
186
 
 
187
 
                /* b = cr_b_buf[i]; */
188
 
                b = _mm_load_si128(&cr_b_buf[i]);
189
 
 
190
 
                /* r<<6; g<<6; b<<6 */
191
 
                r = _mm_slli_epi16(r, 6);
192
 
                g = _mm_slli_epi16(g, 6);
193
 
                b = _mm_slli_epi16(b, 6);
194
 
 
195
 
                /* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
196
 
                y = _mm_mulhi_epi16(r, y_r);
197
 
                y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
198
 
                y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
199
 
                y = _mm_add_epi16(y, min);
200
 
                /* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
201
 
                _mm_between_epi16(y, min, max);
202
 
                _mm_store_si128(&y_r_buf[i], y);
203
 
 
204
 
                /* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
205
 
                cb = _mm_mulhi_epi16(r, cb_r);
206
 
                cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
207
 
                cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
208
 
                /* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
209
 
                _mm_between_epi16(cb, min, max);
210
 
                _mm_store_si128(&cb_g_buf[i], cb);
211
 
 
212
 
                /* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
213
 
                cr = _mm_mulhi_epi16(r, cr_r);
214
 
                cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
215
 
                cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
216
 
                /* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
217
 
                _mm_between_epi16(cr, min, max);
218
 
                _mm_store_si128(&cr_b_buf[i], cr);
219
 
        }
220
 
}
221
 
 
222
 
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
223
 
rfx_quantization_decode_block_sse2(sint16* buffer, const int buffer_size, const uint32 factor)
224
 
{
225
 
        __m128i a;
226
 
        __m128i * ptr = (__m128i*) buffer;
227
 
        __m128i * buf_end = (__m128i*) (buffer + buffer_size);
228
 
 
229
 
        if (factor == 0)
230
 
                return;
231
 
 
232
 
        do
233
 
        {
234
 
                a = _mm_load_si128(ptr);
235
 
                a = _mm_slli_epi16(a, factor);
236
 
                _mm_store_si128(ptr, a);
237
 
 
238
 
                ptr++;
239
 
        } while(ptr < buf_end);
240
 
}
241
 
 
242
 
/**
 * Dequantize a 4096-coefficient buffer in place.
 *
 * The whole buffer is first shifted left by 5 bits; afterwards each sub-band
 * is shifted left by its own quantization value minus 6.
 *
 * @param buffer              4096 signed 16-bit coefficients
 * @param quantization_values per-sub-band quantization values, indexed 0..9
 */
static void rfx_quantization_decode_sse2(sint16* buffer, const uint32* quantization_values)
{
	/* Sub-band layout: start offset, coefficient count, quantization index. */
	static const struct
	{
		int offset;
		int count;
		int quant_index;
	} subbands[] =
	{
		{    0, 1024, 8 }, /* HL1 */
		{ 1024, 1024, 7 }, /* LH1 */
		{ 2048, 1024, 9 }, /* HH1 */
		{ 3072,  256, 5 }, /* HL2 */
		{ 3328,  256, 4 }, /* LH2 */
		{ 3584,  256, 6 }, /* HH2 */
		{ 3840,   64, 2 }, /* HL3 */
		{ 3904,   64, 1 }, /* LH3 */
		{ 3968,   64, 3 }, /* HH3 */
		{ 4032,   64, 0 }  /* LL3 */
	};
	unsigned int band;

	_mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));

	/* Common scale applied to every coefficient. */
	rfx_quantization_decode_block_sse2(buffer, 4096, 5);

	for (band = 0; band < sizeof(subbands) / sizeof(subbands[0]); band++)
	{
		rfx_quantization_decode_block_sse2(buffer + subbands[band].offset,
			subbands[band].count,
			quantization_values[subbands[band].quant_index] - 6);
	}
}
259
 
 
260
 
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
261
 
rfx_quantization_encode_block_sse2(sint16* buffer, const int buffer_size, const uint32 factor)
262
 
{
263
 
        __m128i a;
264
 
        __m128i* ptr = (__m128i*) buffer;
265
 
        __m128i* buf_end = (__m128i*) (buffer + buffer_size);
266
 
        __m128i half;
267
 
 
268
 
        if (factor == 0)
269
 
                return;
270
 
 
271
 
        half = _mm_set1_epi16(1 << (factor - 1));
272
 
        do
273
 
        {
274
 
                a = _mm_load_si128(ptr);
275
 
                a = _mm_add_epi16(a, half);
276
 
                a = _mm_srai_epi16(a, factor);
277
 
                _mm_store_si128(ptr, a);
278
 
 
279
 
                ptr++;
280
 
        } while(ptr < buf_end);
281
 
}
282
 
 
283
 
/**
 * Quantize a 4096-coefficient buffer in place.
 *
 * Each sub-band is first shifted right (with rounding) by its own
 * quantization value minus 6; afterwards the whole buffer is shifted right
 * (with rounding) by 5 bits.
 *
 * @param buffer              4096 signed 16-bit coefficients
 * @param quantization_values per-sub-band quantization values, indexed 0..9
 */
static void rfx_quantization_encode_sse2(sint16* buffer, const uint32* quantization_values)
{
	/* Sub-band layout: start offset, coefficient count, quantization index. */
	static const struct
	{
		int offset;
		int count;
		int quant_index;
	} subbands[] =
	{
		{    0, 1024, 8 }, /* HL1 */
		{ 1024, 1024, 7 }, /* LH1 */
		{ 2048, 1024, 9 }, /* HH1 */
		{ 3072,  256, 5 }, /* HL2 */
		{ 3328,  256, 4 }, /* LH2 */
		{ 3584,  256, 6 }, /* HH2 */
		{ 3840,   64, 2 }, /* HL3 */
		{ 3904,   64, 1 }, /* LH3 */
		{ 3968,   64, 3 }, /* HH3 */
		{ 4032,   64, 0 }  /* LL3 */
	};
	unsigned int band;

	_mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));

	for (band = 0; band < sizeof(subbands) / sizeof(subbands[0]); band++)
	{
		rfx_quantization_encode_block_sse2(buffer + subbands[band].offset,
			subbands[band].count,
			quantization_values[subbands[band].quant_index] - 6);
	}

	/* Common scale applied to every coefficient. */
	rfx_quantization_encode_block_sse2(buffer, 4096, 5);
}
300
 
 
301
 
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
302
 
rfx_dwt_2d_decode_block_horiz_sse2(sint16* l, sint16* h, sint16* dst, int subband_width)
303
 
{
304
 
        int y, n;
305
 
        sint16* l_ptr = l;
306
 
        sint16* h_ptr = h;
307
 
        sint16* dst_ptr = dst;
308
 
        int first;
309
 
        int last;
310
 
        __m128i l_n;
311
 
        __m128i h_n;
312
 
        __m128i h_n_m;
313
 
        __m128i tmp_n;
314
 
        __m128i dst_n;
315
 
        __m128i dst_n_p;
316
 
        __m128i dst1;
317
 
        __m128i dst2;
318
 
 
319
 
        for (y = 0; y < subband_width; y++)
320
 
        {
321
 
                /* Even coefficients */
322
 
                for (n = 0; n < subband_width; n+=8)
323
 
                {
324
 
                        /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
325
 
                        
326
 
                        l_n = _mm_load_si128((__m128i*) l_ptr);
327
 
 
328
 
                        h_n = _mm_load_si128((__m128i*) h_ptr);
329
 
                        h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1));
330
 
                        if (n == 0)
331
 
                        {
332
 
                                first = _mm_extract_epi16(h_n_m, 1);
333
 
                                h_n_m = _mm_insert_epi16(h_n_m, first, 0);
334
 
                        }
335
 
                        
336
 
                        tmp_n = _mm_add_epi16(h_n, h_n_m);
337
 
                        tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
338
 
                        tmp_n = _mm_srai_epi16(tmp_n, 1);
339
 
                        
340
 
                        dst_n = _mm_sub_epi16(l_n, tmp_n);
341
 
                        
342
 
                        _mm_store_si128((__m128i*) l_ptr, dst_n);
343
 
                        
344
 
                        l_ptr+=8;
345
 
                        h_ptr+=8;
346
 
                }
347
 
                l_ptr -= subband_width;
348
 
                h_ptr -= subband_width;
349
 
                
350
 
                /* Odd coefficients */
351
 
                for (n = 0; n < subband_width; n+=8)
352
 
                {
353
 
                        /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
354
 
                        
355
 
                        h_n = _mm_load_si128((__m128i*) h_ptr);
356
 
                        
357
 
                        h_n = _mm_slli_epi16(h_n, 1);
358
 
                        
359
 
                        dst_n = _mm_load_si128((__m128i*) (l_ptr));
360
 
                        dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1));
361
 
                        if (n == subband_width - 8)
362
 
                        {
363
 
                                last = _mm_extract_epi16(dst_n_p, 6);
364
 
                                dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
365
 
                        }
366
 
                        
367
 
                        tmp_n = _mm_add_epi16(dst_n_p, dst_n);
368
 
                        tmp_n = _mm_srai_epi16(tmp_n, 1);
369
 
                        
370
 
                        tmp_n = _mm_add_epi16(tmp_n, h_n);
371
 
                        
372
 
                        dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
373
 
                        dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
374
 
                        
375
 
                        _mm_store_si128((__m128i*) dst_ptr, dst1);
376
 
                        _mm_store_si128((__m128i*) (dst_ptr + 8), dst2);
377
 
                        
378
 
                        l_ptr+=8;
379
 
                        h_ptr+=8;
380
 
                        dst_ptr+=16;
381
 
                }
382
 
        }
383
 
}
384
 
 
385
 
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
386
 
rfx_dwt_2d_decode_block_vert_sse2(sint16* l, sint16* h, sint16* dst, int subband_width)
387
 
{
388
 
        int x, n;
389
 
        sint16* l_ptr = l;
390
 
        sint16* h_ptr = h;
391
 
        sint16* dst_ptr = dst;
392
 
        __m128i l_n;
393
 
        __m128i h_n;
394
 
        __m128i tmp_n;
395
 
        __m128i h_n_m;
396
 
        __m128i dst_n;
397
 
        __m128i dst_n_m;
398
 
        __m128i dst_n_p;
399
 
        
400
 
        int total_width = subband_width + subband_width;
401
 
 
402
 
        /* Even coefficients */
403
 
        for (n = 0; n < subband_width; n++)
404
 
        {
405
 
                for (x = 0; x < total_width; x+=8)
406
 
                {
407
 
                        /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
408
 
                        
409
 
                        l_n = _mm_load_si128((__m128i*) l_ptr);
410
 
                        h_n = _mm_load_si128((__m128i*) h_ptr);
411
 
                        
412
 
                        tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));;
413
 
                        if (n == 0)
414
 
                                tmp_n = _mm_add_epi16(tmp_n, h_n);
415
 
                        else
416
 
                        {
417
 
                                h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width));
418
 
                                tmp_n = _mm_add_epi16(tmp_n, h_n_m);
419
 
                        }
420
 
                        tmp_n = _mm_srai_epi16(tmp_n, 1);
421
 
                        
422
 
                        dst_n = _mm_sub_epi16(l_n, tmp_n);
423
 
                        _mm_store_si128((__m128i*) dst_ptr, dst_n);
424
 
                        
425
 
                        l_ptr+=8;
426
 
                        h_ptr+=8;
427
 
                        dst_ptr+=8;
428
 
                }
429
 
                dst_ptr+=total_width;
430
 
        }
431
 
        
432
 
        h_ptr = h;
433
 
        dst_ptr = dst + total_width;
434
 
        
435
 
        /* Odd coefficients */
436
 
        for (n = 0; n < subband_width; n++)
437
 
        {
438
 
                for (x = 0; x < total_width; x+=8)
439
 
                {
440
 
                        /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
441
 
                        
442
 
                        h_n = _mm_load_si128((__m128i*) h_ptr);
443
 
                        dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width));
444
 
                        h_n = _mm_slli_epi16(h_n, 1);
445
 
                        
446
 
                        tmp_n = dst_n_m;
447
 
                        if (n == subband_width - 1)
448
 
                                tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
449
 
                        else
450
 
                        {
451
 
                                dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width));
452
 
                                tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
453
 
                        }
454
 
                        tmp_n = _mm_srai_epi16(tmp_n, 1);
455
 
                        
456
 
                        dst_n = _mm_add_epi16(tmp_n, h_n);
457
 
                        _mm_store_si128((__m128i*) dst_ptr, dst_n);
458
 
 
459
 
                        h_ptr+=8;
460
 
                        dst_ptr+=8;
461
 
                }
462
 
                dst_ptr+=total_width;
463
 
        }
464
 
}
465
 
 
466
 
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
467
 
rfx_dwt_2d_decode_block_sse2(sint16* buffer, sint16* idwt, int subband_width)
468
 
{
469
 
        sint16 *hl, *lh, *hh, *ll;
470
 
        sint16 *l_dst, *h_dst;
471
 
 
472
 
        _mm_prefetch_buffer((char*) idwt, subband_width * 4 * sizeof(sint16));
473
 
 
474
 
        /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */
475
 
        /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
476
 
        /* The lower part L uses LL(3) and HL(0). */
477
 
        /* The higher part H uses LH(1) and HH(2). */
478
 
 
479
 
        ll = buffer + subband_width * subband_width * 3;
480
 
        hl = buffer;
481
 
        l_dst = idwt;
482
 
 
483
 
        rfx_dwt_2d_decode_block_horiz_sse2(ll, hl, l_dst, subband_width);
484
 
 
485
 
        lh = buffer + subband_width * subband_width;
486
 
        hh = buffer + subband_width * subband_width * 2;
487
 
        h_dst = idwt + subband_width * subband_width * 2;
488
 
        
489
 
        rfx_dwt_2d_decode_block_horiz_sse2(lh, hh, h_dst, subband_width);
490
 
 
491
 
        /* Inverse DWT in vertical direction, results are stored in original buffer. */
492
 
        rfx_dwt_2d_decode_block_vert_sse2(l_dst, h_dst, buffer, subband_width);
493
 
}
494
 
 
495
 
/**
 * Inverse 2D DWT of a 4096-coefficient buffer, performed in place.
 *
 * Three levels are reconstructed in sequence, with sub-band widths 8, 16
 * and 32, each using `dwt_buffer` as temporary storage.
 *
 * @param buffer     4096 signed 16-bit coefficients, overwritten with the result
 * @param dwt_buffer temporary buffer used by every level
 */
static void rfx_dwt_2d_decode_sse2(sint16* buffer, sint16* dwt_buffer)
{
	/* Start offset inside `buffer` and sub-band width of each level. */
	static const struct
	{
		int offset;
		int subband_width;
	} levels[] =
	{
		{ 3840,  8 },
		{ 3072, 16 },
		{    0, 32 }
	};
	unsigned int level;

	_mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));

	for (level = 0; level < sizeof(levels) / sizeof(levels[0]); level++)
	{
		rfx_dwt_2d_decode_block_sse2(buffer + levels[level].offset, dwt_buffer,
			levels[level].subband_width);
	}
}
503
 
 
504
 
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
505
 
rfx_dwt_2d_encode_block_vert_sse2(sint16* src, sint16* l, sint16* h, int subband_width)
506
 
{
507
 
        int total_width;
508
 
        int x;
509
 
        int n;
510
 
        __m128i src_2n;
511
 
        __m128i src_2n_1;
512
 
        __m128i src_2n_2;
513
 
        __m128i h_n;
514
 
        __m128i h_n_m;
515
 
        __m128i l_n;
516
 
 
517
 
        total_width = subband_width << 1;
518
 
 
519
 
        for (n = 0; n < subband_width; n++)
520
 
        {
521
 
                for (x = 0; x < total_width; x += 8)
522
 
                {
523
 
                        src_2n = _mm_load_si128((__m128i*) src);
524
 
                        src_2n_1 = _mm_load_si128((__m128i*) (src + total_width));
525
 
                        if (n < subband_width - 1)
526
 
                                src_2n_2 = _mm_load_si128((__m128i*) (src + 2 * total_width));
527
 
                        else
528
 
                                src_2n_2 = src_2n;
529
 
 
530
 
                        /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
531
 
 
532
 
                        h_n = _mm_add_epi16(src_2n, src_2n_2);
533
 
                        h_n = _mm_srai_epi16(h_n, 1);
534
 
                        h_n = _mm_sub_epi16(src_2n_1, h_n);
535
 
                        h_n = _mm_srai_epi16(h_n, 1);
536
 
 
537
 
                        _mm_store_si128((__m128i*) h, h_n);
538
 
 
539
 
                        if (n == 0)
540
 
                                h_n_m = h_n;
541
 
                        else
542
 
                                h_n_m = _mm_load_si128((__m128i*) (h - total_width));
543
 
 
544
 
                        /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
545
 
 
546
 
                        l_n = _mm_add_epi16(h_n_m, h_n);
547
 
                        l_n = _mm_srai_epi16(l_n, 1);
548
 
                        l_n = _mm_add_epi16(l_n, src_2n);
549
 
 
550
 
                        _mm_store_si128((__m128i*) l, l_n);
551
 
 
552
 
                        src += 8;
553
 
                        l += 8;
554
 
                        h += 8;
555
 
                }
556
 
                src += total_width;
557
 
        }
558
 
}
559
 
 
560
 
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
561
 
rfx_dwt_2d_encode_block_horiz_sse2(sint16* src, sint16* l, sint16* h, int subband_width)
562
 
{
563
 
        int y;
564
 
        int n;
565
 
        int first;
566
 
        __m128i src_2n;
567
 
        __m128i src_2n_1;
568
 
        __m128i src_2n_2;
569
 
        __m128i h_n;
570
 
        __m128i h_n_m;
571
 
        __m128i l_n;
572
 
 
573
 
        for (y = 0; y < subband_width; y++)
574
 
        {
575
 
                for (n = 0; n < subband_width; n += 8)
576
 
                {
577
 
                        /* The following 3 Set operations consumes more than half of the total DWT processing time! */
578
 
                        src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
579
 
                        src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
580
 
                        src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[14] : src[16],
581
 
                                src[14], src[12], src[10], src[8], src[6], src[4], src[2]);
582
 
 
583
 
                        /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
584
 
 
585
 
                        h_n = _mm_add_epi16(src_2n, src_2n_2);
586
 
                        h_n = _mm_srai_epi16(h_n, 1);
587
 
                        h_n = _mm_sub_epi16(src_2n_1, h_n);
588
 
                        h_n = _mm_srai_epi16(h_n, 1);
589
 
 
590
 
                        _mm_store_si128((__m128i*) h, h_n);
591
 
 
592
 
                        h_n_m = _mm_loadu_si128((__m128i*) (h - 1));
593
 
                        if (n == 0)
594
 
                        {
595
 
                                first = _mm_extract_epi16(h_n_m, 1);
596
 
                                h_n_m = _mm_insert_epi16(h_n_m, first, 0);
597
 
                        }
598
 
 
599
 
                        /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
600
 
 
601
 
                        l_n = _mm_add_epi16(h_n_m, h_n);
602
 
                        l_n = _mm_srai_epi16(l_n, 1);
603
 
                        l_n = _mm_add_epi16(l_n, src_2n);
604
 
 
605
 
                        _mm_store_si128((__m128i*) l, l_n);
606
 
 
607
 
                        src += 16;
608
 
                        l += 8;
609
 
                        h += 8;
610
 
                }
611
 
        }
612
 
}
613
 
 
614
 
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
615
 
rfx_dwt_2d_encode_block_sse2(sint16* buffer, sint16* dwt, int subband_width)
616
 
{
617
 
        sint16 *hl, *lh, *hh, *ll;
618
 
        sint16 *l_src, *h_src;
619
 
 
620
 
        _mm_prefetch_buffer((char*) dwt, subband_width * 4 * sizeof(sint16));
621
 
 
622
 
        /* DWT in vertical direction, results in 2 sub-bands in L, H order in tmp buffer dwt. */
623
 
 
624
 
        l_src = dwt;
625
 
        h_src = dwt + subband_width * subband_width * 2;
626
 
 
627
 
        rfx_dwt_2d_encode_block_vert_sse2(buffer, l_src, h_src, subband_width);
628
 
 
629
 
        /* DWT in horizontal direction, results in 4 sub-bands in HL(0), LH(1), HH(2), LL(3) order, stored in original buffer. */
630
 
        /* The lower part L generates LL(3) and HL(0). */
631
 
        /* The higher part H generates LH(1) and HH(2). */
632
 
 
633
 
        ll = buffer + subband_width * subband_width * 3;
634
 
        hl = buffer;
635
 
 
636
 
        lh = buffer + subband_width * subband_width;
637
 
        hh = buffer + subband_width * subband_width * 2;
638
 
 
639
 
        rfx_dwt_2d_encode_block_horiz_sse2(l_src, ll, hl, subband_width);
640
 
        rfx_dwt_2d_encode_block_horiz_sse2(h_src, lh, hh, subband_width);
641
 
}
642
 
 
643
 
/**
 * Forward 2D DWT of a 4096-sample buffer, performed in place.
 *
 * Three levels are computed in sequence, with sub-band widths 32, 16 and 8,
 * each using `dwt_buffer` as temporary storage.
 *
 * @param buffer     4096 signed 16-bit samples, overwritten with the sub-bands
 * @param dwt_buffer temporary buffer used by every level
 */
static void rfx_dwt_2d_encode_sse2(sint16* buffer, sint16* dwt_buffer)
{
	/* Start offset inside `buffer` and sub-band width of each level. */
	static const struct
	{
		int offset;
		int subband_width;
	} levels[] =
	{
		{    0, 32 },
		{ 3072, 16 },
		{ 3840,  8 }
	};
	unsigned int level;

	_mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));

	for (level = 0; level < sizeof(levels) / sizeof(levels[0]); level++)
	{
		rfx_dwt_2d_encode_block_sse2(buffer + levels[level].offset, dwt_buffer,
			levels[level].subband_width);
	}
}
651
 
 
652
 
/**
 * Install the SSE2 implementations of the RemoteFX codec primitives into
 * `context` and, when profiling is enabled, label the matching profilers
 * with the names of the SSE2 routines.
 *
 * @param context codec context whose function pointers are replaced
 */
void rfx_init_sse2(RFX_CONTEXT* context)
{
	DEBUG_RFX("Using SSE2 optimizations");

	/* Colour conversion */
	IF_PROFILER(context->priv->prof_rfx_decode_ycbcr_to_rgb->name = "rfx_decode_ycbcr_to_rgb_sse2");
	context->decode_ycbcr_to_rgb = rfx_decode_ycbcr_to_rgb_sse2;

	IF_PROFILER(context->priv->prof_rfx_encode_rgb_to_ycbcr->name = "rfx_encode_rgb_to_ycbcr_sse2");
	context->encode_rgb_to_ycbcr = rfx_encode_rgb_to_ycbcr_sse2;

	/* Quantization */
	IF_PROFILER(context->priv->prof_rfx_quantization_decode->name = "rfx_quantization_decode_sse2");
	context->quantization_decode = rfx_quantization_decode_sse2;

	IF_PROFILER(context->priv->prof_rfx_quantization_encode->name = "rfx_quantization_encode_sse2");
	context->quantization_encode = rfx_quantization_encode_sse2;

	/* Discrete wavelet transform */
	IF_PROFILER(context->priv->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_sse2");
	context->dwt_2d_decode = rfx_dwt_2d_decode_sse2;

	IF_PROFILER(context->priv->prof_rfx_dwt_2d_encode->name = "rfx_dwt_2d_encode_sse2");
	context->dwt_2d_encode = rfx_dwt_2d_encode_sse2;
}