/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  /* SSE2 */

#include "vp9/common/vp9_loopfilter.h"
#include "vpx_ports/emmintrin_compat.h"
static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
    int p,
    const unsigned char *_blimit,
    const unsigned char *_limit,
    const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
  const unsigned int extended_limit = _limit[0] * 0x01010101u;
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
  const __m128i thresh =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
  const __m128i limit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
  const __m128i blimit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
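  /* Note (added): each filter parameter is a single byte; multiplying by
   * 0x01010101 replicates it into four bytes and the shuffle then fills the
   * whole 128-bit register, so the byte-wise compares below see the same
   * threshold in every lane. */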
  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
      (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
      (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
      (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
      (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
      (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);
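  /* Note (added): each qNpN register holds the pN row in its low 64 bits and
   * the qN row in its high 64 bits, so one 128-bit op filters both sides of
   * the edge at once.  _mm_shuffle_epi32(x, 78) (0x4E) swaps the two 64-bit
   * halves, producing the p0q0/p1q1 orderings used below. */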
  __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
  abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
      _mm_subs_epu8(q0p0, q1p1));
  abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
  fe = _mm_set1_epi8(0xfe);
  ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
  abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
      _mm_subs_epu8(p0q0, q0p0));
  abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
      _mm_subs_epu8(p1q1, q1p1));
  flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu8(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

  abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
  mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
  mask = _mm_max_epu8(abs_p1p0, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  // mask |= (abs(q1 - q0) > limit) * -1;

  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
      _mm_subs_epu8(q1p1, q2p2)),
      _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
      _mm_subs_epu8(q2p2, q3p3)));
  mask = _mm_max_epu8(work, mask);
  mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
  mask = _mm_subs_epu8(mask, limit);
  mask = _mm_cmpeq_epi8(mask, zero);
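  /* Note (added): after the max/subtract/compare sequence, mask is 0xff in
   * every lane that passes the blimit/limit activity tests (the scalar
   * expressions in the comments above) and 0x00 elsewhere; hev flags lanes
   * whose p1/q1 step exceeds thresh and therefore get the sharper filter. */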
  const __m128i t4 = _mm_set1_epi8(4);
  const __m128i t3 = _mm_set1_epi8(3);
  const __m128i t80 = _mm_set1_epi8(0x80);
  const __m128i t1 = _mm_set1_epi16(0x1);
  __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
  __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
  __m128i qs0 = _mm_xor_si128(p0q0, t80);
  __m128i qs1 = _mm_xor_si128(p1q1, t80);
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;
  __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
  __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

  filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
  work_a = _mm_subs_epi8(qs0, qs0ps0);
  filt = _mm_adds_epi8(filt, work_a);
  filt = _mm_adds_epi8(filt, work_a);
  filt = _mm_adds_epi8(filt, work_a);
  /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
  filt = _mm_and_si128(filt, mask);

  filter1 = _mm_adds_epi8(filt, t4);
  filter2 = _mm_adds_epi8(filt, t3);

  filter1 = _mm_unpacklo_epi8(zero, filter1);
  filter1 = _mm_srai_epi16(filter1, 0xB);
  filter2 = _mm_unpacklo_epi8(zero, filter2);
  filter2 = _mm_srai_epi16(filter2, 0xB);

  filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
  qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

  filt = _mm_adds_epi16(filter1, t1);
  filt = _mm_srai_epi16(filt, 1);
  filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
      filt);
  filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
  qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
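  /* Note (added): this is the standard VP9 4-tap loop filter.  Pixels are
   * biased by 0x80 so unsigned bytes act as signed values; filter1/filter2
   * are the +4/+3 rounded adjustments.  Because each byte was unpacked into
   * the high half of a 16-bit lane, the usual >> 3 becomes >> 11 (0xB), and
   * the q side receives the opposite sign via the subs/packs pair. */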
  flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
      _mm_subs_epu8(q0p0, q2p2)),
      _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
      _mm_subs_epu8(q0p0, q3p3)));
  flat = _mm_max_epu8(abs_p1p0, flat);
  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
  flat = _mm_subs_epu8(flat, one);
  flat = _mm_cmpeq_epi8(flat, zero);
  flat = _mm_and_si128(flat, mask);
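  /* Note (added): flat is 0xff where p3..p1 and q1..q3 all lie within one
   * step of p0/q0, i.e. where the smoother 7-tap (flat) filter applies. */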
  q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
  q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
      (__m64 *)(s + 5 * p)));

  q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
  q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
      (__m64 *)(s + 6 * p)));

  flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
      _mm_subs_epu8(q0p0, q4p4)),
      _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
      _mm_subs_epu8(q0p0, q5p5)));

  q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
  q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
      (__m64 *)(s + 7 * p)));

  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
      _mm_subs_epu8(q0p0, q6p6)),
      _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
      _mm_subs_epu8(q0p0, q7p7)));

  flat2 = _mm_max_epu8(work, flat2);
  flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
  flat2 = _mm_subs_epu8(flat2, one);
  flat2 = _mm_cmpeq_epi8(flat2, zero);
  flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
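  /* Note (added): flat2 repeats the same test out to p7/q7; where it is set,
   * the widest 15-tap smoothing below replaces the whole 16-pixel span. */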
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // flat and wide flat calculations
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i four = _mm_set1_epi16(4);
  __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
  __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
  __m128i pixelFilter_p, pixelFilter_q;
  __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
  __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

  p7_16 = _mm_unpacklo_epi8(q7p7, zero);
  p6_16 = _mm_unpacklo_epi8(q6p6, zero);
  p5_16 = _mm_unpacklo_epi8(q5p5, zero);
  p4_16 = _mm_unpacklo_epi8(q4p4, zero);
  p3_16 = _mm_unpacklo_epi8(q3p3, zero);
  p2_16 = _mm_unpacklo_epi8(q2p2, zero);
  p1_16 = _mm_unpacklo_epi8(q1p1, zero);
  p0_16 = _mm_unpacklo_epi8(q0p0, zero);
  q0_16 = _mm_unpackhi_epi8(q0p0, zero);
  q1_16 = _mm_unpackhi_epi8(q1p1, zero);
  q2_16 = _mm_unpackhi_epi8(q2p2, zero);
  q3_16 = _mm_unpackhi_epi8(q3p3, zero);
  q4_16 = _mm_unpackhi_epi8(q4p4, zero);
  q5_16 = _mm_unpackhi_epi8(q5p5, zero);
  q6_16 = _mm_unpackhi_epi8(q6p6, zero);
  q7_16 = _mm_unpackhi_epi8(q7p7, zero);

  pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
      _mm_add_epi16(p4_16, p3_16));
  pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
      _mm_add_epi16(q4_16, q3_16));

  pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
  pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

  pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
  pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
  pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
      pixelFilter_q));
  pixetFilter_p2p1p0 = _mm_add_epi16(four,
      _mm_add_epi16(pixetFilter_p2p1p0,
      pixetFilter_q2q1q0));
  res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
      _mm_add_epi16(p7_16, p0_16)), 4);
  res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
      _mm_add_epi16(q7_16, q0_16)), 4);
  flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
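  /* Note (added): at this point pixelFilter_p = p6+..+p0 + q0+..+q6 + 8, so
   * adding p7 + p0 and shifting right by 4 yields the rounded 15-tap wide
   * filter output for p0 (and symmetrically for q0).  The later taps reuse
   * this running sum, subtracting the sample that leaves the window and
   * adding the one that enters it. */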
  res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
      _mm_add_epi16(p3_16, p0_16)), 3);
  res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
      _mm_add_epi16(q3_16, q0_16)), 3);

  flat_q0p0 = _mm_packus_epi16(res_p, res_q);

  sum_p7 = _mm_add_epi16(p7_16, p7_16);
  sum_q7 = _mm_add_epi16(q7_16, q7_16);
  sum_p3 = _mm_add_epi16(p3_16, p3_16);
  sum_q3 = _mm_add_epi16(q3_16, q3_16);

  pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
  res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
      _mm_add_epi16(sum_p7, p1_16)), 4);
  res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
      _mm_add_epi16(sum_q7, q1_16)), 4);
  flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
  res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
      _mm_add_epi16(sum_p3, p1_16)), 3);
  res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
      _mm_add_epi16(sum_q3, q1_16)), 3);
  flat_q1p1 = _mm_packus_epi16(res_p, res_q);

  sum_p7 = _mm_add_epi16(sum_p7, p7_16);
  sum_q7 = _mm_add_epi16(sum_q7, q7_16);
  sum_p3 = _mm_add_epi16(sum_p3, p3_16);
  sum_q3 = _mm_add_epi16(sum_q3, q3_16);

  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
  res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
      _mm_add_epi16(sum_p7, p2_16)), 4);
  res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
      _mm_add_epi16(sum_q7, q2_16)), 4);
  flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

  res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
      _mm_add_epi16(sum_p3, p2_16)), 3);
  res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
      _mm_add_epi16(sum_q3, q2_16)), 3);
  flat_q2p2 = _mm_packus_epi16(res_p, res_q);
  sum_p7 = _mm_add_epi16(sum_p7, p7_16);
  sum_q7 = _mm_add_epi16(sum_q7, q7_16);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
  res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
      _mm_add_epi16(sum_p7, p3_16)), 4);
  res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
      _mm_add_epi16(sum_q7, q3_16)), 4);
  flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

  sum_p7 = _mm_add_epi16(sum_p7, p7_16);
  sum_q7 = _mm_add_epi16(sum_q7, q7_16);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
  res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
      _mm_add_epi16(sum_p7, p4_16)), 4);
  res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
      _mm_add_epi16(sum_q7, q4_16)), 4);
  flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

  sum_p7 = _mm_add_epi16(sum_p7, p7_16);
  sum_q7 = _mm_add_epi16(sum_q7, q7_16);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
  res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
      _mm_add_epi16(sum_p7, p5_16)), 4);
  res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
      _mm_add_epi16(sum_q7, q5_16)), 4);
  flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

  sum_p7 = _mm_add_epi16(sum_p7, p7_16);
  sum_q7 = _mm_add_epi16(sum_q7, q7_16);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
  res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
      _mm_add_epi16(sum_p7, p6_16)), 4);
  res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
      _mm_add_epi16(sum_q7, q6_16)), 4);
  flat2_q6p6 = _mm_packus_epi16(res_p, res_q);

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  flat = _mm_shuffle_epi32(flat, 68);
  flat2 = _mm_shuffle_epi32(flat2, 68);
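  /* Note (added): _mm_shuffle_epi32(x, 68) (0x44) duplicates the low 64 bits
   * into both halves, so the flat/flat2 decisions are applied identically to
   * the packed p and q results blended below. */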
  q2p2 = _mm_andnot_si128(flat, q2p2);
  flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
  q2p2 = _mm_or_si128(q2p2, flat_q2p2);

  qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
  flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
  q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

  qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
  flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
  q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

  q6p6 = _mm_andnot_si128(flat2, q6p6);
  flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
  q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
  _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
  _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

  q5p5 = _mm_andnot_si128(flat2, q5p5);
  flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
  q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
  _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
  _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

  q4p4 = _mm_andnot_si128(flat2, q4p4);
  flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
  q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
  _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
  _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

  q3p3 = _mm_andnot_si128(flat2, q3p3);
  flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
  q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
  _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
  _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

  q2p2 = _mm_andnot_si128(flat2, q2p2);
  flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
  q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
  _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
  _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

  q1p1 = _mm_andnot_si128(flat2, q1p1);
  flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
  q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
  _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

  q0p0 = _mm_andnot_si128(flat2, q0p0);
  flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
  q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
  _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
  _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
}
static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
    int p,
    const unsigned char *_blimit,
    const unsigned char *_limit,
    const unsigned char *_thresh) {
  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);

  DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]);

  DECLARE_ALIGNED(16, unsigned char, ap[8][16]);
  DECLARE_ALIGNED(16, unsigned char, aq[8][16]);
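  /* Note (added): the 16-wide variant stages every candidate output row in
   * these aligned scratch arrays (ap/aq hold the unfiltered rows, flat_* and
   * flat2_* the 7-tap and 15-tap results) and blends them back into the frame
   * under the flat/flat2 masks at the end. */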
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;
  __m128i work;
  int i = 0;
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
  const unsigned int extended_limit = _limit[0] * 0x01010101u;
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
  const __m128i thresh =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
  const __m128i limit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
  const __m128i blimit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);

  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));

  _mm_store_si128((__m128i *)ap[4], p4);
  _mm_store_si128((__m128i *)ap[3], p3);
  _mm_store_si128((__m128i *)ap[2], p2);
  _mm_store_si128((__m128i *)ap[1], p1);
  _mm_store_si128((__m128i *)ap[0], p0);
  _mm_store_si128((__m128i *)aq[4], q4);
  _mm_store_si128((__m128i *)aq[3], q3);
  _mm_store_si128((__m128i *)aq[2], q2);
  _mm_store_si128((__m128i *)aq[1], q1);
  _mm_store_si128((__m128i *)aq[0], q0);
  const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
      _mm_subs_epu8(p0, p1));
  const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
      _mm_subs_epu8(q0, q1));
  const __m128i fe = _mm_set1_epi8(0xfe);
  const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
  __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
      _mm_subs_epu8(q0, p0));
  __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
      _mm_subs_epu8(q1, p1));

  flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu8(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

  abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
  mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
  mask = _mm_max_epu8(flat, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  // mask |= (abs(q1 - q0) > limit) * -1;
  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
      _mm_subs_epu8(p1, p2)),
      _mm_or_si128(_mm_subs_epu8(p3, p2),
      _mm_subs_epu8(p2, p3)));
  mask = _mm_max_epu8(work, mask);
  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
      _mm_subs_epu8(q1, q2)),
      _mm_or_si128(_mm_subs_epu8(q3, q2),
      _mm_subs_epu8(q2, q3)));
  mask = _mm_max_epu8(work, mask);
  mask = _mm_subs_epu8(mask, limit);
  mask = _mm_cmpeq_epi8(mask, zero);
  const __m128i t4 = _mm_set1_epi8(4);
  const __m128i t3 = _mm_set1_epi8(3);
  const __m128i t80 = _mm_set1_epi8(0x80);
  const __m128i te0 = _mm_set1_epi8(0xe0);
  const __m128i t1f = _mm_set1_epi8(0x1f);
  const __m128i t1 = _mm_set1_epi8(0x1);
  const __m128i t7f = _mm_set1_epi8(0x7f);

  __m128i ps1 = _mm_xor_si128(p1, t80);
  __m128i ps0 = _mm_xor_si128(p0, t80);
  __m128i qs0 = _mm_xor_si128(q0, t80);
  __m128i qs1 = _mm_xor_si128(q1, t80);
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;

  filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
  work_a = _mm_subs_epi8(qs0, ps0);
  filt = _mm_adds_epi8(filt, work_a);
  filt = _mm_adds_epi8(filt, work_a);
  filt = _mm_adds_epi8(filt, work_a);
  /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
  filt = _mm_and_si128(filt, mask);

  filter1 = _mm_adds_epi8(filt, t4);
  filter2 = _mm_adds_epi8(filt, t3);

  work_a = _mm_cmpgt_epi8(zero, filter1);
  filter1 = _mm_srli_epi16(filter1, 3);
  work_a = _mm_and_si128(work_a, te0);
  filter1 = _mm_and_si128(filter1, t1f);
  filter1 = _mm_or_si128(filter1, work_a);
  qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

  work_a = _mm_cmpgt_epi8(zero, filter2);
  filter2 = _mm_srli_epi16(filter2, 3);
  work_a = _mm_and_si128(work_a, te0);
  filter2 = _mm_and_si128(filter2, t1f);
  filter2 = _mm_or_si128(filter2, work_a);
  ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

  filt = _mm_adds_epi8(filter1, t1);
  work_a = _mm_cmpgt_epi8(zero, filt);
  filt = _mm_srli_epi16(filt, 1);
  work_a = _mm_and_si128(work_a, t80);
  filt = _mm_and_si128(filt, t7f);
  filt = _mm_or_si128(filt, work_a);
  filt = _mm_andnot_si128(hev, filt);
  ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
  qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
      _mm_subs_epu8(p0, p2)),
      _mm_or_si128(_mm_subs_epu8(q2, q0),
      _mm_subs_epu8(q0, q2)));
  flat = _mm_max_epu8(work, flat);
  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
      _mm_subs_epu8(p0, p3)),
      _mm_or_si128(_mm_subs_epu8(q3, q0),
      _mm_subs_epu8(q0, q3)));
  flat = _mm_max_epu8(work, flat);
  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
      _mm_subs_epu8(p0, p4)),
      _mm_or_si128(_mm_subs_epu8(q4, q0),
      _mm_subs_epu8(q0, q4)));
  flat = _mm_subs_epu8(flat, one);
  flat = _mm_cmpeq_epi8(flat, zero);
  flat = _mm_and_si128(flat, mask);
  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
  flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
      _mm_subs_epu8(p0, p5)),
      _mm_or_si128(_mm_subs_epu8(q5, q0),
      _mm_subs_epu8(q0, q5)));
  _mm_store_si128((__m128i *)ap[5], p5);
  _mm_store_si128((__m128i *)aq[5], q5);
  flat2 = _mm_max_epu8(work, flat2);
  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
      _mm_subs_epu8(p0, p6)),
      _mm_or_si128(_mm_subs_epu8(q6, q0),
      _mm_subs_epu8(q0, q6)));
  _mm_store_si128((__m128i *)ap[6], p6);
  _mm_store_si128((__m128i *)aq[6], q6);
  flat2 = _mm_max_epu8(work, flat2);

  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
      _mm_subs_epu8(p0, p7)),
      _mm_or_si128(_mm_subs_epu8(q7, q0),
      _mm_subs_epu8(q0, q7)));
  _mm_store_si128((__m128i *)ap[7], p7);
  _mm_store_si128((__m128i *)aq[7], q7);
  flat2 = _mm_max_epu8(work, flat2);
  flat2 = _mm_subs_epu8(flat2, one);
  flat2 = _mm_cmpeq_epi8(flat2, zero);
  flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // flat and wide flat calculations
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i four = _mm_set1_epi16(4);
  __m128i temp_flat2 = flat2;
  unsigned char *src = s;
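  /* Note (added): the 16 columns are handled in two 8-column halves
   * (off = i * 8 below), with bytes widened to 16 bits so the 7- and 15-tap
   * sums cannot overflow before the final pack back to bytes. */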
  unsigned int off = i * 8;
  p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero);
  p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero);
  p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero);
  p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero);
  p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero);
  p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero);
  p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero);
  p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero);
  q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero);
  q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero);
  q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero);
  q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero);
  q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero);
  q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero);
  q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero);
  q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero);

  c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
  c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
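  /* Note (added): a, b and c are running sums for the 7-tap (>> 3) and 15-tap
   * (>> 4) filters; 8 * p7 - p7 is simply p7 * 7 without a multiply.  At every
   * later step one sample is subtracted as it leaves the window and the next
   * one is added, so each output costs only a handful of adds. */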
  b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
  a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
  a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);

  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, b), 3);
  _mm_storel_epi64((__m128i *)&flat_op[2][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
  _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  a = _mm_add_epi16(q1, a);
  b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, b), 3);
  _mm_storel_epi64((__m128i *)&flat_op[1][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
  _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  a = _mm_add_epi16(q2, a);
  b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, b), 3);
  _mm_storel_epi64((__m128i *)&flat_op[0][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
  _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  a = _mm_add_epi16(q3, a);
  b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, b), 3);
  _mm_storel_epi64((__m128i *)&flat_oq[0][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
  _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  b = _mm_add_epi16(q3, b);
  b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, b), 3);
  _mm_storel_epi64((__m128i *)&flat_oq[1][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  c = _mm_add_epi16(q4, c);
  c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
  _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  b = _mm_add_epi16(q3, b);
  b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, b), 3);
  _mm_storel_epi64((__m128i *)&flat_oq[2][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));
  a = _mm_add_epi16(q5, a);
  c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
  _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  a = _mm_add_epi16(q6, a);
  c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
  _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  a = _mm_add_epi16(q7, a);
  c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
  _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  a = _mm_add_epi16(q7, a);
  c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
  _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  a = _mm_add_epi16(q7, a);
  c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
  _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  a = _mm_add_epi16(q7, a);
  c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
  _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  a = _mm_add_epi16(q7, a);
  c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
  _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  a = _mm_add_epi16(q7, a);
  c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
  _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  a = _mm_add_epi16(q7, a);
  c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
  _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
      _mm_packus_epi16(workp_shft, workp_shft));

  temp_flat2 = _mm_srli_si128(temp_flat2, 8);
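  /* Note (added): shifting temp_flat2 right by 8 bytes brings the wide-flat
   * mask lanes for the next 8-column half into the low position. */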
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  work_a = _mm_load_si128((__m128i *)ap[2]);
  p2 = _mm_load_si128((__m128i *)flat_op[2]);
  work_a = _mm_andnot_si128(flat, work_a);
  p2 = _mm_and_si128(flat, p2);
  p2 = _mm_or_si128(work_a, p2);
  _mm_store_si128((__m128i *)flat_op[2], p2);

  p1 = _mm_load_si128((__m128i *)flat_op[1]);
  work_a = _mm_andnot_si128(flat, ps1);
  p1 = _mm_and_si128(flat, p1);
  p1 = _mm_or_si128(work_a, p1);
  _mm_store_si128((__m128i *)flat_op[1], p1);

  p0 = _mm_load_si128((__m128i *)flat_op[0]);
  work_a = _mm_andnot_si128(flat, ps0);
  p0 = _mm_and_si128(flat, p0);
  p0 = _mm_or_si128(work_a, p0);
  _mm_store_si128((__m128i *)flat_op[0], p0);

  q0 = _mm_load_si128((__m128i *)flat_oq[0]);
  work_a = _mm_andnot_si128(flat, qs0);
  q0 = _mm_and_si128(flat, q0);
  q0 = _mm_or_si128(work_a, q0);
  _mm_store_si128((__m128i *)flat_oq[0], q0);

  q1 = _mm_load_si128((__m128i *)flat_oq[1]);
  work_a = _mm_andnot_si128(flat, qs1);
  q1 = _mm_and_si128(flat, q1);
  q1 = _mm_or_si128(work_a, q1);
  _mm_store_si128((__m128i *)flat_oq[1], q1);

  work_a = _mm_load_si128((__m128i *)aq[2]);
  q2 = _mm_load_si128((__m128i *)flat_oq[2]);
  work_a = _mm_andnot_si128(flat, work_a);
  q2 = _mm_and_si128(flat, q2);
  q2 = _mm_or_si128(work_a, q2);
  _mm_store_si128((__m128i *)flat_oq[2], q2);
  // write out op6 - op3
  {
    unsigned char *dst = (s - 7 * p);
    for (i = 6; i > 2; i--) {
      __m128i flat2_output;
      work_a = _mm_load_si128((__m128i *)ap[i]);
      flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
      work_a = _mm_andnot_si128(flat2, work_a);
      flat2_output = _mm_and_si128(flat2, flat2_output);
      work_a = _mm_or_si128(work_a, flat2_output);
      _mm_storeu_si128((__m128i *)dst, work_a);
      dst += p;
    }
  }

  work_a = _mm_load_si128((__m128i *)flat_op[2]);
  p2 = _mm_load_si128((__m128i *)flat2_op[2]);
  work_a = _mm_andnot_si128(flat2, work_a);
  p2 = _mm_and_si128(flat2, p2);
  p2 = _mm_or_si128(work_a, p2);
  _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
  work_a = _mm_load_si128((__m128i *)flat_op[1]);
  p1 = _mm_load_si128((__m128i *)flat2_op[1]);
  work_a = _mm_andnot_si128(flat2, work_a);
  p1 = _mm_and_si128(flat2, p1);
  p1 = _mm_or_si128(work_a, p1);
  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

  work_a = _mm_load_si128((__m128i *)flat_op[0]);
  p0 = _mm_load_si128((__m128i *)flat2_op[0]);
  work_a = _mm_andnot_si128(flat2, work_a);
  p0 = _mm_and_si128(flat2, p0);
  p0 = _mm_or_si128(work_a, p0);
  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

  work_a = _mm_load_si128((__m128i *)flat_oq[0]);
  q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
  work_a = _mm_andnot_si128(flat2, work_a);
  q0 = _mm_and_si128(flat2, q0);
  q0 = _mm_or_si128(work_a, q0);
  _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

  work_a = _mm_load_si128((__m128i *)flat_oq[1]);
  q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
  work_a = _mm_andnot_si128(flat2, work_a);
  q1 = _mm_and_si128(flat2, q1);
  q1 = _mm_or_si128(work_a, q1);
  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

  work_a = _mm_load_si128((__m128i *)flat_oq[2]);
  q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
  work_a = _mm_andnot_si128(flat2, work_a);
  q2 = _mm_and_si128(flat2, q2);
  q2 = _mm_or_si128(work_a, q2);
  _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
  // write out oq3 - oq7
  {
    unsigned char *dst = (s + 3 * p);
    for (i = 3; i < 7; i++) {
      __m128i flat2_output;
      work_a = _mm_load_si128((__m128i *)aq[i]);
      flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
      work_a = _mm_andnot_si128(flat2, work_a);
      flat2_output = _mm_and_si128(flat2, flat2_output);
      work_a = _mm_or_si128(work_a, flat2_output);
      _mm_storeu_si128((__m128i *)dst, work_a);
      dst += p;
    }
  }
}
void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
    int p,
    const unsigned char *_blimit,
    const unsigned char *_limit,
    const unsigned char *_thresh,
    int count) {
  if (count == 1)
    mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
  else
    mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
}
void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
    int p,
    const unsigned char *_blimit,
    const unsigned char *_limit,
    const unsigned char *_thresh,
    int count) {
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  __m128i mask, hev, flat;
  const __m128i zero = _mm_set1_epi16(0);
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
  const unsigned int extended_limit = _limit[0] * 0x01010101u;
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
  const __m128i thresh =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
  const __m128i limit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
  const __m128i blimit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);

  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
  const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
      _mm_subs_epu8(p0, p1));
  const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
      _mm_subs_epu8(q0, q1));
  const __m128i one = _mm_set1_epi8(1);
  const __m128i fe = _mm_set1_epi8(0xfe);
  const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
  __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
      _mm_subs_epu8(q0, p0));
  __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
      _mm_subs_epu8(q1, p1));
  __m128i work;

  flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu8(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

  abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
  mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
  mask = _mm_max_epu8(flat, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  // mask |= (abs(q1 - q0) > limit) * -1;
  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
      _mm_subs_epu8(p1, p2)),
      _mm_or_si128(_mm_subs_epu8(p3, p2),
      _mm_subs_epu8(p2, p3)));
  mask = _mm_max_epu8(work, mask);
  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
      _mm_subs_epu8(q1, q2)),
      _mm_or_si128(_mm_subs_epu8(q3, q2),
      _mm_subs_epu8(q2, q3)));
  mask = _mm_max_epu8(work, mask);
  mask = _mm_subs_epu8(mask, limit);
  mask = _mm_cmpeq_epi8(mask, zero);
  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
      _mm_subs_epu8(p0, p2)),
      _mm_or_si128(_mm_subs_epu8(q2, q0),
      _mm_subs_epu8(q0, q2)));
  flat = _mm_max_epu8(work, flat);
  work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
      _mm_subs_epu8(p0, p3)),
      _mm_or_si128(_mm_subs_epu8(q3, q0),
      _mm_subs_epu8(q0, q3)));
  flat = _mm_max_epu8(work, flat);
  flat = _mm_subs_epu8(flat, one);
  flat = _mm_cmpeq_epi8(flat, zero);
  flat = _mm_and_si128(flat, mask);
  const __m128i four = _mm_set1_epi16(4);
  unsigned char *src = s;

  __m128i workp_a, workp_b, workp_shft;
  p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
  p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
  p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
  p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
  q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
  q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
  q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
  q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_storel_epi64((__m128i *)&flat_op2[0],
      _mm_packus_epi16(workp_shft, workp_shft));

  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_storel_epi64((__m128i *)&flat_op1[0],
      _mm_packus_epi16(workp_shft, workp_shft));

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_storel_epi64((__m128i *)&flat_op0[0],
      _mm_packus_epi16(workp_shft, workp_shft));

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_storel_epi64((__m128i *)&flat_oq0[0],
      _mm_packus_epi16(workp_shft, workp_shft));

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_storel_epi64((__m128i *)&flat_oq1[0],
      _mm_packus_epi16(workp_shft, workp_shft));

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_storel_epi64((__m128i *)&flat_oq2[0],
      _mm_packus_epi16(workp_shft, workp_shft));
  const __m128i t4 = _mm_set1_epi8(4);
  const __m128i t3 = _mm_set1_epi8(3);
  const __m128i t80 = _mm_set1_epi8(0x80);
  const __m128i te0 = _mm_set1_epi8(0xe0);
  const __m128i t1f = _mm_set1_epi8(0x1f);
  const __m128i t1 = _mm_set1_epi8(0x1);
  const __m128i t7f = _mm_set1_epi8(0x7f);

  const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
      t80);
  const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
      t80);
  const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
      t80);
  const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
      t80);
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;

  filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
  work_a = _mm_subs_epi8(qs0, ps0);
  filt = _mm_adds_epi8(filt, work_a);
  filt = _mm_adds_epi8(filt, work_a);
  filt = _mm_adds_epi8(filt, work_a);
  /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
  filt = _mm_and_si128(filt, mask);

  filter1 = _mm_adds_epi8(filt, t4);
  filter2 = _mm_adds_epi8(filt, t3);

  work_a = _mm_cmpgt_epi8(zero, filter1);
  filter1 = _mm_srli_epi16(filter1, 3);
  work_a = _mm_and_si128(work_a, te0);
  filter1 = _mm_and_si128(filter1, t1f);
  filter1 = _mm_or_si128(filter1, work_a);

  work_a = _mm_cmpgt_epi8(zero, filter2);
  filter2 = _mm_srli_epi16(filter2, 3);
  work_a = _mm_and_si128(work_a, te0);
  filter2 = _mm_and_si128(filter2, t1f);
  filter2 = _mm_or_si128(filter2, work_a);

  filt = _mm_adds_epi8(filter1, t1);
  work_a = _mm_cmpgt_epi8(zero, filt);
  filt = _mm_srli_epi16(filt, 1);
  work_a = _mm_and_si128(work_a, t80);
  filt = _mm_and_si128(filt, t7f);
  filt = _mm_or_si128(filt, work_a);

  filt = _mm_andnot_si128(hev, filt);
  work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
  q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
  work_a = _mm_andnot_si128(flat, work_a);
  q0 = _mm_and_si128(flat, q0);
  q0 = _mm_or_si128(work_a, q0);

  work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
  q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
  work_a = _mm_andnot_si128(flat, work_a);
  q1 = _mm_and_si128(flat, q1);
  q1 = _mm_or_si128(work_a, q1);

  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
  work_a = _mm_andnot_si128(flat, work_a);
  q2 = _mm_and_si128(flat, q2);
  q2 = _mm_or_si128(work_a, q2);

  work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
  p0 = _mm_loadl_epi64((__m128i *)flat_op0);
  work_a = _mm_andnot_si128(flat, work_a);
  p0 = _mm_and_si128(flat, p0);
  p0 = _mm_or_si128(work_a, p0);

  work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
  p1 = _mm_loadl_epi64((__m128i *)flat_op1);
  work_a = _mm_andnot_si128(flat, work_a);
  p1 = _mm_and_si128(flat, p1);
  p1 = _mm_or_si128(work_a, p1);

  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p2 = _mm_loadl_epi64((__m128i *)flat_op2);
  work_a = _mm_andnot_si128(flat, work_a);
  p2 = _mm_and_si128(flat, p2);
  p2 = _mm_or_si128(work_a, p2);

  _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
  _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
  _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
  _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
  _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
}
static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
    int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;
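  /* Note (added): reads 8 rows of 8 bytes from in0 and 8 rows from in1 (16
   * rows in all) and writes an 8x16 block whose rows are the input columns,
   * using the usual unpack 8/16/32/64-bit interleave ladder. */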
  /* Read in 16 lines */
  x0 = _mm_loadl_epi64((__m128i *)in0);
  x8 = _mm_loadl_epi64((__m128i *)in1);
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));
  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));
  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));
  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));
  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));

  x0 = _mm_unpacklo_epi8(x0, x1);
  x1 = _mm_unpacklo_epi8(x2, x3);
  x2 = _mm_unpacklo_epi8(x4, x5);
  x3 = _mm_unpacklo_epi8(x6, x7);

  x8 = _mm_unpacklo_epi8(x8, x9);
  x9 = _mm_unpacklo_epi8(x10, x11);
  x10 = _mm_unpacklo_epi8(x12, x13);
  x11 = _mm_unpacklo_epi8(x14, x15);

  x4 = _mm_unpacklo_epi16(x0, x1);
  x5 = _mm_unpacklo_epi16(x2, x3);
  x12 = _mm_unpacklo_epi16(x8, x9);
  x13 = _mm_unpacklo_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  /* Store first 4-line result */
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  /* Store second 4-line result */
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}
static INLINE void transpose(unsigned char *src[], int in_p,
    unsigned char *dst[], int out_p,
    int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];
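    /* Note (added): each 8x8 tile is transposed independently with the same
     * unpack ladder as above; the two 64-bit halves of each result register
     * are written with storel_pd/storeh_pd so consecutive output rows land on
     * separate pitches. */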
    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);
    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 0*out_p),
        _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1*out_p),
        _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    _mm_storel_pd((double *)(out + 2*out_p),
        _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3*out_p),
        _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73

    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 4*out_p),
        _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5*out_p),
        _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    _mm_storel_pd((double *)(out + 6*out_p),
        _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7*out_p),
        _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}
void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
    int p,
    const unsigned char *blimit,
    const unsigned char *limit,
    const unsigned char *thresh,
    int count) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[2];
  unsigned char *dst[2];

  /* Transpose 16x16 */
  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);
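  /* Note (added): vertical edges are handled by transposing the 16x16
   * neighbourhood into t_dst, running the horizontal filter on the transposed
   * rows, then transposing the modified columns back into the frame. */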
  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
      thresh, 1);
src[0] = t_dst + 3 * 16;
1244
src[1] = t_dst + 3 * 16 + 8;
1247
dst[1] = s - 5 + p * 8;
1249
/* Transpose 16x8 */
1250
transpose(src, 16, dst, p, 2);
void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
    int p,
    const unsigned char *blimit,
    const unsigned char *limit,
    const unsigned char *thresh) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[4];
  unsigned char *dst[4];

  dst[1] = t_dst + 8 * 16;

  /* Transpose 16x16 */
  transpose(src, p, dst, 16, 2);

  /* Loop filtering */
  vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
      thresh, 2);

  src[1] = t_dst + 8 * 16;

  transpose(src, 16, dst, p, 2);
}