~ubuntu-branches/ubuntu/vivid/gstreamer-vaapi/vivid

« back to all changes in this revision

Viewing changes to ext/libvpx/upstream/vp9/common/x86/vp9_loopfilter_intrin_sse2.c

  • Committer: Package Import Robot
  • Author(s): Vincent Cheng
  • Date: 2014-08-06 23:56:00 UTC
  • mfrom: (0.1.4 sid) (1.1.3)
  • Revision ID: package-import@ubuntu.com-20140806235600-fg1kcmiu67k315q5
Tags: 0.5.9-2
* Remove spurious build-deps: libva-drm1, libavcodec-dev. (Closes: #757283)
* Drop Build-Depends-Indep and build docs unconditionally on all archs.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 
3
 *
 
4
 *  Use of this source code is governed by a BSD-style license
 
5
 *  that can be found in the LICENSE file in the root of the source
 
6
 *  tree. An additional intellectual property rights grant can be found
 
7
 *  in the file PATENTS.  All contributing project authors may
 
8
 *  be found in the AUTHORS file in the root of the source tree.
 
9
 */
 
10
 
 
11
#include <emmintrin.h>  /* SSE2 */
 
12
#include "vp9/common/vp9_loopfilter.h"
 
13
#include "vpx_ports/emmintrin_compat.h"
 
14
 
 
15
static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
 
16
                                            int p,
 
17
                                            const unsigned char *_blimit,
 
18
                                            const unsigned char *_limit,
 
19
                                            const unsigned char *_thresh) {
 
20
  __m128i mask, hev, flat, flat2;
 
21
  const __m128i zero = _mm_set1_epi16(0);
 
22
  const __m128i one = _mm_set1_epi8(1);
 
23
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
 
24
  __m128i abs_p1p0;
 
25
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
 
26
  const unsigned int extended_limit  = _limit[0]  * 0x01010101u;
 
27
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
 
28
  const __m128i thresh =
 
29
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
 
30
  const __m128i limit =
 
31
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
 
32
  const __m128i blimit =
 
33
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
 
34
 
 
35
  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
 
36
  q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
 
37
                                       (__m64 *)(s + 4 * p)));
 
38
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
 
39
  q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
 
40
                                       (__m64 *)(s + 3 * p)));
 
41
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
 
42
  q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
 
43
                                       (__m64 *)(s + 2 * p)));
 
44
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
 
45
  q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
 
46
                                       (__m64 *)(s + 1 * p)));
 
47
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
 
48
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
 
49
  q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
 
50
                                       (__m64 *)(s - 0 * p)));
 
51
  p0q0 = _mm_shuffle_epi32(q0p0, 78);
 
52
 
 
53
  {
 
54
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
 
55
    abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
 
56
                            _mm_subs_epu8(q0p0, q1p1));
 
57
    abs_q1q0 =  _mm_srli_si128(abs_p1p0, 8);
 
58
    fe = _mm_set1_epi8(0xfe);
 
59
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
 
60
    abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
 
61
                            _mm_subs_epu8(p0q0, q0p0));
 
62
    abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
 
63
                            _mm_subs_epu8(p1q1, q1p1));
 
64
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
 
65
    hev = _mm_subs_epu8(flat, thresh);
 
66
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 
67
 
 
68
    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
 
69
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
 
70
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
 
71
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
 
72
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
 
73
    mask = _mm_max_epu8(abs_p1p0, mask);
 
74
    // mask |= (abs(p1 - p0) > limit) * -1;
 
75
    // mask |= (abs(q1 - q0) > limit) * -1;
 
76
 
 
77
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
 
78
                                     _mm_subs_epu8(q1p1, q2p2)),
 
79
                        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
 
80
                                     _mm_subs_epu8(q2p2, q3p3)));
 
81
    mask = _mm_max_epu8(work, mask);
 
82
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
 
83
    mask = _mm_subs_epu8(mask, limit);
 
84
    mask = _mm_cmpeq_epi8(mask, zero);
 
85
  }
 
86
 
 
87
  // lp filter
 
88
  {
 
89
    const __m128i t4 = _mm_set1_epi8(4);
 
90
    const __m128i t3 = _mm_set1_epi8(3);
 
91
    const __m128i t80 = _mm_set1_epi8(0x80);
 
92
    const __m128i t1 = _mm_set1_epi16(0x1);
 
93
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
 
94
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
 
95
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
 
96
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
 
97
    __m128i filt;
 
98
    __m128i work_a;
 
99
    __m128i filter1, filter2;
 
100
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
 
101
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
 
102
 
 
103
    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
 
104
    work_a = _mm_subs_epi8(qs0, qs0ps0);
 
105
    filt = _mm_adds_epi8(filt, work_a);
 
106
    filt = _mm_adds_epi8(filt, work_a);
 
107
    filt = _mm_adds_epi8(filt, work_a);
 
108
    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
 
109
    filt = _mm_and_si128(filt, mask);
 
110
 
 
111
    filter1 = _mm_adds_epi8(filt, t4);
 
112
    filter2 = _mm_adds_epi8(filt, t3);
 
113
 
 
114
    filter1 = _mm_unpacklo_epi8(zero, filter1);
 
115
    filter1 = _mm_srai_epi16(filter1, 0xB);
 
116
    filter2 = _mm_unpacklo_epi8(zero, filter2);
 
117
    filter2 = _mm_srai_epi16(filter2, 0xB);
 
118
 
 
119
    /* Filter1 >> 3 */
 
120
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
 
121
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
 
122
 
 
123
    /* filt >> 1 */
 
124
    filt = _mm_adds_epi16(filter1, t1);
 
125
    filt = _mm_srai_epi16(filt, 1);
 
126
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
 
127
                            filt);
 
128
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
 
129
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
 
130
    // loopfilter done
 
131
 
 
132
    {
 
133
      __m128i work;
 
134
      flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
 
135
                                       _mm_subs_epu8(q0p0, q2p2)),
 
136
                          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
 
137
                                       _mm_subs_epu8(q0p0, q3p3)));
 
138
      flat = _mm_max_epu8(abs_p1p0, flat);
 
139
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
 
140
      flat = _mm_subs_epu8(flat, one);
 
141
      flat = _mm_cmpeq_epi8(flat, zero);
 
142
      flat = _mm_and_si128(flat, mask);
 
143
 
 
144
      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
 
145
      q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
 
146
                                           (__m64 *)(s + 5 * p)));
 
147
 
 
148
      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
 
149
      q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
 
150
                                           (__m64 *)(s + 6 * p)));
 
151
 
 
152
      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
 
153
                                        _mm_subs_epu8(q0p0, q4p4)),
 
154
                           _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
 
155
                                        _mm_subs_epu8(q0p0, q5p5)));
 
156
 
 
157
      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
 
158
      q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
 
159
                                           (__m64 *)(s + 7 * p)));
 
160
 
 
161
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
 
162
                                       _mm_subs_epu8(q0p0, q6p6)),
 
163
                          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
 
164
                                       _mm_subs_epu8(q0p0, q7p7)));
 
165
 
 
166
      flat2 = _mm_max_epu8(work, flat2);
 
167
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
 
168
      flat2 = _mm_subs_epu8(flat2, one);
 
169
      flat2 = _mm_cmpeq_epi8(flat2, zero);
 
170
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
 
171
    }
 
172
 
 
173
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
174
    // flat and wide flat calculations
 
175
    {
 
176
      const __m128i eight = _mm_set1_epi16(8);
 
177
      const __m128i four = _mm_set1_epi16(4);
 
178
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
 
179
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
 
180
      __m128i pixelFilter_p, pixelFilter_q;
 
181
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
 
182
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
 
183
 
 
184
      p7_16 = _mm_unpacklo_epi8(q7p7, zero);;
 
185
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
 
186
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
 
187
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
 
188
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
 
189
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
 
190
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
 
191
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
 
192
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
 
193
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
 
194
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
 
195
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
 
196
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
 
197
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
 
198
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
 
199
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);
 
200
 
 
201
      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
 
202
                                    _mm_add_epi16(p4_16, p3_16));
 
203
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
 
204
                                    _mm_add_epi16(q4_16, q3_16));
 
205
 
 
206
      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
 
207
      pixelFilter_p =  _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
 
208
 
 
209
      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
 
210
      pixelFilter_q =  _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
 
211
      pixelFilter_p =  _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
 
212
                                                         pixelFilter_q));
 
213
      pixetFilter_p2p1p0 =   _mm_add_epi16(four,
 
214
                                           _mm_add_epi16(pixetFilter_p2p1p0,
 
215
                                                         pixetFilter_q2q1q0));
 
216
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
 
217
                                           _mm_add_epi16(p7_16, p0_16)), 4);
 
218
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
 
219
                                           _mm_add_epi16(q7_16, q0_16)), 4);
 
220
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
 
221
      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
 
222
                                           _mm_add_epi16(p3_16, p0_16)), 3);
 
223
      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
 
224
                                           _mm_add_epi16(q3_16, q0_16)), 3);
 
225
 
 
226
      flat_q0p0 = _mm_packus_epi16(res_p, res_q);
 
227
 
 
228
      sum_p7 = _mm_add_epi16(p7_16, p7_16);
 
229
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
 
230
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
 
231
      sum_q3 = _mm_add_epi16(q3_16, q3_16);
 
232
 
 
233
      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
 
234
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
 
235
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
 
236
                             _mm_add_epi16(sum_p7, p1_16)), 4);
 
237
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
 
238
                             _mm_add_epi16(sum_q7, q1_16)), 4);
 
239
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
 
240
 
 
241
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
 
242
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
 
243
      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
 
244
                             _mm_add_epi16(sum_p3, p1_16)), 3);
 
245
      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
 
246
                             _mm_add_epi16(sum_q3, q1_16)), 3);
 
247
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);
 
248
 
 
249
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
 
250
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
 
251
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
 
252
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);
 
253
 
 
254
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
 
255
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
 
256
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
 
257
                             _mm_add_epi16(sum_p7, p2_16)), 4);
 
258
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
 
259
                             _mm_add_epi16(sum_q7, q2_16)), 4);
 
260
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
 
261
 
 
262
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
 
263
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
 
264
 
 
265
      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
 
266
                                           _mm_add_epi16(sum_p3, p2_16)), 3);
 
267
      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
 
268
                                           _mm_add_epi16(sum_q3, q2_16)), 3);
 
269
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);
 
270
 
 
271
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
 
272
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
 
273
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
 
274
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
 
275
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
 
276
                             _mm_add_epi16(sum_p7, p3_16)), 4);
 
277
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
 
278
                             _mm_add_epi16(sum_q7, q3_16)), 4);
 
279
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
 
280
 
 
281
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
 
282
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
 
283
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
 
284
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
 
285
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
 
286
                             _mm_add_epi16(sum_p7, p4_16)), 4);
 
287
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
 
288
                             _mm_add_epi16(sum_q7, q4_16)), 4);
 
289
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
 
290
 
 
291
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
 
292
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
 
293
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
 
294
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
 
295
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
 
296
                             _mm_add_epi16(sum_p7, p5_16)), 4);
 
297
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
 
298
                             _mm_add_epi16(sum_q7, q5_16)), 4);
 
299
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
 
300
 
 
301
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
 
302
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
 
303
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
 
304
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
 
305
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
 
306
                             _mm_add_epi16(sum_p7, p6_16)), 4);
 
307
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
 
308
                             _mm_add_epi16(sum_q7, q6_16)), 4);
 
309
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
 
310
    }
 
311
    // wide flat
 
312
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
313
 
 
314
    flat = _mm_shuffle_epi32(flat, 68);
 
315
    flat2 = _mm_shuffle_epi32(flat2, 68);
 
316
 
 
317
    q2p2 = _mm_andnot_si128(flat, q2p2);
 
318
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
 
319
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);
 
320
 
 
321
    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
 
322
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
 
323
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
 
324
 
 
325
    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
 
326
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
 
327
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
 
328
 
 
329
    q6p6 = _mm_andnot_si128(flat2, q6p6);
 
330
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
 
331
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
 
332
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
 
333
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
 
334
 
 
335
    q5p5 = _mm_andnot_si128(flat2, q5p5);
 
336
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
 
337
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
 
338
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
 
339
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
 
340
 
 
341
    q4p4 = _mm_andnot_si128(flat2, q4p4);
 
342
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
 
343
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
 
344
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
 
345
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
 
346
 
 
347
    q3p3 = _mm_andnot_si128(flat2, q3p3);
 
348
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
 
349
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
 
350
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
 
351
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
 
352
 
 
353
    q2p2 = _mm_andnot_si128(flat2, q2p2);
 
354
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
 
355
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
 
356
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
 
357
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
 
358
 
 
359
    q1p1 = _mm_andnot_si128(flat2, q1p1);
 
360
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
 
361
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
 
362
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
 
363
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
 
364
 
 
365
    q0p0 = _mm_andnot_si128(flat2, q0p0);
 
366
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
 
367
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
 
368
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
 
369
    _mm_storeh_pi((__m64 *)(s - 0 * p),  _mm_castsi128_ps(q0p0));
 
370
  }
 
371
}
 
372
 
 
373
static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
 
374
                                             int p,
 
375
                                             const unsigned char *_blimit,
 
376
                                             const unsigned char *_limit,
 
377
                                             const unsigned char *_thresh) {
 
378
  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
 
379
  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);
 
380
 
 
381
  DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]);
 
382
  DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]);
 
383
 
 
384
  DECLARE_ALIGNED(16, unsigned char, ap[8][16]);
 
385
  DECLARE_ALIGNED(16, unsigned char, aq[8][16]);
 
386
 
 
387
 
 
388
  __m128i mask, hev, flat, flat2;
 
389
  const __m128i zero = _mm_set1_epi16(0);
 
390
  const __m128i one = _mm_set1_epi8(1);
 
391
  __m128i p7, p6, p5;
 
392
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
 
393
  __m128i q5, q6, q7;
 
394
  int i = 0;
 
395
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
 
396
  const unsigned int extended_limit  = _limit[0]  * 0x01010101u;
 
397
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
 
398
  const __m128i thresh =
 
399
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
 
400
  const __m128i limit =
 
401
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
 
402
  const __m128i blimit =
 
403
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
 
404
 
 
405
  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
 
406
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
 
407
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
 
408
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
 
409
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
 
410
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
 
411
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
 
412
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
 
413
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
 
414
  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
 
415
 
 
416
  _mm_store_si128((__m128i *)ap[4], p4);
 
417
  _mm_store_si128((__m128i *)ap[3], p3);
 
418
  _mm_store_si128((__m128i *)ap[2], p2);
 
419
  _mm_store_si128((__m128i *)ap[1], p1);
 
420
  _mm_store_si128((__m128i *)ap[0], p0);
 
421
  _mm_store_si128((__m128i *)aq[4], q4);
 
422
  _mm_store_si128((__m128i *)aq[3], q3);
 
423
  _mm_store_si128((__m128i *)aq[2], q2);
 
424
  _mm_store_si128((__m128i *)aq[1], q1);
 
425
  _mm_store_si128((__m128i *)aq[0], q0);
 
426
 
 
427
 
 
428
  {
 
429
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
 
430
                                          _mm_subs_epu8(p0, p1));
 
431
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
 
432
                                          _mm_subs_epu8(q0, q1));
 
433
    const __m128i fe = _mm_set1_epi8(0xfe);
 
434
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
 
435
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
 
436
                                    _mm_subs_epu8(q0, p0));
 
437
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
 
438
                                    _mm_subs_epu8(q1, p1));
 
439
    __m128i work;
 
440
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
 
441
    hev = _mm_subs_epu8(flat, thresh);
 
442
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 
443
 
 
444
    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
 
445
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
 
446
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
 
447
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
 
448
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
 
449
    mask = _mm_max_epu8(flat, mask);
 
450
    // mask |= (abs(p1 - p0) > limit) * -1;
 
451
    // mask |= (abs(q1 - q0) > limit) * -1;
 
452
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
 
453
                                     _mm_subs_epu8(p1, p2)),
 
454
                         _mm_or_si128(_mm_subs_epu8(p3, p2),
 
455
                                      _mm_subs_epu8(p2, p3)));
 
456
    mask = _mm_max_epu8(work, mask);
 
457
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
 
458
                                     _mm_subs_epu8(q1, q2)),
 
459
                         _mm_or_si128(_mm_subs_epu8(q3, q2),
 
460
                                      _mm_subs_epu8(q2, q3)));
 
461
    mask = _mm_max_epu8(work, mask);
 
462
    mask = _mm_subs_epu8(mask, limit);
 
463
    mask = _mm_cmpeq_epi8(mask, zero);
 
464
  }
 
465
 
 
466
  // lp filter
 
467
  {
 
468
    const __m128i t4 = _mm_set1_epi8(4);
 
469
    const __m128i t3 = _mm_set1_epi8(3);
 
470
    const __m128i t80 = _mm_set1_epi8(0x80);
 
471
    const __m128i te0 = _mm_set1_epi8(0xe0);
 
472
    const __m128i t1f = _mm_set1_epi8(0x1f);
 
473
    const __m128i t1 = _mm_set1_epi8(0x1);
 
474
    const __m128i t7f = _mm_set1_epi8(0x7f);
 
475
 
 
476
    __m128i ps1 = _mm_xor_si128(p1, t80);
 
477
    __m128i ps0 = _mm_xor_si128(p0, t80);
 
478
    __m128i qs0 = _mm_xor_si128(q0, t80);
 
479
    __m128i qs1 = _mm_xor_si128(q1, t80);
 
480
    __m128i filt;
 
481
    __m128i work_a;
 
482
    __m128i filter1, filter2;
 
483
 
 
484
    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
 
485
    work_a = _mm_subs_epi8(qs0, ps0);
 
486
    filt = _mm_adds_epi8(filt, work_a);
 
487
    filt = _mm_adds_epi8(filt, work_a);
 
488
    filt = _mm_adds_epi8(filt, work_a);
 
489
    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
 
490
    filt = _mm_and_si128(filt, mask);
 
491
 
 
492
    filter1 = _mm_adds_epi8(filt, t4);
 
493
    filter2 = _mm_adds_epi8(filt, t3);
 
494
 
 
495
    /* Filter1 >> 3 */
 
496
    work_a = _mm_cmpgt_epi8(zero, filter1);
 
497
    filter1 = _mm_srli_epi16(filter1, 3);
 
498
    work_a = _mm_and_si128(work_a, te0);
 
499
    filter1 = _mm_and_si128(filter1, t1f);
 
500
    filter1 = _mm_or_si128(filter1, work_a);
 
501
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
 
502
 
 
503
    /* Filter2 >> 3 */
 
504
    work_a = _mm_cmpgt_epi8(zero, filter2);
 
505
    filter2 = _mm_srli_epi16(filter2, 3);
 
506
    work_a = _mm_and_si128(work_a, te0);
 
507
    filter2 = _mm_and_si128(filter2, t1f);
 
508
    filter2 = _mm_or_si128(filter2, work_a);
 
509
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
 
510
 
 
511
    /* filt >> 1 */
 
512
    filt = _mm_adds_epi8(filter1, t1);
 
513
    work_a = _mm_cmpgt_epi8(zero, filt);
 
514
    filt = _mm_srli_epi16(filt, 1);
 
515
    work_a = _mm_and_si128(work_a, t80);
 
516
    filt = _mm_and_si128(filt, t7f);
 
517
    filt = _mm_or_si128(filt, work_a);
 
518
    filt = _mm_andnot_si128(hev, filt);
 
519
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
 
520
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
 
521
    // loopfilter done
 
522
 
 
523
    {
 
524
      __m128i work;
 
525
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
 
526
                                       _mm_subs_epu8(p0, p2)),
 
527
                           _mm_or_si128(_mm_subs_epu8(q2, q0),
 
528
                                        _mm_subs_epu8(q0, q2)));
 
529
      flat = _mm_max_epu8(work, flat);
 
530
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
 
531
                                       _mm_subs_epu8(p0, p3)),
 
532
                           _mm_or_si128(_mm_subs_epu8(q3, q0),
 
533
                                        _mm_subs_epu8(q0, q3)));
 
534
      flat = _mm_max_epu8(work, flat);
 
535
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
 
536
                                       _mm_subs_epu8(p0, p4)),
 
537
                           _mm_or_si128(_mm_subs_epu8(q4, q0),
 
538
                                        _mm_subs_epu8(q0, q4)));
 
539
      flat = _mm_subs_epu8(flat, one);
 
540
      flat = _mm_cmpeq_epi8(flat, zero);
 
541
      flat = _mm_and_si128(flat, mask);
 
542
 
 
543
      p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
 
544
      q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
 
545
      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
 
546
                                       _mm_subs_epu8(p0, p5)),
 
547
                           _mm_or_si128(_mm_subs_epu8(q5, q0),
 
548
                                        _mm_subs_epu8(q0, q5)));
 
549
      _mm_store_si128((__m128i *)ap[5], p5);
 
550
      _mm_store_si128((__m128i *)aq[5], q5);
 
551
      flat2 = _mm_max_epu8(work, flat2);
 
552
      p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
 
553
      q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
 
554
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
 
555
                                       _mm_subs_epu8(p0, p6)),
 
556
                           _mm_or_si128(_mm_subs_epu8(q6, q0),
 
557
                                        _mm_subs_epu8(q0, q6)));
 
558
      _mm_store_si128((__m128i *)ap[6], p6);
 
559
      _mm_store_si128((__m128i *)aq[6], q6);
 
560
      flat2 = _mm_max_epu8(work, flat2);
 
561
 
 
562
      p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
 
563
      q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
 
564
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
 
565
                                       _mm_subs_epu8(p0, p7)),
 
566
                           _mm_or_si128(_mm_subs_epu8(q7, q0),
 
567
                                        _mm_subs_epu8(q0, q7)));
 
568
      _mm_store_si128((__m128i *)ap[7], p7);
 
569
      _mm_store_si128((__m128i *)aq[7], q7);
 
570
      flat2 = _mm_max_epu8(work, flat2);
 
571
      flat2 = _mm_subs_epu8(flat2, one);
 
572
      flat2 = _mm_cmpeq_epi8(flat2, zero);
 
573
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
 
574
    }
 
575
 
 
576
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
577
    // flat and wide flat calculations
 
578
    {
 
579
      const __m128i eight = _mm_set1_epi16(8);
 
580
      const __m128i four = _mm_set1_epi16(4);
 
581
      __m128i temp_flat2 = flat2;
 
582
      unsigned char *src = s;
 
583
      int i = 0;
 
584
      do {
 
585
        __m128i workp_shft;
 
586
        __m128i a, b, c;
 
587
 
 
588
        unsigned int off = i * 8;
 
589
        p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero);
 
590
        p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero);
 
591
        p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero);
 
592
        p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero);
 
593
        p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero);
 
594
        p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero);
 
595
        p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero);
 
596
        p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero);
 
597
        q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero);
 
598
        q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero);
 
599
        q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero);
 
600
        q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero);
 
601
        q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero);
 
602
        q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero);
 
603
        q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero);
 
604
        q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero);
 
605
 
 
606
        c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
 
607
        c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
 
608
 
 
609
        b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
 
610
        a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
 
611
        a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);
 
612
 
 
613
        _mm_storel_epi64((__m128i *)&flat_op[2][i*8],
 
614
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
 
615
                                          , b));
 
616
 
 
617
        c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
 
618
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
 
619
        _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
 
620
                         _mm_packus_epi16(workp_shft, workp_shft));
 
621
 
 
622
        a = _mm_add_epi16(q1, a);
 
623
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
 
624
        _mm_storel_epi64((__m128i *)&flat_op[1][i*8],
 
625
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
 
626
                                          , b));
 
627
 
 
628
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
 
629
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
 
630
        _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
 
631
                         _mm_packus_epi16(workp_shft, workp_shft));
 
632
 
 
633
        a = _mm_add_epi16(q2, a);
 
634
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
 
635
        _mm_storel_epi64((__m128i *)&flat_op[0][i*8],
 
636
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
 
637
                                          , b));
 
638
 
 
639
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
 
640
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
 
641
        _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
 
642
                         _mm_packus_epi16(workp_shft, workp_shft));
 
643
 
 
644
        a = _mm_add_epi16(q3, a);
 
645
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
 
646
        _mm_storel_epi64((__m128i *)&flat_oq[0][i*8],
 
647
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
 
648
                                          , b));
 
649
 
 
650
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
 
651
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
 
652
        _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
 
653
                         _mm_packus_epi16(workp_shft, workp_shft));
 
654
 
 
655
        b = _mm_add_epi16(q3, b);
 
656
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
 
657
        _mm_storel_epi64((__m128i *)&flat_oq[1][i*8],
 
658
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
 
659
                                          , b));
 
660
 
 
661
        c = _mm_add_epi16(q4, c);
 
662
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
 
663
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
 
664
        _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
 
665
                         _mm_packus_epi16(workp_shft, workp_shft));
 
666
 
 
667
        b = _mm_add_epi16(q3, b);
 
668
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
 
669
        _mm_storel_epi64((__m128i *)&flat_oq[2][i*8],
 
670
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
 
671
                                          , b));
 
672
        a = _mm_add_epi16(q5, a);
 
673
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
 
674
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
 
675
        _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
 
676
                         _mm_packus_epi16(workp_shft, workp_shft));
 
677
 
 
678
        a = _mm_add_epi16(q6, a);
 
679
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
 
680
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
 
681
        _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
 
682
                         _mm_packus_epi16(workp_shft, workp_shft));
 
683
 
 
684
        a = _mm_add_epi16(q7, a);
 
685
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
 
686
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
 
687
        _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
 
688
                         _mm_packus_epi16(workp_shft, workp_shft));
 
689
 
 
690
        a = _mm_add_epi16(q7, a);
 
691
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
 
692
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
 
693
        _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
 
694
                         _mm_packus_epi16(workp_shft, workp_shft));
 
695
 
 
696
        a = _mm_add_epi16(q7, a);
 
697
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
 
698
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
 
699
        _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
 
700
                         _mm_packus_epi16(workp_shft, workp_shft));
 
701
 
 
702
        a = _mm_add_epi16(q7, a);
 
703
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
 
704
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
 
705
        _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
 
706
                         _mm_packus_epi16(workp_shft, workp_shft));
 
707
 
 
708
        a = _mm_add_epi16(q7, a);
 
709
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
 
710
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
 
711
        _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
 
712
                         _mm_packus_epi16(workp_shft, workp_shft));
 
713
 
 
714
        a = _mm_add_epi16(q7, a);
 
715
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
 
716
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
 
717
        _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
 
718
                         _mm_packus_epi16(workp_shft, workp_shft));
 
719
 
 
720
        a = _mm_add_epi16(q7, a);
 
721
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
 
722
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
 
723
        _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
 
724
                         _mm_packus_epi16(workp_shft, workp_shft));
 
725
 
 
726
        temp_flat2 = _mm_srli_si128(temp_flat2, 8);
 
727
        src += 8;
 
728
      } while (++i < 2);
 
729
    }
 
730
    // wide flat
 
731
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
732
 
 
733
    work_a = _mm_load_si128((__m128i *)ap[2]);
 
734
    p2 = _mm_load_si128((__m128i *)flat_op[2]);
 
735
    work_a = _mm_andnot_si128(flat, work_a);
 
736
    p2 = _mm_and_si128(flat, p2);
 
737
    p2 = _mm_or_si128(work_a, p2);
 
738
    _mm_store_si128((__m128i *)flat_op[2], p2);
 
739
 
 
740
    p1 = _mm_load_si128((__m128i *)flat_op[1]);
 
741
    work_a = _mm_andnot_si128(flat, ps1);
 
742
    p1 = _mm_and_si128(flat, p1);
 
743
    p1 = _mm_or_si128(work_a, p1);
 
744
    _mm_store_si128((__m128i *)flat_op[1], p1);
 
745
 
 
746
    p0 = _mm_load_si128((__m128i *)flat_op[0]);
 
747
    work_a = _mm_andnot_si128(flat, ps0);
 
748
    p0 = _mm_and_si128(flat, p0);
 
749
    p0 = _mm_or_si128(work_a, p0);
 
750
    _mm_store_si128((__m128i *)flat_op[0], p0);
 
751
 
 
752
    q0 = _mm_load_si128((__m128i *)flat_oq[0]);
 
753
    work_a = _mm_andnot_si128(flat, qs0);
 
754
    q0 = _mm_and_si128(flat, q0);
 
755
    q0 = _mm_or_si128(work_a, q0);
 
756
    _mm_store_si128((__m128i *)flat_oq[0], q0);
 
757
 
 
758
    q1 = _mm_load_si128((__m128i *)flat_oq[1]);
 
759
    work_a = _mm_andnot_si128(flat, qs1);
 
760
    q1 = _mm_and_si128(flat, q1);
 
761
    q1 = _mm_or_si128(work_a, q1);
 
762
    _mm_store_si128((__m128i *)flat_oq[1], q1);
 
763
 
 
764
    work_a = _mm_load_si128((__m128i *)aq[2]);
 
765
    q2 = _mm_load_si128((__m128i *)flat_oq[2]);
 
766
    work_a = _mm_andnot_si128(flat, work_a);
 
767
    q2 = _mm_and_si128(flat, q2);
 
768
    q2 = _mm_or_si128(work_a, q2);
 
769
    _mm_store_si128((__m128i *)flat_oq[2], q2);
 
770
 
 
771
    // write out op6 - op3
 
772
    {
 
773
      unsigned char *dst = (s - 7 * p);
 
774
      for (i = 6; i > 2; i--) {
 
775
        __m128i flat2_output;
 
776
        work_a = _mm_load_si128((__m128i *)ap[i]);
 
777
        flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
 
778
        work_a = _mm_andnot_si128(flat2, work_a);
 
779
        flat2_output = _mm_and_si128(flat2, flat2_output);
 
780
        work_a = _mm_or_si128(work_a, flat2_output);
 
781
        _mm_storeu_si128((__m128i *)dst, work_a);
 
782
        dst += p;
 
783
      }
 
784
    }
 
785
 
 
786
    work_a = _mm_load_si128((__m128i *)flat_op[2]);
 
787
    p2 = _mm_load_si128((__m128i *)flat2_op[2]);
 
788
    work_a = _mm_andnot_si128(flat2, work_a);
 
789
    p2 = _mm_and_si128(flat2, p2);
 
790
    p2 = _mm_or_si128(work_a, p2);
 
791
    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
 
792
 
 
793
    work_a = _mm_load_si128((__m128i *)flat_op[1]);
 
794
    p1 = _mm_load_si128((__m128i *)flat2_op[1]);
 
795
    work_a = _mm_andnot_si128(flat2, work_a);
 
796
    p1 = _mm_and_si128(flat2, p1);
 
797
    p1 = _mm_or_si128(work_a, p1);
 
798
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
 
799
 
 
800
    work_a = _mm_load_si128((__m128i *)flat_op[0]);
 
801
    p0 = _mm_load_si128((__m128i *)flat2_op[0]);
 
802
    work_a = _mm_andnot_si128(flat2, work_a);
 
803
    p0 = _mm_and_si128(flat2, p0);
 
804
    p0 = _mm_or_si128(work_a, p0);
 
805
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
 
806
 
 
807
    work_a = _mm_load_si128((__m128i *)flat_oq[0]);
 
808
    q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
 
809
    work_a = _mm_andnot_si128(flat2, work_a);
 
810
    q0 = _mm_and_si128(flat2, q0);
 
811
    q0 = _mm_or_si128(work_a, q0);
 
812
    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
 
813
 
 
814
    work_a = _mm_load_si128((__m128i *)flat_oq[1]);
 
815
    q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
 
816
    work_a = _mm_andnot_si128(flat2, work_a);
 
817
    q1 = _mm_and_si128(flat2, q1);
 
818
    q1 = _mm_or_si128(work_a, q1);
 
819
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
 
820
 
 
821
    work_a = _mm_load_si128((__m128i *)flat_oq[2]);
 
822
    q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
 
823
    work_a = _mm_andnot_si128(flat2, work_a);
 
824
    q2 = _mm_and_si128(flat2, q2);
 
825
    q2 = _mm_or_si128(work_a, q2);
 
826
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
 
827
 
 
828
    // write out oq3 - oq7
 
829
    {
 
830
      unsigned char *dst = (s + 3 * p);
 
831
      for (i = 3; i < 7; i++) {
 
832
        __m128i flat2_output;
 
833
        work_a = _mm_load_si128((__m128i *)aq[i]);
 
834
        flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
 
835
        work_a = _mm_andnot_si128(flat2, work_a);
 
836
        flat2_output = _mm_and_si128(flat2, flat2_output);
 
837
        work_a = _mm_or_si128(work_a, flat2_output);
 
838
        _mm_storeu_si128((__m128i *)dst, work_a);
 
839
        dst += p;
 
840
      }
 
841
    }
 
842
  }
 
843
}
 
844
 
 
845
void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
 
846
                                       int p,
 
847
                                       const unsigned char *_blimit,
 
848
                                       const unsigned char *_limit,
 
849
                                       const unsigned char *_thresh,
 
850
                                       int count) {
 
851
  if (count == 1)
 
852
    mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
 
853
  else
 
854
    mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
 
855
}
 
856
 
 
857
void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
 
858
                                            int p,
 
859
                                            const unsigned char *_blimit,
 
860
                                            const unsigned char *_limit,
 
861
                                            const unsigned char *_thresh,
 
862
                                            int count) {
 
863
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
 
864
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
 
865
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
 
866
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
 
867
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
 
868
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
 
869
  __m128i mask, hev, flat;
 
870
  const __m128i zero = _mm_set1_epi16(0);
 
871
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
 
872
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
 
873
  const unsigned int extended_limit  = _limit[0]  * 0x01010101u;
 
874
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
 
875
  const __m128i thresh =
 
876
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
 
877
  const __m128i limit =
 
878
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
 
879
  const __m128i blimit =
 
880
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
 
881
 
 
882
  (void)count;
 
883
  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
 
884
  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
 
885
  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
 
886
  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
 
887
  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
 
888
  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
 
889
  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
 
890
  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
 
891
  {
 
892
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
 
893
                                          _mm_subs_epu8(p0, p1));
 
894
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
 
895
                                          _mm_subs_epu8(q0, q1));
 
896
    const __m128i one = _mm_set1_epi8(1);
 
897
    const __m128i fe = _mm_set1_epi8(0xfe);
 
898
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
 
899
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
 
900
                                    _mm_subs_epu8(q0, p0));
 
901
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
 
902
                                    _mm_subs_epu8(q1, p1));
 
903
    __m128i work;
 
904
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
 
905
    hev = _mm_subs_epu8(flat, thresh);
 
906
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 
907
 
 
908
    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
 
909
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
 
910
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
 
911
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
 
912
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
 
913
    mask = _mm_max_epu8(flat, mask);
 
914
    // mask |= (abs(p1 - p0) > limit) * -1;
 
915
    // mask |= (abs(q1 - q0) > limit) * -1;
 
916
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
 
917
                                     _mm_subs_epu8(p1, p2)),
 
918
                         _mm_or_si128(_mm_subs_epu8(p3, p2),
 
919
                                      _mm_subs_epu8(p2, p3)));
 
920
    mask = _mm_max_epu8(work, mask);
 
921
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
 
922
                                     _mm_subs_epu8(q1, q2)),
 
923
                         _mm_or_si128(_mm_subs_epu8(q3, q2),
 
924
                                      _mm_subs_epu8(q2, q3)));
 
925
    mask = _mm_max_epu8(work, mask);
 
926
    mask = _mm_subs_epu8(mask, limit);
 
927
    mask = _mm_cmpeq_epi8(mask, zero);
 
928
 
 
929
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
 
930
                                     _mm_subs_epu8(p0, p2)),
 
931
                         _mm_or_si128(_mm_subs_epu8(q2, q0),
 
932
                                      _mm_subs_epu8(q0, q2)));
 
933
    flat = _mm_max_epu8(work, flat);
 
934
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
 
935
                                     _mm_subs_epu8(p0, p3)),
 
936
                         _mm_or_si128(_mm_subs_epu8(q3, q0),
 
937
                                      _mm_subs_epu8(q0, q3)));
 
938
    flat = _mm_max_epu8(work, flat);
 
939
    flat = _mm_subs_epu8(flat, one);
 
940
    flat = _mm_cmpeq_epi8(flat, zero);
 
941
    flat = _mm_and_si128(flat, mask);
 
942
  }
 
943
  {
 
944
    const __m128i four = _mm_set1_epi16(4);
 
945
    unsigned char *src = s;
 
946
    {
 
947
      __m128i workp_a, workp_b, workp_shft;
 
948
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
 
949
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
 
950
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
 
951
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
 
952
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
 
953
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
 
954
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
 
955
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
 
956
 
 
957
      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
 
958
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
 
959
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
 
960
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
 
961
      _mm_storel_epi64((__m128i *)&flat_op2[0],
 
962
                       _mm_packus_epi16(workp_shft, workp_shft));
 
963
 
 
964
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
 
965
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
 
966
      _mm_storel_epi64((__m128i *)&flat_op1[0],
 
967
                       _mm_packus_epi16(workp_shft, workp_shft));
 
968
 
 
969
      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
 
970
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
 
971
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
 
972
      _mm_storel_epi64((__m128i *)&flat_op0[0],
 
973
                       _mm_packus_epi16(workp_shft, workp_shft));
 
974
 
 
975
      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
 
976
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
 
977
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
 
978
      _mm_storel_epi64((__m128i *)&flat_oq0[0],
 
979
                       _mm_packus_epi16(workp_shft, workp_shft));
 
980
 
 
981
      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
 
982
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
 
983
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
 
984
      _mm_storel_epi64((__m128i *)&flat_oq1[0],
 
985
                       _mm_packus_epi16(workp_shft, workp_shft));
 
986
 
 
987
      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
 
988
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
 
989
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
 
990
      _mm_storel_epi64((__m128i *)&flat_oq2[0],
 
991
                       _mm_packus_epi16(workp_shft, workp_shft));
 
992
    }
 
993
  }
 
994
  // lp filter
 
995
  {
 
996
    const __m128i t4 = _mm_set1_epi8(4);
 
997
    const __m128i t3 = _mm_set1_epi8(3);
 
998
    const __m128i t80 = _mm_set1_epi8(0x80);
 
999
    const __m128i te0 = _mm_set1_epi8(0xe0);
 
1000
    const __m128i t1f = _mm_set1_epi8(0x1f);
 
1001
    const __m128i t1 = _mm_set1_epi8(0x1);
 
1002
    const __m128i t7f = _mm_set1_epi8(0x7f);
 
1003
 
 
1004
    const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
 
1005
                                      t80);
 
1006
    const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
 
1007
                                      t80);
 
1008
    const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
 
1009
                                      t80);
 
1010
    const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
 
1011
                                      t80);
 
1012
    __m128i filt;
 
1013
    __m128i work_a;
 
1014
    __m128i filter1, filter2;
 
1015
 
 
1016
    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
 
1017
    work_a = _mm_subs_epi8(qs0, ps0);
 
1018
    filt = _mm_adds_epi8(filt, work_a);
 
1019
    filt = _mm_adds_epi8(filt, work_a);
 
1020
    filt = _mm_adds_epi8(filt, work_a);
 
1021
    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
 
1022
    filt = _mm_and_si128(filt, mask);
 
1023
 
 
1024
    filter1 = _mm_adds_epi8(filt, t4);
 
1025
    filter2 = _mm_adds_epi8(filt, t3);
 
1026
 
 
1027
    /* Filter1 >> 3 */
 
1028
    work_a = _mm_cmpgt_epi8(zero, filter1);
 
1029
    filter1 = _mm_srli_epi16(filter1, 3);
 
1030
    work_a = _mm_and_si128(work_a, te0);
 
1031
    filter1 = _mm_and_si128(filter1, t1f);
 
1032
    filter1 = _mm_or_si128(filter1, work_a);
 
1033
 
 
1034
    /* Filter2 >> 3 */
 
1035
    work_a = _mm_cmpgt_epi8(zero, filter2);
 
1036
    filter2 = _mm_srli_epi16(filter2, 3);
 
1037
    work_a = _mm_and_si128(work_a, te0);
 
1038
    filter2 = _mm_and_si128(filter2, t1f);
 
1039
    filter2 = _mm_or_si128(filter2, work_a);
 
1040
 
 
1041
    /* filt >> 1 */
 
1042
    filt = _mm_adds_epi8(filter1, t1);
 
1043
    work_a = _mm_cmpgt_epi8(zero, filt);
 
1044
    filt = _mm_srli_epi16(filt, 1);
 
1045
    work_a = _mm_and_si128(work_a, t80);
 
1046
    filt = _mm_and_si128(filt, t7f);
 
1047
    filt = _mm_or_si128(filt, work_a);
 
1048
 
 
1049
    filt = _mm_andnot_si128(hev, filt);
 
1050
 
 
1051
    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
 
1052
    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
 
1053
    work_a = _mm_andnot_si128(flat, work_a);
 
1054
    q0 = _mm_and_si128(flat, q0);
 
1055
    q0 = _mm_or_si128(work_a, q0);
 
1056
 
 
1057
    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
 
1058
    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
 
1059
    work_a = _mm_andnot_si128(flat, work_a);
 
1060
    q1 = _mm_and_si128(flat, q1);
 
1061
    q1 = _mm_or_si128(work_a, q1);
 
1062
 
 
1063
    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
 
1064
    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
 
1065
    work_a = _mm_andnot_si128(flat, work_a);
 
1066
    q2 = _mm_and_si128(flat, q2);
 
1067
    q2 = _mm_or_si128(work_a, q2);
 
1068
 
 
1069
    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
 
1070
    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
 
1071
    work_a = _mm_andnot_si128(flat, work_a);
 
1072
    p0 = _mm_and_si128(flat, p0);
 
1073
    p0 = _mm_or_si128(work_a, p0);
 
1074
 
 
1075
    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
 
1076
    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
 
1077
    work_a = _mm_andnot_si128(flat, work_a);
 
1078
    p1 = _mm_and_si128(flat, p1);
 
1079
    p1 = _mm_or_si128(work_a, p1);
 
1080
 
 
1081
    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
 
1082
    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
 
1083
    work_a = _mm_andnot_si128(flat, work_a);
 
1084
    p2 = _mm_and_si128(flat, p2);
 
1085
    p2 = _mm_or_si128(work_a, p2);
 
1086
 
 
1087
    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
 
1088
    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
 
1089
    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
 
1090
    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
 
1091
    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
 
1092
    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
 
1093
  }
 
1094
}
 
static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;

  /* Read in 16 lines */
  x0 = _mm_loadl_epi64((__m128i *)in0);
  x8 = _mm_loadl_epi64((__m128i *)in1);
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));
  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));
  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));
  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));
  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));

  x0 = _mm_unpacklo_epi8(x0, x1);
  x1 = _mm_unpacklo_epi8(x2, x3);
  x2 = _mm_unpacklo_epi8(x4, x5);
  x3 = _mm_unpacklo_epi8(x6, x7);

  x8 = _mm_unpacklo_epi8(x8, x9);
  x9 = _mm_unpacklo_epi8(x10, x11);
  x10 = _mm_unpacklo_epi8(x12, x13);
  x11 = _mm_unpacklo_epi8(x14, x15);

  x4 = _mm_unpacklo_epi16(x0, x1);
  x5 = _mm_unpacklo_epi16(x2, x3);
  x12 = _mm_unpacklo_epi16(x8, x9);
  x13 = _mm_unpacklo_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  /* Store first 4-line result */
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  /* Store second 4-line result */
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}
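
/*
 * Illustrative C reference (added for exposition; transpose8x16_c is a
 * hypothetical name, not an upstream function).  transpose8x16 above
 * transposes a 16-row x 8-column block of bytes, whose first eight rows
 * start at in0 and whose next eight rows start at in1, into an
 * 8-row x 16-column block at out.
 */
static void transpose8x16_c(const unsigned char *in0, const unsigned char *in1,
                            int in_p, unsigned char *out, int out_p) {
  int i, j;
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) {
      /* rows 0..7 of the source become output columns 0..7 ... */
      out[j * out_p + i] = in0[i * in_p + j];
      /* ... and rows 8..15 become output columns 8..15 */
      out[j * out_p + 8 + i] = in1[i * in_p + j];
    }
  }
}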
 
static INLINE void transpose(unsigned char *src[], int in_p,
                             unsigned char *dst[], int out_p,
                             int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);
    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 0*out_p),
                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1*out_p),
                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    _mm_storel_pd((double *)(out + 2*out_p),
                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3*out_p),
                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73

    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 4*out_p),
                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5*out_p),
                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    _mm_storel_pd((double *)(out + 6*out_p),
                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7*out_p),
                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}
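
/*
 * Illustrative C reference (added for exposition; transpose8x8_c is a
 * hypothetical name).  Each iteration of the loop above is a plain 8x8 byte
 * transpose; the storel_pd/storeh_pd pairs simply write the low and high
 * eight bytes of each 128-bit result to two separate output rows.
 */
static void transpose8x8_c(const unsigned char *in, int in_p,
                           unsigned char *out, int out_p) {
  int i, j;
  for (i = 0; i < 8; ++i)
    for (j = 0; j < 8; ++j)
      out[j * out_p + i] = in[i * in_p + j];
}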
 
void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
                                          int p,
                                          const unsigned char *blimit,
                                          const unsigned char *limit,
                                          const unsigned char *thresh,
                                          int count) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[2];
  unsigned char *dst[2];

  (void)count;
  /* Transpose 16x16 */
  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                         thresh, 1);
  src[0] = t_dst + 3 * 16;
  src[1] = t_dst + 3 * 16 + 8;

  dst[0] = s - 5;
  dst[1] = s - 5 + p * 8;

  /* Transpose 16x8 */
  transpose(src, 16, dst, p, 2);
}
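
/*
 * Exposition note (not in the upstream file): the routine above filters a
 * vertical edge with the horizontal-edge code by working in a transposed
 * buffer.  The two transpose8x16 calls place the 16x16 neighbourhood of the
 * edge in t_dst so that each buffer row holds one image column (row 8 is the
 * column at s, i.e. q0).  After filtering across row 8, rows 3..10 of t_dst
 * (an 8-row band that covers every column the filter may have changed) are
 * transposed back to image columns s - 5 .. s + 2.
 */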
 
void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
                                     int p,
                                     const unsigned char *blimit,
                                     const unsigned char *limit,
                                     const unsigned char *thresh) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[4];
  unsigned char *dst[4];

  dst[0] = t_dst;
  dst[1] = t_dst + 8 * 16;

  src[0] = s - 8;
  src[1] = s - 8 + 8;

  /* Transpose 16x16 */
  transpose(src, p, dst, 16, 2);

  /* Loop filtering */
  vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                    thresh, 1);

  src[0] = t_dst;
  src[1] = t_dst + 8 * 16;

  dst[0] = s - 8;
  dst[1] = s - 8 + 8;

  transpose(src, 16, dst, p, 2);
}
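
/*
 * Exposition note (not in the upstream file): this wide-filter variant uses
 * the same transpose / filter / transpose-back pattern for eight rows of a
 * vertical edge.  The 8-row x 16-column neighbourhood (columns s - 8 .. s + 7)
 * is transposed into t_dst, one image column per buffer row, the wide
 * horizontal-edge filter is run across buffer row 8 (the q0 column), and the
 * whole block is transposed back because the wide filter can modify up to
 * seven pixels on either side of the edge.
 */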