/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"

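/* HEVC luma deblocking across a horizontal edge: filters one 8-pixel segment,
 * choosing strong or weak filtering per 4-pixel half from the beta/tc
 * thresholds and leaving pcm samples untouched. */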
static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
                                         int32_t beta, int32_t *tc,
                                         uint8_t *p_is_pcm, uint8_t *q_is_pcm)
{
    uint8_t *p3 = src - (stride << 2);
    uint8_t *p2 = src - ((stride << 1) + stride);
    uint8_t *p1 = src - (stride << 1);
    uint8_t *p0 = src - stride;
    uint8_t *q0 = src;
    uint8_t *q1 = src + stride;
    uint8_t *q2 = src + (stride << 1);
    uint8_t *q3 = src + (stride << 1) + stride;
    uint8_t flag0, flag1;
    int32_t dp00, dq00, dp30, dq30, d00, d30;
    int32_t dp04, dq04, dp34, dq34, d04, d34;
    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
    uint64_t dst_val0, dst_val1;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5;
    v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
    v8u16 temp0, temp1;
    v8i16 temp2;
    v8i16 tc_pos, tc_neg;
    v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
    v16i8 zero = { 0 };
    v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;

    dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
    dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
    dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
    dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
    d00 = dp00 + dq00;
    d30 = dp30 + dq30;
    p_is_pcm0 = p_is_pcm[0];
    q_is_pcm0 = q_is_pcm[0];
    dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
    dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
    dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
    dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
    d04 = dp04 + dq04;
    d34 = dp34 + dq34;
    p_is_pcm4 = p_is_pcm[1];
    q_is_pcm4 = q_is_pcm[1];

    if (!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) {
        if (!(d00 + d30 >= beta) || !(d04 + d34 >= beta)) {
            p3_src = LD_UH(p3);
            p2_src = LD_UH(p2);
            p1_src = LD_UH(p1);
            p0_src = LD_UH(p0);
            q0_src = LD_UH(q0);
            q1_src = LD_UH(q1);
            q2_src = LD_UH(q2);
            q3_src = LD_UH(q3);

            tc0 = tc[0];
            beta30 = beta >> 3;
            beta20 = beta >> 2;
            tc250 = ((tc0 * 5 + 1) >> 1);
            tc4 = tc[1];
            tc254 = ((tc4 * 5 + 1) >> 1);

            flag0 = (abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
                     abs(p0[0] - q0[0]) < tc250 &&
                     abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
                     abs(p0[3] - q0[3]) < tc250 &&
                     (d00 << 1) < beta20 && (d30 << 1) < beta20);
            cmp0 = __msa_fill_d(flag0);

            flag1 = (abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
                     abs(p0[4] - q0[4]) < tc254 &&
                     abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
                     abs(p0[7] - q0[7]) < tc254 &&
                     (d04 << 1) < beta20 && (d34 << 1) < beta20);
            cmp1 = __msa_fill_d(flag1);
            cmp2 = __msa_ilvev_d(cmp1, cmp0);
            cmp2 = __msa_ceqi_d(cmp2, 0);

            ILVR_B8_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
                       zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
                       p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src,
                       q3_src);

            cmp0 = (v2i64) __msa_fill_h(tc0);
            cmp1 = (v2i64) __msa_fill_h(tc4);
            tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
            tc_pos <<= 1;
            tc_neg = -tc_pos;

            temp0 = (p1_src + p0_src + q0_src);
            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p2_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst0 = (v16u8) (temp2 + (v8i16) p2_src);

            temp1 = temp0 + p2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - p1_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst1 = (v16u8) (temp2 + (v8i16) p1_src);

            temp1 = (temp0 << 1) + p2_src + q1_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p0_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst2 = (v16u8) (temp2 + (v8i16) p0_src);

            cmp0 = __msa_fill_d(p_is_pcm0);
            cmp1 = __msa_fill_d(p_is_pcm4);
            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);

            temp0 = (q1_src + p0_src + q0_src);

            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q2_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst5 = (v16u8) (temp2 + (v8i16) q2_src);

            temp1 = temp0 + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - q1_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst4 = (v16u8) (temp2 + (v8i16) q1_src);

            temp1 = (temp0 << 1) + p1_src + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q0_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst3 = (v16u8) (temp2 + (v8i16) q0_src);

            cmp0 = __msa_fill_d(q_is_pcm0);
            cmp1 = __msa_fill_d(q_is_pcm4);
            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);

            tc_pos >>= 1;
            tc_neg = -tc_pos;

            diff0 = (v8i16) (q0_src - p0_src);
            diff1 = (v8i16) (q1_src - p1_src);
            diff0 = (diff0 << 3) + diff0;
            diff1 = (diff1 << 1) + diff1;
            delta0 = diff0 - diff1;
            delta0 = __msa_srari_h(delta0, 4);

            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
            abs_delta0 = (v8u16) abs_delta0 < temp1;

            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);

            temp0 = (v8u16) (delta0 + p0_src);
            temp0 = (v8u16) CLIP_SH_0_255(temp0);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
                                        (v16u8) p_is_pcm_vec);

            temp2 = (v8i16) (q0_src - delta0);
            temp2 = CLIP_SH_0_255(temp2);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) q_is_pcm_vec);

            tmp = (beta + (beta >> 1)) >> 3;
            cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp));
            cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp));
            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

            cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
            cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

            tc_pos >>= 1;
            tc_neg = -tc_pos;

            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
            delta1 -= (v8i16) p1_src;
            delta1 += delta0;
            delta1 >>= 1;
            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
            delta1 = (v8i16) p1_src + (v8i16) delta1;
            delta1 = CLIP_SH_0_255(delta1);
            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
                                          (v16u8) p_is_pcm_vec);

            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
            delta2 = delta2 - (v8i16) q1_src;
            delta2 = delta2 - delta0;
            delta2 = delta2 >> 1;
            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
            delta2 = (v8i16) q1_src + (v8i16) delta2;
            delta2 = CLIP_SH_0_255(delta2);
            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
                                          (v16u8) q_is_pcm_vec);

            delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
                                         (v16u8) abs_delta0);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
                                        (v16u8) abs_delta0);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) abs_delta0);
            delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
                                         (v16u8) abs_delta0);

            dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
            dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
            dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
            dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
            dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
            dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);

            cmp0 = __msa_fill_d(d00 + d30 >= beta);
            cmp1 = __msa_fill_d(d04 + d34 >= beta);
            cmp0 = __msa_ilvev_d(cmp1, cmp0);
            cmp0 = __msa_ceqi_d(cmp0, 0);

            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) cmp0);
            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) cmp0);
            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) cmp0);
            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) cmp0);
            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) cmp0);
            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) cmp0);

            PCKEV_B2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
            dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);

            dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
            dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);

            ST8x4_UB(dst0, dst1, p2, stride);
            p2 += (4 * stride);
            SD(dst_val0, p2);
            p2 += stride;
            SD(dst_val1, p2);
        }
    }
}

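/* Vertical-edge counterpart: the 8x8 neighbourhood is loaded, transposed so
 * the same arithmetic as the horizontal case applies, then transposed back
 * and written out row by row. */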
static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
                                         int32_t beta, int32_t *tc,
                                         uint8_t *p_is_pcm, uint8_t *q_is_pcm)
{
    uint8_t *p3 = src;
    uint8_t *p2 = src + 3 * stride;
    uint8_t *p1 = src + (stride << 2);
    uint8_t *p0 = src + 7 * stride;
    uint8_t flag0, flag1;
    uint16_t tmp0, tmp1;
    uint32_t tmp2, tmp3;
    int32_t dp00, dq00, dp30, dq30, d00, d30;
    int32_t dp04, dq04, dp34, dq34, d04, d34;
    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
    v8u16 temp0, temp1;
    v8i16 temp2;
    v8i16 tc_pos, tc_neg;
    v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
    v16i8 zero = { 0 };
    v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;

    dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
    dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
    dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
    dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
    d00 = dp00 + dq00;
    d30 = dp30 + dq30;
    p_is_pcm0 = p_is_pcm[0];
    q_is_pcm0 = q_is_pcm[0];

    dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
    dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
    dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
    dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
    d04 = dp04 + dq04;
    d34 = dp34 + dq34;
    p_is_pcm4 = p_is_pcm[1];
    q_is_pcm4 = q_is_pcm[1];

    if (!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) {
        if (!(d00 + d30 >= beta) || !(d04 + d34 >= beta)) {
            src -= 4;
            LD_UH8(src, stride,
                   p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src,
                   q3_src);

            tc0 = tc[0];
            beta30 = beta >> 3;
            beta20 = beta >> 2;
            tc250 = ((tc0 * 5 + 1) >> 1);

            tc4 = tc[1];
            tc254 = ((tc4 * 5 + 1) >> 1);

            TRANSPOSE8x8_UB_UH(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
                               q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
                               q0_src, q1_src, q2_src, q3_src);

            flag0 = (abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
                     abs(p3[-1] - p3[0]) < tc250 &&
                     abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
                     abs(p2[-1] - p2[0]) < tc250 &&
                     (d00 << 1) < beta20 && (d30 << 1) < beta20);
            cmp0 = __msa_fill_d(flag0);

            flag1 = (abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
                     abs(p1[-1] - p1[0]) < tc254 &&
                     abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
                     abs(p0[-1] - p0[0]) < tc254 &&
                     (d04 << 1) < beta20 && (d34 << 1) < beta20);
            cmp1 = __msa_fill_d(flag1);
            cmp2 = __msa_ilvev_d(cmp1, cmp0);
            cmp2 = __msa_ceqi_d(cmp2, 0);

            ILVR_B8_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
                       zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
                       p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src,
                       q3_src);

            cmp0 = (v2i64) __msa_fill_h(tc0 << 1);
            cmp1 = (v2i64) __msa_fill_h(tc4 << 1);
            tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
            tc_neg = -tc_pos;

            temp0 = (p1_src + p0_src + q0_src);

            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p2_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst0 = (v16u8) (temp2 + (v8i16) p2_src);

            temp1 = temp0 + p2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - p1_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst1 = (v16u8) (temp2 + (v8i16) p1_src);

            temp1 = (temp0 << 1) + p2_src + q1_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p0_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst2 = (v16u8) (temp2 + (v8i16) p0_src);

            cmp0 = __msa_fill_d(p_is_pcm0);
            cmp1 = __msa_fill_d(p_is_pcm4);
            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);

            temp0 = (q1_src + p0_src + q0_src);
            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q2_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst5 = (v16u8) (temp2 + (v8i16) q2_src);

            temp1 = temp0 + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - q1_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst4 = (v16u8) (temp2 + (v8i16) q1_src);

            temp1 = (temp0 << 1) + p1_src + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q0_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst3 = (v16u8) (temp2 + (v8i16) q0_src);

            cmp0 = __msa_fill_d(q_is_pcm0);
            cmp1 = __msa_fill_d(q_is_pcm4);
            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);

            tc_pos >>= 1;
            tc_neg = -tc_pos;

            diff0 = (v8i16) (q0_src - p0_src);
            diff1 = (v8i16) (q1_src - p1_src);
            diff0 = (v8i16) (diff0 << 3) + diff0;
            diff1 = (v8i16) (diff1 << 1) + diff1;
            delta0 = diff0 - diff1;
            delta0 = __msa_srari_h(delta0, 4);

            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
            abs_delta0 = (v8u16) abs_delta0 < temp1;

            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
            temp0 = (v8u16) delta0 + p0_src;
            temp0 = (v8u16) CLIP_SH_0_255(temp0);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
                                        (v16u8) p_is_pcm_vec);

            temp2 = (v8i16) q0_src - delta0;
            temp2 = CLIP_SH_0_255(temp2);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) q_is_pcm_vec);

            tmp = ((beta + (beta >> 1)) >> 3);
            cmp0 = __msa_fill_d(!p_is_pcm0 && (dp00 + dp30 < tmp));
            cmp1 = __msa_fill_d(!p_is_pcm4 && (dp04 + dp34 < tmp));
            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

            cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
            cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

            tc_pos >>= 1;
            tc_neg = -tc_pos;

            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
            delta1 -= (v8i16) p1_src;
            delta1 += delta0;
            delta1 >>= 1;
            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
            delta1 = (v8i16) p1_src + (v8i16) delta1;
            delta1 = CLIP_SH_0_255(delta1);
            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
                                          (v16u8) p_is_pcm_vec);

            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
            delta2 = delta2 - (v8i16) q1_src;
            delta2 = delta2 - delta0;
            delta2 = delta2 >> 1;
            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
            delta2 = (v8i16) q1_src + (v8i16) delta2;
            delta2 = CLIP_SH_0_255(delta2);
            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
                                          (v16u8) q_is_pcm_vec);
            delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
                                         (v16u8) abs_delta0);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
                                        (v16u8) abs_delta0);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) abs_delta0);
            delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
                                         (v16u8) abs_delta0);

            dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
            dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
            dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
            dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
            dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
            dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);

            cmp0 = __msa_fill_d(d00 + d30 >= beta);
            dst7 = (v16u8) __msa_fill_d(d04 + d34 >= beta);
            cmp0 = __msa_ilvev_d((v2i64) dst7, cmp0);
            dst6 = (v16u8) __msa_ceqi_d(cmp0, 0);

            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, dst6);
            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, dst6);
            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, dst6);
            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, dst6);
            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, dst6);
            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, dst6);

            PCKEV_B4_UB(dst0, dst0, dst1, dst1, dst2, dst2, dst3, dst3,
                        dst0, dst1, dst2, dst3);
            PCKEV_B2_UB(dst4, dst4, dst5, dst5, dst4, dst5);

            TRANSPOSE8x8_UB_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
                               dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

            src += 1;

            tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
            tmp0 = __msa_copy_u_h((v8i16) dst0, 2);
            tmp3 = __msa_copy_u_w((v4i32) dst1, 0);
            tmp1 = __msa_copy_u_h((v8i16) dst1, 2);
            SW(tmp2, src);
            SH(tmp0, src + 4);
            src += stride;
            SW(tmp3, src);
            SH(tmp1, src + 4);
            src += stride;

            tmp2 = __msa_copy_u_w((v4i32) dst2, 0);
            tmp0 = __msa_copy_u_h((v8i16) dst2, 2);
            tmp3 = __msa_copy_u_w((v4i32) dst3, 0);
            tmp1 = __msa_copy_u_h((v8i16) dst3, 2);
            SW(tmp2, src);
            SH(tmp0, src + 4);
            src += stride;
            SW(tmp3, src);
            SH(tmp1, src + 4);
            src += stride;

            tmp2 = __msa_copy_u_w((v4i32) dst4, 0);
            tmp0 = __msa_copy_u_h((v8i16) dst4, 2);
            tmp3 = __msa_copy_u_w((v4i32) dst5, 0);
            tmp1 = __msa_copy_u_h((v8i16) dst5, 2);
            SW(tmp2, src);
            SH(tmp0, src + 4);
            src += stride;
            SW(tmp3, src);
            SH(tmp1, src + 4);
            src += stride;

            tmp2 = __msa_copy_u_w((v4i32) dst6, 0);
            tmp0 = __msa_copy_u_h((v8i16) dst6, 2);
            tmp3 = __msa_copy_u_w((v4i32) dst7, 0);
            tmp1 = __msa_copy_u_h((v8i16) dst7, 2);
            SW(tmp2, src);
            SH(tmp0, src + 4);
            src += stride;
            SW(tmp3, src);
            SH(tmp1, src + 4);
        }
    }
}

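/* Chroma deblocking across a horizontal edge: a single weak-filter delta,
 * clipped to +/-tc, applied to p0/q0; pcm samples and edges with tc <= 0
 * are left unchanged. */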
static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride,
                                           int32_t *tc, uint8_t *p_is_pcm,
                                           uint8_t *q_is_pcm)
{
    uint8_t *p1_ptr = src - (stride << 1);
    uint8_t *p0_ptr = src - stride;
    uint8_t *q0_ptr = src;
    uint8_t *q1_ptr = src + stride;
    v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
    v8u16 p1, p0, q0, q1;
    v8i16 tc_pos, tc_neg;
    v16i8 zero = { 0 };
    v8i16 temp0, temp1, delta;

    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
        cmp0 = (v2i64) __msa_fill_h(tc[0]);
        cmp1 = (v2i64) __msa_fill_h(tc[1]);
        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
        tc_neg = -tc_pos;

        cmp0 = __msa_fill_d(p_is_pcm[0]);
        cmp1 = __msa_fill_d(p_is_pcm[1]);
        p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

        cmp0 = __msa_fill_d(q_is_pcm[0]);
        cmp1 = __msa_fill_d(q_is_pcm[1]);
        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

        p1 = LD_UH(p1_ptr);
        p0 = LD_UH(p0_ptr);
        q0 = LD_UH(q0_ptr);
        q1 = LD_UH(q1_ptr);

        ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1);

        temp0 = (v8i16) (q0 - p0);
        temp1 = (v8i16) (p1 - q1);
        temp0 <<= 2;
        temp0 += temp1;
        delta = __msa_srari_h((v8i16) temp0, 3);
        delta = CLIP_SH(delta, tc_neg, tc_pos);

        temp0 = (v8i16) ((v8i16) p0 + delta);
        temp0 = CLIP_SH_0_255(temp0);
        temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
                                    (v16u8) p_is_pcm_vec);

        temp1 = (v8i16) ((v8i16) q0 - delta);
        temp1 = CLIP_SH_0_255(temp1);
        temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
                                    (v16u8) q_is_pcm_vec);

        tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
        temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
        temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);

        temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0);
        ST8x2_UB(temp0, p0_ptr, stride);
    }
}

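/* Vertical-edge chroma filter: eight rows around the edge are loaded and
 * transposed to obtain p1/p0/q0/q1, the same delta is applied, and the two
 * filtered columns are stored back. */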
static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride,
                                           int32_t *tc, uint8_t *p_is_pcm,
                                           uint8_t *q_is_pcm)
{
    v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8u16 p1, p0, q0, q1;
    v8i16 tc_pos, tc_neg;
    v16i8 zero = { 0 };
    v8i16 temp0, temp1, delta;

    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
        cmp0 = (v2i64) __msa_fill_h(tc[0]);
        cmp1 = (v2i64) __msa_fill_h(tc[1]);
        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
        tc_neg = -tc_pos;

        cmp0 = __msa_fill_d(p_is_pcm[0]);
        cmp1 = __msa_fill_d(p_is_pcm[1]);
        p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

        cmp0 = __msa_fill_d(q_is_pcm[0]);
        cmp1 = __msa_fill_d(q_is_pcm[1]);
        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

        src -= 2;
        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
        TRANSPOSE8x4_UB_UH(src0, src1, src2, src3, src4, src5, src6, src7,
                           p1, p0, q0, q1);
        ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1);

        temp0 = (v8i16) (q0 - p0);
        temp1 = (v8i16) (p1 - q1);
        temp0 <<= 2;
        temp0 += temp1;
        delta = __msa_srari_h((v8i16) temp0, 3);
        delta = CLIP_SH(delta, tc_neg, tc_pos);

        temp0 = (v8i16) ((v8i16) p0 + delta);
        temp0 = CLIP_SH_0_255(temp0);
        temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
                                    (v16u8) p_is_pcm_vec);

        temp1 = (v8i16) ((v8i16) q0 - delta);
        temp1 = CLIP_SH_0_255(temp1);
        temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
                                    (v16u8) q_is_pcm_vec);

        tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
        temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
        temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);

        temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);

        src += 1;
        ST2x4_UB(temp0, 0, src, stride);
        src += (4 * stride);
        ST2x4_UB(temp0, 4, src, stride);
    }
}

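/* SAO band filter, 4-pixel width: the top five bits of each sample select a
 * band, and the four signalled offsets are applied through a shuffle table
 * rotated by sao_left_class. */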
static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
                                            uint8_t *src, int32_t src_stride,
                                            int32_t sao_left_class,
                                            int16_t *sao_offset_val,
                                            int32_t height)
{
    int32_t h_cnt;
    v16u8 src0, src1, src2, src3;
    v16i8 src0_r, src1_r;
    v16i8 offset, offset_val, mask;
    v16i8 offset0 = { 0 };
    v16i8 offset1 = { 0 };
    v16i8 zero = { 0 };
    v8i16 temp0, temp1, dst0, dst1;

    offset_val = LD_SB(sao_offset_val + 1);
    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);

    offset_val = __msa_pckev_b(offset_val, offset_val);
    offset1 = (v16i8) __msa_insve_w((v4i32) offset1, 3, (v4i32) offset_val);
    offset0 = __msa_sld_b(offset1, offset0, 28 - ((sao_left_class) & 31));
    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));

    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
        SWAP(offset0, offset1);
    }

    for (h_cnt = height >> 2; h_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        ILVEV_D2_SB(src0, src1, src2, src3, src0_r, src1_r);

        src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
        mask = __msa_srli_b(src0_r, 3);
        offset = __msa_vshf_b(mask, offset1, offset0);

        UNPCK_SB_SH(offset, temp0, temp1);
        ILVRL_B2_SH(zero, src0_r, dst0, dst1);
        ADD2(dst0, temp0, dst1, temp1, dst0, dst1);
        CLIP_SH2_0_255(dst0, dst1);
        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

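/* SAO band filter, 8-pixel width: same lookup as the 4-pixel version, with
 * two 8-byte rows packed per vector. */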
static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
                                            uint8_t *src, int32_t src_stride,
                                            int32_t sao_left_class,
                                            int16_t *sao_offset_val,
                                            int32_t height)
{
    int32_t h_cnt;
    v16u8 src0, src1, src2, src3;
    v16i8 src0_r, src1_r, mask0, mask1;
    v16i8 offset, offset_val;
    v16i8 offset0 = { 0 };
    v16i8 offset1 = { 0 };
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 temp0, temp1, temp2, temp3;

    offset_val = LD_SB(sao_offset_val + 1);
    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
    offset_val = __msa_pckev_b(offset_val, offset_val);
    offset1 = (v16i8) __msa_insve_w((v4i32) offset1, 3, (v4i32) offset_val);
    offset0 = __msa_sld_b(offset1, offset0, 28 - ((sao_left_class) & 31));
    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));

    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
        SWAP(offset0, offset1);
    }

    for (h_cnt = height >> 2; h_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        ILVR_D2_SB(src1, src0, src3, src2, src0_r, src1_r);

        mask0 = __msa_srli_b(src0_r, 3);
        mask1 = __msa_srli_b(src1_r, 3);

        offset = __msa_vshf_b(mask0, offset1, offset0);
        UNPCK_SB_SH(offset, temp0, temp1);

        offset = __msa_vshf_b(mask1, offset1, offset0);
        UNPCK_SB_SH(offset, temp2, temp3);

        UNPCK_UB_SH(src0_r, dst0, dst1);
        UNPCK_UB_SH(src1_r, dst2, dst3);
        ADD4(dst0, temp0, dst1, temp1, dst2, temp2, dst3, temp3,
             dst0, dst1, dst2, dst3);
        CLIP_SH4_0_255(dst0, dst1, dst2, dst3);
        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst2);
        ST8x4_UB(dst0, dst2, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

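/* SAO band filter for widths that are multiples of 16, processing a 16x4
 * block per inner iteration. */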
static void hevc_sao_band_filter_16multiple_msa(uint8_t *dst,
                                                int32_t dst_stride,
                                                uint8_t *src,
                                                int32_t src_stride,
                                                int32_t sao_left_class,
                                                int16_t *sao_offset_val,
                                                int32_t width, int32_t height)
{
    int32_t h_cnt, w_cnt;
    v16u8 src0, src1, src2, src3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 out0, out1, out2, out3;
    v16i8 mask0, mask1, mask2, mask3;
    v16i8 tmp0, tmp1, tmp2, tmp3, offset_val;
    v16i8 offset0 = { 0 };
    v16i8 offset1 = { 0 };
    v16i8 zero = { 0 };
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

    offset_val = LD_SB(sao_offset_val + 1);
    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
    offset_val = __msa_pckev_b(offset_val, offset_val);
    offset1 = (v16i8) __msa_insve_w((v4i32) offset1, 3, (v4i32) offset_val);
    offset0 = __msa_sld_b(offset1, offset0, 28 - ((sao_left_class) & 31));
    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));

    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
        SWAP(offset0, offset1);
    }

    for (h_cnt = height >> 2; h_cnt--;) {
        for (w_cnt = 0; w_cnt < (width >> 4); w_cnt++) {
            LD_UB4(src + w_cnt * 16, src_stride, src0, src1, src2, src3);

            mask0 = __msa_srli_b((v16i8) src0, 3);
            mask1 = __msa_srli_b((v16i8) src1, 3);
            mask2 = __msa_srli_b((v16i8) src2, 3);
            mask3 = __msa_srli_b((v16i8) src3, 3);

            VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1,
                       tmp0, tmp1);
            VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3,
                       tmp2, tmp3);
            UNPCK_SB_SH(tmp0, temp0, temp1);
            UNPCK_SB_SH(tmp1, temp2, temp3);
            UNPCK_SB_SH(tmp2, temp4, temp5);
            UNPCK_SB_SH(tmp3, temp6, temp7);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            ILVRL_B2_SH(zero, src2, dst4, dst5);
            ILVRL_B2_SH(zero, src3, dst6, dst7);
            ADD4(dst0, temp0, dst1, temp1, dst2, temp2, dst3, temp3,
                 dst0, dst1, dst2, dst3);
            ADD4(dst4, temp4, dst5, temp5, dst6, temp6, dst7, temp7,
                 dst4, dst5, dst6, dst7);
            CLIP_SH4_0_255(dst0, dst1, dst2, dst3);
            CLIP_SH4_0_255(dst4, dst5, dst6, dst7);
            PCKEV_B4_SB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                        out0, out1, out2, out3);
            ST_SB4(out0, out1, out2, out3, dst + w_cnt * 16, dst_stride);
        }

        src += src_stride << 2;
        dst += dst_stride << 2;
    }
}

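/* SAO edge filter, 0-degree (horizontal) class, 4-pixel width: each sample is
 * compared with its left and right neighbours, the resulting edge index picks
 * an offset from sao_offset_val, and the sum is clipped to 8 bit. */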
static void hevc_sao_edge_filter_0degree_4width_msa(uint8_t *dst,
                                                    int32_t dst_stride,
                                                    uint8_t *src,
                                                    int32_t src_stride,
                                                    int16_t *sao_offset_val,
                                                    int32_t height)
{
    int32_t h_cnt;
    uint32_t dst_val0, dst_val1;
    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src_minus11;
    v16i8 zero = { 0 };
    v16i8 src_zero0, src_zero1, src_plus10, src_plus11, dst0;
    v8i16 offset_mask0, offset_mask1;
    v8i16 sao_offset, src00, src01;

    sao_offset = LD_SH(sao_offset_val);
    src -= 1;

    for (h_cnt = (height >> 1); h_cnt--;) {
        LD_UB2(src, src_stride, src_minus10, src_minus11);
        src += (2 * src_stride);

        SLDI_B2_0_SB(src_minus10, src_minus11, src_zero0, src_zero1, 1);
        SLDI_B2_0_SB(src_minus10, src_minus11, src_plus10, src_plus11, 2);
        ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
                   src_minus10, src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
                   src_zero1);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                   offset_mask0, offset_mask0, offset_mask0);
        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                   offset_mask1, offset_mask1, offset_mask1);
        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
             offset_mask1);
        CLIP_SH2_0_255(offset_mask0, offset_mask1);

        dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
        SW(dst_val0, dst);
        dst += dst_stride;
        SW(dst_val1, dst);
        dst += dst_stride;
    }
}

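/* 0-degree edge filter, 8-pixel width. */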
static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst,
                                                    int32_t dst_stride,
                                                    uint8_t *src,
                                                    int32_t src_stride,
                                                    int16_t *sao_offset_val,
                                                    int32_t height)
{
    uint8_t *src_minus1;
    int32_t h_cnt;
    uint64_t dst_val0, dst_val1;
    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 dst0, dst1;
    v16i8 zero = { 0 };
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src_minus11;
    v16i8 src_zero0, src_plus10, src_zero1, src_plus11;
    v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1;

    sao_offset = LD_SH(sao_offset_val);

    for (h_cnt = (height >> 1); h_cnt--;) {
        src_minus1 = src - 1;
        LD_UB2(src_minus1, src_stride, src_minus10, src_minus11);

        SLDI_B2_0_SB(src_minus10, src_minus11, src_zero0, src_zero1, 1);
        SLDI_B2_0_SB(src_minus10, src_minus11, src_plus10, src_plus11, 2);
        ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
                   src_minus10, src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1,
                   src_zero0, src_zero1);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                   offset_mask0, offset_mask0, offset_mask0);
        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                   offset_mask1, offset_mask1, offset_mask1);
        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);

        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
             offset_mask1);
        CLIP_SH2_0_255(offset_mask0, offset_mask1);
        PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1,
                    dst0, dst1);

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
        SD(dst_val0, dst);
        dst += dst_stride;
        SD(dst_val1, dst);
        dst += dst_stride;
        src += (src_stride << 1);
    }
}

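/* 0-degree edge filter for widths that are multiples of 16, processing a
 * 16x4 block per inner iteration. */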
static void hevc_sao_edge_filter_0degree_16multiple_msa(uint8_t *dst,
                                                        int32_t dst_stride,
                                                        uint8_t *src,
                                                        int32_t src_stride,
                                                        int16_t *sao_offset_val,
                                                        int32_t width,
                                                        int32_t height)
{
    uint8_t *dst_ptr, *src_minus1;
    int32_t h_cnt, v_cnt;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 sao_offset;
    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    v16u8 diff_plus13;
    v16u8 src10, src11, src12, src13, dst0, dst1, dst2, dst3;
    v16u8 src_minus10, src_minus11, src_minus12, src_minus13;
    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    v16i8 src_zero0, src_zero1, src_zero2, src_zero3;
    v16i8 src_plus10, src_plus11, src_plus12, src_plus13;
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

    sao_offset = LD_SB(sao_offset_val);
    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    for (h_cnt = (height >> 2); h_cnt--;) {
        src_minus1 = src - 1;
        LD_UB4(src_minus1, src_stride,
               src_minus10, src_minus11, src_minus12, src_minus13);

        for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) {
            src_minus1 += 16;
            dst_ptr = dst + (v_cnt << 4);
            LD_UB4(src_minus1, src_stride, src10, src11, src12, src13);

            SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_zero0,
                       src_zero1, 1);
            SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_zero2,
                       src_zero3, 1);
            SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_plus10,
                       src_plus11, 2);
            SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_plus12,
                       src_plus13, 2);

            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
            cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
            cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
            cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
            cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
            cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
            cmp_plus13 = ((v16u8) src_zero3 == (v16u8) src_plus13);

            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);

            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
            cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
            cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
            cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
            cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
            cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
            cmp_plus13 = ((v16u8) src_plus13 < (v16u8) src_zero3);

            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);

            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                       offset_mask0, offset_mask0, offset_mask0);
            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                       offset_mask1, offset_mask1, offset_mask1);
            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask2,
                       offset_mask2, offset_mask2, offset_mask2);
            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask3,
                       offset_mask3, offset_mask3, offset_mask3);

            UNPCK_UB_SH(src_zero0, src0, src1);
            UNPCK_SB_SH(offset_mask0, temp0, temp1);
            UNPCK_UB_SH(src_zero1, src2, src3);
            UNPCK_SB_SH(offset_mask1, temp2, temp3);
            UNPCK_UB_SH(src_zero2, src4, src5);
            UNPCK_SB_SH(offset_mask2, temp4, temp5);
            UNPCK_UB_SH(src_zero3, src6, src7);
            UNPCK_SB_SH(offset_mask3, temp6, temp7);
            ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0,
                 temp1, temp2, temp3);
            ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4,
                 temp5, temp6, temp7);
            CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
            CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
            PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
                        dst0, dst1, dst2, dst3);

            src_minus10 = src10;
            ST_UB(dst0, dst_ptr);
            src_minus11 = src11;
            ST_UB(dst1, dst_ptr + dst_stride);
            src_minus12 = src12;
            ST_UB(dst2, dst_ptr + (dst_stride << 1));
            src_minus13 = src13;
            ST_UB(dst3, dst_ptr + (dst_stride * 3));
        }

        src += (src_stride << 2);
        dst += (dst_stride << 2);
    }
}

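/* SAO edge filter, 90-degree (vertical) class, 4-pixel width: each sample is
 * compared with the samples directly above and below. */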
static void hevc_sao_edge_filter_90degree_4width_msa(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    int32_t h_cnt;
    uint32_t dst_val0, dst_val1;
    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 dst0;
    v16i8 zero = { 0 };
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src_minus11, src10, src11;
    v16i8 src_zero0, src_zero1;
    v8i16 sao_offset, src00, src01, offset_mask0, offset_mask1;

    sao_offset = LD_SH(sao_offset_val);

    LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);

    for (h_cnt = (height >> 1); h_cnt--;) {
        LD_UB2(src + src_stride, src_stride, src10, src11);

        src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
        src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
        src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
        src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                   offset_mask0, offset_mask0, offset_mask0);
        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                   offset_mask1, offset_mask1, offset_mask1);
        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
             offset_mask1);
        CLIP_SH2_0_255(offset_mask0, offset_mask1);
        dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);

        src_minus10 = src10;
        src_minus11 = src11;

        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
        SW(dst_val0, dst);
        dst += dst_stride;
        SW(dst_val1, dst);

        dst += dst_stride;
        src += (src_stride << 1);
    }
}

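/* 90-degree edge filter, 8-pixel width. */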
static void hevc_sao_edge_filter_90degree_8width_msa(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    int32_t h_cnt;
    uint64_t dst_val0, dst_val1;
    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 zero = { 0 };
    v16i8 src_zero0, src_zero1, dst0, dst1;
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src_minus11, src10, src11;
    v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1;

    sao_offset = LD_SH(sao_offset_val);

    LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);

    for (h_cnt = (height >> 1); h_cnt--;) {
        LD_UB2(src + src_stride, src_stride, src10, src11);

        src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
        src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
        src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
        src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                   offset_mask0, offset_mask0, offset_mask0);
        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                   offset_mask1, offset_mask1, offset_mask1);
        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
             offset_mask1);
        CLIP_SH2_0_255(offset_mask0, offset_mask1);
        PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1,
                    dst0, dst1);

        src_minus10 = src10;
        src_minus11 = src11;

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
        SD(dst_val0, dst);
        dst += dst_stride;
        SD(dst_val1, dst);
        dst += dst_stride;
        src += (src_stride << 1);
    }
}

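/* Vertical (90 degree) SAO edge filtering for widths that are a multiple of
 * 16: each inner iteration handles a 16-byte column over four rows, comparing
 * every pixel with the rows directly above and below it, with the edge
 * category computed entirely in the byte domain. */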
static void hevc_sao_edge_filter_90degree_16multiple_msa(uint8_t *dst,
                                                         int32_t dst_stride,
                                                         uint8_t *src,
                                                         int32_t src_stride,
                                                         int16_t *
                                                         sao_offset_val,
                                                         int32_t width,
                                                         int32_t height)
{
    uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;
    int32_t h_cnt, v_cnt;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    v16u8 diff_plus13;
    v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
    v16u8 src12, dst2, src13, dst3;
    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

    sao_offset = LD_SB(sao_offset_val);
    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) {
        src = src_orig + (v_cnt << 4);
        dst = dst_orig + (v_cnt << 4);

        LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);

        for (h_cnt = (height >> 2); h_cnt--;) {
            LD_UB4(src + src_stride, src_stride, src10, src11, src12, src13);

            cmp_minus10 = (src_minus11 == src_minus10);
            cmp_plus10 = (src_minus11 == src10);
            cmp_minus11 = (src10 == src_minus11);
            cmp_plus11 = (src10 == src11);
            cmp_minus12 = (src11 == src10);
            cmp_plus12 = (src11 == src12);
            cmp_minus13 = (src12 == src11);
            cmp_plus13 = (src12 == src13);

            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);

            cmp_minus10 = (src_minus10 < src_minus11);
            cmp_plus10 = (src10 < src_minus11);
            cmp_minus11 = (src_minus11 < src10);
            cmp_plus11 = (src11 < src10);
            cmp_minus12 = (src10 < src11);
            cmp_plus12 = (src12 < src11);
            cmp_minus13 = (src11 < src12);
            cmp_plus13 = (src13 < src12);

            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);

            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);

            UNPCK_UB_SH(src_minus11, src0, src1);
            UNPCK_SB_SH(offset_mask0, temp0, temp1);
            UNPCK_UB_SH(src10, src2, src3);
            UNPCK_SB_SH(offset_mask1, temp2, temp3);
            UNPCK_UB_SH(src11, src4, src5);
            UNPCK_SB_SH(offset_mask2, temp4, temp5);
            UNPCK_UB_SH(src12, src6, src7);
            UNPCK_SB_SH(offset_mask3, temp6, temp7);
            ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0,
                 temp1, temp2, temp3);
            ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4,
                 temp5, temp6, temp7);
            CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
            CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
            PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
                        dst0, dst1, dst2, dst3);

            src_minus10 = src12;
            src_minus11 = src13;

            ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);

            src += (src_stride << 2);
            dst += (dst_stride << 2);
        }
    }
}

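/* SAO edge filtering along the first diagonal for 4-pixel wide blocks: the
 * rows are read starting one column to the left (src - 1), so the unshifted
 * row above gives the upper-left neighbour, a 1-byte shift gives the current
 * pixel and a 2-byte shift of the row below gives the lower-right
 * neighbour. */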
static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint8_t *src_orig;
    int32_t h_cnt;
    uint32_t dst_val0, dst_val1;
    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 zero = { 0 };
    v16u8 cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus11, src10, src11;
    v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0;
    v8i16 sao_offset, src00, src01, offset_mask0, offset_mask1;

    sao_offset = LD_SH(sao_offset_val);

    src_orig = src - 1;
    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);

    for (h_cnt = (height >> 1); h_cnt--;) {
        LD_UB2(src_orig + src_stride, src_stride, src10, src11);

        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
        SLDI_B2_0_SB(src10, src11, src_plus0, src_plus1, 2);

        ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
                   src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
                   src_zero1);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                   offset_mask0, offset_mask0, offset_mask0);
        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                   offset_mask1, offset_mask1, offset_mask1);
        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
             offset_mask1);
        CLIP_SH2_0_255(offset_mask0, offset_mask1);

        dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);

        src_minus10 = src10;
        src_minus11 = src11;

        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
        SW(dst_val0, dst);
        dst += dst_stride;
        SW(dst_val1, dst);

        dst += dst_stride;
        src_orig += (src_stride << 1);
    }
}

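/* Same diagonal classification as the 4-pixel variant above, but for 8-pixel
 * wide blocks: two rows are processed per iteration and stored as 64-bit
 * words. */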
static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint8_t *src_orig;
    int32_t h_cnt;
    uint64_t dst_val0, dst_val1;
    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 zero = { 0 };
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src10, src_minus11, src11;
    v16i8 src_zero0, src_plus10, src_zero1, src_plus11, dst0, dst1;
    v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1;

    sao_offset = LD_SH(sao_offset_val);
    src_orig = src - 1;

    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);

    for (h_cnt = (height >> 1); h_cnt--;) {
        LD_UB2(src_orig + src_stride, src_stride, src10, src11);

        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
        SLDI_B2_0_SB(src10, src11, src_plus10, src_plus11, 2);

        ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
                   src_minus10, src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1,
                   src_zero0, src_zero1);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                   offset_mask0, offset_mask0, offset_mask0);
        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                   offset_mask1, offset_mask1, offset_mask1);
        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
             offset_mask1);
        CLIP_SH2_0_255(offset_mask0, offset_mask1);
        PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1,
                    dst0, dst1);

        src_minus10 = src10;
        src_minus11 = src11;

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
        SD(dst_val0, dst);
        dst += dst_stride;
        SD(dst_val1, dst);

        dst += dst_stride;
        src_orig += (src_stride << 1);
    }
}

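/* Diagonal (upper-left / lower-right neighbour) SAO edge filtering for widths
 * that are a multiple of 16, processing a 16x4 block per inner iteration; the
 * additional loads 16 columns ahead provide the bytes needed when the shifted
 * views cross the right edge of each 16-byte vector. */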
static void hevc_sao_edge_filter_45degree_16multiple_msa(uint8_t *dst,
                                                         int32_t dst_stride,
                                                         uint8_t *src,
                                                         int32_t src_stride,
                                                         int16_t *
                                                         sao_offset_val,
                                                         int32_t width,
                                                         int32_t height)
{
    uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;
    int32_t h_cnt, v_cnt;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    v16u8 diff_plus13, src_minus14, src_plus13;
    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
    v16u8 src12, src_minus12, dst2, src13, src_minus13, dst3;
    v16i8 src_zero0, src_plus10, src_zero1, src_plus11, src_zero2, src_plus12;
    v16i8 src_zero3, sao_offset;
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

    sao_offset = LD_SB(sao_offset_val);
    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    for (h_cnt = (height >> 2); h_cnt--;) {
        src_orig = src - 1;
        dst_orig = dst;
        LD_UB4(src_orig, src_stride,
               src_minus11, src_minus12, src_minus13, src_minus14);

        for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) {
            src_minus10 = LD_UB(src_orig - src_stride);
            LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
            src_plus13 = LD_UB(src + 1 + (v_cnt << 4) + (src_stride << 2));
            src_orig += 16;

            SLDI_B2_SB(src10, src11, src_minus11, src_minus12, src_zero0,
                       src_zero1, 1);
            SLDI_B2_SB(src12, src13, src_minus13, src_minus14, src_zero2,
                       src_zero3, 1);
            SLDI_B2_SB(src11, src12, src_minus12, src_minus13, src_plus10,
                       src_plus11, 2);

            src_plus12 = __msa_sldi_b((v16i8) src13, (v16i8) src_minus14, 2);

            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
            cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
            cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
            cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
            cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
            cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
            cmp_plus13 = ((v16u8) src_zero3 == src_plus13);

            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);

            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
            cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
            cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
            cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
            cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
            cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
            cmp_plus13 = (src_plus13 < (v16u8) src_zero3);

            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);

            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;

            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);

            UNPCK_UB_SH(src_zero0, src0, src1);
            UNPCK_SB_SH(offset_mask0, temp0, temp1);
            UNPCK_UB_SH(src_zero1, src2, src3);
            UNPCK_SB_SH(offset_mask1, temp2, temp3);
            UNPCK_UB_SH(src_zero2, src4, src5);
            UNPCK_SB_SH(offset_mask2, temp4, temp5);
            UNPCK_UB_SH(src_zero3, src6, src7);
            UNPCK_SB_SH(offset_mask3, temp6, temp7);
            ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0,
                 temp1, temp2, temp3);
            ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4,
                 temp5, temp6, temp7);
            CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
            CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
            PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4,
                        temp7, temp6, dst0, dst1, dst2, dst3);

            src_minus11 = src10;
            src_minus12 = src11;
            src_minus13 = src12;
            src_minus14 = src13;

            ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
            dst_orig += 16;
        }

        src += (src_stride << 2);
        dst += (dst_stride << 2);
    }
}

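/* SAO edge filtering along the second diagonal for 4-pixel wide blocks: the
 * row above (loaded from column -1) is shifted by two bytes to give the
 * upper-right neighbour, while the unshifted row below supplies the
 * lower-left neighbour. */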
static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst,
                                                      int32_t dst_stride,
                                                      uint8_t *src,
                                                      int32_t src_stride,
                                                      int16_t *sao_offset_val,
                                                      int32_t height)
{
    uint8_t *src_orig;
    int32_t h_cnt;
    uint32_t dst_val0, dst_val1;
    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 zero = { 0 };
    v16i8 src_zero0, src_zero1, dst0;
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src10, src_minus11, src11;
    v8i16 offset_mask0, offset_mask1, sao_offset, src00, src01;

    sao_offset = LD_SH(sao_offset_val);
    src_orig = src - 1;

    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);

    for (h_cnt = (height >> 1); h_cnt--;) {
        LD_UB2(src_orig + src_stride, src_stride, src10, src11);

        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
        SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);

        ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
                   src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
                   src_zero1);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                   offset_mask0, offset_mask0, offset_mask0);
        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                   offset_mask1, offset_mask1, offset_mask1);
        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
             offset_mask1);
        CLIP_SH2_0_255(offset_mask0, offset_mask1);
        dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);

        src_minus10 = src10;
        src_minus11 = src11;

        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);

        SW(dst_val0, dst);
        dst += dst_stride;
        SW(dst_val1, dst);

        dst += dst_stride;
        src_orig += (src_stride << 1);
    }
}

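/* 8-pixel wide version of the second-diagonal edge filter: identical
 * classification, with two rows per iteration and 64-bit stores. */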
static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst,
                                                      int32_t dst_stride,
                                                      uint8_t *src,
                                                      int32_t src_stride,
                                                      int16_t *sao_offset_val,
                                                      int32_t height)
{
    uint8_t *src_orig;
    int32_t h_cnt;
    uint64_t dst_val0, dst_val1;
    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 zero = { 0 };
    v16i8 src_zero0, src_zero1, dst0, dst1;
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src10, src_minus11, src11;
    v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1;

    sao_offset = LD_SH(sao_offset_val);
    src_orig = src - 1;

    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);

    for (h_cnt = (height >> 1); h_cnt--;) {
        LD_UB2(src_orig + src_stride, src_stride, src10, src11);

        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
        SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
        ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
                   src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
                   src_zero1);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                   offset_mask0, offset_mask0, offset_mask0);
        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                   offset_mask1, offset_mask1, offset_mask1);
        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
             offset_mask1);
        CLIP_SH2_0_255(offset_mask0, offset_mask1);
        PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1,
                    dst0, dst1);

        src_minus10 = src10;
        src_minus11 = src11;

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);

        SD(dst_val0, dst);
        dst += dst_stride;
        SD(dst_val1, dst);
        dst += dst_stride;

        src_orig += (src_stride << 1);
    }
}

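/* Second-diagonal (upper-right / lower-left neighbour) SAO edge filtering for
 * widths that are a multiple of 16, working on 16x4 blocks per inner
 * iteration. */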
static void hevc_sao_edge_filter_135degree_16multiple_msa(uint8_t *dst,
                                                          int32_t dst_stride,
                                                          uint8_t *src,
                                                          int32_t src_stride,
                                                          int16_t *
                                                          sao_offset_val,
                                                          int32_t width,
                                                          int32_t height)
{
    uint8_t *src_orig, *dst_orig;
    int32_t h_cnt, v_cnt;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
    v16u8 cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
    v16u8 diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
    v16u8 diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
    v16u8 src_plus10, src_plus11, src_plus12, src_plus13;
    v16i8 src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

    sao_offset = LD_SB(sao_offset_val);
    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    for (h_cnt = (height >> 2); h_cnt--;) {
        src_orig = src - 1;
        dst_orig = dst;

        LD_UB4(src_orig, src_stride,
               src_minus11, src_plus10, src_plus11, src_plus12);

        for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) {
            src_minus10 = LD_UB(src_orig + 2 - src_stride);
            LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
            src_plus13 = LD_UB(src_orig + (src_stride << 2));
            src_orig += 16;

            src_zero0 = __msa_sldi_b((v16i8) src10, (v16i8) src_minus11, 1);
            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
            cmp_plus10 = ((v16u8) src_zero0 == src_plus10);

            src_zero1 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 1);
            src_minus11 = (v16u8) __msa_sldi_b((v16i8) src10,
                                               (v16i8) src_minus11, 2);
            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
            cmp_plus11 = ((v16u8) src_zero1 == src_plus11);

            src_zero2 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 1);
            src_minus12 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 2);
            cmp_minus12 = ((v16u8) src_zero2 == (v16u8) src_minus12);
            cmp_plus12 = ((v16u8) src_zero2 == src_plus12);

            src_zero3 = __msa_sldi_b((v16i8) src13, (v16i8) src_plus12, 1);
            src_minus13 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 2);
            cmp_minus13 = ((v16u8) src_zero3 == (v16u8) src_minus13);
            cmp_plus13 = ((v16u8) src_zero3 == src_plus13);

            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);

            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
            cmp_plus10 = (src_plus10 < (v16u8) src_zero0);
            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
            cmp_plus11 = (src_plus11 < (v16u8) src_zero1);
            cmp_minus12 = ((v16u8) src_minus12 < (v16u8) src_zero2);
            cmp_plus12 = (src_plus12 < (v16u8) src_zero2);
            cmp_minus13 = ((v16u8) src_minus13 < (v16u8) src_zero3);
            cmp_plus13 = (src_plus13 < (v16u8) src_zero3);

            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);

            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;

            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);

            UNPCK_UB_SH(src_zero0, src0, src1);
            UNPCK_SB_SH(offset_mask0, temp0, temp1);
            UNPCK_UB_SH(src_zero1, src2, src3);
            UNPCK_SB_SH(offset_mask1, temp2, temp3);
            UNPCK_UB_SH(src_zero2, src4, src5);
            UNPCK_SB_SH(offset_mask2, temp4, temp5);
            UNPCK_UB_SH(src_zero3, src6, src7);
            UNPCK_SB_SH(offset_mask3, temp6, temp7);

            ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0,
                 temp1, temp2, temp3);
            ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4,
                 temp5, temp6, temp7);
            CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
            CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
            PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
                        dst0, dst1, dst2, dst3);

            src_minus11 = src10;
            src_plus10 = src11;
            src_plus11 = src12;
            src_plus12 = src13;

            ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
            dst_orig += 16;
        }

        src += (src_stride << 2);
        dst += (dst_stride << 2);
    }
}

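/*
 * Exported wrappers: these are the functions declared in hevcdsp_mips.h and
 * are expected to be hooked into HEVCDSPContext by the MIPS DSP
 * initialisation code; they simply forward to the static MSA implementations
 * above.
 */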
void ff_hevc_loop_filter_luma_h_8_msa(uint8_t *src,
                                      ptrdiff_t src_stride,
                                      int32_t beta, int32_t *tc,
                                      uint8_t *no_p, uint8_t *no_q)
{
    hevc_loopfilter_luma_hor_msa(src, src_stride, beta, tc, no_p, no_q);
}

void ff_hevc_loop_filter_luma_v_8_msa(uint8_t *src,
                                      ptrdiff_t src_stride,
                                      int32_t beta, int32_t *tc,
                                      uint8_t *no_p, uint8_t *no_q)
{
    hevc_loopfilter_luma_ver_msa(src, src_stride, beta, tc, no_p, no_q);
}

void ff_hevc_loop_filter_chroma_h_8_msa(uint8_t *src,
                                        ptrdiff_t src_stride,
                                        int32_t *tc, uint8_t *no_p,
                                        uint8_t *no_q)
{
    hevc_loopfilter_chroma_hor_msa(src, src_stride, tc, no_p, no_q);
}

void ff_hevc_loop_filter_chroma_v_8_msa(uint8_t *src,
                                        ptrdiff_t src_stride,
                                        int32_t *tc, uint8_t *no_p,
                                        uint8_t *no_q)
{
    hevc_loopfilter_chroma_ver_msa(src, src_stride, tc, no_p, no_q);
}

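/* Band filter entry point: the width is split into a 16-aligned part, an
 * optional 8-pixel column and an optional 4-pixel remainder, each handled by
 * the matching static helper. */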
void ff_hevc_sao_band_filter_0_8_msa(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride_dst, ptrdiff_t stride_src,
                                     int16_t *sao_offset_val, int sao_left_class,
                                     int width, int height)
{
    if (width >> 4) {
        hevc_sao_band_filter_16multiple_msa(dst, stride_dst, src, stride_src,
                                            sao_left_class, sao_offset_val,
                                            width - (width % 16), height);
        dst += width - (width % 16);
        src += width - (width % 16);
        width %= 16;
    }

    if (width >> 3) {
        hevc_sao_band_filter_8width_msa(dst, stride_dst, src, stride_src,
                                        sao_left_class, sao_offset_val, height);
        dst += 8;
        src += 8;
        width %= 8;
    }

    if (width) {
        hevc_sao_band_filter_4width_msa(dst, stride_dst, src, stride_src,
                                        sao_left_class, sao_offset_val, height);
    }
}

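/* Edge filter entry point: eo selects the filter direction (0 = horizontal,
 * 1 = vertical, 2 and 3 = the two diagonals) and the width is decomposed the
 * same way as in the band filter.  The fixed source stride (2 * 64 + 32)
 * presumably corresponds to the decoder's temporary SAO buffer stride of
 * 2 * MAX_PB_SIZE plus input-buffer padding. */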
void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src,
                                   ptrdiff_t stride_dst,
                                   int16_t *sao_offset_val,
                                   int eo, int width, int height)
{
    ptrdiff_t stride_src = (2 * 64 + 32) / sizeof(uint8_t);

    switch (eo) {
    case 0:
        if (width >> 4) {
            hevc_sao_edge_filter_0degree_16multiple_msa(dst, stride_dst,
                                                        src, stride_src,
                                                        sao_offset_val,
                                                        width - (width % 16),
                                                        height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_0degree_8width_msa(dst, stride_dst,
                                                    src, stride_src,
                                                    sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_0degree_4width_msa(dst, stride_dst,
                                                    src, stride_src,
                                                    sao_offset_val, height);
        }
        break;

    case 1:
        if (width >> 4) {
            hevc_sao_edge_filter_90degree_16multiple_msa(dst, stride_dst,
                                                         src, stride_src,
                                                         sao_offset_val,
                                                         width - (width % 16),
                                                         height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_90degree_8width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_90degree_4width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
        }
        break;

    case 2:
        if (width >> 4) {
            hevc_sao_edge_filter_45degree_16multiple_msa(dst, stride_dst,
                                                         src, stride_src,
                                                         sao_offset_val,
                                                         width - (width % 16),
                                                         height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_45degree_8width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_45degree_4width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
        }
        break;

    case 3:
        if (width >> 4) {
            hevc_sao_edge_filter_135degree_16multiple_msa(dst, stride_dst,
                                                          src, stride_src,
                                                          sao_offset_val,
                                                          width - (width % 16),
                                                          height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_135degree_8width_msa(dst, stride_dst,
                                                      src, stride_src,
                                                      sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_135degree_4width_msa(dst, stride_dst,
                                                      src, stride_src,
                                                      sao_offset_val, height);
        }
        break;
    }
}