/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
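/* Deblocking of one 8-pixel luma segment across a horizontal edge.
 * The two 4-pixel halves carry independent tc and pcm flags; per half,
 * the strong filter, the weak filter or no filtering is selected. */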
static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
                                         int32_t beta, int32_t *tc,
                                         uint8_t *p_is_pcm, uint8_t *q_is_pcm)
{
    uint8_t *p3 = src - (stride << 2);
    uint8_t *p2 = src - ((stride << 1) + stride);
    uint8_t *p1 = src - (stride << 1);
    uint8_t *p0 = src - stride;
    uint8_t *q0 = src;
    uint8_t *q1 = src + stride;
    uint8_t *q2 = src + (stride << 1);
    uint8_t *q3 = src + (stride << 1) + stride;
    uint8_t flag0, flag1;
    int32_t dp00, dq00, dp30, dq30, d00, d30;
    int32_t dp04, dq04, dp34, dq34, d04, d34;
    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
    uint64_t dst_val0, dst_val1;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5;
    v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
    v16u8 zero = { 0 };
    v8u16 temp0, temp1;
    v8i16 temp2;
    v8i16 tc_pos, tc_neg;
    v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
    v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;

    dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
    dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
    dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
    dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
    d00 = dp00 + dq00;
    d30 = dp30 + dq30;
    p_is_pcm0 = p_is_pcm[0];
    q_is_pcm0 = q_is_pcm[0];
    dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
    dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
    dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
    dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
    d04 = dp04 + dq04;
    d34 = dp34 + dq34;
    p_is_pcm4 = p_is_pcm[1];
    q_is_pcm4 = q_is_pcm[1];

    if (!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) {
        if (!(d00 + d30 >= beta) || !(d04 + d34 >= beta)) {
            p3_src = LD_UH(p3);
            p2_src = LD_UH(p2);
            p1_src = LD_UH(p1);
            p0_src = LD_UH(p0);
            q0_src = LD_UH(q0);
            q1_src = LD_UH(q1);
            q2_src = LD_UH(q2);
            q3_src = LD_UH(q3);

            tc0 = tc[0];
            beta30 = beta >> 3;
            beta20 = beta >> 2;
            tc250 = ((tc0 * 5 + 1) >> 1);
            tc4 = tc[1];
            tc254 = ((tc4 * 5 + 1) >> 1);

            flag0 = (abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
                     abs(p0[0] - q0[0]) < tc250 &&
                     abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
                     abs(p0[3] - q0[3]) < tc250 &&
                     (d00 << 1) < beta20 && (d30 << 1) < beta20);
            cmp0 = __msa_fill_d(flag0);

            flag1 = (abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
                     abs(p0[4] - q0[4]) < tc254 &&
                     abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
                     abs(p0[7] - q0[7]) < tc254 &&
                     (d04 << 1) < beta20 && (d34 << 1) < beta20);
            cmp1 = __msa_fill_d(flag1);
            cmp2 = __msa_ilvev_d(cmp1, cmp0);
            cmp2 = __msa_ceqi_d(cmp2, 0);

            ILVR_B8_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
                       zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
                       p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src,
                       q3_src);

            cmp0 = (v2i64) __msa_fill_h(tc0);
            cmp1 = (v2i64) __msa_fill_h(tc4);
            tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
            tc_pos <<= 1;
            tc_neg = -tc_pos;

            /* strong filter */
            temp0 = (p1_src + p0_src + q0_src);
            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p2_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst0 = (v16u8) (temp2 + (v8i16) p2_src);

            temp1 = temp0 + p2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - p1_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst1 = (v16u8) (temp2 + (v8i16) p1_src);

            temp1 = (temp0 << 1) + p2_src + q1_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p0_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst2 = (v16u8) (temp2 + (v8i16) p0_src);

            cmp0 = __msa_fill_d(p_is_pcm0);
            cmp1 = __msa_fill_d(p_is_pcm4);
            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);

            temp0 = (q1_src + p0_src + q0_src);

            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q2_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst5 = (v16u8) (temp2 + (v8i16) q2_src);

            temp1 = temp0 + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - q1_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst4 = (v16u8) (temp2 + (v8i16) q1_src);

            temp1 = (temp0 << 1) + p1_src + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q0_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst3 = (v16u8) (temp2 + (v8i16) q0_src);

            cmp0 = __msa_fill_d(q_is_pcm0);
            cmp1 = __msa_fill_d(q_is_pcm4);
            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);

            /* weak filter */
            tc_pos >>= 1;
            tc_neg = -tc_pos;

            diff0 = (v8i16) (q0_src - p0_src);
            diff1 = (v8i16) (q1_src - p1_src);
            diff0 = (diff0 << 3) + diff0;
            diff1 = (diff1 << 1) + diff1;
            delta0 = diff0 - diff1;
            delta0 = __msa_srari_h(delta0, 4);

            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
            abs_delta0 = (v8u16) abs_delta0 < temp1;

            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);

            temp0 = (v8u16) (delta0 + p0_src);
            temp0 = (v8u16) CLIP_SH_0_255(temp0);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
                                        (v16u8) p_is_pcm_vec);

            temp2 = (v8i16) (q0_src - delta0);
            temp2 = CLIP_SH_0_255(temp2);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) q_is_pcm_vec);

            tmp = (beta + (beta >> 1)) >> 3;
            cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp));
            cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp));
            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

            cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
            cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

            tc_pos >>= 1;
            tc_neg = -tc_pos;

            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
            delta1 -= (v8i16) p1_src;
            delta1 += delta0;
            delta1 >>= 1;
            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
            delta1 = (v8i16) p1_src + (v8i16) delta1;
            delta1 = CLIP_SH_0_255(delta1);
            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
                                          (v16u8) p_is_pcm_vec);

            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
            delta2 = delta2 - (v8i16) q1_src;
            delta2 = delta2 - delta0;
            delta2 = delta2 >> 1;
            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
            delta2 = (v8i16) q1_src + (v8i16) delta2;
            delta2 = CLIP_SH_0_255(delta2);
            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
                                          (v16u8) q_is_pcm_vec);

            delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
                                         (v16u8) abs_delta0);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
                                        (v16u8) abs_delta0);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) abs_delta0);
            delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
                                         (v16u8) abs_delta0);

            dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
            dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
            dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
            dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
            dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
            dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);

            cmp0 = __msa_fill_d(d00 + d30 >= beta);
            cmp1 = __msa_fill_d(d04 + d34 >= beta);
            cmp0 = __msa_ilvev_d(cmp1, cmp0);
            cmp0 = __msa_ceqi_d(cmp0, 0);

            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) cmp0);
            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) cmp0);
            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) cmp0);
            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) cmp0);
            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) cmp0);
            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) cmp0);

            PCKEV_B2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
            dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);

            dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
            dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);

            /* store the four p rows, then the two q rows */
            ST8x4_UB(dst0, dst1, p2, stride);
            p2 += (4 * stride);
            SD(dst_val0, p2);
            SD(dst_val1, p2 + stride);
        }
    }
}
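/* Deblocking of one 8-pixel luma segment across a vertical edge.
 * Same decision logic as the horizontal case; 8x8 transposes move the
 * columns into rows before filtering and back for the write-out. */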
static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
                                         int32_t beta, int32_t *tc,
                                         uint8_t *p_is_pcm, uint8_t *q_is_pcm)
{
    uint8_t *p3 = src;
    uint8_t *p2 = src + 3 * stride;
    uint8_t *p1 = src + (stride << 2);
    uint8_t *p0 = src + 7 * stride;
    uint8_t flag0, flag1;
    uint16_t tmp0, tmp1;
    uint32_t tmp2, tmp3;
    int32_t dp00, dq00, dp30, dq30, d00, d30;
    int32_t dp04, dq04, dp34, dq34, d04, d34;
    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
    v16u8 zero = { 0 };
    v8u16 temp0, temp1;
    v8i16 tc_pos, tc_neg;
    v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
    v8i16 temp2;
    v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;

    dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
    dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
    dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
    dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
    d00 = dp00 + dq00;
    d30 = dp30 + dq30;
    p_is_pcm0 = p_is_pcm[0];
    q_is_pcm0 = q_is_pcm[0];

    dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
    dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
    dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
    dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
    d04 = dp04 + dq04;
    d34 = dp34 + dq34;
    p_is_pcm4 = p_is_pcm[1];
    q_is_pcm4 = q_is_pcm[1];

    if (!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) {
        if (!(d00 + d30 >= beta) || !(d04 + d34 >= beta)) {
            src -= 4;
            LD_UH8(src, stride,
                   p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src,
                   q3_src);

            tc0 = tc[0];
            beta30 = beta >> 3;
            beta20 = beta >> 2;
            tc250 = ((tc0 * 5 + 1) >> 1);
            tc4 = tc[1];
            tc254 = ((tc4 * 5 + 1) >> 1);

            TRANSPOSE8x8_UB_UH(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
                               q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
                               q0_src, q1_src, q2_src, q3_src);

            flag0 = (abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
                     abs(p3[-1] - p3[0]) < tc250 &&
                     abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
                     abs(p2[-1] - p2[0]) < tc250 &&
                     (d00 << 1) < beta20 && (d30 << 1) < beta20);
            cmp0 = __msa_fill_d(flag0);

            flag1 = (abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
                     abs(p1[-1] - p1[0]) < tc254 &&
                     abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
                     abs(p0[-1] - p0[0]) < tc254 &&
                     (d04 << 1) < beta20 && (d34 << 1) < beta20);
            cmp1 = __msa_fill_d(flag1);
            cmp2 = __msa_ilvev_d(cmp1, cmp0);
            cmp2 = __msa_ceqi_d(cmp2, 0);

            ILVR_B8_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
                       zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
                       p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src,
                       q3_src);

            cmp0 = (v2i64) __msa_fill_h(tc0 << 1);
            cmp1 = (v2i64) __msa_fill_h(tc4 << 1);
            tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
            tc_neg = -tc_pos;

            /* strong filter */
            temp0 = (p1_src + p0_src + q0_src);

            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p2_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst0 = (v16u8) (temp2 + (v8i16) p2_src);

            temp1 = temp0 + p2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - p1_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst1 = (v16u8) (temp2 + (v8i16) p1_src);

            temp1 = (temp0 << 1) + p2_src + q1_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p0_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst2 = (v16u8) (temp2 + (v8i16) p0_src);

            cmp0 = __msa_fill_d(p_is_pcm0);
            cmp1 = __msa_fill_d(p_is_pcm4);
            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);

            temp0 = (q1_src + p0_src + q0_src);
            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q2_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst5 = (v16u8) (temp2 + (v8i16) q2_src);

            temp1 = temp0 + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - q1_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst4 = (v16u8) (temp2 + (v8i16) q1_src);

            temp1 = (temp0 << 1) + p1_src + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q0_src);
            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
            dst3 = (v16u8) (temp2 + (v8i16) q0_src);

            cmp0 = __msa_fill_d(q_is_pcm0);
            cmp1 = __msa_fill_d(q_is_pcm4);
            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);

            /* weak filter */
            tc_pos >>= 1;
            tc_neg = -tc_pos;

            diff0 = (v8i16) (q0_src - p0_src);
            diff1 = (v8i16) (q1_src - p1_src);
            diff0 = (v8i16) (diff0 << 3) + diff0;
            diff1 = (v8i16) (diff1 << 1) + diff1;
            delta0 = diff0 - diff1;
            delta0 = __msa_srari_h(delta0, 4);

            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
            abs_delta0 = (v8u16) abs_delta0 < temp1;

            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
            temp0 = (v8u16) delta0 + p0_src;
            temp0 = (v8u16) CLIP_SH_0_255(temp0);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
                                        (v16u8) p_is_pcm_vec);

            temp2 = (v8i16) q0_src - delta0;
            temp2 = CLIP_SH_0_255(temp2);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) q_is_pcm_vec);

            tmp = ((beta + (beta >> 1)) >> 3);
            cmp0 = __msa_fill_d(!p_is_pcm0 && (dp00 + dp30 < tmp));
            cmp1 = __msa_fill_d(!p_is_pcm4 && (dp04 + dp34 < tmp));
            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

            cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
            cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

            tc_pos >>= 1;
            tc_neg = -tc_pos;

            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
            delta1 -= (v8i16) p1_src;
            delta1 += delta0;
            delta1 >>= 1;
            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
            delta1 = (v8i16) p1_src + (v8i16) delta1;
            delta1 = CLIP_SH_0_255(delta1);
            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
                                          (v16u8) p_is_pcm_vec);

            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
            delta2 = delta2 - (v8i16) q1_src;
            delta2 = delta2 - delta0;
            delta2 = delta2 >> 1;
            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
            delta2 = (v8i16) q1_src + (v8i16) delta2;
            delta2 = CLIP_SH_0_255(delta2);
            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
                                          (v16u8) q_is_pcm_vec);
            delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
                                         (v16u8) abs_delta0);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
                                        (v16u8) abs_delta0);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) abs_delta0);
            delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
                                         (v16u8) abs_delta0);

            dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
            dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
            dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
            dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
            dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
            dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);

            cmp0 = __msa_fill_d(d00 + d30 >= beta);
            dst7 = (v16u8) __msa_fill_d(d04 + d34 >= beta);
            cmp0 = __msa_ilvev_d((v2i64) dst7, cmp0);
            dst6 = (v16u8) __msa_ceqi_d(cmp0, 0);

            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, dst6);
            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, dst6);
            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, dst6);
            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, dst6);
            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, dst6);
            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, dst6);

            PCKEV_B4_UB(dst0, dst0, dst1, dst1, dst2, dst2, dst3, dst3,
                        dst0, dst1, dst2, dst3);
            PCKEV_B2_UB(dst4, dst4, dst5, dst5, dst4, dst5);

            TRANSPOSE8x8_UB_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
                               dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

            /* six modified pixels per row: 4 bytes + 2 bytes */
            src += 1;

            tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
            tmp0 = __msa_copy_u_h((v8i16) dst0, 2);
            tmp3 = __msa_copy_u_w((v4i32) dst1, 0);
            tmp1 = __msa_copy_u_h((v8i16) dst1, 2);
            SW(tmp2, src);
            SH(tmp0, src + 4);
            src += stride;
            SW(tmp3, src);
            SH(tmp1, src + 4);
            src += stride;

            tmp2 = __msa_copy_u_w((v4i32) dst2, 0);
            tmp0 = __msa_copy_u_h((v8i16) dst2, 2);
            tmp3 = __msa_copy_u_w((v4i32) dst3, 0);
            tmp1 = __msa_copy_u_h((v8i16) dst3, 2);
            SW(tmp2, src);
            SH(tmp0, src + 4);
            src += stride;
            SW(tmp3, src);
            SH(tmp1, src + 4);
            src += stride;

            tmp2 = __msa_copy_u_w((v4i32) dst4, 0);
            tmp0 = __msa_copy_u_h((v8i16) dst4, 2);
            tmp3 = __msa_copy_u_w((v4i32) dst5, 0);
            tmp1 = __msa_copy_u_h((v8i16) dst5, 2);
            SW(tmp2, src);
            SH(tmp0, src + 4);
            src += stride;
            SW(tmp3, src);
            SH(tmp1, src + 4);
            src += stride;

            tmp2 = __msa_copy_u_w((v4i32) dst6, 0);
            tmp0 = __msa_copy_u_h((v8i16) dst6, 2);
            tmp3 = __msa_copy_u_w((v4i32) dst7, 0);
            tmp1 = __msa_copy_u_h((v8i16) dst7, 2);
            SW(tmp2, src);
            SH(tmp0, src + 4);
            src += stride;
            SW(tmp3, src);
            SH(tmp1, src + 4);
        }
    }
}
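/* Chroma deblocking across a horizontal edge: only p0 and q0 change,
 * each by a single delta clipped to +-tc. */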
static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride,
                                           int32_t *tc, uint8_t *p_is_pcm,
                                           uint8_t *q_is_pcm)
{
    uint8_t *p1_ptr = src - (stride << 1);
    uint8_t *p0_ptr = src - stride;
    uint8_t *q0_ptr = src;
    uint8_t *q1_ptr = src + stride;
    v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
    v8u16 p1, p0, q0, q1;
    v8i16 tc_pos, tc_neg;
    v16u8 zero = { 0 };
    v8i16 temp0, temp1, delta;

    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
        cmp0 = (v2i64) __msa_fill_h(tc[0]);
        cmp1 = (v2i64) __msa_fill_h(tc[1]);
        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
        tc_neg = -tc_pos;

        cmp0 = __msa_fill_d(p_is_pcm[0]);
        cmp1 = __msa_fill_d(p_is_pcm[1]);
        p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

        cmp0 = __msa_fill_d(q_is_pcm[0]);
        cmp1 = __msa_fill_d(q_is_pcm[1]);
        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

        p1 = LD_UH(p1_ptr);
        p0 = LD_UH(p0_ptr);
        q0 = LD_UH(q0_ptr);
        q1 = LD_UH(q1_ptr);
        ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1);

        temp0 = (v8i16) (q0 - p0);
        temp1 = (v8i16) (p1 - q1);
        temp0 <<= 2;
        temp0 += temp1;
        delta = __msa_srari_h((v8i16) temp0, 3);
        delta = CLIP_SH(delta, tc_neg, tc_pos);

        temp0 = (v8i16) ((v8i16) p0 + delta);
        temp0 = CLIP_SH_0_255(temp0);
        temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
                                    (v16u8) p_is_pcm_vec);

        temp1 = (v8i16) ((v8i16) q0 - delta);
        temp1 = CLIP_SH_0_255(temp1);
        temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
                                    (v16u8) q_is_pcm_vec);

        tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
        temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
        temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);

        temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0);
        ST8x2_UB(temp0, p0_ptr, stride);
    }
}
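/* Chroma deblocking across a vertical edge: 8 rows are loaded and
 * transposed so p1/p0/q0/q1 become vectors; the two modified bytes
 * per row are stored back. */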
static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride,
                                           int32_t *tc, uint8_t *p_is_pcm,
                                           uint8_t *q_is_pcm)
{
    v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8u16 p1, p0, q0, q1;
    v8i16 tc_pos, tc_neg;
    v16u8 zero = { 0 };
    v8i16 temp0, temp1, delta;

    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
        cmp0 = (v2i64) __msa_fill_h(tc[0]);
        cmp1 = (v2i64) __msa_fill_h(tc[1]);
        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
        tc_neg = -tc_pos;

        cmp0 = __msa_fill_d(p_is_pcm[0]);
        cmp1 = __msa_fill_d(p_is_pcm[1]);
        p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

        cmp0 = __msa_fill_d(q_is_pcm[0]);
        cmp1 = __msa_fill_d(q_is_pcm[1]);
        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

        src -= 2;
        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
        TRANSPOSE8x4_UB_UH(src0, src1, src2, src3, src4, src5, src6, src7,
                           p1, p0, q0, q1);
        ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1);

        temp0 = (v8i16) (q0 - p0);
        temp1 = (v8i16) (p1 - q1);
        temp0 <<= 2;
        temp0 += temp1;
        delta = __msa_srari_h((v8i16) temp0, 3);
        delta = CLIP_SH(delta, tc_neg, tc_pos);

        temp0 = (v8i16) ((v8i16) p0 + delta);
        temp0 = CLIP_SH_0_255(temp0);
        temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
                                    (v16u8) p_is_pcm_vec);

        temp1 = (v8i16) ((v8i16) q0 - delta);
        temp1 = CLIP_SH_0_255(temp1);
        temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
                                    (v16u8) q_is_pcm_vec);

        tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
        temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
        temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);

        temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);

        src += 1;
        ST2x4_UB(temp0, 0, src, stride);
        src += (4 * stride);
        ST2x4_UB(temp0, 4, src, stride);
    }
}
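/* SAO band filter, 4-pixel width: each pixel selects one of 32 bands by
 * its five MSBs; the four consecutive bands starting at sao_left_class
 * receive the signalled offsets. */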
static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
                                            uint8_t *src, int32_t src_stride,
                                            int32_t sao_left_class,
                                            int16_t *sao_offset_val,
                                            int32_t height)
{
    int32_t h_cnt;
    v16u8 src0, src1, src2, src3;
    v16i8 src0_r, src1_r;
    v16i8 offset, offset_val, mask;
    v16i8 offset0 = { 0 };
    v16i8 offset1 = { 0 };
    v16i8 zero = { 0 };
    v8i16 temp0, temp1, dst0, dst1;

    offset_val = LD_SB(sao_offset_val + 1);
    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);

    offset_val = __msa_pckev_b(offset_val, offset_val);
    offset1 = (v16i8) __msa_insve_w((v4i32) offset1, 3, (v4i32) offset_val);
    offset0 = __msa_sld_b(offset1, offset0, 28 - ((sao_left_class) & 31));
    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));

    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
        SWAP(offset0, offset1);
    }

    for (h_cnt = height >> 2; h_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        ILVEV_D2_SB(src0, src1, src2, src3, src0_r, src1_r);

        src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
        mask = __msa_srli_b(src0_r, 3);
        offset = __msa_vshf_b(mask, offset1, offset0);

        UNPCK_SB_SH(offset, temp0, temp1);
        ILVRL_B2_SH(zero, src0_r, dst0, dst1);
        ADD2(dst0, temp0, dst1, temp1, dst0, dst1);
        CLIP_SH2_0_255(dst0, dst1);
        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
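/* SAO band filter, 8-pixel width. */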
static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
                                            uint8_t *src, int32_t src_stride,
                                            int32_t sao_left_class,
                                            int16_t *sao_offset_val,
                                            int32_t height)
{
    int32_t h_cnt;
    v16u8 src0, src1, src2, src3;
    v16i8 src0_r, src1_r, mask0, mask1;
    v16i8 offset, offset_val;
    v16i8 offset0 = { 0 };
    v16i8 offset1 = { 0 };
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 temp0, temp1, temp2, temp3;

    offset_val = LD_SB(sao_offset_val + 1);
    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
    offset_val = __msa_pckev_b(offset_val, offset_val);
    offset1 = (v16i8) __msa_insve_w((v4i32) offset1, 3, (v4i32) offset_val);
    offset0 = __msa_sld_b(offset1, offset0, 28 - ((sao_left_class) & 31));
    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));

    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
        SWAP(offset0, offset1);
    }

    for (h_cnt = height >> 2; h_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        ILVR_D2_SB(src1, src0, src3, src2, src0_r, src1_r);

        mask0 = __msa_srli_b(src0_r, 3);
        mask1 = __msa_srli_b(src1_r, 3);

        offset = __msa_vshf_b(mask0, offset1, offset0);
        UNPCK_SB_SH(offset, temp0, temp1);

        offset = __msa_vshf_b(mask1, offset1, offset0);
        UNPCK_SB_SH(offset, temp2, temp3);

        UNPCK_UB_SH(src0_r, dst0, dst1);
        UNPCK_UB_SH(src1_r, dst2, dst3);
        ADD4(dst0, temp0, dst1, temp1, dst2, temp2, dst3, temp3,
             dst0, dst1, dst2, dst3);
        CLIP_SH4_0_255(dst0, dst1, dst2, dst3);
        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst2);
        ST8x4_UB(dst0, dst2, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
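/* SAO band filter for blocks whose width is a multiple of 16. */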
static void hevc_sao_band_filter_16multiple_msa(uint8_t *dst,
                                                int32_t dst_stride,
                                                uint8_t *src,
                                                int32_t src_stride,
                                                int32_t sao_left_class,
                                                int16_t *sao_offset_val,
                                                int32_t width, int32_t height)
{
    int32_t h_cnt, w_cnt;
    v16u8 src0, src1, src2, src3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 out0, out1, out2, out3;
    v16i8 mask0, mask1, mask2, mask3;
    v16i8 tmp0, tmp1, tmp2, tmp3, offset_val;
    v16i8 offset0 = { 0 };
    v16i8 offset1 = { 0 };
    v16i8 zero = { 0 };
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

    offset_val = LD_SB(sao_offset_val + 1);
    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
    offset_val = __msa_pckev_b(offset_val, offset_val);
    offset1 = (v16i8) __msa_insve_w((v4i32) offset1, 3, (v4i32) offset_val);
    offset0 = __msa_sld_b(offset1, offset0, 28 - ((sao_left_class) & 31));
    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));

    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
        SWAP(offset0, offset1);
    }

    for (h_cnt = height >> 2; h_cnt--;) {
        for (w_cnt = 0; w_cnt < (width >> 4); w_cnt++) {
            LD_UB4(src + w_cnt * 16, src_stride, src0, src1, src2, src3);

            mask0 = __msa_srli_b((v16i8) src0, 3);
            mask1 = __msa_srli_b((v16i8) src1, 3);
            mask2 = __msa_srli_b((v16i8) src2, 3);
            mask3 = __msa_srli_b((v16i8) src3, 3);

            VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1,
                       tmp0, tmp1);
            VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3,
                       tmp2, tmp3);
            UNPCK_SB_SH(tmp0, temp0, temp1);
            UNPCK_SB_SH(tmp1, temp2, temp3);
            UNPCK_SB_SH(tmp2, temp4, temp5);
            UNPCK_SB_SH(tmp3, temp6, temp7);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            ILVRL_B2_SH(zero, src2, dst4, dst5);
            ILVRL_B2_SH(zero, src3, dst6, dst7);
            ADD4(dst0, temp0, dst1, temp1, dst2, temp2, dst3, temp3,
                 dst0, dst1, dst2, dst3);
            ADD4(dst4, temp4, dst5, temp5, dst6, temp6, dst7, temp7,
                 dst4, dst5, dst6, dst7);
            CLIP_SH4_0_255(dst0, dst1, dst2, dst3);
            CLIP_SH4_0_255(dst4, dst5, dst6, dst7);
            PCKEV_B4_SB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                        out0, out1, out2, out3);
            ST_SB4(out0, out1, out2, out3, dst + w_cnt * 16, dst_stride);
        }

        src += src_stride << 2;
        dst += dst_stride << 2;
    }
}
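/* SAO edge filter, horizontal (0 degree) direction, 4-pixel width: each
 * pixel is compared with its left and right neighbours and the offset
 * selected by the sign pattern is added. */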
static void hevc_sao_edge_filter_0degree_4width_msa(uint8_t *dst,
                                                    int32_t dst_stride,
                                                    uint8_t *src,
                                                    int32_t src_stride,
                                                    int16_t *sao_offset_val,
                                                    int32_t height)
{
    int32_t h_cnt;
    uint32_t dst_val0, dst_val1;
    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src_minus11;
    v16i8 zero = { 0 };
    v16i8 src_zero0, src_zero1, src_plus10, src_plus11, dst0;
    v8i16 offset_mask0, offset_mask1;
    v8i16 sao_offset, src00, src01;

    sao_offset = LD_SH(sao_offset_val);

    src -= 1;

    for (h_cnt = (height >> 1); h_cnt--;) {
        LD_UB2(src, src_stride, src_minus10, src_minus11);
        src += (2 * src_stride);

        SLDI_B2_0_SB(src_minus10, src_minus11, src_zero0, src_zero1, 1);
        SLDI_B2_0_SB(src_minus10, src_minus11, src_plus10, src_plus11, 2);
        ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
                   src_minus10, src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
                   src_zero1);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                   offset_mask0, offset_mask0, offset_mask0);
        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                   offset_mask1, offset_mask1, offset_mask1);
        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
             offset_mask1);
        CLIP_SH2_0_255(offset_mask0, offset_mask1);

        dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
        SW(dst_val0, dst);
        dst += dst_stride;
        SW(dst_val1, dst);
        dst += dst_stride;
    }
}
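/* SAO edge filter, horizontal (0 degree) direction, 8-pixel width. */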
static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst,
                                                    int32_t dst_stride,
                                                    uint8_t *src,
                                                    int32_t src_stride,
                                                    int16_t *sao_offset_val,
                                                    int32_t height)
{
    uint8_t *src_minus1;
    int32_t h_cnt;
    uint64_t dst_val0, dst_val1;
    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 zero = { 0 };
    v16i8 dst0, dst1;
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src_minus11;
    v16i8 src_zero0, src_plus10, src_zero1, src_plus11;
    v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1;

    sao_offset = LD_SH(sao_offset_val);

    for (h_cnt = (height >> 1); h_cnt--;) {
        src_minus1 = src - 1;
        LD_UB2(src_minus1, src_stride, src_minus10, src_minus11);

        SLDI_B2_0_SB(src_minus10, src_minus11, src_zero0, src_zero1, 1);
        SLDI_B2_0_SB(src_minus10, src_minus11, src_plus10, src_plus11, 2);
        ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
                   src_minus10, src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1,
                   src_zero0, src_zero1);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                   offset_mask0, offset_mask0, offset_mask0);
        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                   offset_mask1, offset_mask1, offset_mask1);
        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);

        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
             offset_mask1);
        CLIP_SH2_0_255(offset_mask0, offset_mask1);
        PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1,
                    dst0, dst1);

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
        SD(dst_val0, dst);
        dst += dst_stride;
        SD(dst_val1, dst);
        dst += dst_stride;

        src += (src_stride << 1);
    }
}
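/* SAO edge filter, horizontal (0 degree) direction, width a multiple of 16. */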
static void hevc_sao_edge_filter_0degree_16multiple_msa(uint8_t *dst,
                                                        int32_t dst_stride,
                                                        uint8_t *src,
                                                        int32_t src_stride,
                                                        int16_t *sao_offset_val,
                                                        int32_t width,
                                                        int32_t height)
{
    uint8_t *dst_ptr, *src_minus1;
    int32_t h_cnt, v_cnt;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 sao_offset;
    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    v16u8 diff_plus13;
    v16u8 src10, src11, src12, src13, dst0, dst1, dst2, dst3;
    v16u8 src_minus10, src_minus11, src_minus12, src_minus13;
    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    v16i8 src_zero0, src_zero1, src_zero2, src_zero3;
    v16i8 src_plus10, src_plus11, src_plus12, src_plus13;
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

    sao_offset = LD_SB(sao_offset_val);
    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    for (h_cnt = (height >> 2); h_cnt--;) {
        src_minus1 = src - 1;
        LD_UB4(src_minus1, src_stride,
               src_minus10, src_minus11, src_minus12, src_minus13);

        for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) {
            src_minus1 += 16;
            dst_ptr = dst + (v_cnt << 4);
            LD_UB4(src_minus1, src_stride, src10, src11, src12, src13);

            SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_zero0,
                       src_zero1, 1);
            SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_zero2,
                       src_zero3, 1);
            SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_plus10,
                       src_plus11, 2);
            SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_plus12,
                       src_plus13, 2);

            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
            cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
            cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
            cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
            cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
            cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
            cmp_plus13 = ((v16u8) src_zero3 == (v16u8) src_plus13);

            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);

            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
            cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
            cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
            cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
            cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
            cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
            cmp_plus13 = ((v16u8) src_plus13 < (v16u8) src_zero3);

            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);

            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                       offset_mask0, offset_mask0, offset_mask0);
            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                       offset_mask1, offset_mask1, offset_mask1);
            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask2,
                       offset_mask2, offset_mask2, offset_mask2);
            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask3,
                       offset_mask3, offset_mask3, offset_mask3);

            UNPCK_UB_SH(src_zero0, src0, src1);
            UNPCK_SB_SH(offset_mask0, temp0, temp1);
            UNPCK_UB_SH(src_zero1, src2, src3);
            UNPCK_SB_SH(offset_mask1, temp2, temp3);
            UNPCK_UB_SH(src_zero2, src4, src5);
            UNPCK_SB_SH(offset_mask2, temp4, temp5);
            UNPCK_UB_SH(src_zero3, src6, src7);
            UNPCK_SB_SH(offset_mask3, temp6, temp7);
            ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0,
                 temp1, temp2, temp3);
            ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4,
                 temp5, temp6, temp7);
            CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
            CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
            PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
                        dst0, dst1, dst2, dst3);

            src_minus10 = src10;
            ST_UB(dst0, dst_ptr);
            src_minus11 = src11;
            ST_UB(dst1, dst_ptr + dst_stride);
            src_minus12 = src12;
            ST_UB(dst2, dst_ptr + (dst_stride << 1));
            src_minus13 = src13;
            ST_UB(dst3, dst_ptr + (dst_stride * 3));
        }

        src += (src_stride << 2);
        dst += (dst_stride << 2);
    }
}
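/* SAO edge filter, vertical (90 degree) direction, 4-pixel width: the
 * neighbours are the pixels directly above and below. */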
static void hevc_sao_edge_filter_90degree_4width_msa(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    int32_t h_cnt;
    uint32_t dst_val0, dst_val1;
    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 zero = { 0 };
    v16i8 dst0;
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src_minus11, src10, src11;
    v16i8 src_zero0, src_zero1;
    v8i16 sao_offset, src00, src01, offset_mask0, offset_mask1;

    sao_offset = LD_SH(sao_offset_val);

    LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);

    for (h_cnt = (height >> 1); h_cnt--;) {
        LD_UB2(src + src_stride, src_stride, src10, src11);

        src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
        src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
        src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
        src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                   offset_mask0, offset_mask0, offset_mask0);
        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                   offset_mask1, offset_mask1, offset_mask1);
        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
             offset_mask1);
        CLIP_SH2_0_255(offset_mask0, offset_mask1);
        dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);

        src_minus10 = src10;
        src_minus11 = src11;

        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
        SW(dst_val0, dst);
        dst += dst_stride;
        SW(dst_val1, dst);
        dst += dst_stride;

        src += (src_stride << 1);
    }
}
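/* SAO edge filter, vertical (90 degree) direction, 8-pixel width. */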
static void hevc_sao_edge_filter_90degree_8width_msa(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    int32_t h_cnt;
    uint64_t dst_val0, dst_val1;
    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 zero = { 0 };
    v16i8 src_zero0, src_zero1, dst0, dst1;
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src_minus11, src10, src11;
    v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1;

    sao_offset = LD_SH(sao_offset_val);

    LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);

    for (h_cnt = (height >> 1); h_cnt--;) {
        LD_UB2(src + src_stride, src_stride, src10, src11);

        src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
        src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
        src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
        src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                   offset_mask0, offset_mask0, offset_mask0);
        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                   offset_mask1, offset_mask1, offset_mask1);
        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
             offset_mask1);
        CLIP_SH2_0_255(offset_mask0, offset_mask1);
        PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1,
                    dst0, dst1);

        src_minus10 = src10;
        src_minus11 = src11;

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
        SD(dst_val0, dst);
        dst += dst_stride;
        SD(dst_val1, dst);
        dst += dst_stride;

        src += (src_stride << 1);
    }
}
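/* SAO edge filter, vertical (90 degree) direction, width a multiple of 16. */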
static void hevc_sao_edge_filter_90degree_16multiple_msa(uint8_t *dst,
                                                         int32_t dst_stride,
                                                         uint8_t *src,
                                                         int32_t src_stride,
                                                         int16_t *sao_offset_val,
                                                         int32_t width,
                                                         int32_t height)
{
    uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;
    int32_t h_cnt, v_cnt;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    v16u8 diff_plus13;
    v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
    v16u8 src12, dst2, src13, dst3;
    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

    sao_offset = LD_SB(sao_offset_val);
    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) {
        src = src_orig + (v_cnt << 4);
        dst = dst_orig + (v_cnt << 4);

        LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);

        for (h_cnt = (height >> 2); h_cnt--;) {
            LD_UB4(src + src_stride, src_stride, src10, src11, src12, src13);

            cmp_minus10 = (src_minus11 == src_minus10);
            cmp_plus10 = (src_minus11 == src10);
            cmp_minus11 = (src10 == src_minus11);
            cmp_plus11 = (src10 == src11);
            cmp_minus12 = (src11 == src10);
            cmp_plus12 = (src11 == src12);
            cmp_minus13 = (src12 == src11);
            cmp_plus13 = (src12 == src13);

            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);

            cmp_minus10 = (src_minus10 < src_minus11);
            cmp_plus10 = (src10 < src_minus11);
            cmp_minus11 = (src_minus11 < src10);
            cmp_plus11 = (src11 < src10);
            cmp_minus12 = (src10 < src11);
            cmp_plus12 = (src12 < src11);
            cmp_minus13 = (src11 < src12);
            cmp_plus13 = (src13 < src12);

            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);

            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);

            UNPCK_UB_SH(src_minus11, src0, src1);
            UNPCK_SB_SH(offset_mask0, temp0, temp1);
            UNPCK_UB_SH(src10, src2, src3);
            UNPCK_SB_SH(offset_mask1, temp2, temp3);
            UNPCK_UB_SH(src11, src4, src5);
            UNPCK_SB_SH(offset_mask2, temp4, temp5);
            UNPCK_UB_SH(src12, src6, src7);
            UNPCK_SB_SH(offset_mask3, temp6, temp7);
            ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0,
                 temp1, temp2, temp3);
            ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4,
                 temp5, temp6, temp7);
            CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
            CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
            PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
                        dst0, dst1, dst2, dst3);

            src_minus10 = src12;
            src_minus11 = src13;

            ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);

            src += (src_stride << 2);
            dst += (dst_stride << 2);
        }
    }
}
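/* SAO edge filter, diagonal (45 degree) direction, 4-pixel width: the
 * neighbours are the top-left and bottom-right pixels. */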
static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint8_t *src_orig;
    int32_t h_cnt;
    uint32_t dst_val0, dst_val1;
    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 zero = { 0 };
    v16u8 cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus11, src10, src11;
    v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0;
    v8i16 sao_offset, src00, src01, offset_mask0, offset_mask1;

    sao_offset = LD_SH(sao_offset_val);

    src_orig = src - 1;
    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);

    for (h_cnt = (height >> 1); h_cnt--;) {
        LD_UB2(src_orig + src_stride, src_stride, src10, src11);

        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
        SLDI_B2_0_SB(src10, src11, src_plus0, src_plus1, 2);

        ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
                   src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
                   src_zero1);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                   offset_mask0, offset_mask0, offset_mask0);
        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                   offset_mask1, offset_mask1, offset_mask1);
        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
             offset_mask1);
        CLIP_SH2_0_255(offset_mask0, offset_mask1);

        dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);

        src_minus10 = src10;
        src_minus11 = src11;

        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
        SW(dst_val0, dst);
        dst += dst_stride;
        SW(dst_val1, dst);
        dst += dst_stride;

        src_orig += (src_stride << 1);
    }
}
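/* SAO edge filter, diagonal (45 degree) direction, 8-pixel width. */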
static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint8_t *src_orig;
    int32_t h_cnt;
    uint64_t dst_val0, dst_val1;
    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 zero = { 0 };
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src10, src_minus11, src11;
    v16i8 src_zero0, src_plus10, src_zero1, src_plus11, dst0, dst1;
    v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1;

    sao_offset = LD_SH(sao_offset_val);

    src_orig = src - 1;
    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);

    for (h_cnt = (height >> 1); h_cnt--;) {
        LD_UB2(src_orig + src_stride, src_stride, src10, src11);

        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
        SLDI_B2_0_SB(src10, src11, src_plus10, src_plus11, 2);

        ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
                   src_minus10, src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1,
                   src_zero0, src_zero1);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                   offset_mask0, offset_mask0, offset_mask0);
        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                   offset_mask1, offset_mask1, offset_mask1);
        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
             offset_mask1);
        CLIP_SH2_0_255(offset_mask0, offset_mask1);
        PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1,
                    dst0, dst1);

        src_minus10 = src10;
        src_minus11 = src11;

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
        SD(dst_val0, dst);
        dst += dst_stride;
        SD(dst_val1, dst);
        dst += dst_stride;

        src_orig += (src_stride << 1);
    }
}
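/* SAO edge filter, diagonal (45 degree) direction, width a multiple of 16. */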
static void hevc_sao_edge_filter_45degree_16multiple_msa(uint8_t *dst,
                                                         int32_t dst_stride,
                                                         uint8_t *src,
                                                         int32_t src_stride,
                                                         int16_t *sao_offset_val,
                                                         int32_t width,
                                                         int32_t height)
{
    uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;
    int32_t h_cnt, v_cnt;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    v16u8 diff_plus13, src_minus14, src_plus13;
    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
    v16u8 src12, src_minus12, dst2, src13, src_minus13, dst3;
    v16i8 src_zero0, src_plus10, src_zero1, src_plus11, src_zero2, src_plus12;
    v16i8 src_zero3, sao_offset;
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

    sao_offset = LD_SB(sao_offset_val);
    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    for (h_cnt = (height >> 2); h_cnt--;) {
        src_orig = src - 1;
        dst_orig = dst;
        LD_UB4(src_orig, src_stride,
               src_minus11, src_minus12, src_minus13, src_minus14);

        for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) {
            src_minus10 = LD_UB(src_orig - src_stride);
            LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
            src_plus13 = LD_UB(src + 1 + (v_cnt << 4) + (src_stride << 2));

            SLDI_B2_SB(src10, src11, src_minus11, src_minus12, src_zero0,
                       src_zero1, 1);
            SLDI_B2_SB(src12, src13, src_minus13, src_minus14, src_zero2,
                       src_zero3, 1);
            SLDI_B2_SB(src11, src12, src_minus12, src_minus13, src_plus10,
                       src_plus11, 2);

            src_plus12 = __msa_sldi_b((v16i8) src13, (v16i8) src_minus14, 2);

            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
            cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
            cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
            cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
            cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
            cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
            cmp_plus13 = ((v16u8) src_zero3 == src_plus13);

            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);

            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
            cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
            cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
            cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
            cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
            cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
            cmp_plus13 = (src_plus13 < (v16u8) src_zero3);

            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);

            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;

            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);

            UNPCK_UB_SH(src_zero0, src0, src1);
            UNPCK_SB_SH(offset_mask0, temp0, temp1);
            UNPCK_UB_SH(src_zero1, src2, src3);
            UNPCK_SB_SH(offset_mask1, temp2, temp3);
            UNPCK_UB_SH(src_zero2, src4, src5);
            UNPCK_SB_SH(offset_mask2, temp4, temp5);
            UNPCK_UB_SH(src_zero3, src6, src7);
            UNPCK_SB_SH(offset_mask3, temp6, temp7);
            ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0,
                 temp1, temp2, temp3);
            ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4,
                 temp5, temp6, temp7);
            CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
            CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
            PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4,
                        temp7, temp6, dst0, dst1, dst2, dst3);

            src_minus11 = src10;
            src_minus12 = src11;
            src_minus13 = src12;
            src_minus14 = src13;

            ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
            src_orig += 16;
            dst_orig += 16;
        }

        src += (src_stride << 2);
        dst += (dst_stride << 2);
    }
}
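/* SAO edge filter, anti-diagonal (135 degree) direction, 4-pixel width:
 * the neighbours are the top-right and bottom-left pixels. */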
static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst,
1635
int16_t *sao_offset_val,
1640
uint32_t dst_val0, dst_val1;
1641
v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
1642
v16u8 const1 = (v16u8) __msa_ldi_b(1);
1644
v16i8 src_zero0, src_zero1, dst0;
1645
v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1646
v16u8 src_minus10, src10, src_minus11, src11;
1647
v8i16 offset_mask0, offset_mask1, sao_offset, src00, src01;
1649
sao_offset = LD_SH(sao_offset_val);
1652
LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
1654
for (h_cnt = (height >> 1); h_cnt--;) {
1655
LD_UB2(src_orig + src_stride, src_stride, src10, src11);
1657
SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
1658
SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
1660
ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
1662
ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
1665
cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1666
diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1667
cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1668
diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1670
cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1671
diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1672
cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1673
diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1675
offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1676
offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                   offset_mask0, offset_mask0, offset_mask0);
        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                   offset_mask1, offset_mask1, offset_mask1);
        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
             offset_mask1);
        CLIP_SH2_0_255(offset_mask0, offset_mask1);
        dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);

        src_minus10 = src10;
        src_minus11 = src11;

        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);

        SW(dst_val0, dst);
        dst += dst_stride;
        SW(dst_val1, dst);

        dst += dst_stride;
        src_orig += (src_stride << 1);
    }
}
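
/* 135 degree edge filter for an 8-pixel-wide column; identical to the
 * 4-width variant except that the packed result rows are written out as
 * 64-bit stores */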
static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst,
                                                      int32_t dst_stride,
                                                      uint8_t *src,
                                                      int32_t src_stride,
                                                      int16_t *sao_offset_val,
                                                      int32_t height)
{
    uint8_t *src_orig;
    int32_t h_cnt;
    uint64_t dst_val0, dst_val1;
    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 zero = { 0 };
    v16i8 src_zero0, src_zero1, dst0, dst1;
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src10, src_minus11, src11;
    v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1;

    sao_offset = LD_SH(sao_offset_val);

    src_orig = src - 1;
    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);

    for (h_cnt = (height >> 1); h_cnt--;) {
        LD_UB2(src_orig + src_stride, src_stride, src10, src11);

        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
        SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
        ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
                   src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
                   src_zero1);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                   offset_mask0, offset_mask0, offset_mask0);
        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                   offset_mask1, offset_mask1, offset_mask1);
        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
             offset_mask1);
        CLIP_SH2_0_255(offset_mask0, offset_mask1);
        PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1,
                    dst0, dst1);

        src_minus10 = src10;
        src_minus11 = src11;

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);

        SD(dst_val0, dst);
        dst += dst_stride;
        SD(dst_val1, dst);

        dst += dst_stride;
        src_orig += (src_stride << 1);
    }
}
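
/* 135 degree edge filter for widths that are multiples of 16: a 16x4 block
 * is processed per inner iteration, and the row vectors are carried across
 * iterations so the shifted neighbour views can be rebuilt with element
 * slides instead of extra unaligned loads */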
static void hevc_sao_edge_filter_135degree_16multiple_msa(uint8_t *dst,
                                                          int32_t dst_stride,
                                                          uint8_t *src,
                                                          int32_t src_stride,
                                                          int16_t *
                                                          sao_offset_val,
                                                          int32_t width,
                                                          int32_t height)
{
    uint8_t *src_orig, *dst_orig;
    int32_t h_cnt, v_cnt;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
    v16u8 cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
    v16u8 diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
    v16u8 diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
    v16u8 src_plus10, src_plus11, src_plus12, src_plus13;
    v16i8 src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

    sao_offset = LD_SB(sao_offset_val);
    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    for (h_cnt = (height >> 2); h_cnt--;) {
        src_orig = src - 1;
        dst_orig = dst;

        LD_UB4(src_orig, src_stride,
               src_minus11, src_plus10, src_plus11, src_plus12);

        for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) {
            src_minus10 = LD_UB(src_orig + 2 - src_stride);
            LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
            src_plus13 = LD_UB(src_orig + (src_stride << 2));
            src_orig += 16;
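
            /* src_minus11 and src_plus10..12 hold the current rows, while
             * src10..13 are the next 16 columns; SLDI concatenations peel
             * off the centre (offset 1) and upper-right (offset 2) views
             * without further loads */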
            src_zero0 = __msa_sldi_b((v16i8) src10, (v16i8) src_minus11, 1);
            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
            cmp_plus10 = ((v16u8) src_zero0 == src_plus10);

            src_zero1 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 1);
            src_minus11 = (v16u8) __msa_sldi_b((v16i8) src10,
                                               (v16i8) src_minus11, 2);
            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
            cmp_plus11 = ((v16u8) src_zero1 == src_plus11);

            src_zero2 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 1);
            src_minus12 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 2);
            cmp_minus12 = ((v16u8) src_zero2 == (v16u8) src_minus12);
            cmp_plus12 = ((v16u8) src_zero2 == src_plus12);

            src_zero3 = __msa_sldi_b((v16i8) src13, (v16i8) src_plus12, 1);
            src_minus13 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 2);
            cmp_minus13 = ((v16u8) src_zero3 == (v16u8) src_minus13);
            cmp_plus13 = ((v16u8) src_zero3 == src_plus13);

            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);

            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
            cmp_plus10 = (src_plus10 < (v16u8) src_zero0);
            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
            cmp_plus11 = (src_plus11 < (v16u8) src_zero1);
            cmp_minus12 = ((v16u8) src_minus12 < (v16u8) src_zero2);
            cmp_plus12 = (src_plus12 < (v16u8) src_zero2);
            cmp_minus13 = ((v16u8) src_minus13 < (v16u8) src_zero3);
            cmp_plus13 = (src_plus13 < (v16u8) src_zero3);
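
            /* as in the narrow variants, BMNZ turns the masks into per-byte
             * signs: -1 where the neighbour is larger, +1 where it is
             * smaller, 0 where equal */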
            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);

            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;

            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);

            UNPCK_UB_SH(src_zero0, src0, src1);
            UNPCK_SB_SH(offset_mask0, temp0, temp1);
            UNPCK_UB_SH(src_zero1, src2, src3);
            UNPCK_SB_SH(offset_mask1, temp2, temp3);
            UNPCK_UB_SH(src_zero2, src4, src5);
            UNPCK_SB_SH(offset_mask2, temp4, temp5);
            UNPCK_UB_SH(src_zero3, src6, src7);
            UNPCK_SB_SH(offset_mask3, temp6, temp7);

            ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0,
                 temp1, temp2, temp3);
            ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4,
                 temp5, temp6, temp7);
            CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
            CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
            PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
                        dst0, dst1, dst2, dst3);

            src_minus11 = src10;
            src_plus10 = src11;
            src_plus11 = src12;
            src_plus12 = src13;

            ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
            dst_orig += 16;
        }

        src += (src_stride << 2);
        dst += (dst_stride << 2);
    }
}
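
/* Public entry points, wired into HEVCDSPContext for 8-bit content; they
 * forward straight to the static MSA kernels above. */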
void ff_hevc_loop_filter_luma_h_8_msa(uint8_t *src,
                                      ptrdiff_t src_stride,
                                      int32_t beta, int32_t *tc,
                                      uint8_t *no_p, uint8_t *no_q)
{
    hevc_loopfilter_luma_hor_msa(src, src_stride, beta, tc, no_p, no_q);
}

void ff_hevc_loop_filter_luma_v_8_msa(uint8_t *src,
                                      ptrdiff_t src_stride,
                                      int32_t beta, int32_t *tc,
                                      uint8_t *no_p, uint8_t *no_q)
{
    hevc_loopfilter_luma_ver_msa(src, src_stride, beta, tc, no_p, no_q);
}

void ff_hevc_loop_filter_chroma_h_8_msa(uint8_t *src,
                                        ptrdiff_t src_stride,
                                        int32_t *tc, uint8_t *no_p,
                                        uint8_t *no_q)
{
    hevc_loopfilter_chroma_hor_msa(src, src_stride, tc, no_p, no_q);
}

void ff_hevc_loop_filter_chroma_v_8_msa(uint8_t *src,
                                        ptrdiff_t src_stride,
                                        int32_t *tc, uint8_t *no_p,
                                        uint8_t *no_q)
{
    hevc_loopfilter_chroma_ver_msa(src, src_stride, tc, no_p, no_q);
}
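
/* SAO band filter dispatcher: the bulk of the width goes through the
 * 16-pixel kernel, then 8-wide and 4-wide passes mop up the remainder. */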
void ff_hevc_sao_band_filter_0_8_msa(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride_dst, ptrdiff_t stride_src,
                                     int16_t *sao_offset_val, int sao_left_class,
                                     int width, int height)
{
    if (width >> 4) {
        hevc_sao_band_filter_16multiple_msa(dst, stride_dst, src, stride_src,
                                            sao_left_class, sao_offset_val,
                                            width - (width % 16), height);
        dst += width - (width % 16);
        src += width - (width % 16);
        width %= 16;
    }

    if (width >> 3) {
        hevc_sao_band_filter_8width_msa(dst, stride_dst, src, stride_src,
                                        sao_left_class, sao_offset_val, height);
        dst += 8;
        src += 8;
        width %= 8;
    }

    if (width) {
        hevc_sao_band_filter_4width_msa(dst, stride_dst, src, stride_src,
                                        sao_left_class, sao_offset_val, height);
    }
}
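
/* SAO edge filter dispatcher: eo selects the direction (0 horizontal,
 * 1 vertical, 2 45-degree, 3 135-degree in FFmpeg's EO class order) and
 * the width is split 16/8/4 as in the band filter.  stride_src matches
 * the (2 * 64 + 32)-byte row pitch of the decoder's SAO edge buffer. */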
void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src,
                                   ptrdiff_t stride_dst,
                                   int16_t *sao_offset_val,
                                   int eo, int width, int height)
{
    ptrdiff_t stride_src = (2 * 64 + 32) / sizeof(uint8_t);

    switch (eo) {
    case 0:
        if (width >> 4) {
            hevc_sao_edge_filter_0degree_16multiple_msa(dst, stride_dst,
                                                        src, stride_src,
                                                        sao_offset_val,
                                                        width - (width % 16),
                                                        height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_0degree_8width_msa(dst, stride_dst,
                                                    src, stride_src,
                                                    sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_0degree_4width_msa(dst, stride_dst,
                                                    src, stride_src,
                                                    sao_offset_val, height);
        }
        break;

    case 1:
        if (width >> 4) {
            hevc_sao_edge_filter_90degree_16multiple_msa(dst, stride_dst,
                                                         src, stride_src,
                                                         sao_offset_val,
                                                         width - (width % 16),
                                                         height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_90degree_8width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_90degree_4width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
        }
        break;

    case 2:
        if (width >> 4) {
            hevc_sao_edge_filter_45degree_16multiple_msa(dst, stride_dst,
                                                         src, stride_src,
                                                         sao_offset_val,
                                                         width - (width % 16),
                                                         height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_45degree_8width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_45degree_4width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
        }
        break;

    case 3:
        if (width >> 4) {
            hevc_sao_edge_filter_135degree_16multiple_msa(dst, stride_dst,
                                                          src, stride_src,
                                                          sao_offset_val,
                                                          width - (width % 16),
                                                          height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_135degree_8width_msa(dst, stride_dst,
                                                      src, stride_src,
                                                      sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_135degree_4width_msa(dst, stride_dst,
                                                      src, stride_src,
                                                      sao_offset_val, height);
        }
        break;
    }
}