2
* Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
4
* This file is part of FFmpeg.
6
* FFmpeg is free software; you can redistribute it and/or
7
* modify it under the terms of the GNU Lesser General Public
8
* License as published by the Free Software Foundation; either
9
* version 2.1 of the License, or (at your option) any later version.
11
* FFmpeg is distributed in the hope that it will be useful,
12
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
* Lesser General Public License for more details.
16
* You should have received a copy of the GNU Lesser General Public
17
* License along with FFmpeg; if not, write to the Free Software
18
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
#include "libavcodec/hevc.h"
22
#include "libavutil/mips/generic_macros_msa.h"
23
#include "hevcpred_mips.h"
25
/* Intra prediction angle step for vertical-ish modes 18..34
 * (HEVC spec Table 8-4, intraPredAngle), indexed by mode - 18. */
static const int8_t intra_pred_angle_up[17] = {
    -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
};
/* Intra prediction angle step for horizontal-ish modes 2..17
 * (HEVC spec Table 8-4, intraPredAngle), indexed by mode - 2. */
static const int8_t intra_pred_angle_low[16] = {
    32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
};
/* Compute two rows of 16-pixel HEVC planar prediction.
 * src0_r/src0_l : top reference row, unpacked to two v8i16 halves
 * tmp0          : splat of top-right reference sample
 * tmp1          : splat of bottom-left reference sample
 * vec0/vec1     : splats of the left reference samples for the two rows
 * mul_val_h0..3 : per-column horizontal weights for each 8-lane half
 * res0/res1     : output rows (packed bytes in v8i16)
 * mul_val_b0/b1 : vertical weights for the first row; the second row
 *                 uses (mul_val_b0 - 1) and (mul_val_b1 + 1)
 * round         : rounding shift applied by SRARI (5 for 16x16, 6 for 32x32)
 */
#define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,          \
                              mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3,  \
                              res0, res1, mul_val_b0, mul_val_b1, round)       \
{                                                                              \
    v8i16 res0_m, res1_m, res2_m, res3_m;                                      \
                                                                               \
    MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1,                 \
         mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m);                    \
                                                                               \
    res0_m += mul_val_h1 * tmp0;                                               \
    res1_m += mul_val_h3 * tmp0;                                               \
    res2_m += mul_val_h1 * tmp0;                                               \
    res3_m += mul_val_h3 * tmp0;                                               \
                                                                               \
    res0_m += mul_val_b0 * src0_r;                                             \
    res1_m += mul_val_b0 * src0_l;                                             \
    res2_m += (mul_val_b0 - 1) * src0_r;                                       \
    res3_m += (mul_val_b0 - 1) * src0_l;                                       \
                                                                               \
    res0_m += mul_val_b1 * tmp1;                                               \
    res1_m += mul_val_b1 * tmp1;                                               \
    res2_m += (mul_val_b1 + 1) * tmp1;                                         \
    res3_m += (mul_val_b1 + 1) * tmp1;                                         \
                                                                               \
    SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round);                        \
    PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1);                   \
}
61
static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top,
62
const uint8_t *src_left,
63
uint8_t *dst, int32_t stride,
68
v8i16 vec0, vec1, vec2;
71
src_data = LW(src_top);
72
SW4(src_data, src_data, src_data, src_data, dst, stride);
75
src_data = LW(src_left);
77
vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);
79
vec0 = __msa_fill_h(src_left[-1]);
80
vec1 = __msa_fill_h(src_top[0]);
82
vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
86
vec2 = CLIP_SH_0_255(vec2);
88
for (col = 0; col < 4; col++) {
89
dst[stride * col] = (uint8_t) vec2[col];
94
static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top,
95
const uint8_t *src_left,
96
uint8_t *dst, int32_t stride,
99
uint8_t *tmp_dst = dst;
101
uint16_t val0, val1, val2, val3;
103
v8i16 vec0, vec1, vec2;
106
src_data1 = LD(src_top);
108
for (row = 8; row--;) {
109
SD(src_data1, tmp_dst);
114
src_data1 = LD(src_left);
116
vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1);
118
vec0 = __msa_fill_h(src_left[-1]);
119
vec1 = __msa_fill_h(src_top[0]);
121
vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
125
vec2 = CLIP_SH_0_255(vec2);
134
dst[2 * stride] = val2;
135
dst[3 * stride] = val3;
142
dst[4 * stride] = val0;
143
dst[5 * stride] = val1;
144
dst[6 * stride] = val2;
145
dst[7 * stride] = val3;
149
static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top,
150
const uint8_t *src_left,
151
uint8_t *dst, int32_t stride,
155
uint8_t *tmp_dst = dst;
158
v8i16 vec0, vec1, vec2, vec3;
160
src = LD_UB(src_top);
162
for (row = 16; row--;) {
168
src = LD_UB(src_left);
170
vec0 = __msa_fill_h(src_left[-1]);
171
vec1 = __msa_fill_h(src_top[0]);
173
UNPCK_UB_SH(src, vec2, vec3);
174
SUB2(vec2, vec0, vec3, vec0, vec2, vec3);
179
ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
180
CLIP_SH2_0_255(vec2, vec3);
182
src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
184
for (col = 0; col < 16; col++) {
185
dst[stride * col] = src[col];
190
static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top,
191
const uint8_t *src_left,
192
uint8_t *dst, int32_t stride,
195
uint32_t val0, val1, val2, val3;
197
v8i16 src0_r, src_top_val, src_left_val;
200
val0 = src_left[0] * 0x01010101;
201
val1 = src_left[1] * 0x01010101;
202
val2 = src_left[2] * 0x01010101;
203
val3 = src_left[3] * 0x01010101;
204
SW4(val0, val1, val2, val3, dst, stride);
208
src0 = (v16i8) __msa_insert_w((v4i32) src0, 0, val0);
209
src_top_val = __msa_fill_h(src_top[-1]);
210
src_left_val = __msa_fill_h(src_left[0]);
212
src0_r = (v8i16) __msa_ilvr_b(zero, src0);
214
src0_r -= src_top_val;
216
src0_r += src_left_val;
217
src0_r = CLIP_SH_0_255(src0_r);
218
src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
219
val0 = __msa_copy_s_w((v4i32) src0, 0);
224
static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top,
225
const uint8_t *src_left,
226
uint8_t *dst, int32_t stride,
229
uint64_t val0, val1, val2, val3;
231
v8i16 src0_r, src_top_val, src_left_val;
234
val0 = src_left[0] * 0x0101010101010101;
235
val1 = src_left[1] * 0x0101010101010101;
236
val2 = src_left[2] * 0x0101010101010101;
237
val3 = src_left[3] * 0x0101010101010101;
238
SD4(val0, val1, val2, val3, dst, stride);
240
val0 = src_left[4] * 0x0101010101010101;
241
val1 = src_left[5] * 0x0101010101010101;
242
val2 = src_left[6] * 0x0101010101010101;
243
val3 = src_left[7] * 0x0101010101010101;
244
SD4(val0, val1, val2, val3, dst + 4 * stride, stride);
248
src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, val0);
249
src_top_val = __msa_fill_h(src_top[-1]);
250
src_left_val = __msa_fill_h(src_left[0]);
252
src0_r = (v8i16) __msa_ilvr_b(zero, src0);
254
src0_r -= src_top_val;
256
src0_r += src_left_val;
257
src0_r = CLIP_SH_0_255(src0_r);
258
src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
259
val0 = __msa_copy_s_d((v2i64) src0, 0);
264
static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top,
265
const uint8_t *src_left,
266
uint8_t *dst, int32_t stride,
269
uint8_t *tmp_dst = dst;
271
uint8_t inp0, inp1, inp2, inp3;
272
v16i8 src0, src1, src2, src3;
273
v8i16 src0_r, src0_l, src_left_val, src_top_val;
275
src_left_val = __msa_fill_h(src_left[0]);
277
for (row = 4; row--;) {
284
src0 = __msa_fill_b(inp0);
285
src1 = __msa_fill_b(inp1);
286
src2 = __msa_fill_b(inp2);
287
src3 = __msa_fill_b(inp3);
289
ST_SB4(src0, src1, src2, src3, tmp_dst, stride);
290
tmp_dst += (4 * stride);
294
src0 = LD_SB(src_top);
295
src_top_val = __msa_fill_h(src_top[-1]);
297
UNPCK_UB_SH(src0, src0_r, src0_l);
298
SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);
303
ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
304
CLIP_SH2_0_255(src0_r, src0_l);
305
src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
310
static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top,
311
const uint8_t *src_left,
312
uint8_t *dst, int32_t stride)
315
uint8_t inp0, inp1, inp2, inp3;
316
v16i8 src0, src1, src2, src3;
318
for (row = 0; row < 8; row++) {
319
inp0 = src_left[row * 4];
320
inp1 = src_left[row * 4 + 1];
321
inp2 = src_left[row * 4 + 2];
322
inp3 = src_left[row * 4 + 3];
324
src0 = __msa_fill_b(inp0);
325
src1 = __msa_fill_b(inp1);
326
src2 = __msa_fill_b(inp2);
327
src3 = __msa_fill_b(inp3);
329
ST_SB2(src0, src0, dst, 16);
331
ST_SB2(src1, src1, dst, 16);
333
ST_SB2(src2, src2, dst, 16);
335
ST_SB2(src3, src3, dst, 16);
340
static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top,
341
const uint8_t *src_left,
342
uint8_t *dst, int32_t stride,
345
uint8_t *tmp_dst = dst;
346
uint32_t addition = 0;
347
uint32_t val0, val1, val2;
351
v8u16 sum, vec0, vec1;
355
INSERT_W2_SB(val0, val1, src);
356
sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
357
sum = (v8u16) __msa_hadd_u_w(sum, sum);
358
sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
359
sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
360
addition = __msa_copy_u_w((v4i32) sum, 0);
361
store = (v16u8) __msa_fill_b(addition);
362
val0 = __msa_copy_u_w((v4i32) store, 0);
363
SW4(val0, val0, val0, val0, dst, stride)
366
ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
372
vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
373
store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
374
val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
375
store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
376
val0 = __msa_copy_u_w((v4i32) store, 0);
385
ADD2(val0, addition, val1, addition, val0, val1);
395
tmp_dst[stride * 1] = val0;
396
tmp_dst[stride * 2] = val1;
397
tmp_dst[stride * 3] = val2;
401
static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top,
402
const uint8_t *src_left,
403
uint8_t *dst, int32_t stride,
406
uint8_t *tmp_dst = dst;
407
uint32_t row, col, val;
408
uint32_t addition = 0;
412
v8u16 sum, vec0, vec1;
417
INSERT_D2_UB(val0, val1, src);
418
sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
419
sum = (v8u16) __msa_hadd_u_w(sum, sum);
420
sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
421
sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
422
sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
423
sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
424
addition = __msa_copy_u_w((v4i32) sum, 0);
425
store = (v16u8) __msa_fill_b(addition);
426
val0 = __msa_copy_u_d((v2i64) store, 0);
428
for (row = 8; row--;) {
434
ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
439
vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
440
store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
441
val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
442
store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
443
val0 = __msa_copy_u_d((v2i64) store, 0);
447
src = (v16u8) __msa_insert_d((v2i64) src, 0, val0);
448
vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src);
449
vec0 = (v8u16) __msa_fill_h(addition);
452
vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
454
for (col = 1; col < 8; col++) {
455
tmp_dst[stride * col] = vec1[col];
460
static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top,
461
const uint8_t *src_left,
462
uint8_t *dst, int32_t stride,
465
uint8_t *tmp_dst = dst;
466
uint32_t row, col, val;
467
uint32_t addition = 0;
468
v16u8 src_above1, store, src_left1;
469
v8u16 sum, sum_above, sum_left;
470
v8u16 vec0, vec1, vec2;
473
src_above1 = LD_UB(src_top);
474
src_left1 = LD_UB(src_left);
476
HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
477
sum = sum_above + sum_left;
478
sum = (v8u16) __msa_hadd_u_w(sum, sum);
479
sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
480
sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
481
sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
482
sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
483
addition = __msa_copy_u_w((v4i32) sum, 0);
484
store = (v16u8) __msa_fill_b(addition);
486
for (row = 16; row--;) {
492
vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store);
493
ILVRL_B2_UH(zero, src_above1, vec1, vec2);
494
ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
496
ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
497
SRARI_H2_UH(vec1, vec2, 2);
498
store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
499
val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
500
store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
501
ST_UB(store, tmp_dst);
503
ILVRL_B2_UH(zero, src_left1, vec1, vec2);
504
vec0 = (v8u16) __msa_fill_h(addition);
506
ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
507
SRARI_H2_UH(vec1, vec2, 2);
508
store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
510
for (col = 1; col < 16; col++) {
511
tmp_dst[stride * col] = store[col];
516
static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top,
517
const uint8_t *src_left,
518
uint8_t *dst, int32_t stride)
521
v16u8 src_above1, src_above2, store, src_left1, src_left2;
522
v8u16 sum_above1, sum_above2;
523
v8u16 sum_left1, sum_left2;
524
v8u16 sum, sum_above, sum_left;
526
LD_UB2(src_top, 16, src_above1, src_above2);
527
LD_UB2(src_left, 16, src_left1, src_left2);
528
HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
529
HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
530
sum_above = sum_above1 + sum_above2;
531
sum_left = sum_left1 + sum_left2;
532
sum = sum_above + sum_left;
533
sum = (v8u16) __msa_hadd_u_w(sum, sum);
534
sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
535
sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
536
sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
537
sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
538
store = (v16u8) __msa_splati_b((v16i8) sum, 0);
540
for (row = 16; row--;) {
541
ST_UB2(store, store, dst, 16);
543
ST_UB2(store, store, dst, 16);
548
static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top,
549
const uint8_t *src_left,
550
uint8_t *dst, int32_t stride)
553
v16i8 src_vec0, src_vec1;
554
v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
555
v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
556
v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
562
mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);
564
src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0);
565
src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1);
567
ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r);
568
SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
570
tmp0 = __msa_fill_h(src_top[4]);
571
tmp1 = __msa_fill_h(src_left[4]);
573
MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
574
res0, res1, res2, res3);
576
res0 += mul_val1 * tmp0;
577
res1 += mul_val1 * tmp0;
578
res2 += mul_val1 * tmp0;
579
res3 += mul_val1 * tmp0;
581
res0 += 3 * src_vec0_r;
582
res1 += 2 * src_vec0_r;
589
PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
590
SRARI_H2_SH(res0, res1, 3);
591
src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
592
ST4x4_UB(src_vec0, src_vec0, 0, 1, 2, 3, dst, stride);
595
static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
596
const uint8_t *src_left,
597
uint8_t *dst, int32_t stride)
600
v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
601
v8i16 src_vec0_r, src_vec1_r;
602
v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
603
v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
604
v8i16 tmp0, tmp1, tmp2;
605
v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
606
v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };
612
src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0);
613
src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1);
615
ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r);
616
SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
617
SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);
619
tmp0 = __msa_fill_h(src_top[8]);
620
tmp1 = __msa_fill_h(src_left[8]);
622
MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
623
res0, res1, res2, res3);
624
MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
625
res4, res5, res6, res7);
627
tmp2 = mul_val1 * tmp0;
637
res0 += 7 * src_vec0_r;
638
res1 += 6 * src_vec0_r;
639
res2 += 5 * src_vec0_r;
640
res3 += 4 * src_vec0_r;
641
res4 += 3 * src_vec0_r;
642
res5 += 2 * src_vec0_r;
654
SRARI_H4_SH(res0, res1, res2, res3, 4);
655
SRARI_H4_SH(res4, res5, res6, res7, 4);
656
PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
657
src_vec0, src_vec1, src_vec2, src_vec3);
659
ST8x8_UB(src_vec0, src_vec1, src_vec2, src_vec3, dst, stride);
662
static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
663
const uint8_t *src_left,
664
uint8_t *dst, int32_t stride)
667
v8i16 src0_r, src1_r, src0_l, src1_l;
669
v8i16 res0, res1, tmp0, tmp1;
670
v8i16 mul_val2, mul_val3;
671
v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
672
v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 };
674
src0 = LD_UB(src_top);
675
src1 = LD_UB(src_left);
677
UNPCK_UB_SH(src0, src0_r, src0_l);
678
UNPCK_UB_SH(src1, src1_r, src1_l);
680
mul_val2 = mul_val0 - 8;
681
mul_val3 = mul_val1 + 8;
683
tmp0 = __msa_fill_h(src_top[16]);
684
tmp1 = __msa_fill_h(src_left[16]);
686
SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
687
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
688
mul_val0, mul_val1, mul_val2, mul_val3,
689
res0, res1, 15, 1, 5);
690
ST_SH2(res0, res1, dst, stride);
693
SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
694
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
695
mul_val0, mul_val1, mul_val2, mul_val3,
696
res0, res1, 13, 3, 5);
697
ST_SH2(res0, res1, dst, stride);
700
SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
701
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
702
mul_val0, mul_val1, mul_val2, mul_val3,
703
res0, res1, 11, 5, 5);
704
ST_SH2(res0, res1, dst, stride);
707
SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
708
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
709
mul_val0, mul_val1, mul_val2, mul_val3,
710
res0, res1, 9, 7, 5);
711
ST_SH2(res0, res1, dst, stride);
714
SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
715
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
716
mul_val0, mul_val1, mul_val2, mul_val3,
717
res0, res1, 7, 9, 5);
718
ST_SH2(res0, res1, dst, stride);
721
SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
722
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
723
mul_val0, mul_val1, mul_val2, mul_val3,
724
res0, res1, 5, 11, 5);
725
ST_SH2(res0, res1, dst, stride);
728
SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
729
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
730
mul_val0, mul_val1, mul_val2, mul_val3,
731
res0, res1, 3, 13, 5);
732
ST_SH2(res0, res1, dst, stride);
735
SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
736
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
737
mul_val0, mul_val1, mul_val2, mul_val3,
738
res0, res1, 1, 15, 5);
739
ST_SH2(res0, res1, dst, stride);
742
static void process_intra_upper_16x16_msa(const uint8_t *src_top,
743
const uint8_t *src_left,
744
uint8_t *dst, int32_t stride,
748
v8i16 src0_r, src1_r, src0_l, src1_l;
749
v8i16 vec0, vec1, res0, res1;
751
v8i16 mul_val2, mul_val3;
752
v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
753
v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
755
tmp0 = __msa_fill_h(src_top[32 - offset]);
756
tmp1 = __msa_fill_h(src_left[32]);
758
src0 = LD_SB(src_top);
759
src1 = LD_SB(src_left);
761
UNPCK_UB_SH(src0, src0_r, src0_l);
762
UNPCK_UB_SH(src1, src1_r, src1_l);
766
mul_val2 = mul_val0 - 8;
767
mul_val3 = mul_val1 + 8;
769
SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
770
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
771
mul_val0, mul_val1, mul_val2, mul_val3,
772
res0, res1, 31, 1, 6);
773
ST_SH2(res0, res1, dst, stride);
776
SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
777
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
778
mul_val0, mul_val1, mul_val2, mul_val3,
779
res0, res1, 29, 3, 6);
780
ST_SH2(res0, res1, dst, stride);
783
SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
784
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
785
mul_val0, mul_val1, mul_val2, mul_val3,
786
res0, res1, 27, 5, 6);
787
ST_SH2(res0, res1, dst, stride);
790
SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
791
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
792
mul_val0, mul_val1, mul_val2, mul_val3,
793
res0, res1, 25, 7, 6);
794
ST_SH2(res0, res1, dst, stride);
797
SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
798
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
799
mul_val0, mul_val1, mul_val2, mul_val3,
800
res0, res1, 23, 9, 6);
801
ST_SH2(res0, res1, dst, stride);
804
SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
805
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
806
mul_val0, mul_val1, mul_val2, mul_val3,
807
res0, res1, 21, 11, 6);
808
ST_SH2(res0, res1, dst, stride);
811
SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
812
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
813
mul_val0, mul_val1, mul_val2, mul_val3,
814
res0, res1, 19, 13, 6);
815
ST_SH2(res0, res1, dst, stride);
818
SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
819
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
820
mul_val0, mul_val1, mul_val2, mul_val3,
821
res0, res1, 17, 15, 6);
822
ST_SH2(res0, res1, dst, stride);
825
static void process_intra_lower_16x16_msa(const uint8_t *src_top,
826
const uint8_t *src_left,
827
uint8_t *dst, int32_t stride,
831
v8i16 src0_r, src1_r, src0_l, src1_l;
832
v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
833
v8i16 mul_val2, mul_val3;
834
v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
835
v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
837
tmp0 = __msa_fill_h(src_top[32 - offset]);
838
tmp1 = __msa_fill_h(src_left[16]);
840
src0 = LD_SB(src_top);
841
src1 = LD_SB(src_left);
843
UNPCK_UB_SH(src0, src0_r, src0_l);
844
UNPCK_UB_SH(src1, src1_r, src1_l);
848
mul_val2 = mul_val0 - 8;
849
mul_val3 = mul_val1 + 8;
851
SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
852
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
853
mul_val0, mul_val1, mul_val2, mul_val3,
854
res0, res1, 15, 17, 6);
855
ST_SH2(res0, res1, dst, stride);
858
SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
859
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
860
mul_val0, mul_val1, mul_val2, mul_val3,
861
res0, res1, 13, 19, 6);
862
ST_SH2(res0, res1, dst, stride);
865
SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
866
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
867
mul_val0, mul_val1, mul_val2, mul_val3,
868
res0, res1, 11, 21, 6);
869
ST_SH2(res0, res1, dst, stride);
872
SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
873
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
874
mul_val0, mul_val1, mul_val2, mul_val3,
875
res0, res1, 9, 23, 6);
876
ST_SH2(res0, res1, dst, stride);
879
SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
880
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
881
mul_val0, mul_val1, mul_val2, mul_val3,
882
res0, res1, 7, 25, 6);
883
ST_SH2(res0, res1, dst, stride);
886
SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
887
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
888
mul_val0, mul_val1, mul_val2, mul_val3,
889
res0, res1, 5, 27, 6);
890
ST_SH2(res0, res1, dst, stride);
893
SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
894
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
895
mul_val0, mul_val1, mul_val2, mul_val3,
896
res0, res1, 3, 29, 6);
897
ST_SH2(res0, res1, dst, stride);
900
SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
901
HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
902
mul_val0, mul_val1, mul_val2, mul_val3,
903
res0, res1, 1, 31, 6);
904
ST_SH2(res0, res1, dst, stride);
907
/* 32x32 planar intra prediction, assembled from four 16x16 passes.
 * The upper/lower helpers take an offset (0 or 16) selecting which
 * top-right reference sample (src_top[32 - offset]) they read, so the
 * same top pointer arithmetic works for both column halves. */
static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    process_intra_upper_16x16_msa(src_top, src_left, dst, stride, 0);
    process_intra_upper_16x16_msa((src_top + 16), src_left,
                                  (dst + 16), stride, 16);
    /* Lower half: advance the left reference by 16 rows so that the
     * helper's src_left[16] access reads the bottom-left sample
     * (original src_left[32]); src_top stays put so src_top[32 - offset]
     * still reaches the top-right reference. */
    src_left += 16;
    dst += (16 * stride);

    process_intra_lower_16x16_msa(src_top, src_left, dst, stride, 0);
    process_intra_lower_16x16_msa((src_top + 16), src_left,
                                  (dst + 16), stride, 16);
}
922
static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
923
const uint8_t *src_left,
928
int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
929
uint8_t ref_array[3 * 32 + 4];
930
uint8_t *ref_tmp = ref_array + 4;
933
int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
934
int32_t idx2, fact_val2, idx3, fact_val3;
935
int32_t angle, angle_loop;
936
int32_t inv_angle_val, offset;
938
v16i8 top0, top1, top2, top3;
941
v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
942
v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
944
angle = intra_pred_angle_up[mode - 18];
945
inv_angle_val = inv_angle[mode - 18];
950
if (angle < 0 && last < -1) {
951
inv_angle_val = inv_angle[mode - 18];
956
for (h_cnt = last; h_cnt <= -1; h_cnt++) {
957
offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
958
ref_tmp[h_cnt] = src_left[offset];
964
idx0 = angle_loop >> 5;
965
fact_val0 = angle_loop & 31;
968
idx1 = angle_loop >> 5;
969
fact_val1 = angle_loop & 31;
972
idx2 = angle_loop >> 5;
973
fact_val2 = angle_loop & 31;
976
idx3 = angle_loop >> 5;
977
fact_val3 = angle_loop & 31;
979
top0 = LD_SB(ref + idx0 + 1);
980
top1 = LD_SB(ref + idx1 + 1);
981
top2 = LD_SB(ref + idx2 + 1);
982
top3 = LD_SB(ref + idx3 + 1);
984
fact0 = __msa_fill_h(fact_val0);
985
fact1 = __msa_fill_h(32 - fact_val0);
987
fact2 = __msa_fill_h(fact_val1);
988
fact3 = __msa_fill_h(32 - fact_val1);
990
fact4 = __msa_fill_h(fact_val2);
991
fact5 = __msa_fill_h(32 - fact_val2);
993
fact6 = __msa_fill_h(fact_val3);
994
fact7 = __msa_fill_h(32 - fact_val3);
996
ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
997
ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
998
ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
999
diff0, diff2, diff4, diff6);
1000
SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
1001
ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
1002
ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
1003
MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
1005
diff1 += diff0 * fact1;
1006
diff3 += diff2 * fact3;
1008
SRARI_H2_SH(diff1, diff3, 5);
1009
dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
1010
ST4x4_UB(dst_val0, dst_val0, 0, 1, 2, 3, dst, stride);
1013
static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
1014
const uint8_t *src_left,
1019
int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1020
uint8_t ref_array[3 * 32 + 4];
1021
uint8_t *ref_tmp = ref_array + 8;
1023
const uint8_t *src_left_tmp = src_left - 1;
1024
int32_t last, offset;
1025
int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1026
int32_t idx2, fact_val2, idx3, fact_val3;
1027
int32_t angle, angle_loop;
1028
int32_t inv_angle_val, inv_angle_val_loop;
1029
int32_t tmp0, tmp1, tmp2;
1030
v16i8 top0, top1, top2, top3;
1031
v16u8 dst_val0, dst_val1;
1032
v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1033
v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1035
angle = intra_pred_angle_up[mode - 18];
1036
inv_angle_val = inv_angle[mode - 18];
1037
last = (angle) >> 2;
1042
inv_angle_val_loop = inv_angle_val * last;
1048
SW(tmp1, ref_tmp + 4);
1049
SW(tmp2, ref_tmp + 8);
1051
for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1052
offset = (inv_angle_val_loop + 128) >> 8;
1053
ref_tmp[h_cnt] = src_left_tmp[offset];
1054
inv_angle_val_loop += inv_angle_val;
1059
for (v_cnt = 0; v_cnt < 2; v_cnt++) {
1060
idx0 = (angle_loop) >> 5;
1061
fact_val0 = (angle_loop) & 31;
1062
angle_loop += angle;
1064
idx1 = (angle_loop) >> 5;
1065
fact_val1 = (angle_loop) & 31;
1066
angle_loop += angle;
1068
idx2 = (angle_loop) >> 5;
1069
fact_val2 = (angle_loop) & 31;
1070
angle_loop += angle;
1072
idx3 = (angle_loop) >> 5;
1073
fact_val3 = (angle_loop) & 31;
1074
angle_loop += angle;
1076
top0 = LD_SB(ref + idx0 + 1);
1077
top1 = LD_SB(ref + idx1 + 1);
1078
top2 = LD_SB(ref + idx2 + 1);
1079
top3 = LD_SB(ref + idx3 + 1);
1081
fact0 = __msa_fill_h(fact_val0);
1082
fact1 = __msa_fill_h(32 - fact_val0);
1083
fact2 = __msa_fill_h(fact_val1);
1084
fact3 = __msa_fill_h(32 - fact_val1);
1085
fact4 = __msa_fill_h(fact_val2);
1086
fact5 = __msa_fill_h(32 - fact_val2);
1087
fact6 = __msa_fill_h(fact_val3);
1088
fact7 = __msa_fill_h(32 - fact_val3);
1090
UNPCK_UB_SH(top0, diff0, diff1);
1091
UNPCK_UB_SH(top1, diff2, diff3);
1092
UNPCK_UB_SH(top2, diff4, diff5);
1093
UNPCK_UB_SH(top3, diff6, diff7);
1095
SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
1096
SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
1097
MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
1098
diff1, diff3, diff5, diff7);
1100
diff1 += diff0 * fact1;
1101
diff3 += diff2 * fact3;
1102
diff5 += diff4 * fact5;
1103
diff7 += diff6 * fact7;
1105
SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
1106
PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
1107
ST8x4_UB(dst_val0, dst_val1, dst, stride);
1108
dst += (4 * stride);
1112
static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top,
1113
const uint8_t *src_left,
1118
int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1119
int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1120
int32_t idx2, fact_val2, idx3, fact_val3;
1122
int32_t angle, angle_loop, offset;
1123
int32_t inv_angle_val, inv_angle_val_loop;
1124
uint8_t ref_array[3 * 32 + 4];
1125
uint8_t *ref_tmp = ref_array + 16;
1127
const uint8_t *src_left_tmp = src_left - 1;
1129
v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
1130
v16i8 dst0, dst1, dst2, dst3;
1131
v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1132
v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1133
v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1135
angle = intra_pred_angle_up[mode - 18];
1136
inv_angle_val = inv_angle[mode - 18];
1142
inv_angle_val_loop = inv_angle_val * last;
1145
tmp0 = LW(ref + 16);
1146
ST_UB(top0, ref_tmp);
1147
SW(tmp0, ref_tmp + 16);
1149
for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1150
offset = (inv_angle_val_loop + 128) >> 8;
1151
ref_tmp[h_cnt] = src_left_tmp[offset];
1152
inv_angle_val_loop += inv_angle_val;
1157
for (v_cnt = 4; v_cnt--;) {
1158
idx0 = (angle_loop) >> 5;
1159
fact_val0 = (angle_loop) & 31;
1160
angle_loop += angle;
1162
idx1 = (angle_loop) >> 5;
1163
fact_val1 = (angle_loop) & 31;
1164
angle_loop += angle;
1166
idx2 = (angle_loop) >> 5;
1167
fact_val2 = (angle_loop) & 31;
1168
angle_loop += angle;
1170
idx3 = (angle_loop) >> 5;
1171
fact_val3 = (angle_loop) & 31;
1172
angle_loop += angle;
1174
LD_UB2(ref + idx0 + 1, 16, top0, top1);
1175
LD_UB2(ref + idx1 + 1, 16, top2, top3);
1176
LD_UB2(ref + idx2 + 1, 16, top4, top5);
1177
LD_UB2(ref + idx3 + 1, 16, top6, top7);
1179
fact0 = __msa_fill_h(fact_val0);
1180
fact1 = __msa_fill_h(32 - fact_val0);
1181
fact2 = __msa_fill_h(fact_val1);
1182
fact3 = __msa_fill_h(32 - fact_val1);
1183
fact4 = __msa_fill_h(fact_val2);
1184
fact5 = __msa_fill_h(32 - fact_val2);
1185
fact6 = __msa_fill_h(fact_val3);
1186
fact7 = __msa_fill_h(32 - fact_val3);
1188
SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
1189
SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);
1190
UNPCK_UB_SH(top0, diff0, diff1);
1191
UNPCK_UB_SH(top1, diff2, diff3);
1192
UNPCK_UB_SH(top2, diff4, diff5);
1193
UNPCK_UB_SH(top3, diff6, diff7);
1194
UNPCK_UB_SH(top4, diff8, diff9);
1195
UNPCK_UB_SH(top5, diff10, diff11);
1196
UNPCK_UB_SH(top6, diff12, diff13);
1197
UNPCK_UB_SH(top7, diff14, diff15);
1199
MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
1200
diff2, diff3, diff6, diff7);
1201
MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
1202
diff10, diff11, diff14, diff15);
1204
diff2 += diff0 * fact1;
1205
diff3 += diff1 * fact1;
1206
diff6 += diff4 * fact3;
1207
diff7 += diff5 * fact3;
1208
diff10 += diff8 * fact5;
1209
diff11 += diff9 * fact5;
1210
diff14 += diff12 * fact7;
1211
diff15 += diff13 * fact7;
1213
SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1214
SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1215
PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1216
dst0, dst1, dst2, dst3);
1217
ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
1218
dst += (4 * stride);
1222
static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top,
1223
const uint8_t *src_left,
1228
int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1229
uint8_t ref_array[3 * 32 + 4];
1232
const uint8_t *src_left_tmp = src_left - 1;
1233
int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1234
int32_t tmp0, tmp1, tmp2, tmp3;
1235
int32_t angle, angle_loop;
1236
int32_t inv_angle_val, inv_angle_val_loop;
1237
int32_t last, offset;
1238
v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
1239
v16i8 dst0, dst1, dst2, dst3;
1240
v8i16 fact0, fact1, fact2, fact3;
1241
v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1242
v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1244
ref_tmp = ref_array + 32;
1246
angle = intra_pred_angle_up[mode - 18];
1247
inv_angle_val = inv_angle[mode - 18];
1253
inv_angle_val_loop = inv_angle_val * last;
1254
LD_UB2(ref, 16, top0, top1);
1260
ST_UB2(top0, top1, ref_tmp, 16);
1266
for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1267
offset = (inv_angle_val_loop + 128) >> 8;
1268
ref_tmp[h_cnt] = src_left_tmp[offset];
1269
inv_angle_val_loop += inv_angle_val;
1275
for (v_cnt = 16; v_cnt--;) {
1276
idx0 = (angle_loop) >> 5;
1277
fact_val0 = (angle_loop) & 31;
1278
angle_loop += angle;
1280
idx1 = (angle_loop) >> 5;
1281
fact_val1 = (angle_loop) & 31;
1282
angle_loop += angle;
1284
top0 = LD_UB(ref + idx0 + 1);
1285
top4 = LD_UB(ref + idx1 + 1);
1286
top1 = LD_UB(ref + idx0 + 17);
1287
top5 = LD_UB(ref + idx1 + 17);
1288
top3 = LD_UB(ref + idx0 + 33);
1289
top7 = LD_UB(ref + idx1 + 33);
1291
fact0 = __msa_fill_h(fact_val0);
1292
fact1 = __msa_fill_h(32 - fact_val0);
1293
fact2 = __msa_fill_h(fact_val1);
1294
fact3 = __msa_fill_h(32 - fact_val1);
1299
SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
1300
SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);
1301
UNPCK_UB_SH(top0, diff0, diff1);
1302
UNPCK_UB_SH(top1, diff2, diff3);
1303
UNPCK_UB_SH(top2, diff4, diff5);
1304
UNPCK_UB_SH(top3, diff6, diff7);
1305
UNPCK_UB_SH(top4, diff8, diff9);
1306
UNPCK_UB_SH(top5, diff10, diff11);
1307
UNPCK_UB_SH(top6, diff12, diff13);
1308
UNPCK_UB_SH(top7, diff14, diff15);
1310
MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
1311
diff2, diff3, diff6, diff7);
1312
MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
1313
diff10, diff11, diff14, diff15);
1315
diff2 += diff0 * fact1;
1316
diff3 += diff1 * fact1;
1317
diff6 += diff4 * fact1;
1318
diff7 += diff5 * fact1;
1319
diff10 += diff8 * fact3;
1320
diff11 += diff9 * fact3;
1321
diff14 += diff12 * fact3;
1322
diff15 += diff13 * fact3;
1324
SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1325
SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1326
PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1327
dst0, dst1, dst2, dst3);
1329
ST_SB2(dst0, dst1, dst, 16);
1331
ST_SB2(dst2, dst3, dst, 16);
1336
static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
1337
const uint8_t *src_left,
1342
int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1343
uint8_t ref_array[3 * 32 + 4];
1344
uint8_t *ref_tmp = ref_array + 4;
1346
int32_t last, offset;
1347
int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
1348
int32_t idx2, fact_val2, idx3, fact_val3;
1349
int32_t angle, angle_loop, inv_angle_val;
1351
v16i8 dst_val0, dst_val1;
1352
v16u8 top0, top1, top2, top3;
1354
v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1355
v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1357
angle = intra_pred_angle_low[mode - 2];
1363
inv_angle_val = inv_angle[mode - 11];
1368
for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1369
offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
1370
ref_tmp[h_cnt] = src_top[offset];
1376
idx0 = angle_loop >> 5;
1377
fact_val0 = angle_loop & 31;
1378
angle_loop += angle;
1380
idx1 = angle_loop >> 5;
1381
fact_val1 = angle_loop & 31;
1382
angle_loop += angle;
1384
idx2 = angle_loop >> 5;
1385
fact_val2 = angle_loop & 31;
1386
angle_loop += angle;
1388
idx3 = angle_loop >> 5;
1389
fact_val3 = angle_loop & 31;
1391
top0 = LD_UB(ref + idx0 + 1);
1392
top1 = LD_UB(ref + idx1 + 1);
1393
top2 = LD_UB(ref + idx2 + 1);
1394
top3 = LD_UB(ref + idx3 + 1);
1396
fact0 = __msa_fill_h(fact_val0);
1397
fact1 = __msa_fill_h(32 - fact_val0);
1398
fact2 = __msa_fill_h(fact_val1);
1399
fact3 = __msa_fill_h(32 - fact_val1);
1400
fact4 = __msa_fill_h(fact_val2);
1401
fact5 = __msa_fill_h(32 - fact_val2);
1402
fact6 = __msa_fill_h(fact_val3);
1403
fact7 = __msa_fill_h(32 - fact_val3);
1405
ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
1406
ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
1407
ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
1408
diff0, diff2, diff4, diff6);
1409
SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
1410
ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
1411
ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
1412
MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
1414
diff1 += diff0 * fact1;
1415
diff3 += diff2 * fact3;
1417
SRARI_H2_SH(diff1, diff3, 5);
1418
PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);
1420
diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
1421
diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);
1423
diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);
1425
dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
1426
dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);
1428
ST4x2_UB(dst_val0, dst, stride);
1429
dst += (2 * stride);
1430
ST4x2_UB(dst_val1, dst, stride);
1433
static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
1434
const uint8_t *src_left,
1439
int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1440
uint8_t ref_array[3 * 32 + 4];
1441
uint8_t *ref_tmp = ref_array + 8;
1443
const uint8_t *src_top_tmp = src_top - 1;
1445
int32_t last, offset, tmp0, tmp1, tmp2;
1446
int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1447
int32_t idx2, fact_val2, idx3, fact_val3;
1448
int32_t angle, angle_loop, inv_angle_val;
1449
v16i8 top0, top1, top2, top3;
1450
v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
1451
v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1452
v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1454
angle = intra_pred_angle_low[mode - 2];
1455
last = (angle) >> 2;
1460
inv_angle_val = inv_angle[mode - 11];
1466
SW(tmp1, ref_tmp + 4);
1467
SW(tmp2, ref_tmp + 8);
1469
for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1470
offset = (h_cnt * inv_angle_val + 128) >> 8;
1471
ref_tmp[h_cnt] = src_top_tmp[offset];
1477
for (v_cnt = 0; v_cnt < 2; v_cnt++) {
1480
idx0 = angle_loop >> 5;
1481
fact_val0 = angle_loop & 31;
1482
angle_loop += angle;
1484
idx1 = angle_loop >> 5;
1485
fact_val1 = angle_loop & 31;
1486
angle_loop += angle;
1488
idx2 = angle_loop >> 5;
1489
fact_val2 = angle_loop & 31;
1490
angle_loop += angle;
1492
idx3 = angle_loop >> 5;
1493
fact_val3 = angle_loop & 31;
1494
angle_loop += angle;
1496
top0 = LD_SB(ref + idx0 + 1);
1497
top1 = LD_SB(ref + idx1 + 1);
1498
top2 = LD_SB(ref + idx2 + 1);
1499
top3 = LD_SB(ref + idx3 + 1);
1501
fact0 = __msa_fill_h(fact_val0);
1502
fact1 = __msa_fill_h(32 - fact_val0);
1503
fact2 = __msa_fill_h(fact_val1);
1504
fact3 = __msa_fill_h(32 - fact_val1);
1505
fact4 = __msa_fill_h(fact_val2);
1506
fact5 = __msa_fill_h(32 - fact_val2);
1507
fact6 = __msa_fill_h(fact_val3);
1508
fact7 = __msa_fill_h(32 - fact_val3);
1510
UNPCK_UB_SH(top0, diff0, diff1);
1511
UNPCK_UB_SH(top1, diff2, diff3);
1512
UNPCK_UB_SH(top2, diff4, diff5);
1513
UNPCK_UB_SH(top3, diff6, diff7);
1514
SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
1515
SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
1516
MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
1517
diff1, diff3, diff5, diff7);
1519
diff1 += diff0 * fact1;
1520
diff3 += diff2 * fact3;
1521
diff5 += diff4 * fact5;
1522
diff7 += diff6 * fact7;
1524
SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
1525
PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
1526
dst_val0, dst_val1, dst_val2, dst_val3);
1527
ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
1528
ILVRL_H2_SH(diff1, diff0, diff3, diff4);
1529
ST4x8_UB(diff3, diff4, dst_org, stride);
1534
static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
1535
const uint8_t *src_left,
1540
int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1541
int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1542
int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
1543
v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
1544
v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
1545
v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1546
v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1547
v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1548
int32_t angle, angle_loop, inv_angle_val, offset;
1549
uint8_t ref_array[3 * 32 + 4];
1550
uint8_t *ref_tmp = ref_array + 16;
1551
const uint8_t *ref, *src_top_tmp = src_top - 1;
1555
angle = intra_pred_angle_low[mode - 2];
1556
last = (angle) >> 1;
1561
inv_angle_val = inv_angle[mode - 11];
1564
tmp0 = LW(ref + 16);
1565
ST_SB(top0, ref_tmp);
1566
SW(tmp0, ref_tmp + 16);
1568
for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1569
offset = (h_cnt * inv_angle_val + 128) >> 8;
1570
ref_tmp[h_cnt] = src_top_tmp[offset];
1576
for (v_cnt = 0; v_cnt < 4; v_cnt++) {
1579
idx0 = angle_loop >> 5;
1580
fact_val0 = angle_loop & 31;
1581
angle_loop += angle;
1583
idx1 = angle_loop >> 5;
1584
fact_val1 = angle_loop & 31;
1585
angle_loop += angle;
1587
idx2 = angle_loop >> 5;
1588
fact_val2 = angle_loop & 31;
1589
angle_loop += angle;
1591
idx3 = angle_loop >> 5;
1592
fact_val3 = angle_loop & 31;
1593
angle_loop += angle;
1595
LD_SB2(ref + idx0 + 1, 16, top0, top1);
1596
LD_SB2(ref + idx1 + 1, 16, top2, top3);
1597
LD_SB2(ref + idx2 + 1, 16, top4, top5);
1598
LD_SB2(ref + idx3 + 1, 16, top6, top7);
1600
fact0 = __msa_fill_h(fact_val0);
1601
fact1 = __msa_fill_h(32 - fact_val0);
1602
fact2 = __msa_fill_h(fact_val1);
1603
fact3 = __msa_fill_h(32 - fact_val1);
1604
fact4 = __msa_fill_h(fact_val2);
1605
fact5 = __msa_fill_h(32 - fact_val2);
1606
fact6 = __msa_fill_h(fact_val3);
1607
fact7 = __msa_fill_h(32 - fact_val3);
1609
SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
1610
SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);
1612
UNPCK_UB_SH(top0, diff0, diff1);
1613
UNPCK_UB_SH(top1, diff2, diff3);
1614
UNPCK_UB_SH(top2, diff4, diff5);
1615
UNPCK_UB_SH(top3, diff6, diff7);
1616
UNPCK_UB_SH(top4, diff8, diff9);
1617
UNPCK_UB_SH(top5, diff10, diff11);
1618
UNPCK_UB_SH(top6, diff12, diff13);
1619
UNPCK_UB_SH(top7, diff14, diff15);
1621
MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
1622
diff2, diff3, diff6, diff7);
1623
MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
1624
diff10, diff11, diff14, diff15);
1626
diff2 += diff0 * fact1;
1627
diff3 += diff1 * fact1;
1628
diff6 += diff4 * fact3;
1629
diff7 += diff5 * fact3;
1630
diff10 += diff8 * fact5;
1631
diff11 += diff9 * fact5;
1632
diff14 += diff12 * fact7;
1633
diff15 += diff13 * fact7;
1635
SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1636
SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1637
PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1638
dst_val0, dst_val1, dst_val2, dst_val3);
1639
ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
1640
ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
1641
ILVRL_H2_SH(diff1, diff0, diff4, diff5);
1642
ILVRL_H2_SH(diff3, diff2, diff6, diff7);
1643
ST4x8_UB(diff4, diff5, dst_org, stride);
1644
dst_org += (8 * stride);
1645
ST4x8_UB(diff6, diff7, dst_org, stride);
1650
static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top,
1651
const uint8_t *src_left,
1656
int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1657
int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
1658
v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
1659
v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
1660
v8i16 fact0, fact1, fact2, fact3;
1661
v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1662
v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1663
int32_t angle, angle_loop, inv_angle_val, offset;
1664
uint8_t ref_array[3 * 32 + 4];
1665
uint8_t *ref_tmp = ref_array + 32;
1666
const uint8_t *ref, *src_top_tmp = src_top - 1;
1670
angle = intra_pred_angle_low[mode - 2];
1676
inv_angle_val = inv_angle[mode - 11];
1678
LD_SB2(ref, 16, top0, top1);
1679
tmp0 = LW(ref + 32);
1680
ST_SB2(top0, top1, ref_tmp, 16);
1681
SW(tmp0, ref_tmp + 32);
1683
for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1684
offset = (h_cnt * inv_angle_val + 128) >> 8;
1685
ref_tmp[h_cnt] = src_top_tmp[offset];
1691
for (v_cnt = 0; v_cnt < 16; v_cnt++) {
1693
idx0 = angle_loop >> 5;
1694
fact_val0 = angle_loop & 31;
1695
angle_loop += angle;
1697
idx1 = angle_loop >> 5;
1698
fact_val1 = angle_loop & 31;
1699
angle_loop += angle;
1701
top0 = LD_SB(ref + idx0 + 1);
1702
top4 = LD_SB(ref + idx1 + 1);
1703
top1 = LD_SB(ref + idx0 + 17);
1704
top5 = LD_SB(ref + idx1 + 17);
1705
top3 = LD_SB(ref + idx0 + 33);
1706
top7 = LD_SB(ref + idx1 + 33);
1708
fact0 = __msa_fill_h(fact_val0);
1709
fact1 = __msa_fill_h(32 - fact_val0);
1710
fact2 = __msa_fill_h(fact_val1);
1711
fact3 = __msa_fill_h(32 - fact_val1);
1716
SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
1717
SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);
1719
UNPCK_UB_SH(top0, diff0, diff1);
1720
UNPCK_UB_SH(top1, diff2, diff3);
1721
UNPCK_UB_SH(top2, diff4, diff5);
1722
UNPCK_UB_SH(top3, diff6, diff7);
1723
UNPCK_UB_SH(top4, diff8, diff9);
1724
UNPCK_UB_SH(top5, diff10, diff11);
1725
UNPCK_UB_SH(top6, diff12, diff13);
1726
UNPCK_UB_SH(top7, diff14, diff15);
1728
MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
1729
diff2, diff3, diff6, diff7);
1730
MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
1731
diff10, diff11, diff14, diff15);
1733
diff2 += diff0 * fact1;
1734
diff3 += diff1 * fact1;
1735
diff6 += diff4 * fact1;
1736
diff7 += diff5 * fact1;
1737
diff10 += diff8 * fact3;
1738
diff11 += diff9 * fact3;
1739
diff14 += diff12 * fact3;
1740
diff15 += diff13 * fact3;
1742
SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1743
SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1744
PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1745
dst_val0, dst_val1, dst_val2, dst_val3);
1746
ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
1747
ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);
1749
ST2x4_UB(diff0, 0, dst_org, stride);
1750
dst_org += (4 * stride);
1751
ST2x4_UB(diff0, 4, dst_org, stride);
1752
dst_org += (4 * stride);
1753
ST2x4_UB(diff1, 0, dst_org, stride);
1754
dst_org += (4 * stride);
1755
ST2x4_UB(diff1, 4, dst_org, stride);
1756
dst_org += (4 * stride);
1758
ST2x4_UB(diff2, 0, dst_org, stride);
1759
dst_org += (4 * stride);
1760
ST2x4_UB(diff2, 4, dst_org, stride);
1761
dst_org += (4 * stride);
1762
ST2x4_UB(diff3, 0, dst_org, stride);
1763
dst_org += (4 * stride);
1764
ST2x4_UB(diff3, 4, dst_org, stride);
1765
dst_org += (4 * stride);
1771
static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
1778
src2 = LD_UB(src + 16);
1780
for (row = 32; row--;) {
1781
ST_UB2(src1, src2, dst, 16);
1786
void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst,
1787
const uint8_t *src_top,
1788
const uint8_t *src_left,
1791
hevc_intra_pred_plane_4x4_msa(src_top, src_left, dst, stride);
1794
void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst,
1795
const uint8_t *src_top,
1796
const uint8_t *src_left,
1799
hevc_intra_pred_plane_8x8_msa(src_top, src_left, dst, stride);
1802
void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst,
1803
const uint8_t *src_top,
1804
const uint8_t *src_left,
1807
hevc_intra_pred_plane_16x16_msa(src_top, src_left, dst, stride);
1810
void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst,
1811
const uint8_t *src_top,
1812
const uint8_t *src_left,
1815
hevc_intra_pred_plane_32x32_msa(src_top, src_left, dst, stride);
1818
void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
1819
const uint8_t *src_left,
1820
ptrdiff_t stride, int log2, int c_idx)
1824
hevc_intra_pred_dc_4x4_msa(src_top, src_left, dst, stride, c_idx);
1828
hevc_intra_pred_dc_8x8_msa(src_top, src_left, dst, stride, c_idx);
1832
hevc_intra_pred_dc_16x16_msa(src_top, src_left, dst, stride, c_idx);
1836
hevc_intra_pred_dc_32x32_msa(src_top, src_left, dst, stride);
1841
void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
1842
const uint8_t *src_top,
1843
const uint8_t *src_left,
1844
ptrdiff_t stride, int c_idx, int mode)
1847
hevc_intra_pred_horiz_4x4_msa(src_top, src_left, dst, stride, c_idx);
1848
} else if (mode == 26) {
1849
hevc_intra_pred_vert_4x4_msa(src_top, src_left, dst, stride, c_idx);
1850
} else if (mode >= 18) {
1851
hevc_intra_pred_angular_upper_4width_msa(src_top, src_left,
1854
hevc_intra_pred_angular_lower_4width_msa(src_top, src_left,
1859
void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
1860
const uint8_t *src_top,
1861
const uint8_t *src_left,
1862
ptrdiff_t stride, int c_idx, int mode)
1865
hevc_intra_pred_horiz_8x8_msa(src_top, src_left, dst, stride, c_idx);
1866
} else if (mode == 26) {
1867
hevc_intra_pred_vert_8x8_msa(src_top, src_left, dst, stride, c_idx);
1868
} else if (mode >= 18) {
1869
hevc_intra_pred_angular_upper_8width_msa(src_top, src_left,
1872
hevc_intra_pred_angular_lower_8width_msa(src_top, src_left,
1877
void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
1878
const uint8_t *src_top,
1879
const uint8_t *src_left,
1880
ptrdiff_t stride, int c_idx, int mode)
1883
hevc_intra_pred_horiz_16x16_msa(src_top, src_left, dst, stride, c_idx);
1884
} else if (mode == 26) {
1885
hevc_intra_pred_vert_16x16_msa(src_top, src_left, dst, stride, c_idx);
1886
} else if (mode >= 18) {
1887
hevc_intra_pred_angular_upper_16width_msa(src_top, src_left,
1890
hevc_intra_pred_angular_lower_16width_msa(src_top, src_left,
1895
void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
1896
const uint8_t *src_top,
1897
const uint8_t *src_left,
1898
ptrdiff_t stride, int c_idx, int mode)
1901
hevc_intra_pred_horiz_32x32_msa(src_top, src_left, dst, stride);
1902
} else if (mode == 26) {
1903
intra_predict_vert_32x32_msa(src_top, dst, stride);
1904
} else if (mode >= 18) {
1905
hevc_intra_pred_angular_upper_32width_msa(src_top, src_left,
1908
hevc_intra_pred_angular_lower_32width_msa(src_top, src_left,
1913
void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx)
1916
HEVCLocalContext *lc = s->HEVClc;
1918
int hshift = s->sps->hshift[c_idx];
1919
int vshift = s->sps->vshift[c_idx];
1920
int size_in_luma_h = 16 << hshift;
1921
int size_in_tbs_h = size_in_luma_h >> s->sps->log2_min_tb_size;
1922
int size_in_luma_v = 16 << vshift;
1923
int size_in_tbs_v = size_in_luma_v >> s->sps->log2_min_tb_size;
1924
int x = x0 >> hshift;
1925
int y = y0 >> vshift;
1926
int x_tb = (x0 >> s->sps->log2_min_tb_size) & s->sps->tb_mask;
1927
int y_tb = (y0 >> s->sps->log2_min_tb_size) & s->sps->tb_mask;
1930
s->pps->min_tb_addr_zs[(y_tb) * (s->sps->tb_mask + 2) + (x_tb)];
1932
ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
1933
uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
1935
int min_pu_width = s->sps->min_pu_width;
1937
enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
1938
lc->tu.intra_pred_mode;
1940
uint8_t left_array[2 * 32 + 1];
1941
uint8_t filtered_left_array[2 * 32 + 1];
1942
uint8_t top_array[2 * 32 + 1];
1943
uint8_t filtered_top_array[2 * 32 + 1];
1945
uint8_t *left = left_array + 1;
1946
uint8_t *top = top_array + 1;
1947
uint8_t *filtered_left = filtered_left_array + 1;
1948
uint8_t *filtered_top = filtered_top_array + 1;
1949
int cand_bottom_left = lc->na.cand_bottom_left
1951
s->pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->sps->tb_mask) *
1952
(s->sps->tb_mask + 2) + (x_tb - 1)];
1953
int cand_left = lc->na.cand_left;
1954
int cand_up_left = lc->na.cand_up_left;
1955
int cand_up = lc->na.cand_up;
1956
int cand_up_right = lc->na.cand_up_right
1958
s->pps->min_tb_addr_zs[(y_tb - 1) * (s->sps->tb_mask + 2) +
1959
((x_tb + size_in_tbs_h) & s->sps->tb_mask)];
1961
int bottom_left_size =
1962
(((y0 + 2 * size_in_luma_v) >
1963
(s->sps->height) ? (s->sps->height) : (y0 +
1964
2 * size_in_luma_v)) -
1965
(y0 + size_in_luma_v)) >> vshift;
1966
int top_right_size =
1967
(((x0 + 2 * size_in_luma_h) >
1968
(s->sps->width) ? (s->sps->width) : (x0 + 2 * size_in_luma_h)) -
1969
(x0 + size_in_luma_h)) >> hshift;
1971
if (s->pps->constrained_intra_pred_flag == 1) {
1972
int size_in_luma_pu_v = ((size_in_luma_v) >> s->sps->log2_min_pu_size);
1973
int size_in_luma_pu_h = ((size_in_luma_h) >> s->sps->log2_min_pu_size);
1974
int on_pu_edge_x = !(x0 & ((1 << s->sps->log2_min_pu_size) - 1));
1975
int on_pu_edge_y = !(y0 & ((1 << s->sps->log2_min_pu_size) - 1));
1976
if (!size_in_luma_pu_h)
1977
size_in_luma_pu_h++;
1978
if (cand_bottom_left == 1 && on_pu_edge_x) {
1979
int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);
1981
((y0 + size_in_luma_v) >> s->sps->log2_min_pu_size);
1983
((size_in_luma_pu_v) >
1984
(s->sps->min_pu_height -
1985
y_bottom_pu) ? (s->sps->min_pu_height -
1986
y_bottom_pu) : (size_in_luma_pu_v));
1987
cand_bottom_left = 0;
1988
for (i = 0; i < max; i += 2)
1990
((s->ref->tab_mvf[(x_left_pu) +
1992
i) * min_pu_width]).pred_flag ==
1995
if (cand_left == 1 && on_pu_edge_x) {
1996
int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);
1997
int y_left_pu = ((y0) >> s->sps->log2_min_pu_size);
1999
((size_in_luma_pu_v) >
2000
(s->sps->min_pu_height -
2001
y_left_pu) ? (s->sps->min_pu_height -
2002
y_left_pu) : (size_in_luma_pu_v));
2004
for (i = 0; i < max; i += 2)
2006
((s->ref->tab_mvf[(x_left_pu) +
2008
i) * min_pu_width]).pred_flag ==
2011
if (cand_up_left == 1) {
2012
int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);
2013
int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);
2015
(s->ref->tab_mvf[(x_left_pu) +
2016
(y_top_pu) * min_pu_width]).pred_flag ==
2019
if (cand_up == 1 && on_pu_edge_y) {
2020
int x_top_pu = ((x0) >> s->sps->log2_min_pu_size);
2021
int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);
2023
((size_in_luma_pu_h) >
2024
(s->sps->min_pu_width -
2025
x_top_pu) ? (s->sps->min_pu_width -
2026
x_top_pu) : (size_in_luma_pu_h));
2028
for (i = 0; i < max; i += 2)
2030
((s->ref->tab_mvf[(x_top_pu + i) +
2032
min_pu_width]).pred_flag == PF_INTRA);
2034
if (cand_up_right == 1 && on_pu_edge_y) {
2035
int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);
2037
((x0 + size_in_luma_h) >> s->sps->log2_min_pu_size);
2039
((size_in_luma_pu_h) >
2040
(s->sps->min_pu_width -
2041
x_right_pu) ? (s->sps->min_pu_width -
2042
x_right_pu) : (size_in_luma_pu_h));
2044
for (i = 0; i < max; i += 2)
2046
((s->ref->tab_mvf[(x_right_pu + i) +
2048
min_pu_width]).pred_flag == PF_INTRA);
2051
vec0 = (v16u8) __msa_ldi_b(128);
2053
ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2055
ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2060
left[-1] = src[(-1) + stride * (-1)];
2064
vec0 = LD_UB(src - stride);
2067
if (cand_up_right) {
2068
vec0 = LD_UB(src - stride + 16);
2069
ST_UB(vec0, (top + 16));
2073
((src[(16 + top_right_size - 1) + stride * (-1)]) *
2075
for (i = 0; i < (16 - top_right_size); i += 4)
2076
((((union unaligned_32 *) (top + 16 + top_right_size +
2081
for (i = 0; i < 16; i++)
2082
left[i] = src[(-1) + stride * (i)];
2083
if (cand_bottom_left) {
2084
for (i = 16; i < 16 + bottom_left_size; i++)
2085
left[i] = src[(-1) + stride * (i)];
2088
((src[(-1) + stride * (16 + bottom_left_size - 1)]) *
2090
for (i = 0; i < (16 - bottom_left_size); i += 4)
2091
((((union unaligned_32 *) (left + 16 + bottom_left_size +
2096
if (s->pps->constrained_intra_pred_flag == 1) {
2097
if (cand_bottom_left || cand_left || cand_up_left || cand_up
2100
x0 + ((2 * 16) << hshift) <
2101
s->sps->width ? 2 * 16 : (s->sps->width - x0) >> hshift;
2103
y0 + ((2 * 16) << vshift) <
2104
s->sps->height ? 2 * 16 : (s->sps->height - y0) >> vshift;
2105
int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2106
if (!cand_up_right) {
2107
size_max_x = x0 + ((16) << hshift) < s->sps->width ?
2108
16 : (s->sps->width - x0) >> hshift;
2110
if (!cand_bottom_left) {
2111
size_max_y = y0 + ((16) << vshift) < s->sps->height ?
2112
16 : (s->sps->height - y0) >> vshift;
2114
if (cand_bottom_left || cand_left || cand_up_left) {
2117
!((s->ref->tab_mvf[(((x0 +
2118
((-1) << hshift)) >> s->sps->
2119
log2_min_pu_size)) + (((y0 +
2124
* min_pu_width]).pred_flag ==
2128
((s->ref->tab_mvf[(((x0 +
2129
((-1) << hshift)) >> s->sps->
2130
log2_min_pu_size)) + (((y0 + ((j)
2135
* min_pu_width]).pred_flag == PF_INTRA)) {
2137
while (j < size_max_x
2139
!((s->ref->tab_mvf[(((x0 +
2140
((j) << hshift)) >> s->sps->
2141
log2_min_pu_size)) + (((y0 +
2147
* min_pu_width]).pred_flag ==
2150
for (i = j; i > (j) - (j + 1); i--)
2152
((s->ref->tab_mvf[(((x0 +
2154
1) << hshift)) >> s->sps->
2155
log2_min_pu_size)) + (((y0 +
2161
* min_pu_width]).pred_flag ==
2163
top[i - 1] = top[i];
2168
while (j < size_max_x
2170
!((s->ref->tab_mvf[(((x0 +
2171
((j) << hshift)) >> s->sps->
2172
log2_min_pu_size)) + (((y0 + ((-1)
2177
* min_pu_width]).pred_flag ==
2182
for (i = j; i > (j) - (j + 1); i--)
2184
((s->ref->tab_mvf[(((x0 +
2187
s->sps->log2_min_pu_size))
2191
s->sps->log2_min_pu_size))
2193
min_pu_width]).pred_flag ==
2195
top[i - 1] = top[i];
2197
for (i = j; i > (j) - (j); i--)
2199
((s->ref->tab_mvf[(((x0 +
2202
s->sps->log2_min_pu_size))
2206
s->sps->log2_min_pu_size))
2208
min_pu_width]).pred_flag ==
2210
top[i - 1] = top[i];
2216
if (cand_bottom_left || cand_left) {
2217
a = ((left[-1]) * 0x01010101U);
2218
for (i = 0; i < (0) + (size_max_y); i += 4)
2220
((s->ref->tab_mvf[(((x0 +
2221
((-1) << hshift)) >> s->sps->
2222
log2_min_pu_size)) + (((y0 +
2227
* min_pu_width]).pred_flag ==
2229
((((union unaligned_32 *) (&left[i]))->l) = (a));
2231
a = ((left[i + 3]) * 0x01010101U);
2234
vec0 = (v16u8) __msa_fill_b(left[-1]);
2238
if (!cand_bottom_left) {
2240
vec0 = (v16u8) __msa_fill_b(left[15]);
2242
ST_UB(vec0, (left + 16));
2244
if (x0 != 0 && y0 != 0) {
2245
a = ((left[size_max_y - 1]) * 0x01010101U);
2246
for (i = (size_max_y - 1);
2247
i > (size_max_y - 1) - (size_max_y); i -= 4)
2249
((s->ref->tab_mvf[(((x0 +
2250
((-1) << hshift)) >> s->sps->
2251
log2_min_pu_size)) + (((y0 +
2257
* min_pu_width]).pred_flag ==
2259
((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2261
a = ((left[i - 3]) * 0x01010101U);
2263
((s->ref->tab_mvf[(((x0 +
2264
((-1) << hshift)) >> s->sps->
2265
log2_min_pu_size)) + (((y0 + ((-1)
2270
* min_pu_width]).pred_flag == PF_INTRA))
2272
} else if (x0 == 0) {
2274
uint32_t pix = ((0) * 0x01010101U);
2275
for (i = 0; i < (size_max_y); i += 4)
2276
((((union unaligned_32 *) (left + i))->l) = (pix));
2279
a = ((left[size_max_y - 1]) * 0x01010101U);
2280
for (i = (size_max_y - 1);
2281
i > (size_max_y - 1) - (size_max_y); i -= 4)
2283
((s->ref->tab_mvf[(((x0 +
2284
((-1) << hshift)) >> s->sps->
2285
log2_min_pu_size)) + (((y0 +
2291
* min_pu_width]).pred_flag ==
2293
((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2295
a = ((left[i - 3]) * 0x01010101U);
2299
a = ((left[-1]) * 0x01010101U);
2300
for (i = 0; i < (0) + (size_max_x); i += 4)
2302
((s->ref->tab_mvf[(((x0 +
2303
((i) << hshift)) >> s->sps->
2304
log2_min_pu_size)) + (((y0 + ((-1)
2309
* min_pu_width]).pred_flag ==
2311
((((union unaligned_32 *) (&top[i]))->l) = (a));
2313
a = ((top[i + 3]) * 0x01010101U);
2318
if (!cand_bottom_left) {
2320
vec0 = (v16u8) __msa_fill_b(left[15]);
2322
ST_UB(vec0, (left + 16));
2324
} else if (cand_up_left) {
2325
vec0 = (v16u8) __msa_fill_b(left[-1]);
2327
ST_UB2(vec0, vec0, left, 16);
2330
} else if (cand_up) {
2333
vec0 = (v16u8) __msa_fill_b(left[-1]);
2335
ST_UB2(vec0, vec0, left, 16);
2339
} else if (cand_up_right) {
2340
vec0 = (v16u8) __msa_fill_b(top[16]);
2346
ST_UB2(vec0, vec0, left, 16);
2353
vec0 = (v16u8) __msa_ldi_b(128);
2355
ST_UB2(vec0, vec0, top, 16);
2356
ST_UB2(vec0, vec0, left, 16);
2361
vec0 = (v16u8) __msa_fill_b(left[16]);
2364
if (!cand_up_left) {
2368
vec0 = (v16u8) __msa_fill_b(left[-1]);
2371
if (!cand_up_right) {
2372
vec0 = (v16u8) __msa_fill_b(top[15]);
2373
ST_UB(vec0, (top + 16));
2379
if (!s->sps->intra_smoothing_disabled_flag
2380
&& (c_idx == 0 || s->sps->chroma_format_idc == 3)) {
2381
if (mode != INTRA_DC && 16 != 4) {
2382
int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2383
int min_dist_vert_hor =
2384
(((((int) (mode - 26U)) >=
2385
0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2386
((((int) (mode - 10U)) >=
2387
0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2388
? ((((int) (mode - 10U)) >=
2389
0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2390
: ((((int) (mode - 26U)) >=
2391
0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2392
if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
2393
filtered_left[2 * 16 - 1] = left[2 * 16 - 1];
2394
filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
2395
for (i = 2 * 16 - 2; i >= 0; i--)
2396
filtered_left[i] = (left[i + 1] + 2 * left[i] +
2397
left[i - 1] + 2) >> 2;
2400
(left[0] + 2 * left[-1] + top[0] + 2) >> 2;
2401
for (i = 2 * 16 - 2; i >= 0; i--)
2402
filtered_top[i] = (top[i + 1] + 2 * top[i] +
2403
top[i - 1] + 2) >> 2;
2404
left = filtered_left;
2412
s->hpc.pred_planar[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2413
(uint8_t *) left, stride);
2416
s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
2417
(uint8_t *) left, stride, 4, c_idx);
2420
s->hpc.pred_angular[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2421
(uint8_t *) left, stride, c_idx, mode);
2426
void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx)
2429
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2430
v8i16 res0, res1, res2, res3;
2431
v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
2432
v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
2433
HEVCLocalContext *lc = s->HEVClc;
2435
int hshift = s->sps->hshift[c_idx];
2436
int vshift = s->sps->vshift[c_idx];
2437
int size_in_luma_h = 32 << hshift;
2438
int size_in_tbs_h = size_in_luma_h >> s->sps->log2_min_tb_size;
2439
int size_in_luma_v = 32 << vshift;
2440
int size_in_tbs_v = size_in_luma_v >> s->sps->log2_min_tb_size;
2441
int x = x0 >> hshift;
2442
int y = y0 >> vshift;
2443
int x_tb = (x0 >> s->sps->log2_min_tb_size) & s->sps->tb_mask;
2444
int y_tb = (y0 >> s->sps->log2_min_tb_size) & s->sps->tb_mask;
2447
s->pps->min_tb_addr_zs[(y_tb) * (s->sps->tb_mask + 2) + (x_tb)];
2449
ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
2450
uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
2452
int min_pu_width = s->sps->min_pu_width;
2454
enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
2455
lc->tu.intra_pred_mode;
2457
uint8_t left_array[2 * 32 + 1];
2458
uint8_t filtered_left_array[2 * 32 + 1];
2459
uint8_t top_array[2 * 32 + 1];
2460
uint8_t filtered_top_array[2 * 32 + 1];
2462
uint8_t *left = left_array + 1;
2463
uint8_t *top = top_array + 1;
2464
uint8_t *filtered_left = filtered_left_array + 1;
2465
uint8_t *filtered_top = filtered_top_array + 1;
2466
int cand_bottom_left = lc->na.cand_bottom_left
2468
s->pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->sps->tb_mask) *
2469
(s->sps->tb_mask + 2) + (x_tb - 1)];
2470
int cand_left = lc->na.cand_left;
2471
int cand_up_left = lc->na.cand_up_left;
2472
int cand_up = lc->na.cand_up;
2473
int cand_up_right = lc->na.cand_up_right
2475
s->pps->min_tb_addr_zs[(y_tb - 1) * (s->sps->tb_mask + 2) +
2476
((x_tb + size_in_tbs_h) & s->sps->tb_mask)];
2478
int bottom_left_size =
2479
(((y0 + 2 * size_in_luma_v) >
2480
(s->sps->height) ? (s->sps->height) : (y0 +
2481
2 * size_in_luma_v)) -
2482
(y0 + size_in_luma_v)) >> vshift;
2483
int top_right_size =
2484
(((x0 + 2 * size_in_luma_h) >
2485
(s->sps->width) ? (s->sps->width) : (x0 + 2 * size_in_luma_h)) -
2486
(x0 + size_in_luma_h)) >> hshift;
2488
if (s->pps->constrained_intra_pred_flag == 1) {
2489
int size_in_luma_pu_v = ((size_in_luma_v) >> s->sps->log2_min_pu_size);
2490
int size_in_luma_pu_h = ((size_in_luma_h) >> s->sps->log2_min_pu_size);
2491
int on_pu_edge_x = !(x0 & ((1 << s->sps->log2_min_pu_size) - 1));
2492
int on_pu_edge_y = !(y0 & ((1 << s->sps->log2_min_pu_size) - 1));
2493
if (!size_in_luma_pu_h)
2494
size_in_luma_pu_h++;
2495
if (cand_bottom_left == 1 && on_pu_edge_x) {
2496
int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);
2498
((y0 + size_in_luma_v) >> s->sps->log2_min_pu_size);
2500
((size_in_luma_pu_v) >
2501
(s->sps->min_pu_height -
2502
y_bottom_pu) ? (s->sps->min_pu_height -
2503
y_bottom_pu) : (size_in_luma_pu_v));
2504
cand_bottom_left = 0;
2505
for (i = 0; i < max; i += 2)
2507
((s->ref->tab_mvf[(x_left_pu) +
2509
i) * min_pu_width]).pred_flag ==
2512
if (cand_left == 1 && on_pu_edge_x) {
2513
int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);
2514
int y_left_pu = ((y0) >> s->sps->log2_min_pu_size);
2516
((size_in_luma_pu_v) >
2517
(s->sps->min_pu_height -
2518
y_left_pu) ? (s->sps->min_pu_height -
2519
y_left_pu) : (size_in_luma_pu_v));
2521
for (i = 0; i < max; i += 2)
2523
((s->ref->tab_mvf[(x_left_pu) +
2525
i) * min_pu_width]).pred_flag ==
2528
if (cand_up_left == 1) {
2529
int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);
2530
int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);
2532
(s->ref->tab_mvf[(x_left_pu) +
2533
(y_top_pu) * min_pu_width]).pred_flag ==
2536
if (cand_up == 1 && on_pu_edge_y) {
2537
int x_top_pu = ((x0) >> s->sps->log2_min_pu_size);
2538
int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);
2540
((size_in_luma_pu_h) >
2541
(s->sps->min_pu_width -
2542
x_top_pu) ? (s->sps->min_pu_width -
2543
x_top_pu) : (size_in_luma_pu_h));
2545
for (i = 0; i < max; i += 2)
2547
((s->ref->tab_mvf[(x_top_pu + i) +
2549
min_pu_width]).pred_flag == PF_INTRA);
2551
if (cand_up_right == 1 && on_pu_edge_y) {
2552
int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);
2554
((x0 + size_in_luma_h) >> s->sps->log2_min_pu_size);
2556
((size_in_luma_pu_h) >
2557
(s->sps->min_pu_width -
2558
x_right_pu) ? (s->sps->min_pu_width -
2559
x_right_pu) : (size_in_luma_pu_h));
2561
for (i = 0; i < max; i += 2)
2563
((s->ref->tab_mvf[(x_right_pu + i) +
2565
min_pu_width]).pred_flag == PF_INTRA);
2567
vec0 = (v16u8) __msa_ldi_b(128);
2569
ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2570
ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2575
left[-1] = src[(-1) + stride * (-1)];
2579
LD_UB2(src - stride, 16, vec0, vec1);
2580
ST_UB2(vec0, vec1, top, 16);
2583
if (cand_up_right) {
2584
LD_UB2(src - stride + 32, 16, vec0, vec1);
2585
ST_UB2(vec0, vec1, (top + 32), 16);
2588
((src[(32 + top_right_size - 1) + stride * (-1)]) *
2590
for (i = 0; i < (32 - top_right_size); i += 4)
2591
((((union unaligned_32 *) (top + 32 + top_right_size +
2596
for (i = 0; i < 32; i++)
2597
left[i] = src[(-1) + stride * (i)];
2598
if (cand_bottom_left) {
2599
for (i = 32; i < 32 + bottom_left_size; i++)
2600
left[i] = src[(-1) + stride * (i)];
2603
((src[(-1) + stride * (32 + bottom_left_size - 1)]) *
2605
for (i = 0; i < (32 - bottom_left_size); i += 4)
2606
((((union unaligned_32 *) (left + 32 + bottom_left_size +
2611
if (s->pps->constrained_intra_pred_flag == 1) {
2612
if (cand_bottom_left || cand_left || cand_up_left || cand_up
2615
x0 + ((2 * 32) << hshift) <
2616
s->sps->width ? 2 * 32 : (s->sps->width - x0) >> hshift;
2618
y0 + ((2 * 32) << vshift) <
2619
s->sps->height ? 2 * 32 : (s->sps->height - y0) >> vshift;
2620
int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2621
if (!cand_up_right) {
2622
size_max_x = x0 + ((32) << hshift) < s->sps->width ?
2623
32 : (s->sps->width - x0) >> hshift;
2625
if (!cand_bottom_left) {
2626
size_max_y = y0 + ((32) << vshift) < s->sps->height ?
2627
32 : (s->sps->height - y0) >> vshift;
2629
if (cand_bottom_left || cand_left || cand_up_left) {
2632
!((s->ref->tab_mvf[(((x0 +
2633
((-1) << hshift)) >> s->sps->
2634
log2_min_pu_size)) + (((y0 +
2639
* min_pu_width]).pred_flag ==
2643
((s->ref->tab_mvf[(((x0 +
2644
((-1) << hshift)) >> s->sps->
2645
log2_min_pu_size)) + (((y0 + ((j)
2650
* min_pu_width]).pred_flag == PF_INTRA)) {
2652
while (j < size_max_x
2654
!((s->ref->tab_mvf[(((x0 +
2655
((j) << hshift)) >> s->sps->
2656
log2_min_pu_size)) + (((y0 +
2662
* min_pu_width]).pred_flag ==
2665
for (i = j; i > (j) - (j + 1); i--)
2667
((s->ref->tab_mvf[(((x0 +
2669
1) << hshift)) >> s->sps->
2670
log2_min_pu_size)) + (((y0 +
2676
* min_pu_width]).pred_flag ==
2678
top[i - 1] = top[i];
2683
while (j < size_max_x
2685
!((s->ref->tab_mvf[(((x0 +
2686
((j) << hshift)) >> s->sps->
2687
log2_min_pu_size)) + (((y0 + ((-1)
2692
* min_pu_width]).pred_flag ==
2697
for (i = j; i > (j) - (j + 1); i--)
2699
((s->ref->tab_mvf[(((x0 +
2702
s->sps->log2_min_pu_size))
2706
s->sps->log2_min_pu_size))
2708
min_pu_width]).pred_flag ==
2710
top[i - 1] = top[i];
2712
for (i = j; i > (j) - (j); i--)
2714
((s->ref->tab_mvf[(((x0 +
2717
s->sps->log2_min_pu_size))
2721
s->sps->log2_min_pu_size))
2723
min_pu_width]).pred_flag ==
2725
top[i - 1] = top[i];
2731
if (cand_bottom_left || cand_left) {
2732
a = ((left[-1]) * 0x01010101U);
2733
for (i = 0; i < (0) + (size_max_y); i += 4)
2735
((s->ref->tab_mvf[(((x0 +
2736
((-1) << hshift)) >> s->sps->
2737
log2_min_pu_size)) + (((y0 +
2742
* min_pu_width]).pred_flag ==
2744
((((union unaligned_32 *) (&left[i]))->l) = (a));
2746
a = ((left[i + 3]) * 0x01010101U);
2749
vec0 = (v16u8) __msa_fill_b(left[-1]);
2751
ST_UB2(vec0, vec0, left, 16);
2753
if (!cand_bottom_left) {
2754
vec0 = (v16u8) __msa_fill_b(left[31]);
2756
ST_UB2(vec0, vec0, (left + 32), 16);
2758
if (x0 != 0 && y0 != 0) {
2759
a = ((left[size_max_y - 1]) * 0x01010101U);
2760
for (i = (size_max_y - 1);
2761
i > (size_max_y - 1) - (size_max_y); i -= 4)
2763
((s->ref->tab_mvf[(((x0 +
2764
((-1) << hshift)) >> s->sps->
2765
log2_min_pu_size)) + (((y0 +
2771
* min_pu_width]).pred_flag ==
2773
((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2775
a = ((left[i - 3]) * 0x01010101U);
2777
((s->ref->tab_mvf[(((x0 +
2778
((-1) << hshift)) >> s->sps->
2779
log2_min_pu_size)) + (((y0 + ((-1)
2784
* min_pu_width]).pred_flag == PF_INTRA))
2786
} else if (x0 == 0) {
2788
uint32_t pix = ((0) * 0x01010101U);
2789
for (i = 0; i < (size_max_y); i += 4)
2790
((((union unaligned_32 *) (left + i))->l) = (pix));
2793
a = ((left[size_max_y - 1]) * 0x01010101U);
2794
for (i = (size_max_y - 1);
2795
i > (size_max_y - 1) - (size_max_y); i -= 4)
2797
((s->ref->tab_mvf[(((x0 +
2798
((-1) << hshift)) >> s->sps->
2799
log2_min_pu_size)) + (((y0 +
2805
* min_pu_width]).pred_flag ==
2807
((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2809
a = ((left[i - 3]) * 0x01010101U);
2813
a = ((left[-1]) * 0x01010101U);
2814
for (i = 0; i < (0) + (size_max_x); i += 4)
2816
((s->ref->tab_mvf[(((x0 +
2817
((i) << hshift)) >> s->sps->
2818
log2_min_pu_size)) + (((y0 + ((-1)
2823
* min_pu_width]).pred_flag ==
2825
((((union unaligned_32 *) (&top[i]))->l) = (a));
2827
a = ((top[i + 3]) * 0x01010101U);
2832
if (!cand_bottom_left) {
2834
vec0 = (v16u8) __msa_fill_b(left[31]);
2836
ST_UB2(vec0, vec0, (left + 32), 16);
2837
} else if (cand_up_left) {
2838
vec0 = (v16u8) __msa_fill_b(left[-1]);
2840
ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2843
} else if (cand_up) {
2846
vec0 = (v16u8) __msa_fill_b(left[-1]);
2848
ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2852
} else if (cand_up_right) {
2853
vec0 = (v16u8) __msa_fill_b(top[32]);
2855
ST_UB2(vec0, vec0, top, 16);
2859
ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2867
vec0 = (v16u8) __msa_ldi_b(128);
2869
ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2870
ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2875
vec0 = (v16u8) __msa_fill_b(left[32]);
2877
ST_UB2(vec0, vec0, left, 16);
2879
if (!cand_up_left) {
2883
vec0 = (v16u8) __msa_fill_b(left[-1]);
2885
ST_UB2(vec0, vec0, top, 16);
2887
if (!cand_up_right) {
2888
vec0 = (v16u8) __msa_fill_b(top[31]);
2890
ST_UB2(vec0, vec0, (top + 32), 16);
2896
if (!s->sps->intra_smoothing_disabled_flag
2897
&& (c_idx == 0 || s->sps->chroma_format_idc == 3)) {
2898
if (mode != INTRA_DC && 32 != 4) {
2899
int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2900
int min_dist_vert_hor =
2901
(((((int) (mode - 26U)) >=
2902
0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2903
((((int) (mode - 10U)) >=
2904
0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2905
? ((((int) (mode - 10U)) >=
2906
0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2907
: ((((int) (mode - 26U)) >=
2908
0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2909
if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
2910
int threshold = 1 << (8 - 5);
2911
if (s->sps->sps_strong_intra_smoothing_enable_flag
2913
&& ((top[-1] + top[63] - 2 * top[31]) >=
2914
0 ? (top[-1] + top[63] -
2915
2 * top[31]) : (-(top[-1] + top[63] -
2916
2 * top[31]))) < threshold
2917
&& ((left[-1] + left[63] - 2 * left[31]) >=
2918
0 ? (left[-1] + left[63] -
2919
2 * left[31]) : (-(left[-1] + left[63] -
2920
2 * left[31]))) < threshold) {
2923
filtered_top[-1] = top[-1];
2924
filtered_top[63] = top[63];
2927
for (i = 0; i < 63; i++) {
2929
((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6;
2932
tmp0 = __msa_fill_h(top[-1]);
2933
tmp1 = __msa_fill_h(top[63]);
2935
tmp2 = mul_val0 - 8;
2936
tmp3 = mul_val0 - 16;
2937
tmp4 = mul_val0 - 24;
2938
tmp5 = mul_val1 + 8;
2939
tmp6 = mul_val1 + 16;
2940
tmp7 = mul_val1 + 24;
2942
res0 = mul_val0 * tmp0;
2946
res0 += mul_val1 * tmp1;
2947
res1 += tmp5 * tmp1;
2948
res2 += tmp6 * tmp1;
2949
res3 += tmp7 * tmp1;
2951
res0 = __msa_srari_h(res0, 6);
2952
res1 = __msa_srari_h(res1, 6);
2953
res2 = __msa_srari_h(res2, 6);
2954
res3 = __msa_srari_h(res3, 6);
2956
vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2957
vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2959
ST_UB2(vec0, vec1, filtered_top, 16);
2961
res0 = mul_val0 - 32;
2962
tmp2 = mul_val0 - 40;
2963
tmp3 = mul_val0 - 48;
2964
tmp4 = mul_val0 - 56;
2965
res3 = mul_val1 + 32;
2966
tmp5 = mul_val1 + 40;
2967
tmp6 = mul_val1 + 48;
2968
tmp7 = mul_val1 + 56;
2973
res0 += res3 * tmp1;
2975
res1 += tmp5 * tmp1;
2976
res2 += tmp6 * tmp1;
2977
res3 += tmp7 * tmp1;
2979
res0 = __msa_srari_h(res0, 6);
2980
res1 = __msa_srari_h(res1, 6);
2981
res2 = __msa_srari_h(res2, 6);
2982
res3 = __msa_srari_h(res3, 6);
2984
vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2985
vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2987
ST_UB2(vec0, vec1, (filtered_top + 32), 16);
2989
filtered_top[63] = top[63];
2991
tmp0 = __msa_fill_h(left[-1]);
2992
tmp1 = __msa_fill_h(left[63]);
2994
tmp2 = mul_val0 - 8;
2995
tmp3 = mul_val0 - 16;
2996
tmp4 = mul_val0 - 24;
2997
tmp5 = mul_val1 + 8;
2998
tmp6 = mul_val1 + 16;
2999
tmp7 = mul_val1 + 24;
3001
res0 = mul_val0 * tmp0;
3005
res0 += mul_val1 * tmp1;
3006
res1 += tmp5 * tmp1;
3007
res2 += tmp6 * tmp1;
3008
res3 += tmp7 * tmp1;
3010
res0 = __msa_srari_h(res0, 6);
3011
res1 = __msa_srari_h(res1, 6);
3012
res2 = __msa_srari_h(res2, 6);
3013
res3 = __msa_srari_h(res3, 6);
3015
vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3016
vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3018
ST_UB2(vec0, vec1, left, 16);
3020
res0 = mul_val0 - 32;
3021
tmp2 = mul_val0 - 40;
3022
tmp3 = mul_val0 - 48;
3023
tmp4 = mul_val0 - 56;
3024
res3 = mul_val1 + 32;
3025
tmp5 = mul_val1 + 40;
3026
tmp6 = mul_val1 + 48;
3027
tmp7 = mul_val1 + 56;
3032
res0 += res3 * tmp1;
3034
res1 += tmp5 * tmp1;
3035
res2 += tmp6 * tmp1;
3036
res3 += tmp7 * tmp1;
3038
res0 = __msa_srari_h(res0, 6);
3039
res1 = __msa_srari_h(res1, 6);
3040
res2 = __msa_srari_h(res2, 6);
3041
res3 = __msa_srari_h(res3, 6);
3043
vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3044
vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3046
ST_UB2(vec0, vec1, (left + 32), 16);
3052
filtered_left[2 * 32 - 1] = left[2 * 32 - 1];
3053
filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
3054
for (i = 2 * 32 - 2; i >= 0; i--)
3055
filtered_left[i] = (left[i + 1] + 2 * left[i] +
3056
left[i - 1] + 2) >> 2;
3059
(left[0] + 2 * left[-1] + top[0] + 2) >> 2;
3060
for (i = 2 * 32 - 2; i >= 0; i--)
3061
filtered_top[i] = (top[i + 1] + 2 * top[i] +
3062
top[i - 1] + 2) >> 2;
3063
left = filtered_left;
3072
s->hpc.pred_planar[3] ((uint8_t *) src, (uint8_t *) top,
3073
(uint8_t *) left, stride);
3076
s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
3077
(uint8_t *) left, stride, 5, c_idx);
3080
s->hpc.pred_angular[3] ((uint8_t *) src, (uint8_t *) top,
3081
(uint8_t *) left, stride, c_idx, mode);