/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22
#include "libavutil/aarch64/asm.S"
25
// ff_h264_idct_add_neon: runs two add/sub/shift passes (with a transpose in
// between) over a 4x4 block of 16-bit values read from [x1], applies a
// rounding shift right by 6, then widening-adds four 4-byte rows read from
// [x0] (row step x2) and stores four 4-byte rows back to the same addresses.
// NOTE(review): presumably x0 = destination pixels, x1 = coefficient block,
// x2 = line stride -- confirm against the C prototype.
// NOTE(review): the bare numbers between instructions look like listing
// residue; they skip values (26->30, 52->54, 55->57, 65->70, 73->79), so some
// instructions and the end of the function are probably absent from this view.
function ff_h264_idct_add_neon, export=1
26
// v0..v3 = four rows of four halfwords from [x1]; x1 is not advanced here.
ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1]
30
// ---- first pass ----
// v4 = v0 + v2
add v4.4H, v0.4H, v2.4H
31
// v16 = v1 >> 1 (arithmetic shift)
sshr v16.4H, v1.4H, #1
32
// Overwrite the first 16 bytes that were just loaded with v30, advance x1.
// NOTE(review): v30 is never written in the visible lines; presumably it is
// zeroed by an instruction missing from this view -- TODO confirm.
st1 {v30.8H}, [x1], #16
33
// v17 = v3 >> 1 (arithmetic shift)
sshr v17.4H, v3.4H, #1
34
// Overwrite the second 16 bytes of the block with v30, advance x1 again.
st1 {v30.8H}, [x1], #16
35
// v5 = v0 - v2
sub v5.4H, v0.4H, v2.4H
36
// v6 = v1 + (v3 >> 1)
add v6.4H, v1.4H, v17.4H
37
// v7 = (v1 >> 1) - v3
sub v7.4H, v16.4H, v3.4H
38
// Combine the sums/differences into the four output rows of this pass.
add v0.4H, v4.4H, v6.4H
39
add v1.4H, v5.4H, v7.4H
40
sub v2.4H, v4.4H, v6.4H
41
sub v3.4H, v5.4H, v7.4H
43
// transpose_4x4H is a macro defined outside the visible lines; by its name it
// presumably transposes v0..v3 using v4..v7 as scratch -- TODO confirm.
transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
45
// ---- second pass, interleaved with the loads of the four rows at [x0] ----
// v4 = v0 + v3
add v4.4H, v0.4H, v3.4H
46
// Row at x0 -> v18 lane 0; x0 += x2.
ld1 {v18.S}[0], [x0], x2
47
// v16 = v2 >> 1, v17 = v1 >> 1
sshr v16.4H, v2.4H, #1
48
sshr v17.4H, v1.4H, #1
49
// Next row -> v19 lane 1.
ld1 {v19.S}[1], [x0], x2
50
// v5 = v0 - v3
sub v5.4H, v0.4H, v3.4H
51
// Next row -> v18 lane 1.
ld1 {v18.S}[1], [x0], x2
52
// v6 = (v2 >> 1) + v1
add v6.4H, v16.4H, v1.4H
54
// v7 = v2 - (v1 >> 1)
sub v7.4H, v2.4H, v17.4H
55
// Last row -> v19 lane 0.
ld1 {v19.S}[0], [x0], x2
57
// Rewind x0 by 4 * x2, i.e. back to the first row loaded above.
sub x0, x0, x2, lsl #2
58
// NOTE(review): these 8H operations read the upper halves of v4 and v6, which
// the visible 4H writes leave cleared. v5 and v7 are computed above but never
// consumed in view; listing numbers 53 and 56 are absent, presumably the
// instructions that pack v5/v7 into those upper halves -- TODO confirm.
add v0.8H, v4.8H, v6.8H
59
sub v1.8H, v4.8H, v6.8H
61
// Rounding arithmetic shift right by 6 on every halfword.
srshr v0.8H, v0.8H, #6
62
srshr v1.8H, v1.8H, #6
64
// Add the loaded bytes (zero-extended to 16 bits) to the shifted values.
uaddw v0.8H, v0.8H, v18.8B
65
uaddw v1.8H, v1.8H, v19.8B
70
// Store four 32-bit lanes, one per row, stepping x0 by x2. The lane order
// (v0[0], v1[1], v0[1], v1[0]) mirrors the load order used for v18/v19.
// NOTE(review): v0/v1 hold 16-bit values in the visible lines; presumably they
// are narrowed to bytes by instructions at the missing numbers 66-69 before
// these stores -- TODO confirm.
st1 {v0.S}[0], [x0], x2
71
st1 {v1.S}[1], [x0], x2
72
st1 {v0.S}[1], [x0], x2
73
st1 {v1.S}[0], [x0], x2
79
// ff_h264_idct_dc_add_neon: applies a rounding shift right by 6 to v2, adds it
// to four 4-byte rows read from [x0] (row step x2), then stores four 4-byte
// rows back to the same addresses.
// NOTE(review): presumably x0 = destination pixels and x2 = line stride --
// confirm against the C prototype.
// NOTE(review): the interleaved listing numbers skip 80-83, 91-92 and 98-100,
// so the load of v2, any narrowing step and the function epilogue are
// probably absent from this view.
function ff_h264_idct_dc_add_neon, export=1
84
// NOTE(review): v2 is not loaded anywhere in the visible lines.
srshr v2.8H, v2.8H, #6
85
// Rows 0 and 1 -> v0 lanes 0 and 1; x0 advances by x2 after each load.
ld1 {v0.S}[0], [x0], x2
86
ld1 {v0.S}[1], [x0], x2
87
// v3 = v2 + zero-extended bytes of rows 0/1.
uaddw v3.8H, v2.8H, v0.8B
88
// Rows 2 and 3 -> v1 lanes 0 and 1.
ld1 {v1.S}[0], [x0], x2
89
ld1 {v1.S}[1], [x0], x2
90
// v4 = v2 + zero-extended bytes of rows 2/3.
uaddw v4.8H, v2.8H, v1.8B
93
// Rewind x0 by 4 * x2, back to the first row.
sub x0, x0, x2, lsl #2
94
// Store the four rows from v0/v1.
// NOTE(review): in the visible lines the sums are in v3/v4 while v0/v1 still
// hold the loaded bytes; presumably instructions at the missing numbers 91-92
// narrow v3/v4 back into v0/v1 -- TODO confirm.
st1 {v0.S}[0], [x0], x2
95
st1 {v0.S}[1], [x0], x2
96
st1 {v1.S}[0], [x0], x2
97
st1 {v1.S}[1], [x0], x2
101
// ff_h264_idct_add16_neon: the visible lines set up the addresses of the two
// 4x4 routines and pick one of them into x15.
// NOTE(review): only a fragment is visible (listing numbers jump 104->109,
// 110->114, 114->120, 120->128); loop structure, the instruction that sets the
// flags for the csel, and the call through x15 are not shown here.
function ff_h264_idct_add16_neon, export=1
104
mov x5, x1 // block_offset
109
// movrel is not defined in the visible lines; presumably a macro that loads a
// symbol address. x13 = DC-only routine, x14 = full 4x4 routine.
movrel x13, ff_h264_idct_dc_add_neon
110
movrel x14, ff_h264_idct_add_neon
114
// w3 = byte at x4 + zero-extended w3 (w3 is an index into the table at x4).
ldrb w3, [x4, w3, uxtw]
120
// x15 = x13 (DC-only routine) when the condition is "ne", else x14.
csel x15, x13, x14, ne
128
// ff_h264_idct_add16intra_neon: the visible lines set up the addresses of the
// two 4x4 routines and pick one of them into x15.
// NOTE(review): only a fragment is visible (listing numbers jump 131->136,
// 137->141, 141->145, 145->155); loop structure, the instruction that sets the
// flags for the csel, and the call through x15 are not shown here.
function ff_h264_idct_add16intra_neon, export=1
131
mov x5, x1 // block_offset
136
// movrel is not defined in the visible lines; presumably a macro that loads a
// symbol address. x13 = DC-only routine, x14 = full 4x4 routine.
movrel x13, ff_h264_idct_dc_add_neon
137
movrel x14, ff_h264_idct_add_neon
141
// w3 = byte at x4 + zero-extended w3 (w3 is an index into the table at x4).
ldrb w3, [x4, w3, uxtw]
145
// x15 = x13 (DC-only routine) when the condition is "eq", else x14.
csel x15, x13, x14, eq
155
// ff_h264_idct_add8_neon: the visible lines read two destination pointers
// from the array at x0, offset the block_offset and block pointers past their
// first 16 entries, and per index x10 compute the destination address, the
// block address and which 4x4 routine to use.
// NOTE(review): only a fragment is visible (listing numbers jump 155->159,
// 164->169, 173->175, 176->182, 182->191); the loop labels, flag-setting
// instructions, the call through x20 and the epilogue are not shown here.
// NOTE(review): w19 and x20 are callee-saved under AAPCS64; no save/restore is
// visible in this view -- confirm it exists in the full source.
function ff_h264_idct_add8_neon, export=1
159
ldp x6, x15, [x0] // dest[0], dest[1]
160
// x5 = x1 + 64 bytes: skips 16 entries, given the 4-byte indexing
// (lsl #2) used on x5 below.
add x5, x1, #16*4 // block_offset
161
// x9 = x2 + 512 bytes: skips 16 blocks, given the 32-byte step (lsl #5)
// used on x9 below.
add x9, x2, #16*32 // block
162
mov w19, w3 // stride
163
// movrel is not defined in the visible lines; presumably a macro that loads a
// symbol address. x13 = DC-only routine, x14 = full 4x4 routine.
movrel x13, ff_h264_idct_dc_add_neon
164
movrel x14, ff_h264_idct_add_neon
169
// NOTE(review): x7 is not set in the visible lines; presumably the address of
// the scan8 table -- TODO confirm.
ldrb w3, [x7, x10] // scan8[i]
170
ldrsw x0, [x5, x10, lsl #2] // block_offset[i]
171
ldrb w3, [x4, w3, uxtw] // nnzc[ scan8[i] ]
172
add x0, x0, x6 // block_offset[i] + dst[j-1]
173
add x1, x9, x10, lsl #5 // block + i * 16
175
ldrsh w3, [x1] // block[i*16]
176
// x20 = x13 (DC-only routine) when the condition is "eq", else x14.
csel x20, x13, x14, eq
182
// NOTE(review): x11 is not set in the visible lines.
csel x10, x11, x10, eq // mov x10, #16
191
// idct8x8_cols: one pass of add/sub/shift butterflies over eight 8H vectors
// held in v24..v31, using v16..v19 (and v30/v18 via the va/vb names) as
// temporaries. Results are left in v24..v31.
// NOTE(review): the "pass" argument is not referenced in any visible line, yet
// two near-duplicate sequences appear at listing numbers 195-203 / 207-212 and
// again at 243-250 / 252-259. Presumably these are the two arms of
// conditionals on \pass whose directives are missing from this view (numbers
// 192-194, 204-206, 213, 242, 251, 260+ are absent) -- TODO confirm.
// NOTE(review): "va" and "vb" are not architectural register names; presumably
// .req aliases defined on lines missing from this view. The macro terminator
// is also not visible.
.macro idct8x8_cols pass
195
// ---- first variant of the opening sequence (numbers 195-203) ----
// This variant also loads v30/v31 from [x1] and then overwrites those 32
// bytes with v19 (v19 is not initialised inside the macro).
// v18 = v26 >> 1
sshr v18.8H, v26.8H, #1
196
// v16 = v24 + v28
add v16.8H, v24.8H, v28.8H
197
ld1 {v30.8H, v31.8H}, [x1]
198
st1 {v19.8H}, [x1], #16
199
st1 {v19.8H}, [x1], #16
200
// v17 = v24 - v28
sub v17.8H, v24.8H, v28.8H
201
// v19 = v30 >> 1
sshr v19.8H, v30.8H, #1
202
// v18 = (v26 >> 1) - v30
sub v18.8H, v18.8H, v30.8H
203
// v19 = (v30 >> 1) + v26
add v19.8H, v19.8H, v26.8H
207
// ---- second variant of the opening sequence (numbers 207-212) ----
// Same arithmetic with the roles of v18 and v30 exchanged, and no memory
// access.
sshr v30.8H, v26.8H, #1
208
sshr v19.8H, v18.8H, #1
209
add v16.8H, v24.8H, v28.8H
210
sub v17.8H, v24.8H, v28.8H
211
sub v30.8H, v30.8H, v18.8H
212
add v19.8H, v19.8H, v26.8H
214
// ---- common part (numbers 214-241) ----
// Sums/differences of the temporaries built from the even-numbered registers.
add v26.8H, v17.8H, va.8H
215
sub v28.8H, v17.8H, va.8H
216
add v24.8H, v16.8H, v19.8H
217
sub vb.8H, v16.8H, v19.8H
218
// Temporaries built from the odd-numbered registers v25/v27/v29/v31.
sub v16.8H, v29.8H, v27.8H
219
add v17.8H, v31.8H, v25.8H
220
sub va.8H, v31.8H, v25.8H
221
add v19.8H, v29.8H, v27.8H
222
sub v16.8H, v16.8H, v31.8H
223
sub v17.8H, v17.8H, v27.8H
224
add va.8H, va.8H, v29.8H
225
add v19.8H, v19.8H, v25.8H
226
// Halve the four odd-numbered inputs, then apply them a second time with the
// same signs as above.
sshr v25.8H, v25.8H, #1
227
sshr v27.8H, v27.8H, #1
228
sshr v29.8H, v29.8H, #1
229
sshr v31.8H, v31.8H, #1
230
sub v16.8H, v16.8H, v31.8H
231
sub v17.8H, v17.8H, v27.8H
232
add va.8H, va.8H, v29.8H
233
add v19.8H, v19.8H, v25.8H
234
// Quarter-scaled copies (>> 2) of the four temporaries ...
sshr v25.8H, v16.8H, #2
235
sshr v27.8H, v17.8H, #2
236
sshr v29.8H, va.8H, #2
237
sshr v31.8H, v19.8H, #2
238
// ... cross-combined back into the temporaries.
sub v19.8H, v19.8H, v25.8H
239
sub va.8H, v27.8H, va.8H
240
add v17.8H, v17.8H, v29.8H
241
add v16.8H, v16.8H, v31.8H
243
// ---- first variant of the closing sequence (numbers 243-250) ----
// Final sums/differences producing v24..v31; this variant reads v18 and v30
// in the opposite roles from the second variant below.
sub v31.8H, v24.8H, v19.8H
244
add v24.8H, v24.8H, v19.8H
245
add v25.8H, v26.8H, v18.8H
246
sub v18.8H, v26.8H, v18.8H
247
add v26.8H, v28.8H, v17.8H
248
add v27.8H, v30.8H, v16.8H
249
sub v29.8H, v28.8H, v17.8H
250
sub v28.8H, v30.8H, v16.8H
252
// ---- second variant of the closing sequence (numbers 252-259) ----
sub v31.8H, v24.8H, v19.8H
253
add v24.8H, v24.8H, v19.8H
254
add v25.8H, v26.8H, v30.8H
255
sub v30.8H, v26.8H, v30.8H
256
add v26.8H, v28.8H, v17.8H
257
sub v29.8H, v28.8H, v17.8H
258
add v27.8H, v18.8H, v16.8H
259
sub v28.8H, v18.8H, v16.8H
265
// ff_h264_idct8_add_neon: loads 8H vectors from [x1] (overwriting them with
// v19 as it goes), transposes, applies a rounding shift right by 6 to
// v24..v31, widening-adds eight 8-byte rows read from [x0] (row step x2) and
// stores eight 8-byte rows through x3.
// NOTE(review): presumably x0 = destination pixels, x1 = coefficient block,
// x2 = line stride -- confirm against the C prototype.
// NOTE(review): the interleaved listing numbers skip values (265->267,
// 275->278, 278->282, 300->302, 302->304, 304->306, ...), so several
// instructions are probably absent from this view. In particular no
// invocation of idct8x8_cols is visible; presumably it is invoked around the
// transpose -- TODO confirm.
function ff_h264_idct8_add_neon, export=1
267
// Load two 8H vectors at a time from [x1], then overwrite the same 32 bytes
// with v19 while advancing x1 by 16 per store.
// NOTE(review): v19 is never written in the visible lines; presumably zeroed
// by an instruction missing from this view -- TODO confirm.
ld1 {v24.8H, v25.8H}, [x1]
268
st1 {v19.8H}, [x1], #16
269
st1 {v19.8H}, [x1], #16
270
ld1 {v26.8H, v27.8H}, [x1]
271
st1 {v19.8H}, [x1], #16
272
st1 {v19.8H}, [x1], #16
273
ld1 {v28.8H, v29.8H}, [x1]
274
st1 {v19.8H}, [x1], #16
275
st1 {v19.8H}, [x1], #16
278
// transpose_8x8H is a macro defined outside the visible lines; by its name it
// presumably transposes the eight listed vectors using v6/v7 as scratch.
// Note that the seventh vector passed is v18, not v30.
transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
282
// Rounding shift right by 6 on v24..v31, interleaved with loading eight
// 8-byte rows from [x0] into v0..v7 (x0 advances by x2 per row).
srshr v24.8H, v24.8H, #6
283
ld1 {v0.8B}, [x0], x2
284
srshr v25.8H, v25.8H, #6
285
ld1 {v1.8B}, [x0], x2
286
srshr v26.8H, v26.8H, #6
287
ld1 {v2.8B}, [x0], x2
288
srshr v27.8H, v27.8H, #6
289
ld1 {v3.8B}, [x0], x2
290
srshr v28.8H, v28.8H, #6
291
ld1 {v4.8B}, [x0], x2
292
srshr v29.8H, v29.8H, #6
293
ld1 {v5.8B}, [x0], x2
294
srshr v30.8H, v30.8H, #6
295
ld1 {v6.8B}, [x0], x2
296
srshr v31.8H, v31.8H, #6
297
ld1 {v7.8B}, [x0], x2
298
// Add each loaded row (bytes zero-extended to 16 bits) to its shifted vector,
// interleaved with the row stores through x3.
// NOTE(review): x3 is not set in the visible lines; presumably a copy of the
// original x0 -- TODO confirm.
// NOTE(review): the sums land in v24..v31 while the stores write v0..v7;
// presumably narrowing instructions at the missing listing numbers move the
// results into v0..v7 before each store -- TODO confirm.
uaddw v24.8H, v24.8H, v0.8B
299
uaddw v25.8H, v25.8H, v1.8B
300
uaddw v26.8H, v26.8H, v2.8B
302
uaddw v27.8H, v27.8H, v3.8B
304
uaddw v28.8H, v28.8H, v4.8B
306
st1 {v0.8B}, [x3], x2
307
uaddw v29.8H, v29.8H, v5.8B
309
st1 {v1.8B}, [x3], x2
310
uaddw v30.8H, v30.8H, v6.8B
312
st1 {v2.8B}, [x3], x2
313
uaddw v31.8H, v31.8H, v7.8B
315
st1 {v3.8B}, [x3], x2
318
st1 {v4.8B}, [x3], x2
319
st1 {v5.8B}, [x3], x2
320
st1 {v6.8B}, [x3], x2
321
st1 {v7.8B}, [x3], x2
327
// ff_h264_idct8_dc_add_neon: applies a rounding shift right by 6 to v31, adds
// it to eight 8-byte rows read from [x0] (row step x2), then stores eight
// 8-byte rows back to the same addresses.
// NOTE(review): presumably x0 = destination pixels and x2 = line stride --
// confirm against the C prototype.
// NOTE(review): the interleaved listing numbers skip 328-331, 349-352, 355,
// 357, 359, 361 and 366-368, so the load of v31, any narrowing step and the
// function epilogue are probably absent from this view.
function ff_h264_idct8_dc_add_neon, export=1
332
// Load row 0; x0 advances by x2 after every row load below.
ld1 {v0.8B}, [x0], x2
333
// NOTE(review): v31 is not loaded anywhere in the visible lines.
srshr v31.8H, v31.8H, #6
334
ld1 {v1.8B}, [x0], x2
335
ld1 {v2.8B}, [x0], x2
336
// v24..v30 = v31 + zero-extended bytes of rows 0..6, interleaved with the
// remaining row loads.
uaddw v24.8H, v31.8H, v0.8B
337
ld1 {v3.8B}, [x0], x2
338
uaddw v25.8H, v31.8H, v1.8B
339
ld1 {v4.8B}, [x0], x2
340
uaddw v26.8H, v31.8H, v2.8B
341
ld1 {v5.8B}, [x0], x2
342
uaddw v27.8H, v31.8H, v3.8B
343
ld1 {v6.8B}, [x0], x2
344
uaddw v28.8H, v31.8H, v4.8B
345
ld1 {v7.8B}, [x0], x2
346
uaddw v29.8H, v31.8H, v5.8B
347
uaddw v30.8H, v31.8H, v6.8B
348
// Row 7 is done last because this overwrites v31 itself.
uaddw v31.8H, v31.8H, v7.8B
353
// Rewind x0 by 8 * x2, back to the first row.
sub x0, x0, x2, lsl #3
354
// Store the eight rows from v0..v7.
// NOTE(review): in the visible lines the sums are in v24..v31 while v0..v7
// still hold the loaded bytes; presumably narrowing instructions at the
// missing listing numbers move the results into v0..v7 -- TODO confirm.
st1 {v0.8B}, [x0], x2
356
st1 {v1.8B}, [x0], x2
358
st1 {v2.8B}, [x0], x2
360
st1 {v3.8B}, [x0], x2
362
st1 {v4.8B}, [x0], x2
363
st1 {v5.8B}, [x0], x2
364
st1 {v6.8B}, [x0], x2
365
st1 {v7.8B}, [x0], x2
369
// ff_h264_idct8_add4_neon: the visible lines set up the addresses of the two
// 8x8 routines and pick one of them into x15.
// NOTE(review): only a fragment is visible (listing numbers jump 369->377,
// 378->381, 381->387, 387->396); loop structure, the instruction that sets the
// flags for the csel, and the call through x15 are not shown here.
function ff_h264_idct8_add4_neon, export=1
377
// movrel is not defined in the visible lines; presumably a macro that loads a
// symbol address. x13 = DC-only 8x8 routine, x14 = full 8x8 routine.
movrel x13, ff_h264_idct8_dc_add_neon
378
movrel x14, ff_h264_idct8_add_neon
381
// w9 = byte at x4 + zero-extended w9 (w9 is an index into the table at x4).
ldrb w9, [x4, w9, UXTW]
387
// x15 = x13 (DC-only routine) when the condition is "ne", else x14.
csel x15, x13, x14, ne
396
// Table of 48 byte-sized indices, each written as column + row*8 with
// columns 4..7. Every pair of .byte lines covers one pair of rows:
// 1/2, 3/4, 6/7, 8/9, 11/12, 13/14.
// NOTE(review): the label for this table is not visible in this view;
// presumably it is the scan8 table referenced by the "scan8[i]" loads in the
// functions above -- TODO confirm.
.byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
397
.byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
398
.byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
399
.byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
400
.byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
401
.byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
402
.byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
403
.byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
404
.byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
405
.byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
406
.byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
407
.byte 6+13*8, 7+13*8, 6+14*8, 7+14*8