~ubuntu-branches/debian/experimental/libav/experimental

« back to all changes in this revision

Viewing changes to libavcodec/aarch64/h264idct_neon.S

  • Committer: Package Import Robot
  • Author(s): Reinhard Tartler
  • Date: 2014-01-18 15:46:55 UTC
  • mfrom: (1.1.24)
  • Revision ID: package-import@ubuntu.com-20140118154655-iz6u00yevkat1jqi
Tags: 6:10~alpha2-1
New upstream release 10_alpha2. This upstream git snapshot has too many
changes to list here; see the upstream Changelog:
http://git.libav.org/?p=libav.git;a=blob;f=Changelog;hb=refs/tags/v10_alpha2

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 
3
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 
4
 *
 
5
 * This file is part of Libav.
 
6
 *
 
7
 * Libav is free software; you can redistribute it and/or
 
8
 * modify it under the terms of the GNU Lesser General Public
 
9
 * License as published by the Free Software Foundation; either
 
10
 * version 2.1 of the License, or (at your option) any later version.
 
11
 *
 
12
 * Libav is distributed in the hope that it will be useful,
 
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 
15
 * Lesser General Public License for more details.
 
16
 *
 
17
 * You should have received a copy of the GNU Lesser General Public
 
18
 * License along with Libav; if not, write to the Free Software
 
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 
20
 */
 
21
 
 
22
#include "libavutil/aarch64/asm.S"
 
23
#include "neon.S"
 
24
 
 
25
// ff_h264_idct_add_neon: 4x4 inverse transform of the 16-bit coefficient
// block at x1; the result is rounded (>> 6), added to the 4x4 byte block at
// x0 and stored back with unsigned saturation.
//   x0 = destination pixels, 4 bytes read and written per row; on return x0
//        points 4 rows past its initial value
//   x1 = 16 x int16 coefficients; the 32 bytes are cleared to zero and x1 is
//        restored to its initial value before returning
//   w2 = row stride in bytes (32-bit, sign-extended into x2 here)
// Writes v0-v7, v16-v19 and v30. transpose_4x4H is defined outside this
// block; it is handed v4-v7 as scratch registers.
function ff_h264_idct_add_neon, export=1
 
26
        ld1             {v0.4H, v1.4H, v2.4H, v3.4H},  [x1]    // load all 16 coefficients
 
27
        sxtw            x2,     w2                              // widen stride before using it as an address offset
 
28
        movi            v30.8H, #0                              // zero vector used to clear the block
 
29
 
 
30
        // First 1-D pass: butterflies across v0-v3, lane by lane.
        add             v4.4H,  v0.4H,  v2.4H
 
31
        sshr            v16.4H, v1.4H,  #1
 
32
        st1             {v30.8H},    [x1], #16                  // clear coefficients 0-7
 
33
        sshr            v17.4H, v3.4H,  #1
 
34
        st1             {v30.8H},    [x1], #16                  // clear coefficients 8-15
 
35
        sub             v5.4H,  v0.4H,  v2.4H
 
36
        add             v6.4H,  v1.4H,  v17.4H
 
37
        sub             v7.4H,  v16.4H, v3.4H
 
38
        add             v0.4H,  v4.4H,  v6.4H
 
39
        add             v1.4H,  v5.4H,  v7.4H
 
40
        sub             v2.4H,  v4.4H,  v6.4H
 
41
        sub             v3.4H,  v5.4H,  v7.4H
 
42
 
 
43
        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7
 
44
 
 
45
        // Second 1-D pass, interleaved with the destination pixel loads.
        add             v4.4H,  v0.4H,  v3.4H
 
46
        ld1             {v18.S}[0], [x0], x2                    // pixel row 0
 
47
        sshr            v16.4H,  v2.4H,  #1
 
48
        sshr            v17.4H,  v1.4H,  #1
 
49
        ld1             {v19.S}[1], [x0], x2                    // pixel row 1
 
50
        sub             v5.4H,  v0.4H,  v3.4H
 
51
        ld1             {v18.S}[1], [x0], x2                    // pixel row 2
 
52
        add             v6.4H,  v16.4H, v1.4H
 
53
        ins             v4.D[1],  v5.D[0]                       // pack v4|v5 into one 8H vector
 
54
        sub             v7.4H,  v2.4H,  v17.4H
 
55
        ld1             {v19.S}[0], [x0], x2                    // pixel row 3
 
56
        ins             v6.D[1],  v7.D[0]                       // pack v6|v7 into one 8H vector
 
57
        sub             x0,  x0,  x2, lsl #2                    // rewind x0 to row 0 for the stores
 
58
        add             v0.8H,  v4.8H,  v6.8H                   // result rows 0 (low half) and 2 (high half)
 
59
        sub             v1.8H,  v4.8H,  v6.8H                   // result rows 3 (low half) and 1 (high half)
 
60
 
 
61
        srshr           v0.8H,  v0.8H,  #6                      // rounding shift right by 6
 
62
        srshr           v1.8H,  v1.8H,  #6
 
63
 
 
64
        uaddw           v0.8H,  v0.8H,  v18.8B                  // add pixels of rows 0 and 2
 
65
        uaddw           v1.8H,  v1.8H,  v19.8B                  // add pixels of rows 3 and 1
 
66
 
 
67
        sqxtun          v0.8B, v0.8H                            // saturate to unsigned 8-bit
 
68
        sqxtun          v1.8B, v1.8H
 
69
 
 
70
        // Store in the same lane order the rows were loaded in.
        st1             {v0.S}[0],  [x0], x2
 
71
        st1             {v1.S}[1],  [x0], x2
 
72
        st1             {v0.S}[1],  [x0], x2
 
73
        st1             {v1.S}[0],  [x0], x2
 
74
 
 
75
        sub             x1,  x1,  #32                           // undo the two post-increments of x1
 
76
        ret
 
77
endfunc
 
78
 
 
79
// ff_h264_idct_dc_add_neon: DC-only variant for a 4x4 block. The first 16-bit
// coefficient at x1 is broadcast, rounded (>> 6), added to every pixel of the
// 4x4 byte block at x0 and stored back with unsigned saturation.
//   x0 = destination pixels, 4 bytes per row; on return x0 points 4 rows
//        past its initial value
//   x1 = coefficient block; only the first halfword is read, then set to 0
//   w2 = row stride in bytes (32-bit, sign-extended into x2 here)
// Writes w3 and v0-v4.
function ff_h264_idct_dc_add_neon, export=1
 
80
        sxtw            x2,  w2                 // widen stride before using it as an address offset
 
81
        mov             w3,       #0
 
82
        ld1r            {v2.8H},  [x1]          // broadcast the DC coefficient to all 8 lanes
 
83
        strh            w3,       [x1]          // clear the DC coefficient in memory
 
84
        srshr           v2.8H,  v2.8H,  #6      // rounding shift right by 6
 
85
        ld1             {v0.S}[0],  [x0], x2    // pixel row 0
 
86
        ld1             {v0.S}[1],  [x0], x2    // pixel row 1
 
87
        uaddw           v3.8H,  v2.8H,  v0.8B   // rows 0-1 plus DC
 
88
        ld1             {v1.S}[0],  [x0], x2    // pixel row 2
 
89
        ld1             {v1.S}[1],  [x0], x2    // pixel row 3
 
90
        uaddw           v4.8H,  v2.8H,  v1.8B   // rows 2-3 plus DC
 
91
        sqxtun          v0.8B,  v3.8H           // saturate to unsigned 8-bit
 
92
        sqxtun          v1.8B,  v4.8H
 
93
        sub             x0,  x0,  x2, lsl #2    // rewind x0 to row 0 for the stores
 
94
        st1             {v0.S}[0],  [x0], x2
 
95
        st1             {v0.S}[1],  [x0], x2
 
96
        st1             {v1.S}[0],  [x0], x2
 
97
        st1             {v1.S}[1],  [x0], x2
 
98
        ret
 
99
endfunc
 
100
 
 
101
// ff_h264_idct_add16_neon: walks 16 consecutive 4x4 coefficient blocks and
// dispatches each one to ff_h264_idct_dc_add_neon or ff_h264_idct_add_neon.
//   x0 = dest base pointer
//   x1 = block_offset, an array of 32-bit offsets added to dest
//   x2 = coefficient blocks, 32 bytes apart
//   w3 = stride
//   x4 = byte table indexed by scan8[i] (presumably the per-block count of
//        non-zero coefficients -- confirm against the caller)
// For block i, with n = x4[scan8[i]]:
//   n == 0                  -> block skipped
//   n == 1 and block[0] != 0 -> ff_h264_idct_dc_add_neon
//   otherwise               -> ff_h264_idct_add_neon
// The return address is kept in x12, so the called routines must leave
// x4-x7, x9, x10 and x12-x14 untouched, and must leave x1 at its entry value.
function ff_h264_idct_add16_neon, export=1
 
102
        mov             x12, x30        // save return address; blr overwrites x30
 
103
        mov             x6,  x0         // dest
 
104
        mov             x5,  x1         // block_offset
 
105
        mov             x1,  x2         // block
 
106
        mov             w9,  w3         // stride
 
107
        movrel          x7,  scan8
 
108
        mov             x10, #16        // loop counter: 16 blocks
 
109
        movrel          x13, ff_h264_idct_dc_add_neon
 
110
        movrel          x14, ff_h264_idct_add_neon
 
111
1:      mov             w2,  w9         // stride argument for the callee
 
112
        ldrb            w3,  [x7], #1   // scan8[i]
 
113
        ldrsw           x0,  [x5], #4   // block_offset[i]
 
114
        ldrb            w3,  [x4,  w3,  uxtw]   // n = table[scan8[i]]
 
115
        subs            w3,  w3,  #1    // flags from n - 1
 
116
        b.lt            2f              // n == 0: nothing to add
 
117
        ldrsh           w3,  [x1]       // block[0]; load leaves the flags alone
 
118
        add             x0,  x0,  x6    // dest + block_offset[i]
 
119
        ccmp            w3,  #0,  #4,  eq       // n == 1: compare block[0] with 0; else force Z=1
 
120
        csel            x15, x13, x14, ne       // ne only when n == 1 and block[0] != 0 -> dc_add
 
121
        blr             x15
 
122
2:      subs            x10, x10, #1
 
123
        add             x1,  x1,  #32   // next block (16 x int16); does not touch the flags
 
124
        b.ne            1b
 
125
        ret             x12
 
126
endfunc
 
127
 
 
128
// ff_h264_idct_add16intra_neon: same loop shape as a 16-block walk over 4x4
// coefficient blocks, but with a different dispatch rule.
//   x0 = dest base pointer
//   x1 = block_offset, an array of 32-bit offsets added to dest
//   x2 = coefficient blocks, 32 bytes apart
//   w3 = stride
//   x4 = byte table indexed by scan8[i] (presumably the per-block count of
//        non-zero coefficients -- confirm against the caller)
// For block i, with n = x4[scan8[i]]:
//   n != 0                  -> ff_h264_idct_add_neon
//   n == 0 and block[0] != 0 -> ff_h264_idct_dc_add_neon
//   n == 0 and block[0] == 0 -> block skipped
// The return address is kept in x12, so the called routines must leave
// x4-x7, x9, x10 and x12-x14 untouched, and must leave x1 at its entry value.
function ff_h264_idct_add16intra_neon, export=1
 
129
        mov             x12, x30        // save return address; blr overwrites x30
 
130
        mov             x6,  x0         // dest
 
131
        mov             x5,  x1         // block_offset
 
132
        mov             x1,  x2         // block
 
133
        mov             w9,  w3         // stride
 
134
        movrel          x7,  scan8
 
135
        mov             x10, #16        // loop counter: 16 blocks
 
136
        movrel          x13, ff_h264_idct_dc_add_neon
 
137
        movrel          x14, ff_h264_idct_add_neon
 
138
1:      mov             w2,  w9         // stride argument for the callee
 
139
        ldrb            w3,  [x7], #1   // scan8[i]
 
140
        ldrsw           x0,  [x5], #4   // block_offset[i]
 
141
        ldrb            w3,  [x4,  w3,  uxtw]   // n = table[scan8[i]]
 
142
        add             x0,  x0,  x6    // dest + block_offset[i]
 
143
        cmp             w3,  #0         // eq when n == 0
 
144
        ldrsh           w3,  [x1]       // block[0]; load leaves the flags alone
 
145
        csel            x15, x13, x14, eq       // n == 0 -> dc_add, else full idct_add
 
146
        ccmp            w3,  #0,  #0,  eq       // n == 0: compare block[0] with 0; else force ne
 
147
        b.eq            2f              // n == 0 and block[0] == 0: nothing to add
 
148
        blr             x15
 
149
2:      subs            x10, x10, #1
 
150
        add             x1,  x1,  #32   // next block (16 x int16); does not touch the flags
 
151
        b.ne            1b
 
152
        ret             x12
 
153
endfunc
 
154
 
 
155
// ff_h264_idct_add8_neon: processes two groups of four 4x4 blocks, one group
// per destination pointer.
//   x0 = pointer to two destination pointers (dest[0], dest[1])
//   x1 = block_offset, an array of 32-bit offsets
//   x2 = coefficient blocks, 32 bytes apart
//   w3 = stride
//   x4 = nnzc byte table indexed by scan8[i]
// The bases x5, x9 and x7 are pre-advanced by 16 entries, and x10 runs
// 0..3 and then 16..19, so block indices 16..19 go to dest[0] and 32..35 to
// dest[1]. Dispatch per block, with n = nnzc[scan8[i]]:
//   n != 0                   -> ff_h264_idct_add_neon
//   n == 0 and block[0] != 0  -> ff_h264_idct_dc_add_neon
//   n == 0 and block[0] == 0  -> block skipped
// x19/x20 are saved in the first 16 bytes of a 0x40-byte stack frame. The
// return address is kept in x12; callees must preserve x4-x7, x9-x15.
function ff_h264_idct_add8_neon, export=1
 
156
        sub             sp,  sp, #0x40
 
157
        stp             x19, x20, [sp]          // callee-saved registers used below
 
158
        mov             x12, x30                // save return address; blr overwrites x30
 
159
        ldp             x6,  x15, [x0]          // dest[0], dest[1]
 
160
        add             x5,  x1,  #16*4         // block_offset
 
161
        add             x9,  x2,  #16*32        // block
 
162
        mov             w19, w3                 // stride
 
163
        movrel          x13, ff_h264_idct_dc_add_neon
 
164
        movrel          x14, ff_h264_idct_add_neon
 
165
        movrel          x7,  scan8+16
 
166
        mov             x10, #0                 // index relative to the +16 bases
 
167
        mov             x11, #16                // index to jump to after the first group
 
168
1:      mov             w2,  w19                // stride argument for the callee
 
169
        ldrb            w3,  [x7, x10]          // scan8[i]
 
170
        ldrsw           x0,  [x5, x10, lsl #2]  // block_offset[i]
 
171
        ldrb            w3,  [x4, w3,  uxtw]    // nnzc[ scan8[i] ]
 
172
        add             x0,  x0,  x6            // block_offset[i] + dst[j-1]
 
173
        add             x1,  x9,  x10, lsl #5   // block + i * 16
 
174
        cmp             w3,  #0                 // eq when nnzc == 0
 
175
        ldrsh           w3,  [x1]               // block[i*16]
 
176
        csel            x20, x13, x14, eq       // nnzc == 0 -> dc_add, else full idct_add
 
177
        ccmp            w3,  #0,  #0,  eq       // nnzc == 0: compare block[0] with 0; else force ne
 
178
        b.eq            2f                      // nnzc == 0 and block[0] == 0: skip
 
179
        blr             x20
 
180
2:      add             x10, x10, #1
 
181
        cmp             x10, #4                 // finished the first group of four?
 
182
        csel            x10, x11, x10, eq     // mov x10, #16
 
183
        csel            x6,  x15, x6,  eq       // and switch to dest[1]
 
184
        cmp             x10, #20
 
185
        b.lt            1b
 
186
        ldp             x19, x20, [sp]
 
187
        add             sp,  sp,  #0x40
 
188
        ret             x12
 
189
endfunc
 
190
 
 
191
// idct8x8_cols pass: one 1-D pass of the 8x8 inverse transform on 8H vectors.
//   pass == 0: inputs are v24-v29 plus two vectors loaded here from [x1] into
//              v30/v31. Those 32 bytes are then overwritten with v19, which
//              is stored before this macro writes it, so the caller must have
//              set it (ff_h264_idct8_add_neon zeroes it). x1 advances by 32.
//              Results end up in v24-v29, v18 and v31.
//   otherwise: inputs are v24-v29, v18 and v31; results end up in v24-v31.
// The aliases va/vb swap between v18 and v30 so that both passes can share
// the common middle section. v16, v17 and v19 are scratch.
.macro  idct8x8_cols    pass
 
192
  .if \pass == 0
 
193
        va      .req    v18
 
194
        vb      .req    v30
 
195
        sshr            v18.8H, v26.8H, #1
 
196
        add             v16.8H, v24.8H, v28.8H
 
197
        ld1             {v30.8H, v31.8H}, [x1]          // last two input vectors
 
198
        st1             {v19.8H}, [x1],  #16            // overwrite them with v19
 
199
        st1             {v19.8H}, [x1],  #16
 
200
        sub             v17.8H,  v24.8H, v28.8H
 
201
        sshr            v19.8H,  v30.8H, #1             // v19 is scratch from here on
 
202
        sub             v18.8H,  v18.8H,  v30.8H
 
203
        add             v19.8H,  v19.8H,  v26.8H
 
204
  .else
 
205
        va      .req    v30
 
206
        vb      .req    v18
 
207
        sshr            v30.8H, v26.8H, #1
 
208
        sshr            v19.8H, v18.8H, #1
 
209
        add             v16.8H, v24.8H, v28.8H
 
210
        sub             v17.8H, v24.8H, v28.8H
 
211
        sub             v30.8H, v30.8H, v18.8H
 
212
        add             v19.8H, v19.8H, v26.8H
 
213
  .endif
 
214
        // Shared section: even-part butterflies, written through va/vb.
        add             v26.8H, v17.8H, va.8H
 
215
        sub             v28.8H, v17.8H, va.8H
 
216
        add             v24.8H, v16.8H, v19.8H
 
217
        sub             vb.8H,  v16.8H, v19.8H
 
218
        // Odd part: combinations of v25, v27, v29 and v31.
        sub             v16.8H, v29.8H, v27.8H
 
219
        add             v17.8H, v31.8H, v25.8H
 
220
        sub             va.8H,  v31.8H, v25.8H
 
221
        add             v19.8H, v29.8H, v27.8H
 
222
        sub             v16.8H, v16.8H, v31.8H
 
223
        sub             v17.8H, v17.8H, v27.8H
 
224
        add             va.8H,  va.8H,  v29.8H
 
225
        add             v19.8H, v19.8H, v25.8H
 
226
        sshr            v25.8H, v25.8H, #1
 
227
        sshr            v27.8H, v27.8H, #1
 
228
        sshr            v29.8H, v29.8H, #1
 
229
        sshr            v31.8H, v31.8H, #1
 
230
        sub             v16.8H, v16.8H, v31.8H
 
231
        sub             v17.8H, v17.8H, v27.8H
 
232
        add             va.8H,  va.8H,  v29.8H
 
233
        add             v19.8H, v19.8H, v25.8H
 
234
        sshr            v25.8H, v16.8H, #2
 
235
        sshr            v27.8H, v17.8H, #2
 
236
        sshr            v29.8H, va.8H,  #2
 
237
        sshr            v31.8H, v19.8H, #2
 
238
        sub             v19.8H, v19.8H, v25.8H
 
239
        sub             va.8H,  v27.8H, va.8H
 
240
        add             v17.8H, v17.8H, v29.8H
 
241
        add             v16.8H, v16.8H, v31.8H
 
242
        // Final combine; the two branches differ only in which of v18/v30
        // plays the role of va and vb.
  .if \pass == 0
 
243
        sub             v31.8H, v24.8H, v19.8H
 
244
        add             v24.8H, v24.8H, v19.8H
 
245
        add             v25.8H, v26.8H, v18.8H
 
246
        sub             v18.8H, v26.8H, v18.8H
 
247
        add             v26.8H, v28.8H, v17.8H
 
248
        add             v27.8H, v30.8H, v16.8H
 
249
        sub             v29.8H, v28.8H, v17.8H
 
250
        sub             v28.8H, v30.8H, v16.8H
 
251
  .else
 
252
        sub             v31.8H, v24.8H, v19.8H
 
253
        add             v24.8H, v24.8H, v19.8H
 
254
        add             v25.8H, v26.8H, v30.8H
 
255
        sub             v30.8H, v26.8H, v30.8H
 
256
        add             v26.8H, v28.8H, v17.8H
 
257
        sub             v29.8H, v28.8H, v17.8H
 
258
        add             v27.8H, v18.8H, v16.8H
 
259
        sub             v28.8H, v18.8H, v16.8H
 
260
  .endif
 
261
        .unreq          va
 
262
        .unreq          vb
 
263
.endm
 
264
 
 
265
// ff_h264_idct8_add_neon: 8x8 inverse transform of the 16-bit coefficient
// block at x1; the result is rounded (>> 6), added to the 8x8 byte block at
// x0 and stored back with unsigned saturation.
//   x0 = destination pixels, 8 bytes read and written per row; on return x0
//        points 8 rows past its initial value
//   x1 = 64 x int16 coefficients; the 128 bytes are cleared to zero and x1 is
//        restored to its initial value before returning
//   w2 = row stride in bytes. Only the low 32 bits are meaningful on entry,
//        so the value is sign-extended into x2 before it is used as a 64-bit
//        post-index offset (as the other leaf routines in this file do).
// Writes x2, x3, v0-v7, v16-v19 and v24-v31. transpose_8x8H is defined
// outside this block; it is handed v6/v7 as scratch registers.
function ff_h264_idct8_add_neon, export=1
        movi            v19.8H,   #0
        sxtw            x2,       w2            // upper half of x2 is not defined for an int argument
        // Load the first six coefficient vectors and clear them behind us;
        // idct8x8_cols pass 0 loads and clears the remaining two.
        ld1             {v24.8H, v25.8H}, [x1]
        st1             {v19.8H},  [x1],   #16
        st1             {v19.8H},  [x1],   #16
        ld1             {v26.8H, v27.8H}, [x1]
        st1             {v19.8H},  [x1],   #16
        st1             {v19.8H},  [x1],   #16
        ld1             {v28.8H, v29.8H}, [x1]
        st1             {v19.8H},  [x1],   #16
        st1             {v19.8H},  [x1],   #16

        idct8x8_cols    0
        // Pass 0 leaves its seventh result in v18 rather than v30.
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
        idct8x8_cols    1

        // Round, add the destination pixels and saturate. Loads walk x0,
        // stores walk the copy in x3; both are interleaved with the math.
        mov             x3,  x0
        srshr           v24.8H, v24.8H, #6
        ld1             {v0.8B},     [x0], x2
        srshr           v25.8H, v25.8H, #6
        ld1             {v1.8B},     [x0], x2
        srshr           v26.8H, v26.8H, #6
        ld1             {v2.8B},     [x0], x2
        srshr           v27.8H, v27.8H, #6
        ld1             {v3.8B},     [x0], x2
        srshr           v28.8H, v28.8H, #6
        ld1             {v4.8B},     [x0], x2
        srshr           v29.8H, v29.8H, #6
        ld1             {v5.8B},     [x0], x2
        srshr           v30.8H, v30.8H, #6
        ld1             {v6.8B},     [x0], x2
        srshr           v31.8H, v31.8H, #6
        ld1             {v7.8B},     [x0], x2
        uaddw           v24.8H, v24.8H, v0.8B
        uaddw           v25.8H, v25.8H, v1.8B
        uaddw           v26.8H, v26.8H, v2.8B
        sqxtun          v0.8B,  v24.8H
        uaddw           v27.8H, v27.8H, v3.8B
        sqxtun          v1.8B,  v25.8H
        uaddw           v28.8H, v28.8H, v4.8B
        sqxtun          v2.8B,  v26.8H
        st1             {v0.8B},     [x3], x2
        uaddw           v29.8H, v29.8H, v5.8B
        sqxtun          v3.8B,  v27.8H
        st1             {v1.8B},     [x3], x2
        uaddw           v30.8H, v30.8H, v6.8B
        sqxtun          v4.8B,  v28.8H
        st1             {v2.8B},     [x3], x2
        uaddw           v31.8H, v31.8H, v7.8B
        sqxtun          v5.8B,  v29.8H
        st1             {v3.8B},     [x3], x2
        sqxtun          v6.8B,  v30.8H
        sqxtun          v7.8B,  v31.8H
        st1             {v4.8B},     [x3], x2
        st1             {v5.8B},     [x3], x2
        st1             {v6.8B},     [x3], x2
        st1             {v7.8B},     [x3], x2

        sub             x1,  x1,  #128          // undo the eight 16-byte post-increments of x1
        ret
endfunc
 
326
 
 
327
// ff_h264_idct8_dc_add_neon: DC-only variant for an 8x8 block. The first
// 16-bit coefficient at x1 is broadcast, rounded (>> 6), added to every pixel
// of the 8x8 byte block at x0 and stored back with unsigned saturation.
//   x0 = destination pixels, 8 bytes per row; on return x0 points 8 rows
//        past its initial value
//   x1 = coefficient block; only the first halfword is read, then set to 0
//   w2 = row stride in bytes (32-bit, sign-extended into x2 here)
// Writes w3, v0-v7 and v24-v31.
function ff_h264_idct8_dc_add_neon, export=1
 
328
        mov             w3,       #0
 
329
        sxtw            x2,       w2            // widen stride before using it as an address offset
 
330
        ld1r            {v31.8H}, [x1]          // broadcast the DC coefficient to all 8 lanes
 
331
        strh            w3,       [x1]          // clear the DC coefficient in memory
 
332
        ld1             {v0.8B},  [x0], x2
 
333
        srshr           v31.8H, v31.8H, #6      // rounding shift right by 6
 
334
        ld1             {v1.8B},     [x0], x2
 
335
        ld1             {v2.8B},     [x0], x2
 
336
        uaddw           v24.8H, v31.8H, v0.8B   // row n pixels plus DC, widened to 16 bits
 
337
        ld1             {v3.8B},     [x0], x2
 
338
        uaddw           v25.8H, v31.8H, v1.8B
 
339
        ld1             {v4.8B},     [x0], x2
 
340
        uaddw           v26.8H, v31.8H, v2.8B
 
341
        ld1             {v5.8B},     [x0], x2
 
342
        uaddw           v27.8H, v31.8H, v3.8B
 
343
        ld1             {v6.8B},     [x0], x2
 
344
        uaddw           v28.8H, v31.8H, v4.8B
 
345
        ld1             {v7.8B},     [x0], x2
 
346
        uaddw           v29.8H, v31.8H, v5.8B
 
347
        uaddw           v30.8H, v31.8H, v6.8B
 
348
        uaddw           v31.8H, v31.8H, v7.8B   // last use of the DC value, so v31 can be overwritten
 
349
        sqxtun          v0.8B,  v24.8H          // saturate to unsigned 8-bit
 
350
        sqxtun          v1.8B,  v25.8H
 
351
        sqxtun          v2.8B,  v26.8H
 
352
        sqxtun          v3.8B,  v27.8H
 
353
        sub             x0,  x0,  x2, lsl #3    // rewind x0 to row 0 for the stores
 
354
        st1             {v0.8B},     [x0], x2
 
355
        sqxtun          v4.8B,  v28.8H
 
356
        st1             {v1.8B},     [x0], x2
 
357
        sqxtun          v5.8B,  v29.8H
 
358
        st1             {v2.8B},     [x0], x2
 
359
        sqxtun          v6.8B,  v30.8H
 
360
        st1             {v3.8B},     [x0], x2
 
361
        sqxtun          v7.8B,  v31.8H
 
362
        st1             {v4.8B},     [x0], x2
 
363
        st1             {v5.8B},     [x0], x2
 
364
        st1             {v6.8B},     [x0], x2
 
365
        st1             {v7.8B},     [x0], x2
 
366
        ret
 
367
endfunc
 
368
 
 
369
// ff_h264_idct8_add4_neon: walks four 8x8 coefficient blocks and dispatches
// each one to ff_h264_idct8_dc_add_neon or ff_h264_idct8_add_neon.
//   x0 = dest base pointer
//   x1 = array of 32-bit offsets added to dest; every 4th entry is used
//   x2 = coefficient blocks, 128 bytes apart
//   w3 = stride
//   x4 = byte table indexed by scan8[i] (presumably the per-block count of
//        non-zero coefficients -- confirm against the caller)
// scan8 is likewise read at every 4th entry. For each block, with
// n = x4[scan8[i]]:
//   n == 0                  -> block skipped
//   n == 1 and block[0] != 0 -> ff_h264_idct8_dc_add_neon
//   otherwise               -> ff_h264_idct8_add_neon
// The stride is placed in w2 once, before the loop, and the return address
// is kept in x12; callees must therefore preserve x2, x4-x7, x10 and x12-x14
// and must leave x1 at its entry value.
function ff_h264_idct8_add4_neon, export=1
 
370
        mov             x12, x30        // save return address; blr overwrites x30
 
371
        mov             x6,  x0         // dest
 
372
        mov             x5,  x1         // offset array
 
373
        mov             x1,  x2         // first coefficient block
 
374
        mov             w2,  w3         // stride argument for the callees
 
375
        movrel          x7,  scan8
 
376
        mov             w10, #16        // counts down by 4: four iterations
 
377
        movrel          x13, ff_h264_idct8_dc_add_neon
 
378
        movrel          x14, ff_h264_idct8_add_neon
 
379
1:      ldrb            w9,  [x7], #4   // scan8[i], i = 0, 4, 8, 12
 
380
        ldrsw           x0,  [x5], #16  // offset[i]; 16 bytes = four 32-bit entries
 
381
        ldrb            w9,  [x4, w9, UXTW]     // n = table[scan8[i]]
 
382
        subs            w9,  w9,  #1    // flags from n - 1
 
383
        b.lt            2f              // n == 0: nothing to add
 
384
        ldrsh           w11,  [x1]      // block[0]; load leaves the flags alone
 
385
        add             x0,  x6,  x0    // dest + offset[i]
 
386
        ccmp            w11, #0,  #4,  eq       // n == 1: compare block[0] with 0; else force Z=1
 
387
        csel            x15, x13, x14, ne       // ne only when n == 1 and block[0] != 0 -> dc_add
 
388
        blr             x15
 
389
2:      subs            w10, w10, #4
 
390
        add             x1,  x1,  #128  // next block (64 x int16); does not touch the flags
 
391
        b.ne            1b
 
392
        ret             x12
 
393
endfunc
 
394
 
 
395
// scan8: 48-byte lookup table, four entries per line below. Every entry is
// written as column + row*8 with columns 4..7. The dispatch loops in this
// file read scan8[i] and use the value as a byte index into the table passed
// in x4. The three groups of 16 entries use rows 1-4, 6-9 and 11-14
// (presumably one group per colour plane -- confirm against the C side).
const   scan8
 
396
        .byte           4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
 
397
        .byte           6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
 
398
        .byte           4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
 
399
        .byte           6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
 
400
        .byte           4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
 
401
        .byte           6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
 
402
        .byte           4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
 
403
        .byte           6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
 
404
        .byte           4+11*8, 5+11*8, 4+12*8, 5+12*8
 
405
        .byte           6+11*8, 7+11*8, 6+12*8, 7+12*8
 
406
        .byte           4+13*8, 5+13*8, 4+14*8, 5+14*8
 
407
        .byte           6+13*8, 7+13*8, 6+14*8, 7+14*8
 
408
endconst