~ubuntu-branches/ubuntu/trusty/gst-libav1.0/trusty-proposed

Viewing changes to gst-libs/ext/libav/libavcodec/arm/dsputil_neon.S

  • Committer: Package Import Robot
  • Author(s): Sebastian Dröge
  • Date: 2013-09-24 17:07:00 UTC
  • mfrom: (1.1.17) (7.1.9 experimental)
  • Revision ID: package-import@ubuntu.com-20130924170700-4dg62s3pwl0pdakz
Tags: 1.2.0-1
* New upstream stable release:
  + debian/control:
    - Build depend on GStreamer and gst-plugins-base >= 1.2.0.

@@ -20,9 +20,7 @@
  */
 
 #include "config.h"
-#include "asm.S"
-
-        preserve8
+#include "libavutil/arm/asm.S"
 
 function ff_clear_block_neon, export=1
         vmov.i16        q0,  #0
@@ -44,22 +42,22 @@
   .if \avg
         mov             r12, r0
   .endif
-1:      vld1.64         {q0},     [r1], r2
-        vld1.64         {q1},     [r1], r2
-        vld1.64         {q2},     [r1], r2
+1:      vld1.8          {q0},     [r1], r2
+        vld1.8          {q1},     [r1], r2
+        vld1.8          {q2},     [r1], r2
         pld             [r1, r2, lsl #2]
-        vld1.64         {q3},     [r1], r2
+        vld1.8          {q3},     [r1], r2
         pld             [r1]
         pld             [r1, r2]
         pld             [r1, r2, lsl #1]
   .if \avg
-        vld1.64         {q8},     [r12,:128], r2
+        vld1.8          {q8},     [r12,:128], r2
         vrhadd.u8       q0,  q0,  q8
-        vld1.64         {q9},     [r12,:128], r2
+        vld1.8          {q9},     [r12,:128], r2
         vrhadd.u8       q1,  q1,  q9
-        vld1.64         {q10},    [r12,:128], r2
+        vld1.8          {q10},    [r12,:128], r2
         vrhadd.u8       q2,  q2,  q10
-        vld1.64         {q11},    [r12,:128], r2
+        vld1.8          {q11},    [r12,:128], r2
         vrhadd.u8       q3,  q3,  q11
   .endif
         subs            r3,  r3,  #4
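
For readers who do not think in NEON: the hunk above appears to be the inner loop of the plain 16-pixel put/avg copy. A rough scalar equivalent, as I read the assembly (the function name and the avg flag below are illustrative, not FFmpeg's actual C interface), is:

#include <stdint.h>
#include <stddef.h>

/* Each loop iteration above handles four rows; per row the put
 * variant copies 16 bytes, the avg variant merges with what is
 * already in dst using a rounding byte average (vrhadd.u8). */
void pixels16_sketch(uint8_t *dst, const uint8_t *src,
                     ptrdiff_t stride, int h, int avg)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++)
            dst[x] = avg ? (uint8_t)((dst[x] + src[x] + 1) >> 1) : src[x];
        dst += stride;
        src += stride;
    }
}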
@@ -72,8 +70,8 @@
 .endm
 
 .macro  pixels16_x2     rnd=1, avg=0
-1:      vld1.64         {d0-d2},  [r1], r2
-        vld1.64         {d4-d6},  [r1], r2
+1:      vld1.8          {d0-d2},  [r1], r2
+        vld1.8          {d4-d6},  [r1], r2
         pld             [r1]
         pld             [r1, r2]
         subs            r3,  r3,  #2
@@ -88,20 +86,21 @@
         vrhadd.u8       q2,  q2,  q3
         sub             r0,  r0,  r2
   .endif
-        vst1.64         {q0},     [r0,:128], r2
-        vst1.64         {q2},     [r0,:128], r2
+        vst1.8          {q0},     [r0,:128], r2
+        vst1.8          {q2},     [r0,:128], r2
         bne             1b
         bx              lr
 .endm
 
 .macro  pixels16_y2     rnd=1, avg=0
-        vld1.64         {q0},     [r1], r2
-        vld1.64         {q1},     [r1], r2
+        sub             r3,  r3,  #2
+        vld1.8          {q0},     [r1], r2
+        vld1.8          {q1},     [r1], r2
 1:      subs            r3,  r3,  #2
         avg             q2,  q0,  q1
-        vld1.64         {q0},     [r1], r2
+        vld1.8          {q0},     [r1], r2
         avg             q3,  q0,  q1
-        vld1.64         {q1},     [r1], r2
+        vld1.8          {q1},     [r1], r2
         pld             [r1]
         pld             [r1, r2]
   .if \avg
@@ -111,18 +110,31 @@
         vrhadd.u8       q3,  q3,  q9
         sub             r0,  r0,  r2
   .endif
-        vst1.64         {q2},     [r0,:128], r2
-        vst1.64         {q3},     [r0,:128], r2
+        vst1.8          {q2},     [r0,:128], r2
+        vst1.8          {q3},     [r0,:128], r2
         bne             1b
+
+        avg             q2,  q0,  q1
+        vld1.8          {q0},     [r1], r2
+        avg             q3,  q0,  q1
+  .if \avg
+        vld1.8          {q8},     [r0,:128], r2
+        vld1.8          {q9},     [r0,:128]
+        vrhadd.u8       q2,  q2,  q8
+        vrhadd.u8       q3,  q3,  q9
+        sub             r0,  r0,  r2
+  .endif
+        vst1.8          {q2},     [r0,:128], r2
+        vst1.8          {q3},     [r0,:128], r2
+
         bx              lr
 .endm
 
 .macro  pixels16_xy2    rnd=1, avg=0
-        vld1.64         {d0-d2},  [r1], r2
-        vld1.64         {d4-d6},  [r1], r2
-  .ifeq \rnd
-        vmov.i16        q13, #1
-  .endif
+        sub             r3,  r3,  #2
+        vld1.8          {d0-d2},  [r1], r2
+        vld1.8          {d4-d6},  [r1], r2
+NRND    vmov.i16        q13, #1
         pld             [r1]
         pld             [r1, r2]
         vext.8          q1,  q0,  q1,  #1
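
The pixels16_y2 rewrite above is the main structural change in this hunk: the row count is decremented by two before the loop and the final pair of output rows is finished after the "bne 1b", so the software-pipelined loads touch exactly h+1 source rows instead of h+2 (the old version loaded one row past the block). A scalar sketch of what the macro computes, rounding variant, with names of my choosing:

#include <stdint.h>
#include <stddef.h>

/* Vertical half-pel: every output row is the rounded average of two
 * vertically adjacent source rows, so h output rows read h+1 source
 * rows.  The NEON version peels the last two rows out of the loop so
 * its lookahead load never goes beyond source row h. */
void pixels16_y2_sketch(uint8_t *dst, const uint8_t *src,
                        ptrdiff_t stride, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++)
            dst[x] = (src[x] + src[x + stride] + 1) >> 1;   /* avg == vrhadd.u8 */
        dst += stride;
        src += stride;
    }
}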
@@ -132,38 +144,30 @@
         vaddl.u8        q9,  d4,  d6
         vaddl.u8        q11, d5,  d7
 1:      subs            r3,  r3,  #2
-        vld1.64         {d0-d2},  [r1], r2
+        vld1.8          {d0-d2},  [r1], r2
         vadd.u16        q12, q8,  q9
         pld             [r1]
-  .ifeq \rnd
-        vadd.u16        q12, q12, q13
-  .endif
+NRND    vadd.u16        q12, q12, q13
         vext.8          q15, q0,  q1,  #1
         vadd.u16        q1 , q10, q11
         shrn            d28, q12, #2
-  .ifeq \rnd
-        vadd.u16        q1,  q1,  q13
-  .endif
+NRND    vadd.u16        q1,  q1,  q13
         shrn            d29, q1,  #2
   .if \avg
         vld1.8          {q8},     [r0,:128]
         vrhadd.u8       q14, q14, q8
   .endif
         vaddl.u8        q8,  d0,  d30
-        vld1.64         {d2-d4},  [r1], r2
+        vld1.8          {d2-d4},  [r1], r2
         vaddl.u8        q10, d1,  d31
-        vst1.64         {q14},    [r0,:128], r2
+        vst1.8          {q14},    [r0,:128], r2
         vadd.u16        q12, q8,  q9
         pld             [r1, r2]
-  .ifeq \rnd
-        vadd.u16        q12, q12, q13
-  .endif
+NRND    vadd.u16        q12, q12, q13
         vext.8          q2,  q1,  q2,  #1
         vadd.u16        q0,  q10, q11
         shrn            d30, q12, #2
-  .ifeq \rnd
-        vadd.u16        q0,  q0,  q13
-  .endif
+NRND    vadd.u16        q0,  q0,  q13
         shrn            d31, q0,  #2
   .if \avg
         vld1.8          {q9},     [r0,:128]
@@ -171,44 +175,72 @@
   .endif
         vaddl.u8        q9,  d2,  d4
         vaddl.u8        q11, d3,  d5
-        vst1.64         {q15},    [r0,:128], r2
+        vst1.8          {q15},    [r0,:128], r2
         bgt             1b
+
+        vld1.8          {d0-d2},  [r1], r2
+        vadd.u16        q12, q8,  q9
+NRND    vadd.u16        q12, q12, q13
+        vext.8          q15, q0,  q1,  #1
+        vadd.u16        q1 , q10, q11
+        shrn            d28, q12, #2
+NRND    vadd.u16        q1,  q1,  q13
+        shrn            d29, q1,  #2
+  .if \avg
+        vld1.8          {q8},     [r0,:128]
+        vrhadd.u8       q14, q14, q8
+  .endif
+        vaddl.u8        q8,  d0,  d30
+        vaddl.u8        q10, d1,  d31
+        vst1.8          {q14},    [r0,:128], r2
+        vadd.u16        q12, q8,  q9
+NRND    vadd.u16        q12, q12, q13
+        vadd.u16        q0,  q10, q11
+        shrn            d30, q12, #2
+NRND    vadd.u16        q0,  q0,  q13
+        shrn            d31, q0,  #2
+  .if \avg
+        vld1.8          {q9},     [r0,:128]
+        vrhadd.u8       q15, q15, q9
+  .endif
+        vst1.8          {q15},    [r0,:128], r2
+
         bx              lr
 .endm
 
 .macro  pixels8         rnd=1, avg=0
-1:      vld1.64         {d0},     [r1], r2
-        vld1.64         {d1},     [r1], r2
-        vld1.64         {d2},     [r1], r2
+1:      vld1.8          {d0},     [r1], r2
+        vld1.8          {d1},     [r1], r2
+        vld1.8          {d2},     [r1], r2
         pld             [r1, r2, lsl #2]
-        vld1.64         {d3},     [r1], r2
+        vld1.8          {d3},     [r1], r2
         pld             [r1]
         pld             [r1, r2]
         pld             [r1, r2, lsl #1]
   .if \avg
-        vld1.64         {d4},     [r0,:64], r2
+        vld1.8          {d4},     [r0,:64], r2
         vrhadd.u8       d0,  d0,  d4
-        vld1.64         {d5},     [r0,:64], r2
+        vld1.8          {d5},     [r0,:64], r2
         vrhadd.u8       d1,  d1,  d5
-        vld1.64         {d6},     [r0,:64], r2
+        vld1.8          {d6},     [r0,:64], r2
         vrhadd.u8       d2,  d2,  d6
-        vld1.64         {d7},     [r0,:64], r2
+        vld1.8          {d7},     [r0,:64], r2
         vrhadd.u8       d3,  d3,  d7
         sub             r0,  r0,  r2,  lsl #2
   .endif
         subs            r3,  r3,  #4
-        vst1.64         {d0},     [r0,:64], r2
-        vst1.64         {d1},     [r0,:64], r2
-        vst1.64         {d2},     [r0,:64], r2
-        vst1.64         {d3},     [r0,:64], r2
+        vst1.8          {d0},     [r0,:64], r2
+        vst1.8          {d1},     [r0,:64], r2
+        vst1.8          {d2},     [r0,:64], r2
+        vst1.8          {d3},     [r0,:64], r2
         bne             1b
         bx              lr
 .endm
 
 .macro  pixels8_x2      rnd=1, avg=0
-1:      vld1.64         {q0},     [r1], r2
+1:      vld1.8          {q0},     [r1], r2
         vext.8          d1,  d0,  d1,  #1
-        vld1.64         {q1},     [r1], r2
+        vld1.8          {q1},     [r1], r2
         vext.8          d3,  d2,  d3,  #1
         pld             [r1]
         pld             [r1, r2]
@@ -221,20 +253,21 @@
         vrhadd.u8       q0,  q0,  q2
         sub             r0,  r0,  r2
   .endif
-        vst1.64         {d0},     [r0,:64], r2
-        vst1.64         {d1},     [r0,:64], r2
+        vst1.8          {d0},     [r0,:64], r2
+        vst1.8          {d1},     [r0,:64], r2
         bne             1b
         bx              lr
 .endm
 
 .macro  pixels8_y2      rnd=1, avg=0
-        vld1.64         {d0},     [r1], r2
-        vld1.64         {d1},     [r1], r2
+        sub             r3,  r3,  #2
+        vld1.8          {d0},     [r1], r2
+        vld1.8          {d1},     [r1], r2
 1:      subs            r3,  r3,  #2
         avg             d4,  d0,  d1
-        vld1.64         {d0},     [r1], r2
+        vld1.8          {d0},     [r1], r2
         avg             d5,  d0,  d1
-        vld1.64         {d1},     [r1], r2
+        vld1.8          {d1},     [r1], r2
         pld             [r1]
         pld             [r1, r2]
   .if \avg
@@ -243,18 +276,30 @@
         vrhadd.u8       q2,  q2,  q1
         sub             r0,  r0,  r2
   .endif
-        vst1.64         {d4},     [r0,:64], r2
-        vst1.64         {d5},     [r0,:64], r2
+        vst1.8          {d4},     [r0,:64], r2
+        vst1.8          {d5},     [r0,:64], r2
         bne             1b
+
+        avg             d4,  d0,  d1
+        vld1.8          {d0},     [r1], r2
+        avg             d5,  d0,  d1
+  .if \avg
+        vld1.8          {d2},     [r0,:64], r2
+        vld1.8          {d3},     [r0,:64]
+        vrhadd.u8       q2,  q2,  q1
+        sub             r0,  r0,  r2
+  .endif
+        vst1.8          {d4},     [r0,:64], r2
+        vst1.8          {d5},     [r0,:64], r2
+
         bx              lr
 .endm
 
 .macro  pixels8_xy2     rnd=1, avg=0
-        vld1.64         {q0},     [r1], r2
-        vld1.64         {q1},     [r1], r2
-  .ifeq \rnd
-        vmov.i16        q11, #1
-  .endif
+        sub             r3,  r3,  #2
+        vld1.8          {q0},     [r1], r2
+        vld1.8          {q1},     [r1], r2
+NRND    vmov.i16        q11, #1
         pld             [r1]
         pld             [r1, r2]
         vext.8          d4,  d0,  d1,  #1
@@ -262,26 +307,22 @@
         vaddl.u8        q8,  d0,  d4
         vaddl.u8        q9,  d2,  d6
 1:      subs            r3,  r3,  #2
-        vld1.64         {q0},     [r1], r2
+        vld1.8          {q0},     [r1], r2
         pld             [r1]
         vadd.u16        q10, q8,  q9
         vext.8          d4,  d0,  d1,  #1
-  .ifeq \rnd
-        vadd.u16        q10, q10, q11
-  .endif
+NRND    vadd.u16        q10, q10, q11
         vaddl.u8        q8,  d0,  d4
         shrn            d5,  q10, #2
-        vld1.64         {q1},     [r1], r2
+        vld1.8          {q1},     [r1], r2
         vadd.u16        q10, q8,  q9
         pld             [r1, r2]
   .if \avg
         vld1.8          {d7},     [r0,:64]
         vrhadd.u8       d5,  d5,  d7
   .endif
-  .ifeq \rnd
-        vadd.u16        q10, q10, q11
-  .endif
-        vst1.64         {d5},     [r0,:64], r2
+NRND    vadd.u16        q10, q10, q11
+        vst1.8          {d5},     [r0,:64], r2
         shrn            d7,  q10, #2
   .if \avg
         vld1.8          {d5},     [r0,:64]
@@ -289,8 +330,29 @@
   .endif
         vext.8          d6,  d2,  d3,  #1
         vaddl.u8        q9,  d2,  d6
-        vst1.64         {d7},     [r0,:64], r2
+        vst1.8          {d7},     [r0,:64], r2
         bgt             1b
+
+        vld1.8          {q0},     [r1], r2
+        vadd.u16        q10, q8,  q9
+        vext.8          d4,  d0,  d1,  #1
+NRND    vadd.u16        q10, q10, q11
+        vaddl.u8        q8,  d0,  d4
+        shrn            d5,  q10, #2
+        vadd.u16        q10, q8,  q9
+  .if \avg
+        vld1.8          {d7},     [r0,:64]
+        vrhadd.u8       d5,  d5,  d7
+  .endif
+NRND    vadd.u16        q10, q10, q11
+        vst1.8          {d5},     [r0,:64], r2
+        shrn            d7,  q10, #2
+  .if \avg
+        vld1.8          {d5},     [r0,:64]
+        vrhadd.u8       d7,  d7,  d5
+  .endif
+        vst1.8          {d7},     [r0,:64], r2
+
         bx              lr
 .endm
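
Both xy2 macros (pixels16_xy2 earlier and pixels8_xy2 here) implement the diagonal half-pel case. A scalar sketch of the rounding variant of the 8-pixel one (illustrative names, not FFmpeg's C code) is below; the NEON version additionally keeps the widened sums of horizontally adjacent bytes for the previous source row (the vaddl.u8 results) so each row of sums is computed only once:

#include <stdint.h>
#include <stddef.h>

/* Diagonal half-pel: each output byte is the rounded mean of a 2x2
 * source neighbourhood.  The no-rounding variant adds 1 instead of 2
 * before the shift, which is what the NRND-guarded instructions do. */
void pixels8_xy2_sketch(uint8_t *dst, const uint8_t *src,
                        ptrdiff_t stride, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++) {
            int sum = src[x] + src[x + 1]
                    + src[x + stride] + src[x + stride + 1];
            dst[x] = (sum + 2) >> 2;
        }
        dst += stride;
        src += stride;
    }
}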

@@ -302,6 +364,8 @@
     .macro shrn rd, rn, rm
         vrshrn.u16      \rd, \rn, \rm
     .endm
+    .macro NRND insn:vararg
+    .endm
   .else
     .macro avg  rd, rn, rm
         vhadd.u8        \rd, \rn, \rm
@@ -309,12 +373,16 @@
     .macro shrn rd, rn, rm
         vshrn.u16       \rd, \rn, \rm
     .endm
+    .macro NRND insn:vararg
+        \insn
+    .endm
   .endif
 function ff_\pfx\name\suf\()_neon, export=1
         \name           \rnd, \avg
 endfunc
         .purgem         avg
         .purgem         shrn
+        .purgem         NRND
 .endm
 
 .macro  pixfunc2        pfx, name, avg=0
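
The avg/shrn/NRND helper macros bind the same macro bodies to both the rounding and the "no_rnd" entry points: in the rounding branch avg is vrhadd.u8, shrn is vrshrn.u16 and the new NRND discards its argument; in the .else branch avg is vhadd.u8, shrn is vshrn.u16 and NRND emits its argument, i.e. the instructions that add the #1 bias register. Per pixel that amounts to the following (a sketch for illustration, not FFmpeg's C code):

#include <stdint.h>

/* Two-tap (x2/y2) average selected by the avg macro. */
static inline uint8_t avg2_rnd(uint8_t a, uint8_t b)    { return (a + b + 1) >> 1; } /* vrhadd.u8 */
static inline uint8_t avg2_no_rnd(uint8_t a, uint8_t b) { return (a + b) >> 1; }     /* vhadd.u8  */

/* Four-tap (xy2) narrowing selected by shrn, with NRND supplying the
 * extra +1 only in the no-rounding build. */
static inline uint8_t avg4_rnd(unsigned sum4)    { return (sum4 + 2) >> 2; } /* vrshrn.u16 #2    */
static inline uint8_t avg4_no_rnd(unsigned sum4) { return (sum4 + 1) >> 2; } /* NRND add + vshrn */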
@@ -359,147 +427,108 @@
         pixfunc2        avg_, pixels8_xy2, avg=1
 
 function ff_put_pixels_clamped_neon, export=1
-        vld1.64         {d16-d19}, [r0,:128]!
+        vld1.16         {d16-d19}, [r0,:128]!
         vqmovun.s16     d0, q8
-        vld1.64         {d20-d23}, [r0,:128]!
+        vld1.16         {d20-d23}, [r0,:128]!
         vqmovun.s16     d1, q9
-        vld1.64         {d24-d27}, [r0,:128]!
+        vld1.16         {d24-d27}, [r0,:128]!
         vqmovun.s16     d2, q10
-        vld1.64         {d28-d31}, [r0,:128]!
+        vld1.16         {d28-d31}, [r0,:128]!
         vqmovun.s16     d3, q11
-        vst1.64         {d0},      [r1,:64], r2
+        vst1.8          {d0},      [r1,:64], r2
         vqmovun.s16     d4, q12
-        vst1.64         {d1},      [r1,:64], r2
+        vst1.8          {d1},      [r1,:64], r2
         vqmovun.s16     d5, q13
-        vst1.64         {d2},      [r1,:64], r2
+        vst1.8          {d2},      [r1,:64], r2
         vqmovun.s16     d6, q14
-        vst1.64         {d3},      [r1,:64], r2
+        vst1.8          {d3},      [r1,:64], r2
         vqmovun.s16     d7, q15
-        vst1.64         {d4},      [r1,:64], r2
-        vst1.64         {d5},      [r1,:64], r2
-        vst1.64         {d6},      [r1,:64], r2
-        vst1.64         {d7},      [r1,:64], r2
+        vst1.8          {d4},      [r1,:64], r2
+        vst1.8          {d5},      [r1,:64], r2
+        vst1.8          {d6},      [r1,:64], r2
+        vst1.8          {d7},      [r1,:64], r2
         bx              lr
 endfunc
 
 function ff_put_signed_pixels_clamped_neon, export=1
         vmov.u8         d31, #128
-        vld1.64         {d16-d17}, [r0,:128]!
+        vld1.16         {d16-d17}, [r0,:128]!
         vqmovn.s16      d0, q8
-        vld1.64         {d18-d19}, [r0,:128]!
+        vld1.16         {d18-d19}, [r0,:128]!
         vqmovn.s16      d1, q9
-        vld1.64         {d16-d17}, [r0,:128]!
+        vld1.16         {d16-d17}, [r0,:128]!
         vqmovn.s16      d2, q8
-        vld1.64         {d18-d19}, [r0,:128]!
+        vld1.16         {d18-d19}, [r0,:128]!
         vadd.u8         d0, d0, d31
-        vld1.64         {d20-d21}, [r0,:128]!
+        vld1.16         {d20-d21}, [r0,:128]!
         vadd.u8         d1, d1, d31
-        vld1.64         {d22-d23}, [r0,:128]!
+        vld1.16         {d22-d23}, [r0,:128]!
         vadd.u8         d2, d2, d31
-        vst1.64         {d0},      [r1,:64], r2
+        vst1.8          {d0},      [r1,:64], r2
         vqmovn.s16      d3, q9
-        vst1.64         {d1},      [r1,:64], r2
+        vst1.8          {d1},      [r1,:64], r2
         vqmovn.s16      d4, q10
-        vst1.64         {d2},      [r1,:64], r2
+        vst1.8          {d2},      [r1,:64], r2
         vqmovn.s16      d5, q11
-        vld1.64         {d24-d25}, [r0,:128]!
+        vld1.16         {d24-d25}, [r0,:128]!
         vadd.u8         d3, d3, d31
-        vld1.64         {d26-d27}, [r0,:128]!
+        vld1.16         {d26-d27}, [r0,:128]!
         vadd.u8         d4, d4, d31
         vadd.u8         d5, d5, d31
-        vst1.64         {d3},      [r1,:64], r2
+        vst1.8          {d3},      [r1,:64], r2
         vqmovn.s16      d6, q12
-        vst1.64         {d4},      [r1,:64], r2
+        vst1.8          {d4},      [r1,:64], r2
         vqmovn.s16      d7, q13
-        vst1.64         {d5},      [r1,:64], r2
+        vst1.8          {d5},      [r1,:64], r2
         vadd.u8         d6, d6, d31
         vadd.u8         d7, d7, d31
-        vst1.64         {d6},      [r1,:64], r2
-        vst1.64         {d7},      [r1,:64], r2
+        vst1.8          {d6},      [r1,:64], r2
+        vst1.8          {d7},      [r1,:64], r2
         bx              lr
 endfunc
 
 function ff_add_pixels_clamped_neon, export=1
         mov             r3, r1
-        vld1.64         {d16},   [r1,:64], r2
-        vld1.64         {d0-d1}, [r0,:128]!
-        vaddw.u8        q0, q0, d16
-        vld1.64         {d17},   [r1,:64], r2
-        vld1.64         {d2-d3}, [r0,:128]!
-        vqmovun.s16     d0, q0
-        vld1.64         {d18},   [r1,:64], r2
-        vaddw.u8        q1, q1, d17
-        vld1.64         {d4-d5}, [r0,:128]!
-        vaddw.u8        q2, q2, d18
-        vst1.64         {d0},    [r3,:64], r2
-        vqmovun.s16     d2, q1
-        vld1.64         {d19},   [r1,:64], r2
-        vld1.64         {d6-d7}, [r0,:128]!
-        vaddw.u8        q3, q3, d19
-        vqmovun.s16     d4, q2
-        vst1.64         {d2},    [r3,:64], r2
-        vld1.64         {d16},   [r1,:64], r2
-        vqmovun.s16     d6, q3
-        vld1.64         {d0-d1}, [r0,:128]!
-        vaddw.u8        q0, q0, d16
-        vst1.64         {d4},    [r3,:64], r2
-        vld1.64         {d17},   [r1,:64], r2
-        vld1.64         {d2-d3}, [r0,:128]!
-        vaddw.u8        q1, q1, d17
-        vst1.64         {d6},    [r3,:64], r2
-        vqmovun.s16     d0, q0
-        vld1.64         {d18},   [r1,:64], r2
-        vld1.64         {d4-d5}, [r0,:128]!
-        vaddw.u8        q2, q2, d18
-        vst1.64         {d0},    [r3,:64], r2
-        vqmovun.s16     d2, q1
-        vld1.64         {d19},   [r1,:64], r2
-        vqmovun.s16     d4, q2
-        vld1.64         {d6-d7}, [r0,:128]!
-        vaddw.u8        q3, q3, d19
-        vst1.64         {d2},    [r3,:64], r2
-        vqmovun.s16     d6, q3
-        vst1.64         {d4},    [r3,:64], r2
-        vst1.64         {d6},    [r3,:64], r2
-        bx              lr
-endfunc
-
-function ff_vector_fmul_neon, export=1
-        subs            r3,  r3,  #8
-        vld1.64         {d0-d3},  [r1,:128]!
-        vld1.64         {d4-d7},  [r2,:128]!
-        vmul.f32        q8,  q0,  q2
-        vmul.f32        q9,  q1,  q3
-        beq             3f
-        bics            ip,  r3,  #15
-        beq             2f
-1:      subs            ip,  ip,  #16
-        vld1.64         {d0-d1},  [r1,:128]!
-        vld1.64         {d4-d5},  [r2,:128]!
-        vmul.f32        q10, q0,  q2
-        vld1.64         {d2-d3},  [r1,:128]!
-        vld1.64         {d6-d7},  [r2,:128]!
-        vmul.f32        q11, q1,  q3
-        vst1.64         {d16-d19},[r0,:128]!
-        vld1.64         {d0-d1},  [r1,:128]!
-        vld1.64         {d4-d5},  [r2,:128]!
-        vmul.f32        q8,  q0,  q2
-        vld1.64         {d2-d3},  [r1,:128]!
-        vld1.64         {d6-d7},  [r2,:128]!
-        vmul.f32        q9,  q1,  q3
-        vst1.64         {d20-d23},[r0,:128]!
-        bne             1b
-        ands            r3,  r3,  #15
-        beq             3f
-2:      vld1.64         {d0-d1},  [r1,:128]!
-        vld1.64         {d4-d5},  [r2,:128]!
-        vst1.64         {d16-d17},[r0,:128]!
-        vmul.f32        q8,  q0,  q2
-        vld1.64         {d2-d3},  [r1,:128]!
-        vld1.64         {d6-d7},  [r2,:128]!
-        vst1.64         {d18-d19},[r0,:128]!
-        vmul.f32        q9,  q1,  q3
-3:      vst1.64         {d16-d19},[r0,:128]!
+        vld1.8          {d16},   [r1,:64], r2
+        vld1.16         {d0-d1}, [r0,:128]!
+        vaddw.u8        q0, q0, d16
+        vld1.8          {d17},   [r1,:64], r2
+        vld1.16         {d2-d3}, [r0,:128]!
+        vqmovun.s16     d0, q0
+        vld1.8          {d18},   [r1,:64], r2
+        vaddw.u8        q1, q1, d17
+        vld1.16         {d4-d5}, [r0,:128]!
+        vaddw.u8        q2, q2, d18
+        vst1.8          {d0},    [r3,:64], r2
+        vqmovun.s16     d2, q1
+        vld1.8          {d19},   [r1,:64], r2
+        vld1.16         {d6-d7}, [r0,:128]!
+        vaddw.u8        q3, q3, d19
+        vqmovun.s16     d4, q2
+        vst1.8          {d2},    [r3,:64], r2
+        vld1.8          {d16},   [r1,:64], r2
+        vqmovun.s16     d6, q3
+        vld1.16         {d0-d1}, [r0,:128]!
+        vaddw.u8        q0, q0, d16
+        vst1.8          {d4},    [r3,:64], r2
+        vld1.8          {d17},   [r1,:64], r2
+        vld1.16         {d2-d3}, [r0,:128]!
+        vaddw.u8        q1, q1, d17
+        vst1.8          {d6},    [r3,:64], r2
+        vqmovun.s16     d0, q0
+        vld1.8          {d18},   [r1,:64], r2
+        vld1.16         {d4-d5}, [r0,:128]!
+        vaddw.u8        q2, q2, d18
+        vst1.8          {d0},    [r3,:64], r2
+        vqmovun.s16     d2, q1
+        vld1.8          {d19},   [r1,:64], r2
+        vqmovun.s16     d4, q2
+        vld1.16         {d6-d7}, [r0,:128]!
+        vaddw.u8        q3, q3, d19
+        vst1.8          {d2},    [r3,:64], r2
+        vqmovun.s16     d6, q3
+        vst1.8          {d4},    [r3,:64], r2
+        vst1.8          {d6},    [r3,:64], r2
         bx              lr
 endfunc
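
The *_pixels_clamped functions above move an 8x8 block of int16 IDCT output into 8-bit pixels; the diff only tightens the load/store element sizes (vld1.16 for the coefficient block, vld1.8/vst1.8 for pixel rows). Roughly, in C (parameter names are mine, and the signed variant, which narrows to signed bytes and then adds 128, is omitted):

#include <stdint.h>
#include <stddef.h>

static uint8_t clamp_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; } /* vqmovun.s16 */

void put_pixels_clamped_sketch(const int16_t *block, uint8_t *pixels, ptrdiff_t stride)
{
    for (int y = 0; y < 8; y++, block += 8, pixels += stride)
        for (int x = 0; x < 8; x++)
            pixels[x] = clamp_u8(block[x]);
}

void add_pixels_clamped_sketch(const int16_t *block, uint8_t *pixels, ptrdiff_t stride)
{
    for (int y = 0; y < 8; y++, block += 8, pixels += stride)
        for (int x = 0; x < 8; x++)
            pixels[x] = clamp_u8(pixels[x] + block[x]);   /* vaddw.u8 + vqmovun.s16 */
}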

@@ -512,10 +541,10 @@
         add             r4,  r3,  r5, lsl #3
         add             ip,  r0,  r5, lsl #3
         mov             r5,  #-16
-        vld1.64         {d0,d1},  [r1,:128]!
-        vld1.64         {d2,d3},  [r2,:128], r5
-        vld1.64         {d4,d5},  [r3,:128]!
-        vld1.64         {d6,d7},  [r4,:128], r5
+        vld1.32         {d0,d1},  [r1,:128]!
+        vld1.32         {d2,d3},  [r2,:128], r5
+        vld1.32         {d4,d5},  [r3,:128]!
+        vld1.32         {d6,d7},  [r4,:128], r5
 1:      subs            lr,  lr,  #4
         vmul.f32        d22, d0,  d4
         vrev64.32       q3,  q3
@@ -525,19 +554,19 @@
         vmul.f32        d21, d1,  d6
         beq             2f
         vmla.f32        d22, d3,  d7
-        vld1.64         {d0,d1},  [r1,:128]!
+        vld1.32         {d0,d1},  [r1,:128]!
         vmla.f32        d23, d2,  d6
-        vld1.64         {d18,d19},[r2,:128], r5
+        vld1.32         {d18,d19},[r2,:128], r5
         vmls.f32        d20, d3,  d4
-        vld1.64         {d24,d25},[r3,:128]!
+        vld1.32         {d24,d25},[r3,:128]!
         vmls.f32        d21, d2,  d5
-        vld1.64         {d6,d7},  [r4,:128], r5
+        vld1.32         {d6,d7},  [r4,:128], r5
         vmov            q1,  q9
         vrev64.32       q11, q11
         vmov            q2,  q12
         vswp            d22, d23
-        vst1.64         {d20,d21},[r0,:128]!
-        vst1.64         {d22,d23},[ip,:128], r5
+        vst1.32         {d20,d21},[r0,:128]!
+        vst1.32         {d22,d23},[ip,:128], r5
         b               1b
 2:      vmla.f32        d22, d3,  d7
         vmla.f32        d23, d2,  d6
@@ -545,8 +574,8 @@
         vmls.f32        d21, d2,  d5
         vrev64.32       q11, q11
         vswp            d22, d23
-        vst1.64         {d20,d21},[r0,:128]!
-        vst1.64         {d22,d23},[ip,:128], r5
+        vst1.32         {d20,d21},[r0,:128]!
+        vst1.32         {d22,d23},[ip,:128], r5
         pop             {r4,r5,pc}
 endfunc
 
@@ -613 +642 @@
 endfunc
 #endif
 
-function ff_vector_fmul_scalar_neon, export=1
-VFP     len .req r2
-NOVFP   len .req r3
-VFP     vdup.32         q8,  d0[0]
-NOVFP   vdup.32         q8,  r2
-        bics            r12, len, #15
-        beq             3f
-        vld1.32         {q0},[r1,:128]!
-        vld1.32         {q1},[r1,:128]!
-1:      vmul.f32        q0,  q0,  q8
-        vld1.32         {q2},[r1,:128]!
-        vmul.f32        q1,  q1,  q8
-        vld1.32         {q3},[r1,:128]!
-        vmul.f32        q2,  q2,  q8
-        vst1.32         {q0},[r0,:128]!
-        vmul.f32        q3,  q3,  q8
-        vst1.32         {q1},[r0,:128]!
-        subs            r12, r12, #16
-        beq             2f
-        vld1.32         {q0},[r1,:128]!
-        vst1.32         {q2},[r0,:128]!
-        vld1.32         {q1},[r1,:128]!
-        vst1.32         {q3},[r0,:128]!
-        b               1b
-2:      vst1.32         {q2},[r0,:128]!
-        vst1.32         {q3},[r0,:128]!
-        ands            len, len, #15
-        it              eq
-        bxeq            lr
-3:      vld1.32         {q0},[r1,:128]!
-        vmul.f32        q0,  q0,  q8
-        vst1.32         {q0},[r0,:128]!
-        subs            len, len, #4
-        bgt             3b
-        bx              lr
-        .unreq          len
-endfunc
-
-function ff_vector_fmac_scalar_neon, export=1
-VFP     len .req r2
-VFP     acc .req r3
-NOVFP   len .req r3
-NOVFP   acc .req r2
-VFP     vdup.32         q15, d0[0]
-NOVFP   vdup.32         q15, r2
-        bics            r12, len, #15
-        mov             acc, r0
-        beq             3f
-        vld1.32         {q0},     [r1,:128]!
-        vld1.32         {q8},     [acc,:128]!
-        vld1.32         {q1},     [r1,:128]!
-        vld1.32         {q9},     [acc,:128]!
-1:      vmla.f32        q8,  q0,  q15
-        vld1.32         {q2},     [r1,:128]!
-        vld1.32         {q10},    [acc,:128]!
-        vmla.f32        q9,  q1,  q15
-        vld1.32         {q3},     [r1,:128]!
-        vld1.32         {q11},    [acc,:128]!
-        vmla.f32        q10, q2,  q15
-        vst1.32         {q8},     [r0,:128]!
-        vmla.f32        q11, q3,  q15
-        vst1.32         {q9},     [r0,:128]!
-        subs            r12, r12, #16
-        beq             2f
-        vld1.32         {q0},     [r1,:128]!
-        vld1.32         {q8},     [acc,:128]!
-        vst1.32         {q10},    [r0,:128]!
-        vld1.32         {q1},     [r1,:128]!
-        vld1.32         {q9},     [acc,:128]!
-        vst1.32         {q11},    [r0,:128]!
-        b               1b
-2:      vst1.32         {q10},    [r0,:128]!
-        vst1.32         {q11},    [r0,:128]!
-        ands            len, len, #15
-        it              eq
-        bxeq            lr
-3:      vld1.32         {q0},     [r1,:128]!
-        vld1.32         {q8},     [acc,:128]!
-        vmla.f32        q8,  q0,  q15
-        vst1.32         {q8},     [r0,:128]!
-        subs            len, len, #4
-        bgt             3b
-        bx              lr
-        .unreq          len
-endfunc
-
 function ff_butterflies_float_neon, export=1
 1:      vld1.32         {q0},[r0,:128]
         vld1.32         {q1},[r1,:128]
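
The listing stops inside ff_butterflies_float_neon. For orientation, the scalar butterfly it vectorises is, to the best of my understanding (an assumption on my part, since only the first two loads are visible here; the name below is illustrative):

/* Each pair (a, b) is replaced by (a + b, a - b). */
static void butterflies_float_sketch(float *v1, float *v2, int len)
{
    for (int i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i]  += v2[i];
        v2[i]   = t;
    }
}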