292
322
vstr d8, [a1, #0 * 2*4]
293
323
vstr d9, [a1, #4 * 2*4]
328
@ ff_fft16_vfp: exported (export=1) entry point that calls the file-local
@ routine .Lfft16_internal_vfp.
@ NOTE(review): only two instructions of this function are visible in this
@ excerpt. Whatever consumes a3, preserves caller state and returns is not
@ shown here, and the bare numeric lines look like line-number residue from
@ an extraction rather than assembler statements -- confirm against the
@ full file.
function ff_fft16_vfp, export=1
329
@ a3 = 0x03030000; the inline comment documents what this value selects.
@ Presumably it is written to the VFP control register by an instruction
@ that is not visible here -- TODO confirm.
ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
334
@ bl overwrites lr, so the return address of this function has to be kept
@ somewhere else by code that is not visible here -- TODO confirm.
bl .Lfft16_internal_vfp
340
@ pass n, z0, z1, z2, z3
@ Macro expanded inline by def_fft. Judging by the TRANSFORM comments below it
@ is presumably the combining (butterfly) pass of the FFT -- TODO confirm.
@   n            - passed by def_fft as (n/4/2); no visible line uses it
@   z0,z1,z2,z3  - registers used as base addresses for the four groups of
@                  loads and stores below (def_fft may pass the same
@                  register for more than one of them)
@ The symbols o1, o2 and o3 appear in address offsets but are not defined in
@ the visible lines; they must be set before the macro is expanded.
@ v5 is read with post-increment. s0-s23 are written by the visible lines.
@ NOTE(review): this excerpt is sparse. The bare numeric lines look like
@ line-number residue, and the load of s2, the loop counter, the branch back
@ to label 1 and the closing .endm are not visible -- confirm against the
@ full file before relying on these notes.
.macro pass n, z0, z1, z2, z3
342
@ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
343
@ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
344
@ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
345
@ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
346
@ Preload the 8-byte entries at index o2+1 (from z2) and o3+1 (from z3)
@ into d8 and d9.
vldr d8, [\z2, #8*(o2+1)] @ s16,s17
348
vldr d9, [\z3, #8*(o3+1)] @ s18,s19
349
@ Load two single-precision words from [v5] and advance v5 by 8 bytes.
vldmia v5!, {s0,s1} @ s0 is unused
350
vldr s7, [\z2, #8*o2] @ t1
351
@ NOTE(review): s2 is read here but no visible line loads it; the load is
@ presumably among the lines missing from this excerpt.
vmul.f s20, s16, s2 @ vector * scalar
352
vldr s0, [\z3, #8*o3] @ t5
353
vldr s6, [\z2, #8*o2+4] @ t2
354
vldr s3, [\z3, #8*o3+4] @ t6
355
vmul.f s16, s16, s1 @ vector * scalar
357
@ Local label 1: z0 is advanced by 8*2 = 16 bytes here. The branch that
@ jumps back to this label is not visible in this excerpt.
1: add \z0, \z0, #8*2
367
@ up to 2 stalls (VFP vector issuing / waiting for s0)
368
@ depending upon whether this is the first iteration and
369
@ how many add instructions are inserted above
370
vadd.f s4, s0, s7 @ t5
371
vadd.f s5, s6, s3 @ t6
372
vsub.f s6, s6, s3 @ t4
373
vsub.f s7, s0, s7 @ t3
374
@ z0 was already advanced by 8*2 at label 1, so the -8*2 term addresses the
@ entry z0 pointed at before that add. The same -8*2 correction is applied
@ to z1, z2 and z3 below; their increments are not visible here.
vldr d6, [\z0, #8*0-8*2] @ s12,s13
375
vadd.f s0, s16, s21 @ t1
376
vldr d7, [\z1, #8*o1-8*2] @ s14,s15
377
vsub.f s1, s18, s23 @ t5
378
@ Marked as a vector operation by the inline comment: d4 and d5 (s8-s11)
@ are both stored below after this single instruction, which presumably
@ relies on the vector length configured by the caller -- TODO confirm.
vadd.f s8, s4, s12 @ vector + vector
379
@ stall (VFP vector issuing)
380
@ stall (VFP vector issuing)
381
@ stall (VFP vector issuing)
386
vsub.f s2, s17, s20 @ t2
387
vadd.f s3, s19, s22 @ t6
388
vstr d4, [\z0, #8*0-8*2] @ s8,s9
389
vstr d5, [\z1, #8*o1-8*2] @ s10,s11
390
@ stall (waiting for s5)
391
vstr d2, [\z2, #8*o2-8*2] @ s4,s5
392
vadd.f s4, s1, s0 @ t5
393
vstr d3, [\z3, #8*o3-8*2] @ s6,s7
394
vsub.f s7, s1, s0 @ t3
395
vadd.f s5, s2, s3 @ t6
396
vsub.f s6, s2, s3 @ t4
397
vldr d6, [\z0, #8*1-8*2] @ s12,s13
398
vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15
399
@ The loads from z2 and z3 below carry no -8*2 correction, so they read the
@ entries at the already advanced position, i.e. data for the next group.
vldr d4, [\z2, #8*o2] @ s8,s9
401
vldr d5, [\z3, #8*o3] @ s10,s11
402
vadd.f s20, s4, s12 @ vector + vector
404
vldr d8, [\z2, #8*(o2+1)] @ s16,s17
405
@ stall (VFP vector issuing)
410
vmul.f s12, s8, s3 @ vector * scalar
411
vstr d10, [\z0, #8*1-8*2] @ s20,s21
412
vldr d9, [\z3, #8*(o3+1)] @ s18,s19
413
vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23
414
vmul.f s8, s8, s0 @ vector * scalar
415
vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5
416
@ stall (waiting for s7)
417
vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7
418
vmul.f s20, s16, s2 @ vector * scalar
419
@ stall (VFP vector issuing)
420
@ stall (VFP vector issuing)
421
@ stall (VFP vector issuing)
422
vadd.f s7, s8, s13 @ t1
423
vsub.f s6, s9, s12 @ t2
424
vsub.f s0, s10, s15 @ t5
425
vadd.f s3, s11, s14 @ t6
426
vmul.f s16, s16, s1 @ vector * scalar
429
@ What remains is identical to the first two indentations of
430
@ the above, but without the increment of z
431
@ No add to z0 precedes this part, so the offsets below carry no -8*2
@ correction.
vadd.f s4, s0, s7 @ t5
432
vadd.f s5, s6, s3 @ t6
433
vsub.f s6, s6, s3 @ t4
434
vsub.f s7, s0, s7 @ t3
435
vldr d6, [\z0, #8*0] @ s12,s13
436
vadd.f s0, s16, s21 @ t1
437
vldr d7, [\z1, #8*o1] @ s14,s15
438
vsub.f s1, s18, s23 @ t5
439
vadd.f s8, s4, s12 @ vector + vector
444
vsub.f s2, s17, s20 @ t2
445
vadd.f s3, s19, s22 @ t6
446
vstr d4, [\z0, #8*0] @ s8,s9
447
vstr d5, [\z1, #8*o1] @ s10,s11
448
vstr d2, [\z2, #8*o2] @ s4,s5
449
vadd.f s4, s1, s0 @ t5
450
vstr d3, [\z3, #8*o3] @ s6,s7
451
vsub.f s7, s1, s0 @ t3
452
vadd.f s5, s2, s3 @ t6
453
vsub.f s6, s2, s3 @ t4
454
vldr d6, [\z0, #8*1] @ s12,s13
455
vldr d7, [\z1, #8*(o1+1)] @ s14,s15
456
vadd.f s20, s4, s12 @ vector + vector
461
vstr d10, [\z0, #8*1] @ s20,s21
462
vstr d11, [\z1, #8*(o1+1)] @ s22,s23
463
vstr d2, [\z2, #8*(o2+1)] @ s4,s5
464
vstr d3, [\z3, #8*(o3+1)] @ s6,s7
@ NOTE(review): the closing .endm of this macro is not visible in this
@ excerpt.
467
@ def_fft n, n2, n4
@ Emits two routines per expansion: the file-local .Lfft<n>_internal_vfp and
@ the wrapper fft<n>_vfp, which calls it.
@   n   - size used in the emitted label names, in the ff_cos_<n> symbol and
@         in the address offsets below
@   n2  - size suffix of the routine called first
@   n4  - size suffix of the routine called twice afterwards
@ NOTE(review): this excerpt is sparse. The bare numeric lines look like
@ line-number residue, and the conditional directives, the set-up of v1 and
@ of o1/o2/o3, the pop/return sequences, endfunc and .endm are not visible --
@ confirm against the full file before relying on these notes.
.macro def_fft n, n2, n4
468
function .Lfft\n\()_internal_vfp
472
@ lr is saved because the bl instructions below overwrite it.
@ NOTE(review): v3 and v4 are written further down but are not in this push
@ list; the variant that saves them is presumably among the missing lines.
push {v1-v2,v5-v6,lr}
477
@ First sub-transform. No visible line changes a1 before this call.
bl .Lfft\n2\()_internal_vfp
478
@ v1 serves as the base address from here on; the instruction that sets it
@ is not visible in this excerpt.
@ a1 = v1 + 8*(n/4)*2 for the second sub-transform.
add a1, v1, #8*(\n/4)*2
479
bl .Lfft\n4\()_internal_vfp
480
@ movrelx is defined outside this excerpt. Presumably it loads the address
@ of ff_cos_<n> into v5, using a1 as scratch (a1 is reloaded right after) --
@ TODO confirm. The pass macro reads from v5.
movrelx v5, X(ff_cos_\n), a1
481
@ a1 = v1 + 8*(n/4)*3 for the third sub-transform.
add a1, v1, #8*(\n/4)*3
482
bl .Lfft\n4\()_internal_vfp
487
@ Three invocations of pass follow, each with a different register
@ assignment. Presumably conditional directives that are not visible here
@ select exactly one of them per expansion -- TODO confirm.
@ Variant with four distinct base registers:
add v2, v1, #8*2*(\n/4/2)
488
add v3, v1, #8*4*(\n/4/2)
489
add v4, v1, #8*6*(\n/4/2)
490
pass (\n/4/2), v1, v2, v3, v4
496
@ Variant with two base registers (z0 = z1 = v1, z2 = z3 = v2):
add v2, v1, #8*4*(\n/4/2)
497
pass (\n/4/2), v1, v1, v2, v2
503
@ Variant with a single base register for all four operands:
pass (\n/4/2), v1, v1, v1, v1
508
@ Wrapper that calls .Lfft<n>_internal_vfp. Only two of its instructions
@ are visible in this excerpt.
function fft\n\()_vfp
509
ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */
514
bl .Lfft\n\()_internal_vfp
527
@ Instantiate def_fft for the power-of-two sizes 512 through 65536. Each
@ expansion is given n, n/2 and n/4, so it refers to the routines emitted for
@ the two next smaller sizes.
@ NOTE(review): the first expansion refers to .Lfft256_internal_vfp and
@ .Lfft128_internal_vfp, which are not defined in the visible lines; the
@ bare numeric lines look like line-number residue -- confirm against the
@ full file.
def_fft 512, 256, 128
528
def_fft 1024, 512, 256
529
def_fft 2048, 1024, 512
530
def_fft 4096, 2048, 1024
531
def_fft 8192, 4096, 2048
532
def_fft 16384, 8192, 4096
533
def_fft 32768, 16384, 8192
534
def_fft 65536, 32768, 16384