~ubuntu-branches/ubuntu/lucid/ffmpeg/lucid-updates

« back to all changes in this revision

Viewing changes to libavcodec/arm/h264dsp_neon.S

  • Committer: Bazaar Package Importer
  • Author(s): Reinhard Tartler
  • Date: 2009-02-05 21:45:05 UTC
  • mfrom: (1.1.7 upstream)
  • Revision ID: james.westby@ubuntu.com-20090205214505-fvn0jkiv3lrkaaq4
Tags: 3:0.svn20090204-2ubuntu1+unstripped1
rebuild using a clean, uncrippled ffmpeg tarball

Show diffs side-by-side

added added

removed removed

Lines of Context:
1368
1368
        sub             r1,  r1,  #1
1369
1369
        b               put_h264_qpel16_mc11
1370
1370
        .endfunc
 
1371
 
 
1372
@ Biweighted prediction
 
1373
 
 
1374
        @ biweight_16 macs, macd
        @ Loop body for 16-byte-wide rows, two rows per pass, combining data
        @ read through r0 and r1 and storing the result through r6.
        @   macd  multiply-accumulate mnemonic applied to the r0 rows with d0
        @   macs  multiply-accumulate mnemonic applied to the r1 rows with d1
        @ Register inputs read by this macro:
        @   r0, r1  row pointers, both post-incremented by r2 after each load
        @   r2      row stride in bytes
        @   r4, r5  multipliers; only the low byte of each is used (vdup.8)
        @   r6      store pointer, post-incremented by r2 after each store
        @   ip      rows remaining; decremented by 2, loop ends at zero
        @   q8      initial value of every 16-bit accumulator lane
        @   q9      per-lane shift count handed to vshl.s16
        @ The macro finishes by popping r4-r6 and pc, i.e. it returns instead
        @ of running into whatever is assembled after it.
        .macro  biweight_16 macs, macd
 
1375
        @ Broadcast the low byte of each multiplier to all eight byte lanes.
        vdup.8          d0,  r4
 
1376
        vdup.8          d1,  r5
 
1377
        @ Seed the accumulators of the first row with the start value in q8.
        vmov            q2,  q8
 
1378
        vmov            q3,  q8
 
1379
        @ The flags set here are not touched again until the bne at the
        @ bottom of the loop.
1:      subs            ip,  ip,  #2
 
1380
        @ First row: 16 bytes from r0 (address qualified as 128-bit aligned).
        vld1.8          {d20-d21},[r0,:128], r2
 
1381
        \macd           q2,  d0,  d20
 
1382
        @ Preload hint for the address r0 now points at.
        pld             [r0]
 
1383
        \macd           q3,  d0,  d21
 
1384
        @ First row: the matching 16 bytes from r1.
        vld1.8          {d22-d23},[r1,:128], r2
 
1385
        \macs           q2,  d1,  d22
 
1386
        pld             [r1]
 
1387
        \macs           q3,  d1,  d23
 
1388
        @ Second row uses q12/q13 as accumulators, seeded from q8 as well.
        vmov            q12, q8
 
1389
        vld1.8          {d28-d29},[r0,:128], r2
 
1390
        vmov            q13, q8
 
1391
        \macd           q12, d0,  d28
 
1392
        pld             [r0]
 
1393
        \macd           q13, d0,  d29
 
1394
        vld1.8          {d30-d31},[r1,:128], r2
 
1395
        \macs           q12, d1,  d30
 
1396
        pld             [r1]
 
1397
        \macs           q13, d1,  d31
 
1398
        @ Scale the first row by the per-lane count in q9. The function in
        @ this file that expands the macro loads a negative count, which
        @ vshl treats as a right shift.
        vshl.s16        q2,  q2,  q9
 
1399
        vshl.s16        q3,  q3,  q9
 
1400
        @ Narrow signed 16-bit lanes to unsigned bytes with saturation.
        vqmovun.s16     d4,  q2
 
1401
        vqmovun.s16     d5,  q3
 
1402
        vshl.s16        q12, q12, q9
 
1403
        vshl.s16        q13, q13, q9
 
1404
        vqmovun.s16     d24, q12
 
1405
        vqmovun.s16     d25, q13
 
1406
        @ q3 is free once d5 holds the narrowed result, so re-seed it for the
        @ next pass before the store.
        vmov            q3,  q8
 
1407
        vst1.8          {d4- d5}, [r6,:128], r2
 
1408
        @ q2 (d4/d5) may only be re-seeded after the store above has read it.
        vmov            q2,  q8
 
1409
        vst1.8          {d24-d25},[r6,:128], r2
 
1410
        @ Loop while the row counter from the subs above is non-zero.
        bne             1b
 
1411
        @ Restore r4-r6 and return by loading pc from the stack.
        pop             {r4-r6, pc}
 
1412
        .endm
 
1413
 
 
1414
        @ biweight_8 macs, macd
        @ Loop body for 8-byte-wide rows, two rows per pass, combining data
        @ read through r0 and r1 and storing the result through r6.
        @   macd  multiply-accumulate mnemonic applied to the r0 rows with d0
        @   macs  multiply-accumulate mnemonic applied to the r1 rows with d1
        @ Register inputs read by this macro:
        @   r0, r1  row pointers, post-incremented by r2 after each load
        @   r2      row stride in bytes
        @   r4, r5  multipliers; only the low byte of each is used (vdup.8)
        @   r6      store pointer, post-incremented by r2 after each store
        @   ip      rows remaining; decremented by 2, loop ends at zero
        @   q8      initial value of every 16-bit accumulator lane
        @   q9      per-lane shift count handed to vshl.s16
        @ Ends with pop {r4-r6, pc}, so control does not leave the macro at
        @ its bottom.
        .macro  biweight_8 macs, macd
 
1415
        @ Broadcast the low byte of each multiplier to all eight byte lanes.
        vdup.8          d0,  r4
 
1416
        vdup.8          d1,  r5
 
1417
        @ q1 accumulates the first row, q10 the second; both start from q8.
        vmov            q1,  q8
 
1418
        vmov            q10, q8
 
1419
        @ Flags from this subs stay live until the bne at the loop bottom.
1:      subs            ip,  ip,  #2
 
1420
        @ First row: 8 bytes from r0, then the matching 8 bytes from r1.
        vld1.8          {d4},[r0,:64], r2
 
1421
        \macd           q1,  d0,  d4
 
1422
        @ Preload hint for the address r0 now points at.
        pld             [r0]
 
1423
        vld1.8          {d5},[r1,:64], r2
 
1424
        \macs           q1,  d1,  d5
 
1425
        pld             [r1]
 
1426
        @ Second row, same pattern into q10.
        vld1.8          {d6},[r0,:64], r2
 
1427
        \macd           q10, d0,  d6
 
1428
        pld             [r0]
 
1429
        vld1.8          {d7},[r1,:64], r2
 
1430
        \macs           q10, d1,  d7
 
1431
        pld             [r1]
 
1432
        @ Scale by the per-lane count in q9, then narrow to unsigned bytes
        @ with saturation.
        vshl.s16        q1,  q1,  q9
 
1433
        vqmovun.s16     d2,  q1
 
1434
        vshl.s16        q10, q10, q9
 
1435
        @ d4 is reused for the second output row; its loaded data has
        @ already been consumed above.
        vqmovun.s16     d4,  q10
 
1436
        @ Re-seed the accumulators for the next pass. q1 (d2/d3) is reset
        @ only after d2 has been stored.
        vmov            q10, q8
 
1437
        vst1.8          {d2},[r6,:64], r2
 
1438
        vmov            q1,  q8
 
1439
        vst1.8          {d4},[r6,:64], r2
 
1440
        bne             1b
 
1441
        @ Restore r4-r6 and return by loading pc from the stack.
        pop             {r4-r6, pc}
 
1442
        .endm
 
1443
 
 
1444
        @ biweight_4 macs, macd
        @ Loop body for 4-byte-wide rows. Two rows are packed into one
        @ 64-bit register, so a full pass handles four rows; a short tail at
        @ label 2 handles the case where only one pair of rows is left.
        @   macd  multiply-accumulate mnemonic applied to the r0 rows with d0
        @   macs  multiply-accumulate mnemonic applied to the r1 rows with d1
        @ Register inputs read by this macro:
        @   r0, r1  row pointers, post-incremented by r2 after each load
        @   r2      row stride in bytes
        @   r4, r5  multipliers; only the low byte of each is used (vdup.8)
        @   r6      store pointer, post-incremented by r2 after each store
        @   ip      rows remaining; decremented by 4 per pass
        @   q8      initial value of every 16-bit accumulator lane
        @   q9      per-lane shift count handed to vshl.s16
        @ Both exits pop r4-r6 and pc, so the macro never falls through.
        .macro  biweight_4 macs, macd
 
1445
        @ Broadcast the low byte of each multiplier to all eight byte lanes.
        vdup.8          d0,  r4
 
1446
        vdup.8          d1,  r5
 
1447
        @ q1 accumulates rows 0-1 of a pass, q10 rows 2-3; both start at q8.
        vmov            q1,  q8
 
1448
        vmov            q10, q8
 
1449
        @ The flags from this subs feed both the blt below and the bne at
        @ the bottom of the loop.
1:      subs            ip,  ip,  #4
 
1450
        @ Rows 0 and 1 from r0 go into the two 32-bit lanes of d4.
        vld1.32         {d4[0]},[r0,:32], r2
 
1451
        vld1.32         {d4[1]},[r0,:32], r2
 
1452
        \macd           q1,  d0,  d4
 
1453
        @ Preload hint for the address r0 now points at.
        pld             [r0]
 
1454
        @ Rows 0 and 1 from r1 go into d5.
        vld1.32         {d5[0]},[r1,:32], r2
 
1455
        vld1.32         {d5[1]},[r1,:32], r2
 
1456
        \macs           q1,  d1,  d5
 
1457
        pld             [r1]
 
1458
        @ The counter went negative: fewer than four rows were left, so
        @ finish with just the pair already accumulated.
        blt             2f
 
1459
        @ Rows 2 and 3 from r0 go into d6.
        vld1.32         {d6[0]},[r0,:32], r2
 
1460
        vld1.32         {d6[1]},[r0,:32], r2
 
1461
        \macd           q10, d0,  d6
 
1462
        pld             [r0]
 
1463
        @ Rows 2 and 3 from r1 go into d7.
        vld1.32         {d7[0]},[r1,:32], r2
 
1464
        vld1.32         {d7[1]},[r1,:32], r2
 
1465
        \macs           q10, d1,  d7
 
1466
        pld             [r1]
 
1467
        @ Scale by the per-lane count in q9, then narrow to unsigned bytes
        @ with saturation.
        vshl.s16        q1,  q1,  q9
 
1468
        vqmovun.s16     d2,  q1
 
1469
        vshl.s16        q10, q10, q9
 
1470
        vqmovun.s16     d4,  q10
 
1471
        @ Re-seed the accumulators for the next pass. q1 (d2/d3) is reset
        @ only after both lanes of d2 have been stored.
        vmov            q10, q8
 
1472
        vst1.32         {d2[0]},[r6,:32], r2
 
1473
        vst1.32         {d2[1]},[r6,:32], r2
 
1474
        vmov            q1,  q8
 
1475
        vst1.32         {d4[0]},[r6,:32], r2
 
1476
        vst1.32         {d4[1]},[r6,:32], r2
 
1477
        bne             1b
 
1478
        @ Restore r4-r6 and return by loading pc from the stack.
        pop             {r4-r6, pc}
 
1479
        @ Tail: scale, narrow and store the single pair of rows held in q1.
2:      vshl.s16        q1,  q1,  q9
 
1480
        vqmovun.s16     d2,  q1
 
1481
        vst1.32         {d2[0]},[r6,:32], r2
 
1482
        vst1.32         {d2[1]},[r6,:32], r2
 
1483
        pop             {r4-r6, pc}
 
1484
        .endm
 
1485
 
 
1486
        @ biweight_func w
        @ Emits the shared worker biweight_h264_pixels_<w>_neon for block
        @ width w. It is entered with ip already holding the row count (the
        @ entry stubs in this file set it before branching or falling in).
        @ Registers on entry, as used by the code below:
        @   r0  pointer that is both read and, via the copy in r6, written
        @   r1  second read pointer
        @   r2  row stride in bytes
        @   r3  shift amount; the final right shift is r3 + 1
        @   three words on the stack, loaded into r4, r5 and r6:
        @     r4  signed multiplier for the r0 data
        @     r5  signed multiplier for the r1 data
        @     r6  value folded into the accumulator start value
        @ NOTE(review): presumably this mirrors the C prototype
        @ (dst, src, stride, log2_denom, weightd, weights, offset) - confirm
        @ against the declaration on the C side.
        @ The signs of r4 and r5 select one of four expansions of the loop
        @ macro, so the loop itself only multiplies by magnitudes.
        .macro  biweight_func w
 
1487
function biweight_h264_pixels_\w\()_neon
 
1488
        push            {r4-r6, lr}
 
1489
        @ Skip the 16 bytes just pushed to reach the caller's stack words.
        add             r4,  sp,  #16
 
1490
        ldm             r4,  {r4-r6}
 
1491
        @ lr = sign bit of r4 (0 or 1).
        lsr             lr,  r4,  #31
 
1492
        add             r6,  r6,  #1
 
1493
        @ lr ^= top two bits of r5. Provided bits 31 and 30 of r5 agree, lr
        @ is 0 when neither multiplier is negative, 1 when only r4 is, 2 when
        @ both are, and 3 when only r5 is. The S suffix sets the flags for
        @ the beq further down; nothing in between changes them.
        eors            lr,  lr,  r5,  lsr #30
 
1494
        @ r6 = (r6 + 1) | 1 at this point.
        orr             r6,  r6,  #1
 
1495
        vdup.16         q9,  r3
 
1496
        @ Start value for the accumulators: ((r6 + 1) | 1) << r3.
        lsl             r6,  r6,  r3
 
1497
        @ Bitwise NOT turns each lane r3 into -(r3 + 1); used as a vshl count
        @ this is a right shift by r3 + 1.
        vmvn            q9,  q9
 
1498
        vdup.16         q8,  r6
 
1499
        @ r6 now becomes the store pointer, starting at r0.
        mov             r6,  r0
 
1500
        @ Dispatch on lr: 0 -> 10, 1 -> 20, 2 -> 30, anything else -> 40.
        beq             10f
 
1501
        subs            lr,  lr,  #1
 
1502
        beq             20f
 
1503
        subs            lr,  lr,  #1
 
1504
        beq             30f
 
1505
        b               40f
 
1506
        @ Each expansion below ends in a pop that loads pc, so the four
        @ variants do not run into one another.
        @ lr == 0: accumulate both products.
10:     biweight_\w     vmlal.u8, vmlal.u8
 
1507
        @ lr == 1: negate r4 and subtract the r0 product instead.
20:     rsb             r4,  r4,  #0
 
1508
        biweight_\w     vmlal.u8, vmlsl.u8
 
1509
        @ lr == 2: negate both multipliers and subtract both products.
30:     rsb             r4,  r4,  #0
 
1510
        rsb             r5,  r5,  #0
 
1511
        biweight_\w     vmlsl.u8, vmlsl.u8
 
1512
        @ Remaining case: negate r5 and subtract the r1 product instead.
40:     rsb             r5,  r5,  #0
 
1513
        biweight_\w     vmlsl.u8, vmlal.u8
 
1514
        .endfunc
 
1515
        .endm
 
1516
 
 
1517
        @ biweight_entry w, h, b=1
        @ Emits the exported stub ff_biweight_h264_pixels_<w>x<h>_neon. The
        @ stub loads the row count h into ip and, when b is non-zero,
        @ branches to the shared worker for width w. With b=0 no branch is
        @ emitted, so execution continues with whatever is assembled
        @ directly after the stub; the caller of this macro must therefore
        @ place the worker there.
        .macro  biweight_entry w, h, b=1
 
1518
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
 
1519
        mov             ip,  #\h
 
1520
.if \b
 
1521
        b               biweight_h264_pixels_\w\()_neon
 
1522
.endif
 
1523
        .endfunc
 
1524
        .endm
 
1525
 
 
1526
        @ Instantiate the exported biweight entry points and their workers.
        @ Within each width group the b=0 entry comes last and is followed
        @ immediately by biweight_func, because that entry emits no branch
        @ and relies on running straight into the worker. Keep this order.
        biweight_entry  16, 8
 
1527
        biweight_entry  16, 16, b=0
 
1528
        biweight_func   16
 
1529
 
 
1530
        @ Width 8: heights 16, 4 and (fall-through) 8.
        biweight_entry  8,  16
 
1531
        biweight_entry  8,  4
 
1532
        biweight_entry  8,  8,  b=0
 
1533
        biweight_func   8
 
1534
 
 
1535
        @ Width 4: heights 8, 2 and (fall-through) 4.
        biweight_entry  4,  8
 
1536
        biweight_entry  4,  2
 
1537
        biweight_entry  4,  4,  b=0
 
1538
        biweight_func   4
 
1539
 
 
1540
@ Weighted prediction
 
1541
 
 
1542
        @ weight_16 mac
        @ Loop body for 16-byte-wide rows, two rows per pass, scaling data
        @ read through r0 and storing the result through r4.
        @   mac  multiply-accumulate mnemonic applied to each row with d0
        @ Register inputs read by this macro:
        @   r0  read pointer, post-incremented by r1 after each load
        @   r1  row stride in bytes
        @   r3  multiplier; only its low byte is used (vdup.8)
        @   r4  store pointer, post-incremented by r1 after each store
        @   ip  rows remaining; decremented by 2, loop ends at zero
        @   q8  initial value of every 16-bit accumulator lane
        @   q9  per-lane shift count handed to vshl.s16
        @ Ends with pop {r4, pc}, so the macro returns rather than falling
        @ through.
        .macro  weight_16 mac
 
1543
        @ Broadcast the low byte of the multiplier to all eight byte lanes.
        vdup.8          d0,  r3
 
1544
        @ Seed the accumulators of the first row with the start value in q8.
        vmov            q2,  q8
 
1545
        vmov            q3,  q8
 
1546
        @ Flags from this subs stay live until the bne at the loop bottom.
1:      subs            ip,  ip,  #2
 
1547
        @ First row: 16 bytes (address qualified as 128-bit aligned).
        vld1.8          {d20-d21},[r0,:128], r1
 
1548
        \mac            q2,  d0,  d20
 
1549
        @ Preload hint for the address r0 now points at.
        pld             [r0]
 
1550
        \mac            q3,  d0,  d21
 
1551
        @ Second row uses q12/q13 as accumulators, seeded from q8 as well.
        vmov            q12, q8
 
1552
        vld1.8          {d28-d29},[r0,:128], r1
 
1553
        vmov            q13, q8
 
1554
        \mac            q12, d0,  d28
 
1555
        pld             [r0]
 
1556
        \mac            q13, d0,  d29
 
1557
        @ Scale by the per-lane count in q9, then narrow to unsigned bytes
        @ with saturation.
        vshl.s16        q2,  q2,  q9
 
1558
        vshl.s16        q3,  q3,  q9
 
1559
        vqmovun.s16     d4,  q2
 
1560
        vqmovun.s16     d5,  q3
 
1561
        vshl.s16        q12, q12, q9
 
1562
        vshl.s16        q13, q13, q9
 
1563
        vqmovun.s16     d24, q12
 
1564
        vqmovun.s16     d25, q13
 
1565
        @ Re-seed for the next pass. q2 (d4/d5) is reset only after the
        @ store has read it.
        vmov            q3,  q8
 
1566
        vst1.8          {d4- d5}, [r4,:128], r1
 
1567
        vmov            q2,  q8
 
1568
        vst1.8          {d24-d25},[r4,:128], r1
 
1569
        bne             1b
 
1570
        @ Restore r4 and return by loading pc from the stack.
        pop             {r4, pc}
 
1571
        .endm
 
1572
 
 
1573
        @ weight_8 mac
        @ Loop body for 8-byte-wide rows, two rows per pass, scaling data
        @ read through r0 and storing the result through r4.
        @   mac  multiply-accumulate mnemonic applied to each row with d0
        @ Register inputs read by this macro:
        @   r0  read pointer, post-incremented by r1 after each load
        @   r1  row stride in bytes
        @   r3  multiplier; only its low byte is used (vdup.8)
        @   r4  store pointer, post-incremented by r1 after each store
        @   ip  rows remaining; decremented by 2, loop ends at zero
        @   q8  initial value of every 16-bit accumulator lane
        @   q9  per-lane shift count handed to vshl.s16
        @ Ends with pop {r4, pc}, so the macro returns rather than falling
        @ through.
        .macro  weight_8 mac
 
1574
        @ Broadcast the low byte of the multiplier to all eight byte lanes.
        vdup.8          d0,  r3
 
1575
        @ q1 accumulates the first row, q10 the second; both start from q8.
        vmov            q1,  q8
 
1576
        vmov            q10, q8
 
1577
        @ Flags from this subs stay live until the bne at the loop bottom.
1:      subs            ip,  ip,  #2
 
1578
        vld1.8          {d4},[r0,:64], r1
 
1579
        \mac            q1,  d0,  d4
 
1580
        @ Preload hint for the address r0 now points at.
        pld             [r0]
 
1581
        vld1.8          {d6},[r0,:64], r1
 
1582
        \mac            q10, d0,  d6
 
1583
        pld             [r0]
 
1584
        @ Scale by the per-lane count in q9, then narrow to unsigned bytes
        @ with saturation.
        vshl.s16        q1,  q1,  q9
 
1585
        vqmovun.s16     d2,  q1
 
1586
        vshl.s16        q10, q10, q9
 
1587
        vqmovun.s16     d4,  q10
 
1588
        @ Re-seed for the next pass. q1 (d2/d3) is reset only after d2 has
        @ been stored.
        vmov            q10, q8
 
1589
        vst1.8          {d2},[r4,:64], r1
 
1590
        vmov            q1,  q8
 
1591
        vst1.8          {d4},[r4,:64], r1
 
1592
        bne             1b
 
1593
        @ Restore r4 and return by loading pc from the stack.
        pop             {r4, pc}
 
1594
        .endm
 
1595
 
 
1596
        @ weight_4 mac
        @ Loop body for 4-byte-wide rows. Two rows are packed into one
        @ 64-bit register, so a full pass handles four rows; a short tail at
        @ label 2 handles the case where only one pair of rows is left.
        @   mac  multiply-accumulate mnemonic applied to each row pair with d0
        @ Register inputs read by this macro:
        @   r0  read pointer, post-incremented by r1 after each load
        @   r1  row stride in bytes
        @   r3  multiplier; only its low byte is used (vdup.8)
        @   r4  store pointer, post-incremented by r1 after each store
        @   ip  rows remaining; decremented by 4 per pass
        @   q8  initial value of every 16-bit accumulator lane
        @   q9  per-lane shift count handed to vshl.s16
        @ Both exits pop r4 and pc, so the macro never falls through.
        .macro  weight_4 mac
 
1597
        @ Broadcast the low byte of the multiplier to all eight byte lanes.
        vdup.8          d0,  r3
 
1598
        @ q1 accumulates rows 0-1 of a pass, q10 rows 2-3; both start at q8.
        vmov            q1,  q8
 
1599
        vmov            q10, q8
 
1600
        @ The flags from this subs feed both the blt below and the bne at
        @ the bottom of the loop.
1:      subs            ip,  ip,  #4
 
1601
        @ Rows 0 and 1 go into the two 32-bit lanes of d4.
        vld1.32         {d4[0]},[r0,:32], r1
 
1602
        vld1.32         {d4[1]},[r0,:32], r1
 
1603
        \mac            q1,  d0,  d4
 
1604
        @ Preload hint for the address r0 now points at.
        pld             [r0]
 
1605
        @ The counter went negative: fewer than four rows were left, so
        @ finish with just the pair already accumulated.
        blt             2f
 
1606
        @ Rows 2 and 3 go into d6.
        vld1.32         {d6[0]},[r0,:32], r1
 
1607
        vld1.32         {d6[1]},[r0,:32], r1
 
1608
        \mac            q10, d0,  d6
 
1609
        pld             [r0]
 
1610
        @ Scale by the per-lane count in q9, then narrow to unsigned bytes
        @ with saturation.
        vshl.s16        q1,  q1,  q9
 
1611
        vqmovun.s16     d2,  q1
 
1612
        vshl.s16        q10, q10, q9
 
1613
        vqmovun.s16     d4,  q10
 
1614
        @ Re-seed for the next pass. q1 (d2/d3) is reset only after both
        @ lanes of d2 have been stored.
        vmov            q10, q8
 
1615
        vst1.32         {d2[0]},[r4,:32], r1
 
1616
        vst1.32         {d2[1]},[r4,:32], r1
 
1617
        vmov            q1,  q8
 
1618
        vst1.32         {d4[0]},[r4,:32], r1
 
1619
        vst1.32         {d4[1]},[r4,:32], r1
 
1620
        bne             1b
 
1621
        @ Restore r4 and return by loading pc from the stack.
        pop             {r4, pc}
 
1622
        @ Tail: scale, narrow and store the single pair of rows held in q1.
2:      vshl.s16        q1,  q1,  q9
 
1623
        vqmovun.s16     d2,  q1
 
1624
        vst1.32         {d2[0]},[r4,:32], r1
 
1625
        vst1.32         {d2[1]},[r4,:32], r1
 
1626
        pop             {r4, pc}
 
1627
        .endm
 
1628
 
 
1629
        @ weight_func w
        @ Emits the shared worker weight_h264_pixels_<w>_neon for block
        @ width w. It is entered with ip already holding the row count (the
        @ entry stubs in this file set it before branching or falling in).
        @ Registers on entry, as used by the code below:
        @   r0  pointer that is both read and, via the copy in r4, written
        @   r1  row stride in bytes
        @   r2  shift amount applied as a right shift to each result
        @   r3  signed multiplier
        @   one word on the stack, loaded into r4: value folded into the
        @   accumulator start value
        @ NOTE(review): presumably this mirrors the C prototype
        @ (block, stride, log2_denom, weight, offset) - confirm against the
        @ declaration on the C side.
        @ The sign of r3 selects which expansion of the loop macro runs, so
        @ the loop itself only multiplies by the magnitude.
        .macro  weight_func w
 
1630
function weight_h264_pixels_\w\()_neon
 
1631
        push            {r4, lr}
 
1632
        @ Skip the 8 bytes just pushed to reach the caller's stack word.
        ldr             r4,  [sp, #8]
 
1633
        vdup.16         q9,  r2
 
1634
        mov             lr,  #1
 
1635
        @ r4 = stack argument << r2.
        lsl             r4,  r4,  r2
 
1636
        @ r2 becomes r2 - 1; the flags are consumed by the addge below.
        subs            r2,  r2,  #1
 
1637
        @ Negated count: vshl with a negative count is a right shift by the
        @ original r2.
        vneg.s16        q9,  q9
 
1638
        @ Only when the original r2 was at least 1: add 1 << (r2 - 1) as the
        @ rounding term for that right shift.
        addge           r4,  r4,  lr,  lsl r2
 
1639
        @ Flags for the blt below; the two instructions in between leave
        @ them unchanged.
        cmp             r3,  #0
 
1640
        vdup.16         q8,  r4
 
1641
        @ r4 now becomes the store pointer, starting at r0.
        mov             r4,  r0
 
1642
        blt             10f
 
1643
        @ Non-negative multiplier: accumulate the product. The expansion
        @ ends in a pop that loads pc, so it does not run into label 10.
        weight_\w       vmlal.u8
 
1644
        @ Negative multiplier: negate it and subtract the product instead.
10:     rsb             r3,  r3,  #0
 
1645
        weight_\w       vmlsl.u8
 
1646
        .endfunc
 
1647
        .endm
 
1648
 
 
1649
        @ weight_entry w, h, b=1
        @ Emits the exported stub ff_weight_h264_pixels_<w>x<h>_neon. The
        @ stub loads the row count h into ip and, when b is non-zero,
        @ branches to the shared worker for width w. With b=0 no branch is
        @ emitted, so execution continues with whatever is assembled
        @ directly after the stub; the caller of this macro must therefore
        @ place the worker there.
        .macro  weight_entry w, h, b=1
 
1650
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
 
1651
        mov             ip,  #\h
 
1652
.if \b
 
1653
        b               weight_h264_pixels_\w\()_neon
 
1654
.endif
 
1655
        .endfunc
 
1656
        .endm
 
1657
 
 
1658
        @ Instantiate the exported weight entry points and their workers.
        @ Within each width group the b=0 entry comes last and is followed
        @ immediately by weight_func, because that entry emits no branch and
        @ relies on running straight into the worker. Keep this order.
        weight_entry    16, 8
 
1659
        weight_entry    16, 16, b=0
 
1660
        weight_func     16
 
1661
 
 
1662
        @ Width 8: heights 16, 4 and (fall-through) 8.
        weight_entry    8,  16
 
1663
        weight_entry    8,  4
 
1664
        weight_entry    8,  8,  b=0
 
1665
        weight_func     8
 
1666
 
 
1667
        @ Width 4: heights 8, 2 and (fall-through) 4.
        weight_entry    4,  8
 
1668
        weight_entry    4,  2
 
1669
        weight_entry    4,  4,  b=0
 
1670
        weight_func     4