1099
1369
const struct lp_type type = bld->type;
1371
assert(lp_check_value(type, a));
1101
1373
assert(type.floating);
1103
if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
1104
return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
1375
if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1376
const unsigned num_iterations = 0;
1380
res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
1382
for (i = 0; i < num_iterations; ++i) {
1383
res = lp_build_rsqrt_refine(bld, a, res);
1106
1389
return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1393
/**
 * Build a constant vector of four 32-bit integers with every lane set to
 * the same value.
 *
 * The scalar is created with LLVMConstInt() on LLVMInt32Type() with the
 * sign-extend argument set to 0, then replicated into a 4-element array
 * that is handed to LLVMConstVector().
 *
 * NOTE(review): value is an unsigned long but the element type is 32 bits
 * wide; callers pass expressions such as ~1 and ~0 -- confirm that the
 * upper bits are discarded as intended on LP64 targets.
 */
static inline LLVMValueRef
1394
lp_build_const_v4si(unsigned long value)
1396
LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
1397
LLVMValueRef elements[4] = { element, element, element, element };
1398
return LLVMConstVector(elements, 4);
1401
/**
 * Build a constant vector of four floats with every lane set to the same
 * value.
 *
 * The scalar is created with LLVMConstReal() on LLVMFloatType(), then
 * replicated into a 4-element array that is handed to LLVMConstVector().
 */
static inline LLVMValueRef
1402
lp_build_const_v4sf(float value)
1404
LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
1405
LLVMValueRef elements[4] = { element, element, element, element };
1406
return LLVMConstVector(elements, 4);
1411
* Generate sin(a) using SSE2
1414
lp_build_sin(struct lp_build_context *bld,
1417
struct lp_type int_type = lp_int_type(bld->type);
1418
LLVMBuilderRef b = bld->builder;
1419
LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1420
LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1423
* take the absolute value,
1424
* x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1427
LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1428
LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1430
LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1431
LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1434
* extract the sign bit (upper one)
1435
* sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1437
LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
1438
LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1442
* y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1445
LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1446
LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1449
* store the integer part of y in emm2
1450
* emm2 = _mm_cvttps_epi32(y);
1453
LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1456
* j=(j+1) & (~1) (see the cephes sources)
1457
* emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1460
LLVMValueRef all_one = lp_build_const_v4si(1);
1461
LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1463
* emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1465
LLVMValueRef inv_one = lp_build_const_v4si(~1);
1466
LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1469
* y = _mm_cvtepi32_ps(emm2);
1471
LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1473
/* get the swap sign flag
1474
* emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1476
LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1477
LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1480
* emm2 = _mm_slli_epi32(emm0, 29);
1482
LLVMValueRef const_29 = lp_build_const_v4si(29);
1483
LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1486
* get the polynomial selection mask
1487
* there is one polynomial for 0 <= x <= Pi/4
1488
* and another one for Pi/4<x<=Pi/2
1489
* Both branches will be computed.
1491
* emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1492
* emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1495
LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1496
LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1497
LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1498
emm2_3, lp_build_const_v4si(0));
1500
* sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1502
LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1505
* _PS_CONST(minus_cephes_DP1, -0.78515625);
1506
* _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1507
* _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1509
LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1510
LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1511
LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1514
* The magic pass: "Extended precision modular arithmetic"
1515
* x = ((x - y * DP1) - y * DP2) - y * DP3;
1516
* xmm1 = _mm_mul_ps(y, xmm1);
1517
* xmm2 = _mm_mul_ps(y, xmm2);
1518
* xmm3 = _mm_mul_ps(y, xmm3);
1520
LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1521
LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1522
LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1525
* x = _mm_add_ps(x, xmm1);
1526
* x = _mm_add_ps(x, xmm2);
1527
* x = _mm_add_ps(x, xmm3);
1530
LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1531
LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1532
LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1535
* Evaluate the first polynomial (0 <= x <= Pi/4)
1537
* z = _mm_mul_ps(x,x);
1539
LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1542
* _PS_CONST(coscof_p0, 2.443315711809948E-005);
1543
* _PS_CONST(coscof_p1, -1.388731625493765E-003);
1544
* _PS_CONST(coscof_p2, 4.166664568298827E-002);
1546
LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1547
LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1548
LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1551
* y = *(v4sf*)_ps_coscof_p0;
1552
* y = _mm_mul_ps(y, z);
1554
LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1555
LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1556
LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1557
LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1558
LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1559
LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1563
* tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1564
* y = _mm_sub_ps(y, tmp);
1565
* y = _mm_add_ps(y, *(v4sf*)_ps_1);
1567
LLVMValueRef half = lp_build_const_v4sf(0.5);
1568
LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1569
LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1570
LLVMValueRef one = lp_build_const_v4sf(1.0);
1571
LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1574
* _PS_CONST(sincof_p0, -1.9515295891E-4);
1575
* _PS_CONST(sincof_p1, 8.3321608736E-3);
1576
* _PS_CONST(sincof_p2, -1.6666654611E-1);
1578
LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1579
LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1580
LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1583
* Evaluate the second polynomial (Pi/4 < x <= Pi/2)
1585
* y2 = *(v4sf*)_ps_sincof_p0;
1586
* y2 = _mm_mul_ps(y2, z);
1587
* y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1588
* y2 = _mm_mul_ps(y2, z);
1589
* y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1590
* y2 = _mm_mul_ps(y2, z);
1591
* y2 = _mm_mul_ps(y2, x);
1592
* y2 = _mm_add_ps(y2, x);
1595
LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1596
LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1597
LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1598
LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1599
LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1600
LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1601
LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1604
* select the correct result from the two polynomials
1606
* y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1607
* y = _mm_andnot_ps(xmm3, y);
1608
* y = _mm_add_ps(y,y2);
1610
LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1611
LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1612
LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1613
LLVMValueRef inv = lp_build_const_v4si(~0);
1614
LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1615
LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1616
LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1620
* y = _mm_xor_ps(y, sign_bit);
1622
LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1623
LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1629
* Generate cos(a) using SSE2
1114
1632
lp_build_cos(struct lp_build_context *bld,
1117
const struct lp_type type = bld->type;
1118
LLVMTypeRef vec_type = lp_build_vec_type(type);
1121
/* TODO: optimize the constant case */
1123
assert(type.floating);
1124
util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);
1126
return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1134
lp_build_sin(struct lp_build_context *bld,
1137
const struct lp_type type = bld->type;
1138
LLVMTypeRef vec_type = lp_build_vec_type(type);
1141
/* TODO: optimize the constant case */
1143
assert(type.floating);
1144
util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);
1146
return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1635
struct lp_type int_type = lp_int_type(bld->type);
1636
LLVMBuilderRef b = bld->builder;
1637
LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1638
LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1641
* take the absolute value,
1642
* x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1645
LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1646
LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1648
LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1649
LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1653
* y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1656
LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1657
LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1660
* store the integer part of y in emm2
1661
* emm2 = _mm_cvttps_epi32(y);
1664
LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1667
* j=(j+1) & (~1) (see the cephes sources)
1668
* emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1671
LLVMValueRef all_one = lp_build_const_v4si(1);
1672
LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1674
* emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1676
LLVMValueRef inv_one = lp_build_const_v4si(~1);
1677
LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1680
* y = _mm_cvtepi32_ps(emm2);
1682
LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1686
* emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1688
LLVMValueRef const_2 = lp_build_const_v4si(2);
1689
LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1692
/* get the swap sign flag
1693
* emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1695
LLVMValueRef inv = lp_build_const_v4si(~0);
1696
LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1697
LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1698
LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1701
* emm2 = _mm_slli_epi32(emm0, 29);
1703
LLVMValueRef const_29 = lp_build_const_v4si(29);
1704
LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1707
* get the polynomial selection mask
1708
* there is one polynomial for 0 <= x <= Pi/4
1709
* and another one for Pi/4<x<=Pi/2
1710
* Both branches will be computed.
1712
* emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1713
* emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1716
LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1717
LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1718
LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1719
emm2_3, lp_build_const_v4si(0));
1722
* _PS_CONST(minus_cephes_DP1, -0.78515625);
1723
* _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1724
* _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1726
LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1727
LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1728
LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1731
* The magic pass: "Extended precision modular arithmetic"
1732
* x = ((x - y * DP1) - y * DP2) - y * DP3;
1733
* xmm1 = _mm_mul_ps(y, xmm1);
1734
* xmm2 = _mm_mul_ps(y, xmm2);
1735
* xmm3 = _mm_mul_ps(y, xmm3);
1737
LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1738
LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1739
LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1742
* x = _mm_add_ps(x, xmm1);
1743
* x = _mm_add_ps(x, xmm2);
1744
* x = _mm_add_ps(x, xmm3);
1747
LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1748
LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1749
LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1752
* Evaluate the first polynomial (0 <= x <= Pi/4)
1754
* z = _mm_mul_ps(x,x);
1756
LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1759
* _PS_CONST(coscof_p0, 2.443315711809948E-005);
1760
* _PS_CONST(coscof_p1, -1.388731625493765E-003);
1761
* _PS_CONST(coscof_p2, 4.166664568298827E-002);
1763
LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1764
LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1765
LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1768
* y = *(v4sf*)_ps_coscof_p0;
1769
* y = _mm_mul_ps(y, z);
1771
LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1772
LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1773
LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1774
LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1775
LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1776
LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1780
* tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1781
* y = _mm_sub_ps(y, tmp);
1782
* y = _mm_add_ps(y, *(v4sf*)_ps_1);
1784
LLVMValueRef half = lp_build_const_v4sf(0.5);
1785
LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1786
LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1787
LLVMValueRef one = lp_build_const_v4sf(1.0);
1788
LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1791
* _PS_CONST(sincof_p0, -1.9515295891E-4);
1792
* _PS_CONST(sincof_p1, 8.3321608736E-3);
1793
* _PS_CONST(sincof_p2, -1.6666654611E-1);
1795
LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1796
LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1797
LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1800
* Evaluate the second polynomial (Pi/4 < x <= Pi/2)
1802
* y2 = *(v4sf*)_ps_sincof_p0;
1803
* y2 = _mm_mul_ps(y2, z);
1804
* y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1805
* y2 = _mm_mul_ps(y2, z);
1806
* y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1807
* y2 = _mm_mul_ps(y2, z);
1808
* y2 = _mm_mul_ps(y2, x);
1809
* y2 = _mm_add_ps(y2, x);
1812
LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1813
LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1814
LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1815
LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1816
LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1817
LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1818
LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1821
* select the correct result from the two polynomials
1823
* y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1824
* y = _mm_andnot_ps(xmm3, y);
1825
* y = _mm_add_ps(y,y2);
1827
LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1828
LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1829
LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1830
LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1831
LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1832
LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1836
* y = _mm_xor_ps(y, sign_bit);
1838
LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
1839
LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");