62
62
//! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
63
63
//! p_unit_test 5, %res5
64
writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));
65
bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));
66
67
//! v1: %res6 = v_subrev_f32 %a, %b
67
68
//! p_unit_test 6, %res6
264
265
finish_opt_test();
267
Temp create_subbrev_co(Operand op0, Operand op1, Operand op2)
269
create_subbrev_co(Operand op0, Operand op1, Operand op2)
269
271
return bld.vop2_e64(aco_opcode::v_subbrev_co_u32, bld.def(v1), bld.def(bld.lm), op0, op1, op2);
880
static const char *srcdest_op_name(aco_opcode op)
883
srcdest_op_name(aco_opcode op)
883
case aco_opcode::v_cndmask_b32:
885
case aco_opcode::v_min_f32:
887
case aco_opcode::v_rcp_f32:
886
case aco_opcode::v_cndmask_b32: return "cndmask";
887
case aco_opcode::v_min_f32: return "min";
888
case aco_opcode::v_rcp_f32: return "rcp";
889
default: return "none";
894
static Temp emit_denorm_srcdest(aco_opcode op, Temp val)
894
emit_denorm_srcdest(aco_opcode op, Temp val)
897
897
case aco_opcode::v_cndmask_b32:
898
898
return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]);
899
899
case aco_opcode::v_min_f32:
900
900
return bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), val);
901
case aco_opcode::v_rcp_f32:
902
return bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), val);
901
case aco_opcode::v_rcp_f32: return bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), val);
917
915
configs.push_back({flush, op, aco_opcode::num_opcodes, dest});
920
for (aco_opcode src : {aco_opcode::v_cndmask_b32, aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
918
for (aco_opcode src :
919
{aco_opcode::v_cndmask_b32, aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
921
920
for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
922
921
configs.push_back({flush, op, src, aco_opcode::num_opcodes});
926
925
for (denorm_config cfg : configs) {
927
926
char subvariant[128];
928
sprintf(subvariant, "_%s_%s_%s_%s",
929
cfg.flush ? "flush" : "keep", srcdest_op_name(cfg.src),
927
sprintf(subvariant, "_%s_%s_%s_%s", cfg.flush ? "flush" : "keep", srcdest_op_name(cfg.src),
930
928
denorm_op_names[(int)cfg.op], srcdest_op_name(cfg.dest));
931
929
if (!setup_cs("v1 s2", (amd_gfx_level)i, CHIP_UNKNOWN, subvariant))
934
bool can_propagate = cfg.src == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.src == aco_opcode::v_min_f32) ||
935
cfg.dest == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.dest == aco_opcode::v_min_f32) ||
932
bool can_propagate = cfg.src == aco_opcode::v_rcp_f32 ||
933
(i >= GFX9 && cfg.src == aco_opcode::v_min_f32) ||
934
cfg.dest == aco_opcode::v_rcp_f32 ||
935
(i >= GFX9 && cfg.dest == aco_opcode::v_min_f32) || !cfg.flush;
938
fprintf(output, "src, dest, op: %s %s %s\n",
939
srcdest_op_name(cfg.src), srcdest_op_name(cfg.dest), denorm_op_names[(int)cfg.op]);
937
fprintf(output, "src, dest, op: %s %s %s\n", srcdest_op_name(cfg.src),
938
srcdest_op_name(cfg.dest), denorm_op_names[(int)cfg.op]);
940
939
fprintf(output, "can_propagate: %u\n", can_propagate);
941
940
//! src, dest, op: $src $dest $op
942
941
//! can_propagate: #can_propagate
976
975
case denorm_mul1:
977
976
val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f800000u), val);
986
val = fneg(fabs(val));
978
case denorm_fneg: val = fneg(val); break;
979
case denorm_fabs: val = fabs(val); break;
980
case denorm_fnegabs: val = fneg(fabs(val)); break;
989
982
val = emit_denorm_srcdest(cfg.dest, val);
1123
1116
//! v1: %res2 = v_mul_f32 0x12345678, %a
1124
1117
//! p_unit_test 2, %res2
1125
1118
Temp literal1 = bld.copy(bld.def(v1), Operand::c32(0x12345678u));
1126
writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1)));
1120
bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1)));
1128
1122
//! v1: %literal2 = p_parallelcopy 0x12345679
1129
1123
//! v1: %res3 = v_mul_f32 %a, %literal row_shl:1 bound_ctrl:1
1130
1124
//! p_unit_test 3, %res3
1131
1125
Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u));
1132
writeout(3, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_sl(1)));
1127
bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_sl(1)));
1134
1129
//! v1: %b_v = p_parallelcopy %b
1135
1130
//! v1: %res4 = v_mul_f32 %b, %a
1171
1166
//! v1: %res2_tmp = v_mul_f32 -1.0, %a16
1172
1167
//! v2b: %res2 = v_mul_f16 %res2_tmp, %a16
1173
1168
//! p_unit_test 2, %res2
1174
writeout(2, fmul(u2u16(bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0xbf800000u), bld.as_uniform(a16))), a16));
1169
writeout(2, fmul(u2u16(bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1),
1170
Operand::c32(0xbf800000u), bld.as_uniform(a16))),
1176
1173
//! v1: %res3_tmp = v_mul_f32 %a, %a
1177
//! v2b: %res3 = v_med3_f16 0, 1.0, %res3_tmp
1174
//! v2b: %res3 = v_add_f16 %res3_tmp, 0 clamp
1178
1175
//! p_unit_test 3, %res3
1179
1176
writeout(3, fsat(u2u16(fmul(a, a))));
1181
1178
//! v2b: %res4_tmp = v_mul_f16 %a16, %a16
1182
//! v1: %res4 = v_med3_f32 0, 1.0, %res4_tmp
1179
//! v1: %res4 = v_add_f32 %res4_tmp, 0 clamp
1183
1180
//! p_unit_test 4, %res4
1184
1181
writeout(4, fsat(bld.as_uniform(fmul(a16, a16))));
1191
1188
//! v2b: %res6_tmp = v_mul_f16 %a16, %a16
1192
1189
//! v1: %res6 = v_mul_f32 2.0, %res6_tmp
1193
1190
//! p_unit_test 6, %res6
1194
writeout(6, fmul(bld.as_uniform(fmul(a16, a16)), bld.copy(bld.def(v1), Operand::c32(0x40000000))));
1192
fmul(bld.as_uniform(fmul(a16, a16)), bld.copy(bld.def(v1), Operand::c32(0x40000000))));
1196
1194
//! v1: %res7_tmp = v_mul_f32 %a, %a
1197
1195
//! v2b: %res7 = v_add_f16 %res7_tmp, %a16
1211
1209
//! v2b: %res10_tmp = v_mul_f16 %a16, %a16
1212
1210
//! v1: %res10 = v_mul_f32 -1.0, %res10_tmp
1213
1211
//! p_unit_test 10, %res10
1214
writeout(10, bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0xbf800000u), bld.as_uniform(fmul(a16, a16))));
1212
writeout(10, bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0xbf800000u),
1213
bld.as_uniform(fmul(a16, a16))));
1216
1215
finish_opt_test();
1383
1382
writeout(14, fmul(f2f32(ext_ushort(a, 1)), a));
1385
1384
//~gfx(9|10)! v1: %res15_cvt = v_cvt_f32_f16 %a dst_sel:uword0 src0_sel:dword
1386
//~gfx11! v1: %res16_cvt1 = v_cvt_f32_f16 %a
1385
//~gfx11! v1: %res16_cvt1 = v_fma_mix_f32 lo(%a), 1.0, -0
1387
1386
//~gfx11! v1: %res15_cvt = p_extract %res16_cvt1, 0, 16, 0
1388
1387
//! v1: %res15 = v_mul_f32 %res15_cvt, %a
1389
1388
//! p_unit_test 15, %res15
1390
1389
writeout(15, fmul(ext_ushort(f2f32(a), 0), a));
1392
//! v1: %res16_cvt = v_cvt_f32_f16 %a
1391
//~gfx(9|10)! v1: %res16_cvt = v_cvt_f32_f16 %a
1393
1392
//~gfx(9|10)! v1: %res16 = v_mul_f32 %res16_cvt, %a dst_sel:dword src0_sel:uword1 src1_sel:dword
1393
//~gfx11! v1: %res16_cvt = v_fma_mix_f32 lo(%a), 1.0, -0
1394
1394
//~gfx11! v1: %res16_ext = p_extract %res16_cvt, 1, 16, 0
1395
1395
//~gfx11! v1: %res16 = v_mul_f32 %res16_ext, %a
1396
1396
//! p_unit_test 16, %res16
1548
1548
//! v1: %res2_mul = v_fma_mix_f32 lo(%a16), %b, -0
1549
1549
//! v1: %res2 = v_add_f32 %res2_mul, %c *2
1550
1550
//! p_unit_test 2, %res2
1551
writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000), fadd(fmul(f2f32(a16), b), c)));
1551
writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000),
1552
fadd(fmul(f2f32(a16), b), c)));
1553
1554
/* neg/abs modifiers */
1554
1555
//! v1: %res3 = v_fma_mix_f32 -lo(%a16), %b, |lo(%c16)|
1701
1702
writeout(3, f2f32(u2u16(fmul(a, a))));
1703
1704
//! v1: %res4_mul = v_fma_mix_f32 lo(%a16), %a, -0
1704
//! v2b: %res4 = v_med3_f16 0, 1.0, %res4_mul
1705
//! v2b: %res4 = v_add_f16 %res4_mul, 0 clamp
1705
1706
//! p_unit_test 4, %res4
1706
1707
writeout(4, fsat(u2u16(fmul(f2f32(a16), a))));
1708
1709
//! v2b: %res5_mul = v_fma_mixlo_f16 %a, %a, -0
1709
//! v1: %res5 = v_med3_f32 0, 1.0, %res5_mul
1710
//! v1: %res5 = v_add_f32 %res5_mul, 0 clamp
1710
1711
//! p_unit_test 5, %res5
1711
1712
writeout(5, fsat(bld.as_uniform(f2f16(fmul(a, a)))));
1732
static void vop3p_constant(unsigned *idx, aco_opcode op, const char *swizzle, uint32_t val)
1734
vop3p_constant(unsigned* idx, aco_opcode op, const char* swizzle, uint32_t val)
1734
1736
uint32_t halves[2] = {val & 0xffff, val >> 16};
1735
1737
uint32_t expected = halves[swizzle[0] - 'x'] | (halves[swizzle[1] - 'x'] << 16);
1744
1746
BEGIN_TEST(optimize.vop3p_constants)
1745
1747
for (aco_opcode op : {aco_opcode::v_pk_add_f16, aco_opcode::v_pk_add_u16}) {
1746
for (const char *swizzle : {"xx", "yy", "xy", "yx"}) {
1748
for (const char* swizzle : {"xx", "yy", "xy", "yx"}) {
1747
1749
char variant[16];
1748
1750
strcpy(variant, op == aco_opcode::v_pk_add_f16 ? "_f16" : "_u16");
1749
1751
strcat(variant, "_");