296
248
rc_remove_instruction(inst);
300
* Definition of LIT (from ARB_fragment_program):
302
* tmp = VectorLoad(op0);
303
* if (tmp.x < 0) tmp.x = 0;
304
* if (tmp.y < 0) tmp.y = 0;
305
* if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
306
* else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
309
* result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
312
* The longest path of computation is the one leading to result.z,
313
* consisting of 5 operations. This implementation of LIT takes
314
* 5 slots, if the subsequent optimization passes are clever enough
315
* to pair instructions correctly.
317
/*
 * Replace a LIT instruction with a sequence of MAX, MIN, LG2, MUL, EX2,
 * CMP and MOV instructions, then remove the original instruction.
 *
 * c:    compiler context; provides the constant pool and free temporaries.
 * inst: the LIT instruction to replace.
 *
 * NOTE(review): one operand of the MAX call and one operand of the CMP
 * call are not visible in this listing; the comments below describe only
 * the operands that are.
 */
static void transform_LIT(struct radeon_compiler* c,
318
struct rc_instruction* inst)
320
unsigned int constant;
321
unsigned int constant_swizzle;
323
struct rc_src_register srctemp;
325
/* Lower bound of the exponent clamp; its negation is the upper bound. */
constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);
327
/*
 * The sequence below uses the destination register as scratch space in
 * all four components. If the destination is not a fully written
 * temporary, redirect the computation to a fresh temporary and emit a
 * MOV from that temporary to the original destination.
 */
if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
328
struct rc_instruction * inst_mov;
330
/* This MOV is emitted at inst, whereas everything else is emitted at inst->Prev. */
inst_mov = emit1(c, inst,
331
RC_OPCODE_MOV, NULL, inst->U.I.DstReg,
332
srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c)));
334
inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
335
inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
336
inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
339
temp = inst->U.I.DstReg.Index;
340
srctemp = srcreg(RC_FILE_TEMPORARY, temp);
342
/* tmp.x = max(0.0, Src.x); */
343
/* tmp.y = max(0.0, Src.y); */
344
/* tmp.w = max(-128+eps, Src.w); tmp.z = min(tmp.w, 128-eps) -- the clamped exponent ends up in tmp.z */
345
emit2(c, inst->Prev, RC_OPCODE_MAX, NULL,
346
dstregtmpmask(temp, RC_MASK_XYW),
348
swizzle(srcreg(RC_FILE_CONSTANT, constant),
349
RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3));
350
emit2(c, inst->Prev, RC_OPCODE_MIN, NULL,
351
dstregtmpmask(temp, RC_MASK_Z),
352
swizzle_wwww(srctemp),
353
negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));
355
/* tmp.w = Pow(tmp.y, tmp.z), computed as EX2(LG2(tmp.y) * tmp.z) */
356
emit1(c, inst->Prev, RC_OPCODE_LG2, NULL,
357
dstregtmpmask(temp, RC_MASK_W),
358
swizzle_yyyy(srctemp));
359
emit2(c, inst->Prev, RC_OPCODE_MUL, NULL,
360
dstregtmpmask(temp, RC_MASK_W),
361
swizzle_wwww(srctemp),
362
swizzle_zzzz(srctemp));
363
emit1(c, inst->Prev, RC_OPCODE_EX2, NULL,
364
dstregtmpmask(temp, RC_MASK_W),
365
swizzle_wwww(srctemp));
367
/* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
368
/* The condition operand is -tmp.x; &inst->U.I is passed as the base instruction (presumably so modifiers of the original LIT carry over -- TODO confirm against emit3). */
emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I,
369
dstregtmpmask(temp, RC_MASK_Z),
370
negate(swizzle_xxxx(srctemp)),
371
swizzle_wwww(srctemp),
374
/* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
375
emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I,
376
dstregtmpmask(temp, RC_MASK_XYW),
377
swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
379
/* The expansion is complete; drop the original LIT. */
rc_remove_instruction(inst);
382
251
static void transform_LRP(struct radeon_compiler* c,
383
252
struct rc_instruction* inst)
394
263
rc_remove_instruction(inst);
397
/*
 * Replace a POW instruction with LG2, MUL and EX2:
 *   tmp.w = LG2(src0.x)
 *   tmp.w = tmp.w * src1.x
 *   dst   = EX2(tmp.w)
 * Only the x components of both sources are read. The original
 * instruction is removed afterwards.
 *
 * c:    compiler context.
 * inst: the POW instruction to replace.
 */
static void transform_POW(struct radeon_compiler* c,
398
struct rc_instruction* inst)
400
struct rc_dst_register tempdst = new_dst_reg(c, inst);
401
struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
402
/* The intermediate value lives in the w component of the scratch register. */
tempdst.WriteMask = RC_MASK_W;
403
tempsrc.Swizzle = RC_SWIZZLE_WWWW;
405
emit1(c, inst->Prev, RC_OPCODE_LG2, NULL, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
406
emit2(c, inst->Prev, RC_OPCODE_MUL, NULL, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
407
/* The final EX2 writes the original destination and is emitted with inst as its base instruction. */
emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc);
409
rc_remove_instruction(inst);
412
/* dst = ROUND(src) :
417
* According to the GLSL spec, the implementor can decide which way to round
418
* when the fraction is .5. We round down for .5.
421
/*
 * Replace a ROUND instruction with ADD, FRC and ADD:
 *   add  = src0 + (second operand, not visible in this listing)
 *   frac = FRC(add)
 *   dst  = add - frac
 * Both temporaries are written with the same write mask as the original
 * destination. The original instruction is removed afterwards.
 *
 * c:    compiler context; provides free temporaries.
 * inst: the ROUND instruction to replace.
 */
static void transform_ROUND(struct radeon_compiler* c,
422
struct rc_instruction* inst)
424
unsigned int mask = inst->U.I.DstReg.WriteMask;
425
unsigned int frac_index, add_index;
426
struct rc_dst_register frac_dst, add_dst;
427
struct rc_src_register frac_src, add_src;
430
add_index = rc_find_free_temporary(c);
431
add_dst = dstregtmpmask(add_index, mask);
432
emit2(c, inst->Prev, RC_OPCODE_ADD, NULL, add_dst, inst->U.I.SrcReg[0],
434
add_src = srcreg(RC_FILE_TEMPORARY, add_dst.Index);
437
/* frac = FRC(add) */
438
frac_index = rc_find_free_temporary(c);
439
frac_dst = dstregtmpmask(frac_index, mask);
440
emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, frac_dst, add_src);
441
frac_src = srcreg(RC_FILE_TEMPORARY, frac_dst.Index);
443
/* dst = add - frac */
444
/* NOTE(review): emitted with a NULL base instruction, unlike the final EX2 in transform_POW -- confirm this is intended. */
emit2(c, inst->Prev, RC_OPCODE_ADD, NULL, inst->U.I.DstReg,
445
add_src, negate(frac_src));
446
rc_remove_instruction(inst);
449
266
static void transform_RSQ(struct radeon_compiler* c,
450
267
struct rc_instruction* inst)
857
/*
 * Add the two vec4 immediates of SinCosConsts to the program's constant
 * pool.
 *
 * c:         compiler context owning the constant pool.
 * constants: output array with room for two entries; constants[i]
 *            receives the value returned by
 *            rc_constants_add_immediate_vec4() for row i.
 *
 * NOTE(review): not every table entry is visible in this listing; the
 * component positions noted below are inferred from how the callers
 * swizzle the two constants.
 */
static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
859
static const float SinCosConsts[2][4] = {
861
/* Row 0: callers read .z as PI and .w as the blend weight of the last MAD in sin_approx. */
1.273239545, /* 4/PI */
862
-0.405284735, /* -4/(PI*PI) */
863
3.141592654, /* PI */
869
/* Row 1: callers read .z as 1/(2*PI) and .w as 2*PI; .x and .y are phase offsets. */
0.159154943, /* 1/(2*PI) */
870
6.283185307 /* 2*PI */
875
for(i = 0; i < 2; ++i)
876
constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]);
880
* Approximate sin(x) for x in [-pi, pi); callers range-reduce the angle first.
882
* MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
883
* MAD tmp.x, tmp.y, |src|, tmp.x
884
* MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
885
* MAD dest, tmp.y, weight, tmp.x
887
/*
 * Emit a four-instruction polynomial approximation of sin(src.x) in front
 * of inst, writing the result to dst.
 *
 * c:         compiler context; provides a free temporary.
 * inst:      instruction whose Prev marks the insertion point.
 * dst:       destination register (including write mask) of the result.
 * src:       source operand; only its x swizzle is read in the visible code.
 * constants: constant indices as filled in by sincos_constants();
 *            only constants[0] is used here.
 */
static void sin_approx(
888
struct radeon_compiler* c, struct rc_instruction * inst,
889
struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)
891
unsigned int tempreg = rc_find_free_temporary(c);
893
/* tmp.xy = (operand not visible in this listing) * constants[0].xy */
emit2(c, inst->Prev, RC_OPCODE_MUL, NULL, dstregtmpmask(tempreg, RC_MASK_XY),
895
srcreg(RC_FILE_CONSTANT, constants[0]));
896
/* tmp.x = tmp.y * |src.x| + tmp.x */
emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_X),
897
swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
898
absolute(swizzle_xxxx(src)),
899
swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
900
/* tmp.y = tmp.x * |tmp.x| - tmp.x */
emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_Y),
901
swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
902
absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))),
903
negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))));
904
/* dst = tmp.y * constants[0].w + tmp.x */
emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dst,
905
swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
906
swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])),
907
swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
911
* Translate the trigonometric functions COS and SIN
912
* using only the basic instructions
913
* MOV, ADD, MUL, MAD, FRC
915
/*
 * Expand a trigonometric instruction into a range reduction followed by
 * sin_approx(), then remove the original instruction.
 *
 * Each branch computes FRC(src.x * constants[1].z + offset), scales it by
 * constants[1].w and subtracts constants[0].z. The COS branch uses
 * constants[1].x as the offset and the SIN branch uses constants[1].y.
 * The last branch does both at once in tmp.x and tmp.y.
 *
 * c:    compiler context.
 * inst: instruction to examine.
 *
 * NOTE(review): the third parameter, the return statements and the
 * trailing arguments of the sin_approx() calls are not visible in this
 * listing.
 */
int r300_transform_trig_simple(struct radeon_compiler* c,
916
struct rc_instruction* inst,
919
unsigned int constants[2];
920
unsigned int tempreg;
922
/* Guard on the opcode; the statement it controls is not visible here. */
if (inst->U.I.Opcode != RC_OPCODE_COS &&
923
inst->U.I.Opcode != RC_OPCODE_SIN)
926
tempreg = rc_find_free_temporary(c);
928
sincos_constants(c, constants);
930
if (inst->U.I.Opcode == RC_OPCODE_COS) {
931
/* MAD tmp.w, src, 1/(2*PI), 0.75 */
932
/* FRC tmp.w, tmp.w */
933
/* MAD tmp.w, tmp.w, 2*PI, -PI */
934
emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_W),
935
swizzle_xxxx(inst->U.I.SrcReg[0]),
936
swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
937
swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1])));
938
emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dstregtmpmask(tempreg, RC_MASK_W),
939
swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
940
emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_W),
941
swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
942
swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
943
negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
945
sin_approx(c, inst, inst->U.I.DstReg,
946
swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
948
} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
949
/* Same sequence as the COS branch, but with constants[1].y as the offset. */
emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_W),
950
swizzle_xxxx(inst->U.I.SrcReg[0]),
951
swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
952
swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1])));
953
emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dstregtmpmask(tempreg, RC_MASK_W),
954
swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
955
emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_W),
956
swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
957
swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
958
negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
960
sin_approx(c, inst, inst->U.I.DstReg,
961
swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
964
/* Remaining case: reduce both phases at once, tmp.x with the COS offset and tmp.y with the SIN offset. */
struct rc_dst_register dst;
966
emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_XY),
967
swizzle_xxxx(inst->U.I.SrcReg[0]),
968
swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
969
swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W));
970
emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dstregtmpmask(tempreg, RC_MASK_XY),
971
srcreg(RC_FILE_TEMPORARY, tempreg));
972
emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_XY),
973
srcreg(RC_FILE_TEMPORARY, tempreg),
974
swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
975
negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
977
dst = inst->U.I.DstReg;
979
/* Write only the x component of the destination, from the phase held in tmp.x. */
dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X;
980
sin_approx(c, inst, dst,
981
swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
984
/* Write only the y component of the destination, from the phase held in tmp.y. */
dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y;
985
sin_approx(c, inst, dst,
986
swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
990
rc_remove_instruction(inst);
995
/*
 * Re-emit a COS or SIN instruction so that it reads its operand from the
 * w component of temporary register srctmp, then remove the original.
 * The new instruction keeps the original destination register and is
 * emitted with inst as its base instruction.
 *
 * c:    compiler context.
 * inst: the COS or SIN instruction to replace; any other opcode emits
 *       nothing but is still removed.
 *
 * NOTE(review): the third parameter is not visible in this listing;
 * presumably it is the temporary index srctmp used below.
 */
static void r300_transform_SIN_COS(struct radeon_compiler *c,
996
struct rc_instruction *inst,
999
if (inst->U.I.Opcode == RC_OPCODE_COS) {
1000
emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg,
1001
srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1002
} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1003
emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I,
1004
inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1007
rc_remove_instruction(inst);
1012
* Transform the trigonometric functions COS and SIN
1013
* to include pre-scaling by 1/(2*PI) and taking the fractional
1014
* part, so that the input to COS and SIN is always in the range [0,1).
1016
* @warning This transformation implicitly changes the semantics of SIN and COS!
1018
/*
 * Pre-scale the operand of a COS or SIN instruction:
 *   tmp.w = src.x * 1/(2*PI)
 *   tmp.w = FRC(tmp)
 * and hand the instruction to r300_transform_SIN_COS() so it reads tmp.w.
 *
 * c:    compiler context; needs_trig_input_transform gates the rewrite.
 * inst: instruction to examine.
 *
 * NOTE(review): the third parameter, the declaration of temp and the
 * return statements are not visible in this listing.
 */
int radeonTransformTrigScale(struct radeon_compiler* c,
1019
struct rc_instruction* inst,
1022
static const float RCP_2PI = 0.15915494309189535;
1024
unsigned int constant;
1025
unsigned int constant_swizzle;
1027
/* Guard on the opcode; the statement it controls is not visible here. */
if (inst->U.I.Opcode != RC_OPCODE_COS &&
1028
inst->U.I.Opcode != RC_OPCODE_SIN)
1031
/* Guard on the compiler flag; the statement it controls is not visible here. */
if (!c->needs_trig_input_transform)
1034
temp = rc_find_free_temporary(c);
1035
constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle);
1037
emit2(c, inst->Prev, RC_OPCODE_MUL, NULL, dstregtmpmask(temp, RC_MASK_W),
1038
swizzle_xxxx(inst->U.I.SrcReg[0]),
1039
srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
1040
/* The source has no explicit swizzle; with a W-only write mask it presumably reads tmp.w -- TODO confirm srcreg()'s default swizzle. */
emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dstregtmpmask(temp, RC_MASK_W),
1041
srcreg(RC_FILE_TEMPORARY, temp));
1043
r300_transform_SIN_COS(c, inst, temp);
1048
602
* Replaces DDX/DDY instructions with MOV 0 to avoid using dummy shaders on r300/r400.