~mmach/netext73/mesa-ryzen

« back to all changes in this revision

Viewing changes to src/gallium/drivers/r300/compiler/radeon_program_alu.c

  • Committer: mmach
  • Date: 2023-11-02 21:31:35 UTC
  • Revision ID: netbit73@gmail.com-20231102213135-18d4tzh7tj0uz752
2023-11-02 22:11:57

Show diffs side-by-side

added added

removed removed

Lines of Context:
37
37
 
38
38
#include "radeon_compiler.h"
39
39
#include "radeon_compiler_util.h"
 
40
#include "radeon_dataflow.h"
40
41
 
41
42
#include "util/log.h"
42
43
 
117
118
        .Swizzle = RC_SWIZZLE_1111
118
119
};
119
120
 
120
 
static const struct rc_src_register builtin_half = {
121
 
        .File = RC_FILE_NONE,
122
 
        .Index = 0,
123
 
        .Swizzle = RC_SWIZZLE_HHHH
124
 
};
125
 
 
126
121
static const struct rc_src_register srcreg_undefined = {
127
122
        .File = RC_FILE_NONE,
128
123
        .Index = 0,
202
197
        return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
203
198
}
204
199
 
205
 
static void transform_CEIL(struct radeon_compiler* c,
206
 
        struct rc_instruction* inst)
207
 
{
208
 
        /* Assuming:
209
 
         *     ceil(x) = -floor(-x)
210
 
         *
211
 
         * After inlining floor:
212
 
         *     ceil(x) = -(-x-frac(-x))
213
 
         *
214
 
         * After simplification:
215
 
         *     ceil(x) = x+frac(-x)
216
 
         */
217
 
 
218
 
        struct rc_dst_register dst = new_dst_reg(c, inst);
219
 
        emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dst, negate(inst->U.I.SrcReg[0]));
220
 
        emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
221
 
                inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
222
 
        rc_remove_instruction(inst);
223
 
}
224
 
 
225
200
static void transform_DP2(struct radeon_compiler* c,
226
201
        struct rc_instruction* inst)
227
202
{
237
212
        rc_remove_instruction(inst);
238
213
}
239
214
 
240
 
/**
241
 
 * [1, src0.y*src1.y, src0.z, src1.w]
242
 
 * So basically MUL with lotsa swizzling.
243
 
 */
244
 
static void transform_DST(struct radeon_compiler* c,
245
 
        struct rc_instruction* inst)
246
 
{
247
 
        emit2(c, inst->Prev, RC_OPCODE_MUL, &inst->U.I, inst->U.I.DstReg,
248
 
                swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
249
 
                swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
250
 
        rc_remove_instruction(inst);
251
 
}
252
 
 
253
 
static void transform_FLR(struct radeon_compiler* c,
254
 
        struct rc_instruction* inst)
255
 
{
256
 
        struct rc_dst_register dst = new_dst_reg(c, inst);
257
 
        emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dst, inst->U.I.SrcReg[0]);
258
 
        emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
259
 
                inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
260
 
        rc_remove_instruction(inst);
261
 
}
262
 
 
263
215
static void transform_TRUNC(struct radeon_compiler* c,
264
216
        struct rc_instruction* inst)
265
217
{
296
248
        rc_remove_instruction(inst);
297
249
}
298
250
 
299
 
/**
300
 
 * Definition of LIT (from ARB_fragment_program):
301
 
 *
302
 
 *  tmp = VectorLoad(op0);
303
 
 *  if (tmp.x < 0) tmp.x = 0;
304
 
 *  if (tmp.y < 0) tmp.y = 0;
305
 
 *  if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
306
 
 *  else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
307
 
 *  result.x = 1.0;
308
 
 *  result.y = tmp.x;
309
 
 *  result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
310
 
 *  result.w = 1.0;
311
 
 *
312
 
 * The longest path of computation is the one leading to result.z,
313
 
 * consisting of 5 operations. This implementation of LIT takes
314
 
 * 5 slots, if the subsequent optimization passes are clever enough
315
 
 * to pair instructions correctly.
316
 
 */
317
 
static void transform_LIT(struct radeon_compiler* c,
318
 
        struct rc_instruction* inst)
319
 
{
320
 
        unsigned int constant;
321
 
        unsigned int constant_swizzle;
322
 
        unsigned int temp;
323
 
        struct rc_src_register srctemp;
324
 
 
325
 
        constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);
326
 
 
327
 
        if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
328
 
                struct rc_instruction * inst_mov;
329
 
 
330
 
                inst_mov = emit1(c, inst,
331
 
                        RC_OPCODE_MOV, NULL, inst->U.I.DstReg,
332
 
                        srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c)));
333
 
 
334
 
                inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
335
 
                inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
336
 
                inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
337
 
        }
338
 
 
339
 
        temp = inst->U.I.DstReg.Index;
340
 
        srctemp = srcreg(RC_FILE_TEMPORARY, temp);
341
 
 
342
 
        /* tmp.x = max(0.0, Src.x); */
343
 
        /* tmp.y = max(0.0, Src.y); */
344
 
        /* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
345
 
        emit2(c, inst->Prev, RC_OPCODE_MAX, NULL,
346
 
                dstregtmpmask(temp, RC_MASK_XYW),
347
 
                inst->U.I.SrcReg[0],
348
 
                swizzle(srcreg(RC_FILE_CONSTANT, constant),
349
 
                        RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3));
350
 
        emit2(c, inst->Prev, RC_OPCODE_MIN, NULL,
351
 
                dstregtmpmask(temp, RC_MASK_Z),
352
 
                swizzle_wwww(srctemp),
353
 
                negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));
354
 
 
355
 
        /* tmp.w = Pow(tmp.y, tmp.w) */
356
 
        emit1(c, inst->Prev, RC_OPCODE_LG2, NULL,
357
 
                dstregtmpmask(temp, RC_MASK_W),
358
 
                swizzle_yyyy(srctemp));
359
 
        emit2(c, inst->Prev, RC_OPCODE_MUL, NULL,
360
 
                dstregtmpmask(temp, RC_MASK_W),
361
 
                swizzle_wwww(srctemp),
362
 
                swizzle_zzzz(srctemp));
363
 
        emit1(c, inst->Prev, RC_OPCODE_EX2, NULL,
364
 
                dstregtmpmask(temp, RC_MASK_W),
365
 
                swizzle_wwww(srctemp));
366
 
 
367
 
        /* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
368
 
        emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I,
369
 
                dstregtmpmask(temp, RC_MASK_Z),
370
 
                negate(swizzle_xxxx(srctemp)),
371
 
                swizzle_wwww(srctemp),
372
 
                builtin_zero);
373
 
 
374
 
        /* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
375
 
        emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I,
376
 
                dstregtmpmask(temp, RC_MASK_XYW),
377
 
                swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
378
 
 
379
 
        rc_remove_instruction(inst);
380
 
}
381
 
 
382
251
static void transform_LRP(struct radeon_compiler* c,
383
252
        struct rc_instruction* inst)
384
253
{
394
263
        rc_remove_instruction(inst);
395
264
}
396
265
 
397
 
static void transform_POW(struct radeon_compiler* c,
398
 
        struct rc_instruction* inst)
399
 
{
400
 
        struct rc_dst_register tempdst = new_dst_reg(c, inst);
401
 
        struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
402
 
        tempdst.WriteMask = RC_MASK_W;
403
 
        tempsrc.Swizzle = RC_SWIZZLE_WWWW;
404
 
 
405
 
        emit1(c, inst->Prev, RC_OPCODE_LG2, NULL, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
406
 
        emit2(c, inst->Prev, RC_OPCODE_MUL, NULL, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
407
 
        emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc);
408
 
 
409
 
        rc_remove_instruction(inst);
410
 
}
411
 
 
412
 
/* dst = ROUND(src) :
413
 
 *   add = src + .5
414
 
 *   frac = FRC(add)
415
 
 *   dst = add - frac
416
 
 *
417
 
 * According to the GLSL spec, the implementor can decide which way to round
418
 
 * when the fraction is .5.  We round down for .5.
419
 
 *
420
 
 */
421
 
static void transform_ROUND(struct radeon_compiler* c,
422
 
        struct rc_instruction* inst)
423
 
{
424
 
        unsigned int mask = inst->U.I.DstReg.WriteMask;
425
 
        unsigned int frac_index, add_index;
426
 
        struct rc_dst_register frac_dst, add_dst;
427
 
        struct rc_src_register frac_src, add_src;
428
 
 
429
 
        /* add = src + .5 */
430
 
        add_index = rc_find_free_temporary(c);
431
 
        add_dst = dstregtmpmask(add_index, mask);
432
 
        emit2(c, inst->Prev, RC_OPCODE_ADD, NULL, add_dst, inst->U.I.SrcReg[0],
433
 
                                                                builtin_half);
434
 
        add_src = srcreg(RC_FILE_TEMPORARY, add_dst.Index);
435
 
 
436
 
 
437
 
        /* frac = FRC(add) */
438
 
        frac_index = rc_find_free_temporary(c);
439
 
        frac_dst = dstregtmpmask(frac_index, mask);
440
 
        emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, frac_dst, add_src);
441
 
        frac_src = srcreg(RC_FILE_TEMPORARY, frac_dst.Index);
442
 
 
443
 
        /* dst = add - frac */
444
 
        emit2(c, inst->Prev, RC_OPCODE_ADD, NULL, inst->U.I.DstReg,
445
 
                                                add_src, negate(frac_src));
446
 
        rc_remove_instruction(inst);
447
 
}
448
 
 
449
266
static void transform_RSQ(struct radeon_compiler* c,
450
267
        struct rc_instruction* inst)
451
268
{
524
341
        rc_remove_instruction(inst);
525
342
}
526
343
 
527
 
static void transform_SSG(struct radeon_compiler* c,
528
 
        struct rc_instruction* inst)
529
 
{
530
 
        /* result = sign(x)
531
 
         *
532
 
         *   CMP tmp0, -x, 1, 0
533
 
         *   CMP tmp1, x, 1, 0
534
 
         *   ADD result, tmp0, -tmp1;
535
 
         */
536
 
        struct rc_dst_register dst0;
537
 
        unsigned tmp1;
538
 
 
539
 
        /* 0 < x */
540
 
        dst0 = new_dst_reg(c, inst);
541
 
        emit3(c, inst->Prev, RC_OPCODE_CMP, NULL,
542
 
              dst0,
543
 
              negate(inst->U.I.SrcReg[0]),
544
 
              builtin_one,
545
 
              builtin_zero);
546
 
 
547
 
        /* x < 0 */
548
 
        tmp1 = rc_find_free_temporary(c);
549
 
        emit3(c, inst->Prev, RC_OPCODE_CMP, NULL,
550
 
              dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
551
 
              inst->U.I.SrcReg[0],
552
 
              builtin_one,
553
 
              builtin_zero);
554
 
 
555
 
        /* Either both are zero, or one of them is one and the other is zero. */
556
 
        /* result = tmp0 - tmp1 */
557
 
        emit2(c, inst->Prev, RC_OPCODE_ADD, NULL,
558
 
              inst->U.I.DstReg,
559
 
              srcreg(RC_FILE_TEMPORARY, dst0.Index),
560
 
              negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
561
 
 
562
 
        rc_remove_instruction(inst);
563
 
}
564
 
 
565
344
static void transform_SUB(struct radeon_compiler* c,
566
345
        struct rc_instruction* inst)
567
346
{
581
360
 * no userData necessary.
582
361
 *
583
362
 * Eliminates the following ALU instructions:
584
 
 *  CEIL, DST, FLR, LIT, LRP, POW, SEQ, SGE, SGT, SLE, SLT, SNE, SUB
 
363
 *  LRP, SEQ, SGE, SGT, SLE, SLT, SNE, SUB
585
364
 * using:
586
365
 *  MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
587
366
 *
596
375
        void* unused)
597
376
{
598
377
        switch(inst->U.I.Opcode) {
599
 
        case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
600
378
        case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
601
 
        case RC_OPCODE_DST: transform_DST(c, inst); return 1;
602
 
        case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
603
379
        case RC_OPCODE_KILP: transform_KILP(c, inst); return 1;
604
 
        case RC_OPCODE_LIT: transform_LIT(c, inst); return 1;
605
380
        case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
606
 
        case RC_OPCODE_POW: transform_POW(c, inst); return 1;
607
 
        case RC_OPCODE_ROUND: transform_ROUND(c, inst); return 1;
608
381
        case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;
609
382
        case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1;
610
383
        case RC_OPCODE_SGE: transform_SGE(c, inst); return 1;
612
385
        case RC_OPCODE_SLE: transform_SLE(c, inst); return 1;
613
386
        case RC_OPCODE_SLT: transform_SLT(c, inst); return 1;
614
387
        case RC_OPCODE_SNE: transform_SNE(c, inst); return 1;
615
 
        case RC_OPCODE_SSG: transform_SSG(c, inst); return 1;
616
388
        case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
617
389
        case RC_OPCODE_TRUNC: transform_TRUNC(c, inst); return 1;
618
390
        default:
623
395
static void transform_r300_vertex_CMP(struct radeon_compiler* c,
624
396
        struct rc_instruction* inst)
625
397
{
626
 
        /* There is no decent CMP available, so let's rig one up.
 
398
        /* R5xx has a CMP, but we can use it only if it reads from less than
 
399
         * three different temps. */
 
400
        if (c->is_r500 &&
 
401
            (inst->U.I.SrcReg[0].File != RC_FILE_TEMPORARY ||
 
402
             inst->U.I.SrcReg[1].File != RC_FILE_TEMPORARY ||
 
403
             inst->U.I.SrcReg[2].File != RC_FILE_TEMPORARY ||
 
404
             inst->U.I.SrcReg[0].Index == inst->U.I.SrcReg[1].Index ||
 
405
             inst->U.I.SrcReg[1].Index == inst->U.I.SrcReg[2].Index ||
 
406
             inst->U.I.SrcReg[0].Index == inst->U.I.SrcReg[2].Index))
 
407
                return;
 
408
 
 
409
        /* There is no decent CMP available on r300, so let's rig one up.
627
410
         * CMP is defined as dst = src0 < 0.0 ? src1 : src2
628
411
         * The following sequence consumes zero to two temps and two extra slots
629
412
         * (the second temp and the second slot is consumed by transform_LRP),
768
551
        inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
769
552
}
770
553
 
771
 
static void transform_r300_vertex_SSG(struct radeon_compiler* c,
772
 
        struct rc_instruction* inst)
773
 
{
774
 
        /* result = sign(x)
775
 
         *
776
 
         *   SLT tmp0, 0, x;
777
 
         *   SLT tmp1, x, 0;
778
 
         *   ADD result, tmp0, -tmp1;
779
 
         */
780
 
        struct rc_dst_register dst0;
781
 
        unsigned tmp1;
782
 
 
783
 
        /* 0 < x */
784
 
        dst0 = new_dst_reg(c, inst);
785
 
        emit2(c, inst->Prev, RC_OPCODE_SLT, NULL,
786
 
              dst0,
787
 
              builtin_zero,
788
 
              inst->U.I.SrcReg[0]);
789
 
 
790
 
        /* x < 0 */
791
 
        tmp1 = rc_find_free_temporary(c);
792
 
        emit2(c, inst->Prev, RC_OPCODE_SLT, NULL,
793
 
              dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
794
 
              inst->U.I.SrcReg[0],
795
 
              builtin_zero);
796
 
 
797
 
        /* Either both are zero, or one of them is one and the other is zero. */
798
 
        /* result = tmp0 - tmp1 */
799
 
        emit2(c, inst->Prev, RC_OPCODE_ADD, NULL,
800
 
              inst->U.I.DstReg,
801
 
              srcreg(RC_FILE_TEMPORARY, dst0.Index),
802
 
              negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
803
 
 
804
 
        rc_remove_instruction(inst);
805
 
}
806
 
 
807
554
static void transform_vertex_TRUNC(struct radeon_compiler* c,
808
555
        struct rc_instruction* inst)
809
556
{
825
572
        void* unused)
826
573
{
827
574
        switch(inst->U.I.Opcode) {
828
 
        case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
829
575
        case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
830
576
        case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;
831
577
        case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;
832
 
        case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
833
578
        case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1;
834
579
        case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
835
580
        case RC_OPCODE_SEQ:
846
591
                        return 1;
847
592
                }
848
593
                return 0;
849
 
        case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1;
850
594
        case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
851
595
        case RC_OPCODE_TRUNC: transform_vertex_TRUNC(c, inst); return 1;
852
596
        default:
854
598
        }
855
599
}
856
600
 
857
 
static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
858
 
{
859
 
        static const float SinCosConsts[2][4] = {
860
 
                {
861
 
                        1.273239545,            /* 4/PI */
862
 
                        -0.405284735,           /* -4/(PI*PI) */
863
 
                        3.141592654,            /* PI */
864
 
                        0.2225                  /* weight */
865
 
                },
866
 
                {
867
 
                        0.75,
868
 
                        0.5,
869
 
                        0.159154943,            /* 1/(2*PI) */
870
 
                        6.283185307             /* 2*PI */
871
 
                }
872
 
        };
873
 
        int i;
874
 
 
875
 
        for(i = 0; i < 2; ++i)
876
 
                constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]);
877
 
}
878
 
 
879
 
/**
880
 
 * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
881
 
 *
882
 
 * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
883
 
 * MAD tmp.x, tmp.y, |src|, tmp.x
884
 
 * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
885
 
 * MAD dest, tmp.y, weight, tmp.x
886
 
 */
887
 
static void sin_approx(
888
 
        struct radeon_compiler* c, struct rc_instruction * inst,
889
 
        struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)
890
 
{
891
 
        unsigned int tempreg = rc_find_free_temporary(c);
892
 
 
893
 
        emit2(c, inst->Prev, RC_OPCODE_MUL, NULL, dstregtmpmask(tempreg, RC_MASK_XY),
894
 
                swizzle_xxxx(src),
895
 
                srcreg(RC_FILE_CONSTANT, constants[0]));
896
 
        emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_X),
897
 
                swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
898
 
                absolute(swizzle_xxxx(src)),
899
 
                swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
900
 
        emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_Y),
901
 
                swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
902
 
                absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))),
903
 
                negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))));
904
 
        emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dst,
905
 
                swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
906
 
                swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])),
907
 
                swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
908
 
}
909
 
 
910
 
/**
911
 
 * Translate the trigonometric functions COS and SIN
912
 
 * using only the basic instructions
913
 
 *  MOV, ADD, MUL, MAD, FRC
914
 
 */
915
 
int r300_transform_trig_simple(struct radeon_compiler* c,
916
 
        struct rc_instruction* inst,
917
 
        void* unused)
918
 
{
919
 
        unsigned int constants[2];
920
 
        unsigned int tempreg;
921
 
 
922
 
        if (inst->U.I.Opcode != RC_OPCODE_COS &&
923
 
            inst->U.I.Opcode != RC_OPCODE_SIN)
924
 
                return 0;
925
 
 
926
 
        tempreg = rc_find_free_temporary(c);
927
 
 
928
 
        sincos_constants(c, constants);
929
 
 
930
 
        if (inst->U.I.Opcode == RC_OPCODE_COS) {
931
 
                /* MAD tmp.x, src, 1/(2*PI), 0.75 */
932
 
                /* FRC tmp.x, tmp.x */
933
 
                /* MAD tmp.z, tmp.x, 2*PI, -PI */
934
 
                emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_W),
935
 
                        swizzle_xxxx(inst->U.I.SrcReg[0]),
936
 
                        swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
937
 
                        swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1])));
938
 
                emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dstregtmpmask(tempreg, RC_MASK_W),
939
 
                        swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
940
 
                emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_W),
941
 
                        swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
942
 
                        swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
943
 
                        negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
944
 
 
945
 
                sin_approx(c, inst, inst->U.I.DstReg,
946
 
                        swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
947
 
                        constants);
948
 
        } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
949
 
                emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_W),
950
 
                        swizzle_xxxx(inst->U.I.SrcReg[0]),
951
 
                        swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
952
 
                        swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1])));
953
 
                emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dstregtmpmask(tempreg, RC_MASK_W),
954
 
                        swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
955
 
                emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_W),
956
 
                        swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
957
 
                        swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
958
 
                        negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
959
 
 
960
 
                sin_approx(c, inst, inst->U.I.DstReg,
961
 
                        swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
962
 
                        constants);
963
 
        } else {
964
 
                struct rc_dst_register dst;
965
 
 
966
 
                emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_XY),
967
 
                        swizzle_xxxx(inst->U.I.SrcReg[0]),
968
 
                        swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
969
 
                        swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W));
970
 
                emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dstregtmpmask(tempreg, RC_MASK_XY),
971
 
                        srcreg(RC_FILE_TEMPORARY, tempreg));
972
 
                emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_XY),
973
 
                        srcreg(RC_FILE_TEMPORARY, tempreg),
974
 
                        swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
975
 
                        negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
976
 
 
977
 
                dst = inst->U.I.DstReg;
978
 
 
979
 
                dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X;
980
 
                sin_approx(c, inst, dst,
981
 
                        swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
982
 
                        constants);
983
 
 
984
 
                dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y;
985
 
                sin_approx(c, inst, dst,
986
 
                        swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
987
 
                        constants);
988
 
        }
989
 
 
990
 
        rc_remove_instruction(inst);
991
 
 
992
 
        return 1;
993
 
}
994
 
 
995
 
static void r300_transform_SIN_COS(struct radeon_compiler *c,
996
 
        struct rc_instruction *inst,
997
 
        unsigned srctmp)
998
 
{
999
 
        if (inst->U.I.Opcode == RC_OPCODE_COS) {
1000
 
                emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg,
1001
 
                        srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1002
 
        } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1003
 
                emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I,
1004
 
                        inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1005
 
        }
1006
 
 
1007
 
        rc_remove_instruction(inst);
1008
 
}
1009
 
 
1010
 
 
1011
 
/**
1012
 
 * Transform the trigonometric functions COS and SIN
1013
 
 * to include pre-scaling by 1/(2*PI) and taking the fractional
1014
 
 * part, so that the input to COS and SIN is always in the range [0,1).
1015
 
 *
1016
 
 * @warning This transformation implicitly changes the semantics of SIN and COS!
1017
 
 */
1018
 
int radeonTransformTrigScale(struct radeon_compiler* c,
1019
 
        struct rc_instruction* inst,
1020
 
        void* unused)
1021
 
{
1022
 
        static const float RCP_2PI = 0.15915494309189535;
1023
 
        unsigned int temp;
1024
 
        unsigned int constant;
1025
 
        unsigned int constant_swizzle;
1026
 
 
1027
 
        if (inst->U.I.Opcode != RC_OPCODE_COS &&
1028
 
            inst->U.I.Opcode != RC_OPCODE_SIN)
1029
 
                return 0;
1030
 
 
1031
 
        if (!c->needs_trig_input_transform)
1032
 
                return 1;
1033
 
 
1034
 
        temp = rc_find_free_temporary(c);
1035
 
        constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle);
1036
 
 
1037
 
        emit2(c, inst->Prev, RC_OPCODE_MUL, NULL, dstregtmpmask(temp, RC_MASK_W),
1038
 
                swizzle_xxxx(inst->U.I.SrcReg[0]),
1039
 
                srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
1040
 
        emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dstregtmpmask(temp, RC_MASK_W),
1041
 
                srcreg(RC_FILE_TEMPORARY, temp));
1042
 
 
1043
 
        r300_transform_SIN_COS(c, inst, temp);
1044
 
        return 1;
1045
 
}
1046
 
 
1047
601
/**
1048
602
 * Replaces DDX/DDY instructions with MOV 0 to avoid using dummy shaders on r300/r400.
1049
603
 *