/*
 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE. */

#include "radeon_compiler.h"

#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "../r300_reg.h"

#include "radeon_compiler_util.h"
#include "radeon_dataflow.h"
#include "radeon_program_alu.h"
#include "radeon_swizzle.h"
#include "radeon_emulate_branches.h"
#include "radeon_emulate_loops.h"
#include "radeon_remove_constants.h"

/*
 * Take an already-setup and valid source then swizzle it appropriately to
 * obtain a constant ZERO or ONE source.
 *
 * The operand reuses the register index, register class and RelAddr bit of
 * vpi->SrcReg[x]; all four components select swizzle `y` and no component is
 * negated (RC_MASK_NONE).
 *
 * Expands in place: `vp` and `vpi` must be in scope at the use site.
 */
#define __CONST(x, y)	\
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[(x)]),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_src_class(vpi->SrcReg[(x)].File), \
			   RC_MASK_NONE) | (vpi->SrcReg[(x)].RelAddr << 4))

56
static unsigned long t_dst_mask(unsigned int mask)
58
/* RC_MASK_* is equivalent to VSF_FLAG_* */
59
return mask & RC_MASK_XYZW;
62
static unsigned long t_dst_class(rc_register_file file)
66
fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
68
case RC_FILE_TEMPORARY:
69
return PVS_DST_REG_TEMPORARY;
71
return PVS_DST_REG_OUT;
73
return PVS_DST_REG_A0;
77
static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
78
struct rc_dst_register *dst)
80
if (dst->File == RC_FILE_OUTPUT)
81
return vp->outputs[dst->Index];
86
static unsigned long t_src_class(rc_register_file file)
90
fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
93
case RC_FILE_TEMPORARY:
94
return PVS_SRC_REG_TEMPORARY;
96
return PVS_SRC_REG_INPUT;
97
case RC_FILE_CONSTANT:
98
return PVS_SRC_REG_CONSTANT;
102
static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
104
unsigned long aclass = t_src_class(a.File);
105
unsigned long bclass = t_src_class(b.File);
107
if (aclass != bclass)
109
if (aclass == PVS_SRC_REG_TEMPORARY)
112
if (a.RelAddr || b.RelAddr)
114
if (a.Index != b.Index)
/**
 * Translate a compiler swizzle selector into the hardware encoding.
 *
 * @param swizzle  RC_SWIZZLE_* selector
 * @return the selector unchanged
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	/* this is in fact a NOP as the Mesa RC_SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
	return swizzle;
}

126
static unsigned long t_src_index(struct r300_vertex_program_code *vp,
127
struct rc_src_register *src)
129
if (src->File == RC_FILE_INPUT) {
130
assert(vp->inputs[src->Index] != -1);
131
return vp->inputs[src->Index];
133
if (src->Index < 0) {
135
"negative offsets for indirect addressing do not work.\n");
142
/* these two functions should probably be merged... */
144
static unsigned long t_src(struct r300_vertex_program_code *vp,
145
struct rc_src_register *src)
147
/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
148
* which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
150
return PVS_SRC_OPERAND(t_src_index(vp, src),
151
t_swizzle(GET_SWZ(src->Swizzle, 0)),
152
t_swizzle(GET_SWZ(src->Swizzle, 1)),
153
t_swizzle(GET_SWZ(src->Swizzle, 2)),
154
t_swizzle(GET_SWZ(src->Swizzle, 3)),
155
t_src_class(src->File),
157
(src->RelAddr << 4) | (src->Abs << 3);
160
static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
161
struct rc_src_register *src)
163
/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
164
* which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
166
unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);
168
return PVS_SRC_OPERAND(t_src_index(vp, src),
173
t_src_class(src->File),
174
src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
175
(src->RelAddr << 4) | (src->Abs << 3);
178
static int valid_dst(struct r300_vertex_program_code *vp,
179
struct rc_dst_register *dst)
181
if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
183
} else if (dst->File == RC_FILE_ADDRESS) {
184
assert(dst->Index == 0);
190
static void ei_vector1(struct r300_vertex_program_code *vp,
191
unsigned int hw_opcode,
192
struct rc_sub_instruction *vpi,
195
inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
198
t_dst_index(vp, &vpi->DstReg),
199
t_dst_mask(vpi->DstReg.WriteMask),
200
t_dst_class(vpi->DstReg.File));
201
inst[1] = t_src(vp, &vpi->SrcReg[0]);
202
inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
203
inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
206
static void ei_vector2(struct r300_vertex_program_code *vp,
207
unsigned int hw_opcode,
208
struct rc_sub_instruction *vpi,
211
inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
214
t_dst_index(vp, &vpi->DstReg),
215
t_dst_mask(vpi->DstReg.WriteMask),
216
t_dst_class(vpi->DstReg.File));
217
inst[1] = t_src(vp, &vpi->SrcReg[0]);
218
inst[2] = t_src(vp, &vpi->SrcReg[1]);
219
inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
222
static void ei_math1(struct r300_vertex_program_code *vp,
223
unsigned int hw_opcode,
224
struct rc_sub_instruction *vpi,
227
inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
230
t_dst_index(vp, &vpi->DstReg),
231
t_dst_mask(vpi->DstReg.WriteMask),
232
t_dst_class(vpi->DstReg.File));
233
inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
234
inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
235
inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
238
static void ei_lit(struct r300_vertex_program_code *vp,
239
struct rc_sub_instruction *vpi,
242
//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
244
inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
247
t_dst_index(vp, &vpi->DstReg),
248
t_dst_mask(vpi->DstReg.WriteMask),
249
t_dst_class(vpi->DstReg.File));
250
/* NOTE: Users swizzling might not work. */
251
inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
252
t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
253
PVS_SRC_SELECT_FORCE_0, // Z
254
t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
255
t_src_class(vpi->SrcReg[0].File),
256
vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
257
(vpi->SrcReg[0].RelAddr << 4);
258
inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
259
t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
260
PVS_SRC_SELECT_FORCE_0, // Z
261
t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
262
t_src_class(vpi->SrcReg[0].File),
263
vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
264
(vpi->SrcReg[0].RelAddr << 4);
265
inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
266
t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
267
PVS_SRC_SELECT_FORCE_0, // Z
268
t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
269
t_src_class(vpi->SrcReg[0].File),
270
vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
271
(vpi->SrcReg[0].RelAddr << 4);
274
static void ei_mad(struct r300_vertex_program_code *vp,
275
struct rc_sub_instruction *vpi,
279
/* Remarks about hardware limitations of MAD
280
* (please preserve this comment, as this information is _NOT_
281
* in the documentation provided by AMD).
283
* As described in the documentation, MAD with three unique temporary
284
* source registers requires the use of the macro version.
286
* However (and this is not mentioned in the documentation), apparently
287
* the macro version is _NOT_ a full superset of the normal version.
288
* In particular, the macro version does not always work when relative
289
* addressing is used in the source operands.
291
* This limitation caused incorrect rendering in Sauerbraten's OpenGL
292
* assembly shader path when using medium quality animations
293
* (i.e. animations with matrix blending instead of quaternion blending).
295
* Unfortunately, I (nha) have been unable to extract a Piglit regression
296
* test for this issue - for some reason, it is possible to have vertex
297
* programs whose prefix is *exactly* the same as the prefix of the
298
* offending program in Sauerbraten up to the offending instruction
299
* without causing any trouble.
301
* Bottom line: Only use the macro version only when really necessary;
302
* according to AMD docs, this should improve performance by one clock
303
* as a nice side bonus.
305
if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
306
vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
307
vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
308
vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
309
vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
310
vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
311
inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
314
t_dst_index(vp, &vpi->DstReg),
315
t_dst_mask(vpi->DstReg.WriteMask),
316
t_dst_class(vpi->DstReg.File));
318
inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
321
t_dst_index(vp, &vpi->DstReg),
322
t_dst_mask(vpi->DstReg.WriteMask),
323
t_dst_class(vpi->DstReg.File));
325
/* Arguments with constant swizzles still count as a unique
326
* temporary, so we should make sure these arguments share a
327
* register index with one of the other arguments. */
328
for (i = 0; i < 3; i++) {
330
if (vpi->SrcReg[i].File != RC_FILE_NONE)
333
for (j = 0; j < 3; j++) {
335
vpi->SrcReg[i].Index =
336
vpi->SrcReg[j].Index;
342
inst[1] = t_src(vp, &vpi->SrcReg[0]);
343
inst[2] = t_src(vp, &vpi->SrcReg[1]);
344
inst[3] = t_src(vp, &vpi->SrcReg[2]);
347
static void ei_pow(struct r300_vertex_program_code *vp,
348
struct rc_sub_instruction *vpi,
351
inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
354
t_dst_index(vp, &vpi->DstReg),
355
t_dst_mask(vpi->DstReg.WriteMask),
356
t_dst_class(vpi->DstReg.File));
357
inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
358
inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
359
inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
362
static void mark_write(void * userdata, struct rc_instruction * inst,
363
rc_register_file file, unsigned int index, unsigned int mask)
365
unsigned int * writemasks = userdata;
367
if (file != RC_FILE_TEMPORARY)
370
if (index >= R300_VS_MAX_TEMPS)
373
writemasks[index] |= mask;
376
static unsigned long t_pred_src(struct r300_vertex_program_compiler * compiler)
378
return PVS_SRC_OPERAND(compiler->PredicateIndex,
379
t_swizzle(RC_SWIZZLE_ZERO),
380
t_swizzle(RC_SWIZZLE_ZERO),
381
t_swizzle(RC_SWIZZLE_ZERO),
382
t_swizzle(RC_SWIZZLE_W),
383
t_src_class(RC_FILE_TEMPORARY),
387
static unsigned long t_pred_dst(struct r300_vertex_program_compiler * compiler,
388
unsigned int hw_opcode, int is_math)
390
return PVS_OP_DST_OPERAND(hw_opcode,
393
compiler->PredicateIndex,
395
t_dst_class(RC_FILE_TEMPORARY));
399
static void ei_if(struct r300_vertex_program_compiler * compiler,
400
struct rc_instruction *rci,
402
unsigned int branch_depth)
404
unsigned int predicate_opcode;
407
if (!compiler->Base.is_r500) {
408
rc_error(&compiler->Base,"Opcode IF not supported\n");
412
/* Reserve a temporary to use as our predicate stack counter, if we
413
* don't already have one. */
414
if (!compiler->PredicateMask) {
415
unsigned int writemasks[RC_REGISTER_MAX_INDEX];
416
struct rc_instruction * inst;
418
memset(writemasks, 0, sizeof(writemasks));
419
for(inst = compiler->Base.Program.Instructions.Next;
420
inst != &compiler->Base.Program.Instructions;
422
rc_for_all_writes_mask(inst, mark_write, writemasks);
424
for(i = 0; i < compiler->Base.max_temp_regs; i++) {
425
unsigned int mask = ~writemasks[i] & RC_MASK_XYZW;
426
/* Only the W component can be used fo the predicate
428
if (mask & RC_MASK_W) {
429
compiler->PredicateMask = RC_MASK_W;
430
compiler->PredicateIndex = i;
434
if (i == compiler->Base.max_temp_regs) {
435
rc_error(&compiler->Base, "No free temporary to use for"
436
" predicate stack counter.\n");
441
branch_depth ? VE_PRED_SET_NEQ_PUSH : ME_PRED_SET_NEQ;
443
rci->U.I.SrcReg[0].Swizzle = RC_MAKE_SWIZZLE_SMEAR(GET_SWZ(rci->U.I.SrcReg[0].Swizzle,0));
444
if (branch_depth == 0) {
446
predicate_opcode = ME_PRED_SET_NEQ;
447
inst[1] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
450
predicate_opcode = VE_PRED_SET_NEQ_PUSH;
451
inst[1] = t_pred_src(compiler);
452
inst[2] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
455
inst[0] = t_pred_dst(compiler, predicate_opcode, is_math);
460
static void ei_else(struct r300_vertex_program_compiler * compiler,
463
if (!compiler->Base.is_r500) {
464
rc_error(&compiler->Base,"Opcode ELSE not supported\n");
467
inst[0] = t_pred_dst(compiler, ME_PRED_SET_INV, 1);
468
inst[1] = t_pred_src(compiler);
473
static void ei_endif(struct r300_vertex_program_compiler *compiler,
476
if (!compiler->Base.is_r500) {
477
rc_error(&compiler->Base,"Opcode ENDIF not supported\n");
480
inst[0] = t_pred_dst(compiler, ME_PRED_SET_POP, 1);
481
inst[1] = t_pred_src(compiler);
486
static void translate_vertex_program(struct radeon_compiler *c, void *user)
488
struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
489
struct rc_instruction *rci;
491
struct loop * loops = NULL;
492
int current_loop_depth = 0;
493
int loops_reserved = 0;
495
unsigned int branch_depth = 0;
497
compiler->code->pos_end = 0; /* Not supported yet */
498
compiler->code->length = 0;
499
compiler->code->num_temporaries = 0;
501
compiler->SetHwInputOutput(compiler);
503
for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
504
struct rc_sub_instruction *vpi = &rci->U.I;
505
unsigned int *inst = compiler->code->body.d + compiler->code->length;
506
const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
508
/* Skip instructions writing to non-existing destination */
509
if (!valid_dst(compiler->code, &vpi->DstReg))
512
if (info->HasDstReg) {
513
/* Neither is Saturate. */
514
if (vpi->SaturateMode != RC_SATURATE_NONE) {
515
rc_error(&compiler->Base, "Vertex program does not support the Saturate "
516
"modifier (yet).\n");
520
if (compiler->code->length >= c->max_alu_insts * 4) {
521
rc_error(&compiler->Base, "Vertex program has too many instructions\n");
525
assert(compiler->Base.is_r500 ||
526
(vpi->Opcode != RC_OPCODE_SEQ &&
527
vpi->Opcode != RC_OPCODE_SNE));
529
switch (vpi->Opcode) {
530
case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
531
case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
532
case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
533
case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
534
case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
535
case RC_OPCODE_ELSE: ei_else(compiler, inst); break;
536
case RC_OPCODE_ENDIF: ei_endif(compiler, inst); branch_depth--; break;
537
case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
538
case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
539
case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
540
case RC_OPCODE_IF: ei_if(compiler, rci, inst, branch_depth); branch_depth++; break;
541
case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
542
case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
543
case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
544
case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
545
case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
546
case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
547
case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
548
case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
549
case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
550
case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
551
case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
552
case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
553
case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
554
case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
555
case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
556
case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
557
case RC_OPCODE_BGNLOOP:
561
if ((!compiler->Base.is_r500
562
&& loops_reserved >= R300_VS_MAX_LOOP_DEPTH)
563
|| loops_reserved >= R500_VS_MAX_FC_DEPTH) {
564
rc_error(&compiler->Base,
565
"Loops are nested too deep.");
568
memory_pool_array_reserve(&compiler->Base.Pool,
569
struct loop, loops, current_loop_depth,
571
l = &loops[current_loop_depth++];
572
memset(l , 0, sizeof(struct loop));
573
l->BgnLoop = (compiler->code->length / 4);
576
case RC_OPCODE_ENDLOOP:
579
unsigned int act_addr;
580
unsigned int last_addr;
581
unsigned int ret_addr;
584
l = &loops[current_loop_depth - 1];
585
act_addr = l->BgnLoop - 1;
586
last_addr = (compiler->code->length / 4) - 1;
587
ret_addr = l->BgnLoop;
589
if (loops_reserved >= R300_VS_MAX_FC_OPS) {
590
rc_error(&compiler->Base,
591
"Too many flow control instructions.");
594
if (compiler->Base.is_r500) {
595
compiler->code->fc_op_addrs.r500
596
[compiler->code->num_fc_ops].lw =
597
R500_PVS_FC_ACT_ADRS(act_addr)
598
| R500_PVS_FC_LOOP_CNT_JMP_INST(0xffff)
600
compiler->code->fc_op_addrs.r500
601
[compiler->code->num_fc_ops].uw =
602
R500_PVS_FC_LAST_INST(last_addr)
603
| R500_PVS_FC_RTN_INST(ret_addr)
606
compiler->code->fc_op_addrs.r300
607
[compiler->code->num_fc_ops] =
608
R300_PVS_FC_ACT_ADRS(act_addr)
609
| R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
610
| R300_PVS_FC_LAST_INST(last_addr)
611
| R300_PVS_FC_RTN_INST(ret_addr)
614
compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
615
R300_PVS_FC_LOOP_INIT_VAL(0x0)
616
| R300_PVS_FC_LOOP_STEP_VAL(0x1)
618
compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
619
compiler->code->num_fc_ops);
620
compiler->code->num_fc_ops++;
621
current_loop_depth--;
626
rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
630
/* Non-flow control instructions that are inside an if statement
631
* need to pay attention to the predicate bit. */
633
&& vpi->Opcode != RC_OPCODE_IF
634
&& vpi->Opcode != RC_OPCODE_ELSE
635
&& vpi->Opcode != RC_OPCODE_ENDIF) {
637
inst[0] |= (PVS_DST_PRED_ENABLE_MASK
638
<< PVS_DST_PRED_ENABLE_SHIFT);
639
inst[0] |= (PVS_DST_PRED_SENSE_MASK
640
<< PVS_DST_PRED_SENSE_SHIFT);
643
/* Update the number of temporaries. */
644
if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
645
vpi->DstReg.Index >= compiler->code->num_temporaries)
646
compiler->code->num_temporaries = vpi->DstReg.Index + 1;
648
for (unsigned i = 0; i < info->NumSrcRegs; i++)
649
if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
650
vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
651
compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
653
if (compiler->PredicateMask)
654
if (compiler->PredicateIndex >= compiler->code->num_temporaries)
655
compiler->code->num_temporaries = compiler->PredicateIndex + 1;
657
if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
658
rc_error(&compiler->Base, "Too many temporaries.\n");
662
compiler->code->length += 4;
664
if (compiler->Base.Error)
/* Per-original-temporary state used by allocate_temporary_registers(). */
struct temporary_allocation {
	unsigned int Allocated:1;	/* set once HwTemp has been assigned */
	unsigned int HwTemp:15;		/* hardware temporary assigned to it */
	struct rc_instruction * LastRead;	/* last instruction reading it */
};

675
static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
677
struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
678
struct rc_instruction *inst;
679
struct rc_instruction *end_loop = NULL;
680
unsigned int num_orig_temps = 0;
681
char hwtemps[RC_REGISTER_MAX_INDEX];
682
struct temporary_allocation * ta;
685
memset(hwtemps, 0, sizeof(hwtemps));
689
/* Pass 1: Count original temporaries. */
690
for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
691
const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
693
for (i = 0; i < opcode->NumSrcRegs; ++i) {
694
if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
695
if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
696
num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
700
if (opcode->HasDstReg) {
701
if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
702
if (inst->U.I.DstReg.Index >= num_orig_temps)
703
num_orig_temps = inst->U.I.DstReg.Index + 1;
708
ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
709
sizeof(struct temporary_allocation) * num_orig_temps);
710
memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
712
/* Pass 2: Determine original temporary lifetimes */
713
for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
714
const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
715
/* Instructions inside of loops need to use the ENDLOOP
716
* instruction as their LastRead. */
717
if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
719
struct rc_instruction * ptr;
720
for(ptr = inst->Next;
721
ptr != &compiler->Base.Program.Instructions;
723
if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
725
} else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
735
if (inst == end_loop) {
740
for (i = 0; i < opcode->NumSrcRegs; ++i) {
741
if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
742
ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst;
747
/* Pass 3: Register allocation */
748
for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
749
const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
751
for (i = 0; i < opcode->NumSrcRegs; ++i) {
752
if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
753
unsigned int orig = inst->U.I.SrcReg[i].Index;
754
inst->U.I.SrcReg[i].Index = ta[orig].HwTemp;
756
if (ta[orig].Allocated && inst == ta[orig].LastRead)
757
hwtemps[ta[orig].HwTemp] = 0;
761
if (opcode->HasDstReg) {
762
if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
763
unsigned int orig = inst->U.I.DstReg.Index;
765
if (!ta[orig].Allocated) {
766
for(j = 0; j < c->max_temp_regs; ++j) {
770
ta[orig].Allocated = 1;
772
hwtemps[ta[orig].HwTemp] = 1;
775
inst->U.I.DstReg.Index = ta[orig].HwTemp;
782
* R3xx-R4xx vertex engine does not support the Absolute source operand modifier
783
* and the Saturate opcode modifier. Only Absolute is currently transformed.
785
static int transform_nonnative_modifiers(
786
struct radeon_compiler *c,
787
struct rc_instruction *inst,
790
const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
793
/* Transform ABS(a) to MAX(a, -a). */
794
for (i = 0; i < opcode->NumSrcRegs; i++) {
795
if (inst->U.I.SrcReg[i].Abs) {
796
struct rc_instruction *new_inst;
799
inst->U.I.SrcReg[i].Abs = 0;
801
temp = rc_find_free_temporary(c);
803
new_inst = rc_insert_new_instruction(c, inst->Prev);
804
new_inst->U.I.Opcode = RC_OPCODE_MAX;
805
new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
806
new_inst->U.I.DstReg.Index = temp;
807
new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
808
new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
809
new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
811
memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i]));
812
inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
813
inst->U.I.SrcReg[i].Index = temp;
814
inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW;
821
* Vertex engine cannot read two inputs or two constants at the same time.
822
* Introduce intermediate MOVs to temporary registers to account for this.
824
static int transform_source_conflicts(
825
struct radeon_compiler *c,
826
struct rc_instruction* inst,
829
const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
831
if (opcode->NumSrcRegs == 3) {
832
if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
833
|| t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
834
int tmpreg = rc_find_free_temporary(c);
835
struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
836
inst_mov->U.I.Opcode = RC_OPCODE_MOV;
837
inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
838
inst_mov->U.I.DstReg.Index = tmpreg;
839
inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
841
reset_srcreg(&inst->U.I.SrcReg[2]);
842
inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
843
inst->U.I.SrcReg[2].Index = tmpreg;
847
if (opcode->NumSrcRegs >= 2) {
848
if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
849
int tmpreg = rc_find_free_temporary(c);
850
struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
851
inst_mov->U.I.Opcode = RC_OPCODE_MOV;
852
inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
853
inst_mov->U.I.DstReg.Index = tmpreg;
854
inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
856
reset_srcreg(&inst->U.I.SrcReg[1]);
857
inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
858
inst->U.I.SrcReg[1].Index = tmpreg;
865
static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
867
struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
870
for(i = 0; i < 32; ++i) {
871
if ((compiler->RequiredOutputs & (1 << i)) &&
872
!(compiler->Base.Program.OutputsWritten & (1 << i))) {
873
struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
874
inst->U.I.Opcode = RC_OPCODE_MOV;
876
inst->U.I.DstReg.File = RC_FILE_OUTPUT;
877
inst->U.I.DstReg.Index = i;
878
inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
880
inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
881
inst->U.I.SrcReg[0].Index = 0;
882
inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
884
compiler->Base.Program.OutputsWritten |= 1 << i;
889
static void dataflow_outputs_mark_used(void * userdata, void * data,
890
void (*callback)(void *, unsigned int, unsigned int))
892
struct r300_vertex_program_compiler * c = userdata;
895
for(i = 0; i < 32; ++i) {
896
if (c->RequiredOutputs & (1 << i))
897
callback(data, i, RC_MASK_XYZW);
901
static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
909
static void transform_negative_addressing(struct r300_vertex_program_compiler *c,
910
struct rc_instruction *arl,
911
struct rc_instruction *end,
914
struct rc_instruction *inst, *add;
915
unsigned const_swizzle;
918
add = rc_insert_new_instruction(&c->Base, arl->Prev);
919
add->U.I.Opcode = RC_OPCODE_ADD;
920
add->U.I.DstReg.File = RC_FILE_TEMPORARY;
921
add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base);
922
add->U.I.DstReg.WriteMask = RC_MASK_X;
923
add->U.I.SrcReg[0] = arl->U.I.SrcReg[0];
924
add->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
925
add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants,
926
min_offset, &const_swizzle);
927
add->U.I.SrcReg[1].Swizzle = const_swizzle;
929
arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
930
arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index;
931
arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX;
933
/* Rewrite offsets up to and excluding inst. */
934
for (inst = arl->Next; inst != end; inst = inst->Next) {
935
const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
937
for (unsigned i = 0; i < opcode->NumSrcRegs; i++)
938
if (inst->U.I.SrcReg[i].RelAddr)
939
inst->U.I.SrcReg[i].Index -= min_offset;
943
static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user)
945
struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler;
946
struct rc_instruction *inst, *lastARL = NULL;
949
for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) {
950
const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
952
if (inst->U.I.Opcode == RC_OPCODE_ARL) {
953
if (lastARL != NULL && min_offset < 0)
954
transform_negative_addressing(c, lastARL, inst, min_offset);
961
for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
962
if (inst->U.I.SrcReg[i].RelAddr &&
963
inst->U.I.SrcReg[i].Index < 0) {
964
/* ARL must precede any indirect addressing. */
965
if (lastARL == NULL) {
966
rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL.");
970
if (inst->U.I.SrcReg[i].Index < min_offset)
971
min_offset = inst->U.I.SrcReg[i].Index;
976
if (lastARL != NULL && min_offset < 0)
977
transform_negative_addressing(c, lastARL, inst, min_offset);
980
static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
981
.IsNative = &swizzle_is_native,
982
.Split = 0 /* should never be called */
985
void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
987
int is_r500 = c->Base.is_r500;
988
int opt = !c->Base.disable_optimizations;
990
/* Lists of instruction transformations. */
991
struct radeon_program_transformation alu_rewrite_r500[] = {
992
{ &r300_transform_vertex_alu, 0 },
993
{ &r300_transform_trig_scale_vertex, 0 },
997
struct radeon_program_transformation alu_rewrite_r300[] = {
998
{ &r300_transform_vertex_alu, 0 },
999
{ &r300_transform_trig_simple, 0 },
1003
/* Note: These passes have to be done seperately from ALU rewrite,
1004
* otherwise non-native ALU instructions with source conflits
1005
* or non-native modifiers will not be treated properly.
1007
struct radeon_program_transformation emulate_modifiers[] = {
1008
{ &transform_nonnative_modifiers, 0 },
1012
struct radeon_program_transformation resolve_src_conflicts[] = {
1013
{ &transform_source_conflicts, 0 },
1017
/* List of compiler passes. */
1018
struct radeon_compiler_pass vs_list[] = {
1019
/* NAME DUMP PREDICATE FUNCTION PARAM */
1020
{"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs, NULL},
1021
{"transform loops", 1, 1, rc_transform_loops, NULL},
1022
{"emulate branches", 1, !is_r500, rc_emulate_branches, NULL},
1023
{"emulate negative addressing", 1, 1, rc_emulate_negative_addressing, NULL},
1024
{"native rewrite", 1, is_r500, rc_local_transform, alu_rewrite_r500},
1025
{"native rewrite", 1, !is_r500, rc_local_transform, alu_rewrite_r300},
1026
{"emulate modifiers", 1, !is_r500, rc_local_transform, emulate_modifiers},
1027
{"deadcode", 1, opt, rc_dataflow_deadcode, dataflow_outputs_mark_used},
1028
{"dataflow optimize", 1, opt, rc_optimize, NULL},
1029
/* This pass must be done after optimizations. */
1030
{"source conflict resolve", 1, 1, rc_local_transform, resolve_src_conflicts},
1031
{"register allocation", 1, opt, allocate_temporary_registers, NULL},
1032
{"dead constants", 1, 1, rc_remove_unused_constants, &c->code->constants_remap_table},
1033
{"final code validation", 0, 1, rc_validate_final_shader, NULL},
1034
{"machine code generation", 0, 1, translate_vertex_program, NULL},
1035
{"dump machine code", 0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump, NULL},
1036
{NULL, 0, 0, NULL, NULL}
1039
c->Base.type = RC_VERTEX_PROGRAM;
1040
c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
1042
rc_run_compiler(&c->Base, vs_list);
1044
c->code->InputsRead = c->Base.Program.InputsRead;
1045
c->code->OutputsWritten = c->Base.Program.OutputsWritten;
1046
rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);