/*
 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#define PSC_DUMP(a) do { a } while (0)
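// Note: PSC_DUMP wraps the scheduler's debug output; in non-debug builds it
// presumably expands to nothing (the usual pattern is an #ifdef around this
// definition), so the sblog statements below cost nothing in release builds.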
#include "sb_shader.h"

#include "eg_sq.h" // V_SQ_CF_INDEX_NONE/0/1

rp_kcache_tracker::rp_kcache_tracker(shader &sh) : rp(), uc(),
// FIXME: for now we'll use the "two const pairs" limit for r600, same as
// for other chips, otherwise an additional check in alu_group_tracker is
// required to make sure that all 4 consts in the group fit into 2

bool rp_kcache_tracker::try_reserve(sel_chan r) {
unsigned sel = kc_sel(r);
for (unsigned i = 0; i < sel_count; ++i) {

bool rp_kcache_tracker::try_reserve(node* n) {
bool need_unreserve = false;
vvec::iterator I(n->src.begin()), E(n->src.end());
if (!try_reserve(v->select))
need_unreserve = true;
if (need_unreserve && I != n->src.begin()) {
} while (I != n->src.begin());

void rp_kcache_tracker::unreserve(node* n) {
vvec::iterator I(n->src.begin()), E(n->src.end());
unreserve(v->select);

void rp_kcache_tracker::unreserve(sel_chan r) {
unsigned sel = kc_sel(r);
for (unsigned i = 0; i < sel_count; ++i)
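// literal_tracker manages the 32-bit literal constants attached to an ALU
// group. A group has a fixed number of literal slots (MAX_ALU_LITERALS; the
// init_group_literals loop near the end of this file emits 4 of them), and
// identical literals are shared between instructions, so try_reserve below
// either allocates a fresh slot or bumps the use count of a matching one.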
bool literal_tracker::try_reserve(alu_node* n) {
bool need_unreserve = false;
vvec::iterator I(n->src.begin()), E(n->src.end());
for (; I != E; ++I) {
if (v->is_literal()) {
if (!try_reserve(v->literal_value))
need_unreserve = true;
if (need_unreserve && I != n->src.begin()) {
unreserve(v->literal_value);
} while (I != n->src.begin());

void literal_tracker::unreserve(alu_node* n) {
unsigned nsrc = n->bc.op_ptr->src_count, i;
for (i = 0; i < nsrc; ++i) {
value *v = n->src[i];
unreserve(v->literal_value);

bool literal_tracker::try_reserve(literal l) {
PSC_DUMP( sblog << "literal reserve " << l.u << " " << l.f << "\n"; );
for (unsigned i = 0; i < MAX_ALU_LITERALS; ++i) {
PSC_DUMP( sblog << " reserved new uc = " << uc[i] << "\n"; );
} else if (lt[i] == l) {
PSC_DUMP( sblog << " reserved uc = " << uc[i] << "\n"; );
PSC_DUMP( sblog << " failed to reserve literal\n"; );

void literal_tracker::unreserve(literal l) {
PSC_DUMP( sblog << "literal unreserve " << l.u << " " << l.f << "\n"; );
for (unsigned i = 0; i < MAX_ALU_LITERALS; ++i) {

static inline unsigned bs_cycle_vector(unsigned bs, unsigned src) {
static const unsigned swz[VEC_NUM][3] = {
{0, 1, 2}, {0, 2, 1}, {1, 2, 0}, {1, 0, 2}, {2, 0, 1}, {2, 1, 0}
assert(bs < VEC_NUM && src < 3);

static inline unsigned bs_cycle_scalar(unsigned bs, unsigned src) {
static const unsigned swz[SCL_NUM][3] = {
{2, 1, 0}, {1, 2, 2}, {2, 1, 2}, {2, 2, 1}
if (bs >= SCL_NUM || src >= 3) {
// this prevents gcc warning "array subscript is above array bounds"
// AFAICS we should never hit this path

static inline unsigned bs_cycle(bool trans, unsigned bs, unsigned src) {
return trans ? bs_cycle_scalar(bs, src) : bs_cycle_vector(bs, src);
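// Illustrative note on the tables above: bs selects one of the hardware
// bank-swizzle patterns and swz[bs][src] is the GPR read cycle assigned to
// that source operand. E.g. for the vector pattern {0, 2, 1}, src0 is read
// in cycle 0, src1 in cycle 2 and src2 in cycle 1. Each (cycle, channel)
// read port can serve only one GPR per group, which is the constraint
// rp_gpr_tracker arbitrates below.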
bool rp_gpr_tracker::try_reserve(unsigned cycle, unsigned sel, unsigned chan) {
if (rp[cycle][chan] == 0) {
rp[cycle][chan] = sel;
} else if (rp[cycle][chan] == sel) {

void rp_gpr_tracker::unreserve(alu_node* n) {
unsigned nsrc = n->bc.op_ptr->src_count, i;
unsigned trans = n->bc.slot == SLOT_TRANS;
unsigned bs = n->bc.bank_swizzle;
unsigned opt = !trans
&& n->bc.src[0].sel == n->bc.src[1].sel
&& n->bc.src[0].chan == n->bc.src[1].chan;
for (i = 0; i < nsrc; ++i) {
value *v = n->src[i];
if (v->is_readonly() || v->is_undef())
unsigned cycle = bs_cycle(trans, bs, i);
unreserve(cycle, n->bc.src[i].sel, n->bc.src[i].chan);

void rp_gpr_tracker::unreserve(unsigned cycle, unsigned sel, unsigned chan) {
assert(rp[cycle][chan] == sel && uc[cycle][chan]);
if (--uc[cycle][chan] == 0)
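// rp[cycle][chan] records which GPR (sel) currently owns the read port for
// the given read cycle and channel, and uc[cycle][chan] counts how many
// operands in the group share that reservation; the port becomes free again
// only when its last user unreserves it.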
bool rp_gpr_tracker::try_reserve(alu_node* n) {
unsigned nsrc = n->bc.op_ptr->src_count, i;
unsigned trans = n->bc.slot == SLOT_TRANS;
unsigned bs = n->bc.bank_swizzle;
unsigned opt = !trans && nsrc >= 2 &&
n->src[0] == n->src[1];
bool need_unreserve = false;
unsigned const_count = 0, min_gpr_cycle = 3;
for (i = 0; i < nsrc; ++i) {
value *v = n->src[i];
if (v->is_readonly() || v->is_undef()) {
if (trans && const_count == 3)
unsigned cycle = bs_cycle(trans, bs, i);
if (trans && cycle < min_gpr_cycle)
min_gpr_cycle = cycle;
if (const_count && cycle < const_count && trans)
if (!try_reserve(cycle, n->bc.src[i].sel, n->bc.src[i].chan))
need_unreserve = true;
if ((i == nsrc) && (min_gpr_cycle + 1 > const_count))
if (need_unreserve && i--) {
value *v = n->src[i];
if (!v->is_readonly() && !v->is_undef()) {
unreserve(bs_cycle(trans, bs, i), n->bc.src[i].sel,

alu_group_tracker::alu_group_tracker(shader &sh)
gpr(), lt(), slots(),
max_slots(sh.get_ctx().is_cayman() ? 4 : 5),
has_mova(), uses_ar(), has_predset(), has_kill(),
updates_exec_mask(), consumes_lds_oqa(), produces_lds_oqa(), chan_count(), interp_param(), next_id() {
available_slots = sh.get_ctx().has_trans ? 0x1F : 0x0F;
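// 0x1F enables all five slots (x, y, z, w and trans); chips without a trans
// unit (Cayman, where max_slots is 4 above) only get the vector mask 0x0F.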
sel_chan alu_group_tracker::get_value_id(value* v) {
unsigned &id = vmap[v];
return sel_chan(id, v->get_final_chan());
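// get_value_id hands out a stable, monotonically increasing id for each
// distinct value (via vmap), combined with the value's channel; try_reserve
// below uses these ids to enforce the per-channel limit on how many distinct
// GPR values a single group may read.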
void alu_group_tracker::assign_slot(unsigned slot, alu_node* n) {
available_slots &= ~(1 << slot);
unsigned param = n->interp_param();
assert(!interp_param || interp_param == param);
interp_param = param;

void alu_group_tracker::discard_all_slots(container_node &removed_nodes) {
PSC_DUMP( sblog << "agt::discard_all_slots\n"; );
discard_slots(~available_slots & ((1 << max_slots) - 1), removed_nodes);

void alu_group_tracker::discard_slots(unsigned slot_mask,
container_node &removed_nodes) {
sblog << "discard_slots : packed_ops : "
<< (unsigned)packed_ops.size() << "\n";
for (node_vec::iterator N, I = packed_ops.begin();
I != packed_ops.end(); I = N) {
alu_packed_node *n = static_cast<alu_packed_node*>(*I);
unsigned pslots = n->get_slot_mask();
sblog << "discard_slots : packed slot_mask : " << pslots << "\n";
if (pslots & slot_mask) {
sblog << "discard_slots : discarding packed...\n";
removed_nodes.push_back(n);
slot_mask &= ~pslots;
N = packed_ops.erase(I);
available_slots |= pslots;
for (unsigned k = 0; k < max_slots; ++k) {
if (pslots & (1 << k))

for (unsigned slot = 0; slot < max_slots; ++slot) {
unsigned slot_bit = 1 << slot;
if (slot_mask & slot_bit) {
assert(!(available_slots & slot_bit));
assert(!(slots[slot]->bc.slot_flags & AF_4SLOT));
sblog << "discarding slot " << slot << " : ";
dump::dump_op(slots[slot]);
removed_nodes.push_back(slots[slot]);
available_slots |= slot_bit;

alu_node *t = slots[4];
if (t && (t->bc.slot_flags & AF_V)) {
unsigned chan = t->bc.dst_chan;
sblog << " from trans slot to free slot " << chan << "\n";

alu_group_node* alu_group_tracker::emit() {
alu_group_node *g = sh.create_alu_group();
lt.init_group_literals(g);
for (unsigned i = 0; i < max_slots; ++i) {
alu_node *n = slots[i];

bool alu_group_tracker::try_reserve(alu_node* n) {
unsigned nsrc = n->bc.op_ptr->src_count;
unsigned slot = n->bc.slot;
bool trans = slot == 4;
unsigned flags = n->bc.op_ptr->flags;
unsigned param = n->interp_param();
if (param && interp_param && interp_param != param)
if ((flags & AF_KILL) && has_predset)
if ((flags & AF_ANY_PRED) && (has_kill || has_predset))
if ((flags & AF_MOVA) && (has_mova || uses_ar))
if (n->uses_ar() && has_mova)
if (consumes_lds_oqa)
if (n->consumes_lds_oq() && available_slots != (sh.get_ctx().has_trans ? 0x1F : 0x0F))
for (unsigned i = 0; i < nsrc; ++i) {
unsigned last_id = next_id;
value *v = n->src[i];
if (!v->is_any_gpr() && !v->is_rel())
sel_chan vid = get_value_id(n->src[i]);
if (vid > last_id && chan_count[vid.chan()] == 3) {
n->bc.src[i].sel = vid.sel();
n->bc.src[i].chan = vid.chan();
if (!lt.try_reserve(n))
if (!kc.try_reserve(n)) {
unsigned fbs = n->forced_bank_swizzle();
n->bc.bank_swizzle = 0;
n->bc.bank_swizzle = VEC_210;
if (gpr.try_reserve(n)) {
assign_slot(slot, n);
unsigned swz_num = trans ? SCL_NUM : VEC_NUM;
for (unsigned bs = 0; bs < swz_num; ++bs) {
n->bc.bank_swizzle = bs;
if (gpr.try_reserve(n)) {
assign_slot(slot, n);
unsigned forced_swz_slots = 0;
int first_slot = ~0, first_nf = ~0, last_slot = ~0;
for (unsigned i = 0; i < max_slots; ++i) {
alu_node *a = slots[i];
if (first_slot == ~0)
save_bs[i] = a->bc.bank_swizzle;
if (a->forced_bank_swizzle()) {
assert(i != SLOT_TRANS);
forced_swz_slots |= (1 << i);
a->bc.bank_swizzle = VEC_210;
if (!gpr.try_reserve(a))
assert(!"internal reservation error");
a->bc.bank_swizzle = 0;
if (first_nf == ~0) {
assign_slot(slot, n);
assert(first_slot != ~0 && last_slot != ~0);
// silence "array subscript is above array bounds" with gcc 4.8
alu_node *a = slots[i];
bool backtrack = false;
sblog << " bs: trying s" << i << " bs:" << a->bc.bank_swizzle
<< " bt:" << backtrack << "\n";
if (!backtrack && gpr.try_reserve(a)) {
sblog << " bs: reserved s" << i << " bs:" << a->bc.bank_swizzle
while ((++i <= last_slot) && !slots[i]);
bool itrans = i == SLOT_TRANS;
unsigned max_swz = itrans ? SCL_221 : VEC_210;
if (a->bc.bank_swizzle < max_swz) {
++a->bc.bank_swizzle;
sblog << " bs: inc s" << i << " bs:" << a->bc.bank_swizzle
a->bc.bank_swizzle = 0;
while ((--i >= first_nf) && !slots[i]);
sblog << " bs: unreserve s" << i << " bs:" << a->bc.bank_swizzle
if (i == last_slot + 1) {
assign_slot(slot, n);
// reservation failed, restore previous state
for (unsigned i = 0; i < max_slots; ++i) {
alu_node *a = slots[i];
a->bc.bank_swizzle = save_bs[i];
bool b = gpr.try_reserve(a);
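// The search above is essentially an odometer over the per-slot bank_swizzle
// values: advance the current slot's swizzle until its GPR reads can be
// reserved, and when a slot runs out of patterns (max_swz), reset it and
// step back to the previous non-forced slot. Reaching last_slot + 1 means a
// consistent assignment was found for every occupied slot; otherwise the
// saved swizzles (save_bs) are restored and the reservation fails.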
bool alu_group_tracker::try_reserve(alu_packed_node* p) {
bool need_unreserve = false;
node_iterator I(p->begin()), E(p->end());
for (; I != E; ++I) {
alu_node *n = static_cast<alu_node*>(*I);
need_unreserve = true;
packed_ops.push_back(p);
if (need_unreserve) {
alu_node *n = static_cast<alu_node*>(*I);
slots[n->bc.slot] = NULL;

void alu_group_tracker::reinit() {
memcpy(s, slots, sizeof(slots));
for (int i = max_slots - 1; i >= 0; --i) {
if (s[i] && !try_reserve(s[i])) {
sblog << "alu_group_tracker: reinit error on slot " << i << "\n";
for (unsigned i = 0; i < max_slots; ++i) {
sblog << " slot " << i << " : ";
assert(!"alu_group_tracker: reinit error");

void alu_group_tracker::reset(bool keep_packed) {
memset(slots, 0, sizeof(slots));
produces_lds_oqa = 0;
consumes_lds_oqa = 0;
updates_exec_mask = false;
available_slots = sh.get_ctx().has_trans ? 0x1F : 0x0F;

void alu_group_tracker::update_flags(alu_node* n) {
unsigned flags = n->bc.op_ptr->flags;
has_kill |= (flags & AF_KILL);
has_mova |= (flags & AF_MOVA);
has_predset |= (flags & AF_ANY_PRED);
uses_ar |= n->uses_ar();
consumes_lds_oqa |= n->consumes_lds_oq();
produces_lds_oqa |= n->produces_lds_oq();
if (flags & AF_ANY_PRED) {
if (n->dst[2] != NULL)
updates_exec_mask = true;

int post_scheduler::run() {
return run_on(sh.root) ? 0 : 1;
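// The post scheduler walks the program bottom-up (note the reverse iterators
// in run_on below): liveness is propagated backwards, so instructions are
// picked and grouped starting from the end of each basic block.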
bool post_scheduler::run_on(container_node* n) {
for (node_riterator I = n->rbegin(), E = n->rend(); I != E; ++I) {
if (I->is_container()) {
if (I->subtype == NST_BB) {
bb_node* bb = static_cast<bb_node*>(*I);
r = run_on(static_cast<container_node*>(*I));

void post_scheduler::init_uc_val(container_node *c, value *v) {
node *d = v->any_def();
if (d && d->parent == c)

void post_scheduler::init_uc_vec(container_node *c, vvec &vv, bool src) {
for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
if (!v || v->is_readonly())
init_uc_val(c, v->rel);
init_uc_vec(c, v->muse, true);

unsigned post_scheduler::init_ucm(container_node *c, node *n) {
init_uc_vec(c, n->src, true);
init_uc_vec(c, n->dst, false);
uc_map::iterator F = ucm.find(n);
return F == ucm.end() ? 0 : F->second;

bool post_scheduler::schedule_bb(bb_node* bb) {
sblog << "scheduling BB " << bb->id << "\n";
if (!pending.empty())
dump::dump_op_list(&pending);
assert(pending.empty());
assert(bb_pending.empty());
assert(ready.empty());
bb_pending.append_from(bb);
while ((n = bb_pending.back())) {
sblog << "post_sched_bb ";
// May require emitting ALU ops to load index registers
if (n->is_fetch_clause()) {
process_fetch(static_cast<container_node *>(n));
if (n->is_alu_clause()) {
bool r = process_alu(static_cast<container_node*>(n));

void post_scheduler::init_regmap() {
sblog << "init_regmap: live: ";
dump::dump_set(sh, live);
for (val_set::iterator I = live.begin(sh), E = live.end(sh); I != E; ++I) {
if (!v->is_sgpr() || !v->is_prealloc())
sblog << "init_regmap: " << r << " <= ";

static alu_node *create_set_idx(shader &sh, unsigned ar_idx) {
alu_node *a = sh.create_alu();
assert(ar_idx == V_SQ_CF_INDEX_0 || ar_idx == V_SQ_CF_INDEX_1);
if (ar_idx == V_SQ_CF_INDEX_0)
a->bc.set_op(ALU_OP0_SET_CF_IDX0);
a->bc.set_op(ALU_OP0_SET_CF_IDX1);
a->dst.resize(1); // Dummy needed for recolor
sblog << "created IDX load: ";

void post_scheduler::load_index_register(value *v, unsigned ar_idx)
{
if (!sh.get_ctx().is_cayman()) {
// Evergreen has to load the address register first, then use CF_SET_IDX0/1
alu_group_tracker &rt = alu.grp();
alu_node *set_idx = create_set_idx(sh, ar_idx);
if (!rt.try_reserve(set_idx)) {
sblog << "can't emit SET_CF_IDX";
dump::dump_op(set_idx);
if (!alu.check_clause_limits()) {
// Can't happen since clause only contains MOVA/CF_SET_IDX0/1
alu_group_tracker &rt = alu.grp();
alu_node *a = alu.create_ar_load(v, ar_idx == V_SQ_CF_INDEX_1 ? SEL_Z : SEL_Y);
if (!rt.try_reserve(a)) {
sblog << "can't emit AR load : ";
if (!alu.check_clause_limits()) {
// Can't happen since clause only contains MOVA/CF_SET_IDX0/1
alu.emit_clause(cur_bb);
void post_scheduler::process_fetch(container_node *c) {
for (node_iterator N, I = c->begin(), E = c->end(); I != E; I = N) {
fetch_node *f = static_cast<fetch_node*>(n);
sblog << "process_tex ";
// TODO: if the same values are used, reloading the index register can be avoided
if (f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE ||
f->bc.resource_index_mode != V_SQ_CF_INDEX_NONE) {
unsigned index_mode = f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE ?
f->bc.sampler_index_mode : f->bc.resource_index_mode;
// Currently require prior opt passes to use one TEX per indexed op
assert(f->parent->count() == 1);
value *v = f->src.back(); // Last src is index offset
cur_bb->push_front(c);
load_index_register(v, index_mode);
f->src.pop_back(); // Don't need index value any more
cur_bb->push_front(c);

bool post_scheduler::process_alu(container_node *c) {
live = c->live_after;
init_globals(c->live_after, true);
init_globals(c->live_before, true);
update_local_interferences();
for (node_riterator N, I = c->rbegin(), E = c->rend(); I != E; I = N) {
unsigned uc = init_ucm(c, n);
sblog << "process_alu uc=" << uc << " ";
pending.push_back(n);
PSC_DUMP( sblog << "pending\n"; );
return schedule_alu(c);

void post_scheduler::update_local_interferences() {
sblog << "update_local_interferences : ";
dump::dump_set(sh, live);
for (val_set::iterator I = live.begin(sh), E = live.end(sh); I != E; ++I) {
if (v->is_prealloc())
v->interferences.add_set(live);

void post_scheduler::update_live_src_vec(vvec &vv, val_set *born, bool src) {
for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
if (src && v->is_any_gpr()) {
if (live.add_val(v)) {
if (!v->is_prealloc()) {
if (!cleared_interf.contains(v)) {
sblog << "clearing interferences for " << *v << "\n";
v->interferences.clear();
cleared_interf.add_val(v);
} else if (v->is_rel()) {
if (!v->rel->is_any_gpr())
live.add_val(v->rel);
update_live_src_vec(v->muse, born, true);

void post_scheduler::update_live_dst_vec(vvec &vv) {
for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
update_live_dst_vec(v->mdef);
} else if (v->is_any_gpr()) {
if (!live.remove_val(v)) {
sblog << "failed to remove ";
sblog << " from live : ";
dump::dump_set(sh, live);

void post_scheduler::update_live(node *n, val_set *born) {
update_live_dst_vec(n->dst);
update_live_src_vec(n->src, born, true);
update_live_src_vec(n->dst, born, false);
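// Liveness bookkeeping matches the bottom-up walk: update_live first kills
// the destinations of n (update_live_dst_vec removes them from 'live'), then
// marks the sources live again; 'born' collects values whose live range
// starts at this point so callers like process_group can track them.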
void post_scheduler::process_group() {
alu_group_tracker &rt = alu.grp();
sblog << "process_group: live_before : ";
dump::dump_set(sh, live);
for (unsigned s = 0; s < ctx.num_slots; ++s) {
alu_node *n = rt.slot(s);
update_live(n, &vals_born);
sblog << "process_group: live_after : ";
dump::dump_set(sh, live);
update_local_interferences();
for (unsigned i = 0; i < 5; ++i) {
node *n = rt.slot(i);
if (n && !n->is_mova()) {
release_src_values(n);

void post_scheduler::init_globals(val_set &s, bool prealloc) {
sblog << "init_globals: ";
dump::dump_set(sh, s);
for (val_set::iterator I = s.begin(sh), E = s.end(sh); I != E; ++I) {
if (v->is_sgpr() && !v->is_global()) {
if (prealloc && v->is_fixed()) {

void post_scheduler::emit_index_registers() {
for (unsigned i = 0; i < 2; i++) {
if (alu.current_idx[i]) {
regmap = prev_regmap;
alu.discard_current_group();
load_index_register(alu.current_idx[i], KC_INDEX_0 + i);
alu.current_idx[i] = NULL;

void post_scheduler::emit_clause() {
if (alu.current_ar) {
if (!alu.check_clause_limits()) {
// Can't happen since clause only contains MOVA/CF_SET_IDX0/1
if (!alu.is_empty()) {
alu.emit_clause(cur_bb);
emit_index_registers();

bool post_scheduler::schedule_alu(container_node *c) {
assert(!ready.empty() || !ready_copies.empty());
/* This number is rather arbitrary; what matters is that the scheduler
* gets more than one try to create an instruction group
*/
int last_pending = pending.count();
while (improving > 0) {
prev_regmap = regmap;
if (!prepare_alu_group()) {
int new_pending = pending.count();
if ((new_pending < last_pending) || (last_pending == 0))
last_pending = new_pending;
if (alu.current_idx[0] || alu.current_idx[1]) {
regmap = prev_regmap;
init_globals(live, false);
if (alu.current_ar) {
if (!alu.check_clause_limits()) {
regmap = prev_regmap;
init_globals(live, false);
if (!alu.is_empty()) {
if (!ready.empty()) {
sblog << "##post_scheduler: unscheduled ready instructions :";
dump::dump_op_list(&ready);
assert(!"unscheduled ready instructions");
if (!pending.empty()) {
sblog << "##post_scheduler: unscheduled pending instructions :";
dump::dump_op_list(&pending);
assert(!"unscheduled pending instructions");

void post_scheduler::add_interferences(value *v, sb_bitset &rb, val_set &vs) {
unsigned chan = v->gpr.chan();
for (val_set::iterator I = vs.begin(sh), E = vs.end(sh);
sel_chan gpr = vi->get_final_gpr();
if (vi->is_any_gpr() && gpr && vi != v &&
(!v->chunk || v->chunk != vi->chunk) &&
vi->is_fixed() && gpr.chan() == chan) {
unsigned r = gpr.sel();
sblog << "\tadd_interferences: " << *vi << "\n";

void post_scheduler::set_color_local_val(value *v, sel_chan color) {
sblog << " recolored: ";

void post_scheduler::set_color_local(value *v, sel_chan color) {
vvec &vv = v->chunk->values;
for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
set_color_local_val(v2, color);
set_color_local_val(v, color);

bool post_scheduler::recolor_local(value *v) {
assert(v->is_sgpr());
assert(!v->is_prealloc());
unsigned chan = v->gpr.chan();
sblog << "recolor_local: ";
sblog << " interferences: ";
dump::dump_set(sh, v->interferences);
sblog << " in chunk: ";
coalescer::dump_chunk(v->chunk);
for (vvec::iterator I = v->chunk->values.begin(),
E = v->chunk->values.end(); I != E; ++I) {
PSC_DUMP( sblog << " add_interferences for " << *v2 << " :\n"; );
add_interferences(v, rb, v2->interferences);
add_interferences(v, rb, v->interferences);
unsigned sz = rb.size();
sblog << "registers bits: " << sz;
for (unsigned r = 0; r < sz; ++r) {
sblog << "\n " << r << " ";
sblog << (rb.get(r) ? 1 : 0);
bool no_temp_gprs = v->is_global();
unsigned rs, re, pass = no_temp_gprs ? 1 : 0;
rs = sh.first_temp_gpr();
re = sh.num_nontemp_gpr();
for (unsigned reg = rs; reg < re; ++reg) {
if (reg >= rb.size() || !rb.get(reg)) {
set_color_local(v, sel_chan(reg, chan));
assert(!"recolor_local failed");
void post_scheduler::emit_load_ar() {
regmap = prev_regmap;
alu.discard_current_group();
alu_group_tracker &rt = alu.grp();
alu_node *a = alu.create_ar_load(alu.current_ar, SEL_X);
if (!rt.try_reserve(a)) {
sblog << "can't emit AR load : ";

bool post_scheduler::unmap_dst_val(value *d) {
if (d == alu.current_ar) {
if (d->is_prealloc()) {
sel_chan gpr = d->get_final_gpr();
rv_map::iterator F = regmap.find(gpr);
if (F != regmap.end())
if (c && c!=d && (!c->chunk || c->chunk != d->chunk)) {
sblog << "dst value conflict : ";
sblog << " regmap contains ";
assert(!"scheduler error");

bool post_scheduler::unmap_dst(alu_node *n) {
value *d = n->dst.empty() ? NULL : n->dst[0];
if (d && d->is_any_reg()) {
if (alu.current_ar != d) {
sblog << "loading wrong ar value\n";
alu.current_ar = NULL;
} else if (d->is_any_gpr()) {
if (!unmap_dst_val(d))
for (vvec::iterator I = d->mdef.begin(), E = d->mdef.end();
assert(d->is_any_gpr());
if (!unmap_dst_val(d))

bool post_scheduler::map_src_val(value *v) {
if (!v->is_prealloc())
sel_chan gpr = v->get_final_gpr();
rv_map::iterator F = regmap.find(gpr);
if (F != regmap.end()) {
if (!v->v_equal(c)) {
sblog << "can't map src value ";
sblog << ", regmap contains ";
regmap.insert(std::make_pair(gpr, v));

bool post_scheduler::map_src_vec(vvec &vv, bool src) {
// Handle possible UBO indexing
bool ubo_indexing[2] = { false, false };
for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
if (v->is_kcache()) {
unsigned index_mode = v->select.kcache_index_mode();
if (index_mode == KC_INDEX_0 || index_mode == KC_INDEX_1) {
ubo_indexing[index_mode - KC_INDEX_0] = true;
// idx values stored at end of src vec, see bc_parser::prepare_alu_group
for (unsigned i = 2; i != 0; i--) {
if (ubo_indexing[i-1]) {
// TODO: skip adding value to kcache reservation somehow, causes
// unnecessary group breaks and cache line locks
value *v = vv.back();
if (alu.current_idx[i-1] && alu.current_idx[i-1] != v) {
sblog << "IDX" << i-1 << " already set to " <<
*alu.current_idx[i-1] << ", trying to set " << *v << "\n";
alu.current_idx[i-1] = v;
PSC_DUMP(sblog << "IDX" << i-1 << " set to " << *v << "\n";);
for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
if ((!v->is_any_gpr() || !v->is_fixed()) && !v->is_rel())
value *rel = v->rel;
if (!rel->is_const()) {
if (!map_src_vec(v->muse, true))
if (rel != alu.current_ar) {
if (alu.current_ar) {
sblog << " current_AR is " << *alu.current_ar
<< " trying to use " << *rel << "\n";
alu.current_ar = rel;
sblog << " new current_AR assigned: " << *alu.current_ar
if (!map_src_val(v)) {
bool post_scheduler::map_src(alu_node *n) {
if (!map_src_vec(n->dst, false))
if (!map_src_vec(n->src, true))

void post_scheduler::dump_regmap() {
sblog << "# REGMAP :\n";
for(rv_map::iterator I = regmap.begin(), E = regmap.end(); I != E; ++I) {
sblog << " # " << I->first << " => " << *(I->second) << "\n";
sblog << " current_AR: " << *alu.current_ar << "\n";
sblog << " current_PR: " << *alu.current_pr << "\n";
if (alu.current_idx[0])
sblog << " current IDX0: " << *alu.current_idx[0] << "\n";
if (alu.current_idx[1])
sblog << " current IDX1: " << *alu.current_idx[1] << "\n";

void post_scheduler::recolor_locals() {
alu_group_tracker &rt = alu.grp();
for (unsigned s = 0; s < ctx.num_slots; ++s) {
alu_node *n = rt.slot(s);
value *d = n->dst[0];
if (d && d->is_sgpr() && !d->is_prealloc()) {

// returns true if there are interferences
bool post_scheduler::check_interferences() {
alu_group_tracker &rt = alu.grp();
unsigned interf_slots;
bool discarded = false;
sblog << "check_interferences: before: \n";
for (unsigned s = 0; s < ctx.num_slots; ++s) {
alu_node *n = rt.slot(s);
if (!unmap_dst(n)) {
for (unsigned s = 0; s < ctx.num_slots; ++s) {
alu_node *n = rt.slot(s);
interf_slots |= (1 << s);
for (unsigned i = 0; i < 5; ++i) {
if (interf_slots & (1 << i)) {
sblog << "!!!!!! interf slot: " << i << " : ";
dump::dump_op(rt.slot(i));
PSC_DUMP( sblog << "ci: discarding slots " << interf_slots << "\n"; );
rt.discard_slots(interf_slots, alu.conflict_nodes);
regmap = prev_regmap;
sblog << "check_interferences: after: \n";

// add instruction(s) (alu_node or contents of alu_packed_node) to current group
// returns the number of added instructions on success
unsigned post_scheduler::try_add_instruction(node *n) {
alu_group_tracker &rt = alu.grp();
unsigned avail_slots = rt.avail_slots();
// Cannot schedule in same clause as instructions using this index value
if (!n->dst.empty() && n->dst[0] &&
(n->dst[0] == alu.current_idx[0] || n->dst[0] == alu.current_idx[1])) {
PSC_DUMP(sblog << " CF_IDX source: " << *n->dst[0] << "\n";);
if (n->is_alu_packed()) {
alu_packed_node *p = static_cast<alu_packed_node*>(n);
unsigned slots = p->get_slot_mask();
unsigned cnt = __builtin_popcount(slots);
if ((slots & avail_slots) != slots) {
PSC_DUMP( sblog << " no slots \n"; );
p->update_packed_items(ctx);
if (!rt.try_reserve(p)) {
PSC_DUMP( sblog << " reservation failed \n"; );
alu_node *a = static_cast<alu_node*>(n);
value *d = a->dst.empty() ? NULL : a->dst[0];
if (d && d->is_special_reg()) {
assert((a->bc.op_ptr->flags & AF_MOVA) || d->is_geometry_emit() || d->is_lds_oq() || d->is_lds_access() || d->is_scratch());
unsigned allowed_slots = ctx.alu_slots_mask(a->bc.op_ptr);
allowed_slots &= avail_slots;
slot = d->get_final_chan();
a->bc.dst_chan = slot;
allowed_slots &= (1 << slot) | 0x10;
if (a->bc.op_ptr->flags & AF_MOVA) {
if (a->bc.slot_flags & AF_V)
allowed_slots &= (1 << SLOT_X);
allowed_slots &= (1 << SLOT_TRANS);
// FIXME workaround for some problems with MULADD in trans slot on r700,
// (is it really needed on r600?)
if ((a->bc.op == ALU_OP3_MULADD || a->bc.op == ALU_OP3_MULADD_IEEE) &&
allowed_slots &= 0x0F;
if (!allowed_slots) {
PSC_DUMP( sblog << " no suitable slots\n"; );
slot = __builtin_ctz(allowed_slots);
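// __builtin_ctz picks the lowest set bit, i.e. the lowest-numbered slot the
// instruction is still allowed to use: e.g. allowed_slots == 0x14 (0b10100)
// yields slot 2 (SLOT_Z).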
PSC_DUMP( sblog << "slot: " << slot << "\n"; );
if (!rt.try_reserve(a)) {
PSC_DUMP( sblog << " reservation failed\n"; );

bool post_scheduler::check_copy(node *n) {
if (!n->is_copy_mov())
value *s = n->src[0];
value *d = n->dst[0];
if (!s->is_sgpr() || !d->is_sgpr())
if (!s->is_prealloc()) {
if (!s->chunk || s->chunk != d->chunk)
if (s->gpr == d->gpr) {
sblog << "check_copy: ";
rv_map::iterator F = regmap.find(d->gpr);
bool gpr_free = (F == regmap.end());
if (d->is_prealloc()) {
PSC_DUMP( sblog << " copy not ready...\n";);
value *rv = F->second;
if (rv != d && (!rv->chunk || rv->chunk != d->chunk)) {
PSC_DUMP( sblog << " copy not ready(2)...\n";);
unmap_dst(static_cast<alu_node*>(n));
if (s->is_prealloc() && !map_src_val(s))
update_live(n, NULL);
release_src_values(n);
PSC_DUMP( sblog << " copy coalesced...\n";);

void post_scheduler::dump_group(alu_group_tracker &rt) {
for (unsigned i = 0; i < 5; ++i) {
node *n = rt.slot(i);
sblog << "slot " << i << " : ";

void post_scheduler::process_ready_copies() {
last = ready_copies.back();
for (node_iterator N, I = ready_copies.begin(), E = ready_copies.end();
if (!check_copy(n)) {
} while (last != ready_copies.back());
update_local_interferences();

bool post_scheduler::prepare_alu_group() {
alu_group_tracker &rt = alu.grp();
sblog << "prepare_alu_group: starting...\n";
ready.append_from(&alu.conflict_nodes);
// FIXME rework this loop
process_ready_copies();
for (node_iterator N, I = ready.begin(), E = ready.end(); I != E;
unsigned cnt = try_add_instruction(n);
sblog << "current group:\n";
if (rt.inst_count() == ctx.num_slots) {
PSC_DUMP( sblog << " all slots used\n"; );
if (!check_interferences())
// don't try to add more instructions to a group with mova if this
// could break the clause slot count limit - we don't want mova to
// end up at the end of a new clause instead of the beginning of the
if (rt.has_ar_load() && alu.total_slots() > 121)
if (rt.inst_count() && i1 > 50)
regmap = prev_regmap;
sblog << " prepare_alu_group done, " << rt.inst_count()
sblog << "$$$$$$$$PAG i1=" << i1
<< " ready " << ready.count()
<< " pending " << pending.count()
<< " conflicting " << alu.conflict_nodes.count()
return rt.inst_count();
void post_scheduler::release_src_values(node* n) {
release_src_vec(n->src, true);
release_src_vec(n->dst, false);

void post_scheduler::release_op(node *n) {
sblog << "release_op ";
if (n->is_copy_mov()) {
ready_copies.push_back(n);
} else if (n->is_mova() || n->is_pred_set()) {
ready.push_front(n);

void post_scheduler::release_src_val(value *v) {
node *d = v->any_def();

void post_scheduler::release_src_vec(vvec& vv, bool src) {
for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
if (!v || v->is_readonly())
release_src_val(v->rel);
release_src_vec(v->muse, true);

void literal_tracker::reset() {
memset(uc, 0, sizeof(uc));

void rp_gpr_tracker::reset() {
memset(rp, 0, sizeof(rp));
memset(uc, 0, sizeof(uc));

void rp_kcache_tracker::reset() {
memset(rp, 0, sizeof(rp));
memset(uc, 0, sizeof(uc));

void alu_kcache_tracker::reset() {
memset(kc, 0, sizeof(kc));

void alu_clause_tracker::reset() {
outstanding_lds_oqa_reads = 0;

alu_clause_tracker::alu_clause_tracker(shader &sh)
: sh(sh), kt(sh.get_ctx().hw_class), slot_count(),
push_exec_mask(), outstanding_lds_oqa_reads(),
current_ar(), current_pr(), current_idx() {}

void alu_clause_tracker::emit_group() {
assert(grp().inst_count());
alu_group_node *g = grp().emit();
if (grp().has_update_exec_mask()) {
assert(!push_exec_mask);
push_exec_mask = true;
clause = sh.create_clause(NST_ALU_CLAUSE);
clause->push_front(g);
outstanding_lds_oqa_reads += grp().get_consumes_lds_oqa();
outstanding_lds_oqa_reads -= grp().get_produces_lds_oqa();
slot_count += grp().slot_count();
PSC_DUMP( sblog << " #### group emitted\n"; );

void alu_clause_tracker::emit_clause(container_node *c) {
kt.init_clause(clause->bc);
assert(!outstanding_lds_oqa_reads);
assert(!current_ar);
assert(!current_pr);
clause->bc.set_op(CF_OP_ALU_PUSH_BEFORE);
c->push_front(clause);
push_exec_mask = false;
PSC_DUMP( sblog << "######### ALU clause emitted\n"; );
2042
alu_group_tracker > = grp();
2044
unsigned slots = gt.slot_count();
2046
// reserving slots to load AR and PR values
2047
unsigned reserve_slots = (current_ar ? 1 : 0) + (current_pr ? 1 : 0);
2048
// ...and index registers
2049
reserve_slots += (current_idx[0] != NULL) + (current_idx[1] != NULL);
2051
if (gt.get_consumes_lds_oqa() && !outstanding_lds_oqa_reads)
2052
reserve_slots += 60;
2054
if (slot_count + slots > MAX_ALU_SLOTS - reserve_slots)
2057
if (!kt.try_reserve(gt))
2063
void alu_clause_tracker::new_group() {
2068
bool alu_clause_tracker::is_empty() {
2069
return clause == NULL;
2072
void literal_tracker::init_group_literals(alu_group_node* g) {
2074
g->literals.clear();
2075
for (unsigned i = 0; i < 4; ++i) {
2079
g->literals.push_back(lt[i]);
2082
sblog << "literal emitted: " << lt[i].f;
2083
sblog.print_zw_hex(lt[i].u, 8);
2084
sblog << " " << lt[i].i << "\n";
2089
bool alu_kcache_tracker::try_reserve(alu_group_tracker& gt) {
2090
rp_kcache_tracker &kt = gt.kcache();
2095
sb_set<unsigned> group_lines;
2097
unsigned nl = kt.get_lines(group_lines);
2100
sb_set<unsigned> clause_lines(lines);
2101
lines.add_set(group_lines);
2103
if (clause_lines.size() == lines.size())
2109
lines = clause_lines;
2114
unsigned rp_kcache_tracker::get_lines(kc_lines& lines) {
2117
for (unsigned i = 0; i < sel_count; ++i) {
2118
unsigned line = rp[i] & 0x1fffffffu;
2119
unsigned index_mode = rp[i] >> 29;
2125
line = (sel_count == 2) ? line >> 5 : line >> 6;
2126
line |= index_mode << 29;
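// This repacks a tracked constant address into a kcache line id: the low
// bits (5 or 6 of them, depending on sel_count) address constants within a
// line and are shifted out, while the 3-bit index_mode is kept in the top
// bits so that lines accessed through different CF index registers stay
// distinct.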
2128
if (lines.insert(line).second)
2134
bool alu_kcache_tracker::update_kc() {
2137
bc_kcache old_kc[4];
2138
memcpy(old_kc, kc, sizeof(kc));
2140
for (kc_lines::iterator I = lines.begin(), E = lines.end(); I != E; ++I) {
2141
unsigned index_mode = *I >> 29;
2142
unsigned line = *I & 0x1fffffffu;
2143
unsigned bank = line >> 8;
2145
assert(index_mode <= KC_INDEX_INVALID);
2148
if (c && (bank == kc[c-1].bank) && (kc[c-1].addr + 1 == line) &&
2149
kc[c-1].index_mode == index_mode)
2151
kc[c-1].mode = KC_LOCK_2;
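// If the new line directly follows the previous entry's line in the same
// bank (with the same index mode), the previous single-line lock is widened
// to KC_LOCK_2 instead of spending another of the four kcache set entries.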
memcpy(kc, old_kc, sizeof(kc));
kc[c].mode = KC_LOCK_1;
kc[c].index_mode = index_mode;

alu_node* alu_clause_tracker::create_ar_load(value *v, chan_select ar_channel) {
alu_node *a = sh.create_alu();
if (sh.get_ctx().uses_mova_gpr) {
a->bc.set_op(ALU_OP1_MOVA_GPR_INT);
a->bc.slot = SLOT_TRANS;
a->bc.set_op(ALU_OP1_MOVA_INT);
a->bc.slot = SLOT_X;
a->bc.dst_chan = ar_channel;
if (ar_channel != SEL_X && sh.get_ctx().is_cayman()) {
a->bc.dst_gpr = ar_channel == SEL_Y ? CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1;
a->src.push_back(v);
sblog << "created AR load: ";

void alu_clause_tracker::discard_current_group() {
PSC_DUMP( sblog << "act::discard_current_group\n"; );
grp().discard_all_slots(conflict_nodes);

void rp_gpr_tracker::dump() {
sblog << "=== gpr_tracker dump:\n";
for (int c = 0; c < 3; ++c) {
sblog << "cycle " << c << " ";
for (int h = 0; h < 4; ++h) {
sblog << rp[c][h] << ":" << uc[c][h] << " ";
} // namespace r600_sb