2
/*--------------------------------------------------------------------*/
3
/*--- Cache simulation. ---*/
5
/*--------------------------------------------------------------------*/
8
This file is part of Callgrind.
9
(c) 2003-2005, Josef Weidendorfer
11
Parts are Copyright (C) 2002 Nicholas Nethercote
15
This program is free software; you can redistribute it and/or
16
modify it under the terms of the GNU General Public License as
17
published by the Free Software Foundation; either version 2 of the
18
License, or (at your option) any later version.
20
This program is distributed in the hope that it will be useful, but
21
WITHOUT ANY WARRANTY; without even the implied warranty of
22
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23
General Public License for more details.
25
You should have received a copy of the GNU General Public License
26
along with this program; if not, write to the Free Software
27
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
30
The GNU General Public License is contained in the file COPYING.
37
- simulates a write-allocate cache
38
- (block --> set) hash function uses simple bit selection
39
- handling of references straddling two cache blocks:
40
- counts as only one cache access (not two)
41
- both blocks hit --> one hit
42
- one block hits, the other misses --> one miss
43
- both blocks miss --> one miss (not two)
46
/* Cache configuration */
49
/* additional structures for cache use info, separated
50
* according to usage frequency:
51
* - line_loaded : pointer to cost center of instruction
52
* which loaded the line into cache.
53
* Needed to increment counters when line is evicted.
54
* - line_use : updated on every access
58
UInt mask; /* e.g. for 64Byte line size 1bit/2Byte */
63
line_use* dep_use; /* point to higher-level cacheblock for this memline */
72
int line_size; /* bytes */
73
Bool sectored; /* prefetch nearside cacheline on read */
92
* States of flat caches in our model.
93
* We use a 2-level hierarchy,
95
static cache_t2 I1, D1, L2;
97
/* Lower bits of cache tags are used as flags for a cache line */
98
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
99
#define CACHELINE_DIRTY 1
102
/* Cache simulator Options */
103
static Bool clo_simulate_writeback = False;
104
static Bool clo_simulate_hwpref = False;
105
static Bool clo_simulate_sectors = False;
106
static Bool clo_collect_cacheuse = False;
108
/* The following global vars are set up beforehand by
109
* setup_bbcc()/cachesim_after_bbsetup():
111
* - Addr bb_base (instruction start address of original BB)
112
* - ULong* cost_base (start of cost array for BB)
113
* - BBCC* nonskipped (only != 0 when in a function not skipped)
116
/* Offset to events in event set, used in log_* functions */
117
static Int off_D0_Ir;
118
static Int off_D1r_Ir;
119
static Int off_D1r_Dr;
120
static Int off_D1w_Ir;
121
static Int off_D1w_Dw;
122
static Int off_D2_Ir;
123
static Int off_D2_Dr;
124
static Int off_D2_Dw;
127
static ULong* cost_base;
128
static InstrInfo* current_ii;
130
/* Cache use offsets */
131
/* FIXME: The offsets are only correct because all eventsets get
132
* the "Use" set added first !
134
static Int off_I1_AcCost = 0;
135
static Int off_I1_SpLoss = 1;
136
static Int off_D1_AcCost = 0;
137
static Int off_D1_SpLoss = 1;
138
static Int off_L2_AcCost = 2;
139
static Int off_L2_SpLoss = 3;
141
/* Cache access types */
142
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
144
/* Result of a reference into a flat cache */
145
typedef enum { Hit = 0, Miss, MissDirty } CacheResult;
147
/* Result of a reference into a hierarchical cache model */
152
WriteBackMemAccess } CacheModelResult;
154
typedef CacheModelResult (*simcall_type)(Addr, UChar);
157
simcall_type I1_Read;
158
simcall_type D1_Read;
159
simcall_type D1_Write;
162
/*------------------------------------------------------------*/
163
/*--- Cache Simulator Initialization ---*/
164
/*------------------------------------------------------------*/
166
/*
 * Reset the simulated state of cache `c`.
 *
 * The visible statements walk all c->sets * c->assoc entries.  For each
 * entry the per-line load bookkeeping (memline, use_base, dep_use, iaddr)
 * is zeroed and the tag is set to the entry's way index within its set
 * (i % c->assoc), which seeds the low tag bits used as a pointer.
 *
 * NOTE(review): the body of the first loop and any condition guarding the
 * second loop are not visible in this listing.
 */
static void cachesim_clearcache(cache_t2* c)
170
for (i = 0; i < c->sets * c->assoc; i++)
173
for (i = 0; i < c->sets * c->assoc; i++) {
174
c->loaded[i].memline = 0;
175
c->loaded[i].use_base = 0;
176
c->loaded[i].dep_use = 0;
177
c->loaded[i].iaddr = 0;
180
c->tags[i] = i % c->assoc; /* init lower bits as pointer */
185
static void cacheuse_initcache(cache_t2* c);
187
/* By this point, the size/assoc/line_size has been checked. */
188
/*
 * Initialise the simulator cache `c` from the user configuration `config`.
 *
 * Copies size, associativity and line size, then derives:
 *   - sets            number of sets (size / line_size / assoc)
 *   - sets_min_1      sets - 1, used as a bit mask for set selection
 *   - assoc_bits      log2(assoc)
 *   - line_size_bits  log2(line_size)
 *   - tag_shift       line_size_bits + log2(sets)
 *   - tag_mask        all bits at or above tag_shift
 * It asserts that the flag bits (CACHELINE_FLAGMASK) do not overlap the
 * tag mask, fills in the textual description, allocates the tag array
 * (one UWord per line), initialises the cache-use structures when
 * clo_collect_cacheuse is set, and finally clears the cache.
 *
 * NOTE(review): two VG_(sprintf) calls into c->desc_line are visible; the
 * condition selecting between "direct-mapped" and "N-way associative" is
 * not visible in this listing.
 * NOTE(review): `1<<c->tag_shift` is evaluated in int width; the type of
 * tag_mask is not visible here - confirm this is wide enough for Addr.
 */
static void cachesim_initcache(cache_t config, cache_t2* c)
190
c->size = config.size;
191
c->assoc = config.assoc;
192
c->line_size = config.line_size;
193
c->sectored = False; // FIXME
195
c->sets = (c->size / c->line_size) / c->assoc;
196
c->sets_min_1 = c->sets - 1;
197
c->assoc_bits = VG_(log2)(c->assoc);
198
c->line_size_bits = VG_(log2)(c->line_size);
199
c->tag_shift = c->line_size_bits + VG_(log2)(c->sets);
200
c->tag_mask = ~((1<<c->tag_shift)-1);
202
/* Can bits in tag entries be used for flags?
203
* Should be always true as MIN_LINE_SIZE >= 16 */
204
CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);
207
VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
208
c->size, c->line_size,
209
c->sectored ? ", sectored":"");
211
VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
212
c->size, c->line_size, c->assoc,
213
c->sectored ? ", sectored":"");
216
c->tags = (UWord*) CLG_MALLOC(sizeof(UWord) * c->sets * c->assoc);
217
if (clo_collect_cacheuse)
218
cacheuse_initcache(c);
221
cachesim_clearcache(c);
226
static void print_cache(cache_t2* c)
230
/* Note initialisation and update of 'i'. */
231
for (i = 0, set = 0; set < c->sets; set++) {
232
for (way = 0; way < c->assoc; way++, i++) {
233
VG_(printf)("%8x ", c->tags[i]);
241
/*------------------------------------------------------------*/
242
/*--- Write Through Cache Simulation ---*/
243
/*------------------------------------------------------------*/
246
* Simple model: L1 & L2 Write Through
247
* Does not distinguish among read and write references
249
* Simulator functions:
250
* CacheModelResult cachesim_I1_ref(Addr a, UChar size)
251
* CacheModelResult cachesim_D1_ref(Addr a, UChar size)
255
/*
 * Look up `tag` in set `set_no` of cache `c` (write-through model).
 *
 * The set's entries start at c->tags[set_no << c->assoc_bits].  According
 * to the comments below, entry 0 is the most recently used one: a hit on
 * another entry moves it to the MRU position and shuffles the rest down,
 * and a miss installs the tag as MRU, again shuffling the rest down.
 *
 * NOTE(review): the comparisons, assignments and return statements of
 * this function are not visible in this listing.
 */
CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
260
/* Shifting is a bit faster than multiplying */
261
set = &(c->tags[set_no << c->assoc_bits]);
263
/* This loop is unrolled for just the first case, which is the most */
264
/* common. We can't unroll any further because it would screw up */
265
/* if we have a direct-mapped (1-way) cache. */
269
/* If the tag is one other than the MRU, move it into the MRU spot */
270
/* and shuffle the rest down. */
271
for (i = 1; i < c->assoc; i++) {
273
for (j = i; j > 0; j--) {
281
/* A miss; install this tag as MRU, shuffle rest down. */
282
for (j = c->assoc - 1; j > 0; j--) {
290
static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
292
UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
293
UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
294
UWord tag = a >> c->tag_shift;
296
/* Access entirely within line. */
298
return cachesim_setref(c, set1, tag);
300
/* Access straddles two lines. */
301
/* Nb: this is a fast way of doing ((set1+1) % c->sets) */
302
else if (((set1 + 1) & (c->sets-1)) == set2) {
304
/* the call updates cache structures as side effect */
305
CacheResult res1 = cachesim_setref(c, set1, tag);
306
CacheResult res2 = cachesim_setref(c, set2, tag);
307
return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
310
VG_(printf)("addr: %x size: %u sets: %d %d", a, size, set1, set2);
311
VG_(tool_panic)("item straddles more than two cache sets");
317
/*
 * Simple (write-through) model, instruction fetch of `size` bytes at `a`.
 * References I1 first and returns L1_Hit on a hit; otherwise references
 * L2 and returns L2_Hit on a hit.  The result for a miss in both levels
 * is not visible in this listing.
 */
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
319
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
320
if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
325
/*
 * Simple (write-through) model, data access of `size` bytes at `a`.
 * References D1 first and returns L1_Hit on a hit; otherwise references
 * L2 and returns L2_Hit on a hit.  The result for a miss in both levels
 * is not visible in this listing.
 */
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
327
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
328
if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
333
/*------------------------------------------------------------*/
334
/*--- Write Back Cache Simulation ---*/
335
/*------------------------------------------------------------*/
338
* More complex model: L1 Write-through, L2 Write-back
339
* This needs to distinguish among read and write references.
341
* Simulator functions:
342
* CacheModelResult cachesim_I1_Read(Addr a, UChar size)
343
* CacheModelResult cachesim_D1_Read(Addr a, UChar size)
344
* CacheModelResult cachesim_D1_Write(Addr a, UChar size)
348
* With write-back, result can be a miss evicting a dirty line
349
* The dirty state of a cache line is stored in Bit0 of the tag for
350
* this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
351
* type (Read/Write), the line gets dirty on a write.
354
/*
 * Look up `tag` in set `set_no` of cache `c` (write-back model).
 *
 * Stored entries carry the dirty flag in bit 0 (CACHELINE_DIRTY), so
 * comparisons mask that bit off.  On a hit in a way other than the MRU,
 * `ref` is OR'ed into the stored value; since Write == CACHELINE_DIRTY
 * this marks the line dirty on a write and leaves it unchanged on a read.
 * On a miss the least recently used entry (last way of the set) is
 * evicted; the result is MissDirty if that entry had the dirty flag set,
 * Miss otherwise.
 *
 * NOTE(review): the hit returns and the bodies of the shuffle loops are
 * not visible in this listing.
 */
CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
359
/* Shifting is a bit faster than multiplying */
360
set = &(c->tags[set_no << c->assoc_bits]);
362
/* This loop is unrolled for just the first case, which is the most */
363
/* common. We can't unroll any further because it would screw up */
364
/* if we have a direct-mapped (1-way) cache. */
365
if (tag == (set[0] & ~CACHELINE_DIRTY)) {
369
/* If the tag is one other than the MRU, move it into the MRU spot */
370
/* and shuffle the rest down. */
371
for (i = 1; i < c->assoc; i++) {
372
if (tag == (set[i] & ~CACHELINE_DIRTY)) {
373
tmp_tag = set[i] | ref; // update dirty flag
374
for (j = i; j > 0; j--) {
382
/* A miss; install this tag as MRU, shuffle rest down. */
383
tmp_tag = set[c->assoc - 1];
384
for (j = c->assoc - 1; j > 0; j--) {
389
return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
394
CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
396
UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
397
UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
398
UWord tag = a & c->tag_mask;
400
/* Access entirely within line. */
402
return cachesim_setref_wb(c, ref, set1, tag);
404
/* Access straddles two lines. */
405
/* Nb: this is a fast way of doing ((set1+1) % c->sets) */
406
else if (((set1 + 1) & (c->sets-1)) == set2) {
408
/* the call updates cache structures as side effect */
409
CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
410
CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag);
412
if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
413
return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
416
VG_(printf)("addr: %x size: %u sets: %d %d", a, size, set1, set2);
417
VG_(tool_panic)("item straddles more than two cache sets");
424
/*
 * Write-back model, instruction fetch of `size` bytes at `a`.
 * I1 is referenced with the plain (write-through) lookup; an I1 hit
 * returns L1_Hit.  Otherwise L2 is referenced as a Read through the
 * write-back lookup: Hit gives L2_Hit, Miss gives MemAccess.
 * NOTE(review): the label leading to `return WriteBackMemAccess` is not
 * visible in this listing; presumably it is the MissDirty case.
 */
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
426
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
427
switch( cachesim_ref_wb( &L2, Read, a, size) ) {
428
case Hit: return L2_Hit;
429
case Miss: return MemAccess;
432
return WriteBackMemAccess;
436
/*
 * Write-back model, data read of `size` bytes at `a`.
 * D1 is referenced with the plain (write-through) lookup; a D1 hit
 * returns L1_Hit.  Otherwise L2 is referenced as a Read through the
 * write-back lookup: Hit gives L2_Hit, Miss gives MemAccess.
 * NOTE(review): the label leading to `return WriteBackMemAccess` is not
 * visible in this listing; presumably it is the MissDirty case.
 */
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
438
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
439
switch( cachesim_ref_wb( &L2, Read, a, size) ) {
440
case Hit: return L2_Hit;
441
case Miss: return MemAccess;
444
return WriteBackMemAccess;
448
CacheModelResult cachesim_D1_Write(Addr a, UChar size)
450
if ( cachesim_ref( &D1, a, size) == Hit ) {
451
/* Even for an L1 hit, the write-through L1 passes
452
* the write to the L2 to make the L2 line dirty.
453
* But this causes no latency, so return the hit.
455
cachesim_ref_wb( &L2, Write, a, size);
458
switch( cachesim_ref_wb( &L2, Write, a, size) ) {
459
case Hit: return L2_Hit;
460
case Miss: return MemAccess;
463
return WriteBackMemAccess;
467
/*------------------------------------------------------------*/
468
/*--- Hardware Prefetch Simulation ---*/
469
/*------------------------------------------------------------*/
471
static ULong prefetch_up = 0;
472
static ULong prefetch_down = 0;
475
#define PF_PAGEBITS 12
477
static UInt pf_lastblock[PF_STREAMS];
478
static Int pf_seqblocks[PF_STREAMS];
481
/*
 * Reset the hardware-prefetch detector: for every one of the PF_STREAMS
 * streams, forget the last block seen and the length of the current
 * sequential run.
 */
void prefetch_clear(void)
484
for(i=0;i<PF_STREAMS;i++)
485
pf_lastblock[i] = pf_seqblocks[i] = 0;
489
* HW Prefetch emulation
490
* Start prefetching when detecting sequential access to 3 memory blocks.
491
* One stream can be detected per 4k page.
494
/*
 * Feed address `a` to the hardware-prefetch detector and, if a
 * sequential stream is detected, touch the prefetched line in L2.
 *
 * The stream slot is chosen from the address bits above PF_PAGEBITS,
 * modulo PF_STREAMS.  `block` is the L2 line number of `a`.  Per stream,
 * pf_seqblocks counts consecutive neighbouring blocks: positive values
 * for ascending runs, negative for descending runs, 0 for no run.  A
 * block that does not continue the current run resets the counter.  Once
 * a run reaches 2 (or -2), a 1-byte reference is made to the line five
 * L2 line sizes ahead of (or behind) `a`.  The last block seen is
 * remembered whenever the block changes.
 *
 * `size` is not used by any statement visible in this listing.
 *
 * NOTE(review): the prefetch always goes through cachesim_ref(), whose
 * tag is `a >> tag_shift`, while the write-back model accesses L2 through
 * cachesim_ref_wb(), whose tag is `a & tag_mask`.  When both prefetch and
 * write-back simulation are active the two encodings look incompatible -
 * confirm against the callers.
 */
void prefetch_L2_doref(Addr a, UChar size)
496
UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
497
UInt block = ( a >> L2.line_size_bits);
499
if (block != pf_lastblock[stream]) {
500
if (pf_seqblocks[stream] == 0) {
501
if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
502
else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
504
else if (pf_seqblocks[stream] >0) {
505
if (pf_lastblock[stream] +1 == block) {
506
pf_seqblocks[stream]++;
507
if (pf_seqblocks[stream] >= 2) {
509
cachesim_ref(&L2, a + 5 * L2.line_size,1);
512
else pf_seqblocks[stream] = 0;
514
else if (pf_seqblocks[stream] <0) {
515
if (pf_lastblock[stream] -1 == block) {
516
pf_seqblocks[stream]--;
517
if (pf_seqblocks[stream] <= -2) {
519
cachesim_ref(&L2, a - 5 * L2.line_size,1);
522
else pf_seqblocks[stream] = 0;
524
pf_lastblock[stream] = block;
528
/* simple model with hardware prefetch */
531
/*
 * Simple model with hardware prefetch, instruction fetch.
 * An I1 hit returns L1_Hit without involving the prefetcher.  On an I1
 * miss the address is first fed to the prefetch detector, then L2 is
 * referenced; an L2 hit returns L2_Hit.  The result for an L2 miss is
 * not visible in this listing.
 */
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
533
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
534
prefetch_L2_doref(a,size);
535
if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
540
/*
 * Simple model with hardware prefetch, data access.
 * A D1 hit returns L1_Hit without involving the prefetcher.  On a D1
 * miss the address is first fed to the prefetch detector, then L2 is
 * referenced; an L2 hit returns L2_Hit.  The result for an L2 miss is
 * not visible in this listing.
 */
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
542
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
543
prefetch_L2_doref(a,size);
544
if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
549
/* complex model with hardware prefetch */
552
/*
 * Write-back model with hardware prefetch, instruction fetch.
 * An I1 hit returns L1_Hit.  On an I1 miss the address is fed to the
 * prefetch detector and L2 is referenced as a Read through the
 * write-back lookup: Hit gives L2_Hit, Miss gives MemAccess.
 * NOTE(review): the label leading to `return WriteBackMemAccess` is not
 * visible in this listing; presumably it is the MissDirty case.
 */
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
554
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
555
prefetch_L2_doref(a,size);
556
switch( cachesim_ref_wb( &L2, Read, a, size) ) {
557
case Hit: return L2_Hit;
558
case Miss: return MemAccess;
561
return WriteBackMemAccess;
565
/*
 * Write-back model with hardware prefetch, data read.
 * A D1 hit returns L1_Hit.  On a D1 miss the address is fed to the
 * prefetch detector and L2 is referenced as a Read through the
 * write-back lookup: Hit gives L2_Hit, Miss gives MemAccess.
 * NOTE(review): the label leading to `return WriteBackMemAccess` is not
 * visible in this listing; presumably it is the MissDirty case.
 */
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
567
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
568
prefetch_L2_doref(a,size);
569
switch( cachesim_ref_wb( &L2, Read, a, size) ) {
570
case Hit: return L2_Hit;
571
case Miss: return MemAccess;
574
return WriteBackMemAccess;
578
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
580
prefetch_L2_doref(a,size);
581
if ( cachesim_ref( &D1, a, size) == Hit ) {
582
/* Even for an L1 hit, the write-through L1 passes
583
* the write to the L2 to make the L2 line dirty.
584
* But this causes no latency, so return the hit.
586
cachesim_ref_wb( &L2, Write, a, size);
589
switch( cachesim_ref_wb( &L2, Write, a, size) ) {
590
case Hit: return L2_Hit;
591
case Miss: return MemAccess;
594
return WriteBackMemAccess;
598
/*------------------------------------------------------------*/
599
/*--- Cache Simulation with use metric collection ---*/
600
/*------------------------------------------------------------*/
602
/* Cache use collection cannot be combined with write-back or prefetch simulation */
605
void cacheuse_initcache(cache_t2* c)
608
unsigned int start_mask, start_val;
609
unsigned int end_mask, end_val;
611
c->use = CLG_MALLOC(sizeof(line_use) * c->sets * c->assoc);
612
c->loaded = CLG_MALLOC(sizeof(line_loaded) * c->sets * c->assoc);
613
c->line_start_mask = CLG_MALLOC(sizeof(int) * c->line_size);
614
c->line_end_mask = CLG_MALLOC(sizeof(int) * c->line_size);
617
c->line_size_mask = c->line_size-1;
619
/* Meaning of line_start_mask/line_end_mask
620
* Example: for a given cache line, you get an access starting at
621
* byte offset 5, length 4, i.e. bytes 5 - 8 are touched. For a cache
622
* line size of 32, you have 1 bit per byte in the mask:
624
* bit31 bit8 bit5 bit 0
626
* 11..111111100000 line_start_mask[5]
627
* 00..000111111111 line_end_mask[(5+4)-1]
629
* use_mask |= line_start_mask[5] & line_end_mask[8]
632
start_val = end_val = ~0;
633
if (c->line_size < 32) {
634
int bits_per_byte = 32/c->line_size;
635
start_mask = (1<<bits_per_byte)-1;
636
end_mask = start_mask << (32-bits_per_byte);
637
for(i=0;i<c->line_size;i++) {
638
c->line_start_mask[i] = start_val;
639
start_val = start_val & ~start_mask;
640
start_mask = start_mask << bits_per_byte;
642
c->line_end_mask[c->line_size-i-1] = end_val;
643
end_val = end_val & ~end_mask;
644
end_mask = end_mask >> bits_per_byte;
648
int bytes_per_bit = c->line_size/32;
651
for(i=0;i<c->line_size;i++) {
652
c->line_start_mask[i] = start_val;
653
c->line_end_mask[c->line_size-i-1] = end_val;
654
if ( ((i+1)%bytes_per_bit) == 0) {
655
start_val &= ~start_mask;
656
end_val &= ~end_mask;
663
CLG_DEBUG(6, "Config %s:\n", c->desc_line);
664
for(i=0;i<c->line_size;i++) {
665
CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
666
i, c->line_start_mask[i], c->line_end_mask[i]);
669
/* We use lower tag bits as offset pointers to cache use info.
670
* I.e. some cache parameters don't work.
672
if (c->tag_shift < c->assoc_bits) {
673
VG_(message)(Vg_DebugMsg,
674
"error: Use associativity < %d for cache use statistics!",
676
VG_(tool_panic)("Unsupported cache configuration");
680
/* FIXME: A little tricky */
684
/*
 * Record a hit on a cache line for the cache-use statistics.
 *
 * The line's index into c->use / c->loaded is built from `high_idx`
 * shifted left by c->assoc_bits, OR'ed with `low_idx`.  The line's access
 * count is incremented and `use_mask` is merged into its mask of touched
 * parts; the result is reported at debug level 6.
 */
void cacheuse_update_hit(cache_t2* c, UInt high_idx, UInt low_idx, UInt use_mask)
686
int idx = (high_idx << c->assoc_bits) | low_idx;
688
c->use[idx].count ++;
689
c->use[idx].mask |= use_mask;
691
CLG_DEBUG(6," Hit [idx %d] (line %p from %p): %x => %08x, count %d\n",
692
idx, c->loaded[idx].memline, c->loaded[idx].iaddr,
693
use_mask, c->use[idx].mask, c->use[idx].count);
696
/* only used for I1, D1 */
699
CacheResult cacheuse_setref(cache_t2* c, UInt set_no, UWord tag)
705
/* Shifting is a bit faster than multiplying */
706
set = &(c->tags[set_no << c->assoc_bits]);
708
c->line_start_mask[a & c->line_size_mask] &
709
c->line_end_mask[(a+size-1) & c->line_size_mask];
711
/* This loop is unrolled for just the first case, which is the most */
712
/* common. We can't unroll any further because it would screw up */
713
/* if we have a direct-mapped (1-way) cache. */
714
if (tag == (set[0] & c->tag_mask)) {
715
cacheuse_update(c, set_no, set[0] & ~c->tag_mask, use_mask);
719
/* If the tag is one other than the MRU, move it into the MRU spot */
720
/* and shuffle the rest down. */
721
for (i = 1; i < c->assoc; i++) {
722
if (tag == (set[i] & c->tag_mask)) {
724
for (j = i; j > 0; j--) {
729
cacheuse_update(c, set_no, tmp_tag & ~c->tag_mask, use_mask);
734
/* A miss; install this tag as MRU, shuffle rest down. */
735
tmp_tag = set[L.assoc - 1] & ~c->tag_mask;
736
for (j = c->assoc - 1; j > 0; j--) {
739
set[0] = tag | tmp_tag;
741
cacheuse_L2_miss(c, (set_no << c->assoc_bits) | tmp_tag,
742
use_mask, a & ~c->line_size_mask);
748
static CacheResult cacheuse_ref(cache_t2* c, Addr a, UChar size)
750
UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
751
UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
752
UWord tag = a >> c->tag_shift;
754
/* Access entirely within line. */
756
return cacheuse_setref(c, set1, tag);
758
/* Access straddles two lines. */
759
/* Nb: this is a fast way of doing ((set1+1) % c->sets) */
760
else if (((set1 + 1) & (c->sets-1)) == set2) {
762
/* the call updates cache structures as side effect */
763
CacheResult res1 = cacheuse_isMiss(c, set1, tag);
764
CacheResult res2 = cacheuse_isMiss(c, set2, tag);
765
return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
768
VG_(printf)("addr: %x size: %u sets: %d %d", a, size, set1, set2);
769
VG_(tool_panic)("item straddles more than two cache sets");
776
/* for I1/D1 caches */
777
#define CACHEUSE(L) \
779
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
781
register UInt set1 = ( a >> L.line_size_bits) & (L.sets_min_1); \
782
register UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \
783
register UWord tag = a & L.tag_mask; \
785
UWord *set, tmp_tag; \
788
CLG_DEBUG(6,"%s.Acc(Addr %p, size %d): Sets [%d/%d]\n", \
789
L.name, a, size, set1, set2); \
791
/* First case: word entirely within line. */ \
792
if (set1 == set2) { \
794
/* Shifting is a bit faster than multiplying */ \
795
set = &(L.tags[set1 << L.assoc_bits]); \
796
use_mask = L.line_start_mask[a & L.line_size_mask] & \
797
L.line_end_mask[(a+size-1) & L.line_size_mask]; \
799
/* This loop is unrolled for just the first case, which is the most */\
800
/* common. We can't unroll any further because it would screw up */\
801
/* if we have a direct-mapped (1-way) cache. */\
802
if (tag == (set[0] & L.tag_mask)) { \
803
idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
804
L.use[idx].count ++; \
805
L.use[idx].mask |= use_mask; \
806
CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): %x => %08x, count %d\n",\
807
idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
808
use_mask, L.use[idx].mask, L.use[idx].count); \
811
/* If the tag is one other than the MRU, move it into the MRU spot */\
812
/* and shuffle the rest down. */\
813
for (i = 1; i < L.assoc; i++) { \
814
if (tag == (set[i] & L.tag_mask)) { \
816
for (j = i; j > 0; j--) { \
817
set[j] = set[j - 1]; \
820
idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
821
L.use[idx].count ++; \
822
L.use[idx].mask |= use_mask; \
823
CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): %x => %08x, count %d\n",\
824
i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
825
use_mask, L.use[idx].mask, L.use[idx].count); \
830
/* A miss; install this tag as MRU, shuffle rest down. */ \
831
tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
832
for (j = L.assoc - 1; j > 0; j--) { \
833
set[j] = set[j - 1]; \
835
set[0] = tag | tmp_tag; \
836
idx = (set1 << L.assoc_bits) | tmp_tag; \
837
return update_##L##_use(&L, idx, \
838
use_mask, a &~ L.line_size_mask); \
840
/* Second case: word straddles two lines. */ \
841
/* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
842
} else if (((set1 + 1) & (L.sets-1)) == set2) { \
843
Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */ \
844
set = &(L.tags[set1 << L.assoc_bits]); \
845
use_mask = L.line_start_mask[a & L.line_size_mask]; \
846
if (tag == (set[0] & L.tag_mask)) { \
847
idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
848
L.use[idx].count ++; \
849
L.use[idx].mask |= use_mask; \
850
CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): %x => %08x, count %d\n",\
851
idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
852
use_mask, L.use[idx].mask, L.use[idx].count); \
855
for (i = 1; i < L.assoc; i++) { \
856
if (tag == (set[i] & L.tag_mask)) { \
858
for (j = i; j > 0; j--) { \
859
set[j] = set[j - 1]; \
862
idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
863
L.use[idx].count ++; \
864
L.use[idx].mask |= use_mask; \
865
CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): %x => %08x, count %d\n",\
866
i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
867
use_mask, L.use[idx].mask, L.use[idx].count); \
871
tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
872
for (j = L.assoc - 1; j > 0; j--) { \
873
set[j] = set[j - 1]; \
875
set[0] = tag | tmp_tag; \
876
idx = (set1 << L.assoc_bits) | tmp_tag; \
877
miss1 = update_##L##_use(&L, idx, \
878
use_mask, a &~ L.line_size_mask); \
880
set = &(L.tags[set2 << L.assoc_bits]); \
881
use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \
882
if (tag == (set[0] & L.tag_mask)) { \
883
idx = (set2 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
884
L.use[idx].count ++; \
885
L.use[idx].mask |= use_mask; \
886
CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): %x => %08x, count %d\n",\
887
idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
888
use_mask, L.use[idx].mask, L.use[idx].count); \
891
for (i = 1; i < L.assoc; i++) { \
892
if (tag == (set[i] & L.tag_mask)) { \
894
for (j = i; j > 0; j--) { \
895
set[j] = set[j - 1]; \
898
idx = (set2 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
899
L.use[idx].count ++; \
900
L.use[idx].mask |= use_mask; \
901
CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): %x => %08x, count %d\n",\
902
i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
903
use_mask, L.use[idx].mask, L.use[idx].count); \
907
tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
908
for (j = L.assoc - 1; j > 0; j--) { \
909
set[j] = set[j - 1]; \
911
set[0] = tag | tmp_tag; \
912
idx = (set2 << L.assoc_bits) | tmp_tag; \
913
miss2 = update_##L##_use(&L, idx, \
914
use_mask, (a+size-1) &~ L.line_size_mask); \
915
return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit; \
918
VG_(printf)("addr: %p size: %u sets: %d %d", a, size, set1, set2); \
919
VG_(tool_panic)("item straddles more than two cache sets"); \
925
/* logarithmic bitcounting algorithm, see
926
* http://graphics.stanford.edu/~seander/bithacks.html
928
/*
 * Count the bits set in a 32-bit value by summing neighbouring bit
 * fields in parallel.  Step k adds each pair of adjacent S[k]-bit wide
 * fields, using B[k] to select the lower field of every pair; after the
 * five steps (field widths 1, 2, 4, 8, 16) the whole word holds the
 * total.
 *
 * NOTE(review): the statement that seeds `c` from `bits` and the return
 * statement are not visible in this listing.
 */
static __inline__ unsigned int countBits(unsigned int bits)
930
unsigned int c; // store the total here
931
const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
932
const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};
935
c = ((c >> S[0]) & B[0]) + (c & B[0]);
936
c = ((c >> S[1]) & B[1]) + (c & B[1]);
937
c = ((c >> S[2]) & B[2]) + (c & B[2]);
938
c = ((c >> S[3]) & B[3]) + (c & B[3]);
939
c = ((c >> S[4]) & B[4]) + (c & B[4]);
943
/*
 * Account for the L2 line at index `idx` being replaced, then record
 * who loads the new line `memline`.
 *
 * `i` is the part of the old line that was never touched: the number of
 * clear bits in the 32-bit use mask, scaled by line_size / 32.  When
 * event collection is on and the old line has a cost array recorded
 * (use_base), that array is charged 1000 / use->count at off_L2_AcCost
 * and `i` at off_L2_SpLoss.
 *
 * Afterwards the line's load record is overwritten with the new memory
 * line, the address of the current instruction, and the cost array to
 * charge when this line is evicted in turn: the skipped-function costs
 * if a non-skipped context is active, else the current instruction's
 * cost slot.
 *
 * NOTE(review): `1000 / use->count` divides by zero when count is 0.
 * The UPDATE_USE macro guards the same expression with `use->count>0`;
 * the matching guard for this function is not visible in this listing -
 * confirm it exists.
 */
static void update_L2_use(int idx, Addr memline)
945
line_loaded* loaded = &(L2.loaded[idx]);
946
line_use* use = &(L2.use[idx]);
947
int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;
949
CLG_DEBUG(2, " L2.miss [%d]: at %p accessing memline %p\n",
950
idx, bb_base + current_ii->instr_offset, memline);
952
CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %p from %p]\n",
953
use->count, i, use->mask, loaded->memline, loaded->iaddr);
954
CLG_DEBUG(2, " collect: %d, use_base %p\n",
955
CLG_(current_state).collect, loaded->use_base);
957
if (CLG_(current_state).collect && loaded->use_base) {
958
(loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
959
(loaded->use_base)[off_L2_SpLoss] += i;
966
loaded->memline = memline;
967
loaded->iaddr = bb_base + current_ii->instr_offset;
968
loaded->use_base = (CLG_(current_state).nonskipped) ?
969
CLG_(current_state).nonskipped->skipped :
970
cost_base + current_ii->cost_offset;
974
/*
 * Cache-use model: access memory line `memline` in L2 on behalf of an
 * L1 line whose load record is `l1_loaded`.
 *
 * The set is selected from the line number bits and the tag is the
 * masked address.  Each stored tag entry keeps, in the bits below
 * tag_mask, the index of its use/loaded record within the set.  On a hit
 * (MRU first, then the other ways) and on a miss alike, the L1 load
 * record's dep_use is pointed at the use record of the L2 line holding
 * the data.  On a miss the least recently used entry is replaced: its
 * record index is reused for the new tag, which is installed as MRU, and
 * update_L2_use() accounts for the evicted line.
 *
 * NOTE(review): the return statements, the saving of the hit entry into
 * tmp_tag, and the rest of the hit-path shuffle are not visible in this
 * listing.
 */
CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
976
UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
977
UWord* set = &(L2.tags[setNo << L2.assoc_bits]);
978
UWord tag = memline & L2.tag_mask;
983
CLG_DEBUG(6,"L2.Acc(Memline %p): Set %d\n", memline, setNo);
985
if (tag == (set[0] & L2.tag_mask)) {
986
idx = (setNo << L2.assoc_bits) | (set[0] & ~L2.tag_mask);
987
l1_loaded->dep_use = &(L2.use[idx]);
989
CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): => %08x, count %d\n",
990
idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
991
L2.use[idx].mask, L2.use[idx].count);
994
for (i = 1; i < L2.assoc; i++) {
995
if (tag == (set[i] & L2.tag_mask)) {
997
for (j = i; j > 0; j--) {
1001
idx = (setNo << L2.assoc_bits) | (tmp_tag & ~L2.tag_mask);
1002
l1_loaded->dep_use = &(L2.use[idx]);
1004
CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): => %08x, count %d\n",
1005
i, idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
1006
L2.use[idx].mask, L2.use[idx].count);
1011
/* A miss; install this tag as MRU, shuffle rest down. */
1012
tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;
1013
for (j = L2.assoc - 1; j > 0; j--) {
1014
set[j] = set[j - 1];
1016
set[0] = tag | tmp_tag;
1017
idx = (setNo << L2.assoc_bits) | tmp_tag;
1018
l1_loaded->dep_use = &(L2.use[idx]);
1020
update_L2_use(idx, memline);
1028
#define UPDATE_USE(L) \
1030
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
1031
UInt mask, Addr memline) \
1033
line_loaded* loaded = &(cache->loaded[idx]); \
1034
line_use* use = &(cache->use[idx]); \
1035
int c = ((32 - countBits(use->mask)) * cache->line_size)>>5; \
1037
CLG_DEBUG(2, " %s.miss [%d]: at %p accessing memline %p (mask %08x)\n", \
1038
cache->name, idx, bb_base + current_ii->instr_offset, memline, mask); \
1039
if (use->count>0) { \
1040
CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %p from %p]\n",\
1041
use->count, c, use->mask, loaded->memline, loaded->iaddr); \
1042
CLG_DEBUG(2, " collect: %d, use_base %p\n", \
1043
CLG_(current_state).collect, loaded->use_base); \
1045
if (CLG_(current_state).collect && loaded->use_base) { \
1046
(loaded->use_base)[off_##L##_AcCost] += 1000 / use->count; \
1047
(loaded->use_base)[off_##L##_SpLoss] += c; \
1049
/* FIXME (?): L1/L2 line sizes must be equal ! */ \
1050
loaded->dep_use->mask |= use->mask; \
1051
loaded->dep_use->count += use->count; \
1057
loaded->memline = memline; \
1058
loaded->iaddr = bb_base + current_ii->instr_offset; \
1059
loaded->use_base = (CLG_(current_state).nonskipped) ? \
1060
CLG_(current_state).nonskipped->skipped : \
1061
cost_base + current_ii->cost_offset; \
1063
if (memline == 0) return L2_Hit; \
1064
return cacheuse_L2_access(memline, loaded); \
1075
/*
 * Flush the pending cache-use statistics of all lines still resident in
 * I1, D1 and L2.
 *
 * Does nothing when event collection is off.  Otherwise every line that
 * has a cost array recorded (use_base) is passed to the matching update
 * function with a zero mask and memline 0.  For I1 and D1, memline 0
 * makes update_<L>_use return right after the accounting, without
 * accessing L2.
 *
 * NOTE(review): how the zeroed local `ii` is used is not visible in this
 * listing.
 */
void cacheuse_finish(void)
1078
InstrInfo ii = { 0,0,0,0,0 };
1080
if (!CLG_(current_state).collect) return;
1086
/* update usage counters */
1088
for (i = 0; i < I1.sets * I1.assoc; i++)
1089
if (I1.loaded[i].use_base)
1090
update_I1_use( &I1, i, 0,0);
1093
for (i = 0; i < D1.sets * D1.assoc; i++)
1094
if (D1.loaded[i].use_base)
1095
update_D1_use( &D1, i, 0,0);
1098
for (i = 0; i < L2.sets * L2.assoc; i++)
1099
if (L2.loaded[i].use_base)
1100
update_L2_use(i, 0);
1105
/*------------------------------------------------------------*/
1106
/*--- Helper functions called by instrumented code ---*/
1107
/*------------------------------------------------------------*/
1111
void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
1114
case WriteBackMemAccess:
1115
if (clo_simulate_writeback) {
1139
/*
 * Helper for an instruction with one instruction fetch and no data
 * access.
 *
 * Runs the fetch (address bb_base + ii->instr_offset, ii->instr_size
 * bytes) through the configured I1_Read simulator function.  When event
 * collection is on, the result is charged through inc_costs() to two
 * places: the Ir slot of either the skipped-function cost array (when a
 * non-skipped context is active) or this instruction's own cost slot,
 * and the Ir slot of the global current-state cost.
 *
 * NOTE(review): the `else` between the two cost_Ir assignments and the
 * declaration of cost_Ir are not visible in this listing.
 */
static void log_1I0D(InstrInfo* ii)
1141
CacheModelResult IrRes;
1144
IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1146
CLG_DEBUG(6, "log_1I0D: Ir=%p/%u => Ir %d\n",
1147
bb_base + ii->instr_offset, ii->instr_size, IrRes);
1149
if (CLG_(current_state).collect) {
1152
if (CLG_(current_state).nonskipped)
1153
cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
1155
cost_Ir = cost_base + ii->cost_offset + off_D0_Ir;
1157
inc_costs(IrRes, cost_Ir,
1158
CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1163
/* Instruction doing a read access */
1166
/*
 * Helper for an instruction with one instruction fetch and one data
 * read at address `data` (ii->data_size bytes).
 *
 * Runs the fetch through I1_Read and the data access through D1_Read.
 * When event collection is on, both results are charged through
 * inc_costs(): to the Ir/Dr slots of either the skipped-function cost
 * array (when a non-skipped context is active) or this instruction's own
 * cost slots, and to the Ir/Dr slots of the global current-state cost.
 *
 * NOTE(review): the `else` separating the two pairs of assignments is
 * not visible in this listing.
 */
static void log_1I1Dr(InstrInfo* ii, Addr data)
1168
CacheModelResult IrRes, DrRes;
1171
IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1172
DrRes = (*simulator.D1_Read)(data, ii->data_size);
1174
CLG_DEBUG(6, "log_1I1Dr: Ir=%p/%u, Dr=%p/%u => Ir %d, Dr %d\n",
1175
bb_base + ii->instr_offset, ii->instr_size,
1176
data, ii->data_size, IrRes, DrRes);
1178
if (CLG_(current_state).collect) {
1179
ULong *cost_Ir, *cost_Dr;
1181
if (CLG_(current_state).nonskipped) {
1182
cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
1183
cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
1186
cost_Ir = cost_base + ii->cost_offset + off_D1r_Ir;
1187
cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
1190
inc_costs(IrRes, cost_Ir,
1191
CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1192
inc_costs(DrRes, cost_Dr,
1193
CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1199
/*
 * Helper for a data read at address `data` (ii->data_size bytes) with no
 * instruction fetch of its own.
 *
 * Runs the access through D1_Read.  When event collection is on, the
 * result is charged through inc_costs() to the Dr slot of either the
 * skipped-function cost array (when a non-skipped context is active) or
 * this instruction's own cost slot, and to the Dr slot of the global
 * current-state cost.
 *
 * NOTE(review): the `else` between the two cost_Dr assignments and the
 * declaration of cost_Dr are not visible in this listing.
 */
static void log_0I1Dr(InstrInfo* ii, Addr data)
1201
CacheModelResult DrRes;
1204
DrRes = (*simulator.D1_Read)(data, ii->data_size);
1206
CLG_DEBUG(6, "log_0I1Dr: Dr=%p/%u => Dr %d\n",
1207
data, ii->data_size, DrRes);
1209
if (CLG_(current_state).collect) {
1212
if (CLG_(current_state).nonskipped) {
1213
cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
1216
cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
1219
inc_costs(DrRes, cost_Dr,
1220
CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1225
/* Instruction doing a write access */
1228
/* Log one executed instruction that also performs a single data write.
 *
 * ii   - static info of the instruction (offsets, sizes, cost slot)
 * data - address of the data write
 *
 * The instruction fetch is sent through simulator.I1_Read and the data
 * access through simulator.D1_Write.  If collection is enabled, each result
 * is handed to inc_costs() with either the `skipped` array of
 * CLG_(current_state).nonskipped (when set) or this instruction's own slot
 * below cost_base, plus CLG_(current_state).cost.
 *
 * NOTE(review): restored from a listing with gaps; confirm that no
 * statement is missing ahead of the two cache lookups.
 */
static void log_1I1Dw(InstrInfo* ii, Addr data)
{
    CacheModelResult IrRes, DwRes;

    IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
    DwRes = (*simulator.D1_Write)(data, ii->data_size);

    CLG_DEBUG(6, "log_1I1Dw: Ir=%p/%u, Dw=%p/%u => Ir %d, Dw %d\n",
              bb_base + ii->instr_offset, ii->instr_size,
              data, ii->data_size, IrRes, DwRes);

    if (CLG_(current_state).collect) {
        ULong *cost_Ir, *cost_Dw;

        /* Alternatives: skipped-cost slot or per-instruction slot. */
        if (CLG_(current_state).nonskipped) {
            cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
            cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
        }
        else {
            cost_Ir = cost_base + ii->cost_offset + off_D1w_Ir;
            cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
        }

        inc_costs(IrRes, cost_Ir,
                  CLG_(current_state).cost + CLG_(sets).off_full_Ir );
        inc_costs(DwRes, cost_Dw,
                  CLG_(current_state).cost + CLG_(sets).off_full_Dw );
    }
}
1260
/* Log a single data write without an accompanying instruction fetch.
 *
 * ii   - static info of the instruction (data size, cost slot)
 * data - address of the data write
 *
 * The access is sent through simulator.D1_Write.  If collection is enabled,
 * the result is handed to inc_costs() with either the `skipped` array of
 * CLG_(current_state).nonskipped (when set) or this instruction's own slot
 * below cost_base, plus CLG_(current_state).cost.
 *
 * NOTE(review): restored from a listing with gaps; confirm that no
 * statement is missing ahead of the cache lookup.
 */
static void log_0I1Dw(InstrInfo* ii, Addr data)
{
    CacheModelResult DwRes;

    DwRes = (*simulator.D1_Write)(data, ii->data_size);

    CLG_DEBUG(6, "log_0I1Dw: Dw=%p/%u => Dw %d\n",
              data, ii->data_size, DwRes);

    if (CLG_(current_state).collect) {
        ULong *cost_Dw;

        /* Alternatives: skipped-cost slot or per-instruction slot. */
        if (CLG_(current_state).nonskipped) {
            cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw;
        }
        else {
            cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
        }

        inc_costs(DwRes, cost_Dw,
                  CLG_(current_state).cost + CLG_(sets).off_full_Dw );
    }
}
1285
/* Instruction doing a read and a write access */
1288
/* Log one executed instruction that performs a data read and a data write.
 *
 * ii    - static info of the instruction (offsets, sizes, cost slot)
 * data1 - address of the read
 * data2 - address of the write
 *
 * The fetch goes through simulator.I1_Read, the read through
 * simulator.D1_Read and the write through simulator.D1_Write; both data
 * accesses use ii->data_size.  If collection is enabled, each result is
 * handed to inc_costs() with either the `skipped` array of
 * CLG_(current_state).nonskipped (when set) or this instruction's own slot
 * below cost_base, plus CLG_(current_state).cost.
 *
 * NOTE(review): restored from a listing with gaps; confirm that no
 * statement is missing ahead of the cache lookups.
 */
static void log_1I2D(InstrInfo* ii, Addr data1, Addr data2)
{
    CacheModelResult IrRes, DrRes, DwRes;

    IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
    DrRes = (*simulator.D1_Read)(data1, ii->data_size);
    DwRes = (*simulator.D1_Write)(data2, ii->data_size);

    CLG_DEBUG(6,
              "log_1I2D: Ir=%p/%u, Dr=%p/%u, Dw=%p/%u => Ir %d, Dr %d, Dw %d\n",
              bb_base + ii->instr_offset, ii->instr_size,
              data1, ii->data_size, data2, ii->data_size, IrRes, DrRes, DwRes);

    if (CLG_(current_state).collect) {
        ULong *cost_Ir, *cost_Dr, *cost_Dw;

        /* Alternatives: skipped-cost slot or per-instruction slot. */
        if (CLG_(current_state).nonskipped) {
            cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
            cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
            cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
        }
        else {
            cost_Ir = cost_base + ii->cost_offset + off_D2_Ir;
            cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
            cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
        }

        inc_costs(IrRes, cost_Ir,
                  CLG_(current_state).cost + CLG_(sets).off_full_Ir );
        inc_costs(DrRes, cost_Dr,
                  CLG_(current_state).cost + CLG_(sets).off_full_Dr );
        inc_costs(DwRes, cost_Dw,
                  CLG_(current_state).cost + CLG_(sets).off_full_Dw );
    }
}
1326
/* Log a data read plus a data write without an instruction fetch.
 *
 * ii    - static info of the instruction (data size, cost slot)
 * data1 - address of the read
 * data2 - address of the write
 *
 * The read goes through simulator.D1_Read and the write through
 * simulator.D1_Write; both use ii->data_size.  If collection is enabled,
 * each result is handed to inc_costs() with either the `skipped` array of
 * CLG_(current_state).nonskipped (when set) or this instruction's own slot
 * below cost_base, plus CLG_(current_state).cost.
 *
 * NOTE(review): restored from a listing with gaps; confirm that no
 * statement is missing ahead of the cache lookups.
 */
static void log_0I2D(InstrInfo* ii, Addr data1, Addr data2)
{
    CacheModelResult DrRes, DwRes;

    DrRes = (*simulator.D1_Read)(data1, ii->data_size);
    DwRes = (*simulator.D1_Write)(data2, ii->data_size);

    /* The label now carries this function's real name (was "log_0D2D"). */
    CLG_DEBUG(6,
              "log_0I2D: Dr=%p/%u, Dw=%p/%u => Dr %d, Dw %d\n",
              data1, ii->data_size, data2, ii->data_size, DrRes, DwRes);

    if (CLG_(current_state).collect) {
        ULong *cost_Dr, *cost_Dw;

        /* Alternatives: skipped-cost slot or per-instruction slot. */
        if (CLG_(current_state).nonskipped) {
            cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
            cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
        }
        else {
            cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
            cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
        }

        inc_costs(DrRes, cost_Dr,
                  CLG_(current_state).cost + CLG_(sets).off_full_Dr );
        inc_costs(DwRes, cost_Dw,
                  CLG_(current_state).cost + CLG_(sets).off_full_Dw );
    }
}
1358
/*------------------------------------------------------------*/
1359
/*--- Cache configuration ---*/
1360
/*------------------------------------------------------------*/
1362
#define UNDEFINED_CACHE ((cache_t) { -1, -1, -1 })
1364
static cache_t clo_I1_cache = UNDEFINED_CACHE;
1365
static cache_t clo_D1_cache = UNDEFINED_CACHE;
1366
static cache_t clo_L2_cache = UNDEFINED_CACHE;
1369
/* Checks that a cache configuration is usable.
 *
 * cache - configuration to validate (size, assoc, line_size are read)
 * name  - label used in the messages ("I1", "D1", "L2" at the call sites)
 *
 * Each failed check emits a Vg_UserMsg message whose text ends in
 * "aborting".
 * NOTE(review): the statement that performs the abort is not visible in
 * this listing; confirm it follows every message.
 */
1371
void check_cache(cache_t* cache, Char *name)
1373
/* First check they're all powers of two */
1374
/* VG_(log2) returning -1 is treated as "not a power of two". */
if (-1 == VG_(log2)(cache->size)) {
1375
VG_(message)(Vg_UserMsg,
1376
"error: %s size of %dB not a power of two; aborting.",
1381
if (-1 == VG_(log2)(cache->assoc)) {
1382
VG_(message)(Vg_UserMsg,
1383
"error: %s associativity of %d not a power of two; aborting.",
1384
name, cache->assoc);
1388
if (-1 == VG_(log2)(cache->line_size)) {
1389
VG_(message)(Vg_UserMsg,
1390
"error: %s line size of %dB not a power of two; aborting.",
1391
name, cache->line_size);
1395
// Then check line size >= 16 -- any smaller and a single instruction could
1396
// straddle three cache lines, which breaks a simulation assertion and is
1398
/* The lower bound actually enforced is MIN_LINE_SIZE. */
if (cache->line_size < MIN_LINE_SIZE) {
1399
VG_(message)(Vg_UserMsg,
1400
"error: %s line size of %dB too small; aborting.",
1401
name, cache->line_size);
1405
/* Then check cache size > line size (causes seg faults if not). */
1406
if (cache->size <= cache->line_size) {
1407
VG_(message)(Vg_UserMsg,
1408
"error: %s cache size of %dB <= line size of %dB; aborting.",
1409
name, cache->size, cache->line_size);
1413
/* Then check assoc <= (size / line size) (seg faults otherwise). */
1414
if (cache->assoc > (cache->size / cache->line_size)) {
1415
VG_(message)(Vg_UserMsg,
1416
"warning: %s associativity > (size / line size); aborting.", name);
1422
/* Determine the I1/D1/L2 configurations used for the simulation.
 *
 * I1c, D1c, L2c - out: configurations to use
 *
 * Steps: count how many caches were given on the command line, let
 * VG_(configure_caches) fill in all three, overwrite those given on the
 * command line, validate each with check_cache(), and print the result
 * when VG_(clo_verbosity) > 1.
 */
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
{
/* A cache counts as given as soon as one of its fields left the -1 marker. */
#define DEFINED(L) (-1 != (L).size || -1 != (L).assoc || -1 != (L).line_size)

   Int n_clos = 0;

   // Count how many were defined on the command line.
   if (DEFINED(clo_I1_cache)) { n_clos++; }
   if (DEFINED(clo_D1_cache)) { n_clos++; }
   if (DEFINED(clo_L2_cache)) { n_clos++; }

   // Set the cache config (using auto-detection, if supported).  The last
   // argument is true when all three caches came from the command line.
   VG_(configure_caches)( I1c, D1c, L2c, (3 == n_clos) );

   // Then replace with any defined on the command line.
   if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
   if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
   if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }

   // Then check values and fix if not acceptable.
   check_cache(I1c, "I1");
   check_cache(D1c, "D1");
   check_cache(L2c, "L2");

   if (VG_(clo_verbosity) > 1) {
      VG_(message)(Vg_UserMsg, "Cache configuration used:");
      VG_(message)(Vg_UserMsg, " I1: %dB, %d-way, %dB lines",
                   I1c->size, I1c->assoc, I1c->line_size);
      VG_(message)(Vg_UserMsg, " D1: %dB, %d-way, %dB lines",
                   D1c->size, D1c->assoc, D1c->line_size);
      VG_(message)(Vg_UserMsg, " L2: %dB, %d-way, %dB lines",
                   L2c->size, L2c->assoc, L2c->line_size);
   }

/* Undefine the helper that was actually defined above, so it does not
 * leak into the rest of the file. */
#undef DEFINED
}
1460
/* Initialize and clear simulator state.
 *
 * Runs after command line processing.  Without cache simulation all log_*
 * handlers in CLG_(cachesim) are cleared.  Otherwise the caches are
 * configured and initialised, the log_* handlers are installed, and the
 * three entries of `simulator` are chosen from clo_collect_cacheuse,
 * clo_simulate_hwpref and clo_simulate_writeback.
 * NOTE(review): braces, else lines and any early returns are missing from
 * this listing; the branch structure must be confirmed against the
 * complete file.
 */
1461
static void cachesim_post_clo_init(void)
1463
/* Cache configurations. */
1464
cache_t I1c, D1c, L2c;
1466
/* Initialize access handlers */
1467
if (!CLG_(clo).simulate_cache) {
1468
CLG_(cachesim).log_1I0D = 0;
1469
CLG_(cachesim).log_1I0D_name = "(no function)";
1471
CLG_(cachesim).log_1I1Dr = 0;
1472
CLG_(cachesim).log_1I1Dw = 0;
1473
CLG_(cachesim).log_1I2D = 0;
1474
CLG_(cachesim).log_1I1Dr_name = "(no function)";
1475
CLG_(cachesim).log_1I1Dw_name = "(no function)";
1476
CLG_(cachesim).log_1I2D_name = "(no function)";
1478
CLG_(cachesim).log_0I1Dr = 0;
1479
CLG_(cachesim).log_0I1Dw = 0;
1480
CLG_(cachesim).log_0I2D = 0;
1481
CLG_(cachesim).log_0I1Dr_name = "(no function)";
1482
CLG_(cachesim).log_0I1Dw_name = "(no function)";
1483
CLG_(cachesim).log_0I2D_name = "(no function)";
1487
/* Configuration of caches only needed with real cache simulation */
1488
configure_caches(&I1c, &D1c, &L2c);
1494
cachesim_initcache(I1c, &I1);
1495
cachesim_initcache(D1c, &D1);
1496
cachesim_initcache(L2c, &L2);
1498
/* the other cache simulators use the standard helpers
1499
* with dispatching via simulator struct */
1501
CLG_(cachesim).log_1I0D = log_1I0D;
1502
CLG_(cachesim).log_1I0D_name = "log_1I0D";
1504
CLG_(cachesim).log_1I1Dr = log_1I1Dr;
1505
CLG_(cachesim).log_1I1Dw = log_1I1Dw;
1506
CLG_(cachesim).log_1I2D = log_1I2D;
1507
CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
1508
CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
1509
CLG_(cachesim).log_1I2D_name = "log_1I2D";
1511
CLG_(cachesim).log_0I1Dr = log_0I1Dr;
1512
CLG_(cachesim).log_0I1Dw = log_0I1Dw;
1513
CLG_(cachesim).log_0I2D = log_0I2D;
1514
CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
1515
CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
1516
CLG_(cachesim).log_0I2D_name = "log_0I2D";
1518
/* Cache-use collection: prefetch and write-back simulation are switched
 * off with a warning, and reads and writes share one data handler. */
if (clo_collect_cacheuse) {
1520
/* Output warning for not supported option combinations */
1521
if (clo_simulate_hwpref) {
1522
VG_(message)(Vg_DebugMsg,
1523
"warning: prefetch simulation can not be used with cache usage");
1524
clo_simulate_hwpref = False;
1527
if (clo_simulate_writeback) {
1528
VG_(message)(Vg_DebugMsg,
1529
"warning: write-back simulation can not be used with cache usage");
1530
clo_simulate_writeback = False;
1533
simulator.I1_Read = cacheuse_I1_doRead;
1534
simulator.D1_Read = cacheuse_D1_doRead;
1535
simulator.D1_Write = cacheuse_D1_doRead;
1539
/* Prefetch simulation: *_Read/*_Write handlers with write-back,
 * the shared *_ref handlers without. */
if (clo_simulate_hwpref) {
1542
if (clo_simulate_writeback) {
1543
simulator.I1_Read = prefetch_I1_Read;
1544
simulator.D1_Read = prefetch_D1_Read;
1545
simulator.D1_Write = prefetch_D1_Write;
1548
simulator.I1_Read = prefetch_I1_ref;
1549
simulator.D1_Read = prefetch_D1_ref;
1550
simulator.D1_Write = prefetch_D1_ref;
1556
/* Plain simulation: same split between write-back and reference-only. */
if (clo_simulate_writeback) {
1557
simulator.I1_Read = cachesim_I1_Read;
1558
simulator.D1_Read = cachesim_D1_Read;
1559
simulator.D1_Write = cachesim_D1_Write;
1562
simulator.I1_Read = cachesim_I1_ref;
1563
simulator.D1_Read = cachesim_D1_ref;
1564
simulator.D1_Write = cachesim_D1_ref;
1569
/* Clear simulator state. Has to be initialized before */
1571
void cachesim_clear(void)
1573
cachesim_clearcache(&I1);
1574
cachesim_clearcache(&D1);
1575
cachesim_clearcache(&L2);
1581
/* Write the three "desc:" lines describing the I1, D1 and L2 caches
 * into buf.
 *
 * buf - destination; the caller must provide room for all three lines,
 *       no bound is checked here.
 */
static void cachesim_getdesc(Char* buf)
{
  /* Number of characters written so far; each call appends at buf+p. */
  Int p;

  p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
  p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
  VG_(sprintf)(buf+p, "desc: L2 cache: %s\n", L2.desc_line);
}
1590
void cachesim_print_opts(void)
1593
"\n cache simulator options:\n"
1594
" --simulate-cache=no|yes Do cache simulation [no]\n"
1595
" --simulate-wb=no|yes Count write-back events [no]\n"
1596
" --simulate-hwpref=no|yes Simulate hardware prefetch [no]\n"
1597
#if CLG_EXPERIMENTAL
1598
" --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
1600
" --cacheuse=no|yes Collect cache block use [no]\n"
1601
" --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
1602
" --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
1603
" --L2=<size>,<assoc>,<line_size> set L2 cache manually\n"
1607
/* Parse one "--XX=<size>,<assoc>,<line_size>" option into *cache.
 *
 * cache    - receives size, assoc and line_size
 * orig_opt - complete option string; passed to VG_(bad_option) on error
 * opt_len  - length of the "--XX=" prefix (5 at the call sites)
 *
 * Works on a VG_(strdup) copy of the option.
 * NOTE(review): the index bookkeeping, the release of the copy and the
 * `bad` label are not visible in this listing; confirm them in the
 * complete file.
 */
static void parse_opt ( cache_t* cache, char* orig_opt, int opt_len )
1611
char *opt = VG_(strdup)(orig_opt);
1615
/* Option looks like "--I1=65536,2,64".
1616
* Find commas, replace with NULs to make three independent
1617
* strings, then extract numbers. Yuck. */
1618
while (VG_(isdigit)(opt[i])) i++;
1619
if (',' == opt[i]) {
1623
while (VG_(isdigit)(opt[i])) i++;
1624
if (',' == opt[i]) {
1628
while (VG_(isdigit)(opt[i])) i++;
1629
/* Anything after the third number makes the option invalid. */
if ('\0' != opt[i]) goto bad;
1631
cache->size = (Int)VG_(atoll)(opt + i1);
1632
cache->assoc = (Int)VG_(atoll)(opt + i2);
1633
cache->line_size = (Int)VG_(atoll)(opt + i3);
1640
VG_(bad_option)(orig_opt);
1643
/* Check for command line option for cache configuration.
1644
* Return False if unknown and not handled.
1646
* Called from CLG_(process_cmd_line_option)() in clo.c
1648
static Bool cachesim_parse_opt(Char* arg)
1650
if (0 == VG_(strcmp)(arg, "--simulate-wb=yes"))
1651
clo_simulate_writeback = True;
1652
else if (0 == VG_(strcmp)(arg, "--simulate-wb=no"))
1653
clo_simulate_writeback = False;
1655
else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=yes"))
1656
clo_simulate_hwpref = True;
1657
else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=no"))
1658
clo_simulate_hwpref = False;
1660
else if (0 == VG_(strcmp)(arg, "--simulate-sectors=yes"))
1661
clo_simulate_sectors = True;
1662
else if (0 == VG_(strcmp)(arg, "--simulate-sectors=no"))
1663
clo_simulate_sectors = False;
1665
else if (0 == VG_(strcmp)(arg, "--cacheuse=yes")) {
1666
clo_collect_cacheuse = True;
1667
/* Use counters only make sense with fine dumping */
1668
CLG_(clo).dump_instr = True;
1670
else if (0 == VG_(strcmp)(arg, "--cacheuse=no"))
1671
clo_collect_cacheuse = False;
1673
/* 5 is length of "--I1=" */
1674
else if (0 == VG_(strncmp)(arg, "--I1=", 5))
1675
parse_opt(&clo_I1_cache, arg, 5);
1676
else if (0 == VG_(strncmp)(arg, "--D1=", 5))
1677
parse_opt(&clo_D1_cache, arg, 5);
1678
else if (0 == VG_(strncmp)(arg, "--L2=", 5))
1679
parse_opt(&clo_L2_cache, arg, 5);
1686
/* Adds commas to ULong, right justifying in a field field_width wide, returns
1687
* the string in buf. */
1689
Int commify(ULong n, int field_width, char* buf)
1691
int len, n_commas, i, j, new_len, space;
1693
VG_(sprintf)(buf, "%llu", n);
1694
len = VG_(strlen)(buf);
1695
n_commas = (len - 1) / 3;
1696
new_len = len + n_commas;
1697
space = field_width - new_len;
1699
/* Allow for printing a number in a field_width smaller than it's size */
1700
if (space < 0) space = 0;
1702
/* Make j = -1 because we copy the '\0' before doing the numbers in groups
1704
for (j = -1, i = len ; i >= 0; i--) {
1705
buf[i + n_commas + space] = buf[i];
1707
if ((i>0) && (3 == ++j)) {
1710
buf[i + n_commas + space] = ',';
1713
/* Right justify in field. */
1714
for (i = 0; i < space; i++) buf[i] = ' ';
1719
/* Format n/ex as "<int>.<frac>%" and right justify it in buf.
 *
 * n           - value scaled by ex (callers pass percentage * ex)
 * ex          - scale factor; n / ex is the integer part, n % ex the
 *               fractional part
 * field_width - minimum width; shorter results are padded with spaces on
 *               the left
 * buf         - destination; the caller must provide enough room
 *
 * NOTE(review): the fractional part is printed without leading zeros, so
 * the output is only exact when ex is 10 (one fractional digit).
 */
void percentify(Int n, Int ex, Int field_width, char buf[])
{
   int i, len, space;

   VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
   len = VG_(strlen)(buf);
   space = field_width - len;
   if (space < 0) space = 0; /* Allow for v. small field_width */

   /* Start at the terminating '\0' so it is moved along with the text. */
   i = len;

   /* Right justify in field */
   for ( ; i >= 0; i--) buf[i + space] = buf[i];
   for (i = 0; i < space; i++) buf[i] = ' ';
}
1735
/* Print the summary statistics via VG_(message).
 *
 * Reads the totals from CLG_(total_cost).  "I refs" is always printed;
 * the remaining lines (I1/L2i misses and rates, D refs/misses/rates,
 * L2 refs/misses/rates) follow only with cache simulation.  The width of
 * a commified reference count (l1, l2, l3) is reused as field width of
 * the lines below it so that the columns line up.
 * NOTE(review): several declarations, arguments and assignments are
 * missing from this listing (e.g. the definitions of l1/l2/l3 and p).
 */
void cachesim_printstat(void)
1737
FullCost total = CLG_(total_cost), D_total = 0;
1738
ULong L2_total_m, L2_total_mr, L2_total_mw,
1739
L2_total, L2_total_r, L2_total_w;
1740
char buf1[RESULTS_BUF_LEN],
1741
buf2[RESULTS_BUF_LEN],
1742
buf3[RESULTS_BUF_LEN];
1746
/* Prefetch counters are only shown in verbose mode. */
if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
1747
VG_(message)(Vg_DebugMsg, "Prefetch Up: %llu",
1749
VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu",
1751
VG_(message)(Vg_DebugMsg, "");
1754
/* NOTE(review): the comment that starts on the next line is not closed
 * in this listing; its closing line has to be restored. */
/* I cache results. Use the I_refs value to determine the first column
1756
l1 = commify(total[CLG_(sets).off_full_Ir], 0, buf1);
1757
VG_(message)(Vg_UserMsg, "I refs: %s", buf1);
1759
if (!CLG_(clo).simulate_cache) return;
1761
commify(total[CLG_(sets).off_full_Ir +1], l1, buf1);
1762
VG_(message)(Vg_UserMsg, "I1 misses: %s", buf1);
1764
commify(total[CLG_(sets).off_full_Ir +2], l1, buf1);
1765
VG_(message)(Vg_UserMsg, "L2i misses: %s", buf1);
1769
if (0 == total[CLG_(sets).off_full_Ir])
1770
total[CLG_(sets).off_full_Ir] = 1;
1772
percentify(total[CLG_(sets).off_full_Ir+1] * 100 * p /
1773
total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
1774
VG_(message)(Vg_UserMsg, "I1 miss rate: %s", buf1);
1776
percentify(total[CLG_(sets).off_full_Ir+2] * 100 * p /
1777
total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
1778
VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
1779
VG_(message)(Vg_UserMsg, "");
1782
Use the D_refs.rd and D_refs.wr values to determine the
1783
* width of columns 2 & 3. */
1785
/* D_total is a scratch cost array: initialised, then the Dr part of the
 * totals is copied in and the Dw part added (per the helper names;
 * confirm), so D_total[0..2] hold the combined data refs and misses. */
D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
1786
CLG_(init_cost)( CLG_(sets).full, D_total);
1787
CLG_(copy_cost)( CLG_(sets).Dr, D_total, total + CLG_(sets).off_full_Dr );
1788
CLG_(add_cost) ( CLG_(sets).Dw, D_total, total + CLG_(sets).off_full_Dw );
1790
commify( D_total[0], l1, buf1);
1791
l2 = commify(total[CLG_(sets).off_full_Dr], 0, buf2);
1792
l3 = commify(total[CLG_(sets).off_full_Dw], 0, buf3);
1793
VG_(message)(Vg_UserMsg, "D refs: %s (%s rd + %s wr)",
1796
commify( D_total[1], l1, buf1);
1797
commify(total[CLG_(sets).off_full_Dr+1], l2, buf2);
1798
commify(total[CLG_(sets).off_full_Dw+1], l3, buf3);
1799
VG_(message)(Vg_UserMsg, "D1 misses: %s (%s rd + %s wr)",
1802
commify( D_total[2], l1, buf1);
1803
commify(total[CLG_(sets).off_full_Dr+2], l2, buf2);
1804
commify(total[CLG_(sets).off_full_Dw+2], l3, buf3);
1805
VG_(message)(Vg_UserMsg, "L2d misses: %s (%s rd + %s wr)",
1810
/* Reference counts of 0 are replaced by 1 so the divisions below are
 * defined. */
if (0 == D_total[0]) D_total[0] = 1;
1811
if (0 == total[CLG_(sets).off_full_Dr]) total[CLG_(sets).off_full_Dr] = 1;
1812
if (0 == total[CLG_(sets).off_full_Dw]) total[CLG_(sets).off_full_Dw] = 1;
1814
percentify( D_total[1] * 100 * p / D_total[0], p, l1+1, buf1);
1815
percentify(total[CLG_(sets).off_full_Dr+1] * 100 * p /
1816
total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
1817
percentify(total[CLG_(sets).off_full_Dw+1] * 100 * p /
1818
total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
1819
VG_(message)(Vg_UserMsg, "D1 miss rate: %s (%s + %s )", buf1, buf2,buf3);
1821
percentify( D_total[2] * 100 * p / D_total[0], p, l1+1, buf1);
1822
percentify(total[CLG_(sets).off_full_Dr+2] * 100 * p /
1823
total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
1824
percentify(total[CLG_(sets).off_full_Dw+2] * 100 * p /
1825
total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
1826
VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )", buf1, buf2,buf3);
1827
VG_(message)(Vg_UserMsg, "");
1831
/* L2 overall results */
1834
/* L2 references are the first-level misses (index +1 of each group). */
total[CLG_(sets).off_full_Dr +1] +
1835
total[CLG_(sets).off_full_Dw +1] +
1836
total[CLG_(sets).off_full_Ir +1];
1838
total[CLG_(sets).off_full_Dr +1] +
1839
total[CLG_(sets).off_full_Ir +1];
1840
L2_total_w = total[CLG_(sets).off_full_Dw +1];
1841
commify(L2_total, l1, buf1);
1842
commify(L2_total_r, l2, buf2);
1843
commify(L2_total_w, l3, buf3);
1844
VG_(message)(Vg_UserMsg, "L2 refs: %s (%s rd + %s wr)",
1848
/* L2 misses are taken from index +2 of each group. */
total[CLG_(sets).off_full_Dr +2] +
1849
total[CLG_(sets).off_full_Dw +2] +
1850
total[CLG_(sets).off_full_Ir +2];
1852
total[CLG_(sets).off_full_Dr +2] +
1853
total[CLG_(sets).off_full_Ir +2];
1854
L2_total_mw = total[CLG_(sets).off_full_Dw +2];
1855
commify(L2_total_m, l1, buf1);
1856
commify(L2_total_mr, l2, buf2);
1857
commify(L2_total_mw, l3, buf3);
1858
VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)",
1861
/* The L2 miss rates are relative to all references (I + D), not to the
 * L2 references. */
percentify(L2_total_m * 100 * p /
1862
(total[CLG_(sets).off_full_Ir] + D_total[0]), p, l1+1, buf1);
1863
percentify(L2_total_mr * 100 * p /
1864
(total[CLG_(sets).off_full_Ir] + total[CLG_(sets).off_full_Dr]),
1866
percentify(L2_total_mw * 100 * p /
1867
total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
1868
VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )",
1873
/*------------------------------------------------------------*/
1874
/*--- Setup for Event set. ---*/
1875
/*------------------------------------------------------------*/
1877
/* Event sets and offsets shared with the rest of the tool; filled in by
 * CLG_(init_eventsets). */
struct event_sets CLG_(sets);
1879
/* Register all event types and build the event sets.
 *
 * max_user - number of additional user event slots to reserve in the
 *            "full" set; raised by 2 each for allocation and system time
 *            collection.
 *
 * Builds the basic sets (Use, Ir, Dr, Dw), the per-instruction sets
 * (D0, D1r, D1w, D2), and the "sim" and "full" sets; records the offsets
 * of Ir/Dr/Dw inside each composed set and finally sets up the dump
 * mapping.
 * NOTE(review): braces, else lines and some assignments into CLG_(sets)
 * are missing from this listing.
 */
void CLG_(init_eventsets)(Int max_user)
1881
EventType * e1, *e2, *e3, *e4;
1882
EventSet *Ir, *Dr, *Dw;
1883
EventSet *D0, *D1r, *D1w, *D2;
1884
EventSet *sim, *full;
1888
/* The "Use" set only gets events when cache use is collected. */
use = CLG_(get_eventset)("Use", 4);
1889
if (clo_collect_cacheuse) {
1890
/* if TUse is 0, there was never a load, and no loss, too */
1891
e1 = CLG_(register_eventtype)("AcCost1");
1892
CLG_(add_eventtype)(use, e1);
1893
e1 = CLG_(register_eventtype)("SpLoss1");
1894
CLG_(add_eventtype)(use, e1);
1895
e1 = CLG_(register_eventtype)("AcCost2");
1896
CLG_(add_eventtype)(use, e1);
1897
e1 = CLG_(register_eventtype)("SpLoss2");
1898
CLG_(add_eventtype)(use, e1);
1901
Ir = CLG_(get_eventset)("Ir", 4);
1902
Dr = CLG_(get_eventset)("Dr", 4);
1903
Dw = CLG_(get_eventset)("Dw", 4);
1904
/* With cache simulation each access kind carries its reference count
 * plus first and second level misses; write-back simulation adds a
 * fourth event per kind. */
if (CLG_(clo).simulate_cache) {
1905
e1 = CLG_(register_eventtype)("Ir");
1906
e2 = CLG_(register_eventtype)("I1mr");
1907
e3 = CLG_(register_eventtype)("I2mr");
1908
if (clo_simulate_writeback) {
1909
e4 = CLG_(register_eventtype)("I2dmr");
1910
CLG_(add_dep_event4)(Ir, e1,e2,e3,e4);
1913
CLG_(add_dep_event3)(Ir, e1,e2,e3);
1915
e1 = CLG_(register_eventtype)("Dr");
1916
e2 = CLG_(register_eventtype)("D1mr");
1917
e3 = CLG_(register_eventtype)("D2mr");
1918
if (clo_simulate_writeback) {
1919
e4 = CLG_(register_eventtype)("D2dmr");
1920
CLG_(add_dep_event4)(Dr, e1,e2,e3,e4);
1923
CLG_(add_dep_event3)(Dr, e1,e2,e3);
1925
e1 = CLG_(register_eventtype)("Dw");
1926
e2 = CLG_(register_eventtype)("D1mw");
1927
e3 = CLG_(register_eventtype)("D2mw");
1928
if (clo_simulate_writeback) {
1929
e4 = CLG_(register_eventtype)("D2dmw");
1930
CLG_(add_dep_event4)(Dw, e1,e2,e3,e4);
1933
CLG_(add_dep_event3)(Dw, e1,e2,e3);
1937
e1 = CLG_(register_eventtype)("Ir");
1938
CLG_(add_eventtype)(Ir, e1);
1941
/* Per-instruction sets: always "use" first, then Ir, then the data
 * sets the instruction kind needs. */
sizeOfUseIr = use->size + Ir->size;
1942
D0 = CLG_(get_eventset)("D0", sizeOfUseIr);
1943
CLG_(add_eventset)(D0, use);
1944
off_D0_Ir = CLG_(add_eventset)(D0, Ir);
1946
D1r = CLG_(get_eventset)("D1r", sizeOfUseIr + Dr->size);
1947
CLG_(add_eventset)(D1r, use);
1948
off_D1r_Ir = CLG_(add_eventset)(D1r, Ir);
1949
off_D1r_Dr = CLG_(add_eventset)(D1r, Dr);
1951
D1w = CLG_(get_eventset)("D1w", sizeOfUseIr + Dw->size);
1952
CLG_(add_eventset)(D1w, use);
1953
off_D1w_Ir = CLG_(add_eventset)(D1w, Ir);
1954
off_D1w_Dw = CLG_(add_eventset)(D1w, Dw);
1956
D2 = CLG_(get_eventset)("D2", sizeOfUseIr + Dr->size + Dw->size);
1957
CLG_(add_eventset)(D2, use);
1958
off_D2_Ir = CLG_(add_eventset)(D2, Ir);
1959
off_D2_Dr = CLG_(add_eventset)(D2, Dr);
1960
off_D2_Dw = CLG_(add_eventset)(D2, Dw);
1962
sim = CLG_(get_eventset)("sim", sizeOfUseIr + Dr->size + Dw->size);
1963
CLG_(add_eventset)(sim, use);
1964
CLG_(sets).off_sim_Ir = CLG_(add_eventset)(sim, Ir);
1965
CLG_(sets).off_sim_Dr = CLG_(add_eventset)(sim, Dr);
1966
CLG_(sets).off_sim_Dw = CLG_(add_eventset)(sim, Dw);
1968
if (CLG_(clo).collect_alloc) max_user += 2;
1969
if (CLG_(clo).collect_systime) max_user += 2;
1971
/* "full" starts with "sim", so the sim offsets are valid for it too. */
full = CLG_(get_eventset)("full", sim->size + max_user);
1972
CLG_(add_eventset)(full, sim);
1973
CLG_(sets).off_full_Ir = CLG_(sets).off_sim_Ir;
1974
CLG_(sets).off_full_Dr = CLG_(sets).off_sim_Dr;
1975
CLG_(sets).off_full_Dw = CLG_(sets).off_sim_Dw;
1977
CLG_(sets).use = use;
1983
CLG_(sets).D1r = D1r;
1984
CLG_(sets).D1w = D1w;
1987
CLG_(sets).sim = sim;
1988
CLG_(sets).full = full;
1990
if (CLG_(clo).collect_alloc) {
1991
e1 = CLG_(register_eventtype)("allocCount");
1992
e2 = CLG_(register_eventtype)("allocSize");
1993
CLG_(sets).off_full_user = CLG_(add_dep_event2)(full, e1,e2);
1996
if (CLG_(clo).collect_systime) {
1997
e1 = CLG_(register_eventtype)("sysCount");
1998
e2 = CLG_(register_eventtype)("sysTime");
1999
CLG_(sets).off_full_systime = CLG_(add_dep_event2)(full, e1,e2);
2003
CLG_DEBUG(1, "EventSets:\n");
2004
CLG_(print_eventset)(-2, use);
2005
CLG_(print_eventset)(-2, Ir);
2006
CLG_(print_eventset)(-2, Dr);
2007
CLG_(print_eventset)(-2, Dw);
2008
CLG_(print_eventset)(-2, sim);
2009
CLG_(print_eventset)(-2, full);
2012
/* Not-existing events are silently ignored */
2013
/* The order of the append calls below is the order of the events in the
 * dump mapping. */
CLG_(dumpmap) = CLG_(get_eventmapping)(full);
2014
CLG_(append_event)(CLG_(dumpmap), "Ir");
2015
CLG_(append_event)(CLG_(dumpmap), "Dr");
2016
CLG_(append_event)(CLG_(dumpmap), "Dw");
2017
CLG_(append_event)(CLG_(dumpmap), "I1mr");
2018
CLG_(append_event)(CLG_(dumpmap), "D1mr");
2019
CLG_(append_event)(CLG_(dumpmap), "D1mw");
2020
CLG_(append_event)(CLG_(dumpmap), "I2mr");
2021
CLG_(append_event)(CLG_(dumpmap), "D2mr");
2022
CLG_(append_event)(CLG_(dumpmap), "D2mw");
2023
CLG_(append_event)(CLG_(dumpmap), "I2dmr");
2024
CLG_(append_event)(CLG_(dumpmap), "D2dmr");
2025
CLG_(append_event)(CLG_(dumpmap), "D2dmw");
2026
CLG_(append_event)(CLG_(dumpmap), "AcCost1");
2027
CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
2028
CLG_(append_event)(CLG_(dumpmap), "AcCost2");
2029
CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
2030
CLG_(append_event)(CLG_(dumpmap), "allocCount");
2031
CLG_(append_event)(CLG_(dumpmap), "allocSize");
2032
CLG_(append_event)(CLG_(dumpmap), "sysCount");
2033
CLG_(append_event)(CLG_(dumpmap), "sysTime");
2040
/* Move the cost of one instruction into a sim-layout cost array.
 *
 * es   - event set of the instruction; must be one of CLG_(sets).D0, D1r,
 *        D1w or D2 (asserted for the last case)
 * dst  - destination cost array; the parts are placed at the off_sim_*
 *        offsets
 * cost - cost of the instruction, laid out according to es
 *
 * Every part is transferred with CLG_(add_and_zero_cost).
 * NOTE(review): the last argument of each call (the source position
 * inside `cost`) is missing from this listing.
 */
void add_and_zero_Dx(EventSet* es, SimCost dst, ULong* cost)
2042
/* if eventset use is defined, it is always first (hardcoded!) */
2043
CLG_(add_and_zero_cost)( CLG_(sets).use, dst, cost);
2045
/* FIXME: This is hardcoded... */
2046
if (es == CLG_(sets).D0) {
2047
CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
2050
else if (es == CLG_(sets).D1r) {
2051
CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
2053
CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
2056
else if (es == CLG_(sets).D1w) {
2057
CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
2059
CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
2063
/* Only D2 is left; anything else is a programming error. */
CLG_ASSERT(es == CLG_(sets).D2);
2064
CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
2066
CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
2068
CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
2073
/* this is called at dump time for every instruction executed
 *
 * cost      - cost array the instruction's cost is added to
 * bbcc      - cost center of the basic block containing the instruction
 * ii        - the instruction
 * exe_count - execution count of the instruction
 *
 * Without cache simulation only exe_count is added to the Ir counter.
 * With it, the cost stored at bbcc->cost + ii->cost_offset is moved over
 * by add_and_zero_Dx(); a mismatch between the stored Ir count and
 * exe_count is only printed, the matching assertion is commented out.
 * NOTE(review): the braces and else separating the two cases are missing
 * from this listing.
 */
2074
static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
2075
InstrInfo* ii, ULong exe_count)
2077
if (!CLG_(clo).simulate_cache)
2078
cost[CLG_(sets).off_sim_Ir] += exe_count;
2082
/* There is always a trivial case where exe_count and Ir can be
2083
* slightly different because ecounter is updated when executing
2084
* the next BB. E.g. for last BB executed, or when toggling collection
2086
/* FIXME: Hardcoded that each eventset has Ir as first */
2087
if ((bbcc->cost + ii->cost_offset)[0] != exe_count) {
2088
VG_(printf)("==> Ir %llu, exe %llu\n",
2089
(bbcc->cost + ii->cost_offset)[0], exe_count);
2090
CLG_(print_bbcc_cost)(-2, bbcc);
2091
//CLG_ASSERT((bbcc->cost + ii->cost_offset)[0] == exe_count);
2095
add_and_zero_Dx(ii->eventset, cost,
2096
bbcc->cost + ii->cost_offset);
2101
/* Refresh the per-basic-block globals used by the log_* handlers.
 *
 * With cache simulation, bb_base is set to bb->obj->offset + bb->offset
 * and cost_base to the cost array of the current BBCC.
 * NOTE(review): the declaration of `bb` is missing from this listing.
 */
void cachesim_after_bbsetup(void)
2103
BBCC* bbcc = CLG_(current_state).bbcc;
2105
if (CLG_(clo).simulate_cache) {
2108
/* only needed if log_* functions are called */
2109
bb_base = bb->obj->offset + bb->offset;
2110
cost_base = bbcc->cost;
2115
/* Called when the simulation ends; only acts when cache use is collected.
 * NOTE(review): the statement guarded by the condition is missing from
 * this listing. */
void cachesim_finish(void)
2117
if (clo_collect_cacheuse)
2121
/*------------------------------------------------------------*/
2122
/*--- The simulator defined in this file ---*/
2123
/*------------------------------------------------------------*/
2125
struct cachesim_if CLG_(cachesim) = {
2126
.print_opts = cachesim_print_opts,
2127
.parse_opt = cachesim_parse_opt,
2128
.post_clo_init = cachesim_post_clo_init,
2129
.clear = cachesim_clear,
2130
.getdesc = cachesim_getdesc,
2131
.printstat = cachesim_printstat,
2132
.add_icost = cachesim_add_icost,
2133
.after_bbsetup = cachesim_after_bbsetup,
2134
.finish = cachesim_finish,
2136
/* these will be set by cachesim_post_clo_init */
2147
.log_1I0D_name = "(no function)",
2149
.log_1I1Dr_name = "(no function)",
2150
.log_1I1Dw_name = "(no function)",
2151
.log_1I2D_name = "(no function)",
2153
.log_0I1Dr_name = "(no function)",
2154
.log_0I1Dw_name = "(no function)",
2155
.log_0I2D_name = "(no function)"
2159
/*--------------------------------------------------------------------*/
2160
/*--- end ct_sim.c ---*/
2161
/*--------------------------------------------------------------------*/