1
// Copyright 2013 Dolphin Emulator Project
2
// Licensed under GPLv2
3
// Refer to the license.txt file included.
7
// for the PROFILER stuff
13
#include "../../HLE/HLE.h"
14
#include "../../PatchEngine.h"
15
#include "../Profiler.h"
18
#include "JitRegCache.h"
19
#include "Jit64_Tables.h"
20
#include "HW/ProcessorInterface.h"
21
#if defined(_DEBUG) || defined(DEBUGFAST)
22
#include "PowerPCDisasm.h"
26
using namespace PowerPC;
28
// Dolphin's PowerPC->x86 JIT dynamic recompiler
29
// (Nearly) all code by ector (hrydgard)
31
// * x86 & x64 support, lots of shared code.
32
// * Basic block linking
36
// * Does not recompile all instructions - sometimes falls back to inserting a CALL to the corresponding Interpreter function.
38
// Various notes below
40
// Register allocation
41
// RAX - Generic quicktemp register
42
// RBX - point to base of memory map
43
// RSI RDI R12 R13 R14 R15 - free for allocation
44
// RCX RDX R8 R9 R10 R11 - allocate in emergencies. These need to be flushed before functions are called.
45
// RSP - stack pointer, do not generally use, very dangerous
49
// Make sure that all generated code and all emulator state sits under the 2GB boundary so that
50
// RIP addressing can be used easily. Windows will always allocate static code under the 2GB boundary.
51
// Also make sure to use VirtualAlloc and specify EXECUTE permission.
54
// * Should there be any statically allocated registers? r3, r4, r5, r8, r0 come to mind.. maybe sp
55
// * Does it make sense to finish off the remaining non-jitted instructions? Seems we are hitting diminishing returns.
57
// Other considerations
59
// Many instructions have shorter forms for EAX. However, I believe their performance boost
60
// will be too small to be negligible, so I haven't dirtied up the code with that. AMD recommends it in their
61
// optimization manuals, though.
63
// We support block linking. Reserve space at the exits of every block for a full 5-byte jmp. Save 16-bit offsets
64
// from the starts of each block, marking the exits so that they can be nicely patched at any time.
66
// Blocks do NOT use call/ret, they only jmp to each other and to the dispatcher when necessary.
68
// All blocks that can be precompiled will be precompiled. Code will be memory protected - any write will mark
69
// the region as non-compilable, and all links to the page will be torn out and replaced with dispatcher jmps.
71
// Alternatively, icbi instruction SHOULD mark where we can't compile
73
// Seldom-happening events are handled by adding a decrement of a counter to all blr instructions (which are
74
// expensive anyway since we need to return to dispatcher, except when they can be predicted).
76
// TODO: SERIOUS synchronization problem with the video backend setting tokens and breakpoints in dual core mode!!!
77
// Somewhat fixed by disabling idle skipping when certain interrupts are enabled
78
// This is not a permanent or reliable fix
79
// TODO: Zeldas go whacko when you hang the gfx thread
81
// Idea - Accurate exception handling
82
// Compute register state at a certain instruction by running the JIT in "dry mode", and stopping at the right place.
83
// Not likely to be done :P
86
// Optimization Ideas -
88
* Assume SP is in main RAM (in Wii mode too?) - partly done
89
* Assume all floating point loads and double precision loads+stores are to/from main ram
90
(single precision stores can be used in write gather pipe, specialized fast check added)
91
* AMD only - use movaps instead of movapd when loading ps from memory?
92
* HLE functions like floorf, sin, memcpy, etc - they can be much faster
93
* ABI optimizations - drop F0-F13 on blr, for example. Watch out for context switching.
94
CR2-CR4 are non-volatile, rest of CR is volatile -> dropped on blr.
95
R5-R12 are volatile -> dropped on blr.
96
* classic inlining across calls.
99
stfd -- guaranteed in memory
141
static int CODE_SIZE = 1024*1024*32;
145
extern u32 m_BlockStart;
150
jo.optimizeStack = true;
151
/* This will enable block linking in JitBlockCache::FinalizeBlock(), it gives faster execution but may not
152
be as stable as the alternative (to not link the blocks). However, I have not heard about any good examples
153
where this causes problems, so I'm enabling this by default, since I seem to get perhaps as much as 20% more
154
fps with this option enabled. If you suspect that this option causes problems you can also disable it from the
156
if (Core::g_CoreStartupParameter.bEnableDebugging)
158
jo.enableBlocklink = false;
159
Core::g_CoreStartupParameter.bSkipIdle = false;
163
if (!Core::g_CoreStartupParameter.bJITBlockLinking)
165
jo.enableBlocklink = false;
168
jo.enableBlocklink = !Core::g_CoreStartupParameter.bMMU;
170
jo.fpAccurateFcmp = Core::g_CoreStartupParameter.bEnableFPRF;
171
jo.optimizeGatherPipe = true;
172
jo.fastInterrupts = false;
173
jo.accurateSinglePrecision = true;
174
js.memcheck = Core::g_CoreStartupParameter.bMMU;
176
gpr.SetEmitter(this);
177
fpr.SetEmitter(this);
180
AllocCodeSpace(CODE_SIZE);
186
void Jit64::ClearCache()
189
trampolines.ClearCodeSpace();
193
void Jit64::Shutdown()
198
trampolines.Shutdown();
199
asm_routines.Shutdown();
202
// This is only called by Default() in this file. It will execute an instruction with the interpreter functions.
203
void Jit64::WriteCallInterpreter(UGeckoInstruction inst)
205
gpr.Flush(FLUSH_ALL);
206
fpr.Flush(FLUSH_ALL);
207
if (js.isLastInstruction)
209
MOV(32, M(&PC), Imm32(js.compilerPC));
210
MOV(32, M(&NPC), Imm32(js.compilerPC + 4));
212
Interpreter::_interpreterInstruction instr = GetInterpreterOp(inst);
213
ABI_CallFunctionC((void*)instr, inst.hex);
216
void Jit64::unknown_instruction(UGeckoInstruction inst)
218
PanicAlert("unknown_instruction %08x - Fix me ;)", inst.hex);
221
void Jit64::Default(UGeckoInstruction _inst)
223
WriteCallInterpreter(_inst.hex);
226
void Jit64::HLEFunction(UGeckoInstruction _inst)
228
gpr.Flush(FLUSH_ALL);
229
fpr.Flush(FLUSH_ALL);
230
ABI_CallFunctionCC((void*)&HLE::Execute, js.compilerPC, _inst.hex);
233
void Jit64::DoNothing(UGeckoInstruction _inst)
235
// Yup, just don't do anything.
238
static const bool ImHereDebug = false;
239
static const bool ImHereLog = false;
240
static std::map<u32, int> been_here;
244
static File::IOFile f;
250
f.Open("log64.txt", "w");
252
f.Open("log32.txt", "w");
255
fprintf(f.GetHandle(), "%08x\n", PC);
257
if (been_here.find(PC) != been_here.end())
259
been_here.find(PC)->second++;
260
if ((been_here.find(PC)->second) & 1023)
263
DEBUG_LOG(DYNA_REC, "I'm here - PC = %08x , LR = %08x", PC, LR);
267
void Jit64::Cleanup()
269
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0)
271
ABI_CallFunction((void *)&GPFifo::CheckGatherPipe);
274
// SPEED HACK: MMCR0/MMCR1 should be checked at run-time, not at compile time.
275
if (MMCR0.Hex || MMCR1.Hex)
276
ABI_CallFunctionCCC((void *)&PowerPC::UpdatePerformanceMonitor, js.downcountAmount, jit->js.numLoadStoreInst, jit->js.numFloatingPointInst);
279
void Jit64::WriteExit(u32 destination, int exit_num)
283
SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount));
285
//If nobody has taken care of this yet (this can be removed when all branches are done)
286
JitBlock *b = js.curBlock;
287
b->exitAddress[exit_num] = destination;
288
b->exitPtrs[exit_num] = GetWritableCodePtr();
291
if (jo.enableBlocklink)
293
int block = blocks.GetBlockNumberFromStartAddress(destination);
296
// It exists! Joy of joy!
297
JMP(blocks.GetBlock(block)->checkedEntry, true);
298
b->linkStatus[exit_num] = true;
302
MOV(32, M(&PC), Imm32(destination));
303
JMP(asm_routines.dispatcher, true);
306
void Jit64::WriteExitDestInEAX()
308
MOV(32, M(&PC), R(EAX));
310
SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount));
311
JMP(asm_routines.dispatcher, true);
314
void Jit64::WriteRfiExitDestInEAX()
316
MOV(32, M(&PC), R(EAX));
317
MOV(32, M(&NPC), R(EAX));
319
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExceptions));
320
SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount));
321
JMP(asm_routines.dispatcher, true);
324
void Jit64::WriteExceptionExit()
327
MOV(32, R(EAX), M(&PC));
328
MOV(32, M(&NPC), R(EAX));
329
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExceptions));
330
SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount));
331
JMP(asm_routines.dispatcher, true);
334
void Jit64::WriteExternalExceptionExit()
337
MOV(32, R(EAX), M(&PC));
338
MOV(32, M(&NPC), R(EAX));
339
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExternalExceptions));
340
SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount));
341
JMP(asm_routines.dispatcher, true);
344
void STACKALIGN Jit64::Run()
346
CompiledCode pExecAddr = (CompiledCode)asm_routines.enterCode;
350
void Jit64::SingleStep()
352
CompiledCode pExecAddr = (CompiledCode)asm_routines.enterCode;
359
char fregs[750] = "";
362
for (int i = 0; i < 32; i++)
365
sprintf(reg, "r%02d: %08x ", i, PowerPC::ppcState.gpr[i]);
366
strncat(regs, reg, 500);
371
for (int i = 0; i < 32; i++)
374
sprintf(reg, "f%02d: %016x ", i, riPS0(i));
375
strncat(fregs, reg, 750);
379
DEBUG_LOG(DYNA_REC, "JIT64 PC: %08x SRR0: %08x SRR1: %08x CRfast: %02x%02x%02x%02x%02x%02x%02x%02x FPSCR: %08x MSR: %08x LR: %08x %s %s",
380
PC, SRR0, SRR1, PowerPC::ppcState.cr_fast[0], PowerPC::ppcState.cr_fast[1], PowerPC::ppcState.cr_fast[2], PowerPC::ppcState.cr_fast[3],
381
PowerPC::ppcState.cr_fast[4], PowerPC::ppcState.cr_fast[5], PowerPC::ppcState.cr_fast[6], PowerPC::ppcState.cr_fast[7], PowerPC::ppcState.fpscr,
382
PowerPC::ppcState.msr, PowerPC::ppcState.spr[8], regs, fregs);
385
void STACKALIGN Jit64::Jit(u32 em_address)
387
if (GetSpaceLeft() < 0x10000 || blocks.IsFull() || Core::g_CoreStartupParameter.bJITNoBlockCache)
392
int block_num = blocks.AllocateBlock(em_address);
393
JitBlock *b = blocks.GetBlock(block_num);
394
blocks.FinalizeBlock(block_num, jo.enableBlocklink, DoJit(em_address, &code_buffer, b));
397
const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlock *b)
399
int blockSize = code_buf->GetSize();
401
// Memory exception on instruction fetch
402
bool memory_exception = false;
404
// A broken block is a block that does not end in a branch
405
bool broken_block = false;
407
if (Core::g_CoreStartupParameter.bEnableDebugging)
409
// Comment out the following to disable breakpoints (speed-up)
410
if (!Profiler::g_ProfileBlocks)
412
if (GetState() == CPU_STEPPING)
420
// Memory exception occurred during instruction fetch
421
memory_exception = true;
424
if (Core::g_CoreStartupParameter.bMMU && (em_address & JIT_ICACHE_VMEM_BIT))
426
if (!Memory::TranslateAddress(em_address, Memory::FLAG_OPCODE))
428
// Memory exception occurred during instruction fetch
429
memory_exception = true;
434
js.firstFPInstructionFound = false;
435
js.isLastInstruction = false;
436
js.blockStart = em_address;
437
js.fifoBytesThisBlock = 0;
441
jit->js.numLoadStoreInst = 0;
442
jit->js.numFloatingPointInst = 0;
444
// Analyze the block, collect all instructions it is made of (including inlining,
445
// if that is enabled), reorder instructions for optimal performance, and join joinable instructions.
446
u32 nextPC = em_address;
447
u32 merged_addresses[32];
448
const int capacity_of_merged_addresses = sizeof(merged_addresses) / sizeof(merged_addresses[0]);
449
int size_of_merged_addresses = 0;
450
if (!memory_exception)
452
// If there is a memory exception inside a block (broken_block==true), compile up to that instruction.
453
nextPC = PPCAnalyst::Flatten(em_address, &size, &js.st, &js.gpa, &js.fpa, broken_block, code_buf, blockSize, merged_addresses, capacity_of_merged_addresses, size_of_merged_addresses);
456
PPCAnalyst::CodeOp *ops = code_buf->codebuffer;
458
const u8 *start = AlignCode4(); // TODO: Test if this or AlignCode16 make a difference from GetCodePtr
459
b->checkedEntry = start;
462
// Downcount flag check. The last block decremented downcounter, and the flag should still be available.
463
FixupBranch skip = J_CC(CC_NBE);
464
MOV(32, M(&PC), Imm32(js.blockStart));
465
JMP(asm_routines.doTiming, true); // downcount hit zero - go doTiming.
468
const u8 *normalEntry = GetCodePtr();
469
b->normalEntry = normalEntry;
472
ABI_CallFunction((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful
474
// Conditionally add profiling code.
475
if (Profiler::g_ProfileBlocks) {
476
ADD(32, M(&b->runCount), Imm8(1));
485
PROFILER_QUERY_PERFORMANCE_COUNTER(&b->ticStart);
487
#if defined(_DEBUG) || defined(DEBUGFAST) || defined(NAN_CHECK)
488
// should help logged stack-traces become more accurate
489
MOV(32, M(&PC), Imm32(js.blockStart));
492
// Start up the register allocators
493
// They use the information in gpa/fpa to preload commonly used registers.
497
js.downcountAmount = 0;
498
if (!Core::g_CoreStartupParameter.bEnableDebugging)
500
for (int i = 0; i < size_of_merged_addresses; ++i)
502
const u32 address = merged_addresses[i];
503
js.downcountAmount += PatchEngine::GetSpeedhackCycles(address);
509
js.compilerPC = nextPC;
510
// Translate instructions
511
for (int i = 0; i < (int)size; i++)
513
js.compilerPC = ops[i].address;
515
js.instructionNumber = i;
516
const GekkoOPInfo *opinfo = ops[i].opinfo;
517
js.downcountAmount += (opinfo->numCyclesMinusOne + 1);
519
if (i == (int)size - 1)
521
// WARNING - cmp->branch merging will screw this up.
522
js.isLastInstruction = true;
524
if (Profiler::g_ProfileBlocks) {
525
// CAUTION!!! push on stack regs you use, do your stuff, then pop
528
PROFILER_QUERY_PERFORMANCE_COUNTER(&b->ticStop);
529
// tic counter += (end tic - start tic)
530
PROFILER_ADD_DIFF_LARGE_INTEGER(&b->ticCounter, &b->ticStop, &b->ticStart);
536
// help peephole optimizations
537
js.next_inst = ops[i + 1].inst;
538
js.next_compilerPC = ops[i + 1].address;
541
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32)
543
js.fifoBytesThisBlock -= 32;
544
MOV(32, M(&PC), Imm32(jit->js.compilerPC)); // Helps external systems know which instruction triggered the write
545
u32 registersInUse = RegistersInUse();
546
ABI_PushRegistersAndAdjustStack(registersInUse, false);
547
ABI_CallFunction((void *)&GPFifo::CheckGatherPipe);
548
ABI_PopRegistersAndAdjustStack(registersInUse, false);
551
u32 function = HLE::GetFunctionIndex(ops[i].address);
554
int type = HLE::GetFunctionTypeByIndex(function);
555
if (type == HLE::HLE_HOOK_START || type == HLE::HLE_HOOK_REPLACE)
557
int flags = HLE::GetFunctionFlagsByIndex(function);
558
if (HLE::IsEnabled(flags))
560
HLEFunction(function);
561
if (type == HLE::HLE_HOOK_REPLACE)
563
MOV(32, R(EAX), M(&NPC));
564
js.downcountAmount += js.st.numCycles;
565
WriteExitDestInEAX();
574
if ((opinfo->flags & FL_USE_FPU) && !js.firstFPInstructionFound)
576
gpr.Flush(FLUSH_ALL);
577
fpr.Flush(FLUSH_ALL);
579
//This instruction uses FPU - needs to add FP exception bailout
580
TEST(32, M(&PowerPC::ppcState.msr), Imm32(1 << 13)); // Test FP enabled bit
581
FixupBranch b1 = J_CC(CC_NZ, true);
583
// If a FPU exception occurs, the exception handler will read
584
// from PC. Update PC with the latest value in case that happens.
585
MOV(32, M(&PC), Imm32(ops[i].address));
586
OR(32, M((void *)&PowerPC::ppcState.Exceptions), Imm32(EXCEPTION_FPU_UNAVAILABLE));
587
WriteExceptionExit();
591
js.firstFPInstructionFound = true;
594
// Add an external exception check if the instruction writes to the FIFO.
595
if (jit->js.fifoWriteAddresses.find(ops[i].address) != jit->js.fifoWriteAddresses.end())
597
gpr.Flush(FLUSH_ALL);
598
fpr.Flush(FLUSH_ALL);
600
TEST(32, M((void *)&PowerPC::ppcState.Exceptions), Imm32(EXCEPTION_ISI | EXCEPTION_PROGRAM | EXCEPTION_SYSCALL | EXCEPTION_FPU_UNAVAILABLE | EXCEPTION_DSI | EXCEPTION_ALIGNMENT));
601
FixupBranch clearInt = J_CC(CC_NZ, true);
602
TEST(32, M((void *)&PowerPC::ppcState.Exceptions), Imm32(EXCEPTION_EXTERNAL_INT));
603
FixupBranch noExtException = J_CC(CC_Z, true);
604
TEST(32, M((void *)&PowerPC::ppcState.msr), Imm32(0x0008000));
605
FixupBranch noExtIntEnable = J_CC(CC_Z, true);
606
TEST(32, M((void *)&ProcessorInterface::m_InterruptCause), Imm32(ProcessorInterface::INT_CAUSE_CP | ProcessorInterface::INT_CAUSE_PE_TOKEN | ProcessorInterface::INT_CAUSE_PE_FINISH));
607
FixupBranch noCPInt = J_CC(CC_Z, true);
609
MOV(32, M(&PC), Imm32(ops[i].address));
610
WriteExternalExceptionExit();
612
SetJumpTarget(noCPInt);
613
SetJumpTarget(noExtIntEnable);
614
SetJumpTarget(noExtException);
615
SetJumpTarget(clearInt);
618
if (Core::g_CoreStartupParameter.bEnableDebugging && breakpoints.IsAddressBreakPoint(ops[i].address) && GetState() != CPU_STEPPING)
620
gpr.Flush(FLUSH_ALL);
621
fpr.Flush(FLUSH_ALL);
623
MOV(32, M(&PC), Imm32(ops[i].address));
624
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckBreakPoints));
625
TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF));
626
FixupBranch noBreakpoint = J_CC(CC_Z);
628
WriteExit(ops[i].address, 0);
629
SetJumpTarget(noBreakpoint);
632
Jit64Tables::CompileInstruction(ops[i]);
634
if (js.memcheck && (opinfo->flags & FL_LOADSTORE))
636
// In case we are about to jump to the dispatcher, flush regs
637
gpr.Flush(FLUSH_ALL);
638
fpr.Flush(FLUSH_ALL);
640
TEST(32, M((void *)&PowerPC::ppcState.Exceptions), Imm32(EXCEPTION_DSI));
641
FixupBranch noMemException = J_CC(CC_Z, true);
643
// If a memory exception occurs, the exception handler will read
644
// from PC. Update PC with the latest value in case that happens.
645
MOV(32, M(&PC), Imm32(ops[i].address));
646
WriteExceptionExit();
647
SetJumpTarget(noMemException);
650
if (opinfo->flags & FL_LOADSTORE)
651
++jit->js.numLoadStoreInst;
653
if (opinfo->flags & FL_USE_FPU)
654
++jit->js.numFloatingPointInst;
657
#if defined(_DEBUG) || defined(DEBUGFAST)
658
if (gpr.SanityCheck() || fpr.SanityCheck())
661
DisassembleGekko(ops[i].inst.hex, em_address, ppcInst, 256);
662
//NOTICE_LOG(DYNA_REC, "Unflushed register: %s", ppcInst);
667
i++; // Skip next instruction
674
u32 function = HLE::GetFunctionIndex(js.blockStart);
677
int type = HLE::GetFunctionTypeByIndex(function);
678
if (type == HLE::HLE_HOOK_END)
680
int flags = HLE::GetFunctionFlagsByIndex(function);
681
if (HLE::IsEnabled(flags))
683
HLEFunction(function);
688
if (memory_exception)
690
// Address of instruction could not be translated
691
MOV(32, M(&NPC), Imm32(js.compilerPC));
693
OR(32, M((void *)&PowerPC::ppcState.Exceptions), Imm32(EXCEPTION_ISI));
695
// Remove the invalid instruction from the icache, forcing a recompile
697
MOV(32, M(jit->GetBlockCache()->GetICachePtr(js.compilerPC)), Imm32(JIT_ICACHE_INVALID_WORD));
699
MOV(64, R(RAX), ImmPtr(jit->GetBlockCache()->GetICachePtr(js.compilerPC)));
700
MOV(32,MatR(RAX),Imm32(JIT_ICACHE_INVALID_WORD));
703
WriteExceptionExit();
708
gpr.Flush(FLUSH_ALL);
709
fpr.Flush(FLUSH_ALL);
710
WriteExit(nextPC, 0);
713
b->flags = js.block_flags;
714
b->codeSize = (u32)(GetCodePtr() - normalEntry);
715
b->originalSize = size;
718
LogGeneratedX86(size, code_buf, normalEntry, b);
724
u32 Jit64::RegistersInUse()
728
for (int i = 0; i < NUMXREGS; i++)
733
result |= (1 << (16 + i));