From b80f0dca4b694c7f7cacca04e09a0b168264e15a Mon Sep 17 00:00:00 2001
From: Willi Ye
Date: Sun, 6 Oct 2024 14:56:01 +0200
Subject: [PATCH] Implement return stack

---
 src/core/graphics/gpu_3d/registers_3d.rs    |   5 +-
 src/jit/assembler/arm/branch_assembler.rs   |  10 +
 src/jit/assembler/block_asm.rs              |  10 +
 src/jit/assembler/block_inst.rs             |   5 +-
 .../thumb/branch_instructions_thumb.rs      |   2 +-
 src/jit/emitter/emit.rs                     |  41 +++-
 src/jit/emitter/emit_branch.rs              | 176 +++++++++++++-----
 src/jit/emitter/thumb/emit_branch_thumb.rs  |  21 ++-
 src/jit/emitter/thumb/emit_thumb.rs         |  15 ++
 src/jit/jit_asm.rs                          |  59 +++---
 src/jit/jit_memory.rs                       |  24 ++-
 src/jit/op.rs                               |  26 +++
 12 files changed, 292 insertions(+), 102 deletions(-)

diff --git a/src/core/graphics/gpu_3d/registers_3d.rs b/src/core/graphics/gpu_3d/registers_3d.rs
index 795d01e3..3c3d0ff9 100644
--- a/src/core/graphics/gpu_3d/registers_3d.rs
+++ b/src/core/graphics/gpu_3d/registers_3d.rs
@@ -500,7 +500,6 @@ impl Gpu3DRegisters {
             let mut param_count = FIFO_PARAM_COUNTS[entry.cmd as usize];
             if param_count > 1 {
                 if param_count as usize > self.cmd_fifo.len() {
-                    refresh_state(self);
                     break;
                 }
 
@@ -567,10 +566,10 @@ impl Gpu3DRegisters {
             if self.cmd_pipe_size as usize > self.cmd_fifo.len() {
                 self.cmd_pipe_size = self.cmd_fifo.len() as u8;
             }
-
-            refresh_state(self);
         }
 
+        refresh_state(self);
+
         if !self.is_cmd_fifo_full() {
             get_cpu_regs_mut!(emu, ARM9).unhalt(1);
         }
diff --git a/src/jit/assembler/arm/branch_assembler.rs b/src/jit/assembler/arm/branch_assembler.rs
index eea0e514..8d26527b 100644
--- a/src/jit/assembler/arm/branch_assembler.rs
+++ b/src/jit/assembler/arm/branch_assembler.rs
@@ -21,6 +21,16 @@ impl B {
             u4::new(cond as u8),
         ))
     }
+
+    pub fn bl(imm: i32, cond: Cond) -> u32 {
+        u32::from(B::new(
+            // Truncate the signed offset to its low 24 bits; bit 23 keeps the sign
+            u24::new((((imm << 8) >> 8) & 0xFFFFFF) as u32),
+            u1::new(1),
+            u3::new(0b101),
+            u4::new(cond as u8),
+        ))
+    }
 }
 
 #[bitsize(32)]
diff --git a/src/jit/assembler/block_asm.rs b/src/jit/assembler/block_asm.rs
index 08112d62..083240db 100644
--- a/src/jit/assembler/block_asm.rs
+++ b/src/jit/assembler/block_asm.rs
@@ -249,18 +249,23 @@ impl<'a> BlockAsm<'a> {
     pub fn load_u8(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
         self.transfer_read(op0, op1, op2, false, MemoryAmount::Byte)
     }
+
     pub fn store_u8(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
         self.transfer_write(op0, op1, op2, false, MemoryAmount::Byte)
     }
+
     pub fn load_u16(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
         self.transfer_read(op0, op1, op2, false, MemoryAmount::Half)
     }
+
     pub fn store_u16(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
         self.transfer_write(op0, op1, op2, false, MemoryAmount::Half)
     }
+
     pub fn load_u32(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
         self.transfer_read(op0, op1, op2, false, MemoryAmount::Word)
     }
+
     pub fn store_u32(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
         self.transfer_write(op0, op1, op2, false, MemoryAmount::Word)
     }
@@ -401,6 +406,11 @@ impl<'a> BlockAsm<'a> {
         self.buf.insts.push(BlockInst::Epilogue);
     }
 
+    pub fn epilogue_previous_block(&mut self) {
+        self.add(BlockReg::Fixed(Reg::SP), BlockReg::Fixed(Reg::SP), ANY_REG_LIMIT as u32 * 4);
+        self.buf.insts.push(BlockInst::Epilogue);
+    }
+
     pub fn call(&mut self, func: impl Into<BlockOperand>) {
         self.call_internal(func, None::<BlockOperand>, None::<BlockOperand>, None::<BlockOperand>, None::<BlockOperand>, true)
     }
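The new `bl` encoder relies on an arithmetic-shift trick: shifting the signed word offset left and then right by 8 discards the upper byte while keeping the two's-complement sign in bit 23, which is exactly the 24-bit immediate the ARM BL encoding wants. A standalone illustration of just that truncation, independent of the `u24`/`bitsize` helpers used above:

    fn truncate_to_24_bits(imm: i32) -> u32 {
        // (imm << 8) >> 8 is an arithmetic shift pair, so the value is
        // sign-extended from bit 23; the mask then keeps the low 24 bits.
        (((imm << 8) >> 8) & 0xFF_FFFF) as u32
    }

    fn main() {
        assert_eq!(truncate_to_24_bits(-2), 0xFF_FFFE); // negative offsets wrap into 24 bits
        assert_eq!(truncate_to_24_bits(8), 0x00_0008);
    }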
diff --git a/src/jit/assembler/block_inst.rs b/src/jit/assembler/block_inst.rs
index f73bd5ec..e396ef7c 100644
--- a/src/jit/assembler/block_inst.rs
+++ b/src/jit/assembler/block_inst.rs
@@ -164,7 +164,7 @@ impl BlockInst {
                 (block_reg_set!(Some(*thread_regs_addr_reg)), outputs)
             }
-            BlockInst::Call { func_reg, args, .. } => {
+            BlockInst::Call { func_reg, args, has_return } => {
                 let mut inputs = BlockRegSet::new();
                 inputs += *func_reg;
                 for arg in args {
@@ -180,7 +180,8 @@ impl BlockInst {
                         Some(BlockReg::Fixed(Reg::R2)),
                         Some(BlockReg::Fixed(Reg::R3)),
                         Some(BlockReg::Fixed(Reg::R12)),
-                        Some(BlockReg::Fixed(Reg::CPSR))
+                        Some(BlockReg::Fixed(Reg::CPSR)),
+                        if *has_return { Some(BlockReg::Fixed(Reg::LR)) } else { None }
                     ),
                 )
             }
diff --git a/src/jit/disassembler/thumb/branch_instructions_thumb.rs b/src/jit/disassembler/thumb/branch_instructions_thumb.rs
index 425cebd2..aaf4a0d5 100644
--- a/src/jit/disassembler/thumb/branch_instructions_thumb.rs
+++ b/src/jit/disassembler/thumb/branch_instructions_thumb.rs
@@ -13,7 +13,7 @@ mod branch_thumb_ops {
     #[inline]
     pub fn blx_reg_t(opcode: u16, op: Op) -> InstInfoThumb {
         let op0 = Reg::from(((opcode >> 3) & 0xF) as u8);
-        InstInfoThumb::new(opcode, op, Operands::new_1(Operand::reg(op0)), reg_reserve!(op0), reg_reserve!(Reg::CPSR), 1)
+        InstInfoThumb::new(opcode, op, Operands::new_1(Operand::reg(op0)), reg_reserve!(op0), reg_reserve!(Reg::LR, Reg::CPSR), 1)
     }
 
     #[inline]
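Both hunks above make the same point to the register allocator: an instruction that links (a returning call, or thumb BLX) defines LR, so any value previously held there dies at that site. A toy model of that def-set bookkeeping — the names `BlockReg`/`CallInst` here are illustrative only, not the emulator's real types:

    #[derive(Clone, Copy, PartialEq, Debug)]
    enum BlockReg { R0, R12, LR, Cpsr }

    struct CallInst { has_return: bool }

    impl CallInst {
        // Caller-saved registers whose old values are destroyed by this call.
        fn outputs(&self) -> Vec<BlockReg> {
            let mut outs = vec![BlockReg::R0, BlockReg::R12, BlockReg::Cpsr];
            if self.has_return {
                // A returning call writes the link register, so the allocator
                // must not keep a live value in LR across the call.
                outs.push(BlockReg::LR);
            }
            outs
        }
    }

    fn main() {
        assert!(CallInst { has_return: true }.outputs().contains(&BlockReg::LR));
        assert!(!CallInst { has_return: false }.outputs().contains(&BlockReg::LR));
    }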
diff --git a/src/jit/emitter/emit.rs b/src/jit/emitter/emit.rs
index 4e11b665..8ec161f6 100644
--- a/src/jit/emitter/emit.rs
+++ b/src/jit/emitter/emit.rs
@@ -1,7 +1,7 @@
 use crate::core::CpuType;
 use crate::core::CpuType::ARM7;
 use crate::jit::assembler::block_asm::BlockAsm;
-use crate::jit::assembler::BlockReg;
+use crate::jit::assembler::{BlockLabel, BlockReg};
 use crate::jit::inst_thread_regs_handler::{register_restore_spsr, restore_thumb_after_restore_spsr, set_pc_arm_mode};
 use crate::jit::jit_asm::{JitAsm, JitRuntimeData};
 use crate::jit::op::Op;
@@ -60,6 +60,16 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
             block_asm.call(restore_thumb_after_restore_spsr::<CPU> as *const ());
         }
 
+        if (op.is_mov() && self.jit_buf.current_inst().src_regs.is_reserved(Reg::LR) && !self.jit_buf.current_inst().out_regs.is_reserved(Reg::CPSR))
+            || (op.is_multiple_mem_transfer() && *self.jit_buf.current_inst().operands()[0].as_reg_no_shift().unwrap() == Reg::SP)
+            || (op.is_single_mem_transfer() && self.jit_buf.current_inst().src_regs.is_reserved(Reg::SP))
+        {
+            let guest_pc_reg = block_asm.new_reg();
+            block_asm.load_u32(guest_pc_reg, block_asm.thread_regs_addr_reg, Reg::PC as u32 * 4);
+            self.emit_branch_return_stack_common(block_asm, guest_pc_reg);
+            block_asm.free_reg(guest_pc_reg);
+        }
+
         self.emit_branch_out_metadata(block_asm);
         block_asm.epilogue();
     }
@@ -73,7 +83,7 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
         let accumulated_cycles_reg = block_asm.new_reg();
         block_asm.load_u16(accumulated_cycles_reg, runtime_data_addr_reg, JitRuntimeData::get_accumulated_cycles_offset() as u32);
-        
+
         // +2 for branching
         block_asm.add(
             result_accumulated_cycles_reg,
@@ -127,10 +137,11 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
         self._emit_branch_out_metadata(block_asm, true, true)
     }
 
-    pub fn emit_flush_cycles<ContinueFn: FnOnce(&mut Self, &mut BlockAsm, BlockReg), BreakoutFn: FnOnce(&mut Self, &mut BlockAsm)>(
+    pub fn emit_flush_cycles<ContinueFn: FnOnce(&mut Self, &mut BlockAsm, BlockReg, BlockLabel), BreakoutFn: FnOnce(&mut Self, &mut BlockAsm)>(
         &mut self,
         block_asm: &mut BlockAsm,
-        target_pre_cycle_count_sum: u16,
+        target_pre_cycle_count_sum: Option<u16>,
+        add_continue_label: bool,
         continue_fn: ContinueFn,
         breakout_fn: BreakoutFn,
     ) {
@@ -140,7 +151,7 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
         let result_accumulated_cycles_reg = block_asm.new_reg();
         self.emit_count_cycles(block_asm, runtime_data_addr_reg, result_accumulated_cycles_reg);
 
-        const MAX_LOOP_CYCLE_COUNT: u32 = 255;
+        const MAX_LOOP_CYCLE_COUNT: u32 = 127;
         block_asm.cmp(
             result_accumulated_cycles_reg,
             match CPU {
             },
         );
 
+        let continue_label = if add_continue_label { Some(block_asm.new_label()) } else { None };
         let breakout_label = block_asm.new_label();
         block_asm.branch(breakout_label, Cond::HS);
 
-        let target_pre_cycle_count_sum_reg = block_asm.new_reg();
-        block_asm.mov(target_pre_cycle_count_sum_reg, target_pre_cycle_count_sum as u32);
-        block_asm.store_u16(target_pre_cycle_count_sum_reg, runtime_data_addr_reg, JitRuntimeData::get_pre_cycle_count_sum_offset() as u32);
-        continue_fn(self, block_asm, runtime_data_addr_reg);
+        if let Some(target_pre_cycle_count_sum) = target_pre_cycle_count_sum {
+            let target_pre_cycle_count_sum_reg = block_asm.new_reg();
+            block_asm.mov(target_pre_cycle_count_sum_reg, target_pre_cycle_count_sum as u32);
+            block_asm.store_u16(target_pre_cycle_count_sum_reg, runtime_data_addr_reg, JitRuntimeData::get_pre_cycle_count_sum_offset() as u32);
+            block_asm.free_reg(target_pre_cycle_count_sum_reg);
+        }
+        continue_fn(self, block_asm, runtime_data_addr_reg, breakout_label);
+        if add_continue_label {
+            block_asm.branch(continue_label.unwrap(), Cond::AL);
+        }
 
         block_asm.label(breakout_label);
         breakout_fn(self, block_asm);
 
-        block_asm.free_reg(target_pre_cycle_count_sum_reg);
+        if add_continue_label {
+            block_asm.label(continue_label.unwrap());
+        }
+
         block_asm.free_reg(result_accumulated_cycles_reg);
         block_asm.free_reg(runtime_data_addr_reg);
     }
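The reworked `emit_flush_cycles` now has two distinct exits: the continue path (cycle budget not yet exhausted) and the breakout path, which `continue_fn` itself can also jump to now that it receives the breakout label — that is what lets the return-stack code below bail out on a misprediction. Modeled as plain host-side control flow (a sketch of the emitted shape only, not the emitter's real API):

    // `continue_body` returning Err(()) stands for "branch to breakout_label".
    fn flush_cycles_shape(
        accumulated_cycles: u32,
        max_loop_cycle_count: u32, // lowered from 255 to 127 by this patch
        target_pre_cycle_count_sum: Option<u16>,
        continue_body: impl FnOnce() -> Result<(), ()>,
        breakout_body: impl FnOnce(),
    ) {
        // cmp + branch(HS): continue only while below the budget
        if accumulated_cycles < max_loop_cycle_count {
            if let Some(_sum) = target_pre_cycle_count_sum {
                // the real emitter stores this into JitRuntimeData::pre_cycle_count_sum
            }
            if continue_body().is_ok() {
                return; // reaches the continue label, skipping the breakout body
            }
        }
        breakout_body();
    }

    fn main() {
        flush_cycles_shape(10, 127, Some(0), || Ok(()), || unreachable!());
    }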
diff --git a/src/jit/emitter/emit_branch.rs b/src/jit/emitter/emit_branch.rs
index 063a4f6e..7d16bb52 100644
--- a/src/jit/emitter/emit_branch.rs
+++ b/src/jit/emitter/emit_branch.rs
@@ -70,11 +70,16 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
 
         if op == Op::Bl {
             block_asm.mov(Reg::LR, self.jit_buf.current_pc + 4);
+            let target_pc_reg = block_asm.new_reg();
+            block_asm.mov(target_pc_reg, target_pc);
+            self.emit_branch_reg_common(block_asm, target_pc_reg, true);
+            block_asm.free_reg(target_pc_reg);
+        } else {
+            self.emit_branch_label_common::<false>(block_asm, target_pc, inst_info.cond);
         }
-
-        self.emit_branch_label_common::<false>(block_asm, target_pc, inst_info.cond, op == Op::Bl);
     }
 
-    pub fn emit_branch_label_common<const THUMB: bool>(&mut self, block_asm: &mut BlockAsm, target_pc: u32, cond: Cond, has_lr_return: bool) {
+    pub fn emit_branch_label_common<const THUMB: bool>(&mut self, block_asm: &mut BlockAsm, target_pc: u32, cond: Cond) {
         match Self::analyze_branch_label::<THUMB>(&self.jit_buf.insts, self.jit_buf.current_index, cond, self.jit_buf.current_pc, target_pc) {
             JitBranchInfo::Local(target_index) => {
                 let target_pre_cycle_count_sum = self.jit_buf.insts_cycle_counts[target_index] - self.jit_buf.insts[target_index].cycle as u16;
@@ -84,11 +89,9 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
 
                 self.emit_flush_cycles(
                     block_asm,
-                    target_pre_cycle_count_sum,
-                    |asm, block_asm, runtime_data_addr_reg| {
-                        if has_lr_return {
-                            asm.emit_return_stack_write(block_asm, runtime_data_addr_reg);
-                        }
+                    Some(target_pre_cycle_count_sum),
+                    false,
+                    |asm, block_asm, _, _| {
                         if DEBUG_LOG {
                             block_asm.call2(Self::debug_branch_label as *const (), asm.jit_buf.current_pc, target_pc);
                         }
@@ -116,7 +119,7 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
             JitBranchInfo::None => {
                 let target_pc_reg = block_asm.new_reg();
                 block_asm.mov(target_pc_reg, target_pc);
-                self.emit_branch_reg_common(block_asm, target_pc_reg, has_lr_return);
+                self.emit_branch_reg_common(block_asm, target_pc_reg, false);
                 block_asm.free_reg(target_pc_reg);
             }
         }
@@ -129,8 +132,66 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
         block_asm.mov(Reg::PC, target_pc_reg);
         block_asm.save_context();
 
-        self.emit_branch_out_metadata(block_asm);
-        block_asm.epilogue();
+        if target_pc_reg == Reg::LR {
+            self.emit_branch_return_stack_common(block_asm, target_pc_reg.into());
+        } else {
+            self.emit_branch_out_metadata(block_asm);
+            block_asm.epilogue();
+        }
+    }
+
+    pub fn emit_branch_return_stack_common(&mut self, block_asm: &mut BlockAsm, target_pc_reg: BlockReg) {
+        self.emit_flush_cycles(
+            block_asm,
+            None,
+            false,
+            |asm, block_asm, runtime_data_addr_reg, breakout_label| {
+                let return_stack_ptr_reg = block_asm.new_reg();
+
+                // block_asm.bkpt(1);
+
+                block_asm.load_u8(return_stack_ptr_reg, runtime_data_addr_reg, JitRuntimeData::get_return_stack_ptr_offset() as u32);
+                block_asm.cmp(return_stack_ptr_reg, 0);
+                block_asm.branch(breakout_label, Cond::EQ);
+
+                block_asm.sub(return_stack_ptr_reg, return_stack_ptr_reg, 1);
+
+                let return_stack_reg = block_asm.new_reg();
+                block_asm.add(return_stack_reg, runtime_data_addr_reg, JitRuntimeData::get_return_stack_offset() as u32);
+
+                let desired_lr_reg = block_asm.new_reg();
+                block_asm.load_u32(desired_lr_reg, return_stack_reg, (return_stack_ptr_reg.into(), ShiftType::Lsl, BlockOperand::from(2)));
+
+                let aligned_target_pc_reg = block_asm.new_reg();
+                let thumb_bit_mask_reg = block_asm.new_reg();
+                block_asm.and(thumb_bit_mask_reg, target_pc_reg, 1);
+                Self::emit_align_guest_pc(block_asm, target_pc_reg, aligned_target_pc_reg);
+                block_asm.orr(aligned_target_pc_reg, aligned_target_pc_reg, thumb_bit_mask_reg);
+
+                block_asm.cmp(desired_lr_reg, aligned_target_pc_reg);
+                block_asm.branch(breakout_label, Cond::NE);
+
+                block_asm.store_u8(return_stack_ptr_reg, runtime_data_addr_reg, JitRuntimeData::get_return_stack_ptr_offset() as u32);
+
+                Self::emit_set_cpsr_thumb_bit(block_asm, aligned_target_pc_reg);
+
+                if DEBUG_LOG {
+                    block_asm.call2(Self::debug_branch_lr as *const (), asm.jit_buf.current_pc, aligned_target_pc_reg);
+                }
+
+                block_asm.epilogue_previous_block();
+
+                block_asm.free_reg(thumb_bit_mask_reg);
+                block_asm.free_reg(aligned_target_pc_reg);
+                block_asm.free_reg(desired_lr_reg);
+                block_asm.free_reg(return_stack_reg);
+                block_asm.free_reg(return_stack_ptr_reg);
+            },
+            |asm, block_asm| {
+                asm.emit_branch_out_metadata(block_asm);
+                block_asm.epilogue();
+            },
+        )
     }
 
     pub fn emit_blx(&mut self, block_asm: &mut BlockAsm) {
@@ -141,7 +202,7 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
 
         self.emit_branch_reg_common(block_asm, target_pc_reg.into(), true);
     }
 
-    fn emit_return_stack_write(&mut self, block_asm: &mut BlockAsm, runtime_data_addr_reg: BlockReg) {
+    fn emit_return_stack_write_desired_lr(&mut self, block_asm: &mut BlockAsm, runtime_data_addr_reg: BlockReg) {
         let return_stack_ptr_reg = block_asm.new_reg();
         block_asm.load_u8(return_stack_ptr_reg, runtime_data_addr_reg, JitRuntimeData::get_return_stack_ptr_offset() as u32);
 
@@ -149,58 +210,48 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
         let return_stack_reg = block_asm.new_reg();
         block_asm.add(return_stack_reg, runtime_data_addr_reg, JitRuntimeData::get_return_stack_offset() as u32);
-        block_asm.add(return_stack_reg, return_stack_reg, (return_stack_ptr_reg.into(), ShiftType::Lsl, BlockOperand::from(3)));
-        block_asm.store_u32(Reg::LR, return_stack_reg, 0);
+        block_asm.store_u32(Reg::LR, return_stack_reg, (return_stack_ptr_reg.into(), ShiftType::Lsl, BlockOperand::from(2)));
-        let return_pre_cycle_count_sum_reg = block_asm.new_reg();
-        block_asm.mov(return_pre_cycle_count_sum_reg, self.jit_buf.insts_cycle_counts[self.jit_buf.current_index] as u32);
-        block_asm.store_u16(return_pre_cycle_count_sum_reg, return_stack_reg, 4);
+        if DEBUG_LOG {
+            block_asm.call3(Self::debug_push_return_stack as *const (), self.jit_buf.current_pc, Reg::LR, return_stack_ptr_reg);
+        }
 
         block_asm.add(return_stack_ptr_reg, return_stack_ptr_reg, 1);
         block_asm.store_u8(return_stack_ptr_reg, runtime_data_addr_reg, JitRuntimeData::get_return_stack_ptr_offset() as u32);
 
-        block_asm.free_reg(return_pre_cycle_count_sum_reg);
         block_asm.free_reg(return_stack_reg);
         block_asm.free_reg(return_stack_ptr_reg);
     }
 
+    fn emit_return_write_pre_cycle_count_sum(&mut self, block_asm: &mut BlockAsm, runtime_data_addr_reg: BlockReg) {
+        let total_cycles = self.jit_buf.insts_cycle_counts[self.jit_buf.current_index];
+        let total_cycles_reg = block_asm.new_reg();
+        block_asm.mov(total_cycles_reg, total_cycles as u32);
+        block_asm.store_u16(total_cycles_reg, runtime_data_addr_reg, JitRuntimeData::get_pre_cycle_count_sum_offset() as u32);
+        block_asm.free_reg(total_cycles_reg);
+    }
+
     pub fn emit_branch_reg_common(&mut self, block_asm: &mut BlockAsm, target_pc_reg: BlockReg, has_lr_return: bool) {
         block_asm.mov(Reg::PC, target_pc_reg);
         block_asm.save_context();
 
         self.emit_flush_cycles(
             block_asm,
-            0,
-            |asm, block_asm, runtime_data_addr_reg| {
+            Some(0),
+            true,
+            |asm, block_asm, runtime_data_addr_reg, _| {
                 if has_lr_return {
-                    asm.emit_return_stack_write(block_asm, runtime_data_addr_reg);
+                    asm.emit_return_stack_write_desired_lr(block_asm, runtime_data_addr_reg);
                 }
 
-                let cpsr_reg = block_asm.new_reg();
-                let addr_mask_reg = block_asm.new_reg();
-
-                // Set thumb bit
-                block_asm.load_u32(cpsr_reg, block_asm.thread_regs_addr_reg, Reg::CPSR as u32 * 4);
-                block_asm.bfi(cpsr_reg, target_pc_reg, 5, 1);
-                block_asm.store_u32(cpsr_reg, block_asm.thread_regs_addr_reg, Reg::CPSR as u32 * 4);
-
-                block_asm.free_reg(cpsr_reg);
+                Self::emit_set_cpsr_thumb_bit(block_asm, target_pc_reg);
 
                 if DEBUG_LOG {
-                    block_asm.call2(Self::debug_branch_reg as *const (), asm.jit_buf.current_pc, target_pc_reg);
+                    // block_asm.call2(Self::debug_branch_reg as *const (), asm.jit_buf.current_pc, target_pc_reg);
                 }
 
-                let target_addr_reg = block_asm.new_reg();
-
-                // Align pc to !1 or !3
-                // let thumb = (target_pc & 1) == 1;
-                // let addr_mask = !(1 | ((!thumb as u32) << 1));
-                // let target_addr = target_pc & addr_mask;
-                block_asm.mvn(addr_mask_reg, 3);
-                block_asm.orr(addr_mask_reg, addr_mask_reg, (target_pc_reg.into(), ShiftType::Lsl, BlockOperand::from(1)));
-                block_asm.and(target_addr_reg, target_pc_reg, addr_mask_reg);
-
-                block_asm.free_reg(addr_mask_reg);
+                let aligned_target_reg = block_asm.new_reg();
+                Self::emit_align_guest_pc(block_asm, target_pc_reg, aligned_target_reg);
 
                 let map_ptr = get_jit!(asm.emu).jit_memory_map.get_map_ptr::<CPU>();
 
                 let map_ptr_reg = block_asm.new_reg();
                 let map_index_reg = block_asm.new_reg();
                 let map_entry_base_ptr_reg = block_asm.new_reg();
 
                 block_asm.mov(map_ptr_reg, map_ptr as u32);
-                block_asm.mov(map_index_reg, (target_addr_reg.into(), ShiftType::Lsr, BlockOperand::from(jit_memory_map::BLOCK_SHIFT as u32 + 1)));
+                block_asm.mov(map_index_reg, (aligned_target_reg.into(), ShiftType::Lsr, BlockOperand::from(jit_memory_map::BLOCK_SHIFT as u32 + 1)));
                 block_asm.load_u32(map_entry_base_ptr_reg, map_ptr_reg, (map_index_reg.into(), ShiftType::Lsl, BlockOperand::from(2)));
 
                 let block_size_mask_reg = map_index_reg;
                block_asm.mov(block_size_mask_reg, (jit_memory_map::BLOCK_SIZE as u32 - 1) << 2);
-                block_asm.and(target_addr_reg, block_size_mask_reg, (target_addr_reg.into(), ShiftType::Lsl, BlockOperand::from(1)));
+                block_asm.and(aligned_target_reg, block_size_mask_reg, (aligned_target_reg.into(), ShiftType::Lsl, BlockOperand::from(1)));
 
                 let entry_fn_reg = block_asm.new_reg();
-                block_asm.load_u32(entry_fn_reg, map_entry_base_ptr_reg, target_addr_reg);
+                block_asm.load_u32(entry_fn_reg, map_entry_base_ptr_reg, aligned_target_reg);
 
+                block_asm.call1(entry_fn_reg, 0);
                 if has_lr_return {
-                    block_asm.call1(entry_fn_reg, 0);
+                    asm.emit_return_write_pre_cycle_count_sum(block_asm, runtime_data_addr_reg);
+                    block_asm.restore_reg(Reg::CPSR);
                 } else {
-                    block_asm.call1_no_return(entry_fn_reg, 0);
+                    block_asm.epilogue_previous_block();
                 }
 
                 block_asm.free_reg(entry_fn_reg);
                 block_asm.free_reg(map_entry_base_ptr_reg);
                 block_asm.free_reg(map_index_reg);
                 block_asm.free_reg(map_ptr_reg);
-                block_asm.free_reg(target_addr_reg);
+                block_asm.free_reg(aligned_target_reg);
             },
             |asm, block_asm| {
                 asm.emit_branch_out_metadata_no_count_cycles(block_asm);
@@ -254,6 +307,31 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
         block_asm.free_reg(target_pc_reg);
     }
 
+    fn emit_set_cpsr_thumb_bit(block_asm: &mut BlockAsm, guest_pc_reg: BlockReg) {
+        let cpsr_reg = block_asm.new_reg();
+        block_asm.load_u32(cpsr_reg, block_asm.thread_regs_addr_reg, Reg::CPSR as u32 * 4);
+        block_asm.bfi(cpsr_reg, guest_pc_reg, 5, 1);
+        block_asm.store_u32(cpsr_reg, block_asm.thread_regs_addr_reg, Reg::CPSR as u32 * 4);
+        block_asm.free_reg(cpsr_reg);
+    }
+
+    fn emit_align_guest_pc(block_asm: &mut BlockAsm, guest_pc_reg: BlockReg, aligned_guest_pc_reg: BlockReg) {
+        // Align pc to 2 or 4
+        // let thumb = (guest_pc & 1) == 1;
+        // let addr_mask = !(1 | ((!thumb as u32) << 1));
+        // let aligned_guest_pc = guest_pc & addr_mask;
+
+        let addr_mask_reg = block_asm.new_reg();
+        block_asm.mvn(addr_mask_reg, 3);
+        block_asm.orr(addr_mask_reg, addr_mask_reg, (guest_pc_reg.into(), ShiftType::Lsl, BlockOperand::from(1)));
+        block_asm.and(aligned_guest_pc_reg, guest_pc_reg, addr_mask_reg);
+        block_asm.free_reg(addr_mask_reg);
+    }
+
+    extern "C" fn debug_push_return_stack(current_pc: u32, lr_pc: u32, stack_size: u8) {
+        println!("{CPU:?} push {lr_pc:x} to return stack with size {stack_size} at {current_pc:x}")
+    }
+
     extern "C" fn debug_branch_label(current_pc: u32, target_pc: u32) {
         println!("{CPU:?} branch label from {current_pc:x} to {target_pc:x}")
     }
@@ -261,4 +339,8 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
     extern "C" fn debug_branch_reg(current_pc: u32, target_pc: u32) {
         println!("{CPU:?} branch reg from {current_pc:x} to {target_pc:x}")
     }
+
+    extern "C" fn debug_branch_lr(current_pc: u32, target_pc: u32) {
+        println!("{CPU:?} branch lr from {current_pc:x} to {target_pc:x}")
+    }
 }
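`emit_align_guest_pc` implements the commented formula branchlessly: bit 0 of the guest pc (the thumb flag) decides whether one or two low bits are cleared. The same math in plain Rust, as a quick check of the mvn/orr/and sequence (the function name here is just for illustration):

    fn align_guest_pc(guest_pc: u32) -> u32 {
        let thumb = (guest_pc & 1) == 1;
        // !1 when thumb (2-byte aligned), !3 when arm (4-byte aligned)
        let addr_mask = !(1 | ((!thumb as u32) << 1));
        guest_pc & addr_mask
    }

    fn main() {
        assert_eq!(align_guest_pc(0x0800_0001), 0x0800_0000); // thumb
        assert_eq!(align_guest_pc(0x0800_0006), 0x0800_0004); // arm
    }

The emitted version gets the same effect with three instructions: `mvn mask, 3` builds `!3`, and `orr mask, mask, pc, lsl #1` sets bit 1 of the mask exactly when pc bit 0 is set, turning `!3` into `!1` for thumb targets.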
diff --git a/src/jit/emitter/thumb/emit_branch_thumb.rs b/src/jit/emitter/thumb/emit_branch_thumb.rs
index 0e1fe5c6..26ef23f1 100644
--- a/src/jit/emitter/thumb/emit_branch_thumb.rs
+++ b/src/jit/emitter/thumb/emit_branch_thumb.rs
@@ -32,7 +32,7 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
         };
         block_asm.start_cond_block(cond);
-        self.emit_branch_label_common::<true>(block_asm, target_pc | 1, cond, false);
+        self.emit_branch_label_common::<true>(block_asm, target_pc | 1, cond);
         block_asm.end_cond_block();
     }
 
@@ -57,18 +57,25 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
         }
 
         block_asm.mov(Reg::LR, (self.jit_buf.current_pc + 2) | 1);
-        self.emit_branch_label_common::<true>(block_asm, target_pc, Cond::AL, true);
+        let target_pc_reg = block_asm.new_reg();
+        block_asm.mov(target_pc_reg, target_pc);
+        self.emit_branch_reg_common(block_asm, target_pc_reg, true);
+        block_asm.free_reg(target_pc_reg);
     }
 
     pub fn emit_bx_thumb(&mut self, block_asm: &mut BlockAsm) {
         let inst_info = self.jit_buf.current_inst();
+        let target_pc_reg = *inst_info.operands()[0].as_reg_no_shift().unwrap();
 
-        let op0 = *inst_info.operands()[0].as_reg_no_shift().unwrap();
-
-        block_asm.mov(Reg::PC, op0);
+        block_asm.mov(Reg::PC, target_pc_reg);
         block_asm.save_context();
-        self.emit_branch_out_metadata(block_asm);
-        block_asm.epilogue();
+
+        if target_pc_reg == Reg::LR {
+            self.emit_branch_return_stack_common(block_asm, target_pc_reg.into());
+        } else {
+            self.emit_branch_out_metadata(block_asm);
+            block_asm.epilogue();
+        }
     }
 
     pub fn emit_blx_thumb(&mut self, block_asm: &mut BlockAsm) {
diff --git a/src/jit/emitter/thumb/emit_thumb.rs b/src/jit/emitter/thumb/emit_thumb.rs
index 82a74e31..c7714b89 100644
--- a/src/jit/emitter/thumb/emit_thumb.rs
+++ b/src/jit/emitter/thumb/emit_thumb.rs
@@ -76,6 +76,21 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
             block_asm.call(set_pc_thumb_mode::<CPU> as *const ());
         }
 
+        // R9 can be used as a substitute for SP for branch prediction
+        if (op == Op::MovHT && self.jit_buf.current_inst().src_regs.is_reserved(Reg::LR))
+            || (op.is_multiple_mem_transfer()
+                && match *self.jit_buf.current_inst().operands()[0].as_reg_no_shift().unwrap() {
+                    Reg::R9 | Reg::SP => true,
+                    _ => false,
+                })
+            || (op.is_single_mem_transfer() && (self.jit_buf.current_inst().src_regs.is_reserved(Reg::R9) || self.jit_buf.current_inst().src_regs.is_reserved(Reg::SP)))
+        {
+            let guest_pc_reg = block_asm.new_reg();
+            block_asm.load_u32(guest_pc_reg, block_asm.thread_regs_addr_reg, Reg::PC as u32 * 4);
+            self.emit_branch_return_stack_common(block_asm, guest_pc_reg);
+            block_asm.free_reg(guest_pc_reg);
+        }
+
         self.emit_branch_out_metadata(block_asm);
         block_asm.epilogue();
     }
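The ARM and thumb emitters share one heuristic for deciding when a PC write should consult the return stack: `mov pc, lr`-style moves (excluding flag-setting variants, which restore SPSR on exception return), `pop {.., pc}` through SP — or R9, which per the comment above can stand in for SP — and single loads addressed off the stack pointer. A condensed model of that predicate, using a hypothetical `Inst` type rather than the emitter's real data structures:

    #[derive(Clone, Copy)]
    enum Kind { Mov, MultipleMemTransfer, SingleMemTransfer, Other }

    struct Inst {
        kind: Kind,
        sets_cpsr: bool,
        src_lr: bool,   // LR appears as a source operand
        base_sp: bool,  // addressing uses SP (or R9 in the thumb variant)
    }

    // True when the emitted PC write should try the return-stack fast path.
    fn is_return_like(inst: &Inst) -> bool {
        match inst.kind {
            Kind::Mov => inst.src_lr && !inst.sets_cpsr, // mov pc, lr
            Kind::MultipleMemTransfer => inst.base_sp,   // pop {.., pc}
            Kind::SingleMemTransfer => inst.base_sp,     // ldr pc, [sp, ...]
            Kind::Other => false,
        }
    }

    fn main() {
        let pop_pc = Inst { kind: Kind::MultipleMemTransfer, sets_cpsr: false, src_lr: false, base_sp: true };
        assert!(is_return_like(&pop_pc));
    }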
diff --git a/src/jit/jit_asm.rs b/src/jit/jit_asm.rs
index 634cea62..e1b1403d 100644
--- a/src/jit/jit_asm.rs
+++ b/src/jit/jit_asm.rs
@@ -11,8 +11,8 @@ use crate::jit::reg::Reg;
 use crate::jit::reg::{reg_reserve, RegReserve};
 use crate::logging::debug_println;
 use crate::{get_jit_asm_ptr, DEBUG_LOG, DEBUG_LOG_BRANCH_OUT};
-use static_assertions::const_assert_eq;
 use std::cell::UnsafeCell;
+use std::intrinsics::unlikely;
 use std::{mem, ptr};
 
 pub struct JitBuf {
@@ -46,15 +46,6 @@ impl JitBuf {
     }
 }
 
-#[repr(C)]
-#[derive(Copy, Clone, Default)]
-pub struct JitBlockLinkData {
-    pub desired_lr: u32,
-    pub return_pre_cycle_count_sum: u16,
-}
-
-const_assert_eq!(size_of::<JitBlockLinkData>(), 8);
-
 pub const RETURN_STACK_SIZE: usize = 32;
 
 #[repr(C)]
@@ -65,7 +56,7 @@ pub struct JitRuntimeData {
     pub idle_loop: bool,
     pub host_sp: usize,
     pub return_stack_ptr: u8,
-    pub return_stack: [JitBlockLinkData; RETURN_STACK_SIZE],
+    pub return_stack: [u32; RETURN_STACK_SIZE],
 }
 
 impl JitRuntimeData {
@@ -77,9 +68,9 @@ impl JitRuntimeData {
             idle_loop: false,
             host_sp: 0,
             return_stack_ptr: 0,
-            return_stack: [JitBlockLinkData::default(); RETURN_STACK_SIZE],
+            return_stack: [0; RETURN_STACK_SIZE],
         };
-        assert_eq!(size_of_val(&instance.return_stack), 32 * 8);
+        assert_eq!(size_of_val(&instance.return_stack), RETURN_STACK_SIZE * 4);
         instance
     }
 
@@ -117,10 +108,12 @@
 }
 
 pub extern "C" fn emit_code_block<const CPU: CpuType>(store_host_sp: bool) {
-    let asm = unsafe { get_jit_asm_ptr::<CPU>().as_mut().unwrap_unchecked() };
+    let (guest_pc, thumb) = {
+        let asm = unsafe { get_jit_asm_ptr::<CPU>().as_mut().unwrap_unchecked() };
 
-    let guest_pc = get_regs!(asm.emu, CPU).pc;
-    let thumb = (guest_pc & 1) == 1;
+        let guest_pc = get_regs!(asm.emu, CPU).pc;
+        (guest_pc, (guest_pc & 1) == 1)
+    };
     if thumb {
         emit_code_block_internal::<CPU, true>(store_host_sp, guest_pc & !1)
     } else {
@@ -166,27 +159,40 @@ fn emit_code_block_internal<const CPU: CpuType, const THUMB: bool>(store_host_sp: bool, guest_pc: u32) {
     }
 
     let jit_entry = {
-        // unsafe { BLOCK_LOG = guest_pc == 0x2000800 };
+        // unsafe { BLOCK_LOG = guest_pc == 0x20b2688 };
 
         let guest_regs_ptr = get_regs_mut!(asm.emu, CPU).get_reg_mut_ptr();
         let mut block_asm = unsafe { (*asm.block_asm_buf.get()).new_asm(guest_regs_ptr, ptr::addr_of_mut!((*asm).runtime_data.host_sp)) };
 
+        if DEBUG_LOG {
+            block_asm.call1(debug_enter_block::<CPU> as *const (), guest_pc | (THUMB as u32));
+            block_asm.restore_reg(Reg::CPSR);
+        }
+
+        // if guest_pc == 0x20b2688 {
+        //     block_asm.bkpt(2);
+        // }
+
         for i in 0..asm.jit_buf.insts.len() {
             asm.jit_buf.current_index = i;
             asm.jit_buf.current_pc = guest_pc + (i << if THUMB { 1 } else { 2 }) as u32;
             debug_println!("{CPU:?} emitting {:?} at pc: {:x}", asm.jit_buf.current_inst(), asm.jit_buf.current_pc);
 
+            // if asm.jit_buf.current_pc == 0x20216e2 {
+            //     block_asm.bkpt(1);
+            // }
+
             if THUMB {
                 asm.emit_thumb(&mut block_asm);
             } else {
                 asm.emit(&mut block_asm);
             }
 
-            if DEBUG_LOG {
-                block_asm.save_context();
-                block_asm.call2(debug_after_exec_op::<CPU> as *const (), asm.jit_buf.current_pc, asm.jit_buf.current_inst().opcode);
-                block_asm.restore_reg(Reg::CPSR);
-            }
+            // if DEBUG_LOG {
+            //     block_asm.save_context();
+            //     block_asm.call2(debug_after_exec_op::<CPU> as *const (), asm.jit_buf.current_pc, asm.jit_buf.current_inst().opcode);
+            //     block_asm.restore_reg(Reg::CPSR);
+            // }
         }
 
         let opcodes = block_asm.finalize(guest_pc, THUMB);
@@ -196,7 +202,10 @@ fn emit_code_block_internal<const CPU: CpuType, const THUMB: bool>(store_host_sp: bool, guest_pc: u32) {
             }
             todo!()
         }
-        let insert_entry = get_jit_mut!(asm.emu).insert_block::<CPU>(&opcodes, guest_pc);
+        let (insert_entry, flushed) = get_jit_mut!(asm.emu).insert_block::<CPU>(&opcodes, guest_pc);
+        if unlikely(flushed) {
+            asm.runtime_data.return_stack_ptr = 0;
+        }
 
         let jit_entry: extern "C" fn(bool) = unsafe { mem::transmute(insert_entry) };
         if DEBUG_LOG {
@@ -309,3 +318,7 @@ unsafe extern "C" fn debug_after_exec_op<const CPU: CpuType>(pc: u32, opcode: u32) {
 
     debug_inst_info::<CPU>(get_regs!((*asm).emu, CPU), pc, &format!("\n\t{:?} {:?}", CPU, inst_info));
 }
+
+extern "C" fn debug_enter_block<const CPU: CpuType>(pc: u32) {
+    println!("{CPU:?} execute {pc:x}")
+}
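With the per-entry cycle sums gone, the runtime side of the return stack is just 32 packed `u32` desired-LR values (bit 0 carrying the thumb state, as in guest LR) plus a byte index. A host-side model of the push/pop discipline the emitted code follows — a sketch only, ignoring the JIT's cycle bookkeeping; bounds handling beyond the 32 entries is not visible in these hunks:

    const RETURN_STACK_SIZE: usize = 32;

    #[derive(Default)]
    struct ReturnStack {
        ptr: u8,
        slots: [u32; RETURN_STACK_SIZE],
    }

    impl ReturnStack {
        // bl/blx path: record the pc the callee is expected to return to.
        fn push(&mut self, desired_lr: u32) {
            self.slots[self.ptr as usize % RETURN_STACK_SIZE] = desired_lr;
            self.ptr += 1;
        }

        // Return path: pop only on a hit; on empty or mismatch the emitted
        // code branches to the breakout label and leaves the stack as-is.
        fn pop_if_match(&mut self, target_pc: u32) -> bool {
            if self.ptr == 0 {
                return false;
            }
            let top = self.slots[(self.ptr - 1) as usize % RETURN_STACK_SIZE];
            if top == target_pc {
                self.ptr -= 1;
                true
            } else {
                false
            }
        }

        // insert_block reports `flushed` when JIT memory was reset; stale
        // entries would point into freed blocks, so the stack is cleared.
        fn reset(&mut self) {
            self.ptr = 0;
        }
    }

    fn main() {
        let mut rs = ReturnStack::default();
        rs.push(0x0200_1001);
        assert!(rs.pop_if_match(0x0200_1001));
        assert!(!rs.pop_if_match(0x0200_1001)); // now empty
        rs.reset();
    }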
diff --git a/src/jit/jit_memory.rs b/src/jit/jit_memory.rs
index b9e60696..819d19da 100644
--- a/src/jit/jit_memory.rs
+++ b/src/jit/jit_memory.rs
@@ -131,9 +131,14 @@ impl JitMemory {
         }
     }
 
-    fn allocate_block(&mut self, required_size: usize, insert_at_end: bool) -> usize {
+    fn allocate_block(&mut self, required_size: usize, insert_at_end: bool) -> (usize, bool) {
+        let mut flushed = false;
         let free_size = self.mem_end - self.mem_start;
         if free_size < required_size {
+            debug_println!("Jit memory reset");
+
+            flushed = true;
+
             self.mem_start = 0;
             self.mem_end = JIT_MEMORY_SIZE;
 
@@ -148,28 +153,28 @@ impl JitMemory {
 
         if insert_at_end {
             self.mem_end -= required_size;
-            self.mem_end
+            (self.mem_end, flushed)
         } else {
             let addr = self.mem_start;
             self.mem_start += required_size;
-            addr
+            (addr, flushed)
         }
     }
 
-    fn insert(&mut self, opcodes: &[u32], insert_at_end: bool) -> (usize, usize) {
+    fn insert(&mut self, opcodes: &[u32], insert_at_end: bool) -> (usize, usize, bool) {
         let aligned_size = utils::align_up(size_of_val(opcodes), *PAGE_SIZE);
-        let allocated_offset_addr = self.allocate_block(aligned_size, insert_at_end);
+        let (allocated_offset_addr, flushed) = self.allocate_block(aligned_size, insert_at_end);
 
         utils::write_to_mem_slice(&mut self.mem, allocated_offset_addr, opcodes);
         self.flush_cache(allocated_offset_addr, aligned_size);
 
-        (allocated_offset_addr, aligned_size)
+        (allocated_offset_addr, aligned_size, flushed)
     }
 
-    pub fn insert_block<const CPU: CpuType>(&mut self, opcodes: &[u32], guest_pc: u32) -> *const extern "C" fn(bool) {
+    pub fn insert_block<const CPU: CpuType>(&mut self, opcodes: &[u32], guest_pc: u32) -> (*const extern "C" fn(bool), bool) {
         macro_rules! insert {
             ($entries:expr, $live_ranges:expr, $insert_at_end:expr) => {{
-                let (allocated_offset_addr, aligned_size) = self.insert(opcodes, $insert_at_end);
+                let (allocated_offset_addr, aligned_size, flushed) = self.insert(opcodes, $insert_at_end);
 
                 let jit_entry_addr = (allocated_offset_addr + self.mem.as_ptr() as usize) as *const extern "C" fn(bool);
 
@@ -198,7 +203,7 @@ impl JitMemory {
                 );
             }
 
-                jit_entry_addr
+                (jit_entry_addr, flushed)
             }};
         }
 
@@ -236,6 +241,7 @@ impl JitMemory {
                 *live_range &= !(1 << live_ranges_bit);
 
                 let guest_addr_start = $guest_addr & !(JIT_LIVE_RANGE_PAGE_SIZE - 1);
+                debug_println!("Invalidating jit {guest_addr_start:x} - {:x}", guest_addr_start + JIT_LIVE_RANGE_PAGE_SIZE);
                 $(
                     let jit_entry_start = self.jit_memory_map.get_jit_entry::<{ $cpu_entry }>(guest_addr_start);
                     unsafe { slice::from_raw_parts_mut(jit_entry_start, JIT_LIVE_RANGE_PAGE_SIZE as usize).fill(
diff --git a/src/jit/op.rs b/src/jit/op.rs
index 8eac8d17..3826f09a 100644
--- a/src/jit/op.rs
+++ b/src/jit/op.rs
@@ -2115,6 +2115,32 @@ impl Op {
                 | Op::MvnsRrr
         )
     }
+
+    pub const fn is_mov(self) -> bool {
+        matches!(
+            self,
+            Op::MovAri
+                | Op::MovArr
+                | Op::MovImm
+                | Op::MovLli
+                | Op::MovLlr
+                | Op::MovLri
+                | Op::MovLrr
+                | Op::MovRri
+                | Op::MovRrr
+                | Op::MovsAri
+                | Op::MovsArr
+                | Op::MovsImm
+                | Op::MovsLli
+                | Op::MovsLlr
+                | Op::MovsLri
+                | Op::MovsLrr
+                | Op::MovsRri
+                | Op::MovsRrr
+                | Op::MovHT
+                | Op::MovImm8T
+        )
+    }
 }
 
 impl From for Op {
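Taken together, the patch makes guest call/return pairs cheap: `bl`/`blx` pushes the expected return address, and when the callee's `bx lr`, `mov pc, lr`, or `pop {.., pc}` matches the prediction, the generated code unwinds one JIT frame via `epilogue_previous_block` and resumes the caller block directly instead of going back through the dispatcher. A rough end-to-end model of that flow, assuming the `ReturnStack` type from the sketch above (hypothetical glue functions, not the emitter's API):

    fn guest_bl(stack: &mut ReturnStack, current_pc: u32, thumb: bool) -> u32 {
        // LR as the guest sees it: bit 0 carries the thumb state in thumb mode.
        let lr = if thumb { (current_pc + 2) | 1 } else { current_pc + 4 };
        stack.push(lr);
        lr
    }

    fn guest_return(stack: &mut ReturnStack, target_pc: u32) {
        if stack.pop_if_match(target_pc) {
            // Hit: epilogue_previous_block() pops one host frame and falls
            // back into the still-live caller block.
        } else {
            // Miss or empty stack: normal lookup through jit_memory_map,
            // exactly as before this patch.
        }
    }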