Skip to content

Commit

Permalink
WIP return stack optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
Grarak committed Oct 6, 2024
1 parent f9017fa commit e07d536
Show file tree
Hide file tree
Showing 12 changed files with 329 additions and 152 deletions.
10 changes: 10 additions & 0 deletions src/jit/assembler/arm/branch_assembler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,16 @@ impl B {
u4::new(cond as u8),
))
}

/// Encodes a branch-with-link (`BL`) instruction word.
///
/// `imm` is the signed branch offset that goes into the 24-bit immediate field;
/// `cond` is the condition code placed in the 4-bit condition field.
/// The 1-bit field is set to `1` and the 3-bit field to `0b101`.
/// NOTE(review): the 1-bit field is presumably the ARM link bit — confirm against
/// the field layout of `B`.
pub fn bl(imm: i32, cond: Cond) -> u32 {
    // Sign-extend from bit 23, then keep only the low 24 bits so the value
    // fits the immediate field while preserving the two's-complement sign bit.
    let sign_extended = (imm << 8) >> 8;
    let offset_field = (sign_extended & 0xFFFFFF) as u32;

    let encoded = B::new(
        u24::new(offset_field),
        u1::new(1),
        u3::new(0b101),
        u4::new(cond as u8),
    );
    u32::from(encoded)
}
}

#[bitsize(32)]
Expand Down
50 changes: 44 additions & 6 deletions src/jit/assembler/block_asm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::jit::assembler::block_inst_list::BlockInstList;
use crate::jit::assembler::block_reg_set::BlockRegSet;
use crate::jit::assembler::{block_reg_allocator, BlockAsmBuf, BlockInst, BlockLabel, BlockOperand, BlockOperandShift, BlockReg, ANY_REG_LIMIT};
use crate::jit::inst_info::InstInfo;
use crate::jit::reg::{Reg, RegReserve};
use crate::jit::reg::{reg_reserve, Reg, RegReserve};
use crate::jit::{Cond, MemoryAmount, ShiftType};
use crate::utils::{NoHashMap, NoHashSet};

Expand Down Expand Up @@ -101,7 +101,7 @@ impl<'a> BlockAsm<'a> {
instance.start_cond_block(Cond::NE);
let host_sp_addr_reg = thread_regs_addr_reg;
instance.mov(host_sp_addr_reg, host_sp_ptr as u32);
instance.transfer_write(BlockReg::Fixed(Reg::SP), host_sp_addr_reg, 0, false, MemoryAmount::Word);
instance.store_u32(BlockReg::Fixed(Reg::SP), host_sp_addr_reg, 0);
instance.end_cond_block();

instance.sub(BlockReg::Fixed(Reg::SP), BlockReg::Fixed(Reg::SP), ANY_REG_LIMIT as u32 * 4); // Reserve for spilled registers
Expand Down Expand Up @@ -246,6 +246,30 @@ impl<'a> BlockAsm<'a> {
})
}

/// Emits an unsigned byte read.
///
/// Shorthand for `transfer_read` with `signed = false` and `MemoryAmount::Byte`.
/// NOTE(review): operands appear to be (destination register, base address register,
/// offset), judging by how the load helpers are called elsewhere in this file — confirm
/// against `transfer`.
pub fn load_u8(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
self.transfer_read(op0, op1, op2, false, MemoryAmount::Byte)
}

/// Emits a byte write.
///
/// Shorthand for `transfer_write` with `false` as the flag argument and
/// `MemoryAmount::Byte`.
/// NOTE(review): the flag is presumably `signed`, mirroring `transfer_read`, and the
/// operands appear to be (value register, base address register, offset) — confirm
/// against `transfer_write`.
pub fn store_u8(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
self.transfer_write(op0, op1, op2, false, MemoryAmount::Byte)
}

/// Emits an unsigned half-word read.
///
/// Shorthand for `transfer_read` with `signed = false` and `MemoryAmount::Half`.
/// NOTE(review): operands appear to be (destination register, base address register,
/// offset), judging by how the load helpers are called elsewhere in this file — confirm
/// against `transfer`.
pub fn load_u16(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
self.transfer_read(op0, op1, op2, false, MemoryAmount::Half)
}

/// Emits a half-word write.
///
/// Shorthand for `transfer_write` with `false` as the flag argument and
/// `MemoryAmount::Half`.
/// NOTE(review): the flag is presumably `signed`, mirroring `transfer_read`, and the
/// operands appear to be (value register, base address register, offset) — confirm
/// against `transfer_write`.
pub fn store_u16(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
self.transfer_write(op0, op1, op2, false, MemoryAmount::Half)
}

/// Emits a word read.
///
/// Shorthand for `transfer_read` with `signed = false` and `MemoryAmount::Word`.
/// NOTE(review): operands appear to be (destination register, base address register,
/// offset), judging by how this helper is called elsewhere in this file — confirm
/// against `transfer`.
pub fn load_u32(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
self.transfer_read(op0, op1, op2, false, MemoryAmount::Word)
}

/// Emits a word write.
///
/// Shorthand for `transfer_write` with `false` as the flag argument and
/// `MemoryAmount::Word`.
/// NOTE(review): the flag is presumably `signed`, mirroring `transfer_read`, and the
/// operands appear to be (value register, base address register, offset) — confirm
/// against `transfer_write`.
pub fn store_u32(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
self.transfer_write(op0, op1, op2, false, MemoryAmount::Word)
}

/// Emits a memory read of the given width.
///
/// Forwards all operands to `transfer` with `BlockTransferOp::Read`.
/// `signed` and `amount` are passed through unchanged.
/// NOTE(review): `signed` presumably selects sign-extension of the loaded value and
/// `amount` the access width — confirm against `transfer`.
pub fn transfer_read(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>, signed: bool, amount: MemoryAmount) {
self.transfer(BlockTransferOp::Read, op0, op1, op2, signed, amount)
}
Expand Down Expand Up @@ -347,6 +371,7 @@ impl<'a> BlockAsm<'a> {
cond,
block_index: 0,
skip: false,
has_return: false,
})
}

Expand Down Expand Up @@ -378,7 +403,12 @@ impl<'a> BlockAsm<'a> {
pub fn epilogue(&mut self) {
let host_sp_addr_reg = self.thread_regs_addr_reg;
self.mov(host_sp_addr_reg, self.host_sp_ptr as u32);
self.transfer_read(BlockReg::Fixed(Reg::SP), host_sp_addr_reg, 0, false, MemoryAmount::Word);
self.load_u32(BlockReg::Fixed(Reg::SP), host_sp_addr_reg, 0);
self.buf.insts.push(BlockInst::Epilogue);
}

/// Emits an epilogue that unwinds the host stack by a fixed amount.
///
/// Adds `ANY_REG_LIMIT * 4` bytes to the host `SP` and then appends
/// `BlockInst::Epilogue` to the instruction buffer.
pub fn epilogue_previous_block(&mut self) {
    let host_sp = BlockReg::Fixed(Reg::SP);
    // One 4-byte slot per register up to ANY_REG_LIMIT.
    let spill_area_bytes = ANY_REG_LIMIT as u32 * 4;

    self.add(host_sp, host_sp, spill_area_bytes);
    self.buf.insts.push(BlockInst::Epilogue);
}

Expand Down Expand Up @@ -440,6 +470,9 @@ impl<'a> BlockAsm<'a> {
}
}
self.mov(self.tmp_func_call_reg, func.into());
if has_return {
self.transfer_push(BlockReg::Fixed(Reg::SP), reg_reserve!(Reg::LR));
}
self.insert_inst(BlockInst::Call {
func_reg: self.tmp_func_call_reg,
args: [
Expand All @@ -450,6 +483,9 @@ impl<'a> BlockAsm<'a> {
],
has_return,
});
if has_return {
self.transfer_pop(BlockReg::Fixed(Reg::SP), reg_reserve!(Reg::LR));
}
}

pub fn bkpt(&mut self, id: u16) {
Expand All @@ -471,7 +507,7 @@ impl<'a> BlockAsm<'a> {
self.insert_inst(BlockInst::GuestPc(pc));
}

pub fn guest_branch(&mut self, cond: Cond, target_pc: u32) {
pub fn guest_branch(&mut self, cond: Cond, target_pc: u32, has_return: bool) {
let label = match self.buf.guest_branches_mapping.get(&target_pc) {
None => {
let label = self.new_label();
Expand All @@ -485,6 +521,7 @@ impl<'a> BlockAsm<'a> {
cond,
block_index: 0,
skip: false,
has_return,
});
}

Expand Down Expand Up @@ -850,10 +887,11 @@ impl<'a> BlockAsm<'a> {
for branch_placeholder in branch_placeholders {
let opcode = opcodes[branch_placeholder];
let cond = Cond::from((opcode >> 28) as u8);
let block_index = opcode & 0xFFFFFFF;
let has_return = (opcode >> 27) & 1 == 1;
let block_index = opcode & 0x7FFFFFF;
let branch_to = opcodes_offset[block_index as usize];
let diff = branch_to as i32 - branch_placeholder as i32;
opcodes[branch_placeholder] = B::b(diff - 2, cond);
opcodes[branch_placeholder] = if has_return { B::bl(diff - 2, cond) } else { B::b(diff - 2, cond) };
}

opcodes
Expand Down
30 changes: 24 additions & 6 deletions src/jit/assembler/block_inst.rs
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ impl BlockInst {
BlockInst::Bfc { operand, .. } => (block_reg_set!(Some(*operand)), block_reg_set!(Some(*operand))),
BlockInst::Bfi { operands, .. } => (block_reg_set!(Some(operands[0]), Some(operands[1])), block_reg_set!(Some(operands[0]))),

BlockInst::Branch { has_return, .. } => (block_reg_set!(), block_reg_set!(if *has_return { Some(BlockReg::Fixed(Reg::LR)) } else { None })),

BlockInst::SaveContext { .. } => (block_reg_set!(), block_reg_set!()),
BlockInst::SaveReg {
guest_reg,
Expand Down Expand Up @@ -164,7 +166,7 @@ impl BlockInst {
(block_reg_set!(Some(*thread_regs_addr_reg)), outputs)
}

BlockInst::Call { func_reg, args, .. } => {
BlockInst::Call { func_reg, args, has_return } => {
let mut inputs = BlockRegSet::new();
inputs += *func_reg;
for arg in args {
Expand All @@ -180,7 +182,8 @@ impl BlockInst {
Some(BlockReg::Fixed(Reg::R2)),
Some(BlockReg::Fixed(Reg::R3)),
Some(BlockReg::Fixed(Reg::R12)),
Some(BlockReg::Fixed(Reg::CPSR))
Some(BlockReg::Fixed(Reg::CPSR)),
if *has_return { Some(BlockReg::Fixed(Reg::LR)) } else { None }
),
)
}
Expand All @@ -205,7 +208,7 @@ impl BlockInst {
block_reg_set!(Some(BlockReg::Fixed(Reg::SP)), Some(BlockReg::Fixed(Reg::PC))),
),

BlockInst::Label { .. } | BlockInst::Branch { .. } | BlockInst::GuestPc(_) | BlockInst::Bkpt(_) => (block_reg_set!(), block_reg_set!()),
BlockInst::Label { .. } | BlockInst::GuestPc(_) | BlockInst::Bkpt(_) => (block_reg_set!(), block_reg_set!()),
}
}

Expand Down Expand Up @@ -542,11 +545,13 @@ impl BlockInst {
}
},

BlockInst::Branch { cond, block_index, skip, .. } => {
BlockInst::Branch {
cond, block_index, skip, has_return, ..
} => {
if !*skip {
// Encode label and cond as u32
// Branch offset can only be figured out later
opcodes.push(((*cond as u32) << 28) | (*block_index as u32));
opcodes.push(((*cond as u32) << 28) | ((*has_return as u32) << 27) | (*block_index as u32));
branch_placeholders.push(opcodes_offset + opcode_index);
}
}
Expand Down Expand Up @@ -717,6 +722,7 @@ pub enum BlockInst {
cond: Cond,
block_index: usize,
skip: bool,
has_return: bool,
},

SaveContext {
Expand Down Expand Up @@ -814,7 +820,19 @@ impl Debug for BlockInst {
};
write!(f, "label {label:?} {guest_pc}:")
}
BlockInst::Branch { label, cond, block_index, skip } => write!(f, "B{cond:?} {label:?}, block index: {block_index}, skip: {skip}"),
BlockInst::Branch {
label,
cond,
block_index,
skip,
has_return,
} => {
if *has_return {
write!(f, "Bl{cond:?} {label:?}, block index: {block_index}, skip: {skip}")
} else {
write!(f, "B{cond:?} {label:?}, block index: {block_index}, skip: {skip}")
}
}
BlockInst::SaveContext { .. } => write!(f, "SaveContext"),
BlockInst::SaveReg { guest_reg, reg_mapped, .. } => write!(f, "SaveReg {guest_reg:?}, mapped: {reg_mapped:?}"),
BlockInst::RestoreReg { guest_reg, reg_mapped, .. } => write!(f, "RestoreReg {guest_reg:?}, mapped: {reg_mapped:?}"),
Expand Down
4 changes: 2 additions & 2 deletions src/jit/disassembler/branch_instructions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ mod branch_ops {
Op::Blx,
Operands::new_1(Operand::imm(op0 as u32)),
reg_reserve!(),
reg_reserve!(),
reg_reserve!(Reg::LR, Reg::CPSR),
1,
)
} else {
Expand All @@ -47,7 +47,7 @@ mod branch_ops {
Op::Blx,
Operands::new_1(Operand::imm(op0 as u32)),
reg_reserve!(),
reg_reserve!(),
reg_reserve!(Reg::LR, Reg::CPSR),
1,
)
} else {
Expand Down
72 changes: 32 additions & 40 deletions src/jit/emitter/emit.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
use crate::core::CpuType;
use crate::core::CpuType::ARM7;
use crate::jit::assembler::block_asm::BlockAsm;
use crate::jit::assembler::BlockReg;
use crate::jit::assembler::{BlockLabel, BlockReg};
use crate::jit::inst_threag_regs_handler::{register_restore_spsr, restore_thumb_after_restore_spsr, set_pc_arm_mode};
use crate::jit::jit_asm::{JitAsm, JitRuntimeData};
use crate::jit::op::Op;
use crate::jit::reg::Reg;
use crate::jit::{Cond, MemoryAmount};
use crate::jit::Cond;
use crate::DEBUG_LOG_BRANCH_OUT;
use CpuType::ARM9;

Expand Down Expand Up @@ -60,6 +60,16 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
block_asm.call(restore_thumb_after_restore_spsr::<CPU> as *const ());
}

if (op.is_mov() && self.jit_buf.current_inst().src_regs.is_reserved(Reg::LR) && !self.jit_buf.current_inst().out_regs.is_reserved(Reg::CPSR))
|| (op.is_multiple_mem_transfer() && *self.jit_buf.current_inst().operands()[0].as_reg_no_shift().unwrap() == Reg::SP)
|| (op.is_single_mem_transfer() && self.jit_buf.current_inst().src_regs.is_reserved(Reg::SP))
{
let guest_pc_reg = block_asm.new_reg();
block_asm.load_u32(guest_pc_reg, block_asm.thread_regs_addr_reg, Reg::PC as u32 * 4);
self.emit_branch_return_stack_common(block_asm, guest_pc_reg);
block_asm.free_reg(guest_pc_reg);
}

self.emit_branch_out_metadata(block_asm);
block_asm.epilogue();
}
Expand All @@ -77,15 +87,15 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
if DEBUG_LOG_BRANCH_OUT {
let pc_reg = block_asm.new_reg();
block_asm.mov(pc_reg, self.jit_buf.current_pc);
block_asm.transfer_write(pc_reg, runtime_data_addr_reg, JitRuntimeData::get_out_pc_offset() as u32, false, MemoryAmount::Word);
block_asm.store_u32(pc_reg, runtime_data_addr_reg, JitRuntimeData::get_out_pc_offset() as u32);

block_asm.free_reg(pc_reg);
}
block_asm.transfer_write(total_cycles_reg, runtime_data_addr_reg, JitRuntimeData::get_out_total_cycles_offset() as u32, false, MemoryAmount::Word);
block_asm.store_u32(total_cycles_reg, runtime_data_addr_reg, JitRuntimeData::get_out_total_cycles_offset() as u32);
if set_idle_loop {
let idle_loop_reg = block_asm.new_reg();
block_asm.mov(idle_loop_reg, 1);
block_asm.transfer_write(idle_loop_reg, runtime_data_addr_reg, JitRuntimeData::get_idle_loop_offset() as u32, false, MemoryAmount::Byte);
block_asm.store_u8(idle_loop_reg, runtime_data_addr_reg, JitRuntimeData::get_idle_loop_offset() as u32);

block_asm.free_reg(idle_loop_reg);
}
Expand All @@ -102,73 +112,55 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
self._emit_branch_out_metadata(block_asm, true)
}

pub fn emit_flush_cycles<ContinueFn: Fn(&mut Self, &mut BlockAsm, BlockReg), BreakoutFn: Fn(&mut Self, &mut BlockAsm)>(
pub fn emit_flush_cycles<ContinueFn: Fn(&mut Self, &mut BlockAsm, BlockReg, BlockLabel), BreakoutFn: Fn(&mut Self, &mut BlockAsm)>(
&mut self,
block_asm: &mut BlockAsm,
target_pre_cycle_count_sum: u16,
target_pre_cycle_count_sum: Option<u16>,
continue_fn: ContinueFn,
breakout_fn: BreakoutFn,
) {
let runtime_data_addr_reg = block_asm.new_reg();
block_asm.mov(runtime_data_addr_reg, self.runtime_data.get_addr() as u32);

let accumulated_cycles_reg = block_asm.new_reg();
block_asm.transfer_read(
accumulated_cycles_reg,
runtime_data_addr_reg,
JitRuntimeData::get_accumulated_cycles_offset() as u32,
false,
MemoryAmount::Half,
);
block_asm.load_u16(accumulated_cycles_reg, runtime_data_addr_reg, JitRuntimeData::get_accumulated_cycles_offset() as u32);

let pre_cycle_count_sum_reg = block_asm.new_reg();
block_asm.transfer_read(
pre_cycle_count_sum_reg,
runtime_data_addr_reg,
JitRuntimeData::get_pre_cycle_count_sum_offset() as u32,
false,
MemoryAmount::Half,
);
block_asm.load_u16(pre_cycle_count_sum_reg, runtime_data_addr_reg, JitRuntimeData::get_pre_cycle_count_sum_offset() as u32);

let total_cycles_reg = block_asm.new_reg();
// +2 for branching
block_asm.add(total_cycles_reg, accumulated_cycles_reg, self.jit_buf.insts_cycle_counts[self.jit_buf.current_index] as u32 + 2);
block_asm.sub(total_cycles_reg, total_cycles_reg, pre_cycle_count_sum_reg);

const MAX_LOOP_CYCLE_COUNT: u32 = 256;
const MAX_LOOP_CYCLE_COUNT: u32 = 255;
block_asm.cmp(
total_cycles_reg,
match CPU {
ARM9 => MAX_LOOP_CYCLE_COUNT * 2,
ARM7 => MAX_LOOP_CYCLE_COUNT,
} - 1,
},
);

let continue_label = block_asm.new_label();
let breakout_label = block_asm.new_label();
block_asm.branch(breakout_label, Cond::HI);
block_asm.branch(breakout_label, Cond::HS);

block_asm.transfer_write(
total_cycles_reg,
runtime_data_addr_reg,
JitRuntimeData::get_accumulated_cycles_offset() as u32,
false,
MemoryAmount::Half,
);
block_asm.store_u16(total_cycles_reg, runtime_data_addr_reg, JitRuntimeData::get_accumulated_cycles_offset() as u32);

let target_pre_cycle_count_sum_reg = block_asm.new_reg();
block_asm.mov(target_pre_cycle_count_sum_reg, target_pre_cycle_count_sum as u32);
block_asm.transfer_write(
target_pre_cycle_count_sum_reg,
runtime_data_addr_reg,
JitRuntimeData::get_pre_cycle_count_sum_offset() as u32,
false,
MemoryAmount::Half,
);
continue_fn(self, block_asm, runtime_data_addr_reg);
if let Some(target_pre_cycle_count_sum) = target_pre_cycle_count_sum {
block_asm.mov(target_pre_cycle_count_sum_reg, target_pre_cycle_count_sum as u32);
block_asm.store_u16(target_pre_cycle_count_sum_reg, runtime_data_addr_reg, JitRuntimeData::get_pre_cycle_count_sum_offset() as u32);
}
continue_fn(self, block_asm, runtime_data_addr_reg, breakout_label);
block_asm.branch(continue_label, Cond::AL);

block_asm.label(breakout_label);
breakout_fn(self, block_asm);

block_asm.label(continue_label);

block_asm.free_reg(target_pre_cycle_count_sum_reg);
block_asm.free_reg(total_cycles_reg);
block_asm.free_reg(pre_cycle_count_sum_reg);
Expand Down
Loading

0 comments on commit e07d536

Please sign in to comment.