diff --git a/Cargo.toml b/Cargo.toml index 0320249..03cf66b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -63,7 +63,9 @@ debug = true [profile.release.package."*"] opt-level = 3 +#debug = 1 [profile.release] panic = "abort" lto = "fat" +#debug = 1 diff --git a/src/cartridge_io.rs b/src/cartridge_io.rs index 002f8b8..f16cee9 100644 --- a/src/cartridge_io.rs +++ b/src/cartridge_io.rs @@ -84,7 +84,7 @@ pub struct CartridgeIo { pub file_name: String, pub file_size: u32, pub header: CartridgeHeader, - content_pages: UnsafeCell>>, + content_pages: UnsafeCell>>, save_file_path: PathBuf, pub save_file_size: u32, save_buf: Mutex<(Vec, bool)>, @@ -136,13 +136,13 @@ impl CartridgeIo { }) } - fn get_page(&self, page_addr: u32) -> io::Result<*const [u8; PAGE_SIZE * 4]> { - debug_assert_eq!(page_addr & ((PAGE_SIZE * 4) as u32 - 1), 0); + fn get_page(&self, page_addr: u32) -> io::Result<*const [u8; PAGE_SIZE]> { + debug_assert_eq!(page_addr & (PAGE_SIZE as u32 - 1), 0); let pages = unsafe { self.content_pages.get().as_mut_unchecked() }; match pages.get(&page_addr) { None => { // exceeds 8MB - if pages.len() >= 512 { + if pages.len() >= 2048 { debug_println!("clear cartridge pages"); pages.clear(); } @@ -162,7 +162,7 @@ impl CartridgeIo { while remaining > 0 { let slice_start = slice.len() - remaining; - let page_addr = (offset + slice_start as u32) & !((PAGE_SIZE * 4) as u32 - 1); + let page_addr = (offset + slice_start as u32) & !(PAGE_SIZE as u32 - 1); let page_offset = offset + slice_start as u32 - page_addr; let page = match self.get_page(page_addr) { Ok(page) => page, diff --git a/src/core/graphics/gpu_3d/registers_3d.rs b/src/core/graphics/gpu_3d/registers_3d.rs index 3a621b0..d8b656c 100644 --- a/src/core/graphics/gpu_3d/registers_3d.rs +++ b/src/core/graphics/gpu_3d/registers_3d.rs @@ -4,12 +4,13 @@ use crate::core::graphics::gpu::{DISPLAY_HEIGHT, DISPLAY_WIDTH}; use crate::core::memory::dma::DmaTransferMode; use crate::core::CpuType::ARM9; use crate::fixed_fifo::FixedFifo; -use crate::math::{Matrix, Vectori16, Vectori32, Vectoru16}; +use crate::math::{Matrix, Vectorf32, Vectori16, Vectori32, Vectoru16}; use crate::utils::{rgb5_to_rgb6, HeapMem}; use bilge::prelude::*; use std::hint::unreachable_unchecked; -use std::intrinsics::{unchecked_div, unlikely}; +use std::intrinsics::unlikely; use std::mem; +use std::mem::MaybeUninit; #[bitsize(32)] #[derive(Copy, Clone, FromBits)] @@ -232,25 +233,32 @@ pub struct Vertex { pub color: u32, } -fn intersect(v1: &Vectori32<4>, v2: &Vectori32<4>, val1: i32, val2: i32) -> Vectori32<4> { - let d1 = val1 as i64 + v1[3] as i64; - let d2 = val2 as i64 + v2[3] as i64; - if d2 == d1 { +fn intersect(v1: &Vectorf32<4>, v2: &Vectorf32<4>, val1: f32, val2: f32) -> Vectorf32<4> { + let d1 = val1 + v1[3]; + let d2 = val2 + v2[3]; + if (d2 - d1).abs() < f32::EPSILON { return *v1; } - let mut vertex = Vectori32::default(); - for i in 0..4 { - vertex[i] = v1[i] + unsafe { unchecked_div((v2[i] - v1[i]) as i64 * -d1, d2 - d1) } as i32; - } + let mut vertex = Vectorf32::default(); + let dist_inverse = -d1 / (d2 - d1); + vertex[0] = v1[0] + ((v2[0] - v1[0]) * dist_inverse); + vertex[1] = v1[1] + ((v2[1] - v1[1]) * dist_inverse); + vertex[2] = v1[2] + ((v2[2] - v1[2]) * dist_inverse); + vertex[3] = v1[3] + ((v2[3] - v1[3]) * dist_inverse); vertex } -fn clip_polygon(unclipped: &[Vectori32<4>; 4], clipped: &mut [Vectori32<4>; 10], size: &mut usize) -> bool { +fn clip_polygon(unclipped: &[Vectori32<4>; 4], clipped: &mut [Vectorf32<4>; 10], size: &mut usize) -> bool { let mut clip = false; - let mut vertices = [Vectori32::<4>::default(); 10]; - vertices[..4].copy_from_slice(unclipped); + let mut vertices = [Vectorf32::<4>::default(); 10]; + for i in 0..4 { + for j in 0..4 { + const NORMALIZE: f32 = 1f32 / 4096f32; + vertices[i][j] = unclipped[i][j] as f32 * NORMALIZE; + } + } for i in 0..6 { let old_size = *size; @@ -496,7 +504,7 @@ impl Gpu3DRegisters { }; while !self.cmd_fifo.is_empty() && executed_cycles < cycle_diff && !self.flushed { - let mut params = Vec::new(); + let mut params: [u32; 32] = unsafe { MaybeUninit::uninit().assume_init() }; let entry = unsafe { *self.cmd_fifo.front_unchecked() }; let mut param_count = unsafe { *FIFO_PARAM_COUNTS.get_unchecked(entry.cmd as usize) }; if param_count > 1 { @@ -504,9 +512,8 @@ impl Gpu3DRegisters { break; } - params.reserve(param_count as usize); - for _ in 0..param_count { - params.push(unsafe { self.cmd_fifo.front_unchecked().param }); + for i in 0..param_count { + unsafe { *params.get_unchecked_mut(i as usize) = self.cmd_fifo.front_unchecked().param }; self.cmd_fifo.pop_front(); } } else { @@ -521,17 +528,17 @@ impl Gpu3DRegisters { 0x13 => self.exe_mtx_store(entry.param), 0x14 => self.exe_mtx_restore(entry.param), 0x15 => self.exe_mtx_identity(), - 0x16 => self.exe_mtx_load44(unsafe { params.try_into().unwrap_unchecked() }), - 0x17 => self.exe_mtx_load43(unsafe { params.try_into().unwrap_unchecked() }), - 0x18 => self.exe_mtx_mult44(unsafe { params.try_into().unwrap_unchecked() }), - 0x19 => self.exe_mtx_mult43(unsafe { params.try_into().unwrap_unchecked() }), - 0x1A => self.exe_mtx_mult33(unsafe { params.try_into().unwrap_unchecked() }), - 0x1B => self.exe_mtx_scale(unsafe { params.try_into().unwrap_unchecked() }), - 0x1C => self.exe_mtx_trans(unsafe { params.try_into().unwrap_unchecked() }), + 0x16 => self.exe_mtx_load44(unsafe { mem::transmute(¶ms) }), + 0x17 => self.exe_mtx_load43(unsafe { mem::transmute(¶ms) }), + 0x18 => self.exe_mtx_mult44(unsafe { mem::transmute(¶ms) }), + 0x19 => self.exe_mtx_mult43(unsafe { mem::transmute(¶ms) }), + 0x1A => self.exe_mtx_mult33(unsafe { mem::transmute(¶ms) }), + 0x1B => self.exe_mtx_scale(unsafe { mem::transmute(¶ms) }), + 0x1C => self.exe_mtx_trans(unsafe { mem::transmute(¶ms) }), 0x20 => self.exe_color(entry.param), 0x21 => self.exe_normal(entry.param), 0x22 => self.exe_tex_coord(entry.param), - 0x23 => self.exe_vtx16(unsafe { params.try_into().unwrap_unchecked() }), + 0x23 => self.exe_vtx16(unsafe { mem::transmute(¶ms) }), 0x24 => self.exe_vtx10(entry.param), 0x25 => self.exe_vtx_x_y(entry.param), 0x26 => self.exe_vtx_x_z(entry.param), @@ -549,10 +556,10 @@ impl Gpu3DRegisters { 0x41 => {} 0x50 => self.exe_swap_buffers(entry.param), 0x60 => self.exe_viewport(entry.param), - 0x70 => self.exe_box_test(unsafe { params.try_into().unwrap_unchecked() }), - 0x71 => self.exe_pos_test(unsafe { params.try_into().unwrap_unchecked() }), + 0x70 => self.exe_box_test(unsafe { mem::transmute(¶ms) }), + 0x71 => self.exe_pos_test(unsafe { mem::transmute(¶ms) }), 0x72 => self.exe_vec_test(entry.param), - _ => unreachable!(), + _ => unsafe { unreachable_unchecked() }, } executed_cycles += 4; @@ -580,6 +587,7 @@ impl Gpu3DRegisters { } } + #[inline(never)] fn exe_mtx_push(&mut self) { match self.mtx_mode { MtxMode::Projection => { @@ -609,6 +617,7 @@ impl Gpu3DRegisters { self.decrease_mtx_queue(); } + #[inline(never)] fn exe_mtx_pop(&mut self, param: u32) { match self.mtx_mode { MtxMode::Projection => { @@ -639,6 +648,7 @@ impl Gpu3DRegisters { self.decrease_mtx_queue(); } + #[inline(never)] fn exe_mtx_store(&mut self, param: u32) { match self.mtx_mode { MtxMode::Projection => self.matrices.proj_stack = self.matrices.proj, @@ -656,6 +666,7 @@ impl Gpu3DRegisters { } } + #[inline(never)] fn exe_mtx_restore(&mut self, param: u32) { match self.mtx_mode { MtxMode::Projection => { @@ -677,6 +688,7 @@ impl Gpu3DRegisters { } } + #[inline(never)] fn exe_mtx_identity(&mut self) { match self.mtx_mode { MtxMode::Projection => self.matrices.proj = Matrix::default(), @@ -708,11 +720,13 @@ impl Gpu3DRegisters { } } - fn exe_mtx_load44(&mut self, param: [u32; 16]) { - self.mtx_load(unsafe { mem::transmute(param) }); + #[inline(never)] + fn exe_mtx_load44(&mut self, param: &[u32; 16]) { + self.mtx_load(unsafe { mem::transmute(*param) }); } - fn exe_mtx_load43(&mut self, param: [u32; 12]) { + #[inline(never)] + fn exe_mtx_load43(&mut self, param: &[u32; 12]) { let mut mtx = Matrix::default(); for i in 0..4 { mtx.as_mut()[i * 4..i * 4 + 3].copy_from_slice(unsafe { mem::transmute(¶m[i * 3..i * 3 + 3]) }); @@ -741,11 +755,13 @@ impl Gpu3DRegisters { } } - fn exe_mtx_mult44(&mut self, param: [u32; 16]) { - self.mtx_mult(unsafe { mem::transmute(param) }); + #[inline(never)] + fn exe_mtx_mult44(&mut self, param: &[u32; 16]) { + self.mtx_mult(unsafe { mem::transmute(*param) }); } - fn exe_mtx_mult43(&mut self, param: [u32; 12]) { + #[inline(never)] + fn exe_mtx_mult43(&mut self, param: &[u32; 12]) { let mut mtx = Matrix::default(); for i in 0..4 { mtx.as_mut()[i * 4..i * 4 + 3].copy_from_slice(unsafe { mem::transmute(¶m[i * 3..i * 3 + 3]) }); @@ -753,7 +769,8 @@ impl Gpu3DRegisters { self.mtx_mult(mtx); } - fn exe_mtx_mult33(&mut self, param: [u32; 9]) { + #[inline(never)] + fn exe_mtx_mult33(&mut self, param: &[u32; 9]) { let mut mtx = Matrix::default(); for i in 0..3 { mtx.as_mut()[i * 4..i * 4 + 3].copy_from_slice(unsafe { mem::transmute(¶m[i * 3..i * 3 + 3]) }); @@ -761,7 +778,8 @@ impl Gpu3DRegisters { self.mtx_mult(mtx); } - fn exe_mtx_scale(&mut self, param: [u32; 3]) { + #[inline(never)] + fn exe_mtx_scale(&mut self, param: &[u32; 3]) { let mut mtx = Matrix::default(); for i in 0..3 { mtx[i * 5] = param[i] as i32; @@ -769,16 +787,19 @@ impl Gpu3DRegisters { self.mtx_mult(mtx); } - fn exe_mtx_trans(&mut self, param: [u32; 3]) { + #[inline(never)] + fn exe_mtx_trans(&mut self, param: &[u32; 3]) { let mut mtx = Matrix::default(); mtx.as_mut()[12..15].copy_from_slice(unsafe { mem::transmute(param.as_slice()) }); self.mtx_mult(mtx); } + #[inline(never)] fn exe_color(&mut self, param: u32) { self.saved_vertex.color = rgb5_to_rgb6(param); } + #[inline(never)] fn exe_normal(&mut self, param: u32) { let normal_vector_param = NormalVector::from(param); let mut normal_vector = Vectori32::<3>::default(); @@ -844,6 +865,7 @@ impl Gpu3DRegisters { } } + #[inline(never)] fn exe_tex_coord(&mut self, param: u32) { let tex_coord = TexCoord::from(param); if self.texture_coord_mode == TextureCoordTransMode::TexCoord { @@ -863,7 +885,8 @@ impl Gpu3DRegisters { } } - fn exe_vtx16(&mut self, params: [u32; 2]) { + #[inline(never)] + fn exe_vtx16(&mut self, params: &[u32; 2]) { self.saved_vertex.coords[0] = params[0] as i16 as i32; self.saved_vertex.coords[1] = (params[0] >> 16) as i16 as i32; self.saved_vertex.coords[2] = params[1] as i16 as i32; @@ -871,6 +894,7 @@ impl Gpu3DRegisters { self.add_vertex(); } + #[inline(never)] fn exe_vtx10(&mut self, param: u32) { self.saved_vertex.coords[0] = ((param & 0x3FF) << 6) as i16 as i32; self.saved_vertex.coords[1] = ((param & 0xFFC00) >> 4) as i16 as i32; @@ -879,6 +903,7 @@ impl Gpu3DRegisters { self.add_vertex(); } + #[inline(never)] fn exe_vtx_x_y(&mut self, param: u32) { self.saved_vertex.coords[0] = param as i16 as i32; self.saved_vertex.coords[1] = (param >> 16) as i16 as i32; @@ -886,6 +911,7 @@ impl Gpu3DRegisters { self.add_vertex(); } + #[inline(never)] fn exe_vtx_x_z(&mut self, param: u32) { self.saved_vertex.coords[0] = param as i16 as i32; self.saved_vertex.coords[2] = (param >> 16) as i16 as i32; @@ -893,6 +919,7 @@ impl Gpu3DRegisters { self.add_vertex(); } + #[inline(never)] fn exe_vtx_y_z(&mut self, param: u32) { self.saved_vertex.coords[1] = param as i16 as i32; self.saved_vertex.coords[2] = (param >> 16) as i16 as i32; @@ -900,6 +927,7 @@ impl Gpu3DRegisters { self.add_vertex(); } + #[inline(never)] fn exe_vtx_diff(&mut self, param: u32) { self.saved_vertex.coords[0] += (((param & 0x3FF) << 6) as i16 as i32) >> 6; self.saved_vertex.coords[1] += (((param & 0xFFC00) >> 4) as i16 as i32) >> 6; @@ -912,6 +940,7 @@ impl Gpu3DRegisters { self.polygon_attr = param.into(); } + #[inline(never)] fn exe_tex_image_param(&mut self, param: u32) { let Self { saved_polygon, texture_coord_mode, .. @@ -924,6 +953,7 @@ impl Gpu3DRegisters { self.saved_polygon.palette_addr = (param & 0x1FFF) as u16; } + #[inline(never)] fn exe_dif_amb(&mut self, param: u32) { let material_color0 = MaterialColor0::from(param); self.diffuse_color = rgb5_to_rgb6(u32::from(material_color0.dif())); @@ -934,6 +964,7 @@ impl Gpu3DRegisters { } } + #[inline(never)] fn exe_spe_emi(&mut self, param: u32) { let material_color1 = MaterialColor1::from(param); self.specular_color = rgb5_to_rgb6(u32::from(material_color1.spe())); @@ -941,6 +972,7 @@ impl Gpu3DRegisters { self.shininess_enabled = material_color1.set_shininess(); } + #[inline(never)] fn exe_light_vector(&mut self, param: u32) { let light_vector = LightVector::from(param); let num = u8::from(light_vector.num()) as usize; @@ -957,11 +989,13 @@ impl Gpu3DRegisters { self.half_vectors[num][2] = (self.light_vectors[num][2] - (1 << 12)) >> 1; } + #[inline(never)] fn exe_light_color(&mut self, param: u32) { let light_color = LightColor::from(param); self.light_colors[u8::from(light_color.num()) as usize] = rgb5_to_rgb6(u32::from(light_color.color())); } + #[inline(never)] fn exe_shininess(&mut self, param: u32) { let shininess = Shininess::from(param); self.shininess[0] = shininess.shininess0(); @@ -970,6 +1004,7 @@ impl Gpu3DRegisters { self.shininess[3] = shininess.shininess3(); } + #[inline(never)] fn exe_begin_vtxs(&mut self, param: u32) { if self.vertex_count < self.polygon_type.vertex_count() as usize { self.vertices.count_in -= self.vertex_count; @@ -999,6 +1034,7 @@ impl Gpu3DRegisters { self.flushed = true; } + #[inline(never)] fn exe_viewport(&mut self, param: u32) { let viewport = Viewport::from(param); self.viewport_next[0] = viewport.x1() as u16; @@ -1007,7 +1043,8 @@ impl Gpu3DRegisters { self.viewport_next[3] = (191 - viewport.y1() as u16) - self.viewport_next[1] + 1; } - fn exe_box_test(&mut self, params: [u32; 3]) { + #[inline(never)] + fn exe_box_test(&mut self, params: &[u32; 3]) { let mut box_test_coords = [ params[0] as i16, (params[0] >> 16) as i16, @@ -1053,7 +1090,7 @@ impl Gpu3DRegisters { for i in 0..6 { let mut size = 4; - let mut clipped = [Vectori32::<4>::default(); 10]; + let mut clipped = [Vectorf32::<4>::default(); 10]; clip_polygon(&faces[i], &mut clipped, &mut size); @@ -1066,10 +1103,11 @@ impl Gpu3DRegisters { self.gx_stat.set_box_test_result(false); } - fn exe_pos_test(&mut self, params: [u32; 2]) { + #[inline(never)] + fn exe_pos_test(&mut self, params: &[u32; 2]) { self.saved_vertex.coords[0] = params[0] as i16 as i32; self.saved_vertex.coords[1] = (params[0] >> 16) as i16 as i32; - self.saved_vertex.coords[2] = (params[1]) as i16 as i32; + self.saved_vertex.coords[2] = params[1] as i16 as i32; self.saved_vertex.coords[3] = 1 << 12; if self.clip_dirty { @@ -1085,6 +1123,7 @@ impl Gpu3DRegisters { } } + #[inline(never)] fn exe_vec_test(&mut self, param: u32) { let mut vector = Vectori32::<3>::default(); vector[0] = (((param & 0x000003FF) << 6) as i16 as i32) >> 3; @@ -1222,7 +1261,7 @@ impl Gpu3DRegisters { self.clockwise = !self.clockwise; } - let mut clipped = [Vectori32::<4>::default(); 10]; + let mut clipped = [Vectorf32::<4>::default(); 10]; let cull = (!self.render_front && dot > 0) || (!self.render_back && dot < 0); let mut clipped_size = self.saved_polygon.size; let clip = if cull { false } else { clip_polygon(&unclipped, &mut clipped, &mut clipped_size) }; diff --git a/src/core/memory/cartridge.rs b/src/core/memory/cartridge.rs index 6ad0ed7..17b405a 100644 --- a/src/core/memory/cartridge.rs +++ b/src/core/memory/cartridge.rs @@ -16,7 +16,7 @@ struct AuxSpiCnt { busy: u1, not_used2: u5, nds_slot_mode: u1, - transfer_ready_irq: u1, + transfer_ready_irq: bool, nds_slot_enable: u1, } @@ -41,7 +41,7 @@ pub struct RomCtrl { key1_gap_clks: u1, resb_release_reset: u1, wr: u1, - block_start_status: u1, + block_start_status: bool, } impl Default for RomCtrl { @@ -115,8 +115,8 @@ impl Cartridge { inner.rom_ctrl.set_data_word_status(u1::new(0)); inner.read_count += 4; if inner.read_count == inner.block_size { - inner.rom_ctrl.set_block_start_status(u1::new(0)); - if bool::from(inner.aux_spi_cnt.transfer_ready_irq()) { + inner.rom_ctrl.set_block_start_status(false); + if inner.aux_spi_cnt.transfer_ready_irq() { get_cpu_regs_mut!(emu, CPU).send_interrupt(InterruptFlag::NdsSlotTransferCompletion, emu); } } else { @@ -301,7 +301,7 @@ impl Cartridge { let inner = &mut self.inner[CPU]; inner.rom_ctrl.set_resb_release_reset(new_rom_ctrl.resb_release_reset()); - let transfer = !bool::from(inner.rom_ctrl.block_start_status()) && bool::from(new_rom_ctrl.block_start_status()); + let transfer = !inner.rom_ctrl.block_start_status() && new_rom_ctrl.block_start_status(); mask &= 0xDF7F7FFF; inner.rom_ctrl = ((u32::from(inner.rom_ctrl) & !mask) | (value & mask)).into(); @@ -352,8 +352,8 @@ impl Cartridge { if inner.block_size == 0 { inner.rom_ctrl.set_data_word_status(u1::new(0)); - inner.rom_ctrl.set_block_start_status(u1::new(0)); - if bool::from(inner.aux_spi_cnt.transfer_ready_irq()) { + inner.rom_ctrl.set_block_start_status(false); + if inner.aux_spi_cnt.transfer_ready_irq() { get_cpu_regs_mut!(emu, CPU).send_interrupt(InterruptFlag::NdsSlotTransferCompletion, emu); } } else { diff --git a/src/core/memory/io_arm7.rs b/src/core/memory/io_arm7.rs index 2ac23e9..5af5272 100644 --- a/src/core/memory/io_arm7.rs +++ b/src/core/memory/io_arm7.rs @@ -8,7 +8,6 @@ use crate::core::timers::Timers; use crate::core::wifi::Wifi; use crate::core::CpuType::ARM7; use crate::utils::Convert; -use std::intrinsics::likely; use std::sync::atomic::AtomicU16; use std::sync::Arc; @@ -34,22 +33,19 @@ impl IoArm7 { } pub fn read(&mut self, addr_offset: u32, emu: &mut Emu) -> T { - if likely(IoArm7ReadLut::is_in_range(addr_offset)) { - T::from(IoArm7ReadLut::read(addr_offset, size_of::() as u8, emu)) - } else if IoArm7ReadLutUpper::is_in_range(addr_offset) { - T::from(IoArm7ReadLutUpper::read(addr_offset, size_of::() as u8, emu)) - } else if IoArm7ReadLutWifi::is_in_range(addr_offset) { - T::from(IoArm7ReadLutWifi::read(addr_offset, size_of::() as u8, emu)) - } else { - T::from(0) + match addr_offset & 0xF00000 { + 0x0 if IoArm7ReadLut::is_in_range(addr_offset) => T::from(IoArm7ReadLut::read(addr_offset, size_of::() as u8, emu)), + 0x100000 if IoArm7ReadLutUpper::is_in_range(addr_offset) => T::from(IoArm7ReadLutUpper::read(addr_offset, size_of::() as u8, emu)), + 0x800000 if IoArm7ReadLutWifi::is_in_range(addr_offset) => T::from(IoArm7ReadLutWifi::read(addr_offset, size_of::() as u8, emu)), + _ => T::from(0), } } pub fn write(&mut self, addr_offset: u32, value: T, emu: &mut Emu) { - if likely(IoArm7WriteLut::is_in_range(addr_offset)) { - IoArm7WriteLut::write(value.into(), addr_offset, size_of::() as u8, emu); - } else if IoArm7WriteLutWifi::is_in_range(addr_offset) { - IoArm7WriteLutWifi::write(value.into(), addr_offset, size_of::() as u8, emu); + match addr_offset & 0xF00000 { + 0x0 if IoArm7WriteLut::is_in_range(addr_offset) => IoArm7WriteLut::write(value.into(), addr_offset, size_of::() as u8, emu), + 0x800000 if IoArm7WriteLutWifi::is_in_range(addr_offset) => IoArm7WriteLutWifi::write(value.into(), addr_offset, size_of::() as u8, emu), + _ => {} } } } diff --git a/src/core/memory/io_arm9.rs b/src/core/memory/io_arm9.rs index 5ad1d46..347d096 100644 --- a/src/core/memory/io_arm9.rs +++ b/src/core/memory/io_arm9.rs @@ -23,20 +23,22 @@ impl IoArm9 { } pub fn read(&mut self, addr_offset: u32, emu: &mut Emu) -> T { - if likely(IoArm9ReadLut::is_in_range(addr_offset)) { - T::from(IoArm9ReadLut::read(addr_offset, size_of::() as u8, emu)) - } else if IoArm9ReadLutUpper::is_in_range(addr_offset) { - T::from(IoArm9ReadLutUpper::read(addr_offset, size_of::() as u8, emu)) - } else { - T::from(0) + match addr_offset & 0xF00000 { + 0x0 if IoArm9ReadLut::is_in_range(addr_offset) => T::from(IoArm9ReadLut::read(addr_offset, size_of::() as u8, emu)), + 0x100000 if IoArm9ReadLutUpper::is_in_range(addr_offset) => T::from(IoArm9ReadLutUpper::read(addr_offset, size_of::() as u8, emu)), + _ => T::from(0), } } pub fn write(&mut self, addr_offset: u32, value: T, emu: &mut Emu) { - IoArm9WriteLut::write(value.into(), addr_offset, size_of::() as u8, emu); + if likely(IoArm9WriteLut::is_in_range(addr_offset)) { + IoArm9WriteLut::write(value.into(), addr_offset, size_of::() as u8, emu); + } } pub fn write_fixed_slice(&mut self, addr_offset: u32, slice: &[T], emu: &mut Emu) { - IoArm9WriteLut::write_fixed_slice(addr_offset, slice, emu); + if likely(IoArm9WriteLut::is_in_range(addr_offset)) { + IoArm9WriteLut::write_fixed_slice(addr_offset, slice, emu); + } } } diff --git a/src/jit/assembler/block_asm.rs b/src/jit/assembler/block_asm.rs index cf15abd..da280b4 100644 --- a/src/jit/assembler/block_asm.rs +++ b/src/jit/assembler/block_asm.rs @@ -942,7 +942,7 @@ impl<'a> BlockAsm<'a> { } basic_block.remove_dead_code(self); - basic_block.consolidate_reg_io(self); + // basic_block.consolidate_reg_io(self); } (basic_blocks, reachable_blocks) diff --git a/src/jit/assembler/block_inst_list.rs b/src/jit/assembler/block_inst_list.rs index 33c6123..129869a 100644 --- a/src/jit/assembler/block_inst_list.rs +++ b/src/jit/assembler/block_inst_list.rs @@ -205,9 +205,9 @@ impl<'a> Iterator for BlockIntListIter<'a> { if unlikely(self.entry.is_null()) { None } else { - let entry = unsafe { self.entry.as_ref() }; - self.entry = entry?.next; - entry + let entry = unsafe { self.entry.as_ref_unchecked() }; + self.entry = entry.next; + Some(entry) } } @@ -233,12 +233,12 @@ impl<'a> Iterator for BlockIntListRevIter<'a> { type Item = &'a BlockInstListEntry; fn next(&mut self) -> Option { - if self.entry.is_null() { + if unlikely(self.entry.is_null()) { None } else { - let entry = unsafe { self.entry.as_ref() }; - self.entry = entry?.previous; - entry + let entry = unsafe { self.entry.as_ref_unchecked() }; + self.entry = entry.previous; + Some(entry) } } diff --git a/src/jit/jit_asm.rs b/src/jit/jit_asm.rs index e16a7f4..047aa57 100644 --- a/src/jit/jit_asm.rs +++ b/src/jit/jit_asm.rs @@ -181,7 +181,7 @@ fn emit_code_block_internal(asm: &mut Jit } let jit_entry = { - // unsafe { BLOCK_LOG = guest_pc == 0x200187c }; + // unsafe { BLOCK_LOG = guest_pc == 0x20d755c }; let mut block_asm = asm.new_block_asm(false); diff --git a/src/jit/jit_memory.rs b/src/jit/jit_memory.rs index 30464c4..04952ef 100644 --- a/src/jit/jit_memory.rs +++ b/src/jit/jit_memory.rs @@ -11,8 +11,8 @@ use crate::jit::reg::Reg; use crate::jit::Cond; use crate::logging::debug_println; use crate::mmap::{flush_icache, Mmap, PAGE_SIZE}; +use crate::utils; use crate::utils::{HeapMem, HeapMemU8}; -use crate::{utils, IS_DEBUG}; use paste::paste; use std::intrinsics::unlikely; use std::marker::ConstParamTy; @@ -94,8 +94,6 @@ pub struct JitLiveRanges { pub vram_arm7: HeapMemU8<{ (vram::ARM7_SIZE / JIT_LIVE_RANGE_PAGE_SIZE / 8) as usize }>, } -const JIT_PERF_MAP_RECORD: bool = IS_DEBUG; - #[cfg(target_os = "linux")] struct JitPerfMapRecord { common_records: Vec<(usize, usize, String)>, @@ -115,27 +113,21 @@ impl JitPerfMapRecord { } fn record_common(&mut self, jit_start: usize, jit_size: usize, name: impl AsRef) { - if JIT_PERF_MAP_RECORD { - self.common_records.push((jit_start, jit_size, name.as_ref().to_string())); - use std::io::Write; - writeln!(self.perf_map, "{jit_start:x} {jit_size:x} {}", name.as_ref()).unwrap(); - } + self.common_records.push((jit_start, jit_size, name.as_ref().to_string())); + use std::io::Write; + writeln!(self.perf_map, "{jit_start:x} {jit_size:x} {}", name.as_ref()).unwrap(); } fn record(&mut self, jit_start: usize, jit_size: usize, guest_pc: u32, cpu_type: CpuType) { - if JIT_PERF_MAP_RECORD { - use std::io::Write; - writeln!(self.perf_map, "{jit_start:x} {jit_size:x} {cpu_type:?}_{guest_pc:x}").unwrap(); - } + use std::io::Write; + writeln!(self.perf_map, "{jit_start:x} {jit_size:x} {cpu_type:?}_{guest_pc:x}").unwrap(); } fn reset(&mut self) { - if JIT_PERF_MAP_RECORD { - self.perf_map = std::fs::File::create(&self.perf_map_path).unwrap(); - for (jit_start, jit_size, name) in &self.common_records { - use std::io::Write; - writeln!(self.perf_map, "{jit_start:x} {jit_size:x} {name}").unwrap(); - } + self.perf_map = std::fs::File::create(&self.perf_map_path).unwrap(); + for (jit_start, jit_size, name) in &self.common_records { + use std::io::Write; + writeln!(self.perf_map, "{jit_start:x} {jit_size:x} {name}").unwrap(); } } } diff --git a/src/math.rs b/src/math.rs index 0c86ed2..62d1c5e 100644 --- a/src/math.rs +++ b/src/math.rs @@ -1,6 +1,7 @@ use paste::paste; +use std::arch::asm; +use std::ops; use std::ops::{Index, IndexMut}; -use std::{mem, ops}; #[derive(Copy, Clone)] pub struct Matrix([i32; 16]); @@ -69,7 +70,7 @@ macro_rules! define_vector { impl Default for [] { fn default() -> Self { - unsafe { mem::zeroed() } + []([$t::default(); SIZE]) } } @@ -99,17 +100,6 @@ macro_rules! define_vector { } } - impl ops::Mul<$t> for [] { - type Output = Self; - - fn mul(mut self, rhs: $t) -> Self::Output { - for i in 0..SIZE { - self.0[i] *= rhs - } - self - } - } - impl From<[]<3>> for []<4> { fn from(value: []<3>) -> Self { let mut ret = Self::default(); @@ -126,16 +116,46 @@ macro_rules! define_vector { define_vector!(u16); define_vector!(i16); define_vector!(i32); +define_vector!(f32); impl ops::Mul for Vectori32<3> { type Output = Self; fn mul(self, rhs: Matrix) -> Self::Output { - let mut ret = Vectori32::default(); - ret.0[0] = ((self.0[0] as i64 * rhs.0[0] as i64 + self.0[1] as i64 * rhs.0[4] as i64 + self.0[2] as i64 * rhs.0[8] as i64) >> 12) as i32; - ret.0[1] = ((self.0[0] as i64 * rhs.0[1] as i64 + self.0[1] as i64 * rhs.0[5] as i64 + self.0[2] as i64 * rhs.0[9] as i64) >> 12) as i32; - ret.0[2] = ((self.0[0] as i64 * rhs.0[2] as i64 + self.0[1] as i64 * rhs.0[6] as i64 + self.0[2] as i64 * rhs.0[10] as i64) >> 12) as i32; - ret + let mut v0: i32; + let mut v1: i32; + let mut v2: i32; + unsafe { + asm!( + "vmov.s32 d1, 0", + "vld1.s32 {{d0}}, [{v}]!", + "vld1.s32 {{d1[0]}}, [{v}]", + "vld1.s32 {{q1}}, [{m}]!", + "vld1.s32 {{q2}}, [{m}]!", + "vld1.s32 {{q3}}, [{m}]!", + "vld1.s32 {{q4}}, [{m}]", + "vmull.s32 q5, d2, d0[0]", + "vmull.s32 q6, d3, d0[0]", + "vmlal.s32 q5, d4, d0[1]", + "vmlal.s32 q6, d5, d0[1]", + "vmlal.s32 q5, d6, d1[0]", + "vmlal.s32 q6, d7, d1[0]", + "vmlal.s32 q5, d8, d1[1]", + "vmlal.s32 q6, d9, d1[1]", + "vshr.s64 q5, q5, 12", + "vshr.s64 q6, q6, 12", + "vmov.s32 {v0}, s20", + "vmov.s32 {v1}, s22", + "vmov.s32 {v2}, s24", + v = in(reg) self.0.as_ptr(), + m = in(reg) rhs.0.as_ptr(), + v0 = out(reg) v0, + v1 = out(reg) v1, + v2 = out(reg) v2, + options(pure, readonly, preserves_flags, nostack), + ); + } + Vectori32([v0, v1, v2]) } } @@ -143,12 +163,41 @@ impl ops::Mul for Vectori32<4> { type Output = Self; fn mul(self, rhs: Matrix) -> Self::Output { - let mut ret = Vectori32::default(); - ret[0] = ((self[0] as i64 * rhs[0] as i64 + self[1] as i64 * rhs[4] as i64 + self[2] as i64 * rhs[8] as i64 + self[3] as i64 * rhs[12] as i64) >> 12) as i32; - ret[1] = ((self[0] as i64 * rhs[1] as i64 + self[1] as i64 * rhs[5] as i64 + self[2] as i64 * rhs[9] as i64 + self[3] as i64 * rhs[13] as i64) >> 12) as i32; - ret[2] = ((self[0] as i64 * rhs[2] as i64 + self[1] as i64 * rhs[6] as i64 + self[2] as i64 * rhs[10] as i64 + self[3] as i64 * rhs[14] as i64) >> 12) as i32; - ret[3] = ((self[0] as i64 * rhs[3] as i64 + self[1] as i64 * rhs[7] as i64 + self[2] as i64 * rhs[11] as i64 + self[3] as i64 * rhs[15] as i64) >> 12) as i32; - ret + let mut v0: i32; + let mut v1: i32; + let mut v2: i32; + let mut v3: i32; + unsafe { + asm!( + "vld1.s32 {{q0}}, [{v}]", + "vld1.s32 {{q1}}, [{m}]!", + "vld1.s32 {{q2}}, [{m}]!", + "vld1.s32 {{q3}}, [{m}]!", + "vld1.s32 {{q4}}, [{m}]", + "vmull.s32 q5, d2, d0[0]", + "vmull.s32 q6, d3, d0[0]", + "vmlal.s32 q5, d4, d0[1]", + "vmlal.s32 q6, d5, d0[1]", + "vmlal.s32 q5, d6, d1[0]", + "vmlal.s32 q6, d7, d1[0]", + "vmlal.s32 q5, d8, d1[1]", + "vmlal.s32 q6, d9, d1[1]", + "vshr.s64 q5, q5, 12", + "vshr.s64 q6, q6, 12", + "vmov.s32 {v0}, s20", + "vmov.s32 {v1}, s22", + "vmov.s32 {v2}, s24", + "vmov.s32 {v3}, s26", + v = in(reg) self.0.as_ptr(), + m = in(reg) rhs.0.as_ptr(), + v0 = out(reg) v0, + v1 = out(reg) v1, + v2 = out(reg) v2, + v3 = out(reg) v3, + options(pure, readonly, preserves_flags, nostack), + ); + } + Vectori32([v0, v1, v2, v3]) } } @@ -164,14 +213,40 @@ impl ops::MulAssign for Vectori32<4> { } } -impl ops::Mul for Vectori32 { +impl ops::Mul for Vectori32<3> { type Output = i32; fn mul(self, rhs: Self) -> Self::Output { + /* Vectorization of let mut dot = 0; - for i in 0..SIZE { - dot += self[i] as i64 * rhs[i] as i64; - } + dot += self[0] as i64 * rhs[0] as i64; + dot += self[1] as i64 * rhs[1] as i64; + dot += self[2] as i64 * rhs[2] as i64; (dot >> 12) as i32 + */ + + let v1 = self.0.as_ptr(); + let v2 = rhs.0.as_ptr(); + let mut dot: i32; + unsafe { + asm!( + "vmov.s32 d1, 0", + "vmov.s32 d3, 0", + "vld1.s32 {{d0}}, [{v1}]!", + "vld1.s32 {{d1[0]}}, [{v1}]", + "vld1.s32 {{d2}}, [{v2}]!", + "vld1.s32 {{d3[0]}}, [{v2}]", + "vmull.s32 q2, d0, d2", + "vmlal.s32 q2, d1, d3", + "vadd.s64 d4, d4, d5", + "vshr.s64 d4, d4, 12", + "vmov.s32 {dot}, d4[0]", + v1 = in(reg) v1, + v2 = in(reg) v2, + dot = out(reg) dot, + options(pure, readonly, preserves_flags, nostack), + ); + } + dot } }