From 2d7ab4cf7d9a4e65e3e294adcc61c77ce0ddb4fd Mon Sep 17 00:00:00 2001 From: Willi Ye Date: Sun, 5 Jan 2025 08:42:19 +0100 Subject: [PATCH] Add padding to vec3 * Unify vec4 and vec3 neon operations --- src/core/graphics/gpu_3d/registers_3d.rs | 20 +-- src/core/memory/io_arm7.rs | 3 - src/core/memory/io_arm9.rs | 3 - src/math.rs | 154 +++++++++++++---------- 4 files changed, 98 insertions(+), 82 deletions(-) diff --git a/src/core/graphics/gpu_3d/registers_3d.rs b/src/core/graphics/gpu_3d/registers_3d.rs index cb35d31..b7d1843 100644 --- a/src/core/graphics/gpu_3d/registers_3d.rs +++ b/src/core/graphics/gpu_3d/registers_3d.rs @@ -872,10 +872,11 @@ impl Gpu3DRegisters { fn exe_normal(&mut self, params: &[u32; 32]) { let normal_vector_param = NormalVector::from(params[0]); - let mut normal_vector = Vectori32::<3>::default(); - normal_vector[0] = (((u16::from(normal_vector_param.x()) << 6) as i16) >> 3) as i32; - normal_vector[1] = (((u16::from(normal_vector_param.y()) << 6) as i16) >> 3) as i32; - normal_vector[2] = (((u16::from(normal_vector_param.z()) << 6) as i16) >> 3) as i32; + let mut normal_vector = Vectori32::<3>::new([ + (((u16::from(normal_vector_param.x()) << 6) as i16) >> 3) as i32, + (((u16::from(normal_vector_param.y()) << 6) as i16) >> 3) as i32, + (((u16::from(normal_vector_param.z()) << 6) as i16) >> 3) as i32, + ]); if self.texture_coord_mode == TextureCoordTransMode::Normal { let mut vector = Vectori32::<4>::from(normal_vector); @@ -941,7 +942,7 @@ impl Gpu3DRegisters { self.t = tex_coord.t() as i16; if self.texture_coord_mode == TextureCoordTransMode::TexCoord { - let mut vector = Vectori32::<4>([(self.s as i32) << 8, (self.t as i32) << 8, 1 << 8, 1 << 8]); + let mut vector = Vectori32::<4>::new([(self.s as i32) << 8, (self.t as i32) << 8, 1 << 8, 1 << 8]); vector *= &self.matrices.tex; @@ -1177,10 +1178,11 @@ impl Gpu3DRegisters { } fn exe_vec_test(&mut self, params: &[u32; 32]) { - let mut vector = Vectori32::<3>::default(); - vector[0] = (((params[0] & 0x000003FF) << 6) as i16 as i32) >> 3; - vector[1] = (((params[0] & 0x000FFC00) >> 4) as i16 as i32) >> 3; - vector[2] = (((params[0] & 0x3FF00000) >> 14) as i16 as i32) >> 3; + let mut vector = Vectori32::<3>::new([ + (((params[0] & 0x000003FF) << 6) as i16 as i32) >> 3, + (((params[0] & 0x000FFC00) >> 4) as i16 as i32) >> 3, + (((params[0] & 0x3FF00000) >> 14) as i16 as i32) >> 3, + ]); vector *= &self.matrices.vec; self.vec_result[0] = ((vector[0] << 3) as i16) >> 3; diff --git a/src/core/memory/io_arm7.rs b/src/core/memory/io_arm7.rs index 2d03a2c..4c35a48 100644 --- a/src/core/memory/io_arm7.rs +++ b/src/core/memory/io_arm7.rs @@ -32,7 +32,6 @@ impl IoArm7 { } } - #[inline(never)] pub fn read(&mut self, addr_offset: u32, emu: &mut Emu) -> T { match addr_offset & 0xF00000 { 0x0 if IoArm7ReadLut::is_in_range(addr_offset) => T::from(IoArm7ReadLut::read(addr_offset, size_of::() as u8, emu)), @@ -42,7 +41,6 @@ impl IoArm7 { } } - #[inline(never)] pub fn write(&mut self, addr_offset: u32, value: T, emu: &mut Emu) { match addr_offset & 0xF00000 { 0x0 if IoArm7WriteLut::is_in_range(addr_offset) => IoArm7WriteLut::write(value.into(), addr_offset, size_of::() as u8, emu), @@ -51,7 +49,6 @@ impl IoArm7 { } } - #[inline(never)] pub fn write_fixed_slice(&mut self, addr_offset: u32, slice: &[T], emu: &mut Emu) { match addr_offset & 0xF00000 { 0x0 if IoArm7WriteLut::is_in_range(addr_offset) => IoArm7WriteLut::write_fixed_slice(addr_offset, slice, emu), diff --git a/src/core/memory/io_arm9.rs b/src/core/memory/io_arm9.rs index 7d637db..347d096 100644 --- a/src/core/memory/io_arm9.rs +++ b/src/core/memory/io_arm9.rs @@ -22,7 +22,6 @@ impl IoArm9 { } } - #[inline(never)] pub fn read(&mut self, addr_offset: u32, emu: &mut Emu) -> T { match addr_offset & 0xF00000 { 0x0 if IoArm9ReadLut::is_in_range(addr_offset) => T::from(IoArm9ReadLut::read(addr_offset, size_of::() as u8, emu)), @@ -31,14 +30,12 @@ impl IoArm9 { } } - #[inline(never)] pub fn write(&mut self, addr_offset: u32, value: T, emu: &mut Emu) { if likely(IoArm9WriteLut::is_in_range(addr_offset)) { IoArm9WriteLut::write(value.into(), addr_offset, size_of::() as u8, emu); } } - #[inline(never)] pub fn write_fixed_slice(&mut self, addr_offset: u32, slice: &[T], emu: &mut Emu) { if likely(IoArm9WriteLut::is_in_range(addr_offset)) { IoArm9WriteLut::write_fixed_slice(addr_offset, slice, emu); diff --git a/src/math.rs b/src/math.rs index e5e289f..09ff190 100644 --- a/src/math.rs +++ b/src/math.rs @@ -1,8 +1,8 @@ use paste::paste; use std::arch::arm::{int64x2_t, uint64x2_t, vaddq_u64, vmovn_u64, vmull_u32, vreinterpretq_s64_u64, vreinterpretq_u64_s64, vshlq_n_u64, vshrq_n_u64}; use std::arch::asm; -use std::ops; use std::ops::{Index, IndexMut}; +use std::{mem, ops}; // Taken from https://github.com/awxkee/erydanos/blob/master/src/neon/general.rs #[inline] @@ -150,9 +150,82 @@ macro_rules! define_vector { }; } +#[derive(Copy, Clone)] +#[repr(C)] +pub struct Vectori32 +where + [(); 4 - SIZE]:, +{ + values: [i32; SIZE], + padding: [i32; 4 - SIZE], +} + +impl Vectori32 +where + [(); 4 - SIZE]:, +{ + pub fn new(values: [i32; SIZE]) -> Self { + Vectori32 { + values, + padding: unsafe { mem::zeroed() }, + } + } +} + +impl Default for Vectori32 +where + [(); 4 - SIZE]:, +{ + fn default() -> Self { + unsafe { mem::zeroed() } + } +} + +impl AsRef<[i32; SIZE]> for Vectori32 +where + [(); 4 - SIZE]:, +{ + fn as_ref(&self) -> &[i32; SIZE] { + &self.values + } +} + +impl AsMut<[i32; SIZE]> for Vectori32 +where + [(); 4 - SIZE]:, +{ + fn as_mut(&mut self) -> &mut [i32; SIZE] { + &mut self.values + } +} + +impl Index for Vectori32 +where + [(); 4 - SIZE]:, +{ + type Output = i32; + fn index(&self, index: usize) -> &Self::Output { + &self.values[index] + } +} + +impl IndexMut for Vectori32 +where + [(); 4 - SIZE]:, +{ + fn index_mut(&mut self, index: usize) -> &mut Self::Output { + &mut self.values[index] + } +} + +impl From> for Vectori32<4> { + fn from(value: Vectori32<3>) -> Self { + unsafe { mem::transmute(value) } + } +} + define_vector!(u16); define_vector!(i16); -define_vector!(i32); define_vector!(f32); impl ops::Mul<&Matrix> for Vectori32<3> { @@ -175,51 +248,13 @@ impl ops::Mul<&Matrix> for Vectori32<4> { impl ops::MulAssign<&Matrix> for Vectori32<3> { fn mul_assign(&mut self, rhs: &Matrix) { - let mut v0: i32; - let mut v1: i32; - let mut v2: i32; - unsafe { - asm!( - "vmov.s32 d1, 0", - "vld1.s32 {{d0}}, [{v}]!", - "vld1.s32 {{d1[0]}}, [{v}]", - "vld1.s32 {{q1}}, [{m}]!", - "vld1.s32 {{q2}}, [{m}]!", - "vld1.s32 {{q3}}, [{m}]!", - "vld1.s32 {{q4}}, [{m}]", - "vmull.s32 q5, d2, d0[0]", - "vmull.s32 q6, d3, d0[0]", - "vmlal.s32 q5, d4, d0[1]", - "vmlal.s32 q6, d5, d0[1]", - "vmlal.s32 q5, d6, d1[0]", - "vmlal.s32 q6, d7, d1[0]", - "vmlal.s32 q5, d8, d1[1]", - "vmlal.s32 q6, d9, d1[1]", - "vshr.s64 q5, q5, 12", - "vshr.s64 q6, q6, 12", - "vmov.s32 {v0}, s20", - "vmov.s32 {v1}, s22", - "vmov.s32 {v2}, s24", - v = in(reg) self.0.as_ptr(), - m = in(reg) rhs.0.as_ptr(), - v0 = out(reg) v0, - v1 = out(reg) v1, - v2 = out(reg) v2, - options(pure, readonly, preserves_flags, nostack), - ); - } - self[0] = v0; - self[1] = v1; - self[2] = v2; + let vec4: &mut Vectori32<4> = unsafe { mem::transmute(self) }; + vec4.mul_assign(rhs) } } impl ops::MulAssign<&Matrix> for Vectori32<4> { fn mul_assign(&mut self, rhs: &Matrix) { - let mut v0: i32; - let mut v1: i32; - let mut v2: i32; - let mut v3: i32; unsafe { asm!( "vld1.s32 {{q0}}, [{v}]", @@ -237,30 +272,20 @@ impl ops::MulAssign<&Matrix> for Vectori32<4> { "vmlal.s32 q6, d9, d1[1]", "vshr.s64 q5, q5, 12", "vshr.s64 q6, q6, 12", - "vmov.s32 {v0}, s20", - "vmov.s32 {v1}, s22", - "vmov.s32 {v2}, s24", - "vmov.s32 {v3}, s26", - v = in(reg) self.0.as_ptr(), + "vuzp.32 q5, q6", + "vst1.s32 {{q5}}, [{v}]", + v = in(reg) self.values.as_mut_ptr(), m = in(reg) rhs.0.as_ptr(), - v0 = out(reg) v0, - v1 = out(reg) v1, - v2 = out(reg) v2, - v3 = out(reg) v3, - options(pure, readonly, preserves_flags, nostack), + options(preserves_flags, nostack), ); } - self[0] = v0; - self[1] = v1; - self[2] = v2; - self[3] = v3; } } impl ops::Mul<&Vectori32<3>> for Vectori32<3> { type Output = i32; - fn mul(self, rhs: &Vectori32<3>) -> Self::Output { + fn mul(mut self, rhs: &Vectori32<3>) -> Self::Output { /* Vectorization of let mut dot = 0; dot += self[0] as i64 * rhs[0] as i64; @@ -269,24 +294,19 @@ impl ops::Mul<&Vectori32<3>> for Vectori32<3> { (dot >> 12) as i32 */ - let v1 = self.0.as_ptr(); - let v2 = rhs.0.as_ptr(); + self.padding[0] = 0; let mut dot: i32; unsafe { asm!( - "vmov.s32 d1, 0", - "vmov.s32 d3, 0", - "vld1.s32 {{d0}}, [{v1}]!", - "vld1.s32 {{d1[0]}}, [{v1}]", - "vld1.s32 {{d2}}, [{v2}]!", - "vld1.s32 {{d3[0]}}, [{v2}]", + "vld1.s32 {{q1}}, [{v1}]", + "vld1.s32 {{q2}}, [{v2}]", "vmull.s32 q2, d0, d2", "vmlal.s32 q2, d1, d3", "vadd.s64 d4, d4, d5", "vshr.s64 d4, d4, 12", "vmov.s32 {dot}, d4[0]", - v1 = in(reg) v1, - v2 = in(reg) v2, + v1 = in(reg) self.values.as_ptr(), + v2 = in(reg) rhs.values.as_ptr(), dot = out(reg) dot, options(pure, readonly, preserves_flags, nostack), );