Skip to content

Commit

Permalink
Add padding to vec3
Browse files Browse the repository at this point in the history
* Unify vec4 and vec3 neon operations
  • Loading branch information
Grarak committed Jan 5, 2025
1 parent cb46f8a commit 2d7ab4c
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 82 deletions.
20 changes: 11 additions & 9 deletions src/core/graphics/gpu_3d/registers_3d.rs
Original file line number Diff line number Diff line change
Expand Up @@ -872,10 +872,11 @@ impl Gpu3DRegisters {

fn exe_normal(&mut self, params: &[u32; 32]) {
let normal_vector_param = NormalVector::from(params[0]);
let mut normal_vector = Vectori32::<3>::default();
normal_vector[0] = (((u16::from(normal_vector_param.x()) << 6) as i16) >> 3) as i32;
normal_vector[1] = (((u16::from(normal_vector_param.y()) << 6) as i16) >> 3) as i32;
normal_vector[2] = (((u16::from(normal_vector_param.z()) << 6) as i16) >> 3) as i32;
let mut normal_vector = Vectori32::<3>::new([
(((u16::from(normal_vector_param.x()) << 6) as i16) >> 3) as i32,
(((u16::from(normal_vector_param.y()) << 6) as i16) >> 3) as i32,
(((u16::from(normal_vector_param.z()) << 6) as i16) >> 3) as i32,
]);

if self.texture_coord_mode == TextureCoordTransMode::Normal {
let mut vector = Vectori32::<4>::from(normal_vector);
Expand Down Expand Up @@ -941,7 +942,7 @@ impl Gpu3DRegisters {
self.t = tex_coord.t() as i16;

if self.texture_coord_mode == TextureCoordTransMode::TexCoord {
let mut vector = Vectori32::<4>([(self.s as i32) << 8, (self.t as i32) << 8, 1 << 8, 1 << 8]);
let mut vector = Vectori32::<4>::new([(self.s as i32) << 8, (self.t as i32) << 8, 1 << 8, 1 << 8]);

vector *= &self.matrices.tex;

Expand Down Expand Up @@ -1177,10 +1178,11 @@ impl Gpu3DRegisters {
}

fn exe_vec_test(&mut self, params: &[u32; 32]) {
let mut vector = Vectori32::<3>::default();
vector[0] = (((params[0] & 0x000003FF) << 6) as i16 as i32) >> 3;
vector[1] = (((params[0] & 0x000FFC00) >> 4) as i16 as i32) >> 3;
vector[2] = (((params[0] & 0x3FF00000) >> 14) as i16 as i32) >> 3;
let mut vector = Vectori32::<3>::new([
(((params[0] & 0x000003FF) << 6) as i16 as i32) >> 3,
(((params[0] & 0x000FFC00) >> 4) as i16 as i32) >> 3,
(((params[0] & 0x3FF00000) >> 14) as i16 as i32) >> 3,
]);

vector *= &self.matrices.vec;
self.vec_result[0] = ((vector[0] << 3) as i16) >> 3;
Expand Down
3 changes: 0 additions & 3 deletions src/core/memory/io_arm7.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ impl IoArm7 {
}
}

#[inline(never)]
pub fn read<T: Convert>(&mut self, addr_offset: u32, emu: &mut Emu) -> T {
match addr_offset & 0xF00000 {
0x0 if IoArm7ReadLut::is_in_range(addr_offset) => T::from(IoArm7ReadLut::read(addr_offset, size_of::<T>() as u8, emu)),
Expand All @@ -42,7 +41,6 @@ impl IoArm7 {
}
}

#[inline(never)]
pub fn write<T: Convert>(&mut self, addr_offset: u32, value: T, emu: &mut Emu) {
match addr_offset & 0xF00000 {
0x0 if IoArm7WriteLut::is_in_range(addr_offset) => IoArm7WriteLut::write(value.into(), addr_offset, size_of::<T>() as u8, emu),
Expand All @@ -51,7 +49,6 @@ impl IoArm7 {
}
}

#[inline(never)]
pub fn write_fixed_slice<T: Convert>(&mut self, addr_offset: u32, slice: &[T], emu: &mut Emu) {
match addr_offset & 0xF00000 {
0x0 if IoArm7WriteLut::is_in_range(addr_offset) => IoArm7WriteLut::write_fixed_slice(addr_offset, slice, emu),
Expand Down
3 changes: 0 additions & 3 deletions src/core/memory/io_arm9.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ impl IoArm9 {
}
}

#[inline(never)]
pub fn read<T: Convert>(&mut self, addr_offset: u32, emu: &mut Emu) -> T {
match addr_offset & 0xF00000 {
0x0 if IoArm9ReadLut::is_in_range(addr_offset) => T::from(IoArm9ReadLut::read(addr_offset, size_of::<T>() as u8, emu)),
Expand All @@ -31,14 +30,12 @@ impl IoArm9 {
}
}

#[inline(never)]
/// Writes `value` to the ARM9 I/O write table at `addr_offset`.
///
/// The write is forwarded to `IoArm9WriteLut::write` together with the byte
/// width of `T`; offsets that `IoArm9WriteLut::is_in_range` rejects are
/// ignored without any further action.
pub fn write<T: Convert>(&mut self, addr_offset: u32, value: T, emu: &mut Emu) {
    // In-range is the expected case, so the hint stays on the positive test.
    if !likely(IoArm9WriteLut::is_in_range(addr_offset)) {
        return;
    }
    let access_width = size_of::<T>() as u8;
    IoArm9WriteLut::write(value.into(), addr_offset, access_width, emu);
}

#[inline(never)]
pub fn write_fixed_slice<T: Convert>(&mut self, addr_offset: u32, slice: &[T], emu: &mut Emu) {
if likely(IoArm9WriteLut::is_in_range(addr_offset)) {
IoArm9WriteLut::write_fixed_slice(addr_offset, slice, emu);
Expand Down
154 changes: 87 additions & 67 deletions src/math.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use paste::paste;
use std::arch::arm::{int64x2_t, uint64x2_t, vaddq_u64, vmovn_u64, vmull_u32, vreinterpretq_s64_u64, vreinterpretq_u64_s64, vshlq_n_u64, vshrq_n_u64};
use std::arch::asm;
use std::ops;
use std::ops::{Index, IndexMut};
use std::{mem, ops};

// Taken from https://github.com/awxkee/erydanos/blob/master/src/neon/general.rs
#[inline]
Expand Down Expand Up @@ -150,9 +150,82 @@ macro_rules! define_vector {
};
}

/// `i32` vector with `SIZE` usable lanes that always occupies four `i32` slots.
///
/// `values` holds the `SIZE` lanes and `padding` fills the remaining
/// `4 - SIZE` slots. Because of `#[repr(C)]` the fields are laid out in
/// declaration order, so the lanes come first and the padding directly after
/// them. The `[(); 4 - SIZE]:` bound restricts `SIZE` to values for which
/// `4 - SIZE` can be evaluated, i.e. `SIZE <= 4`.
#[derive(Copy, Clone)]
#[repr(C)]
pub struct Vectori32<const SIZE: usize>
where
[(); 4 - SIZE]:,
{
// Lanes reachable through indexing and `AsRef`/`AsMut`.
values: [i32; SIZE],
// Filler lanes that bring the storage up to four `i32`s.
padding: [i32; 4 - SIZE],
}

impl<const SIZE: usize> Vectori32<SIZE>
where
    [(); 4 - SIZE]:,
{
    /// Builds a vector from its `SIZE` lanes, with every padding lane set to zero.
    pub fn new(values: [i32; SIZE]) -> Self {
        Vectori32 {
            values,
            // Zero padding as a plain array literal: same bytes as zeroing the
            // memory, but without an `unsafe` block.
            padding: [0; 4 - SIZE],
        }
    }
}

impl<const SIZE: usize> Default for Vectori32<SIZE>
where
[(); 4 - SIZE]:,
{
fn default() -> Self {
unsafe { mem::zeroed() }
}
}

impl<const SIZE: usize> AsRef<[i32; SIZE]> for Vectori32<SIZE>
where
    [(); 4 - SIZE]:,
{
    /// Borrows the `SIZE` lanes as a fixed-size array; the padding is not exposed.
    fn as_ref(&self) -> &[i32; SIZE] {
        let Self { values, .. } = self;
        values
    }
}

impl<const SIZE: usize> AsMut<[i32; SIZE]> for Vectori32<SIZE>
where
    [(); 4 - SIZE]:,
{
    /// Mutably borrows the `SIZE` lanes as a fixed-size array; the padding is not exposed.
    fn as_mut(&mut self) -> &mut [i32; SIZE] {
        let Self { values, .. } = self;
        values
    }
}

impl<const SIZE: usize> Index<usize> for Vectori32<SIZE>
where
    [(); 4 - SIZE]:,
{
    type Output = i32;

    /// Returns lane `index`; panics if `index` is not below `SIZE`.
    fn index(&self, index: usize) -> &Self::Output {
        let lanes = &self.values;
        &lanes[index]
    }
}

impl<const SIZE: usize> IndexMut<usize> for Vectori32<SIZE>
where
    [(); 4 - SIZE]:,
{
    /// Returns lane `index` for writing; panics if `index` is not below `SIZE`.
    fn index_mut(&mut self, index: usize) -> &mut Self::Output {
        let lanes = &mut self.values;
        &mut lanes[index]
    }
}

impl From<Vectori32<3>> for Vectori32<4> {
fn from(value: Vectori32<3>) -> Self {
unsafe { mem::transmute(value) }
}
}

define_vector!(u16);
define_vector!(i16);
define_vector!(i32);
define_vector!(f32);

impl ops::Mul<&Matrix> for Vectori32<3> {
Expand All @@ -175,51 +248,13 @@ impl ops::Mul<&Matrix> for Vectori32<4> {

impl ops::MulAssign<&Matrix> for Vectori32<3> {
fn mul_assign(&mut self, rhs: &Matrix) {
let mut v0: i32;
let mut v1: i32;
let mut v2: i32;
unsafe {
asm!(
"vmov.s32 d1, 0",
"vld1.s32 {{d0}}, [{v}]!",
"vld1.s32 {{d1[0]}}, [{v}]",
"vld1.s32 {{q1}}, [{m}]!",
"vld1.s32 {{q2}}, [{m}]!",
"vld1.s32 {{q3}}, [{m}]!",
"vld1.s32 {{q4}}, [{m}]",
"vmull.s32 q5, d2, d0[0]",
"vmull.s32 q6, d3, d0[0]",
"vmlal.s32 q5, d4, d0[1]",
"vmlal.s32 q6, d5, d0[1]",
"vmlal.s32 q5, d6, d1[0]",
"vmlal.s32 q6, d7, d1[0]",
"vmlal.s32 q5, d8, d1[1]",
"vmlal.s32 q6, d9, d1[1]",
"vshr.s64 q5, q5, 12",
"vshr.s64 q6, q6, 12",
"vmov.s32 {v0}, s20",
"vmov.s32 {v1}, s22",
"vmov.s32 {v2}, s24",
v = in(reg) self.0.as_ptr(),
m = in(reg) rhs.0.as_ptr(),
v0 = out(reg) v0,
v1 = out(reg) v1,
v2 = out(reg) v2,
options(pure, readonly, preserves_flags, nostack),
);
}
self[0] = v0;
self[1] = v1;
self[2] = v2;
let vec4: &mut Vectori32<4> = unsafe { mem::transmute(self) };
vec4.mul_assign(rhs)
}
}

impl ops::MulAssign<&Matrix> for Vectori32<4> {
fn mul_assign(&mut self, rhs: &Matrix) {
let mut v0: i32;
let mut v1: i32;
let mut v2: i32;
let mut v3: i32;
unsafe {
asm!(
"vld1.s32 {{q0}}, [{v}]",
Expand All @@ -237,30 +272,20 @@ impl ops::MulAssign<&Matrix> for Vectori32<4> {
"vmlal.s32 q6, d9, d1[1]",
"vshr.s64 q5, q5, 12",
"vshr.s64 q6, q6, 12",
"vmov.s32 {v0}, s20",
"vmov.s32 {v1}, s22",
"vmov.s32 {v2}, s24",
"vmov.s32 {v3}, s26",
v = in(reg) self.0.as_ptr(),
"vuzp.32 q5, q6",
"vst1.s32 {{q5}}, [{v}]",
v = in(reg) self.values.as_mut_ptr(),
m = in(reg) rhs.0.as_ptr(),
v0 = out(reg) v0,
v1 = out(reg) v1,
v2 = out(reg) v2,
v3 = out(reg) v3,
options(pure, readonly, preserves_flags, nostack),
options(preserves_flags, nostack),
);
}
self[0] = v0;
self[1] = v1;
self[2] = v2;
self[3] = v3;
}
}

impl ops::Mul<&Vectori32<3>> for Vectori32<3> {
type Output = i32;

fn mul(self, rhs: &Vectori32<3>) -> Self::Output {
fn mul(mut self, rhs: &Vectori32<3>) -> Self::Output {
/* Vectorization of
let mut dot = 0;
dot += self[0] as i64 * rhs[0] as i64;
Expand All @@ -269,24 +294,19 @@ impl ops::Mul<&Vectori32<3>> for Vectori32<3> {
(dot >> 12) as i32
*/

let v1 = self.0.as_ptr();
let v2 = rhs.0.as_ptr();
self.padding[0] = 0;
let mut dot: i32;
unsafe {
asm!(
"vmov.s32 d1, 0",
"vmov.s32 d3, 0",
"vld1.s32 {{d0}}, [{v1}]!",
"vld1.s32 {{d1[0]}}, [{v1}]",
"vld1.s32 {{d2}}, [{v2}]!",
"vld1.s32 {{d3[0]}}, [{v2}]",
"vld1.s32 {{q1}}, [{v1}]",
"vld1.s32 {{q2}}, [{v2}]",
"vmull.s32 q2, d0, d2",
"vmlal.s32 q2, d1, d3",
"vadd.s64 d4, d4, d5",
"vshr.s64 d4, d4, 12",
"vmov.s32 {dot}, d4[0]",
v1 = in(reg) v1,
v2 = in(reg) v2,
v1 = in(reg) self.values.as_ptr(),
v2 = in(reg) rhs.values.as_ptr(),
dot = out(reg) dot,
options(pure, readonly, preserves_flags, nostack),
);
Expand Down

0 comments on commit 2d7ab4c

Please sign in to comment.