Make Box2d::contains and Box2d intersects faster #526

nical · 2024-05-19T19:57:09Z

After stumbling upon https://www.romainguy.dev/posts/2024/down-a-rabbit-hole/ which applies the same optimization to equivalent Kotlin code I gave it a try in euclid and it indeed makes for quite a nice speedup:

Chaining simple && conditions produces branch instructions in some cases. Replacing the logical && with bitwise & to ensure the compiler does not produce branches makes a big (70~80%) performance difference.

Generated assembly

intersects before:

.section .text.euclid::_intersects,"ax",@progbits
	.globl	euclid::_intersects
	.p2align	4, 0x90
	.type	euclid::_intersects,@function
euclid::_intersects:
	.cfi_startproc
	movss xmm0, dword ptr [rsi + 8]
	ucomiss xmm0, dword ptr [rdi]
	jbe .LBB0_4
	movss xmm0, dword ptr [rdi + 8]
	ucomiss xmm0, dword ptr [rsi]
	jbe .LBB0_4
	movss xmm0, dword ptr [rsi + 12]
	ucomiss xmm0, dword ptr [rdi + 4]
	jbe .LBB0_4
	movss xmm0, dword ptr [rdi + 12]
	ucomiss xmm0, dword ptr [rsi + 4]
	seta al
	ret
.LBB0_4:
	xor eax, eax
	ret

intersects after:

.section .text.euclid::_intersects,"ax",@progbits
	.globl	euclid::_intersects
	.p2align	4, 0x90
	.type	euclid::_intersects,@function
euclid::_intersects:
	.cfi_startproc
	movsd xmm0, qword ptr [rdi + 8]
	movhps xmm0, qword ptr [rsi + 8]
	movups xmm1, xmmword ptr [rsi]
	movhps xmm1, qword ptr [rdi]
	cmpnltps xmm1, xmm0
	movmskps eax, xmm1
	test eax, eax
	sete al
	ret

contains before:

.section .text.euclid::box2d_contains_f32,"ax",@progbits
	.globl	euclid::box2d_contains_f32
	.p2align	4, 0x90
	.type	euclid::box2d_contains_f32,@function
euclid::box2d_contains_f32:
	.cfi_startproc
	xor eax, eax
	ucomiss xmm0, dword ptr [rdi]
	jb .LBB3_4
	movss xmm2, dword ptr [rdi + 8]
	ucomiss xmm2, xmm0
	jbe .LBB3_4
	ucomiss xmm1, dword ptr [rdi + 4]
	jb .LBB3_4
	movss xmm0, dword ptr [rdi + 12]
	ucomiss xmm0, xmm1
	seta al
.LBB3_4:
	ret

contains after:

.section .text.euclid::box2d_contains_f32,"ax",@progbits
	.globl	euclid::box2d_contains_f32
	.p2align	4, 0x90
	.type	euclid::box2d_contains_f32,@function
euclid::box2d_contains_f32:

	.cfi_startproc
	ucomiss xmm1, dword ptr [rdi + 4]
	setae cl
	movss xmm2, dword ptr [rdi + 12]
	ucomiss xmm2, xmm1
	seta al
	movss xmm1, dword ptr [rdi]
	movss xmm2, dword ptr [rdi + 8]
	cmpleps xmm1, xmm0
	cmpltps xmm0, xmm2
	andps xmm0, xmm1
	movd edx, xmm0
	and al, cl
	and al, dl
	ret

Benchmark code

Results in code comments.

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use euclid::default::*;

fn pseudo_rand(state: &mut f32) -> f32 {
    let seed = 7.0;
    let x = *state;
    *state += 1.0;

    f32::fract(f32::tan((x * 1.61803398874989484820459 - x).abs() * seed) * x)
}

// Before:
//   box2d intersects f32    time:   [246.45 ms 246.60 ms 246.75 ms]
// After:
//   box2d intersects f32    time:   [70.162 ms 70.189 ms 70.218 ms]                                 
//   change: [-71.558% -71.537% -71.516%] (p = 0.00 < 0.05)
pub fn box2d_intersects_f32(c: &mut Criterion) {
    let mut boxes = Vec::with_capacity(10_000);

    let mut rand_state = 1.0;
    for _ in 0..10_000 {
        let x0 = pseudo_rand(&mut rand_state) * 2000.0;
        let x1 = pseudo_rand(&mut rand_state) * 2000.0;
        let y0 = pseudo_rand(&mut rand_state) * 2000.0;
        let y1 = pseudo_rand(&mut rand_state) * 2000.0;
        boxes.push(Box2D {
            min: Point2D::new(x0.min(x1), y0.min(y1)),
            max: Point2D::new(x0.max(x1), y0.max(y1)),
        });
    }

    c.bench_function("box2d intersects f32", |b| b.iter(|| {
        for a in boxes.iter() {
            for b in boxes.iter() {
                black_box(a.intersects(b));
            }
        }
    }));
}

// Before:
//   box2d contains f32      time:   [367.26 ms 367.50 ms 367.76 ms]
// After:
//   box2d contains f32      time:   [52.898 ms 52.940 ms 52.986 ms]                               
//   change: [-85.609% -85.595% -85.579%] (p = 0.00 < 0.05)
pub fn box2d_contains_f32(c: &mut Criterion) {
    let mut boxes = Vec::with_capacity(10_000);
    let mut points = Vec::with_capacity(10_000);

    let mut rand_state = 1.0;
    for _ in 0..10_000 {
        let x0 = pseudo_rand(&mut rand_state) * 2000.0;
        let x1 = pseudo_rand(&mut rand_state) * 2000.0;
        let y0 = pseudo_rand(&mut rand_state) * 2000.0;
        let y1 = pseudo_rand(&mut rand_state) * 2000.0;
        boxes.push(Box2D {
            min: Point2D::new(x0.min(x1), y0.min(y1)),
            max: Point2D::new(x0.max(x1), y0.max(y1)),
        });
        let x2 = pseudo_rand(&mut rand_state) * 2000.0;
        let y2 = pseudo_rand(&mut rand_state) * 2000.0;
        points.push(Point2D::new(x2, y2));
    }

    c.bench_function("box2d contains f32", |b| b.iter(|| {
        for a in boxes.iter() {
            for b in points.iter() {
                black_box(a.contains(*b));
            }
        }
    }));
}


criterion_group!(box2d, box2d_intersects_f32, box2d_contains_f32);
criterion_main!(box2d);

Chaining simple && conditions produces branch instructions in some cases. Replacing the logical and with bitwise & to ensure the compiler does not produce branches makes a big performance difference.

nical · 2024-05-19T20:31:44Z

The speedup for contains_box:

Before:

box2d contains_box f32  time:   [436.69 ms 436.97 ms 437.26 ms]

After:

box2d contains_box f32  time:   [244.31 ms 244.51 ms 244.72 ms]                                   
change: [-44.102% -44.044% -43.981%] (p = 0.00 < 0.05)

jdm

Makes sense!

nical added 3 commits May 19, 2024 22:28

Make Box2d::contains and Box2d intersects faster

9612b8a

Chaining simple && conditions produces branch instructions in some cases. Replacing the logical and with bitwise & to ensure the compiler does not produce branches makes a big performance difference.

Speed up Box2d::contains_box the same way

cb88653

Give box3d the same treatment

1895f65

nical force-pushed the branchless branch from 30b1c9a to 1895f65 Compare May 19, 2024 20:28

jdm approved these changes May 20, 2024

View reviewed changes

jdm added this pull request to the merge queue May 20, 2024

Merged via the queue into servo:main with commit 3a3e57f May 20, 2024
19 checks passed

nical deleted the branchless branch May 21, 2024 06:55

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Make Box2d::contains and Box2d intersects faster #526

Make Box2d::contains and Box2d intersects faster #526

nical commented May 19, 2024

nical commented May 19, 2024

jdm left a comment

Make Box2d::contains and Box2d intersects faster #526

Make Box2d::contains and Box2d intersects faster #526

Conversation

nical commented May 19, 2024

Generated assembly

Benchmark code

nical commented May 19, 2024

jdm left a comment

Choose a reason for hiding this comment