Skip to content

Commit

Permalink
Meshlet single pass depth downsampling (SPD) (#13003)
Browse files Browse the repository at this point in the history
# Objective
- Using multiple raster passes to generate the depth pyramid is
extremely slow
- Pulling data from the source image is the largest bottleneck, it's
important to sample in a cache-aware pattern
- Barriers and pipeline drain between the raster passes is the second
largest bottleneck
- Each separate RenderPass on the CPU is _really_ expensive

## Solution
- Port [FidelityFX SPD](https://gpuopen.com/fidelityfx-spd) to WGSL,
replacing meshlet's existing multiple raster passes with a ~~single~~
two compute dispatches. Lack of coherent buffers means we have to do the
the last 64x64 tile from mip 7+ in a separate dispatch to ensure the mip
6 writes were flushed :(
- Workgroup shared memory version only at the moment, as the subgroup
operation is blocked by our upgrade to wgpu 0.20 #13186
- Don't enforce a power-of-2 depth pyramid texture size, simply scaling
by 0.5 is fine
  • Loading branch information
JMS55 authored Jun 3, 2024
1 parent b45786d commit 5536079
Show file tree
Hide file tree
Showing 5 changed files with 440 additions and 107 deletions.
299 changes: 288 additions & 11 deletions crates/bevy_pbr/src/meshlet/downsample_depth.wgsl
Original file line number Diff line number Diff line change
@@ -1,16 +1,293 @@
#import bevy_core_pipeline::fullscreen_vertex_shader::FullscreenVertexOutput
@group(0) @binding(0) var mip_0: texture_depth_2d;
@group(0) @binding(1) var mip_1: texture_storage_2d<r32float, write>;
@group(0) @binding(2) var mip_2: texture_storage_2d<r32float, write>;
@group(0) @binding(3) var mip_3: texture_storage_2d<r32float, write>;
@group(0) @binding(4) var mip_4: texture_storage_2d<r32float, write>;
@group(0) @binding(5) var mip_5: texture_storage_2d<r32float, write>;
@group(0) @binding(6) var mip_6: texture_storage_2d<r32float, read_write>;
@group(0) @binding(7) var mip_7: texture_storage_2d<r32float, write>;
@group(0) @binding(8) var mip_8: texture_storage_2d<r32float, write>;
@group(0) @binding(9) var mip_9: texture_storage_2d<r32float, write>;
@group(0) @binding(10) var mip_10: texture_storage_2d<r32float, write>;
@group(0) @binding(11) var mip_11: texture_storage_2d<r32float, write>;
@group(0) @binding(12) var mip_12: texture_storage_2d<r32float, write>;
@group(0) @binding(13) var samplr: sampler;
var<push_constant> max_mip_level: u32;

@group(0) @binding(0) var input_depth: texture_2d<f32>;
@group(0) @binding(1) var samplr: sampler;
/// Generates a hierarchical depth buffer.
/// Based on FidelityFX SPD v2.1 https://github.com/GPUOpen-LibrariesAndSDKs/FidelityFX-SDK/blob/d7531ae47d8b36a5d4025663e731a47a38be882f/sdk/include/FidelityFX/gpu/spd/ffx_spd.h#L528

/// Performs a 2x2 downsample on a depth texture to generate the next mip level of a hierarchical depth buffer.
var<workgroup> intermediate_memory: array<array<f32, 16>, 16>;

@fragment
fn downsample_depth(in: FullscreenVertexOutput) -> @location(0) vec4<f32> {
let depth_quad = textureGather(0, input_depth, samplr, in.uv);
let downsampled_depth = min(
min(depth_quad.x, depth_quad.y),
min(depth_quad.z, depth_quad.w),
@compute
@workgroup_size(256, 1, 1)
fn downsample_depth_first(
@builtin(num_workgroups) num_workgroups: vec3u,
@builtin(workgroup_id) workgroup_id: vec3u,
@builtin(local_invocation_index) local_invocation_index: u32,
) {
let sub_xy = remap_for_wave_reduction(local_invocation_index % 64u);
let x = sub_xy.x + 8u * ((local_invocation_index >> 6u) % 2u);
let y = sub_xy.y + 8u * (local_invocation_index >> 7u);

downsample_mips_0_and_1(x, y, workgroup_id.xy, local_invocation_index);

downsample_mips_2_to_5(x, y, workgroup_id.xy, local_invocation_index);
}

@compute
@workgroup_size(256, 1, 1)
fn downsample_depth_second(@builtin(local_invocation_index) local_invocation_index: u32) {
let sub_xy = remap_for_wave_reduction(local_invocation_index % 64u);
let x = sub_xy.x + 8u * ((local_invocation_index >> 6u) % 2u);
let y = sub_xy.y + 8u * (local_invocation_index >> 7u);

downsample_mips_6_and_7(x, y);

downsample_mips_8_to_11(x, y, local_invocation_index);
}

fn downsample_mips_0_and_1(x: u32, y: u32, workgroup_id: vec2u, local_invocation_index: u32) {
var v: vec4f;

var tex = vec2(workgroup_id * 64u) + vec2(x * 2u, y * 2u);
var pix = vec2(workgroup_id * 32u) + vec2(x, y);
v[0] = reduce_load_mip_0(tex);
textureStore(mip_1, pix, vec4(v[0]));

tex = vec2(workgroup_id * 64u) + vec2(x * 2u + 32u, y * 2u);
pix = vec2(workgroup_id * 32u) + vec2(x + 16u, y);
v[1] = reduce_load_mip_0(tex);
textureStore(mip_1, pix, vec4(v[1]));

tex = vec2(workgroup_id * 64u) + vec2(x * 2u, y * 2u + 32u);
pix = vec2(workgroup_id * 32u) + vec2(x, y + 16u);
v[2] = reduce_load_mip_0(tex);
textureStore(mip_1, pix, vec4(v[2]));

tex = vec2(workgroup_id * 64u) + vec2(x * 2u + 32u, y * 2u + 32u);
pix = vec2(workgroup_id * 32u) + vec2(x + 16u, y + 16u);
v[3] = reduce_load_mip_0(tex);
textureStore(mip_1, pix, vec4(v[3]));

if max_mip_level <= 1u { return; }

for (var i = 0u; i < 4u; i++) {
intermediate_memory[x][y] = v[i];
workgroupBarrier();
if local_invocation_index < 64u {
v[i] = reduce_4(vec4(
intermediate_memory[x * 2u + 0u][y * 2u + 0u],
intermediate_memory[x * 2u + 1u][y * 2u + 0u],
intermediate_memory[x * 2u + 0u][y * 2u + 1u],
intermediate_memory[x * 2u + 1u][y * 2u + 1u],
));
pix = (workgroup_id * 16u) + vec2(
x + (i % 2u) * 8u,
y + (i / 2u) * 8u,
);
textureStore(mip_2, pix, vec4(v[i]));
}
workgroupBarrier();
}

if local_invocation_index < 64u {
intermediate_memory[x + 0u][y + 0u] = v[0];
intermediate_memory[x + 8u][y + 0u] = v[1];
intermediate_memory[x + 0u][y + 8u] = v[2];
intermediate_memory[x + 8u][y + 8u] = v[3];
}
}

fn downsample_mips_2_to_5(x: u32, y: u32, workgroup_id: vec2u, local_invocation_index: u32) {
if max_mip_level <= 2u { return; }
workgroupBarrier();
downsample_mip_2(x, y, workgroup_id, local_invocation_index);

if max_mip_level <= 3u { return; }
workgroupBarrier();
downsample_mip_3(x, y, workgroup_id, local_invocation_index);

if max_mip_level <= 4u { return; }
workgroupBarrier();
downsample_mip_4(x, y, workgroup_id, local_invocation_index);

if max_mip_level <= 5u { return; }
workgroupBarrier();
downsample_mip_5(workgroup_id, local_invocation_index);
}

fn downsample_mip_2(x: u32, y: u32, workgroup_id: vec2u, local_invocation_index: u32) {
if local_invocation_index < 64u {
let v = reduce_4(vec4(
intermediate_memory[x * 2u + 0u][y * 2u + 0u],
intermediate_memory[x * 2u + 1u][y * 2u + 0u],
intermediate_memory[x * 2u + 0u][y * 2u + 1u],
intermediate_memory[x * 2u + 1u][y * 2u + 1u],
));
textureStore(mip_3, (workgroup_id * 8u) + vec2(x, y), vec4(v));
intermediate_memory[x * 2u + y % 2u][y * 2u] = v;
}
}

fn downsample_mip_3(x: u32, y: u32, workgroup_id: vec2u, local_invocation_index: u32) {
if local_invocation_index < 16u {
let v = reduce_4(vec4(
intermediate_memory[x * 4u + 0u + 0u][y * 4u + 0u],
intermediate_memory[x * 4u + 2u + 0u][y * 4u + 0u],
intermediate_memory[x * 4u + 0u + 1u][y * 4u + 2u],
intermediate_memory[x * 4u + 2u + 1u][y * 4u + 2u],
));
textureStore(mip_4, (workgroup_id * 4u) + vec2(x, y), vec4(v));
intermediate_memory[x * 4u + y][y * 4u] = v;
}
}

fn downsample_mip_4(x: u32, y: u32, workgroup_id: vec2u, local_invocation_index: u32) {
if local_invocation_index < 4u {
let v = reduce_4(vec4(
intermediate_memory[x * 8u + 0u + 0u + y * 2u][y * 8u + 0u],
intermediate_memory[x * 8u + 4u + 0u + y * 2u][y * 8u + 0u],
intermediate_memory[x * 8u + 0u + 1u + y * 2u][y * 8u + 4u],
intermediate_memory[x * 8u + 4u + 1u + y * 2u][y * 8u + 4u],
));
textureStore(mip_5, (workgroup_id * 2u) + vec2(x, y), vec4(v));
intermediate_memory[x + y * 2u][0u] = v;
}
}

fn downsample_mip_5(workgroup_id: vec2u, local_invocation_index: u32) {
if local_invocation_index < 1u {
let v = reduce_4(vec4(
intermediate_memory[0u][0u],
intermediate_memory[1u][0u],
intermediate_memory[2u][0u],
intermediate_memory[3u][0u],
));
textureStore(mip_6, workgroup_id, vec4(v));
}
}

fn downsample_mips_6_and_7(x: u32, y: u32) {
var v: vec4f;

var tex = vec2(x * 4u + 0u, y * 4u + 0u);
var pix = vec2(x * 2u + 0u, y * 2u + 0u);
v[0] = reduce_load_mip_6(tex);
textureStore(mip_7, pix, vec4(v[0]));

tex = vec2(x * 4u + 2u, y * 4u + 0u);
pix = vec2(x * 2u + 1u, y * 2u + 0u);
v[1] = reduce_load_mip_6(tex);
textureStore(mip_7, pix, vec4(v[1]));

tex = vec2(x * 4u + 0u, y * 4u + 2u);
pix = vec2(x * 2u + 0u, y * 2u + 1u);
v[2] = reduce_load_mip_6(tex);
textureStore(mip_7, pix, vec4(v[2]));

tex = vec2(x * 4u + 2u, y * 4u + 2u);
pix = vec2(x * 2u + 1u, y * 2u + 1u);
v[3] = reduce_load_mip_6(tex);
textureStore(mip_7, pix, vec4(v[3]));

if max_mip_level <= 7u { return; }

let vr = reduce_4(v);
textureStore(mip_8, vec2(x, y), vec4(vr));
intermediate_memory[x][y] = vr;
}

fn downsample_mips_8_to_11(x: u32, y: u32, local_invocation_index: u32) {
if max_mip_level <= 8u { return; }
workgroupBarrier();
downsample_mip_8(x, y, local_invocation_index);

if max_mip_level <= 9u { return; }
workgroupBarrier();
downsample_mip_9(x, y, local_invocation_index);

if max_mip_level <= 10u { return; }
workgroupBarrier();
downsample_mip_10(x, y, local_invocation_index);

if max_mip_level <= 11u { return; }
workgroupBarrier();
downsample_mip_11(local_invocation_index);
}

fn downsample_mip_8(x: u32, y: u32, local_invocation_index: u32) {
if local_invocation_index < 64u {
let v = reduce_4(vec4(
intermediate_memory[x * 2u + 0u][y * 2u + 0u],
intermediate_memory[x * 2u + 1u][y * 2u + 0u],
intermediate_memory[x * 2u + 0u][y * 2u + 1u],
intermediate_memory[x * 2u + 1u][y * 2u + 1u],
));
textureStore(mip_9, vec2(x, y), vec4(v));
intermediate_memory[x * 2u + y % 2u][y * 2u] = v;
}
}

fn downsample_mip_9(x: u32, y: u32, local_invocation_index: u32) {
if local_invocation_index < 16u {
let v = reduce_4(vec4(
intermediate_memory[x * 4u + 0u + 0u][y * 4u + 0u],
intermediate_memory[x * 4u + 2u + 0u][y * 4u + 0u],
intermediate_memory[x * 4u + 0u + 1u][y * 4u + 2u],
intermediate_memory[x * 4u + 2u + 1u][y * 4u + 2u],
));
textureStore(mip_10, vec2(x, y), vec4(v));
intermediate_memory[x * 4u + y][y * 4u] = v;
}
}

fn downsample_mip_10(x: u32, y: u32, local_invocation_index: u32) {
if local_invocation_index < 4u {
let v = reduce_4(vec4(
intermediate_memory[x * 8u + 0u + 0u + y * 2u][y * 8u + 0u],
intermediate_memory[x * 8u + 4u + 0u + y * 2u][y * 8u + 0u],
intermediate_memory[x * 8u + 0u + 1u + y * 2u][y * 8u + 4u],
intermediate_memory[x * 8u + 4u + 1u + y * 2u][y * 8u + 4u],
));
textureStore(mip_11, vec2(x, y), vec4(v));
intermediate_memory[x + y * 2u][0u] = v;
}
}

fn downsample_mip_11(local_invocation_index: u32) {
if local_invocation_index < 1u {
let v = reduce_4(vec4(
intermediate_memory[0u][0u],
intermediate_memory[1u][0u],
intermediate_memory[2u][0u],
intermediate_memory[3u][0u],
));
textureStore(mip_12, vec2(0u, 0u), vec4(v));
}
}

fn remap_for_wave_reduction(a: u32) -> vec2u {
return vec2(
insertBits(extractBits(a, 2u, 3u), a, 0u, 1u),
insertBits(extractBits(a, 3u, 3u), extractBits(a, 1u, 2u), 0u, 2u),
);
return vec4(downsampled_depth, 0.0, 0.0, 0.0);
}

fn reduce_load_mip_0(tex: vec2u) -> f32 {
let uv = (vec2f(tex) + 0.5) / vec2f(textureDimensions(mip_0));
return reduce_4(textureGather(mip_0, samplr, uv));
}

fn reduce_load_mip_6(tex: vec2u) -> f32 {
return reduce_4(vec4(
textureLoad(mip_6, tex + vec2(0u, 0u)).r,
textureLoad(mip_6, tex + vec2(0u, 1u)).r,
textureLoad(mip_6, tex + vec2(1u, 0u)).r,
textureLoad(mip_6, tex + vec2(1u, 1u)).r,
));
}

fn reduce_4(v: vec4f) -> f32 {
return min(min(v.x, v.y), min(v.z, v.w));
}
2 changes: 2 additions & 0 deletions crates/bevy_pbr/src/meshlet/fill_cluster_buffers.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
meshlet_cluster_meshlet_ids,
}

/// Writes out instance_id and meshlet_id to the global buffers for each cluster in the scene.

@compute
@workgroup_size(128, 1, 1) // 128 threads per workgroup, 1 cluster per thread
fn fill_cluster_buffers(
Expand Down
Loading

0 comments on commit 5536079

Please sign in to comment.