Skip to content

Commit

Permalink
Merge pull request #377 from linebender/multi2
Browse files Browse the repository at this point in the history
Add multisampled antialiasing
  • Loading branch information
raphlinus authored Oct 12, 2023
2 parents 9bdbb10 + 2c0ef60 commit 4edf786
Show file tree
Hide file tree
Showing 6 changed files with 517 additions and 39 deletions.
2 changes: 1 addition & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,6 @@
},
"wgsl-analyzer.diagnostics.nagaVersion": "main",
"wgsl-analyzer.preprocessor.shaderDefs": [
"full"
"full", "msaa16", "msaa"
]
}
319 changes: 309 additions & 10 deletions shader/fine.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

// Fine rasterizer. This can run in simple (just path rendering) and full
// modes, controllable by #define.
//
// To enable multisampled rendering, turn on both the msaa ifdef and one of msaa8
// or msaa16.

// This is a cut'n'paste w/ backdrop.
struct Tile {
backdrop: i32,
segments: u32,
Expand All @@ -18,8 +20,6 @@ var<uniform> config: Config;
@group(0) @binding(1)
var<storage> segments: array<Segment>;

#ifdef full

#import blend
#import ptcl

Expand All @@ -40,6 +40,304 @@ var gradients: texture_2d<f32>;
@group(0) @binding(6)
var image_atlas: texture_2d<f32>;

#ifdef msaa8
// Configuration for the 8-samples-per-pixel variant of fill_path_ms.
//
// Dimensions of the sample-mask lookup table. fill_path_ms derives the row
// from the line slope and the column from the fractional crossing position;
// the slope sign selects one half (MASK_WIDTH * MASK_HEIGHT / 2 entries).
let MASK_WIDTH = 32u;
let MASK_HEIGHT = 32u;
// Number of u32 words in the sh_samples workgroup array.
let SH_SAMPLES_SIZE = 256u;
// Number of sh_samples words per pixel (one word holds 8 packed 4-bit
// per-sample winding values).
let SAMPLE_WORDS_PER_PIXEL = 1u;
// Mask lookup table. Each entry is an 8-bit sample mask and four entries are
// packed per u32 (fill_path_ms reads word mask_ix / 4, byte mask_ix % 4).
// This might be better in uniform, but that has 16 byte alignment
@group(0) @binding(7)
var<storage> mask_lut: array<u32, 256u>;
#endif

#ifdef msaa16
// Configuration for the 16-samples-per-pixel variant of fill_path_ms.
//
// Dimensions of the sample-mask lookup table. fill_path_ms derives the row
// from the line slope and the column from the fractional crossing position;
// the slope sign selects one half (MASK_WIDTH * MASK_HEIGHT / 2 entries).
let MASK_WIDTH = 64u;
let MASK_HEIGHT = 64u;
// Number of u32 words in the sh_samples workgroup array.
let SH_SAMPLES_SIZE = 512u;
// Number of sh_samples words per pixel: 16 samples at 4 bits each need two
// u32 words (samples 0..7 in the first, 8..15 in the second).
let SAMPLE_WORDS_PER_PIXEL = 2u;
// Mask lookup table. Each entry is a 16-bit sample mask and two entries are
// packed per u32 (fill_path_ms reads word mask_ix / 2, half mask_ix % 2).
@group(0) @binding(7)
var<storage> mask_lut: array<u32, 2048u>;
#endif

#ifdef msaa
// Number of segments processed per batch in fill_path_ms (one per thread),
// and the length of the per-thread scratch array below.
let WG_SIZE = 64u;
// Per-thread pixel counts for the current batch of segments; after the
// workgroup scan in fill_path_ms it holds their inclusive prefix sum.
var<workgroup> sh_count: array<u32, WG_SIZE>;

// Per-pixel winding deltas: 8 values packed to a u32, 4 bits each, indexed
// by pixel index (word = pix >> 3, nibble = pix & 7). Each nibble is stored
// with a bias of 8 so that negative deltas can be represented.
var<workgroup> sh_winding: array<atomic<u32>, 32u>;
// Same packing and bias, but per sample: one group of 8 samples per word,
// SAMPLE_WORDS_PER_PIXEL words per pixel.
var<workgroup> sh_samples: array<atomic<u32>, SH_SAMPLES_SIZE>;
// Same packing and bias, one nibble per tile row (word = y >> 3,
// nibble = y & 7), accumulating winding numbers for vertical edge crossings.
var<workgroup> sh_winding_y: array<atomic<u32>, 2u>;

// Count of unit-sized integer cells touched by the interval with endpoints
// a and b (given in either order). The result is never less than one, so a
// degenerate interval sitting exactly on a grid line still counts as a cell.
fn span(a: f32, b: f32) -> u32 {
    let upper = ceil(max(a, b));
    let lower = floor(min(a, b));
    let cells = max(upper - lower, 1.0);
    return u32(cells);
}

// NOTE(review): not referenced by the multisampled fill code in this part of
// the file; presumably a segment size in words kept from an earlier layout.
// Confirm whether it is still needed.
let SEG_SIZE = 5u;

// New multisampled algorithm.
//
// Computes the coverage (0.0 to 1.0) of the PIXELS_PER_THREAD consecutive
// pixels owned by the calling thread, for one fill command in one tile.
//
//   fill     - fill command: segment count (size_and_rule >> 1), offset of
//              the first segment (seg_data) and the tile backdrop.
//   wg_id    - tile coordinates; scaled by the tile size to get the origin.
//   local_id - thread coordinates within the workgroup.
//
// The work proceeds in three phases:
//   1. Reset the shared winding accumulators. Every 4-bit nibble is biased
//      by 8 so that negative winding deltas can be accumulated.
//   2. For each batch of WG_SIZE segments, each thread counts the pixels its
//      segment touches, the counts are prefix-summed across the workgroup,
//      and the pixels are then divided evenly between threads. Each pixel
//      adds a signed sample mask to sh_samples and, where needed, a winding
//      delta to sh_winding.
//   3. Prefix-sum the packed winding values and count, per pixel, the samples
//      whose winding is nonzero.
//
// Must be called from uniform control flow, as it contains workgroup barriers.
//
// NOTE(review): bit 0 of size_and_rule is the even-odd flag, but this function
// does not consult it; coverage is always evaluated with the nonzero rule.
fn fill_path_ms(fill: CmdFill, wg_id: vec2<u32>, local_id: vec2<u32>) -> array<f32, PIXELS_PER_THREAD> {
    let n_segs = fill.size_and_rule >> 1u;
    // x scales with the tile width and y with the tile height.
    let tile_origin = vec2(f32(wg_id.x) * f32(TILE_WIDTH), f32(wg_id.y) * f32(TILE_HEIGHT));
    let th_ix = local_id.y * (TILE_WIDTH / PIXELS_PER_THREAD) + local_id.x;

    // Phase 1: reset shared accumulators to the bias value in every nibble.
    if th_ix < 32u {
        if th_ix < 2u {
            atomicStore(&sh_winding_y[th_ix], 0x88888888u);
        }
        atomicStore(&sh_winding[th_ix], 0x88888888u);
    }
    let sample_count = PIXELS_PER_THREAD * SAMPLE_WORDS_PER_PIXEL;
    for (var i = 0u; i < sample_count; i++) {
        atomicStore(&sh_samples[th_ix * sample_count + i], 0x88888888u);
    }
    workgroupBarrier();

    // Phase 2: accumulate segments, WG_SIZE at a time.
    let n_batch = (n_segs + (WG_SIZE - 1u)) / WG_SIZE;
    for (var batch = 0u; batch < n_batch; batch++) {
        let seg_ix = batch * WG_SIZE + th_ix;
        let seg_off = fill.seg_data + seg_ix;
        var count = 0u;
        // Number of segments in this batch (the last batch may be partial).
        let slice_size = min(n_segs - batch * WG_SIZE, WG_SIZE);
        // TODO: might save a register rewriting this in terms of limit
        if th_ix < slice_size {
            let segment = segments[seg_off];
            // Note: coords relative to tile origin probably a good idea in coarse path,
            // especially as f16 would work. But keeping existing scheme for compatibility.
            let xy0 = segment.origin - tile_origin;
            let xy1 = xy0 + segment.delta;
            // Row at which this segment touches the left tile edge;
            // TILE_HEIGHT means "no crossing to record".
            var y_edge_f = f32(TILE_HEIGHT);
            var delta = select(-1, 1, xy1.x <= xy0.x);
            if xy0.x == 0.0 && xy1.x == 0.0 {
                // Segment lies entirely on the left edge. It contributes no
                // pixels (count stays 0), only a winding change from row 0.
                if xy0.y == 0.0 {
                    y_edge_f = 0.0;
                } else if xy1.y == 0.0 {
                    y_edge_f = 0.0;
                    delta = -delta;
                }
            } else {
                if xy0.x == 0.0 {
                    if xy0.y != 0.0 {
                        y_edge_f = xy0.y;
                    }
                } else if xy1.x == 0.0 && xy1.y != 0.0 {
                    y_edge_f = xy1.y;
                }
                // discard horizontal lines aligned to pixel grid
                if !(xy0.y == xy1.y && xy0.y == floor(xy0.y)) {
                    count = span(xy0.x, xy1.x) + span(xy0.y, xy1.y) - 1u;
                }
            }
            let y_edge = u32(ceil(y_edge_f));
            if y_edge < TILE_HEIGHT {
                atomicAdd(&sh_winding_y[y_edge >> 3u], u32(delta) << ((y_edge & 7u) << 2u));
            }
        }
        // workgroup prefix sum of counts
        sh_count[th_ix] = count;
        let lg_n = firstLeadingBit(slice_size * 2u - 1u);
        for (var i = 0u; i < lg_n; i++) {
            workgroupBarrier();
            if th_ix >= 1u << i {
                count += sh_count[th_ix - (1u << i)];
            }
            workgroupBarrier();
            sh_count[th_ix] = count;
        }
        // Total number of pixels touched by all segments of this batch.
        let total = workgroupUniformLoad(&sh_count[slice_size - 1u]);
        for (var i = th_ix; i < total; i += WG_SIZE) {
            // binary search to find pixel
            var lo = 0u;
            var hi = slice_size;
            let goal = i;
            while hi > lo + 1u {
                let mid = (lo + hi) >> 1u;
                if goal >= sh_count[mid - 1u] {
                    lo = mid;
                } else {
                    hi = mid;
                }
            }
            // el_ix: segment within the batch; sub_ix: pixel within the segment.
            let el_ix = lo;
            let last_pixel = i + 1u == sh_count[el_ix];
            let sub_ix = i - select(0u, sh_count[el_ix - 1u], el_ix > 0u);
            let seg_off = fill.seg_data + batch * WG_SIZE + el_ix;
            let segment = segments[seg_off];
            let xy0_in = segment.origin - tile_origin;
            let xy1_in = xy0_in + segment.delta;
            // Orient the segment so that y increases from xy0 to xy1.
            let is_down = xy1_in.y >= xy0_in.y;
            let xy0 = select(xy1_in, xy0_in, is_down);
            let xy1 = select(xy0_in, xy1_in, is_down);

            // Set up data for line rasterization
            // Note: this is duplicated work if total count exceeds a workgroup.
            // One alternative is to compute it in a separate dispatch.
            let dx = abs(xy1.x - xy0.x);
            let dy = xy1.y - xy0.y;
            // TODO: apply numerical robustness and optimization
            let dy_dxdy = dy / (dx + dy);
            let a = dx / (dx + dy);
            let is_positive_slope = xy1.x >= xy0.x;
            let sign = select(-1.0, 1.0, is_positive_slope);
            let xt0 = floor(xy0.x * sign);
            let c = xy0.x * sign - xt0;
            let y0i = floor(xy0.y);
            let ytop = y0i + 1.0;
            let b = dy_dxdy * c + a * (ytop - xy0.y);
            let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0));
            // Use line equation to plot pixel coordinates

            let zf = a * f32(sub_ix) + b;
            let z = floor(zf);
            let x = x0i + i32(sign * z);
            let y = i32(y0i) + i32(sub_ix) - i32(z);
            var is_delta: bool;
            // We need to adjust winding number if slope is positive and there
            // is a crossing at the left edge of the pixel.
            var is_bump = false;
            if sub_ix == 0u {
                is_delta = y0i == xy0.y && y0i != xy1.y;
                is_bump = xy0.x == 0.0;
            } else {
                // z of the previous pixel. Evaluated only in this branch so
                // that `sub_ix - 1u` cannot wrap around for the first pixel.
                let zp = floor(a * f32(sub_ix - 1u) + b);
                is_delta = z == zp;
                is_bump = is_positive_slope && !is_delta;
            }
            let pix_ix = u32(y) * TILE_WIDTH + u32(x);
            if u32(x) < TILE_WIDTH - 1u && u32(y) < TILE_HEIGHT {
                // The winding change applies from the pixel to the right.
                let delta_pix = pix_ix + 1u;
                if is_delta {
                    let delta = select(u32(-1), 1u, is_down) << ((delta_pix & 7u) << 2u);
                    atomicAdd(&sh_winding[delta_pix >> 3u], delta);
                }
            }
            // Apply sample mask
            let mask_block = u32(is_positive_slope) * (MASK_WIDTH * MASK_HEIGHT / 2u);
            let half_height = f32(MASK_HEIGHT / 2u);
            let mask_row = floor(min(a * half_height, half_height - 1.0)) * f32(MASK_WIDTH);
            let mask_col = floor((zf - z) * f32(MASK_WIDTH));
            let mask_ix = mask_block + u32(mask_row + mask_col);
#ifdef msaa8
            var mask = mask_lut[mask_ix / 4u] >> ((mask_ix % 4u) * 8u);
            mask &= 0xffu;
            // Intersect with y half-plane masks
            if sub_ix == 0u && !is_bump {
                let mask_shift = u32(round(8.0 * (xy0.y - f32(y))));
                mask &= 0xffu << mask_shift;
            }
            if last_pixel && xy1.x != 0.0 {
                let mask_shift = u32(round(8.0 * (xy1.y - f32(y))));
                mask &= ~(0xffu << mask_shift);
            }
            // Spread the 8 mask bits so that bit k lands in the low bit of
            // nibble k, giving a per-sample increment of 0 or 1.
            let mask_a = mask | (mask << 6u);
            let mask_b = mask_a | (mask_a << 12u);
            let mask_exp = (mask_b & 0x1010101u) | ((mask_b << 3u) & 0x10101010u);
            var mask_signed = select(mask_exp, u32(-i32(mask_exp)), is_down);
            if is_bump {
                // +/-1 in all eight nibbles; both constants must span 8 nibbles.
                mask_signed += select(u32(-0x11111111), 0x11111111u, is_down);
            }
            atomicAdd(&sh_samples[pix_ix], mask_signed);
#endif
#ifdef msaa16
            var mask = mask_lut[mask_ix / 2u] >> ((mask_ix % 2u) * 16u);
            mask &= 0xffffu;
            // Intersect with y half-plane masks
            if sub_ix == 0u && !is_bump {
                let mask_shift = u32(round(16.0 * (xy0.y - f32(y))));
                mask &= 0xffffu << mask_shift;
            }
            if last_pixel && xy1.x != 0.0 {
                let mask_shift = u32(round(16.0 * (xy1.y - f32(y))));
                mask &= ~(0xffffu << mask_shift);
            }
            // Low and high bytes of the mask are expanded separately, each
            // into its own word of eight nibbles.
            let mask0 = mask & 0xffu;
            let mask0_a = mask0 | (mask0 << 6u);
            let mask0_b = mask0_a | (mask0_a << 12u);
            let mask0_exp = (mask0_b & 0x1010101u) | ((mask0_b << 3u) & 0x10101010u);
            var mask0_signed = select(mask0_exp, u32(-i32(mask0_exp)), is_down);
            let mask1 = (mask >> 8u) & 0xffu;
            let mask1_a = mask1 | (mask1 << 6u);
            let mask1_b = mask1_a | (mask1_a << 12u);
            let mask1_exp = (mask1_b & 0x1010101u) | ((mask1_b << 3u) & 0x10101010u);
            var mask1_signed = select(mask1_exp, u32(-i32(mask1_exp)), is_down);
            if is_bump {
                // +/-1 in all eight nibbles; both constants must span 8 nibbles.
                let bump_delta = select(u32(-0x11111111), 0x11111111u, is_down);
                mask0_signed += bump_delta;
                mask1_signed += bump_delta;
            }
            atomicAdd(&sh_samples[pix_ix * 2u], mask0_signed);
            atomicAdd(&sh_samples[pix_ix * 2u + 1u], mask1_signed);
#endif
        }
        // sh_count is rewritten by the next batch; wait for all readers.
        workgroupBarrier();
    }

    // Phase 3: resolve winding numbers into coverage.
    var area: array<f32, PIXELS_PER_THREAD>;
    let major = (th_ix * PIXELS_PER_THREAD) >> 3u;
    var packed_w = atomicLoad(&sh_winding[major]);
    // Prefix sum of packed 4 bit values within u32
    packed_w += (packed_w - 0x8888888u) << 4u;
    packed_w += (packed_w - 0x888888u) << 8u;
    packed_w += (packed_w - 0x8888u) << 16u;
    // Note: could probably do bias in one go, but it would be inscrutable
    if (major & 1u) != 0u {
        // We could use shmem to communicate the value from another thread;
        // if we had subgroups that would almost certainly be the most
        // efficient way. But we just calculate again for simplicity.
        var last_packed = atomicLoad(&sh_winding[major - 1u]);
        last_packed += (last_packed - 0x8888888u) << 4u;
        last_packed += (last_packed - 0x888888u) << 8u;
        last_packed += (last_packed - 0x8888u) << 16u;
        let bump = ((last_packed >> 28u) - 8u) * 0x11111111u;
        packed_w += bump;
    }
    var packed_y = atomicLoad(&sh_winding_y[local_id.y >> 3u]);
    packed_y += (packed_y - 0x8888888u) << 4u;
    packed_y += (packed_y - 0x888888u) << 8u;
    packed_y += (packed_y - 0x8888u) << 16u;
    // Every thread in rows 0..7 reads the raw sh_winding_y[0] above. They
    // must all have done so before thread 0 replaces it with the scanned
    // value, otherwise a late reader would apply the prefix sum twice.
    workgroupBarrier();
    if th_ix == 0u {
        atomicStore(&sh_winding_y[0], packed_y);
    }
    workgroupBarrier();
    var wind_y = (packed_y >> ((local_id.y & 7u) << 2u)) - 8u;
    if local_id.y >= 8u {
        // Carry the running total of rows 0..7 into the second word's rows.
        wind_y += (atomicLoad(&sh_winding_y[0]) >> 28u) - 8u;
    }

    for (var i = 0u; i < PIXELS_PER_THREAD; i++) {
        let pix_ix = th_ix * PIXELS_PER_THREAD + i;
        let minor = pix_ix & 7u;
        // TODO: math might be off here
        let expected_zero = (((packed_w >> (minor * 4u)) + wind_y) & 0xfu) - u32(fill.backdrop);
        if expected_zero >= 16u {
            // No 4-bit sample value can match, so every sample is covered.
            area[i] = 1.0;
        } else {
#ifdef msaa8
            let samples = atomicLoad(&sh_samples[pix_ix]);
            let xored = (expected_zero * 0x11111111u) ^ samples;
            // Each 4-bit nibble in xored is 0 for winding = 0, nonzero otherwise
            // OR-reduce each nibble into its top bit, then count those bits.
            let xored2 = xored | (xored * 2u);
            let xored4 = xored2 | (xored2 * 4u);
            area[i] = f32(countOneBits(xored4 & 0x88888888u)) * 0.125;
#endif
#ifdef msaa16
            let samples0 = atomicLoad(&sh_samples[pix_ix * 2u]);
            let samples1 = atomicLoad(&sh_samples[pix_ix * 2u + 1u]);
            // Interleave the partial OR-reductions of both words so that a
            // single countOneBits covers all 16 samples.
            let xored0 = (expected_zero * 0x11111111u) ^ samples0;
            let xored0_2 = xored0 | (xored0 * 2u);
            let xored1 = (expected_zero * 0x11111111u) ^ samples1;
            let xored1_2 = xored1 | (xored1 >> 1u);
            let xored2 = (xored0_2 & 0xAAAAAAAAu) | (xored1_2 & 0x55555555u);
            let xored4 = xored2 | (xored2 * 4u);
            area[i] = f32(countOneBits(xored4 & 0xCCCCCCCCu)) * 0.0625;
#endif
        }
    }
    return area;
}
#endif

fn read_fill(cmd_ix: u32) -> CmdFill {
let size_and_rule = ptcl[cmd_ix + 1u];
let seg_data = ptcl[cmd_ix + 2u];
Expand Down Expand Up @@ -126,15 +424,12 @@ fn extend_mode(t: f32, mode: u32) -> f32 {
}
}

#else

@group(0) @binding(3)
var output: texture_storage_2d<r8, write>;

#endif

let PIXELS_PER_THREAD = 4u;

// Analytic area antialiasing.
//
// This is currently dead code if msaa is enabled, but it would be fairly straightforward
// to wire this so it's a dynamic choice (even per-path).
fn fill_path(fill: CmdFill, xy: vec2<f32>) -> array<f32, PIXELS_PER_THREAD> {
let n_segs = fill.size_and_rule >> 1u;
let even_odd = (fill.size_and_rule & 1u) != 0u;
Expand Down Expand Up @@ -220,7 +515,11 @@ fn main(
// CMD_FILL
case 1u: {
let fill = read_fill(cmd_ix);
#ifdef msaa
area = fill_path_ms(fill, wg_id.xy, local_id.xy);
#else
area = fill_path(fill, xy);
#endif
cmd_ix += 4u;
}
// CMD_STROKE
Expand Down
14 changes: 14 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
mod cpu_dispatch;
mod cpu_shader;
mod engine;
mod mask;
mod render;
mod scene;
mod shaders;
Expand Down Expand Up @@ -61,6 +62,19 @@ pub type Error = Box<dyn std::error::Error>;
/// Specialization of `Result` for our catch-all error type.
pub type Result<T> = std::result::Result<T, Error>;

/// Possible configurations for antialiasing.
///
/// This is a plain, field-less selector, so it is cheap to copy and compare.
// The active mode is currently chosen by a compile-time constant, which leaves
// the other variants unconstructed; silence the resulting dead-code lint.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[allow(dead_code)]
enum AaConfig {
    /// Analytic area antialiasing.
    Area,
    /// Multisampled antialiasing with 8 samples per pixel.
    Msaa8,
    /// Multisampled antialiasing with 16 samples per pixel.
    Msaa16,
}

/// The antialiasing mode used by the renderer.
///
/// Currently this is static (fixed at compile time to analytic area
/// antialiasing), but it could be switched to a launch option or made even
/// finer-grained.
const ANTIALIASING: AaConfig = AaConfig::Area;

/// Renders a scene into a texture or surface.
#[cfg(feature = "wgpu")]
pub struct Renderer {
Expand Down
Loading

0 comments on commit 4edf786

Please sign in to comment.