Skip to content

Commit

Permalink
ci(benchmark): make lexer benchmark more realistic (#8573)
Browse files Browse the repository at this point in the history
The lexer benchmarks had a problem. The lexer alone cannot make sense of regexp literals, template literals, or JSX text elements - it needs the parser "driving" it.

So the lexer was producing many errors on some benchmarks. This is unrealistic — when driven by the parser, the lexer produces no errors. Generating diagnostics is relatively expensive, so this was skewing the benchmarks somewhat.

Solve this by cleaning up the input source text to replace these syntaxes with string literals prior to running the benchmarks.

Unfortunately lexer benchmarks don't exercise the code paths for these syntaxes, but there isn't much we can do about that. We can judge by the parser benchmarks, which are the more important ones anyway.
  • Loading branch information
overlookmotel committed Jan 18, 2025
1 parent 76ea52b commit bfd0b0d
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 9 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions crates/oxc_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,13 @@ impl<'a> Lexer<'a> {
Self::new(allocator, source_text, source_type, unique)
}

/// Get the diagnostics the lexer has accumulated so far.
///
/// Only compiled in for benchmarks, where it is used to assert that
/// lexing completed without producing any errors.
#[cfg(feature = "benchmarking")]
pub fn errors(&self) -> &[OxcDiagnostic] {
    &self.errors
}

/// Remaining string from `Source`
pub fn remaining(&self) -> &'a str {
self.source.remaining()
Expand Down
4 changes: 3 additions & 1 deletion tasks/benchmark/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ bench = false
# with only the crates it needs, to speed up the builds
[dependencies]
oxc_allocator = { workspace = true, optional = true }
oxc_ast = { workspace = true, optional = true }
oxc_codegen = { workspace = true, optional = true }
oxc_isolated_declarations = { workspace = true, optional = true }
oxc_linter = { workspace = true, optional = true }
Expand All @@ -86,6 +87,7 @@ serde_json = { workspace = true, optional = true }
[features]
default = [
"dep:oxc_allocator",
"dep:oxc_ast",
"dep:oxc_codegen",
"dep:oxc_isolated_declarations",
"dep:oxc_linter",
Expand All @@ -103,7 +105,7 @@ codspeed_napi = ["criterion2/codspeed", "dep:serde", "dep:serde_json"]

# Features for running each benchmark separately with minimum dependencies that benchmark needs.
# e.g. `cargo build --release -p oxc_benchmark --bench parser --no-default-features --features parser`
lexer = ["dep:oxc_allocator", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
lexer = ["dep:oxc_allocator", "dep:oxc_ast", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
parser = ["dep:oxc_allocator", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
transformer = [
"dep:oxc_allocator",
Expand Down
122 changes: 114 additions & 8 deletions tasks/benchmark/benches/lexer.rs
Original file line number Diff line number Diff line change
@@ -1,24 +1,37 @@
#![allow(clippy::disallowed_methods)]
use oxc_allocator::Allocator;
use oxc_ast::{ast::*, Visit};
use oxc_benchmark::{criterion_group, criterion_main, BenchmarkId, Criterion};
use oxc_parser::lexer::{Kind, Lexer};
use oxc_parser::{
lexer::{Kind, Lexer},
Parser,
};
use oxc_span::SourceType;
use oxc_tasks_common::{TestFile, TestFiles};

fn bench_lexer(criterion: &mut Criterion) {
let mut group = criterion.benchmark_group("lexer");

// Lexer lacks awareness of JS grammar, so it gets confused by a few things without the parser
// driving it, notably escapes in regexps and template strings.
// So simplify the input for it, by removing backslashes and converting template strings to
// normal string literals.
// driving it. So simplify the input for it, by replacing these syntaxes with plain strings.
// This ensures lexing completes without generating any errors, which is more realistic.
//
// It's unfortunate that this benchmark doesn't exercise the code paths for these syntaxes,
// but this is the closest we can get to a realistic benchmark of lexer in isolation.
let mut allocator = Allocator::default();
let files = TestFiles::complicated()
.files()
.iter()
.map(|file| TestFile {
url: file.url.clone(),
file_name: file.file_name.clone(),
source_text: file.source_text.replace('\\', " ").replace('`', "'"),
.map(|file| {
let source_type = SourceType::from_path(&file.file_name).unwrap();

let mut cleaner = SourceCleaner::new(&file.source_text);
cleaner.clean(source_type, &allocator);
let source_text = cleaner.source_text;

allocator.reset();

TestFile { url: file.url.clone(), file_name: file.file_name.clone(), source_text }
})
.collect::<Vec<_>>();

Expand All @@ -43,3 +56,96 @@ fn bench_lexer(criterion: &mut Criterion) {

criterion_group!(lexer, bench_lexer);
criterion_main!(lexer);

/// Cleaner of source text.
///
/// Purpose is to allow the lexer to complete without producing any errors.
/// Sources which Oxc is asked to parse do not usually produce lexer errors, and generating
/// diagnostics is fairly expensive, so producing errors is unrealistic for benchmarking purposes.
///
/// Certain syntax will parse without error, but the lexer alone does not have the context to
/// understand that it's fine. Notably this includes syntax where the lexer only consumes the
/// first character, and the parser would then call back into the lexer to complete the job.
///
/// So replace these syntaxes with plain string literals, so that the lexer can complete
/// without error:
/// * `RegExpLiteral`
/// * `TemplateLiteral`
/// * `JSXText`
struct SourceCleaner {
    // Source text, mutated in place once replacements have been collected
    source_text: String,
    // Replacements gathered while visiting the AST, applied in `clean`
    replacements: Vec<Replacement>,
}

/// A single span of source text to be overwritten with a replacement string.
struct Replacement {
    // Span in the original source text to replace
    span: Span,
    // Replacement text (a quoted string literal)
    text: String,
}

impl SourceCleaner {
    /// Create a cleaner over a copy of `source_text`.
    fn new(source_text: &str) -> Self {
        Self { source_text: source_text.to_string(), replacements: vec![] }
    }

    /// Clean the source text in place: parse it, replace regexp literals, template literals
    /// and JSX text with plain string literals, then verify the result lexes without errors.
    ///
    /// Panics if the original source does not parse cleanly, or if the cleaned source still
    /// produces lexer errors.
    fn clean(&mut self, source_type: SourceType, allocator: &Allocator) {
        // Parse. Clone the text first so `self` isn't borrowed while visiting the AST.
        let source_text = self.source_text.clone();
        let parser_ret = Parser::new(allocator, &source_text, source_type).parse();
        assert!(parser_ret.errors.is_empty());
        let program = parser_ret.program;

        // Visit AST and compile list of replacements
        self.visit_program(&program);

        // Make replacements.
        // Sort by span, then apply in reverse source order, so that applying one replacement
        // does not invalidate the byte offsets of the (earlier) spans still to be applied.
        self.replacements.sort_unstable_by_key(|replacement| replacement.span);

        for replacement in self.replacements.iter().rev() {
            let span = replacement.span;
            self.source_text
                .replace_range(span.start as usize..span.end as usize, &replacement.text);
        }

        // Check lexer can lex the cleaned source without any errors
        let mut lexer = Lexer::new_for_benchmarks(allocator, &self.source_text, source_type);
        while lexer.next_token().kind != Kind::Eof {}
        assert!(lexer.errors().is_empty());
    }

    /// Record a replacement to be applied later by `clean`.
    fn replace(&mut self, span: Span, text: String) {
        self.replacements.push(Replacement { span, text });
    }
}

impl<'a> Visit<'a> for SourceCleaner {
    /// Replace a regexp literal with an equivalent-length string literal.
    fn visit_reg_exp_literal(&mut self, regexp: &RegExpLiteral<'a>) {
        let RegExpPattern::Raw(pattern) = regexp.regex.pattern else { unreachable!() };
        // `+ 2` covers the two `/` delimiters. Note: any flags after the closing `/` are
        // deliberately left in place (presumably they lex as a harmless identifier —
        // the post-clean lexer check in `clean` confirms no errors result).
        let span = Span::sized(regexp.span.start, u32::try_from(pattern.len()).unwrap() + 2);
        let text = convert_to_string(pattern);
        self.replace(span, text);
    }

    /// Replace a template literal (backticks and all) with a plain string literal.
    fn visit_template_literal(&mut self, lit: &TemplateLiteral<'a>) {
        let span = lit.span;
        // `shrink(1)` strips the enclosing backticks; newlines are not legal in a
        // normal string literal, so flatten them to spaces
        let text = span.shrink(1).source_text(&self.source_text);
        let text = convert_to_string(text).replace('\n', " ");
        self.replace(span, text);
    }

    /// Replace JSX text content with a string literal, flattening newlines.
    fn visit_jsx_text(&mut self, jsx_text: &JSXText<'a>) {
        let span = jsx_text.span;
        let text = span.source_text(&self.source_text);
        let text = convert_to_string(text).replace('\n', " ");
        self.replace(span, text);
    }
}

#[expect(clippy::naive_bytecount)]
fn convert_to_string(text: &str) -> String {
let single_quote_count = text.as_bytes().iter().filter(|&&b| b == b'\'').count();
let double_quote_count = text.as_bytes().iter().filter(|&&b| b == b'"').count();

let (quote, other_quote) =
if single_quote_count <= double_quote_count { ('\'', "\"") } else { ('"', "'") };
let text = text.replace(quote, other_quote);
format!("{quote}{text}{quote}")
}

0 comments on commit bfd0b0d

Please sign in to comment.