From bfd0b0da17a2a3e33dcf9c267679c8d64c4eb5bc Mon Sep 17 00:00:00 2001
From: overlookmotel <557937+overlookmotel@users.noreply.github.com>
Date: Sat, 18 Jan 2025 01:47:06 +0000
Subject: [PATCH] ci(benchmark): make lexer benchmark more realistic (#8573)

The lexer benchmarks had a problem. The lexer alone cannot make sense of
regexp literals, template literals, or JSX text elements - it needs the
parser "driving" it. So the lexer was producing plenty of errors on some
benchmarks. This is unrealistic - when driven by the parser, the lexer
produces no errors. Generating diagnostics is relatively expensive, so
this was skewing the benchmarks somewhat.

Solve this by cleaning up the input source text to replace these
syntaxes with string literals prior to running the benchmarks.

Unfortunately the lexer benchmarks now don't exercise the code paths for
these syntaxes, but there isn't much we can do about that. We can judge
by the parser benchmarks, which are the more important ones anyway.
---
 Cargo.lock                         |   1 +
 crates/oxc_parser/src/lexer/mod.rs |   7 ++
 tasks/benchmark/Cargo.toml         |   4 +-
 tasks/benchmark/benches/lexer.rs   | 122 +++++++++++++++++++++++++++--
 4 files changed, 125 insertions(+), 9 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 3f0fdf03ec2cb..9312e7d7b1cd3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1602,6 +1602,7 @@ version = "0.0.0"
 dependencies = [
  "criterion2",
  "oxc_allocator",
+ "oxc_ast",
  "oxc_codegen",
  "oxc_isolated_declarations",
  "oxc_linter",
diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs
index 4527e3a9f91aa..02397fdd2bda7 100644
--- a/crates/oxc_parser/src/lexer/mod.rs
+++ b/crates/oxc_parser/src/lexer/mod.rs
@@ -136,6 +136,13 @@ impl<'a> Lexer<'a> {
         Self::new(allocator, source_text, source_type, unique)
     }
 
+    /// Get errors.
+    /// Only used in benchmarks.
+    #[cfg(feature = "benchmarking")]
+    pub fn errors(&self) -> &[OxcDiagnostic] {
+        &self.errors
+    }
+
     /// Remaining string from `Source`
     pub fn remaining(&self) -> &'a str {
         self.source.remaining()
diff --git a/tasks/benchmark/Cargo.toml b/tasks/benchmark/Cargo.toml
index a3eceb7897bba..d9e330f0538e5 100644
--- a/tasks/benchmark/Cargo.toml
+++ b/tasks/benchmark/Cargo.toml
@@ -65,6 +65,7 @@ bench = false
 # with only the crates it needs, to speed up the builds
 [dependencies]
 oxc_allocator = { workspace = true, optional = true }
+oxc_ast = { workspace = true, optional = true }
 oxc_codegen = { workspace = true, optional = true }
 oxc_isolated_declarations = { workspace = true, optional = true }
 oxc_linter = { workspace = true, optional = true }
@@ -86,6 +87,7 @@ serde_json = { workspace = true, optional = true }
 [features]
 default = [
   "dep:oxc_allocator",
+  "dep:oxc_ast",
   "dep:oxc_codegen",
   "dep:oxc_isolated_declarations",
   "dep:oxc_linter",
@@ -103,7 +105,7 @@ codspeed_napi = ["criterion2/codspeed", "dep:serde", "dep:serde_json"]
 
 # Features for running each benchmark separately with minimum dependencies that benchmark needs.
 # e.g. `cargo build --release -p oxc_benchmark --bench parser --no-default-features --features parser`
-lexer = ["dep:oxc_allocator", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
+lexer = ["dep:oxc_allocator", "dep:oxc_ast", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
 parser = ["dep:oxc_allocator", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
 transformer = [
   "dep:oxc_allocator",
diff --git a/tasks/benchmark/benches/lexer.rs b/tasks/benchmark/benches/lexer.rs
index 234da608bf81f..0ef9dcb7c0762 100644
--- a/tasks/benchmark/benches/lexer.rs
+++ b/tasks/benchmark/benches/lexer.rs
@@ -1,7 +1,11 @@
 #![allow(clippy::disallowed_methods)]
 
 use oxc_allocator::Allocator;
+use oxc_ast::{ast::*, Visit};
 use oxc_benchmark::{criterion_group, criterion_main, BenchmarkId, Criterion};
-use oxc_parser::lexer::{Kind, Lexer};
+use oxc_parser::{
+    lexer::{Kind, Lexer},
+    Parser,
+};
 use oxc_span::SourceType;
 use oxc_tasks_common::{TestFile, TestFiles};
 
@@ -9,16 +13,25 @@
 fn bench_lexer(criterion: &mut Criterion) {
     let mut group = criterion.benchmark_group("lexer");
 
     // Lexer lacks awareness of JS grammar, so it gets confused by a few things without the parser
-    // driving it, notably escapes in regexps and template strings.
-    // So simplify the input for it, by removing backslashes and converting template strings to
-    // normal string literals.
+    // driving it. So simplify the input for it, by replacing these syntaxes with plain strings.
+    // This ensures lexing completes without generating any errors, which is more realistic.
+    //
+    // It's unfortunate that this benchmark doesn't exercise the code paths for these syntaxes,
+    // but this is the closest we can get to a realistic benchmark of the lexer in isolation.
+    let mut allocator = Allocator::default();
     let files = TestFiles::complicated()
         .files()
         .iter()
-        .map(|file| TestFile {
-            url: file.url.clone(),
-            file_name: file.file_name.clone(),
-            source_text: file.source_text.replace('\\', " ").replace('`', "'"),
+        .map(|file| {
+            let source_type = SourceType::from_path(&file.file_name).unwrap();
+
+            let mut cleaner = SourceCleaner::new(&file.source_text);
+            cleaner.clean(source_type, &allocator);
+            let source_text = cleaner.source_text;
+
+            allocator.reset();
+
+            TestFile { url: file.url.clone(), file_name: file.file_name.clone(), source_text }
         })
         .collect::<Vec<_>>();
@@ -43,3 +56,96 @@
 
 criterion_group!(lexer, bench_lexer);
 criterion_main!(lexer);
+
+/// Cleaner of source text.
+///
+/// Purpose is to allow the lexer to complete without any errors.
+/// Usually sources Oxc is asked to parse will not produce lexer errors, and generating diagnostics
+/// is fairly expensive, so it is unrealistic for benchmarking purposes.
+///
+/// Certain syntaxes will parse without error, but the lexer alone does not have the context to
+/// understand they're fine. Notably this includes syntaxes where the lexer only consumes the first
+/// character and the parser would then call back into the lexer to complete the job.
+///
+/// So replace these syntaxes with strings so that the lexer can complete without error:
+/// * `RegExpLiteral`
+/// * `TemplateLiteral`
+/// * `JSXText`
+struct SourceCleaner {
+    source_text: String,
+    replacements: Vec<Replacement>,
+}
+
+struct Replacement {
+    span: Span,
+    text: String,
+}
+
+impl SourceCleaner {
+    fn new(source_text: &str) -> Self {
+        Self { source_text: source_text.to_string(), replacements: vec![] }
+    }
+
+    fn clean(&mut self, source_type: SourceType, allocator: &Allocator) {
+        // Parse
+        let source_text = self.source_text.clone();
+        let parser_ret = Parser::new(allocator, &source_text, source_type).parse();
+        assert!(parser_ret.errors.is_empty());
+        let program = parser_ret.program;
+
+        // Visit AST and compile list of replacements
+        self.visit_program(&program);
+
+        // Make replacements
+        self.replacements.sort_unstable_by_key(|replacement| replacement.span);
+
+        for replacement in self.replacements.iter().rev() {
+            let span = replacement.span;
+            self.source_text
+                .replace_range(span.start as usize..span.end as usize, &replacement.text);
+        }
+
+        // Check lexer can lex it without any errors
+        let mut lexer = Lexer::new_for_benchmarks(allocator, &self.source_text, source_type);
+        while lexer.next_token().kind != Kind::Eof {}
+        assert!(lexer.errors().is_empty());
+    }
+
+    fn replace(&mut self, span: Span, text: String) {
+        self.replacements.push(Replacement { span, text });
+    }
+}
+
+impl<'a> Visit<'a> for SourceCleaner {
+    fn visit_reg_exp_literal(&mut self, regexp: &RegExpLiteral<'a>) {
+        let RegExpPattern::Raw(pattern) = regexp.regex.pattern else { unreachable!() };
+        let span = Span::sized(regexp.span.start, u32::try_from(pattern.len()).unwrap() + 2);
+        let text = convert_to_string(pattern);
+        self.replace(span, text);
+    }
+
+    fn visit_template_literal(&mut self, lit: &TemplateLiteral<'a>) {
+        let span = lit.span;
+        let text = span.shrink(1).source_text(&self.source_text);
+        let text = convert_to_string(text).replace('\n', " ");
+        self.replace(span, text);
+    }
+
+    fn visit_jsx_text(&mut self, jsx_text: &JSXText<'a>) {
+        let span = jsx_text.span;
+        let text = span.source_text(&self.source_text);
+        let text = convert_to_string(text).replace('\n', " ");
+        self.replace(span, text);
+    }
+}
+
+#[expect(clippy::naive_bytecount)]
+fn convert_to_string(text: &str) -> String {
+    let single_quote_count = text.as_bytes().iter().filter(|&&b| b == b'\'').count();
+    let double_quote_count = text.as_bytes().iter().filter(|&&b| b == b'"').count();
+
+    let (quote, other_quote) =
+        if single_quote_count <= double_quote_count { ('\'', "\"") } else { ('"', "'") };
+    let text = text.replace(quote, other_quote);
+    format!("{quote}{text}{quote}")
+}
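
Why `clean` applies the replacements back-to-front: `String::replace_range` changes the
length of the text, so every edit invalidates the byte offsets of all spans that follow it.
Sorting the replacements by span and then iterating in reverse means each edit lands at or
after the end of every span still to be applied, so those offsets stay valid. A minimal
standalone sketch of the same technique in plain std Rust (the helper name and the sample
spans are illustrative, not taken from the patch):

use std::ops::Range;

// Apply (span, replacement) edits from the end of the text backwards,
// so the byte offsets of not-yet-applied spans stay valid as lengths change.
fn apply_replacements(source: &mut String, mut replacements: Vec<(Range<usize>, String)>) {
    replacements.sort_unstable_by_key(|(range, _)| range.start);
    for (range, text) in replacements.into_iter().rev() {
        source.replace_range(range, &text);
    }
}

fn main() {
    let mut source = String::from("let re = /ab+c/; let t = `hi`;");
    apply_replacements(&mut source, vec![
        (9..15, String::from("'ab+c'")), // regexp literal -> string literal
        (25..29, String::from("'hi'")),  // template literal -> string literal
    ]);
    assert_eq!(source, "let re = 'ab+c'; let t = 'hi';");
}

`SourceCleaner::clean` does the same thing with oxc `Span`s collected during the AST visit,
then re-lexes the cleaned text to assert that no diagnostics are produced.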
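On `convert_to_string`: it wraps the text in whichever quote character occurs less often,
after swapping occurrences of that character inside the text to the other kind, so the chosen
delimiter never appears unescaped in the body (callers separately replace newlines with
spaces, since ordinary string literals cannot span lines). A hypothetical usage of the
function above, with made-up input, to illustrate:

// One ' and two " in the input, so ' is picked as the delimiter
// and the inner ' is swapped to " before wrapping.
assert_eq!(convert_to_string(r#"it's a "test""#), r#"'it"s a "test"'"#);

The swap changes the string's contents, but that is fine here: the benchmark only needs
output that lexes cleanly, not a semantically faithful transformation.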