diff --git a/.cspell.json b/.cspell.json index 0611fda62b..5dd086eba9 100644 --- a/.cspell.json +++ b/.cspell.json @@ -11,6 +11,7 @@ "doxygen", "ebnf", "inheritdoc", + "instanceof", "ipfs", "mkdocs", "napi", diff --git a/crates/solidity/outputs/npm/tests/src/tests/cst-cursor.ts b/crates/solidity/outputs/npm/tests/src/tests/cst-cursor.ts index 58a2fb06b3..e8e04daa6a 100644 --- a/crates/solidity/outputs/npm/tests/src/tests/cst-cursor.ts +++ b/crates/solidity/outputs/npm/tests/src/tests/cst-cursor.ts @@ -106,3 +106,30 @@ test("use cursor", () => { expectToken(cursor.node(), TokenKind.Semicolon, ";"); expect(cursor.goToNext()).toBe(false); }); + +test("cursor navigation", () => { + const data = "contract Foo {} contract Bar {} contract Baz {}"; + + const language = new Language("0.8.0"); + const parseTree = language.parse(RuleKind.SourceUnit, data); + + let contractNames = []; + let cursor = parseTree.createTreeCursor(); + + while (cursor.goToNextRuleWithKinds([RuleKind.ContractDefinition])) { + // We move the cursor into the contract below, so make sure to return it to its original position (see `goToParent()` at the end of the loop body) + cursor.goToFirstChild(); + cursor.goToNextTokenWithKinds([TokenKind.Identifier]); + + // The node the cursor now points to should be the Identifier token holding the contract's name + let tokenNode = cursor.node(); + if (tokenNode.kind !== TokenKind.Identifier) { + throw new Error("Expected identifier"); + } + contractNames.push(tokenNode.text); + + cursor.goToParent(); + } + + expect(contractNames).toEqual(["Foo", "Bar", "Baz"]); +}); diff --git a/documentation/public/user-guide/npm-package/how-to-parse-a-file/index.md b/documentation/public/user-guide/npm-package/how-to-parse-a-file/index.md index eeb7afbe51..691c79d833 100644 --- a/documentation/public/user-guide/npm-package/how-to-parse-a-file/index.md +++ b/documentation/public/user-guide/npm-package/how-to-parse-a-file/index.md @@ -1,3 +1,116 @@ # How to parse a Solidity file ---8<-- "crates/solidity/inputs/language/snippets/under-construction.md" +In this guide, we'll walk you through 
the process of parsing a Solidity file using Slang. See [Installation](../#installation) for how to install Slang. + +A file has to be parsed according to a specific Solidity [version](../../../solidity-specification/supported-versions/). The version has to be explicitly specified and is not inferred from the source. To selectively parse parts of the source code using different versions, e.g. when a contract spanning multiple files has been flattened into one, you need to do that manually. + +## Using the NPM package + +Start by adding the Slang package as a dependency to your project: + +```bash +$ npm install "@nomicfoundation/slang" +``` + +Using the API directly provides us with more fine-grained control over the parsing process; we can parse individual rules like contracts, various definitions or even expressions. + +We start by creating a `Language` object with a given version. This is the entry point for our parser API. + +```ts +import { Language } from "@nomicfoundation/slang/language"; +import { RuleKind, TokenKind } from "@nomicfoundation/slang/kinds"; +import { Cursor } from "@nomicfoundation/slang/cursor"; + +const source = "int256 constant z = 1 + 2;"; +const language = new Language("0.8.11"); + +const parseOutput = language.parse(RuleKind.SourceUnit, source); +const cursor: Cursor = parseOutput.createTreeCursor(); +``` + +The resulting `ParseOutput` class exposes these helpful functions: + +- `errors()`/`isValid()` that report structured parse errors, if any, +- `tree()` that gives us back a CST (partial if there were parse errors), +- `createTreeCursor()` that creates a `Cursor` object used to conveniently walk the parse tree. + +### Example 1: Reconstruct the Solidity file + +Let's parse a Solidity file and then reconstruct its source code from the resulting tree. 
+ +We'll start with this file: + +```solidity +// file: file.sol +pragma solidity ^0.8.0; +``` + +#### Step 1: Parse the Solidity file + +Let's naively (ignoring any errors) read the file and parse it: + +```ts +import * as fs from "node:fs"; +const data = fs.readFileSync("file.sol", "utf8"); + +let parseTree = language.parse(RuleKind.SourceUnit, data); +``` + +#### Step 2: Reconstruct the source code + +The `Cursor` visits the tree nodes in a depth-first search (DFS) fashion. Since our CST is complete (includes trivia such as whitespace), it's enough to visit the `Token` nodes and concatenate their text to reconstruct the original source code. + +Let's do that, walking the tree with a cursor created via `parseTree.createTreeCursor()`: + +```ts +import { TokenNode } from "@nomicfoundation/slang/cst"; + +let output = ""; +while (cursor.goToNext()) { + let node = cursor.node(); + if (node instanceof TokenNode) { + output += node.text; + } +} + +// Jest-style assertion for clarity +expect(output).toEqual("pragma solidity ^0.8.0;\n"); +``` + +### Example 2: List the top-level contracts and their names + +The `Cursor` type exposes more procedural-style functions that allow you to navigate the source in an imperative fashion. In addition to `goToNext`, we can go to the parent, first child, next sibling, etc., as well as nodes with a given kind. + +To list the top-level contracts and their names, we need to visit the `ContractDefinition` rule nodes and then their `Identifier` children. 
+ +Let's do that: + +```ts +import * as fs from "node:fs"; +import { RuleKind, TokenKind } from "@nomicfoundation/slang/kinds"; + +const data = fs.readFileSync("file.sol", "utf8"); + +const language = new Language("0.8.0"); +const parseTree = language.parse(RuleKind.SourceUnit, data); + +let contractNames = []; +let cursor = parseTree.createTreeCursor(); + +while (cursor.goToNextRuleWithKinds([RuleKind.ContractDefinition])) { + // You have to make sure you return the cursor to its original position + cursor.goToFirstChild(); + cursor.goToNextTokenWithKinds([TokenKind.Identifier]); + + // The currently pointed-to node is the name of the contract + let tokenNode = cursor.node(); + if (tokenNode.kind !== TokenKind.Identifier) { + throw new Error("Expected identifier"); + } + contractNames.push(tokenNode.text); + + cursor.goToParent(); +} + +expect(contractNames).toEqual(["Foo", "Bar", "Baz"]); +``` diff --git a/documentation/public/user-guide/rust-crate/how-to-parse-a-file/index.md b/documentation/public/user-guide/rust-crate/how-to-parse-a-file/index.md index eeb7afbe51..622118cb60 100644 --- a/documentation/public/user-guide/rust-crate/how-to-parse-a-file/index.md +++ b/documentation/public/user-guide/rust-crate/how-to-parse-a-file/index.md @@ -1,3 +1,304 @@ # How to parse a Solidity file ---8<-- "crates/solidity/inputs/language/snippets/under-construction.md" +In this guide, we'll walk you through the process of parsing a Solidity file using Slang. There are two ways to do it: using our CLI app or directly using the API. See [Installation](../#installation) for how to install Slang. + +A file has to be parsed according to a specific Solidity [version](../../../solidity-specification/supported-versions/). The version has to be explicitly specified and is not inferred from the source. To selectively parse parts of the source code using different versions, e.g. when a contract spanning multiple files has been flattened into one, you need to do that manually. 
+ +## Using the CLI + +After installing our CLI, you should now have `slang_solidity` in your $PATH. By default, `cargo` installs binaries into `$HOME/.cargo/bin`, so make sure you have that in your $PATH in order to execute the CLI from the command line. + +Usage: + +``` +slang_solidity parse [--json] --version <VERSION> <FILE_PATH> +``` + +All parse errors are printed in a human-readable format; the command succeeds if there were no parse errors and fails otherwise. + +If `--json` is specified, a JSON representation of a Concrete Syntax Tree (CST) is printed to the standard output stream; nothing is printed otherwise. + +The resulting parse tree has two node kinds: rules and tokens (see [Concepts](../../concepts.md)). + +Example: + +```json +// A Rule node +"Rule": { + // Name of the rule kind + "kind": "SourceUnit", + // Length of the rule in Unicode code points, depending on the encoding used + "text_len": { + "utf8": 24, + "utf16": 24, + "char": 24 // de facto utf32 + }, + "children": [/* Rule or Token nodes */] +} +// A Token node +"Token": { + // Name of the token kind + "kind": "PragmaKeyword", + // Literal value, taken from the source code + "text": "pragma" +} +``` + +Since the resulting structure is well-defined and recursive, we can turn to the popular `jq` tool to quickly analyze the resulting output. + +### Example 1: Reconstruct a Solidity file + +In this example, we'll use the command-line interface (CLI) to parse a Solidity file and then reconstruct it using the Token `text` fields. + +Since the parse tree is simply a structured view of the underlying source code and our parser is complete (contains trivia), we can simply concatenate the text values of the tokens to reconstruct the source code. 
+ +We'll start with a bare-bones file: + +```solidity +// file: file.sol +pragma solidity ^0.8.0; +``` + +#### Step 1: Parse the Solidity file + +First, use the `slang_solidity parse` command to parse the Solidity file: + +```bash +VERSION=0.8.11 +slang_solidity parse --json --version "$VERSION" file.sol > output.json +``` + +This command parses the Solidity file and outputs the resulting CST in JSON format. + +#### Step 2: Reconstruct the source code + +Next, let's inspect the tokens in the JSON output: + +```bash +$ JQ_QUERY='recurse | select(.Token?) | .Token' +$ cat output.json | jq "$JQ_QUERY" +``` + +This gives us a flat list of the Token nodes: + +```json +{ + "kind": "PragmaKeyword", + "text": "pragma" +} +{ + "kind": "Whitespace", + "text": " " +} +{ + "kind": "SolidityKeyword", + "text": "solidity" +} +{ + "kind": "Whitespace", + "text": " " +} +{ + "kind": "Caret", + "text": "^" +} +{ + "kind": "VersionPragmaValue", + "text": "0" +} +{ + "kind": "Period", + "text": "." +} +{ + "kind": "VersionPragmaValue", + "text": "8" +} +{ + "kind": "Period", + "text": "." +} +{ + "kind": "VersionPragmaValue", + "text": "0" +} +{ + "kind": "Semicolon", + "text": ";" +} +{ + "kind": "EndOfLine", + "text": "\n" +} +``` + +Now, we can adapt the query to select the `text` fields of the nodes and concatenate them: + +```bash +$ JQ_QUERY='[recurse | select(.Token?) | .Token.text] | join("")' +$ cat output.json | jq "$JQ_QUERY" +"pragma solidity ^0.8.0;\n" +``` + +And that gives us back the reconstructed source code! 🎉 + +### Example 2: List the top-level contracts and their names + +Let's do something slightly more practical. We want to get all of the contracts defined in a file and print their names. + +To do that, we need to: + +1. select Rule nodes of `kind` equal to `ContractDefinition` +2. iterate over their children +3. select Token nodes of `kind` equal to `Identifier` +4. print their `text` fields. 
+ +Given this file: + +```solidity +// file: file.sol +contract Foo {} +contract Bar {} +contract Baz {} +``` + +We can construct the `jq` query as follows: + +```bash +$ VERSION="0.8.0" +$ JQ_QUERY='.Rule | .. | select(.kind? == "ContractDefinition")' # Pick Rule nodes of kind "ContractDefinition" +$ JQ_QUERY+=' | .children[]' # Iterate over their children +$ JQ_QUERY+=' | select(.Token?.kind == "Identifier")' # Select Token nodes of kind "Identifier" +$ JQ_QUERY+=' | .Token.text' # Print their text fields +$ slang_solidity parse --json --version "$VERSION" file.sol | jq "$JQ_QUERY" +"Foo" +"Bar" +"Baz" +``` + +## Using the Rust library + +The Rust package is a regular crate published to crates.io, so we can add it to a project as a dependency with: + +```bash +$ cargo add slang_solidity +``` + +Using the API directly provides us with more fine-grained control over the parsing process. We're not limited to parsing the input as a top-level source unit; we can also parse individual rules like contracts, various definitions or even expressions. + +We start by creating a `Language` struct with a given version. This is the entry point for our parser API. + +```rust +// We use `anyhow` for convenient and simple error handling/propagation +use anyhow::Result; +use semver::Version; +use slang_solidity::cst::Node; +use slang_solidity::kinds::{RuleKind, TokenKind}; +use slang_solidity::language::Language; + +fn main() -> Result<()> { + let language = Language::new(Version::parse("0.8.0")?)?; + // Or, using the version directly: + let language = Language::new(Version::new(0, 8, 0))?; + + // ... +} +``` + +Now, we need to read the source code. The `parse` function accepts a `&str` slice, so we need to load it from the file directly or slice from an existing `String` buffer. + +Let's use the convenient [`std::fs::read_to_string`](https://doc.rust-lang.org/stable/std/fs/fn.read_to_string.html) helper to make our lives a bit easier. + +```rust +// ...cont. 
+let source_code = std::fs::read_to_string("file.sol")?; + +let parse_tree/*: ParseOutput*/ = language.parse(RuleKind::SourceUnit, &source_code); +``` + +The resulting `ParseOutput` type exposes these helpful functions: + +- `fn errors()`/`fn is_valid()` that report structured parse errors, if any, +- `fn tree()` that gives us back a CST (partial if there were parse errors), +- `fn create_tree_cursor()` that creates a `Cursor` type used to conveniently walk the parse tree. + +### Example 1: Reconstruct the Solidity file + +Let's try the same example, only now using the API directly. + +We'll start with this file: + +```solidity +// file: file.sol +pragma solidity ^0.8.0; +``` + +#### Step 1: Parse the Solidity file + +We can re-use the snippets above: + +```rust +// ...cont. +let source_code = std::fs::read_to_string("file.sol")?; + +let parse_tree/*: ParseOutput*/ = language.parse(RuleKind::SourceUnit, &source_code); +``` + +#### Step 2: Reconstruct the source code + +The API already includes a convenience method, `Node::unparse`, that does exactly that: + +```rust +let output/*: String*/ = parse_tree.tree().unparse(); +assert_eq!(output, "pragma solidity ^0.8.0;\n"); +``` + +However, let's do that ourselves to exercise the tree walking. + +The `Cursor` type implements the `Iterator` trait, yielding the tree nodes in a depth-first search (DFS) fashion. + +To only visit the `Token` nodes, we can use the [`Iterator::filter_map`](https://doc.rust-lang.org/stable/std/iter/trait.Iterator.html#method.filter_map) iterator adapter that filters and maps the tree token nodes using the helper [`Node::into_token`](https://docs.rs/slang_solidity/latest/slang_solidity/cst/enum.Node.html#method.into_token) conversion function. 
+ +```rust +let cursor = parse_tree.create_tree_cursor(); +let only_tokens_iter = cursor.filter_map(|node| node.into_token()); + +let mut source = String::new(); +for token_node in only_tokens_iter { + source.push_str(&token_node.text); +} + +assert_eq!(source, "pragma solidity ^0.8.0;\n"); +``` + +### Example 2: List the top-level contracts and their names + +In addition to the `Iterator` implementation, the `Cursor` type also exposes procedural-style functions that allow you to navigate the source in an imperative fashion: + +```rust +const SOURCE: &str = " +contract Foo {} +contract Bar {} +contract Baz {} +"; + +let language = Language::new(Version::parse("0.8.0")?)?; +let parse_output = language.parse(RuleKind::SourceUnit, SOURCE); + +let mut contract_names = Vec::new(); +let mut cursor = parse_output.create_tree_cursor(); + +while cursor.go_to_next_rule_with_kinds(&[RuleKind::ContractDefinition]) { + // You have to make sure you return the cursor to its original position + assert!(cursor.go_to_first_child()); + assert!(cursor.go_to_next_token_with_kinds(&[TokenKind::Identifier])); + + // The currently pointed-to node is the name of the contract + let token_node = cursor.node(); + contract_names.push(token_node.as_token().unwrap().text.clone()); + + assert!(cursor.go_to_parent()); +} + +assert_eq!(contract_names, &["Foo", "Bar", "Baz"]); +```