forked from carbon-language/carbon-lang
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_tree.h
381 lines (320 loc) · 14.2 KB
/
parse_tree.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#ifndef CARBON_TOOLCHAIN_PARSER_PARSE_TREE_H_
#define CARBON_TOOLCHAIN_PARSER_PARSE_TREE_H_
#include <iterator>
#include "common/ostream.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/raw_ostream.h"
#include "toolchain/diagnostics/diagnostic_emitter.h"
#include "toolchain/lexer/tokenized_buffer.h"
#include "toolchain/parser/parse_node_kind.h"
namespace Carbon {
// A tree of parsed tokens based on the language grammar.
//
// This is a purely syntactic parse tree without any semantics yet attached. It
// is based on the token stream and the grammar of the language without even
// name lookup.
//
// The tree is designed to make depth-first traversal especially efficient, with
// postorder and reverse postorder (RPO, a topological order) not even requiring
// extra state.
//
// The nodes of the tree follow a flyweight pattern and are handles into the
// tree. The tree itself must be available to query for information about those
// nodes.
//
// Nodes also have a precise one-to-one correspondence to tokens from the parsed
// token stream. Each node can be thought of as the tree-position of a
// particular token from the stream.
//
// The tree is immutable once built, but is designed to support reasonably
// efficient patterns that build a new tree with a specific transformation
// applied.
class ParseTree {
public:
class Node;
class PostorderIterator;
class SiblingIterator;
// The maximum stack depth allowed while recursing the parse tree.
// This is meant to approximate system stack limits, but we may need to find a
// better way to track what the system is enforcing.
static constexpr int StackDepthLimit = 200;
// Parses the token buffer into a `ParseTree`.
//
// This is the factory function which is used to build parse trees.
static auto Parse(TokenizedBuffer& tokens, DiagnosticConsumer& consumer)
-> ParseTree;
// Tests whether there are any errors in the parse tree.
[[nodiscard]] auto has_errors() const -> bool { return has_errors_; }
// Returns the number of nodes in this parse tree.
[[nodiscard]] auto size() const -> int { return node_impls_.size(); }
// Returns an iterable range over the parse tree nodes in depth-first
// postorder.
[[nodiscard]] auto postorder() const
-> llvm::iterator_range<PostorderIterator>;
// Returns an iterable range over the parse tree node and all of its
// descendants in depth-first postorder.
[[nodiscard]] auto postorder(Node n) const
-> llvm::iterator_range<PostorderIterator>;
// Returns an iterable range over the direct children of a node in the parse
// tree. This is a forward range, but is constant time to increment. The order
// of children is the same as would be found in a reverse postorder traversal.
[[nodiscard]] auto children(Node n) const
-> llvm::iterator_range<SiblingIterator>;
// Returns an iterable range over the roots of the parse tree. This is a
// forward range, but is constant time to increment. The order of roots is the
// same as would be found in a reverse postorder traversal.
[[nodiscard]] auto roots() const -> llvm::iterator_range<SiblingIterator>;
// Tests whether a particular node contains an error and may not match the
// full expected structure of the grammar.
[[nodiscard]] auto node_has_error(Node n) const -> bool;
// Returns the kind of the given parse tree node.
[[nodiscard]] auto node_kind(Node n) const -> ParseNodeKind;
// Returns the token the given parse tree node models.
[[nodiscard]] auto node_token(Node n) const -> TokenizedBuffer::Token;
[[nodiscard]] auto node_subtree_size(Node n) const -> int32_t;
// Returns the text backing the token for the given node.
//
// This is a convenience method for chaining from a node through its token to
// the underlying source text.
[[nodiscard]] auto GetNodeText(Node n) const -> llvm::StringRef;
// Prints a description of the parse tree to the provided `raw_ostream`.
//
// While the parse tree is represented as a postorder sequence, we print it in
// preorder to make it easier to visualize and read. The node indices are the
// postorder indices. The print out represents each node as a YAML record,
// with children nested within it.
//
// A single node without children is formatted as:
// ```
// {node_index: 0, kind: 'foo', text: '...'}
// ```
// A node with two children, one of them with an error:
// ```
// {node_index: 2, kind: 'foo', text: '...', children: [
// {node_index: 0, kind: 'bar', text: '...', has_error: yes},
// {node_index: 1, kind: 'baz', text: '...'}]}
// ```
// The top level is formatted as an array of these nodes.
// ```
// [
// {node_index: 1, kind: 'foo', text: '...'},
// {node_index: 0, kind: 'foo', text: '...'},
// ...
// ]
// ```
//
// This can be parsed as YAML using tools like `python-yq` combined with `jq`
// on the command line. The format is also reasonably amenable to other
// line-oriented shell tools from `grep` to `awk`.
auto Print(llvm::raw_ostream& output) const -> void;
// Verifies the parse tree structure.
//
// This tries to check any invariants of the parse tree structure and write
// out information about it to stderr. Returns false if anything fails to
// verify. This is primarily intended to be used as a debugging aid. A typical
// usage is to `assert` on the result. This routine doesn't directly assert so
// that it can be used even when asserts are disabled or within a debugger.
[[nodiscard]] auto Verify() const -> bool;
private:
class Parser;
friend Parser;
// The in-memory representation of data used for a particular node in the
// tree.
struct NodeImpl {
explicit NodeImpl(ParseNodeKind k, TokenizedBuffer::Token t,
int subtree_size_arg)
: kind(k), token(t), subtree_size(subtree_size_arg) {}
// The kind of this node. Note that this is only a single byte.
ParseNodeKind kind;
// We have 3 bytes of padding here that we can pack flags or other compact
// data into.
// Whether this node is or contains a parse error.
//
// When this is true, this node and its children may not have the expected
// grammatical production structure. Prior to reasoning about any specific
// subtree structure, this flag must be checked.
//
// Not every node in the path from the root to an error will have this field
// set to true. However, any node structure that fails to conform to the
// expected grammatical production will be contained within a subtree with
// this flag set. Whether parents of that subtree also have it set is
// optional (and will depend on the particular parse implementation
// strategy). The goal is that you can rely on grammar-based structural
// invariants *until* you encounter a node with this set.
bool has_error = false;
// The token root of this node.
TokenizedBuffer::Token token;
// The size of this node's subtree of the parse tree. This is the number of
// nodes (and thus tokens) that are covered by this node (and its
// descendents) in the parse tree.
//
// During a *reverse* postorder (RPO) traversal of the parse tree, this can
// also be thought of as the offset to the next non-descendant node. When
// this node is not the first child of its parent (which is the last child
// visited in RPO), that is the offset to the next sibling. When this node
// *is* the first child of its parent, this will be an offset to the node's
// parent's next sibling, or if it the parent is also a first child, the
// grandparent's next sibling, and so on.
//
// This field should always be a positive integer as at least this node is
// part of its subtree.
int32_t subtree_size;
};
static_assert(sizeof(NodeImpl) == 12,
"Unexpected size of node implementation!");
// Wires up the reference to the tokenized buffer. The global `parse` routine
// should be used to actually parse the tokens into a tree.
explicit ParseTree(TokenizedBuffer& tokens_arg) : tokens_(&tokens_arg) {}
// Depth-first postorder sequence of node implementation data.
llvm::SmallVector<NodeImpl, 0> node_impls_;
TokenizedBuffer* tokens_;
// Indicates if any errors were encountered while parsing.
//
// This doesn't indicate how much of the tree is structurally accurate with
// respect to the grammar. That can be identified by looking at the `HasError`
// flag for a given node (see above for details). This simply indicates that
// some errors were encountered somewhere. A key implication is that when this
// is true we do *not* have the expected 1:1 mapping between tokens and parsed
// nodes as some tokens may have been skipped.
bool has_errors_ = false;
};
// A lightweight handle representing a node in the tree.
//
// Objects of this type are small and cheap to copy and store. They don't
// contain any of the information about the node, and serve as a handle that
// can be used with the underlying tree to query for detailed information.
//
// That said, nodes can be compared and are part of a depth-first pre-order
// sequence across all nodes in the parse tree.
class ParseTree::Node {
public:
// Node handles are default constructable, but such a node cannot be used
// for anything. It just allows it to be initialized later through
// assignment. Any other operation on a default constructed node is an
// error.
Node() = default;
friend auto operator==(Node lhs, Node rhs) -> bool {
return lhs.index_ == rhs.index_;
}
friend auto operator!=(Node lhs, Node rhs) -> bool {
return lhs.index_ != rhs.index_;
}
friend auto operator<(Node lhs, Node rhs) -> bool {
return lhs.index_ < rhs.index_;
}
friend auto operator<=(Node lhs, Node rhs) -> bool {
return lhs.index_ <= rhs.index_;
}
friend auto operator>(Node lhs, Node rhs) -> bool {
return lhs.index_ > rhs.index_;
}
friend auto operator>=(Node lhs, Node rhs) -> bool {
return lhs.index_ >= rhs.index_;
}
// Returns an opaque integer identifier of the node in the tree. Clients
// should not expect any particular semantics from this value.
//
// TODO: Maybe we can switch to stream operator overloads?
[[nodiscard]] auto index() const -> int { return index_; }
// Prints the node index.
auto Print(llvm::raw_ostream& output) const -> void;
// Returns true if the node is valid; in other words, it was not default
// initialized.
auto is_valid() -> bool { return index_ != InvalidValue; }
private:
friend ParseTree;
friend Parser;
friend PostorderIterator;
friend SiblingIterator;
// Value for uninitialized nodes.
static constexpr int InvalidValue = -1;
// Constructs a node with a specific index into the parse tree's postorder
// sequence of node implementations.
explicit Node(int index) : index_(index) {}
// The index of this node's implementation in the postorder sequence.
int32_t index_ = InvalidValue;
};
// A random-access iterator to the depth-first postorder sequence of parse nodes
// in the parse tree. It produces `ParseTree::Node` objects which are opaque
// handles and must be used in conjunction with the `ParseTree` itself.
class ParseTree::PostorderIterator
: public llvm::iterator_facade_base<PostorderIterator,
std::random_access_iterator_tag, Node,
int, Node*, Node> {
public:
// Default construction is only provided to satisfy iterator requirements. It
// produces an unusable iterator, and you must assign a valid iterator to it
// before performing any operations.
PostorderIterator() = default;
auto operator==(const PostorderIterator& rhs) const -> bool {
return node_ == rhs.node_;
}
auto operator<(const PostorderIterator& rhs) const -> bool {
return node_ < rhs.node_;
}
auto operator*() const -> Node { return node_; }
auto operator-(const PostorderIterator& rhs) const -> int {
return node_.index_ - rhs.node_.index_;
}
auto operator+=(int offset) -> PostorderIterator& {
node_.index_ += offset;
return *this;
}
auto operator-=(int offset) -> PostorderIterator& {
node_.index_ -= offset;
return *this;
}
// Prints the underlying node index.
auto Print(llvm::raw_ostream& output) const -> void;
private:
friend class ParseTree;
explicit PostorderIterator(Node n) : node_(n) {}
Node node_;
};
// A forward iterator across the siblings at a particular level in the parse
// tree. It produces `ParseTree::Node` objects which are opaque handles and must
// be used in conjunction with the `ParseTree` itself.
//
// While this is a forward iterator and may not have good locality within the
// `ParseTree` data structure, it is still constant time to increment and
// suitable for algorithms relying on that property.
//
// The siblings are discovered through a reverse postorder (RPO) tree traversal
// (which is made constant time through cached distance information), and so the
// relative order of siblings matches their RPO order.
class ParseTree::SiblingIterator
: public llvm::iterator_facade_base<
SiblingIterator, std::forward_iterator_tag, Node, int, Node*, Node> {
public:
SiblingIterator() = default;
auto operator==(const SiblingIterator& rhs) const -> bool {
return node_ == rhs.node_;
}
auto operator<(const SiblingIterator& rhs) const -> bool {
// Note that child iterators walk in reverse compared to the postorder
// index.
return node_ > rhs.node_;
}
auto operator*() const -> Node { return node_; }
using iterator_facade_base::operator++;
auto operator++() -> SiblingIterator& {
node_.index_ -= std::abs(tree_->node_impls_[node_.index_].subtree_size);
return *this;
}
// Prints the underlying node index.
auto Print(llvm::raw_ostream& output) const -> void;
private:
friend class ParseTree;
explicit SiblingIterator(const ParseTree& tree_arg, Node n)
: tree_(&tree_arg), node_(n) {}
const ParseTree* tree_;
Node node_;
};
} // namespace Carbon
#endif // CARBON_TOOLCHAIN_PARSER_PARSE_TREE_H_