From 81269a91c6999aca2ee9ffe20ff013e27f080a3a Mon Sep 17 00:00:00 2001
From: Lennart Van Hirtum
Date: Wed, 31 Jan 2024 15:04:26 +0100
Subject: [PATCH] Start on Span refactor: replace Tokens with SoA

---
 src/dev_aid/lsp.rs                 |  2 +-
 src/dev_aid/syntax_highlighting.rs | 19 ++++---
 src/linker.rs                      |  6 +--
 src/parser.rs                      | 81 +++++++++++++++---------------
 src/tokenizer.rs                   | 80 ++++++++++++++++-------------
 5 files changed, 97 insertions(+), 91 deletions(-)

diff --git a/src/dev_aid/lsp.rs b/src/dev_aid/lsp.rs
index 1f41f5b..fa01c3d 100644
--- a/src/dev_aid/lsp.rs
+++ b/src/dev_aid/lsp.rs
@@ -198,7 +198,7 @@ fn do_syntax_highlight(file_data : &FileData, linker : &Linker) -> (SemanticToke
         let typ = get_semantic_token_type_from_ide_token(ide_tok);
         let mod_bits = get_modifiers_for_token(ide_tok);
 
-        let tok_range = file_data.tokens[tok_idx].get_range();
+        let tok_range = file_data.tokens.get_token_range(tok_idx);
         let whitespace_text = &file_text[cur_whitespace_start..tok_range.start];
         cur_whitespace_start = tok_range.end;
         let token_text = &file_text[tok_range];
diff --git a/src/dev_aid/syntax_highlighting.rs b/src/dev_aid/syntax_highlighting.rs
index 7195fcb..b80e074 100644
--- a/src/dev_aid/syntax_highlighting.rs
+++ b/src/dev_aid/syntax_highlighting.rs
@@ -45,13 +45,13 @@ fn pretty_print_chunk_with_whitespace(whitespace_start : usize, file_text : &str
     print!("{}{}", whitespace_text, st.apply_to(&file_text[text_span]));
 }
 
-fn print_tokens(file_text : &str, tokens : &[Token]) {
+fn print_tokens(file_text : &str, tokens : &TokenizeResult) {
     let mut whitespace_start : usize = 0;
 
-    for (tok_idx, token) in tokens.iter().enumerate() {
+    for tok_idx in 0..tokens.len() {
         let styles = [Style::new().magenta(), Style::new().yellow(), Style::new().blue()];
         let st = styles[tok_idx % styles.len()].clone().underlined();
-        let token_range = token.get_range();
+        let token_range = tokens.get_token_range(tok_idx);
         pretty_print_chunk_with_whitespace(whitespace_start, file_text, token_range.clone(), st);
         whitespace_start = token_range.end;
     }
@@ -59,7 +59,7 @@ fn print_tokens(file_text : &str, tokens : &[Token]) {
     print!("{}\n", &file_text[whitespace_start..file_text.len()]);
 }
 
-fn pretty_print(file_text : &str, tokens : &[Token], ide_infos : &[IDEToken]) {
+fn pretty_print(file_text : &str, tokens : &TokenizeResult, ide_infos : &[IDEToken]) {
     let mut whitespace_start : usize = 0;
 
     for (tok_idx, token) in ide_infos.iter().enumerate() {
@@ -85,7 +85,7 @@ fn pretty_print(file_text : &str, tokens : &[Token], ide_infos : &[IDEToken]) {
             }
         };
 
-        let tok_span = tokens[tok_idx].get_range();
+        let tok_span = tokens.get_token_range(tok_idx);
         pretty_print_chunk_with_whitespace(whitespace_start, file_text, tok_span.clone(), st);
         whitespace_start = tok_span.end;
     }
@@ -162,8 +162,7 @@ pub fn create_token_ide_info<'a>(parsed: &FileData, linker : &Linker) -> Vec<IDE
     let mut result : Vec<IDEToken> = Vec::new();
     result.reserve(parsed.tokens.len());
 
-    for t in &parsed.tokens {
-        let tok_typ = t.get_type();
+    for &tok_typ in &parsed.tokens.token_types {
         let initial_typ = if is_keyword(tok_typ) {
             IDETokenType::Keyword
         } else if is_bracket(tok_typ) != IsBracket::NotABracket {
@@ -196,14 +195,14 @@ pub fn create_token_ide_info<'a>(parsed: &FileData, linker : &Linker) -> Vec<IDE
     result
 }
 
-fn generate_character_offsets(file_text : &str, tokens : &[Token]) -> Vec<Range<usize>> {
+fn generate_character_offsets(file_text : &str, tokens : &TokenizeResult) -> Vec<Range<usize>> {
     let mut character_offsets : Vec<Range<usize>> = Vec::new();
     character_offsets.reserve(tokens.len());
 
     let mut cur_char = 0;
     let mut whitespace_start = 0;
-    for tok in tokens {
-        let tok_range = tok.get_range();
+    for tok_idx in 0..tokens.len() {
+        let tok_range = tokens.get_token_range(tok_idx);
 
         // whitespace
         cur_char += file_text[whitespace_start..tok_range.start].chars().count();
diff --git a/src/linker.rs b/src/linker.rs
index 708f81b..1498ed1 100644
--- a/src/linker.rs
+++ b/src/linker.rs
@@ -1,6 +1,6 @@
 use std::{collections::{HashMap, HashSet}, rc::Rc, cell::RefCell};
 
-use crate::{ast::{Module, LinkInfo, Span}, arena_alloc::{ArenaAllocator, UUID, UUIDMarker}, parser::{FullParseResult, TokenTreeNode}, tokenizer::Token, errors::{ErrorCollector, error_info}, flattening::FlattenedModule, util::{const_str_position, const_str_position_in_tuples}, instantiation::InstantiatedModule, value::Value, typing::Type};
+use crate::{arena_alloc::{ArenaAllocator, UUID, UUIDMarker}, ast::{Module, LinkInfo, Span}, errors::{ErrorCollector, error_info}, flattening::FlattenedModule, instantiation::InstantiatedModule, parser::{FullParseResult, TokenTreeNode}, tokenizer::TokenizeResult, typing::Type, util::{const_str_position, const_str_position_in_tuples}, value::Value};
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub struct ModuleUUIDMarker;
@@ -119,7 +119,7 @@ impl Linkable for NamedType {
 
 pub struct FileData {
     pub file_text : String,
-    pub tokens : Vec<Token>,
+    pub tokens : TokenizeResult,
     pub token_hierarchy : Vec<TokenTreeNode>,
     pub parsing_errors : ErrorCollector,
     pub associated_values : Vec<NamedUUID>
@@ -127,7 +127,7 @@ pub struct FileData {
 }
 
 impl FileData {
     fn get_token_text(&self, token_idx : usize) -> &str {
-        &self.file_text[self.tokens[token_idx].get_range()]
+        &self.file_text[self.tokens.get_token_range(token_idx)]
     }
 }
diff --git a/src/parser.rs b/src/parser.rs
index 6b147d6..d757176 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -13,20 +13,20 @@ struct TokenContent {
 }
 
 pub enum TokenTreeNode {
-    PlainToken(Token, usize), // Has the index of the given token to the global Token array
+    PlainToken{tok_typ : TokenTypeIdx, range : Range<usize>, tok_idx : usize}, // Has the index of the given token to the global Token array
     // Code between '{' and '}', '(' and ')', or '[' and ']' exclusive. Contains sublist of tokens, index of open, index of close bracket
     Block(TokenTypeIdx, Vec<TokenTreeNode>, Span),
 }
 impl TokenTreeNode {
     fn get_token_type(&self) -> TokenTypeIdx {
         match self {
-            Self::PlainToken(tok, _pos) => tok.get_type(),
+            Self::PlainToken{tok_typ, range : _, tok_idx : _} => *tok_typ,
             Self::Block(typ, _content, _span) => *typ
         }
     }
     fn get_span(&self) -> Span {
         match self {
-            Self::PlainToken(_typ, pos) => Span::from(*pos),
+            Self::PlainToken{tok_typ: _, range : _, tok_idx} => Span::from(*tok_idx),
             Self::Block(_typ, _content, span) => *span
         }
     }
@@ -51,18 +51,17 @@ struct TokenHierarchyStackElem {
     parent : Vec<TokenTreeNode>
 }
 
-pub fn to_token_hierarchy(tokens : &[Token], errors : &ErrorCollector) -> Vec<TokenTreeNode> {
+pub fn to_token_hierarchy(tokens : &TokenizeResult, errors : &ErrorCollector) -> Vec<TokenTreeNode> {
     let mut cur_token_slab : Vec<TokenTreeNode> = Vec::new();
     let mut stack : Vec<TokenHierarchyStackElem> = Vec::new(); // Type of opening bracket, token position, Token Subtree
 
-    for (idx, &tok) in tokens.iter().enumerate() {
-        let tok_typ = tok.get_type();
+    for (tok_idx, &tok_typ) in tokens.token_types.iter().enumerate() {
         if tok_typ == TOKEN_COMMENT || tok_typ == TOKEN_INVALID { // At this stage the comments are filtered out
             continue;
         }
         match is_bracket(tok_typ) {
             IsBracket::Open => {
-                stack.push(TokenHierarchyStackElem{open_bracket : tok_typ, open_bracket_pos : idx, parent : cur_token_slab});
+                stack.push(TokenHierarchyStackElem{open_bracket : tok_typ, open_bracket_pos : tok_idx, parent : cur_token_slab});
                 cur_token_slab = Vec::new();
             },
             IsBracket::Close => {
@@ -70,27 +69,27 @@ pub fn to_token_hierarchy(tokens : &[Token], errors : &ErrorCollector) -> Vec<Tok
             IsBracket::NotABracket => {
-                cur_token_slab.push(TokenTreeNode::PlainToken(tok, idx));
+                cur_token_slab.push(TokenTreeNode::PlainToken{tok_typ, range : tokens.get_token_range(tok_idx), tok_idx});
             }
         }
     }
@@ -170,18 +169,18 @@ impl<'it> TokenStream<'it> {
         }
     }
     fn peek_is_plain(&mut self, expected : TokenTypeIdx) -> bool {
-        if let Some(TokenTreeNode::PlainToken(tok, _place)) = self.iter.peek() {
-            if tok.get_type() == expected {
+        if let Some(TokenTreeNode::PlainToken{tok_typ, range : _, tok_idx : _}) = self.iter.peek() {
+            if *tok_typ == expected {
                 return true;
             }
         }
         false
     }
     fn eat_is_plain(&mut self, expected : TokenTypeIdx) -> Option<TokenContent> {
-        if let Some(TokenTreeNode::PlainToken(tok, pos)) = self.peek() {
-            if tok.get_type() == expected {
+        if let Some(TokenTreeNode::PlainToken{tok_typ, range, tok_idx}) = self.peek() {
+            if *tok_typ == expected {
                 self.next();
-                return Some(TokenContent{position : *pos, text : tok.get_range()});
+                return Some(TokenContent{position : *tok_idx, text : range.clone()});
             }
         }
         None
@@ -240,8 +239,8 @@ impl<'file> ASTParserContext<'file> {
             None => {
                 self.errors.error_basic(Span::from(unexpected_eof_idx), format!("Unexpected End of Scope while parsing {context}. Expected {expected_list_str}"))
             }
-            Some(TokenTreeNode::PlainToken(tok, pos)) => {
-                self.error_unexpected_token_str(expected_list_str, tok.get_type(), *pos, context);
+            Some(TokenTreeNode::PlainToken{tok_typ, range: _, tok_idx}) => {
+                self.error_unexpected_token_str(expected_list_str, *tok_typ, *tok_idx, context);
             }
             Some(TokenTreeNode::Block(typ, _, span)) => {
                 let tok_typ_name = get_token_type_name(*typ);
@@ -255,8 +254,8 @@ impl<'file> ASTParserContext<'file> {
         assert!(is_bracket(expected) == IsBracket::NotABracket);
 
         match token_stream.next() {
-            Some(TokenTreeNode::PlainToken(tok, idx)) if tok.get_type() == expected => {
-                Some(TokenContent{position : *idx, text : tok.get_range()})
+            Some(TokenTreeNode::PlainToken{tok_typ, range, tok_idx}) if *tok_typ == expected => {
+                Some(TokenContent{position : *tok_idx, text : range.clone()})
             },
             other => {
                 self.error_unexpected_tree_node(&[expected], other, token_stream.unexpected_eof_token, context);
@@ -313,23 +312,23 @@ impl<'file> ASTParserContext<'file> {
     // For expression
     fn parse_unit_expression(&mut self, token_stream : &mut TokenStream, scope : &LocalVariableContext) -> Option<(Expression, Span)> {
         let mut base_expr : (Expression, Span) = match token_stream.next() {
-            Some(TokenTreeNode::PlainToken(tok, pos)) if is_unary_operator(tok.get_type()) => {
+            Some(TokenTreeNode::PlainToken{tok_typ, range: _, tok_idx}) if is_unary_operator(*tok_typ) => {
                 let found_expr = self.parse_unit_expression(token_stream, scope)?;
-                let new_span = Span(*pos, found_expr.1.1);
-                return Some((Expression::UnaryOp(Box::new((Operator{op_typ : tok.get_type()}, *pos, found_expr))), new_span));
+                let new_span = Span(*tok_idx, found_expr.1.1);
+                return Some((Expression::UnaryOp(Box::new((Operator{op_typ : *tok_typ}, *tok_idx, found_expr))), new_span));
            },
-            Some(TokenTreeNode::PlainToken(tok, pos)) if tok.get_type() == TOKEN_IDENTIFIER => {
-                let ident_ref = if let Some(local_idx) = scope.get_declaration_for(&self.file_text[tok.get_range()]) {
+            Some(TokenTreeNode::PlainToken{tok_typ, range, tok_idx}) if *tok_typ == TOKEN_IDENTIFIER => {
+                let ident_ref = if let Some(local_idx) = scope.get_declaration_for(&self.file_text[range.clone()]) {
                     LocalOrGlobal::Local(local_idx)
                 } else {
                     // todo namespacing and shit
-                    LocalOrGlobal::Global(Span::from(*pos))
+                    LocalOrGlobal::Global(Span::from(*tok_idx))
                 };
-                (Expression::Named(ident_ref), Span::from(*pos))
+                (Expression::Named(ident_ref), Span::from(*tok_idx))
             },
-            Some(TokenTreeNode::PlainToken(tok, pos)) if tok.get_type() == TOKEN_NUMBER => {
-                let value = &self.file_text[tok.get_range()];
-                (Expression::Constant(Value::Integer(BigInt::from_str(value).unwrap())), Span::from(*pos))
+            Some(TokenTreeNode::PlainToken{tok_typ, range, tok_idx}) if *tok_typ == TOKEN_NUMBER => {
+                let value = &self.file_text[range.clone()];
+                (Expression::Constant(Value::Integer(BigInt::from_str(value).unwrap())), Span::from(*tok_idx))
             },
             Some(TokenTreeNode::Block(typ, contents, span)) if *typ == kw("(") => {
                 let mut content_token_stream = TokenStream::new(contents, span.0, span.1);
@@ -390,10 +389,10 @@ impl<'file> ASTParserContext<'file> {
         loop {
             let mut grabbed_symbol = self.parse_unit_expression(token_stream, scope)?;
             match token_stream.peek() {
-                Some(TokenTreeNode::PlainToken(tok, op_pos)) if is_operator(tok.get_type()) => {
+                Some(TokenTreeNode::PlainToken{tok_typ, range: _, tok_idx}) if is_operator(*tok_typ) => {
                    //let operator_prescedence = get_binary_operator_prescedence(*typ);
                    while let Some((left_expr, stack_op, stack_op_pos)) = stack.pop() {
-                        if get_binary_operator_prescedence(stack_op) >= get_binary_operator_prescedence(tok.get_type()) {
+                        if get_binary_operator_prescedence(stack_op) >= get_binary_operator_prescedence(*tok_typ) {
                             grabbed_symbol = Expression::new_binop(left_expr, Operator{op_typ : stack_op}, stack_op_pos, grabbed_symbol);
                         } else {
                             stack.push((left_expr, stack_op, stack_op_pos)); // oops, shouldn't have popped it
@@ -402,7 +401,7 @@ impl<'file> ASTParserContext<'file> {
                     }
 
                     token_stream.next(); // commit operator peek
-                    stack.push((grabbed_symbol, tok.get_type(), *op_pos));
+                    stack.push((grabbed_symbol, *tok_typ, *tok_idx));
                },
                _other => {
                    while let Some((left_expr, stack_op, stack_op_pos)) = stack.pop() {
@@ -552,15 +551,15 @@ impl<'file> ASTParserContext<'file> {
                 }
             }
             match token_stream.next() {
-                Some(TokenTreeNode::PlainToken(tok, _pos)) if tok.get_type() == kw(",") => {
+                Some(TokenTreeNode::PlainToken{tok_typ, range:_, tok_idx:_}) if *tok_typ == kw(",") => {
                     continue; // parse next declaration
                 }
-                Some(TokenTreeNode::PlainToken(tok, assign_pos)) if tok.get_type() == kw("=") => {
+                Some(TokenTreeNode::PlainToken{tok_typ, range:_, tok_idx}) if *tok_typ == kw("=") => {
                     // Ends the loop
                     // T a, T b = x(y);
-                    return self.parse_statement_handle_assignment(left_expressions, *assign_pos, token_stream, scope, &mut code_block.statements, start_at);
+                    return self.parse_statement_handle_assignment(left_expressions, *tok_idx, token_stream, scope, &mut code_block.statements, start_at);
                }
-                Some(TokenTreeNode::PlainToken(tok, _pos)) if tok.get_type() == kw(";") => {
+                Some(TokenTreeNode::PlainToken{tok_typ, range:_, tok_idx:_}) if *tok_typ == kw(";") => {
                     // Ends the loop
                     return self.parse_statement_handle_end(left_expressions, all_decls, &mut code_block.statements);
                }
@@ -756,8 +755,8 @@ impl<'file> ASTParserContext<'file> {
 
         while let Some(t) = outer_token_iter.next() {
             match t {
-                TokenTreeNode::PlainToken(tok, module_kw_pos) if tok.get_type() == kw("module") => {
-                    if let Some(module) = self.parse_module(outer_token_iter, *module_kw_pos) {
+                TokenTreeNode::PlainToken{tok_typ, range:_, tok_idx} if *tok_typ == kw("module") => {
+                    if let Some(module) = self.parse_module(outer_token_iter, *tok_idx) {
                        modules.push(module);
                    }
                },
@@ -783,7 +782,7 @@ pub fn parse<'nums, 'g, 'file>(token_hierarchy : &Vec<TokenTreeNode>, file_text
 
 pub struct FullParseResult {
     pub file_text : String,
-    pub tokens : Vec<Token>,
+    pub tokens : TokenizeResult,
     pub token_hierarchy : Vec<TokenTreeNode>,
     pub ast : ASTRoot
 }
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index cd1e60a..8d084b3 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -2,30 +2,11 @@ use std::ops::Range;
 use std::str::CharIndices;
 
 use crate::ast::Span;
-use crate::errors::*;
+use crate::errors::ErrorCollector;
 use crate::util::const_str_position_in_tuples;
 
 pub type TokenTypeIdx = u8;
 
-#[derive(Debug, PartialEq, Clone, Copy)]
-pub struct Token {
-    typ : TokenTypeIdx,
-    from : usize,
-    to : usize
-}
-
-impl Token {
-    pub fn new(typ : TokenTypeIdx, range : Range<usize>) -> Self {
-        Self{typ, from : range.start, to : range.end}
-    }
-    pub fn get_type(&self) -> TokenTypeIdx {
-        self.typ
-    }
-    pub fn get_range(&self) -> Range<usize> {
-        self.from..self.to
-    }
-}
-
 pub const ALL_KEYWORDS : [(&'static str, u8); 20] = [
     ("template", 0),
     ("module", 0),
@@ -224,8 +205,39 @@ impl<'iter> Iterator for FileIter<'iter> {
     }
 }
 
-pub fn tokenize<'txt>(file_text : &'txt str, errors : &ErrorCollector) -> Vec<Token> {
-    let mut result : Vec<Token> = Vec::new();
+pub struct TokenizeResult {
+    pub token_types : Vec<TokenTypeIdx>,
+    // List of all boundaries. Starts at 0 in whitespace mode, then alternates: token start, token end (whitespace resumes), and so on.
+    // The span of token i is token_boundaries[i*2+1]..token_boundaries[i*2+2]
+    // The last boundary ends at the end of the file
+    pub token_boundaries : Vec<usize>
+}
+impl TokenizeResult {
+    fn new() -> Self {
+        TokenizeResult{token_types : Vec::new(), token_boundaries : vec![0]}
+    }
+    // Result can be used for error reporting
+    fn push(&mut self, typ : TokenTypeIdx, rng : Range<usize>) {
+        self.token_types.push(typ);
+        self.token_boundaries.push(rng.start);
+        self.token_boundaries.push(rng.end);
+    }
+    fn push_invalid<S : Into<String>>(&mut self, rng : Range<usize>, errors : &ErrorCollector, motivation : S) {
+        let new_idx = self.token_types.len();
+        self.push(TOKEN_INVALID, rng);
+        errors.error_basic(Span::from(new_idx), motivation);
+    }
+
+    pub fn len(&self) -> usize {
+        self.token_types.len()
+    }
+    pub fn get_token_range(&self, token_idx : usize) -> Range<usize> {
+        self.token_boundaries[token_idx*2+1]..self.token_boundaries[token_idx*2+2]
+    }
+}
+
+pub fn tokenize<'txt>(file_text : &'txt str, errors : &ErrorCollector) -> TokenizeResult {
+    let mut result = TokenizeResult::new();
     let mut file_char_iter = FileIter::new(file_text);
 
     while let Some((mut file_pos, cur_char)) = file_char_iter.next() {
@@ -239,22 +251,20 @@ pub fn tokenize<'txt>(file_text : &'txt str, errors : &ErrorCollector) -> Vec<Tok
             let mut word_chars = word.chars();
             if word_chars.next().unwrap().is_digit(10) {
                 // It's a number
                 if word_chars.find(|v| !v.is_digit(10)).is_some() {
-                    errors.error_basic(Span::from(result.len()), "Unexpected letter within number");
-                    result.push(Token::new(TOKEN_INVALID, file_pos..word_end));
+                    result.push_invalid(file_pos..word_end, errors, "Unexpected letter within number");
                 } else {
-                    result.push(Token::new(TOKEN_NUMBER, file_pos..word_end));
+                    result.push(TOKEN_NUMBER, file_pos..word_end);
                }
            } else if let Some(keyword_id) = const_str_position_in_tuples(word, &ALL_KEYWORDS) {
-                result.push(Token::new(keyword_id as TokenTypeIdx, file_pos..word_end));
+                result.push(keyword_id as TokenTypeIdx, file_pos..word_end);
            } else {
-                result.push(Token::new(TOKEN_IDENTIFIER, file_pos..word_end));
+                result.push(TOKEN_IDENTIFIER, file_pos..word_end);
            }
        }
    }
 
     result
 }
-- 
2.30.2
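
[Editor's note] The token_boundaries encoding is the subtle part of this patch, so here is a minimal standalone Rust sketch of the same scheme. It is not part of the patch: the names Boundaries, push_token, and token_range are illustrative only; what mirrors the patched code is the vec![0] seed and the i*2+1 indexing.

    use std::ops::Range;

    struct Boundaries {
        token_boundaries : Vec<usize>,
    }

    impl Boundaries {
        fn new() -> Self {
            Boundaries{token_boundaries : vec![0]} // boundary 0: leading whitespace starts at offset 0
        }
        fn push_token(&mut self, rng : Range<usize>) {
            self.token_boundaries.push(rng.start); // whitespace ends, token begins
            self.token_boundaries.push(rng.end);   // token ends, whitespace resumes
        }
        fn token_range(&self, i : usize) -> Range<usize> {
            self.token_boundaries[i*2 + 1]..self.token_boundaries[i*2 + 2]
        }
    }

    fn main() {
        let file_text = "module foo";
        let mut b = Boundaries::new();
        b.push_token(0..6);  // "module"
        b.push_token(7..10); // "foo"
        assert_eq!(&file_text[b.token_range(0)], "module");
        assert_eq!(&file_text[b.token_range(1)], "foo");
        println!("{:?}", b.token_boundaries); // prints [0, 0, 6, 7, 10]
    }

The vec![0] seed is what makes the odd/even indexing work: the vector always starts in whitespace mode, so token i's text sits between boundaries 2i+1 and 2i+2, and the whitespace runs between tokens fall out of the same array for free, which is what the LSP whitespace-walking code above relies on.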