diff --git a/src/lexer/index.test.ts b/src/lexer/index.test.ts index 27f0597..ac451d7 100644 --- a/src/lexer/index.test.ts +++ b/src/lexer/index.test.ts @@ -27,6 +27,477 @@ import type { End, } from "./token"; +import type { + SourceToken, +} from "./source-token"; + +describe("getSourceToken()", () => { + describe("single token", () => { + const testLex = ({ input, expected }: { input: string, expected: SourceToken }) => { + const lexer = new Lexer(input); + + const token = lexer.getSourceToken(); + + expect(token).toEqual(expected); + }; + + describe("operators", () => { + const cases: { input: string, expected: SourceToken }[] = [ + { + input: "+", + expected: { + type: "operator", + value: "+", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + { + input: "-", + expected: { + type: "operator", + value: "-", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + { + input: "*", + expected: { + type: "operator", + value: "*", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + { + input: "/", + expected: { + type: "operator", + value: "/", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + { + input: "=", + expected: { + type: "operator", + value: "=", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + { + input: "==", + expected: { + type: "operator", + value: "==", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 1 }, + }, + }, + { + input: "!", + expected: { + type: "operator", + value: "!", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + { + input: "!=", + expected: { + type: "operator", + value: "!=", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 1 }, + }, + }, + { + input: ">", + expected: { + type: "operator", + value: ">", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + { + input: ">=", + expected: { + type: "operator", + value: ">=", + posBegin: { row: 0, col: 0 }, + posEnd: { 
row: 0, col: 1 }, + }, + }, + { + input: "<", + expected: { + type: "operator", + value: "<", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + { + input: "<=", + expected: { + type: "operator", + value: "<=", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 1 }, + }, + }, + ]; + + it.each(cases)("lex operator '$input'", testLex); + }); + + describe("delimiters", () => { + const cases: { input: string, expected: SourceToken }[] = [ + { + input: "(", + expected: { + type: "group delimiter", + value: "(", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + { + input: ")", + expected: { + type: "group delimiter", + value: ")", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + { + input: "{", + expected: { + type: "block delimiter", + value: "{", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + { + input: "}", + expected: { + type: "block delimiter", + value: "}", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + { + input: ",", + expected: { + type: "separator", + value: ",", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + ]; + + it.each(cases)("lex delimiter '$input'", testLex); + }); + + describe("number literals", () => { + const cases: { input: string, expected: SourceToken }[] = [ + { + input: "0", + expected: { + type: "number literal", + value: "0", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + { + input: "1", + expected: { + type: "number literal", + value: "1", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + { + input: "1234", + expected: { + type: "number literal", + value: "1234", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 3 }, + }, + }, + { + input: "12.34", + expected: { + type: "number literal", + value: "12.34", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 4 }, + }, + }, + ]; + + it.each(cases)("lex number literal '$input'", 
testLex); + }); + + describe("boolean literals", () => { + const cases: { input: string, expected: SourceToken }[] = [ + { + input: "참", + expected: { + type: "boolean literal", + value: "참", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + { + input: "거짓", + expected: { + type: "boolean literal", + value: "거짓", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 1 }, + }, + }, + ]; + + it.each(cases)("lex boolean literal '$input'", testLex); + }); + + describe("string literals", () => { + const cases: { input: string, expected: SourceToken }[] = [ + { + input: "'foo bar 123 !@# 참'", + expected: { + type: "string literal", + value: "foo bar 123 !@# 참", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 18 }, + }, + }, + { + input: "''", + expected: { + type: "string literal", + value: "", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 1 }, + }, + }, + ]; + + it.each(cases)("lex string literal '$input'", testLex); + }); + + describe("keywords", () => { + const cases: { input: string, expected: SourceToken }[] = [ + { + input: "만약", + expected: { + type: "keyword", + value: "만약", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 1 }, + }, + }, + { + input: "아니면", + expected: { + type: "keyword", + value: "아니면", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 2 }, + }, + }, + { + input: "함수", + expected: { + type: "keyword", + value: "함수", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 1 }, + }, + }, + { + input: "결과", + expected: { + type: "keyword", + value: "결과", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 1 }, + }, + }, + ]; + + it.each(cases)("lex keyword '$input'", testLex); + }); + + describe("identifiers", () => { + const cases: { input: string, expected: SourceToken }[] = [ + { + input: "Foo이름123_", + expected: { + type: "identifier", + value: "Foo이름123_", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 8 }, + }, + }, + { + input: "이름Foo123_", + expected: { + type: 
"identifier", + value: "이름Foo123_", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 8 }, + }, + }, + { + input: "_이름Foo123", + expected: { + type: "identifier", + value: "_이름Foo123", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 8 }, + }, + }, + ]; + + it.each(cases)("lex identifier '$input'", testLex); + }); + + describe("special", () => { + const cases: { input: string, expected: SourceToken }[] = [ + { + input: "$", + expected: { + type: "illegal", + value: "$", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + { + input: "'foo", + expected: { + type: "illegal string", + value: "foo", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 4 }, + }, + }, + { + input: "", + expected: { + type: "end", + value: "$end", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 0 }, + }, + }, + ]; + + it.each(cases)("lex special '$input'", testLex); + }); + }); + + describe("multiple tokens", () => { + const cases: { input: string, expectedTokens: SourceToken[] }[] = [ + { + input: "12 + 34", + expectedTokens: [ + { + type: "number literal", + value: "12", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 1 }, + }, + { + type: "operator", + value: "+", + posBegin: { row: 0, col: 3 }, + posEnd: { row: 0, col: 3 }, + }, + { + type: "number literal", + value: "34", + posBegin: { row: 0, col: 5 }, + posEnd: { row: 0, col: 6 }, + }, + { + type: "end", + value: "$end", + posBegin: { row: 0, col: 7 }, + posEnd: { row: 0, col: 7 }, + }, + ] + }, + { + input: "만약 참 {\n  12\r\n}", // two-space indent on the second row, so that 12 starts at row 1, col 2 + expectedTokens: [ + { + type: "keyword", + value: "만약", + posBegin: { row: 0, col: 0 }, + posEnd: { row: 0, col: 1 }, + }, + { + type: "boolean literal", + value: "참", + posBegin: { row: 0, col: 3 }, + posEnd: { row: 0, col: 3 }, + }, + { + type: "block delimiter", + value: "{", + posBegin: { row: 0, col: 5 }, + posEnd: { row: 0, col: 5 }, + }, + { + type: "number literal", + value: "12", + posBegin: { row: 1, col: 2 }, + posEnd: { row: 1, col: 3 }, 
+ }, + { + type: "block delimiter", + value: "}", + posBegin: { row: 2, col: 0 }, + posEnd: { row: 2, col: 0 }, + }, + { + type: "end", + value: "$end", + posBegin: { row: 2, col: 1 }, + posEnd: { row: 2, col: 1 }, + }, + ] + }, + ]; + + it.each(cases)("get tokens from input '$input'", ({ input, expectedTokens }) => { + const lexer = new Lexer(input); + + for (const expected of expectedTokens) { + const token = lexer.getSourceToken(); + expect(token).toEqual(expected); + } + }); + }); +}); + describe("getToken()", () => { describe("single token", () => { const testLexing = ({ input, expected }: { input: string, expected: TokenType }) => { diff --git a/src/lexer/index.ts b/src/lexer/index.ts index c464bbc..a60f30a 100644 --- a/src/lexer/index.ts +++ b/src/lexer/index.ts @@ -1,7 +1,31 @@ -import CharBuffer from "./char-buffer"; +import CharBuffer, { type SourceChar } from "./char-buffer"; import * as Token from "./token"; import * as Util from "./util"; +import { + createOperatorToken, + createGroupDelimiterToken, + createBlockDelimiterToken, + createSeparatorToken, + createIllegalToken, + createIllegalStringLiteralToken, + createNumberLiteralToken, + createBooleanLiteralToken, + createStringLiteralToken, + createKeywordToken, + createIdentifierToken, + createEndToken, +} from "./source-token"; +import type { + SourceToken, + OperatorToken, + NumberLiteralToken, + StringLiteralToken, + IllegalStringLiteralToken, +} from "./source-token"; +import type Position from "../util/position"; +import { isDigit, isLetter, isWhitespace } from "./util"; + export default class Lexer { private readonly charBuffer: CharBuffer; @@ -9,6 +33,303 @@ export default class Lexer { this.charBuffer = new CharBuffer(input); } + getSourceToken(): SourceToken { + this.skipWhitespaceChars(); + + const char = this.charBuffer.peekChar(); + switch (char.value) { + case "+": + case "-": + case "*": + case "/": + { + const { position } = this.charBuffer.popChar(); + const value = char.value; + + 
const token = createOperatorToken(value, position, position); + return token; + } + + case "(": + case ")": + { + const { position } = this.charBuffer.popChar(); + const value = char.value; + + const token = createGroupDelimiterToken(value, position, position); + return token; + } + + case "{": + case "}": + { + const { position } = this.charBuffer.popChar(); + const value = char.value; + + const token = createBlockDelimiterToken(value, position, position); + return token; + } + + case ",": + { + const { position } = this.charBuffer.popChar(); + const value = char.value; + + const token = createSeparatorToken(value, position, position); + return token; + } + + case "!": + { + const { position: pos } = this.charBuffer.popChar(); + + const token = this.lexCharsStartingWithBang(pos); + return token; + } + + case "=": + { + const { position: pos } = this.charBuffer.popChar(); + + const token = this.lexCharsStartingWithEqual(pos); + return token; + } + + case ">": + { + const { position: pos } = this.charBuffer.popChar(); + + const token = this.lexCharsStartingWithGreaterThan(pos); + return token; + } + + case "<": + { + const { position: pos } = this.charBuffer.popChar(); + + const token = this.lexCharsStartingWithLessThan(pos); + return token; + } + + case "'": + { + const { position: pos } = this.charBuffer.popChar(); + + const token = this.lexCharsStartingWithSingleQuote(pos); + return token; + } + + case CharBuffer.END_OF_INPUT: + { + const { position: pos } = this.charBuffer.popChar(); + + // TODO + const token = createEndToken(pos, pos); + return token; + } + + default: + { + if (isDigit(char.value)) { + const token = this.lexNumberLiteral(); + return token; + } + + if (isLetter(char.value)) { + const token = this.lexLetters(); + return token; + } + + const { position } = this.charBuffer.popChar(); + const token = createIllegalToken(char.value, position, position); + return token; + } + } + } + + private skipWhitespaceChars(): void { + while (true) { + const char 
= this.charBuffer.peekChar(); + if (!isWhitespace(char.value)) { + break; + } + + this.charBuffer.popChar(); + } + } + + /** assumes the bang character popped */ + private lexCharsStartingWithBang(bangPos: Position): OperatorToken { + const peek = this.charBuffer.peekChar(); + switch (peek.value) { + case "=": + { + const { position: posEnd } = this.charBuffer.popChar(); + return createOperatorToken("!=", bangPos, posEnd); + } + + default: + return createOperatorToken("!", bangPos, bangPos); + } + } + + /** assumes the equal character popped */ + private lexCharsStartingWithEqual(equalPos: Position): OperatorToken { + const peek = this.charBuffer.peekChar(); + switch (peek.value) { + case "=": + { + const { position: posEnd } = this.charBuffer.popChar(); + return createOperatorToken("==", equalPos, posEnd); + } + + default: + return createOperatorToken("=", equalPos, equalPos); + } + } + + /** assume the greater-than character popped */ + private lexCharsStartingWithGreaterThan(greaterThanPos: Position): OperatorToken { + const peek = this.charBuffer.peekChar(); + switch (peek.value) { + case "=": + { + const { position: posEnd } = this.charBuffer.popChar(); + return createOperatorToken(">=", greaterThanPos, posEnd); + } + + default: + return createOperatorToken(">", greaterThanPos, greaterThanPos); + } + } + + /** assume the less-than character popped */ + private lexCharsStartingWithLessThan(lessThanPos: Position): OperatorToken { + const peek = this.charBuffer.peekChar(); + switch (peek.value) { + case "=": + { + const { position: posEnd } = this.charBuffer.popChar(); + return createOperatorToken("<=", lessThanPos, posEnd); + } + + default: + return createOperatorToken("<", lessThanPos, lessThanPos); + } + } + + /** assume the single quote character popped */ + private lexCharsStartingWithSingleQuote(quotePos: Position): StringLiteralToken | IllegalStringLiteralToken { + const chars: SourceChar[] = []; + + while (true) { + const char = this.charBuffer.popChar(); 
+ + const isClosingQuote = char.value === "'"; + const isEndOfInput = char.value === CharBuffer.END_OF_INPUT; + if (!isClosingQuote && !isEndOfInput) { + chars.push(char); + continue; + } + + // terminator reached: join the collected chars once instead of on every iteration + const value = chars.map(c => c.value).join(""); + if (isClosingQuote) { + return createStringLiteralToken(value, quotePos, char.position); + } + return createIllegalStringLiteralToken(value, quotePos, char.position); + } + } + + private lexNumberLiteral(): NumberLiteralToken { + const wholeNumberPart = this.readDigitChars(); + const decimalPart = this.readDecimalChars(); + const numberChars = wholeNumberPart.concat(decimalPart); + + const value = numberChars.map(char => char.value).join(""); + const posBegin = numberChars[0].position; + const posEnd = numberChars[numberChars.length-1].position; + + const token = createNumberLiteralToken(value, posBegin, posEnd); + return token; + } + + /** + * lex a letter-led run of letters/digits into a boolean literal, keyword or identifier token; the caller must have peeked a letter + */ + private lexLetters(): SourceToken { + const letterChars = this.readLetterChars(); + + const value = letterChars.map(char => char.value).join(""); + const posBegin = letterChars[0].position; + const posEnd = letterChars[letterChars.length-1].position; + + // order is important; match keywords first, then identifier + switch (value) { + case "참": + case "거짓": + { + return createBooleanLiteralToken(value, posBegin, posEnd); + } + + case "만약": + case "아니면": + case "함수": + case "결과": + { + return createKeywordToken(value, posBegin, posEnd); + } + + default: + { + return createIdentifierToken(value, posBegin, posEnd); + } + } + } + + private readDigitChars(): SourceChar[] { + const chars: SourceChar[] = []; + while (true) { + const peek = this.charBuffer.peekChar(); + if (!isDigit(peek.value)) { + break; + } + + chars.push(this.charBuffer.popChar()); + } + + return chars; + } + + private readDecimalChars(): SourceChar[] { + // read decimal point; if not, early return + const maybeDot = this.charBuffer.peekChar(); + if (maybeDot.value !== ".") { + return []; + } + const dot = 
this.charBuffer.popChar(); + + // read and return decimal part + const digits = this.readDigitChars(); + const decimalChars = [dot].concat(digits); + return decimalChars; + } + + private readLetterChars(): SourceChar[] { + const chars: SourceChar[] = []; + while (true) { + const peek = this.charBuffer.peekChar(); + if (!isLetter(peek.value) && !isDigit(peek.value)) { + break; + } + + chars.push(this.charBuffer.popChar()); + } + + return chars; + } + + /** @deprecated */ getToken(): Token.TokenType { this.skipWhitespaces(); @@ -18,33 +339,33 @@ export default class Lexer { case "-": case "*": case "/": - { + { const operator = this.charBuffer.pop() as typeof char; return Token.operator(operator); } case "(": case ")": - { + { const delimiter = this.charBuffer.pop() as typeof char; return Token.groupDelimiter(delimiter); } case "{": case "}": - { + { const delimiter = this.charBuffer.pop() as typeof char; return Token.blockDelimiter(delimiter); } case ",": - { + { const separator = this.charBuffer.pop() as typeof char; return Token.separator(separator); } case "!": - { + { this.charBuffer.pop(); const operator = this.readOperatorStartingWithBang(); @@ -52,7 +373,7 @@ export default class Lexer { } case "=": - { + { this.charBuffer.pop(); const operator: "=" | "==" = this.readOperatorStartingWithEqual(); @@ -60,7 +381,7 @@ export default class Lexer { } case ">": - { + { this.charBuffer.pop(); const operator: ">" | ">=" = this.readOperatorStartingWithGreaterThan(); @@ -68,7 +389,7 @@ export default class Lexer { } case "<": - { + { this.charBuffer.pop(); const operator: "<" | "<=" = this.readOperatorStartingWithLessThan(); @@ -76,7 +397,7 @@ export default class Lexer { } case "'": - { + { this.charBuffer.pop(); const [str, ok] = this.readStringLiteral(); @@ -103,9 +424,9 @@ export default class Lexer { if ( read === "만약" || - read === "아니면" || - read === "함수" || - read === "결과" + read === "아니면" || + read === "함수" || + read === "결과" ) { return Token.keyword(read); } 
@@ -118,13 +439,14 @@ export default class Lexer { } } + /** @deprecated */ private skipWhitespaces(): void { while (Util.isWhitespace(this.charBuffer.peek())) { this.charBuffer.pop(); } } - /** assume the bang character popped */ + /** @deprecated assume the bang character popped */ private readOperatorStartingWithBang(): "!" | "!=" { switch (this.charBuffer.peek()) { case "=": @@ -136,7 +458,7 @@ export default class Lexer { } } - /** assume the equal character popped */ + /** @deprecated assume the equal character popped */ private readOperatorStartingWithEqual(): "=" | "==" { switch (this.charBuffer.peek()) { case "=": @@ -148,7 +470,7 @@ export default class Lexer { } } - /** assume the greater-than character popped */ + /** @deprecated assume the greater-than character popped */ private readOperatorStartingWithGreaterThan(): ">" | ">=" { switch (this.charBuffer.peek()) { case "=": @@ -160,7 +482,7 @@ export default class Lexer { } } - /** assume the less-than character popped */ + /** @deprecated assume the less-than character popped */ private readOperatorStartingWithLessThan(): "<" | "<=" { switch (this.charBuffer.peek()) { case "=": @@ -172,7 +494,7 @@ export default class Lexer { } } - /** return [string-literal, true] if ok; otherwise [string-read-so-far, false] */ + /** @deprecated return [string-literal, true] if ok; otherwise [string-read-so-far, false] */ private readStringLiteral(): [string, boolean] { const read: string[] = []; @@ -191,6 +513,7 @@ export default class Lexer { } } + /** @deprecated */ private readNumberLiteral(): string { const wholeNumberPart = this.readDigits(); const decimalPart = this.readDecimalPart(); @@ -200,6 +523,7 @@ export default class Lexer { return number; } + /** @deprecated */ private readDigits(): string { const read: string[] = []; while (Util.isDigit(this.charBuffer.peek())) { @@ -210,7 +534,7 @@ export default class Lexer { return digits; } - /** helper function for readNumberLiteral() method */ + /** @deprecated 
helper function for readNumberLiteral() method */ private readDecimalPart(): string { // read decimal point; if not, early return const maybeDecimalPoint = this.charBuffer.peek(); @@ -225,11 +549,12 @@ export default class Lexer { return decimalPart; } + /** @deprecated */ private readLettersAndDigits(): string { const read = []; while ( Util.isLetter(this.charBuffer.peek()) || - Util.isDigit(this.charBuffer.peek()) + Util.isDigit(this.charBuffer.peek()) ) { read.push(this.charBuffer.pop()); } diff --git a/src/lexer/token/index.ts b/src/lexer/token/index.ts index ee587de..cb8d1de 100644 --- a/src/lexer/token/index.ts +++ b/src/lexer/token/index.ts @@ -82,51 +82,61 @@ export interface End { value: EndValue } +/** @deprecated */ export const operator = (value: Operator["value"]): Operator => ({ type: "operator", value, }); +/** @deprecated */ export const identifier = (value: Identifier["value"]): Identifier => ({ type: "identifier", value, }); +/** @deprecated */ export const numberLiteral = (value: NumberLiteral["value"]): NumberLiteral => ({ type: "number literal", value, }); +/** @deprecated */ export const booleanLiteral = (value: BooleanLiteral["value"]): BooleanLiteral => ({ type: "boolean literal", value, }); +/** @deprecated */ export const stringLiteral = (value: StringLiteral["value"]): StringLiteral => ({ type: "string literal", value, }); +/** @deprecated */ export const groupDelimiter = (value: GroupDelimiter["value"]): GroupDelimiter => ({ type: "group delimiter", value, }); +/** @deprecated */ export const blockDelimiter = (value: BlockDelimiter["value"]): BlockDelimiter => ({ type: "block delimiter", value, }); +/** @deprecated */ export const separator = (value: Separator["value"]): Separator => ({ type: "separator", value, }); +/** @deprecated */ export const keyword = (value: Keyword["value"]): Keyword => ({ type: "keyword", value, }); +/** @deprecated */ export const illegal = (value: Illegal["value"]): Illegal => ({ type: "illegal", value, diff 
--git a/src/lexer/util/index.ts b/src/lexer/util/index.ts index 485d6d6..3d2276f 100644 --- a/src/lexer/util/index.ts +++ b/src/lexer/util/index.ts @@ -15,9 +15,9 @@ export const isDigit = (char: string): boolean => { }; export const isWhitespace = (char: string): boolean => { - if (char.length !== 1) { + if (char.length > 2) { return false; } - return /^[ \t\r\n]$/.test(char); + return /^(\r\n|[ \t\r\n])$/.test(char); }