diff --git a/parser/splitHedString.js b/parser/splitHedString.js index 387377d5..5b2716a7 100644 --- a/parser/splitHedString.js +++ b/parser/splitHedString.js @@ -1,25 +1,11 @@ -import flattenDeep from 'lodash/flattenDeep' - import { ParsedHed3Tag, ParsedHedTag } from './parsedHedTag' import ParsedHedColumnSplice from './parsedHedColumnSplice' import ParsedHedGroup from './parsedHedGroup' import { Schemas } from '../common/schema/types' -import { generateIssue } from '../common/issues/issues' import { recursiveMap } from '../utils/array' -import { replaceTagNameWithPound } from '../utils/hedStrings' import { mergeParsingIssues } from '../utils/hedData' -import { stringIsEmpty } from '../utils/string' import { ParsedHed2Tag } from '../validator/hed2/parser/parsedHed2Tag' - -const openingGroupCharacter = '(' -const closingGroupCharacter = ')' -const openingColumnCharacter = '{' -const closingColumnCharacter = '}' -const commaCharacter = ',' -const colonCharacter = ':' -const slashCharacter = '/' -const invalidCharacters = new Set(['[', ']', '~', '"']) -const invalidCharactersOutsideOfValues = new Set([':']) +import { HedStringTokenizer, ColumnSpliceSpec, TagSpec } from './tokenizer' const generationToClass = [ ParsedHedTag, @@ -28,380 +14,6 @@ const generationToClass = [ ParsedHed3Tag, ] -/** - * A specification for a tokenized substring. - */ -class SubstringSpec { - /** - * The starting and ending bounds of the substring. - * @type {number[]} - */ - bounds - - constructor(start, end) { - this.bounds = [start, end] - } -} - -/** - * A specification for a tokenized tag. - */ -class TagSpec extends SubstringSpec { - /** - * The tag this spec represents. - * @type {string} - */ - tag - /** - * The schema prefix for this tag, if any. - * @type {string} - */ - library - - constructor(tag, start, end, librarySchema) { - super(start, end) - - this.tag = tag.trim() - this.library = librarySchema - } -} - -/** - * A specification for a tokenized tag group. - */ -class GroupSpec extends SubstringSpec { - /** - * The child group specifications. - * @type {GroupSpec[]} - */ - children - - constructor(start, end) { - super(start, end) - - this.children = [] - } -} - -/** - * A specification for a tokenized column splice template. - */ -class ColumnSpliceSpec extends SubstringSpec { - /** - * The column name this spec refers to. - * @type {string} - */ - columnName - - constructor(name, start, end) { - super(start, end) - - this.columnName = name.trim() - } -} - -/** - * Class for tokenizing HED strings. - */ -class HedStringTokenizer { - /** - * The HED string being parsed. - * @type {string} - */ - hedString - - syntaxIssues - - /** - * The current substring being parsed. - * @type {string} - */ - currentTag - - groupDepth - startingIndex - resetStartingIndex - slashFound - librarySchema - currentGroupStack - parenthesesStack - ignoringCharacters - - constructor(hedString) { - this.hedString = hedString - } - - /** - * Split the HED string into delimiters and tags. - * - * @returns {[TagSpec[], GroupSpec, Object]} The tag specifications, group bounds, and any issues found. - */ - tokenize() { - this.initializeTokenizer() - - for (let i = 0; i < this.hedString.length; i++) { - const character = this.hedString.charAt(i) - this.tokenizeCharacter(i, character) - if (this.resetStartingIndex) { - this.resetStartingIndex = false - this.startingIndex = i + 1 - this.currentTag = '' - } - } - this.pushTag(this.hedString.length) - - if (this.columnSpliceIndex >= 0) { - this.syntaxIssues.push( - generateIssue('unclosedCurlyBrace', { - index: this.columnSpliceIndex, - string: this.hedString, - }), - ) - } - - this.unwindGroupStack() - - const tagSpecs = this.currentGroupStack.pop() - const groupSpecs = this.parenthesesStack.pop() - const issues = { - syntax: this.syntaxIssues, - conversion: [], - } - return [tagSpecs, groupSpecs, issues] - } - - initializeTokenizer() { - this.syntaxIssues = [] - - this.currentTag = '' - this.groupDepth = 0 - this.startingIndex = 0 - this.resetStartingIndex = false - this.slashFound = false - this.librarySchema = '' - this.columnSpliceIndex = -1 - this.currentGroupStack = [[]] - this.parenthesesStack = [new GroupSpec(0, this.hedString.length)] - this.ignoringCharacters = false - } - - tokenizeCharacter(i, character) { - let dispatchTable - if (this.ignoringCharacters) { - dispatchTable = { - [closingGroupCharacter]: (i, character) => { - this.clearTag() - this.closingGroupCharacter(i) - }, - [commaCharacter]: (i, character) => this.clearTag(), - } - } else { - dispatchTable = { - [openingGroupCharacter]: (i, character) => this.openingGroupCharacter(i), - [closingGroupCharacter]: (i, character) => { - this.pushTag(i) - this.closingGroupCharacter(i) - }, - [openingColumnCharacter]: (i, character) => this.openingColumnCharacter(i), - [closingColumnCharacter]: (i, character) => this.closingColumnCharacter(i), - [commaCharacter]: (i, character) => this.pushTag(i), - [colonCharacter]: (i, character) => this.colonCharacter(character), - [slashCharacter]: (i, character) => this.slashCharacter(character), - } - } - const characterHandler = dispatchTable[character] - if (characterHandler) { - characterHandler(i, character) - } else { - this.otherCharacter(character) - } - } - - openingGroupCharacter(i) { - this.currentGroupStack.push([]) - this.parenthesesStack.push(new GroupSpec(i)) - this.resetStartingIndex = true - this.groupDepth++ - } - - closingGroupCharacter(i) { - if (this.groupDepth <= 0) { - this.syntaxIssues.push( - generateIssue('unopenedParenthesis', { - index: i, - string: this.hedString, - }), - ) - return - } - this.closeGroup(i) - } - - openingColumnCharacter(i) { - if (this.currentTag.length > 0) { - this.syntaxIssues.push( - generateIssue('invalidCharacter', { - character: openingColumnCharacter, - index: i, - string: this.hedString, - }), - ) - this.ignoringCharacters = true - return - } - if (this.columnSpliceIndex >= 0) { - this.syntaxIssues.push( - generateIssue('nestedCurlyBrace', { - index: i, - string: this.hedString, - }), - ) - } - this.columnSpliceIndex = i - } - - closingColumnCharacter(i) { - if (this.columnSpliceIndex < 0) { - this.syntaxIssues.push( - generateIssue('unopenedCurlyBrace', { - index: i, - string: this.hedString, - }), - ) - return - } - if (!stringIsEmpty(this.currentTag)) { - this.currentGroupStack[this.groupDepth].push(new ColumnSpliceSpec(this.currentTag, this.startingIndex, i)) - } else { - this.syntaxIssues.push( - generateIssue('emptyCurlyBrace', { - string: this.hedString, - }), - ) - } - this.columnSpliceIndex = -1 - this.resetStartingIndex = true - this.slashFound = false - } - - colonCharacter(character) { - if (!this.slashFound && !this.librarySchema) { - this.librarySchema = this.currentTag - this.resetStartingIndex = true - } else { - this.currentTag += character - } - } - - slashCharacter(character) { - this.slashFound = true - this.currentTag += character - } - - otherCharacter(character) { - if (this.ignoringCharacters) { - return - } - this.currentTag += character - this.resetStartingIndex = stringIsEmpty(this.currentTag) - } - - unwindGroupStack() { - // groupDepth is decremented in closeGroup. - // eslint-disable-next-line no-unmodified-loop-condition - while (this.groupDepth > 0) { - this.syntaxIssues.push( - generateIssue('unclosedParenthesis', { - index: this.parenthesesStack[this.parenthesesStack.length - 1].bounds[0], - string: this.hedString, - }), - ) - this.closeGroup(this.hedString.length) - } - } - - pushTag(i) { - if (!stringIsEmpty(this.currentTag) && this.columnSpliceIndex < 0) { - this.currentGroupStack[this.groupDepth].push( - new TagSpec(this.currentTag, this.startingIndex, i, this.librarySchema), - ) - } - this.resetStartingIndex = true - this.slashFound = false - this.librarySchema = '' - } - - clearTag() { - this.ignoringCharacters = false - this.resetStartingIndex = true - this.slashFound = false - this.librarySchema = '' - } - - closeGroup(i) { - const groupSpec = this.parenthesesStack.pop() - groupSpec.bounds[1] = i + 1 - this.parenthesesStack[this.groupDepth - 1].children.push(groupSpec) - this.currentGroupStack[this.groupDepth - 1].push(this.currentGroupStack.pop()) - this.groupDepth-- - } -} - -/** - * Check the split HED tags for invalid characters - * - * @param {string} hedString The HED string to be split. - * @param {SubstringSpec[]} tagSpecs The tag specifications. - * @returns {Object} Any issues found. - */ -const checkForInvalidCharacters = function (hedString, tagSpecs) { - const syntaxIssues = [] - const flatTagSpecs = flattenDeep(tagSpecs) - - for (const tagSpec of flatTagSpecs) { - if (tagSpec instanceof ColumnSpliceSpec) { - continue - } - const alwaysInvalidIssues = checkTagForInvalidCharacters(hedString, tagSpec, tagSpec.tag, invalidCharacters) - const valueTag = replaceTagNameWithPound(tagSpec.tag) - const outsideValueIssues = checkTagForInvalidCharacters( - hedString, - tagSpec, - valueTag, - invalidCharactersOutsideOfValues, - ) - syntaxIssues.push(...alwaysInvalidIssues, ...outsideValueIssues) - } - - return { syntax: syntaxIssues, conversion: [] } -} - -/** - * Check an individual tag for invalid characters. - * - * @param {string} hedString The HED string to be split. - * @param {TagSpec} tagSpec A tag specification. - * @param {string} tag The tag form to be checked. - * @param {Set} invalidSet The set of invalid characters. - * @returns {Issue[]} Any issues found. - */ -const checkTagForInvalidCharacters = function (hedString, tagSpec, tag, invalidSet) { - const issues = [] - for (let i = 0; i < tag.length; i++) { - const character = tag.charAt(i) - if (invalidSet.has(character)) { - issues.push( - generateIssue('invalidCharacter', { - character: character, - index: tagSpec.bounds[0] + i, - string: hedString, - }), - ) - } - } - return issues -} - /** * Create the parsed HED tag and group objects. * @@ -460,13 +72,11 @@ const createParsedTags = function (hedString, hedSchemas, tagSpecs, groupSpecs) * @returns {[ParsedHedSubstring[], Object]} The parsed HED string data and any issues found. */ export default function splitHedString(hedString, hedSchemas) { - const [tagSpecs, groupBounds, splitIssues] = new HedStringTokenizer(hedString).tokenize() - const characterIssues = checkForInvalidCharacters(hedString, tagSpecs) - mergeParsingIssues(splitIssues, characterIssues) - if (splitIssues.syntax.length > 0) { - return [null, splitIssues] + const [tagSpecs, groupBounds, tokenizingIssues] = new HedStringTokenizer(hedString).tokenize() + if (tokenizingIssues.syntax.length > 0) { + return [null, tokenizingIssues] } const [parsedTags, parsingIssues] = createParsedTags(hedString, hedSchemas, tagSpecs, groupBounds) - mergeParsingIssues(splitIssues, parsingIssues) - return [parsedTags, splitIssues] + mergeParsingIssues(tokenizingIssues, parsingIssues) + return [parsedTags, tokenizingIssues] } diff --git a/parser/tokenizer.js b/parser/tokenizer.js new file mode 100644 index 00000000..46d10e8d --- /dev/null +++ b/parser/tokenizer.js @@ -0,0 +1,364 @@ +import { generateIssue } from '../common/issues/issues' +import { stringIsEmpty } from '../utils/string' +import { replaceTagNameWithPound } from '../utils/hedStrings' + +const openingGroupCharacter = '(' +const closingGroupCharacter = ')' +const openingColumnCharacter = '{' +const closingColumnCharacter = '}' +const commaCharacter = ',' +const colonCharacter = ':' +const slashCharacter = '/' + +const invalidCharacters = new Set(['[', ']', '~', '"']) +const invalidCharactersOutsideOfValues = new Set([':']) + +/** + * A specification for a tokenized substring. + */ +export class SubstringSpec { + /** + * The starting and ending bounds of the substring. + * @type {number[]} + */ + bounds + + constructor(start, end) { + this.bounds = [start, end] + } +} + +/** + * A specification for a tokenized tag. + */ +export class TagSpec extends SubstringSpec { + /** + * The tag this spec represents. + * @type {string} + */ + tag + /** + * The schema prefix for this tag, if any. + * @type {string} + */ + library + + constructor(tag, start, end, librarySchema) { + super(start, end) + + this.tag = tag.trim() + this.library = librarySchema + } +} + +/** + * A specification for a tokenized tag group. + */ +export class GroupSpec extends SubstringSpec { + /** + * The child group specifications. + * @type {GroupSpec[]} + */ + children + + constructor(start, end) { + super(start, end) + + this.children = [] + } +} + +/** + * A specification for a tokenized column splice template. + */ +export class ColumnSpliceSpec extends SubstringSpec { + /** + * The column name this spec refers to. + * @type {string} + */ + columnName + + constructor(name, start, end) { + super(start, end) + + this.columnName = name.trim() + } +} + +/** + * Class for tokenizing HED strings. + */ +export class HedStringTokenizer { + /** + * The HED string being parsed. + * @type {string} + */ + hedString + + syntaxIssues + + /** + * The current substring being parsed. + * @type {string} + */ + currentTag + + groupDepth + startingIndex + resetStartingIndex + slashFound + librarySchema + currentGroupStack + parenthesesStack + ignoringCharacters + + constructor(hedString) { + this.hedString = hedString + } + + /** + * Split the HED string into delimiters and tags. + * + * @returns {[TagSpec[], GroupSpec, Object]} The tag specifications, group bounds, and any issues found. + */ + tokenize() { + this.initializeTokenizer() + + for (let i = 0; i < this.hedString.length; i++) { + const character = this.hedString.charAt(i) + this.tokenizeCharacter(i, character) + if (this.resetStartingIndex) { + this.resetStartingIndex = false + this.startingIndex = i + 1 + this.currentTag = '' + } + } + this.pushTag(this.hedString.length) + + if (this.columnSpliceIndex >= 0) { + this.syntaxIssues.push( + generateIssue('unclosedCurlyBrace', { + index: this.columnSpliceIndex, + string: this.hedString, + }), + ) + } + + this.unwindGroupStack() + + const tagSpecs = this.currentGroupStack.pop() + const groupSpecs = this.parenthesesStack.pop() + const issues = { + syntax: this.syntaxIssues, + conversion: [], + } + return [tagSpecs, groupSpecs, issues] + } + + initializeTokenizer() { + this.syntaxIssues = [] + + this.currentTag = '' + this.groupDepth = 0 + this.startingIndex = 0 + this.resetStartingIndex = false + this.slashFound = false + this.librarySchema = '' + this.columnSpliceIndex = -1 + this.currentGroupStack = [[]] + this.parenthesesStack = [new GroupSpec(0, this.hedString.length)] + this.ignoringCharacters = false + } + + tokenizeCharacter(i, character) { + let dispatchTable + if (this.ignoringCharacters) { + dispatchTable = { + [closingGroupCharacter]: (i /* character */) => { + this.clearTag() + this.closingGroupCharacter(i) + }, + [commaCharacter]: (/*i, character */) => this.clearTag(), + } + } else { + dispatchTable = { + [openingGroupCharacter]: (i /* character */) => this.openingGroupCharacter(i), + [closingGroupCharacter]: (i /* character */) => { + this.pushTag(i) + this.closingGroupCharacter(i) + }, + [openingColumnCharacter]: (i /* character */) => this.openingColumnCharacter(i), + [closingColumnCharacter]: (i /* character */) => this.closingColumnCharacter(i), + [commaCharacter]: (i /* character */) => this.pushTag(i), + [colonCharacter]: (i, character) => this.colonCharacter(character), + [slashCharacter]: (i, character) => this.slashCharacter(character), + } + } + const characterHandler = dispatchTable[character] + if (characterHandler) { + characterHandler(i, character) + } else if (invalidCharacters.has(character)) { + this.syntaxIssues.push( + generateIssue('invalidCharacter', { + character: character, + index: i, + string: this.hedString, + }), + ) + } else { + this.otherCharacter(character) + } + } + + openingGroupCharacter(i) { + this.currentGroupStack.push([]) + this.parenthesesStack.push(new GroupSpec(i)) + this.resetStartingIndex = true + this.groupDepth++ + } + + closingGroupCharacter(i) { + if (this.groupDepth <= 0) { + this.syntaxIssues.push( + generateIssue('unopenedParenthesis', { + index: i, + string: this.hedString, + }), + ) + return + } + this.closeGroup(i) + } + + openingColumnCharacter(i) { + if (this.currentTag.length > 0) { + this.syntaxIssues.push( + generateIssue('invalidCharacter', { + character: openingColumnCharacter, + index: i, + string: this.hedString, + }), + ) + this.ignoringCharacters = true + return + } + if (this.columnSpliceIndex >= 0) { + this.syntaxIssues.push( + generateIssue('nestedCurlyBrace', { + index: i, + string: this.hedString, + }), + ) + } + this.columnSpliceIndex = i + } + + closingColumnCharacter(i) { + if (this.columnSpliceIndex < 0) { + this.syntaxIssues.push( + generateIssue('unopenedCurlyBrace', { + index: i, + string: this.hedString, + }), + ) + return + } + if (!stringIsEmpty(this.currentTag)) { + this.currentGroupStack[this.groupDepth].push(new ColumnSpliceSpec(this.currentTag, this.startingIndex, i)) + } else { + this.syntaxIssues.push( + generateIssue('emptyCurlyBrace', { + string: this.hedString, + }), + ) + } + this.columnSpliceIndex = -1 + this.resetStartingIndex = true + this.slashFound = false + } + + colonCharacter(character) { + if (!this.slashFound && !this.librarySchema) { + this.librarySchema = this.currentTag + this.resetStartingIndex = true + } else { + this.currentTag += character + } + } + + slashCharacter(character) { + this.slashFound = true + this.currentTag += character + } + + otherCharacter(character) { + if (this.ignoringCharacters) { + return + } + this.currentTag += character + this.resetStartingIndex = stringIsEmpty(this.currentTag) + } + + unwindGroupStack() { + // groupDepth is decremented in closeGroup. + // eslint-disable-next-line no-unmodified-loop-condition + while (this.groupDepth > 0) { + this.syntaxIssues.push( + generateIssue('unclosedParenthesis', { + index: this.parenthesesStack[this.parenthesesStack.length - 1].bounds[0], + string: this.hedString, + }), + ) + this.closeGroup(this.hedString.length) + } + } + + pushTag(i) { + if (!stringIsEmpty(this.currentTag) && this.columnSpliceIndex < 0) { + this._checkValueTagForInvalidCharacters() + this.currentGroupStack[this.groupDepth].push( + new TagSpec(this.currentTag, this.startingIndex, i, this.librarySchema), + ) + } + this.resetStartingIndex = true + this.slashFound = false + this.librarySchema = '' + } + + clearTag() { + this.ignoringCharacters = false + this.resetStartingIndex = true + this.slashFound = false + this.librarySchema = '' + } + + closeGroup(i) { + const groupSpec = this.parenthesesStack.pop() + groupSpec.bounds[1] = i + 1 + this.parenthesesStack[this.groupDepth - 1].children.push(groupSpec) + this.currentGroupStack[this.groupDepth - 1].push(this.currentGroupStack.pop()) + this.groupDepth-- + } + + /** + * Check an individual tag for invalid characters. + * + * @private + */ + _checkValueTagForInvalidCharacters() { + const formToCheck = replaceTagNameWithPound(this.currentTag) + for (let i = 0; i < formToCheck.length; i++) { + const character = formToCheck.charAt(i) + if (!invalidCharactersOutsideOfValues.has(character)) { + continue + } + this.syntaxIssues.push( + generateIssue('invalidCharacter', { + character: character, + index: this.startingIndex + i, + string: this.hedString, + }), + ) + } + } +}