Merge pull request #212 from VisLab/update-tokenizer

Update tokenizer -- removed extra delimiter tests in parser
hed-standard · Nov 4, 2024 · 9cc9bcc · 9cc9bcc
2 parents 1f4764f + 08956d8
commit 9cc9bcc
Show file tree

Hide file tree

Showing 6 changed files with 166 additions and 152 deletions.
diff --git a/parser/parser.js b/parser/parser.js
@@ -34,116 +34,6 @@ class HedStringParser {
     this.hedSchemas = hedSchemas
   }
 
-  /**
-   * Check if the parentheses in a tag group match.
-   *
-   * @returns {Issue[]} Any issues found related to unmatched parentheses.
-   */
-  _countTagGroupParentheses() {
-    const issues = []
-    const numberOfOpeningParentheses = getCharacterCount(this.hedString, openingGroupCharacter)
-    const numberOfClosingParentheses = getCharacterCount(this.hedString, closingGroupCharacter)
-
-    if (numberOfOpeningParentheses !== numberOfClosingParentheses) {
-      issues.push(
-        generateIssue('parentheses', {
-          opening: numberOfOpeningParentheses,
-          closing: numberOfClosingParentheses,
-        }),
-      )
-    }
-
-    return issues
-  }
-
-  /**
-   * Check if a comma is missing after an opening parenthesis.
-   *
-   * @param {string} lastNonEmptyCharacter The last non-empty character.
-   * @param {string} currentCharacter The current character in the HED string.
-   * @returns {boolean} Whether a comma is missing after a closing parenthesis.
-   */
-  _isCommaMissingAfterClosingParenthesis(lastNonEmptyCharacter, currentCharacter) {
-    return (
-      lastNonEmptyCharacter === closingGroupCharacter &&
-      !(delimiters.has(currentCharacter) || currentCharacter === closingGroupCharacter)
-    )
-  }
-
-  /**
-   * Find delimiter-related issues in a HED string.
-   *
-   * @returns {Issue[]} Any issues related to delimiters.
-   */
-  _findDelimiterIssues() {
-    const issues = []
-    let lastNonEmptyValidCharacter = ''
-    let lastNonEmptyValidIndex = 0
-    let currentTag = ''
-
-    for (let i = 0; i < this.hedString.length; i++) {
-      const currentCharacter = this.hedString.charAt(i)
-      currentTag += currentCharacter
-
-      if (stringIsEmpty(currentCharacter)) {
-        continue
-      }
-
-      if (delimiters.has(currentCharacter)) {
-        if (currentTag.trim() === currentCharacter) {
-          issues.push(
-            generateIssue('extraDelimiter', {
-              character: currentCharacter,
-              index: i,
-              string: this.hedString,
-            }),
-          )
-          currentTag = ''
-          continue
-        }
-        currentTag = ''
-      } else if (currentCharacter === openingGroupCharacter) {
-        if (currentTag.trim() !== openingGroupCharacter) {
-          issues.push(generateIssue('commaMissing', { tag: currentTag }))
-        }
-        currentTag = ''
-      } else if (this._isCommaMissingAfterClosingParenthesis(lastNonEmptyValidCharacter, currentCharacter)) {
-        issues.push(
-          generateIssue('commaMissing', {
-            tag: currentTag.slice(0, -1),
-          }),
-        )
-        break
-      }
-
-      lastNonEmptyValidCharacter = currentCharacter
-      lastNonEmptyValidIndex = i
-    }
-
-    if (delimiters.has(lastNonEmptyValidCharacter)) {
-      issues.push(
-        generateIssue('extraDelimiter', {
-          character: lastNonEmptyValidCharacter,
-          index: lastNonEmptyValidIndex,
-          string: this.hedString,
-        }),
-      )
-    }
-
-    return issues
-  }
-
-  /**
-   * Validate the full unparsed HED string.
-   *
-   * @returns {Object<string, Issue[]>} Any issues found during validation.
-   */
-  _validateFullUnparsedHedString() {
-    const delimiterIssues = [].concat(this._countTagGroupParentheses(), this._findDelimiterIssues())
-
-    return { delimiter: delimiterIssues }
-  }
-
   /**
    * Parse a full HED string.
    *
@@ -154,14 +44,7 @@ class HedStringParser {
       return [this.hedString, {}]
     }
 
-    const fullStringIssues = this._validateFullUnparsedHedString()
-    if (fullStringIssues.delimiter.length > 0) {
-      fullStringIssues.syntax = []
-      return [null, fullStringIssues]
-    }
-
-    const [parsedTags, splitIssues] = new HedStringSplitter(this.hedString, this.hedSchemas).splitHedString()
-    const parsingIssues = Object.assign(fullStringIssues, splitIssues)
+    const [parsedTags, parsingIssues] = new HedStringSplitter(this.hedString, this.hedSchemas).splitHedString()
     if (parsedTags === null) {
       return [null, parsingIssues]
     }

diff --git a/parser/tokenizer.js b/parser/tokenizer.js
@@ -179,6 +179,13 @@ export class HedStringTokenizer {
       this.pushIssue('emptyTagFound', this.state.lastDelimiter[1]) // Extra comma
     } else if (this.state.lastSlash >= 0 && this.hedString.slice(this.state.lastSlash + 1).trim().length === 0) {
       this.pushIssue('extraSlash', this.state.lastSlash) // Extra slash
+    }
+    if (
+      this.state.currentToken.trim().length > 0 &&
+      ![undefined, CHARACTERS.COMMA].includes(this.state.lastDelimiter[0])
+    ) {
+      // Missing comma
+      this.pushIssue('commaMissing', this.state.lastDelimiter[1] + 1)
     } else {
       if (this.state.currentToken.trim().length > 0) {
         this.pushTag(this.hedString.length)
@@ -261,6 +268,10 @@ export class HedStringTokenizer {
   handleOpeningGroup(i) {
     if (this.state.lastDelimiter[0] === CHARACTERS.OPENING_COLUMN) {
       this.pushIssue('unclosedCurlyBrace', this.state.lastDelimiter[1])
+    } else if (this.state.lastDelimiter[0] === CHARACTERS.CLOSING_COLUMN) {
+      this.pushIssue('commaMissing', this.state.lastDelimiter[1])
+    } else if (this.state.currentToken.trim().length > 0) {
+      this.pushInvalidTag('commaMissing', i, this.state.currentToken.trim())
     } else {
       this.state.currentGroupStack.push([])
       this.state.parenthesesStack.push(new GroupSpec(i, undefined, []))

diff --git a/tests/event.spec.js b/tests/event.spec.js
@@ -61,7 +61,8 @@ describe('HED string and event validation', () => {
     describe('Full HED Strings', () => {
       const validatorSyntactic = validatorSyntacticBase
 
-      it('should not have mismatched parentheses', () => {
+      //TODO: Remove this test -- the separate test for mismatched parentheses is no longer performed.
+      it.skip('should not have mismatched parentheses', () => {
         const testStrings = {
           extraOpening:
             'Action/Reach/To touch,((Attribute/Object side/Left,Participant/Effect/Body part/Arm),Attribute/Location/Screen/Top/70 px,Attribute/Location/Screen/Left/23 px',
@@ -84,7 +85,8 @@ describe('HED string and event validation', () => {
         validatorSyntactic(testStrings, expectedIssues, (validator) => {})
       })
 
-      it('should not have malformed delimiters', () => {
+      // TODO: This test is replaced by tokenizer tests and should be removed
+      it.skip('should not have malformed delimiters', () => {
         const testStrings = {
           missingOpeningComma:
             'Action/Reach/To touch(Attribute/Object side/Left,Participant/Effect/Body part/Arm),Attribute/Location/Screen/Top/70 px,Attribute/Location/Screen/Left/23 px',

diff --git a/tests/testData/tokenizerTests.data.js b/tests/testData/tokenizerTests.data.js
@@ -268,36 +268,6 @@ export const tokenizerTests = [
       },
     ],
   },
-  {
-    name: 'invalid-empty-tag-in-various-places',
-    description: 'Empty tags in various places (empty groups are allowed).',
-    tests: [
-      {
-        testname: 'end-in-comma',
-        explanation: 'Cannot end in a comma',
-        string: 'x,y,',
-        tagSpecs: [],
-        groupSpec: null,
-        errors: [generateIssue('emptyTagFound', { index: '3', string: 'x,y,' })],
-      },
-      {
-        testname: 'double-in-comma',
-        explanation: 'Cannot have double commas',
-        string: 'x,,y,',
-        tagSpecs: [],
-        groupSpec: null,
-        errors: [generateIssue('emptyTagFound', { index: '1', string: 'x,,y,' })],
-      },
-      {
-        testname: 'leading-comma',
-        string: ',x,y',
-        explanation: 'Cannot have a leading comma',
-        tagSpecs: [],
-        groupSpec: null,
-        errors: [generateIssue('emptyTagFound', { index: '0', string: ',x,y' })],
-      },
-    ],
-  },
   {
     name: 'invalid-extra-slash-in-various-places',
     description: 'Tags cannot have leading or trailing, or extra slashes',
@@ -352,6 +322,124 @@ export const tokenizerTests = [
       },
     ],
   },
+  {
+    name: 'invalid-commas',
+    description: 'Commas must separate tags and groups',
+    tests: [
+      {
+        testname: 'end-in-comma',
+        explanation: 'Cannot end in a comma',
+        string: 'x,y,',
+        tagSpecs: [],
+        groupSpec: null,
+        errors: [generateIssue('emptyTagFound', { index: '3', string: 'x,y,' })],
+      },
+      {
+        testname: 'double-in-comma',
+        explanation: 'Cannot have double commas',
+        string: 'x,,y,',
+        tagSpecs: [],
+        groupSpec: null,
+        errors: [generateIssue('emptyTagFound', { index: '1', string: 'x,,y,' })],
+      },
+      {
+        testname: 'leading-comma',
+        string: ',x,y',
+        explanation: 'Cannot have a leading comma',
+        tagSpecs: [],
+        groupSpec: null,
+        errors: [generateIssue('emptyTagFound', { index: '0', string: ',x,y' })],
+      },
+      {
+        testname: 'missing-comma-before-open',
+        explanation: 'Must have a comma before open parentheses',
+        string: 'x, y(z)',
+        tagSpecs: [],
+        groupSpec: null,
+        errors: [generateIssue('commaMissing', { index: '4', string: 'x, y(z)', tag: 'y' })],
+      },
+      {
+        testname: 'missing-comma-before-close',
+        explanation: 'Must have a comma after closing parentheses',
+        string: 'x, (y)zp, (w)',
+        tagSpecs: [],
+        groupSpec: null,
+        errors: [generateIssue('invalidTag', { index: '8', string: 'x, (y)zp, (w)' })],
+      },
+      {
+        testname: 'missing-comma-before-close-at-end',
+        explanation: 'Must have a comma after closing parenthesis at end of string',
+        string: 'x, (y)z',
+        tagSpecs: [],
+        groupSpec: null,
+        errors: [generateIssue('commaMissing', { index: '6', string: 'x, (y)z' })],
+      },
+      {
+        testname: 'missing-comma-before-open-column',
+        explanation: 'Must have a comma before open brace',
+        string: 'x, y, {(z)',
+        tagSpecs: [],
+        groupSpec: null,
+        errors: [generateIssue('unclosedCurlyBrace', { index: '6', string: 'x, y, {(z)' })],
+      },
+      {
+        testname: 'missing-close-brace-before-parentheses',
+        explanation: 'Must have a closed-brace-after-open-brace',
+        string: 'x, y, {w(z)',
+        tagSpecs: [],
+        groupSpec: null,
+        errors: [generateIssue('unclosedCurlyBrace', { index: '6', string: 'x, y, {w(z)' })],
+      },
+      {
+        testname: 'missing-comma-before-closing-column',
+        explanation: 'Must have a comma before closing brace',
+        string: 'x, {y}(z)',
+        tagSpecs: [],
+        groupSpec: null,
+        errors: [generateIssue('commaMissing', { index: '5', string: 'x, {y}(z)' })],
+      },
+      {
+        testname: 'extra-leading-comma',
+        explanation: 'Leading comma not allowed',
+        string: ',x, (y), z',
+        tagSpecs: [],
+        groupSpec: null,
+        errors: [generateIssue('emptyTagFound', { index: '0', string: ',x, (y), z' })],
+      },
+      {
+        testname: 'extra-closing-comma',
+        explanation: 'Closing comma not allowed',
+        string: 'x, (y), z,',
+        tagSpecs: [],
+        groupSpec: null,
+        errors: [generateIssue('emptyTagFound', { index: '9', string: 'x, (y), z,' })],
+      },
+      {
+        testname: 'multiple-leading-commas',
+        explanation: 'Multiple leading commas not allowed',
+        string: ',,x, (y), z',
+        tagSpecs: [],
+        groupSpec: null,
+        errors: [generateIssue('emptyTagFound', { index: '0', string: ',,x, (y), z' })],
+      },
+      {
+        testname: 'multiple-closing-commas',
+        explanation: 'Multiple closing comma not allowed',
+        string: 'x, (y), z,,',
+        tagSpecs: [],
+        groupSpec: null,
+        errors: [generateIssue('emptyTagFound', { index: '9', string: 'x, (y), z,,' })],
+      },
+      {
+        testname: 'multiple-internal-commas',
+        explanation: 'Multiple closing commas not allowed',
+        string: 'x, (y),, z',
+        tagSpecs: [],
+        groupSpec: null,
+        errors: [generateIssue('emptyTagFound', { index: '6', string: 'x, (y),, z' })],
+      },
+    ],
+  },
   {
     name: 'invalid-improper-curly-braces',
     description: 'Curly braces cannot have commas or parentheses or other curly braces',
@@ -398,4 +486,34 @@ export const tokenizerTests = [
       },
     ],
   },
+  {
+    name: 'invalid-parenthesis',
+    description: 'Various types of mismatched parentheses',
+    tests: [
+      {
+        testname: 'extra-opening-parentheses',
+        explanation: 'Parentheses must match',
+        string: 'x, (((y, z), w), u',
+        tagSpecs: [],
+        groupSpec: null,
+        errors: [generateIssue('unclosedParenthesis', { index: '3', string: 'x, (((y, z), w), u' })],
+      },
+      {
+        testname: 'extra-closing-parentheses',
+        explanation: 'Cannot parentheses inside curly braces',
+        string: 'x, (y, z)), w',
+        tagSpecs: [],
+        groupSpec: null,
+        errors: [generateIssue('unopenedParenthesis', { index: '9', string: 'x, (y, z)), w' })],
+      },
+      {
+        testname: 'parentheses-in-wrong-order',
+        explanation: 'Nesting must be proper',
+        string: '((x),y))(z',
+        tagSpecs: [],
+        groupSpec: null,
+        errors: [generateIssue('unopenedParenthesis', { index: '7', string: '((x),y))(z' })],
+      },
+    ],
+  },
 ]
diff --git a/tests/tokenizerTests.spec.js b/tests/tokenizerTests.spec.js
@@ -10,7 +10,7 @@ import { tokenizerTests } from './testData/tokenizerTests.data'
 const runAll = true
 let onlyRun = new Map()
 if (!runAll) {
-  onlyRun = new Map([['invalid-empty-tag-in-various-places', ['end-in-comma']]])
+  onlyRun = new Map([['invalid-comma-missing-or-extra', ['missing-comma-before-close']]])
 }
 
 describe('Tokenizer validation using JSON tests', () => {

diff --git a/validator/event/init.js b/validator/event/init.js
@@ -31,7 +31,7 @@ const initiallyValidateHedString = function (hedString, hedSchemas, options, def
   }
   if (parsedString === null) {
     return [null, [].concat(...Object.values(parsingIssues)), null]
-  } else if (parsingIssues.syntax.length + parsingIssues.delimiter.length > 0) {
+  } else if (parsingIssues.syntax.length > 0) {
     hedSchemas = new Schemas(null)
   }
   let hedValidator