Skip to content

Commit

Permalink
Move invalid character check to tokenizer
Browse files Browse the repository at this point in the history
This bans all characters in the C0 and C1 control code ranges, as well
as DEL (U+007F), using the unicode-name npm module to display the
Unicode names of any invalid characters in the reported issues.
  • Loading branch information
happy5214 committed Oct 11, 2024
1 parent 9def82b commit 613e64e
Show file tree
Hide file tree
Showing 8 changed files with 47 additions and 29 deletions.
2 changes: 1 addition & 1 deletion esbuild.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ await esbuild.build({
entryPoints: [path.join(process.cwd(), 'index.js')],
loader: { '.xml': 'text' },
outdir: path.join(process.cwd(), 'dist', 'commonjs'),
target: 'node12',
target: 'node18',
bundle: true,
sourcemap: true,
platform: 'node',
Expand Down
14 changes: 14 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
"pluralize": "^8.0.0",
"semver": "^7.6.0",
"string_decoder": "^1.3.0",
"unicode-name": "^1.0.2",
"xml2js": "^0.6.2"
},
"devDependencies": {
Expand Down Expand Up @@ -74,7 +75,10 @@
"transform": {
"\\.js$": "esbuild-runner/jest",
"\\.xml$": "<rootDir>/fileTransformer.js"
}
},
"transformIgnorePatterns": [
"node_modules/(?!unicode-name)"
]
},
"browser": {
"fs": false
Expand Down
2 changes: 1 addition & 1 deletion parser/splitHedString.js → parser/splitter.js
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ export default class HedStringSplitter {
ParsedHedTagConstructor

/**
* Constructor for the HED string parser.
* Constructor.
*
* @param {string} hedString The HED string to be split and parsed.
* @param {Schemas} hedSchemas The collection of HED schemas.
Expand Down
12 changes: 11 additions & 1 deletion parser/tokenizer.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import { unicodeName } from 'unicode-name'

import { generateIssue } from '../common/issues/issues'
import { stringIsEmpty } from '../utils/string'
import { replaceTagNameWithPound } from '../utils/hedStrings'
Expand All @@ -12,6 +14,14 @@ const slashCharacter = '/'

const invalidCharacters = new Set(['[', ']', '~', '"'])
const invalidCharactersOutsideOfValues = new Set([':'])
// C0 control codes
for (let i = 0x00; i <= 0x1f; i++) {
invalidCharacters.add(String.fromCodePoint(i))
}
// DEL and C1 control codes
for (let i = 0x7f; i <= 0x9f; i++) {
invalidCharacters.add(String.fromCodePoint(i))
}

/**
* A specification for a tokenized substring.
Expand Down Expand Up @@ -366,7 +376,7 @@ export class HedStringTokenizer {
_pushInvalidCharacterIssue(character, index) {
this.syntaxIssues.push(
generateIssue('invalidCharacter', {
character: character,
character: unicodeName(character),
index: index,
string: this.hedString,
}),
Expand Down
2 changes: 1 addition & 1 deletion tests/bids.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,7 @@ describe('BIDS datasets', () => {
syntax: [
BidsHedIssue.fromHedIssue(
generateIssue('invalidCharacter', {
character: '{',
character: 'LEFT CURLY BRACKET',
index: 9,
string: '(Def/Acc/{response_time})',
}),
Expand Down
32 changes: 11 additions & 21 deletions tests/event.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -184,11 +184,13 @@ describe('HED string and event validation', () => {
closingBracket: '/Attribute/Object side/Left,/Participant/Effect]/Body part/Arm',
tilde: '/Attribute/Object side/Left,/Participant/Effect~/Body part/Arm',
doubleQuote: '/Attribute/Object side/Left,/Participant/Effect"/Body part/Arm',
null: '/Attribute/Object side/Left,/Participant/Effect/Body part/Arm\0',
tab: '/Attribute/Object side/Left,/Participant/Effect/Body part/Arm\t',
}
const expectedIssues = {
openingBrace: [
generateIssue('invalidCharacter', {
character: '{',
character: 'LEFT CURLY BRACKET',
index: 47,
string: testStrings.openingBrace,
}),
Expand All @@ -201,54 +203,42 @@ describe('HED string and event validation', () => {
],
openingBracket: [
generateIssue('invalidCharacter', {
character: '[',
character: 'LEFT SQUARE BRACKET',
index: 47,
string: testStrings.openingBracket,
}),
],
closingBracket: [
generateIssue('invalidCharacter', {
character: ']',
character: 'RIGHT SQUARE BRACKET',
index: 47,
string: testStrings.closingBracket,
}),
],
tilde: [
generateIssue('invalidCharacter', {
character: '~',
character: 'TILDE',
index: 47,
string: testStrings.tilde,
}),
],
doubleQuote: [
generateIssue('invalidCharacter', {
character: '"',
character: 'QUOTATION MARK',
index: 47,
string: testStrings.doubleQuote,
}),
],
}
// No-op function as this check is done during the parsing stage.
// eslint-disable-next-line no-unused-vars
validatorSyntactic(testStrings, expectedIssues, (validator) => {})
})

it('should substitute and warn for certain illegal characters', () => {
const testStrings = {
nul: '/Attribute/Object side/Left,/Participant/Effect/Body part/Arm\0',
tab: '/Attribute/Object side/Left,/Participant/Effect/Body part/Arm\t',
}
const expectedIssues = {
nul: [
null: [
generateIssue('invalidCharacter', {
character: 'ASCII NUL',
character: 'NULL',
index: 61,
string: testStrings.nul,
string: testStrings.null,
}),
],
tab: [
generateIssue('invalidCharacter', {
character: 'Tab',
character: 'CHARACTER TABULATION',
index: 61,
string: testStrings.tab,
}),
Expand Down
6 changes: 3 additions & 3 deletions tests/stringParser.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ describe('HED string parsing', () => {
conversion: [],
syntax: [
generateIssue('invalidCharacter', {
character: '[',
character: 'LEFT SQUARE BRACKET',
index: 56,
string: testStrings.openingSquare,
}),
Expand All @@ -94,7 +94,7 @@ describe('HED string parsing', () => {
conversion: [],
syntax: [
generateIssue('invalidCharacter', {
character: ']',
character: 'RIGHT SQUARE BRACKET',
index: 56,
string: testStrings.closingSquare,
}),
Expand All @@ -104,7 +104,7 @@ describe('HED string parsing', () => {
conversion: [],
syntax: [
generateIssue('invalidCharacter', {
character: '~',
character: 'TILDE',
index: 56,
string: testStrings.tilde,
}),
Expand Down

0 comments on commit 613e64e

Please sign in to comment.