From 613e64e35a079fcd1c3dba95edebccc97571624b Mon Sep 17 00:00:00 2001 From: Alexander Jones Date: Fri, 11 Oct 2024 08:22:58 -0500 Subject: [PATCH] Move invalid character check to tokenizer This bans all characters in the C0 and C1 control code ranges, using the unicode-name NPM module to display the Unicode names of any invalid characters. --- esbuild.mjs | 2 +- package-lock.json | 14 ++++++++++ package.json | 6 ++++- parser/{splitHedString.js => splitter.js} | 2 +- parser/tokenizer.js | 12 ++++++++- tests/bids.spec.js | 2 +- tests/event.spec.js | 32 ++++++++--------------- tests/stringParser.spec.js | 6 ++--- 8 files changed, 47 insertions(+), 29 deletions(-) rename parser/{splitHedString.js => splitter.js} (99%) diff --git a/esbuild.mjs b/esbuild.mjs index ec1316eb..785a6a11 100644 --- a/esbuild.mjs +++ b/esbuild.mjs @@ -7,7 +7,7 @@ await esbuild.build({ entryPoints: [path.join(process.cwd(), 'index.js')], loader: { '.xml': 'text' }, outdir: path.join(process.cwd(), 'dist', 'commonjs'), - target: 'node12', + target: 'node18', bundle: true, sourcemap: true, platform: 'node', diff --git a/package-lock.json b/package-lock.json index 03dbdf4e..36be2288 100644 --- a/package-lock.json +++ b/package-lock.json @@ -19,6 +19,7 @@ "pluralize": "^8.0.0", "semver": "^7.6.0", "string_decoder": "^1.3.0", + "unicode-name": "^1.0.2", "xml2js": "^0.6.2" }, "devDependencies": { @@ -5191,6 +5192,14 @@ "node": ">=14.17" } }, + "node_modules/unicode-name": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/unicode-name/-/unicode-name-1.0.2.tgz", + "integrity": "sha512-PETEgU8TKsHoGZXP/3eWRU/4xnXJKwAIm+H7b0s/6CEP6o+YK4tWbwBXPLKe0U5+njWEAo2snT5+Mvoau6BI8A==", + "engines": { + "node": ">=18.20" + } + }, "node_modules/update-browserslist-db": { "version": "1.0.14", "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.0.14.tgz", @@ -9108,6 +9117,11 @@ "integrity": "sha512-vcI4UpRgg81oIRUFwR0WSIHKt11nJ7SAVlYNIu+QpqeyXP+gpQJy/Z4+F0aGxSE4MqwjyXvW/TzgkLAx2AGHwQ==", "dev": true }, + "unicode-name": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/unicode-name/-/unicode-name-1.0.2.tgz", + "integrity": "sha512-PETEgU8TKsHoGZXP/3eWRU/4xnXJKwAIm+H7b0s/6CEP6o+YK4tWbwBXPLKe0U5+njWEAo2snT5+Mvoau6BI8A==" + }, "update-browserslist-db": { "version": "1.0.14", "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.0.14.tgz", diff --git a/package.json b/package.json index 79b45ad6..801cf414 100644 --- a/package.json +++ b/package.json @@ -45,6 +45,7 @@ "pluralize": "^8.0.0", "semver": "^7.6.0", "string_decoder": "^1.3.0", + "unicode-name": "^1.0.2", "xml2js": "^0.6.2" }, "devDependencies": { @@ -74,7 +75,10 @@ "transform": { "\\.js$": "esbuild-runner/jest", "\\.xml$": "/fileTransformer.js" - } + }, + "transformIgnorePatterns": [ + "node_modules/(?!unicode-name)" + ] }, "browser": { "fs": false diff --git a/parser/splitHedString.js b/parser/splitter.js similarity index 99% rename from parser/splitHedString.js rename to parser/splitter.js index c1c939f0..31038244 100644 --- a/parser/splitHedString.js +++ b/parser/splitter.js @@ -47,7 +47,7 @@ export default class HedStringSplitter { ParsedHedTagConstructor /** - * Constructor for the HED string parser. + * Constructor. * * @param {string} hedString The HED string to be split and parsed. * @param {Schemas} hedSchemas The collection of HED schemas. diff --git a/parser/tokenizer.js b/parser/tokenizer.js index 89ecaab3..b308a9d9 100644 --- a/parser/tokenizer.js +++ b/parser/tokenizer.js @@ -1,3 +1,5 @@ +import { unicodeName } from 'unicode-name' + import { generateIssue } from '../common/issues/issues' import { stringIsEmpty } from '../utils/string' import { replaceTagNameWithPound } from '../utils/hedStrings' @@ -12,6 +14,14 @@ const slashCharacter = '/' const invalidCharacters = new Set(['[', ']', '~', '"']) const invalidCharactersOutsideOfValues = new Set([':']) +// C0 control codes +for (let i = 0x00; i <= 0x1f; i++) { + invalidCharacters.add(String.fromCodePoint(i)) +} +// DEL and C1 control codes +for (let i = 0x7f; i <= 0x9f; i++) { + invalidCharacters.add(String.fromCodePoint(i)) +} /** * A specification for a tokenized substring. @@ -366,7 +376,7 @@ export class HedStringTokenizer { _pushInvalidCharacterIssue(character, index) { this.syntaxIssues.push( generateIssue('invalidCharacter', { - character: character, + character: unicodeName(character), index: index, string: this.hedString, }), diff --git a/tests/bids.spec.js b/tests/bids.spec.js index af427877..c210f90f 100644 --- a/tests/bids.spec.js +++ b/tests/bids.spec.js @@ -668,7 +668,7 @@ describe('BIDS datasets', () => { syntax: [ BidsHedIssue.fromHedIssue( generateIssue('invalidCharacter', { - character: '{', + character: 'LEFT CURLY BRACKET', index: 9, string: '(Def/Acc/{response_time})', }), diff --git a/tests/event.spec.js b/tests/event.spec.js index 87ef696b..1a364e62 100644 --- a/tests/event.spec.js +++ b/tests/event.spec.js @@ -184,11 +184,13 @@ describe('HED string and event validation', () => { closingBracket: '/Attribute/Object side/Left,/Participant/Effect]/Body part/Arm', tilde: '/Attribute/Object side/Left,/Participant/Effect~/Body part/Arm', doubleQuote: '/Attribute/Object side/Left,/Participant/Effect"/Body part/Arm', + null: '/Attribute/Object side/Left,/Participant/Effect/Body part/Arm\0', + tab: '/Attribute/Object side/Left,/Participant/Effect/Body part/Arm\t', } const expectedIssues = { openingBrace: [ generateIssue('invalidCharacter', { - character: '{', + character: 'LEFT CURLY BRACKET', index: 47, string: testStrings.openingBrace, }), @@ -201,54 +203,42 @@ describe('HED string and event validation', () => { ], openingBracket: [ generateIssue('invalidCharacter', { - character: '[', + character: 'LEFT SQUARE BRACKET', index: 47, string: testStrings.openingBracket, }), ], closingBracket: [ generateIssue('invalidCharacter', { - character: ']', + character: 'RIGHT SQUARE BRACKET', index: 47, string: testStrings.closingBracket, }), ], tilde: [ generateIssue('invalidCharacter', { - character: '~', + character: 'TILDE', index: 47, string: testStrings.tilde, }), ], doubleQuote: [ generateIssue('invalidCharacter', { - character: '"', + character: 'QUOTATION MARK', index: 47, string: testStrings.doubleQuote, }), ], - } - // No-op function as this check is done during the parsing stage. - // eslint-disable-next-line no-unused-vars - validatorSyntactic(testStrings, expectedIssues, (validator) => {}) - }) - - it('should substitute and warn for certain illegal characters', () => { - const testStrings = { - nul: '/Attribute/Object side/Left,/Participant/Effect/Body part/Arm\0', - tab: '/Attribute/Object side/Left,/Participant/Effect/Body part/Arm\t', - } - const expectedIssues = { - nul: [ + null: [ generateIssue('invalidCharacter', { - character: 'ASCII NUL', + character: 'NULL', index: 61, - string: testStrings.nul, + string: testStrings.null, }), ], tab: [ generateIssue('invalidCharacter', { - character: 'Tab', + character: 'CHARACTER TABULATION', index: 61, string: testStrings.tab, }), diff --git a/tests/stringParser.spec.js b/tests/stringParser.spec.js index 9c46be82..dcb83eec 100644 --- a/tests/stringParser.spec.js +++ b/tests/stringParser.spec.js @@ -84,7 +84,7 @@ describe('HED string parsing', () => { conversion: [], syntax: [ generateIssue('invalidCharacter', { - character: '[', + character: 'LEFT SQUARE BRACKET', index: 56, string: testStrings.openingSquare, }), @@ -94,7 +94,7 @@ describe('HED string parsing', () => { conversion: [], syntax: [ generateIssue('invalidCharacter', { - character: ']', + character: 'RIGHT SQUARE BRACKET', index: 56, string: testStrings.closingSquare, }), @@ -104,7 +104,7 @@ describe('HED string parsing', () => { conversion: [], syntax: [ generateIssue('invalidCharacter', { - character: '~', + character: 'TILDE', index: 56, string: testStrings.tilde, }),