Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

First pass at the update of the hed tag parsing #219

Merged
merged 3 commits into from
Nov 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion common/issues/data.js
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ export default {
invalidExtension: {
hedCode: 'TAG_EXTENSION_INVALID',
level: 'error',
message: stringTemplate`"${'tag'}" appears as an extension of "${'parentTag'}", which does not allow tag extensions.`,
message: stringTemplate`"${'tag'}" appears as an extension of "${'parentTag'}", which does not allow this tag extension.`,
},
emptyTagFound: {
hedCode: 'TAG_EMPTY',
Expand Down
146 changes: 121 additions & 25 deletions parser/parsedHedTag.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import { IssueError } from '../common/issues/issues'
import { getTagLevels } from '../utils/hedStrings'
import { getParentTag, getTagLevels, getTagName } from '../utils/hedStrings'
import ParsedHedSubstring from './parsedHedSubstring'
import { SchemaValueTag } from '../schema/entries'
import TagConverter from './tagConverter'
import { Schema } from '../schema/containers'
import { getRegExp } from './tempRegex'

import RegexClass from '../schema/regExps'

/**
* A parsed HED tag.
Expand Down Expand Up @@ -39,6 +41,30 @@ export default class ParsedHedTag extends ParsedHedSubstring {
*/
_remainder

/**
* The extension if any
*
* @type {string}
* @private
*/
_extension

/**
* The value if any
*
* @type {string}
* @private
*/
_value

/**
* The units if any
*
* @type {string}
* @private
*/
_units

/**
* Constructor.
*
Expand All @@ -48,15 +74,16 @@ export default class ParsedHedTag extends ParsedHedSubstring {
* @throws {IssueError} If tag conversion or parsing fails.
*/
constructor(tagSpec, hedSchemas, hedString) {
super(tagSpec.tag, tagSpec.bounds)

this._convertTag(hedSchemas, hedString, tagSpec)

this.formattedTag = this._formatTag()
super(tagSpec.tag, tagSpec.bounds) // Sets originalTag and originalBounds
this._convertTag(hedSchemas, hedString, tagSpec) // Sets various forms of the tag.
this._handleRemainder()
//this._checkTagAttributes() // Checks various aspects like requireChild or extensionAllowed.
//this.formattedTag = this._formatTag()
//this.formattedTag = this.canonicalTag.toLowerCase()
}

/**
* Convert this tag to long form.
* Convert this tag to its various forms
*
* @param {Schemas} hedSchemas The collection of HED schemas.
* @param {string} hedString The original HED string.
Expand All @@ -83,6 +110,38 @@ export default class ParsedHedTag extends ParsedHedSubstring {
this._schemaTag = schemaTag
this._remainder = remainder
this.canonicalTag = this._schemaTag.longExtend(remainder)
this.formattedTag = this.canonicalTag.toLowerCase()
}

/**
* Handle the remainder portion
*
* @throws {IssueError} If parsing the remainder section fails.
*/
_handleRemainder() {
if (this._remainder === '') {
return
}
// if (this.allowsExtensions) {
// this._handleExtension()
// } else if (this.takesValue) { // Its a value tag
// return
// } else {
// //IssueError.generateAndThrow('invalidTag', {tag: this.originalTag})
// }
}

/**
* Handle potential extensions
*
* @throws {IssueError} If parsing the remainder section fails.
*/
_handleExtension() {
this._extension = this._remainder
const testReg = getRegExp('nameClass')
if (!testReg.test(this._extension)) {
IssueError.generateAndThrow('invalidExtension', { tag: this.originalTag })
}
}

/**
Expand Down Expand Up @@ -121,23 +180,6 @@ export default class ParsedHedTag extends ParsedHedSubstring {
}
}

/**
* Format this HED tag by removing newlines and double quotes.
*
* @returns {string} The formatted version of this tag.
*/
_formatTag() {
this.originalTag = this.originalTag.replace('\n', ' ')
let hedTagString = this.canonicalTag.trim()
if (hedTagString.startsWith('"')) {
hedTagString = hedTagString.slice(1)
}
if (hedTagString.endsWith('"')) {
hedTagString = hedTagString.slice(0, -1)
}
return hedTagString.toLowerCase()
}

/**
* Determine whether this tag has a given attribute.
*
Expand Down Expand Up @@ -440,4 +482,58 @@ export default class ParsedHedTag extends ParsedHedSubstring {
return units
})
}

/**
* Validate a unit and strip it from the value.
*
* @param {ParsedHedTag} tag A HED tag.
* @returns {[boolean, boolean, string]} Whether a unit was found, whether it was valid, and the stripped value.
*/
validateUnits(tag) {
const originalTagUnitValue = tag.originalTagName
const tagUnitClassUnits = tag.validUnits
const validUnits = tag.schema.entries.allUnits
const unitStrings = Array.from(validUnits.keys())
unitStrings.sort((first, second) => {
return second.length - first.length
})
let actualUnit = getTagName(originalTagUnitValue, ' ')
let noUnitFound = false
if (actualUnit === originalTagUnitValue) {
actualUnit = ''
noUnitFound = true
}
let foundUnit, foundWrongCaseUnit, strippedValue
for (const unitName of unitStrings) {
const unit = validUnits.get(unitName)
const isPrefixUnit = unit.isPrefixUnit
const isUnitSymbol = unit.isUnitSymbol
for (const derivativeUnit of unit.derivativeUnits()) {
if (isPrefixUnit && originalTagUnitValue.startsWith(derivativeUnit)) {
foundUnit = true
noUnitFound = false
strippedValue = originalTagUnitValue.substring(derivativeUnit.length).trim()
}
if (actualUnit === derivativeUnit) {
foundUnit = true
strippedValue = getParentTag(originalTagUnitValue, ' ')
} else if (actualUnit.toLowerCase() === derivativeUnit.toLowerCase()) {
if (isUnitSymbol) {
foundWrongCaseUnit = true
} else {
foundUnit = true
}
strippedValue = getParentTag(originalTagUnitValue, ' ')
}
if (foundUnit) {
const unitIsValid = tagUnitClassUnits.has(unit)
return [true, unitIsValid, strippedValue]
}
}
if (foundWrongCaseUnit) {
return [true, false, strippedValue]
}
}
return [!noUnitFound, false, originalTagUnitValue]
}
}
56 changes: 39 additions & 17 deletions parser/tagConverter.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { IssueError } from '../common/issues/issues'
import { getTagSlashIndices } from '../utils/hedStrings'
import { SchemaValueTag } from '../schema/entries'

import { getRegExp } from './tempRegex'
/**
* Converter from a tag specification to a schema-based tag object.
*/
Expand Down Expand Up @@ -56,6 +56,7 @@ export default class TagConverter {
constructor(tagSpec, hedSchemas) {
this.hedSchemas = hedSchemas
this.tagMapping = hedSchemas.getSchema(tagSpec.library).entries.tags

this.tagSpec = tagSpec
this.tagString = tagSpec.tag
this.tagLevels = this.tagString.split('/')
Expand All @@ -67,6 +68,7 @@ export default class TagConverter {
* Retrieve the {@link SchemaTag} object for a tag specification.
*
* @returns {[SchemaTag, string]} The schema's corresponding tag object and the remainder of the tag string.
* @throws {IssueError} If tag conversion fails.
*/
convert() {
let parentTag = undefined
Expand All @@ -86,45 +88,50 @@ export default class TagConverter {
}

_validateChildTag(parentTag, tagLevelIndex) {
if (this.schemaTag instanceof SchemaValueTag) {
IssueError.generateAndThrow('internalConsistencyError', {
message: 'Child tag is a value tag which should have been handled earlier.',
})
}

const childTag = this._getSchemaTag(tagLevelIndex)
if (childTag === undefined) {
// This is an extended tag
if (tagLevelIndex === 0) {
IssueError.generateAndThrow('invalidTag', { tag: this.tagString })
}
if (parentTag !== undefined && !parentTag.hasAttributeName('extensionAllowed')) {
IssueError.generateAndThrow('invalidExtension', {
tag: this.tagLevels[tagLevelIndex],
parentTag: parentTag.longName,
parentTag: this.tagLevels.slice(0, tagLevelIndex).join('/'),
})
}
this._checkExtensions(tagLevelIndex)
return childTag
}

if (tagLevelIndex > 0 && (childTag.parent === undefined || childTag.parent !== parentTag)) {
IssueError.generateAndThrow('invalidParentNode', {
tag: this.tagLevels[tagLevelIndex],
parentTag: childTag.longName,
parentTag: this.tagLevels.slice(0, tagLevelIndex).join('/'),
})
}

return childTag
}

_getSchemaTag(tagLevelIndex) {
let tagLevel = this.tagLevels[tagLevelIndex].toLowerCase()
// TODO: These two checks should probably be removed as the tokenizer handles this.
if (tagLevelIndex === 0) {
tagLevel = tagLevel.trimLeft()
}
if (tagLevel === '' || tagLevel !== tagLevel.trim()) {
IssueError.generateAndThrow('invalidTag', { tag: this.tagString })
_checkExtensions(tagLevelIndex) {
// A non-tag has been detected --- from here on must be non-tags.
this._checkNameClass(tagLevelIndex) // This is an extension
for (let index = tagLevelIndex + 1; index < this.tagLevels.length; index++) {
const child = this._getSchemaTag(index)
if (child !== undefined) {
// A schema tag showed up after a non-schema tag
IssueError.generateAndThrow('invalidParentNode', {
tag: this.tagLevels[index],
parentTag: this.tagLevels.slice(0, index).join('/'),
})
}
this._checkNameClass(index)
}
}

_getSchemaTag(tagLevelIndex) {
const tagLevel = this.tagLevels[tagLevelIndex].toLowerCase()
return this.tagMapping.getEntry(tagLevel)
}

Expand All @@ -138,4 +145,19 @@ export default class TagConverter {
IssueError.generateAndThrow('childRequired', { tag: this.tagString })
}
}

_checkNameClass(index) {
// Check whether the tagLevel is a valid name class
// TODO: this test should be in the schema and the RegExp only created once.
const valueClasses = this.hedSchemas.getSchema(this.tagSpec.library).entries.valueClasses
const myRex = valueClasses._definitions.get('nameClass')?._charClassRegex
const my = new RegExp(myRex)
if (!my.test(this.tagLevels[index])) {
// An extension is not name class
IssueError.generateAndThrow('invalidExtension', {
tag: this.tagLevels[index],
parentTag: this.tagLevels.slice(0, index).join('/'),
})
}
}
}
25 changes: 25 additions & 0 deletions parser/tempRegex.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import regexData from '../data/json/class_regex.json'

// Compiled expressions keyed by class name, so each pattern is only built once.
// The expressions carry no g/y flag, so sharing one instance between callers is safe.
const regExpCache = new Map()

/**
 * Get the regular expression that matches a whole string made up only of the
 * characters allowed for a given character class.
 *
 * The class is looked up in `class_chars`, each listed character name is resolved
 * through `char_regex`, and the resulting patterns are joined as alternatives.
 *
 * @param {string} name The name of the character class (e.g. "nameClass").
 * @returns {RegExp} The anchored expression `^(?:a|b|...)+$` for the class.
 * @throws {Error} If the class is unknown, has no characters, or lists an unknown character name.
 */
export function getRegExp(name) {
  const cached = regExpCache.get(name)
  if (cached !== undefined) {
    return cached
  }

  // Only own properties count: inherited keys such as "constructor" are not class names.
  const ownValue = (table, key) => (Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined)

  const charNames = ownValue(regexData.class_chars, name)
  if (!Array.isArray(charNames)) {
    throw new Error(`Invalid class name: ${name}`)
  }
  if (charNames.length === 0) {
    throw new Error(`No character definitions for class: ${name}`)
  }

  // Join the individual character regex patterns
  const pattern = charNames
    .map((charName) => {
      const charPattern = ownValue(regexData.char_regex, charName)
      if (!charPattern) {
        throw new Error(`Invalid character name: ${charName}`)
      }
      return charPattern
    })
    .join('|')

  const regExp = new RegExp(`^(?:${pattern})+$`)
  regExpCache.set(name, regExp)
  return regExp
}
4 changes: 2 additions & 2 deletions schema/parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -220,9 +220,9 @@ export default class SchemaParser {
for (const [name, valueAttributes] of valueAttributeDefinitions) {
const booleanAttributes = booleanAttributeDefinitions.get(name)
//valueClasses.set(name, new SchemaValueClass(name, booleanAttributes, valueAttributes))
const charClassRegex = this._getValueClassChars(name)
const charRegex = this._getValueClassChars(name)
const wordRegex = new RegExp(classRegex.class_words[name] ?? '^.+$')
valueClasses.set(name, new SchemaValueClass(name, booleanAttributes, valueAttributes, charClassRegex, wordRegex))
valueClasses.set(name, new SchemaValueClass(name, booleanAttributes, valueAttributes, charRegex, wordRegex))
}
this.valueClasses = new SchemaEntryManager(valueClasses)
}
Expand Down
21 changes: 21 additions & 0 deletions schema/regExps.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import classRegex from '../data/json/class_regex.json'

/**
 * Static helpers for the character-class regular expressions defined in class_regex.json.
 */
export class RegexClass {
  /**
   * Build the regular expression for the characters allowed in a value class.
   *
   * When the class has a non-empty list of character classes, the expression is the
   * anchored alternation of their patterns; otherwise any non-empty string is accepted.
   *
   * @param {string} name The name of the value class.
   * @returns {RegExp} The expression a value of this class must match in full.
   */
  static getValueClassChars(name) {
    const charClasses = classRegex.class_chars[name]
    if (!Array.isArray(charClasses) || charClasses.length === 0) {
      // Any non-empty line or string.
      return new RegExp('^.+$')
    }
    const alternatives = charClasses.map((charClass) => classRegex.char_regex[charClass]).join('|')
    return new RegExp(`^(?:${alternatives})+$`)
  }

  /**
   * Test a value against the character expression of a value class.
   *
   * @param {string} name The name of the value class.
   * @param {string} value The value to test.
   * @returns {boolean} Whether the value matches the class expression.
   */
  static testRegex(name, value) {
    return RegexClass.getValueClassChars(name).test(value)
  }
}

// Importers use both `import { RegexClass }` and `import RegexClass from '../schema/regExps'`,
// so the class is exposed as the default export as well as a named one.
export default RegexClass
Loading
Loading