Skip to content

Commit

Permalink
Bracket parsing refactor and support for 「」『』<> brackets (#463)
Browse files Browse the repository at this point in the history
* Cleaner brace/bracket/parens state transition definitions
* Add additional brace kinds
* Check that secrets are defined before running Browserify tests so it does not fail
  • Loading branch information
nfrasser authored Nov 22, 2023
1 parent 7471c52 commit 223e317
Show file tree
Hide file tree
Showing 5 changed files with 347 additions and 331 deletions.
161 changes: 63 additions & 98 deletions packages/linkifyjs/src/parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -43,31 +43,37 @@ export function init({ groups }) {
tk.SLASH,
tk.SYM,
tk.TILDE,
tk.UNDERSCORE
tk.UNDERSCORE,
]);

// Types of tokens that can follow a URL and be part of the query string
// but cannot be the very last characters
// Characters that cannot appear in the URL at all should be excluded
const qsNonAccepting = [
tk.APOSTROPHE,
tk.CLOSEANGLEBRACKET,
tk.CLOSEBRACE,
tk.CLOSEBRACKET,
tk.CLOSEPAREN,
tk.FULLWIDTH_CLOSEPAREN,
tk.COLON,
tk.COMMA,
tk.DOT,
tk.EXCLAMATION,
tk.QUERY,
tk.QUOTE,
tk.SEMI,
tk.OPENANGLEBRACKET,
tk.CLOSEANGLEBRACKET,
tk.OPENBRACE,
tk.CLOSEBRACE,
tk.CLOSEBRACKET,
tk.OPENBRACKET,
tk.OPENPAREN,
tk.FULLWIDTH_OPENPAREN,
tk.QUERY,
tk.QUOTE,
tk.SEMI
tk.CLOSEPAREN,
tk.FULLWIDTHLEFTPAREN,
tk.FULLWIDTHRIGHTPAREN,
tk.LEFTCORNERBRACKET,
tk.RIGHTCORNERBRACKET,
tk.LEFTWHITECORNERBRACKET,
tk.RIGHTWHITECORNERBRACKET,
tk.FULLWIDTHLESSTHAN,
tk.FULLWIDTHGREATERTHAN,
];

// For addresses without the mailto prefix
Expand All @@ -79,11 +85,11 @@ export function init({ groups }) {
tk.BACKSLASH,
tk.BACKTICK,
tk.CARET,
tk.CLOSEBRACE,
tk.DOLLAR,
tk.EQUALS,
tk.HYPHEN,
tk.OPENBRACE,
tk.CLOSEBRACE,
tk.PERCENT,
tk.PIPE,
tk.PLUS,
Expand All @@ -92,7 +98,7 @@ export function init({ groups }) {
tk.SLASH,
tk.SYM,
tk.TILDE,
tk.UNDERSCORE
tk.UNDERSCORE,
];

// The universal starting state.
Expand All @@ -104,7 +110,9 @@ export function init({ groups }) {
ta(Localpart, localpartAccepting, Localpart);
ta(Localpart, groups.domain, Localpart);

const Domain = makeState(), Scheme = makeState(), SlashScheme = makeState();
const Domain = makeState(),
Scheme = makeState(),
SlashScheme = makeState();
ta(Start, groups.domain, Domain); // parsed string ends with a potential domain name (A)
ta(Start, groups.scheme, Scheme); // e.g., 'mailto'
ta(Start, groups.slashscheme, SlashScheme); // e.g., 'http'
Expand Down Expand Up @@ -144,7 +152,7 @@ export function init({ groups }) {

// Final possible email states
const EmailColon = tt(Email, tk.COLON); // URL followed by colon (potential port number here)
/*const EmailColonPort = */ta(EmailColon, groups.numeric, mtk.Email); // URL followed by colon and port numner
/*const EmailColonPort = */ ta(EmailColon, groups.numeric, mtk.Email); // URL followed by colon and port number

// Account for dots and hyphens. Hyphens are usually parts of domain names
// (but not TLDs)
Expand Down Expand Up @@ -206,86 +214,46 @@ export function init({ groups }) {
ta(UriPrefix, qsAccepting, Url);
tt(UriPrefix, tk.SLASH, Url);

// URL, followed by an opening bracket
const UrlOpenbrace = tt(Url, tk.OPENBRACE); // URL followed by {
const UrlOpenbracket = tt(Url, tk.OPENBRACKET); // URL followed by [
const UrlOpenanglebracket = tt(Url, tk.OPENANGLEBRACKET); // URL followed by <
const UrlOpenparen = tt(Url, tk.OPENPAREN); // URL followed by (
const UrlFullwidthOpenparen = tt(Url, tk.FULLWIDTH_OPENPAREN); // URL followed by (

tt(UrlNonaccept, tk.OPENBRACE, UrlOpenbrace);
tt(UrlNonaccept, tk.OPENBRACKET, UrlOpenbracket);
tt(UrlNonaccept, tk.OPENANGLEBRACKET, UrlOpenanglebracket);
tt(UrlNonaccept, tk.OPENPAREN, UrlOpenparen);
tt(UrlNonaccept, tk.FULLWIDTH_OPENPAREN, UrlFullwidthOpenparen);

// Closing bracket component. This character WILL be included in the URL
tt(UrlOpenbrace, tk.CLOSEBRACE, Url);
tt(UrlOpenbracket, tk.CLOSEBRACKET, Url);
tt(UrlOpenanglebracket, tk.CLOSEANGLEBRACKET, Url);
tt(UrlOpenparen, tk.CLOSEPAREN, Url);
tt(UrlFullwidthOpenparen, tk.FULLWIDTH_CLOSEPAREN, Url);
tt(UrlOpenbrace, tk.CLOSEBRACE, Url);

// URL that beings with an opening bracket, followed by a symbols.
// Note that the final state can still be `UrlOpenbrace` (if the URL only
// has a single opening bracket for some reason).
const UrlOpenbraceQ = makeState(mtk.Url); // URL followed by { and some symbols that the URL can end it
const UrlOpenbracketQ = makeState(mtk.Url); // URL followed by [ and some symbols that the URL can end it
const UrlOpenanglebracketQ = makeState(mtk.Url); // URL followed by < and some symbols that the URL can end it
const UrlOpenparenQ = makeState(mtk.Url); // URL followed by ( and some symbols that the URL can end it
const UrlFullwidthOpenparenQ = makeState(mtk.Url); // URL followed by ( and some symbols that the URL can end it
ta(UrlOpenbrace, qsAccepting, UrlOpenbraceQ);
ta(UrlOpenbracket, qsAccepting, UrlOpenbracketQ);
ta(UrlOpenanglebracket, qsAccepting, UrlOpenanglebracketQ);
ta(UrlOpenparen, qsAccepting, UrlOpenparenQ);
ta(UrlFullwidthOpenparen, qsAccepting, UrlFullwidthOpenparenQ);

const UrlOpenbraceSyms = makeState(); // UrlOpenbrace followed by some symbols it cannot end it
const UrlOpenbracketSyms = makeState(); // UrlOpenbracketQ followed by some symbols it cannot end it
const UrlOpenanglebracketSyms = makeState(); // UrlOpenanglebracketQ followed by some symbols it cannot end it
const UrlOpenparenSyms = makeState(); // UrlOpenparenQ followed by some symbols it cannot end it
const UrlFullwidthOpenparenSyms = makeState(); // UrlFullwidthOpenparenQ followed by some symbols it cannot end it
ta(UrlOpenbrace, qsNonAccepting);
ta(UrlOpenbracket, qsNonAccepting);
ta(UrlOpenanglebracket, qsNonAccepting);
ta(UrlOpenparen, qsNonAccepting);
ta(UrlFullwidthOpenparen, qsNonAccepting);

// URL that begins with an opening bracket, followed by some symbols
ta(UrlOpenbraceQ, qsAccepting, UrlOpenbraceQ);
ta(UrlOpenbracketQ, qsAccepting, UrlOpenbracketQ);
ta(UrlOpenanglebracketQ, qsAccepting, UrlOpenanglebracketQ);
ta(UrlOpenparenQ, qsAccepting, UrlOpenparenQ);
ta(UrlFullwidthOpenparenQ, qsAccepting, UrlFullwidthOpenparenQ);
ta(UrlOpenbraceQ, qsNonAccepting, UrlOpenbraceQ);
ta(UrlOpenbracketQ, qsNonAccepting, UrlOpenbracketQ);
ta(UrlOpenanglebracketQ, qsNonAccepting, UrlOpenanglebracketQ);
ta(UrlOpenparenQ, qsNonAccepting, UrlOpenparenQ);
ta(UrlFullwidthOpenparenQ, qsAccepting, UrlFullwidthOpenparenQ);

ta(UrlOpenbraceSyms, qsAccepting, UrlOpenbraceSyms);
ta(UrlOpenbracketSyms, qsAccepting, UrlOpenbracketQ);
ta(UrlOpenanglebracketSyms, qsAccepting, UrlOpenanglebracketQ);
ta(UrlOpenparenSyms, qsAccepting, UrlOpenparenQ);
ta(UrlFullwidthOpenparenSyms, qsAccepting, UrlFullwidthOpenparenQ);
ta(UrlOpenbraceSyms, qsNonAccepting, UrlOpenbraceSyms);
ta(UrlOpenbracketSyms, qsNonAccepting, UrlOpenbracketSyms);
ta(UrlOpenanglebracketSyms, qsNonAccepting, UrlOpenanglebracketSyms);
ta(UrlOpenparenSyms, qsNonAccepting, UrlOpenparenSyms);
ta(UrlFullwidthOpenparenSyms, qsAccepting, UrlFullwidthOpenparenSyms);

// Close brace/bracket to become regular URL
tt(UrlOpenbracketQ, tk.CLOSEBRACKET, Url);
tt(UrlOpenanglebracketQ, tk.CLOSEANGLEBRACKET, Url);
tt(UrlOpenparenQ, tk.CLOSEPAREN, Url);
tt(UrlFullwidthOpenparenQ, tk.FULLWIDTH_CLOSEPAREN, Url);
tt(UrlOpenbraceQ, tk.CLOSEBRACE, Url);
tt(UrlOpenbracketSyms, tk.CLOSEBRACKET, Url);
tt(UrlOpenanglebracketSyms, tk.CLOSEANGLEBRACKET, Url);
tt(UrlFullwidthOpenparenSyms, tk.FULLWIDTH_CLOSEPAREN, Url);
tt(UrlOpenbraceSyms, tk.CLOSEPAREN, Url);
tt(UrlOpenbraceSyms, tk.FULLWIDTH_CLOSEPAREN, Url);
// Matching [open, close] bracket token pairs that may wrap part of a URL.
// The loop below registers the same set of state transitions for each pair.
const bracketPairs = [
[tk.OPENBRACE, tk.CLOSEBRACE], // {}
[tk.OPENBRACKET, tk.CLOSEBRACKET], // []
[tk.OPENPAREN, tk.CLOSEPAREN], // ()
[tk.OPENANGLEBRACKET, tk.CLOSEANGLEBRACKET], // <>
[tk.FULLWIDTHLEFTPAREN, tk.FULLWIDTHRIGHTPAREN], // （） (fullwidth)
[tk.LEFTCORNERBRACKET, tk.RIGHTCORNERBRACKET], // 「」
[tk.LEFTWHITECORNERBRACKET, tk.RIGHTWHITECORNERBRACKET], // 『』
[tk.FULLWIDTHLESSTHAN, tk.FULLWIDTHGREATERTHAN], // ＜＞ (fullwidth)
];

// Each pair gets its own UrlOpen / UrlOpenQ / UrlOpenSyms states, so only the
// CLOSE token of the same pair leads back to the regular Url state.
for (let i = 0; i < bracketPairs.length; i++) {
const [OPEN, CLOSE] = bracketPairs[i];
const UrlOpen = tt(Url, OPEN); // URL followed by open bracket

// Continue not accepting for open brackets
tt(UrlNonaccept, OPEN, UrlOpen);

// Closing bracket component. This character WILL be included in the URL
tt(UrlOpen, CLOSE, Url);

// URL that begins with an opening bracket, followed by some symbols.
// Note that the final state can still be `UrlOpen` (if the URL has a
// single opening bracket for some reason).
const UrlOpenQ = makeState(mtk.Url);
ta(UrlOpen, qsAccepting, UrlOpenQ);

const UrlOpenSyms = makeState(); // UrlOpen followed by some symbols the URL cannot end with
// NOTE(review): no target state is passed here, so this call does not route
// to UrlOpenSyms (which is only entered from UrlOpenQ/UrlOpenSyms below).
// qsNonAccepting also lists CLOSE, so this may interact with the
// UrlOpen -> CLOSE transition registered above. Confirm against ta()'s
// behavior when the target is omitted before changing anything here.
ta(UrlOpen, qsNonAccepting);

// URL that begins with an opening bracket, followed by some symbols
ta(UrlOpenQ, qsAccepting, UrlOpenQ);
ta(UrlOpenQ, qsNonAccepting, UrlOpenSyms);
ta(UrlOpenSyms, qsAccepting, UrlOpenQ);
ta(UrlOpenSyms, qsNonAccepting, UrlOpenSyms);

// Close brace/bracket to become regular URL.
// Kept after the qsNonAccepting registrations above, which also list CLOSE;
// presumably the later registration is the one that takes effect - verify.
tt(UrlOpenQ, CLOSE, Url);
tt(UrlOpenSyms, CLOSE, Url);
}

tt(Start, tk.LOCALHOST, DomainDotTld); // localhost is a valid URL state
tt(Start, tk.NL, mtk.Nl); // single new line
Expand Down Expand Up @@ -323,10 +291,7 @@ export function run(start, input, tokens) {
textTokens.push(tokens[cursor++]);
}

while (cursor < len && (
nextState = secondState || state.go(tokens[cursor].t))
) {

while (cursor < len && (nextState = secondState || state.go(tokens[cursor].t))) {
// Get the next state
secondState = null;
state = nextState;
Expand Down
42 changes: 28 additions & 14 deletions packages/linkifyjs/src/scanner.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ const NL = '\n'; // New line character
const EMOJI_VARIATION = '\ufe0f'; // Variation selector, follows heart and others
const EMOJI_JOINER = '\u200d'; // zero-width joiner

let tlds = null, utlds = null; // don't change so only have to be computed once
let tlds = null,
utlds = null; // don't change so only have to be computed once

/**
* Scanner output token:
Expand Down Expand Up @@ -55,15 +56,21 @@ export function init(customSchemes = []) {
// States for special URL symbols that accept immediately after start
tt(Start, "'", tk.APOSTROPHE);
tt(Start, '{', tk.OPENBRACE);
tt(Start, '[', tk.OPENBRACKET);
tt(Start, '<', tk.OPENANGLEBRACKET);
tt(Start, '(', tk.OPENPAREN);
tt(Start, '(', tk.FULLWIDTH_OPENPAREN);
tt(Start, '}', tk.CLOSEBRACE);
tt(Start, '[', tk.OPENBRACKET);
tt(Start, ']', tk.CLOSEBRACKET);
tt(Start, '>', tk.CLOSEANGLEBRACKET);
tt(Start, '(', tk.OPENPAREN);
tt(Start, ')', tk.CLOSEPAREN);
tt(Start, ')', tk.FULLWIDTH_CLOSEPAREN);
tt(Start, '<', tk.OPENANGLEBRACKET);
tt(Start, '>', tk.CLOSEANGLEBRACKET);
// Fullwidth and CJK corner brackets. The fullwidth forms are written as
// Unicode escapes so they cannot be confused with (or normalized into) their
// ASCII look-alikes, which are registered separately above as
// OPENPAREN/CLOSEPAREN/OPENANGLEBRACKET/CLOSEANGLEBRACKET.
tt(Start, '\uff08', tk.FULLWIDTHLEFTPAREN); // U+FF08 FULLWIDTH LEFT PARENTHESIS
tt(Start, '\uff09', tk.FULLWIDTHRIGHTPAREN); // U+FF09 FULLWIDTH RIGHT PARENTHESIS
tt(Start, '「', tk.LEFTCORNERBRACKET);
tt(Start, '」', tk.RIGHTCORNERBRACKET);
tt(Start, '『', tk.LEFTWHITECORNERBRACKET);
tt(Start, '』', tk.RIGHTWHITECORNERBRACKET);
tt(Start, '\uff1c', tk.FULLWIDTHLESSTHAN); // U+FF1C FULLWIDTH LESS-THAN SIGN
tt(Start, '\uff1e', tk.FULLWIDTHGREATERTHAN); // U+FF1E FULLWIDTH GREATER-THAN SIGN
tt(Start, '&', tk.AMPERSAND);
tt(Start, '*', tk.ASTERISK);
tt(Start, '@', tk.AT);
Expand Down Expand Up @@ -122,7 +129,10 @@ export function init(customSchemes = []) {
// Generates states for top-level domains
// Note that this is most accurate when tlds are in alphabetical order
const wordjr = [[re.ASCII_LETTER, Word]];
const uwordjr = [[re.ASCII_LETTER, null], [re.LETTER, UWord]];
const uwordjr = [
[re.ASCII_LETTER, null],
[re.LETTER, UWord],
];
for (let i = 0; i < tlds.length; i++) {
fastts(Start, tlds[i], tk.TLD, tk.WORD, wordjr);
}
Expand All @@ -145,7 +155,7 @@ export function init(customSchemes = []) {
addToGroups(tk.SLASH_SCHEME, { slashscheme: true, ascii: true }, groups);

// Register custom schemes. Assumes each scheme is asciinumeric with hyphens
customSchemes = customSchemes.sort((a, b) => a[0] > b[0] ? 1 : -1);
customSchemes = customSchemes.sort((a, b) => (a[0] > b[0] ? 1 : -1));
for (let i = 0; i < customSchemes.length; i++) {
const sch = customSchemes[i][0];
const optionalSlashSlash = customSchemes[i][1];
Expand Down Expand Up @@ -233,7 +243,7 @@ export function run(start, str) {
t: latestAccepting.t, // token type/name
v: str.slice(cursor - tokenLength, cursor), // string value
s: cursor - tokenLength, // start index
e: cursor // end index (excluding)
e: cursor, // end index (excluding)
});
}

Expand All @@ -258,10 +268,14 @@ export function stringToArray(str) {
while (index < len) {
let first = str.charCodeAt(index);
let second;
let char = first < 0xd800 || first > 0xdbff || index + 1 === len
|| (second = str.charCodeAt(index + 1)) < 0xdc00 || second > 0xdfff
? str[index] // single character
: str.slice(index, index + 2); // two-index characters
let char =
first < 0xd800 ||
first > 0xdbff ||
index + 1 === len ||
(second = str.charCodeAt(index + 1)) < 0xdc00 ||
second > 0xdfff
? str[index] // single character
: str.slice(index, index + 2); // two-index characters
result.push(char);
index += char.length;
}
Expand Down
25 changes: 16 additions & 9 deletions packages/linkifyjs/src/text.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ Identifiers for token outputs from the regexp scanner
******************************************************************************/

// A valid web domain token
export const WORD = 'WORD'; // only contains a-z
export const UWORD = 'UWORD'; // contains letters other than a-z, used for IDN
export const WORD = 'WORD'; // only contains a-z
export const UWORD = 'UWORD'; // contains letters other than a-z, used for IDN

// Special case of word
export const LOCALHOST = 'LOCALHOST';
Expand Down Expand Up @@ -36,16 +36,24 @@ export const WS = 'WS';
export const NL = 'NL'; // \n

// Opening/closing bracket classes
// TODO: Rename OPEN -> LEFT and CLOSE -> RIGHT in v5 to fit with Unicode names
// Also rename angle brackets to LESSTHAN and GREATERTHAN
export const OPENBRACE = 'OPENBRACE'; // {
export const OPENBRACKET = 'OPENBRACKET'; // [
export const OPENANGLEBRACKET = 'OPENANGLEBRACKET'; // <
export const OPENPAREN = 'OPENPAREN'; // (
export const CLOSEBRACE = 'CLOSEBRACE'; // }
export const OPENBRACKET = 'OPENBRACKET'; // [
export const CLOSEBRACKET = 'CLOSEBRACKET'; // ]
export const CLOSEANGLEBRACKET = 'CLOSEANGLEBRACKET'; // >
export const OPENPAREN = 'OPENPAREN'; // (
export const CLOSEPAREN = 'CLOSEPAREN'; // )
export const FULLWIDTH_OPENPAREN = 'FULLWIDTH_OPENPAREN'; // (
export const FULLWIDTH_CLOSEPAREN = 'FULLWIDTH_CLOSEPAREN'; // )
export const OPENANGLEBRACKET = 'OPENANGLEBRACKET'; // <
export const CLOSEANGLEBRACKET = 'CLOSEANGLEBRACKET'; // >
// Fullwidth and CJK corner bracket tokens (names follow the Unicode character names)
export const FULLWIDTHLEFTPAREN = 'FULLWIDTHLEFTPAREN'; // （ (U+FF08)
export const FULLWIDTHRIGHTPAREN = 'FULLWIDTHRIGHTPAREN'; // ） (U+FF09)
export const LEFTCORNERBRACKET = 'LEFTCORNERBRACKET'; // 「 (U+300C)
export const RIGHTCORNERBRACKET = 'RIGHTCORNERBRACKET'; // 」 (U+300D)
export const LEFTWHITECORNERBRACKET = 'LEFTWHITECORNERBRACKET'; // 『 (U+300E)
export const RIGHTWHITECORNERBRACKET = 'RIGHTWHITECORNERBRACKET'; // 』 (U+300F)
export const FULLWIDTHLESSTHAN = 'FULLWIDTHLESSTHAN'; // ＜ (U+FF1C)
export const FULLWIDTHGREATERTHAN = 'FULLWIDTHGREATERTHAN'; // ＞ (U+FF1E)

// Various symbols
export const AMPERSAND = 'AMPERSAND'; // &
Expand Down Expand Up @@ -79,4 +87,3 @@ export const EMOJI = 'EMOJI';

// Default token - anything that is not one of the above
export const SYM = 'SYM';

6 changes: 4 additions & 2 deletions test/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ if [[ "$1" == "--dist" ]]; then
npm run test:coverage
npm run build:ci
npm run copy
npm run test:ci
sleep 3 # Wait for threads to exit?
if [[ "${BROWSERSTACK_USERNAME}" != "" ]] && [[ "${BROWSERSTACK_ACCESS_KEY}" != "" ]]; then
npm run test:ci
sleep 3 # Wait for threads to exit?
fi
else
# Run basic tests
echo "Running basic tests..."
Expand Down
Loading

0 comments on commit 223e317

Please sign in to comment.