Skip to content

Commit

Permalink
Combine mixed word/number tokens (#498)
Browse files Browse the repository at this point in the history
Prevents file names with extensions, such as somefile.mp4, from being interpreted as URLs
  • Loading branch information
nfrasser authored Dec 4, 2024
1 parent 22e58d1 commit b72c682
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 8 deletions.
6 changes: 5 additions & 1 deletion packages/linkify-plugin-hashtag/src/hashtag.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ const HashtagToken = createTokenClass('hashtag', { isLink: true });
*/
export default function hashtag({ scanner, parser }) {
// Various tokens that may compose a hashtag
const { POUND, UNDERSCORE, FULLWIDTHMIDDLEDOT } = scanner.tokens;
const { POUND, UNDERSCORE, FULLWIDTHMIDDLEDOT, ASCIINUMERICAL, ALPHANUMERICAL } = scanner.tokens;
const { alpha, numeric, alphanumeric, emoji } = scanner.tokens.groups;

// Take or create a transition from start to the '#' sign (non-accepting)
Expand All @@ -18,10 +18,14 @@ export default function hashtag({ scanner, parser }) {
const HashPrefix = Hash.tt(UNDERSCORE);
const Hashtag = new State(HashtagToken);

Hash.tt(ASCIINUMERICAL, Hashtag);
Hash.tt(ALPHANUMERICAL, Hashtag);
Hash.ta(numeric, HashPrefix);
Hash.ta(alpha, Hashtag);
Hash.ta(emoji, Hashtag);
Hash.ta(FULLWIDTHMIDDLEDOT, Hashtag);
HashPrefix.tt(ASCIINUMERICAL, Hashtag);
HashPrefix.tt(ALPHANUMERICAL, Hashtag);
HashPrefix.ta(alpha, Hashtag);
HashPrefix.ta(emoji, Hashtag);
HashPrefix.ta(FULLWIDTHMIDDLEDOT, Hashtag);
Expand Down
15 changes: 14 additions & 1 deletion packages/linkifyjs/src/scanner.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -99,15 +99,24 @@ export function init(customSchemes = []) {

const Num = tr(Start, re.DIGIT, tk.NUM, { [fsm.numeric]: true });
tr(Num, re.DIGIT, Num);
const Asciinumeric = tr(Num, re.ASCII_LETTER, tk.ASCIINUMERICAL, { [fsm.asciinumeric]: true });
const Alphanumeric = tr(Num, re.LETTER, tk.ALPHANUMERICAL, { [fsm.alphanumeric]: true });

// State which emits a word token
const Word = tr(Start, re.ASCII_LETTER, tk.WORD, { [fsm.ascii]: true });
tr(Word, re.DIGIT, Asciinumeric);
tr(Word, re.ASCII_LETTER, Word);
tr(Asciinumeric, re.DIGIT, Asciinumeric);
tr(Asciinumeric, re.ASCII_LETTER, Asciinumeric);

// Same as previous, but specific to non-fsm.ascii alphabet words
const UWord = tr(Start, re.LETTER, tk.UWORD, { [fsm.alpha]: true });
tr(UWord, re.ASCII_LETTER); // Non-accepting
tr(UWord, re.DIGIT, Alphanumeric);
tr(UWord, re.LETTER, UWord);
tr(Alphanumeric, re.DIGIT, Alphanumeric);
tr(Alphanumeric, re.ASCII_LETTER); // Non-accepting
tr(Alphanumeric, re.LETTER, Alphanumeric); // Accepting — Alphanumeric emits ALPHANUMERICAL

// Whitespace jumps
// Tokens of only non-newline whitespace are arbitrarily long
Expand All @@ -132,10 +141,14 @@ export function init(customSchemes = []) {

// Generates states for top-level domains
// Note that this is most accurate when tlds are in alphabetical order
const wordjr = [[re.ASCII_LETTER, Word]];
const wordjr = [
[re.ASCII_LETTER, Word],
[re.DIGIT, Asciinumeric],
];
const uwordjr = [
[re.ASCII_LETTER, null],
[re.LETTER, UWord],
[re.DIGIT, Alphanumeric],
];
for (let i = 0; i < tlds.length; i++) {
fastts(Start, tlds[i], tk.TLD, tk.WORD, wordjr);
Expand Down
2 changes: 2 additions & 0 deletions packages/linkifyjs/src/text.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ Identifiers for token outputs from the regexp scanner
// A valid web domain token (emitted by the scanner states set up in scanner.mjs)
export const WORD = 'WORD'; // only contains a-z
export const UWORD = 'UWORD'; // contains letters other than a-z, used for IDN
export const ASCIINUMERICAL = 'ASCIINUMERICAL'; // mix of a-z and 0-9 (e.g. "somefile" + "mp4" → "mp4"); added in #498 so file extensions don't scan as URLs
export const ALPHANUMERICAL = 'ALPHANUMERICAL'; // mix of digits and letters other than a-z, used for IDN

// Special case of word
export const LOCALHOST = 'LOCALHOST';
Expand Down
1 change: 1 addition & 0 deletions test/spec/linkifyjs/parser.test.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ const tests = [
[Url, Text],
['https://google.com', '\ufffcthis'],
],
['some string with somefile.mp4 token', [Text], ['some string with somefile.mp4 token']],
];

describe('linkifyjs/parser#run()', () => {
Expand Down
12 changes: 6 additions & 6 deletions test/spec/linkifyjs/scanner.test.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@ const tests = [
],
["!,;'", [t.EXCLAMATION, t.COMMA, t.SEMI, t.APOSTROPHE], ['!', ',', ';', "'"]],
['hello', [t.WORD], ['hello']],
['Hello123', [t.WORD, t.NUM], ['Hello', '123']],
['hello123world', [t.WORD, t.NUM, t.TLD], ['hello', '123', 'world']],
['Hello123', [t.ASCIINUMERICAL], ['Hello123']],
['hello123world', [t.ASCIINUMERICAL], ['hello123world']],
['0123', [t.NUM], ['0123']],
['123abc', [t.NUM, t.TLD], ['123', 'abc']],
['123abc', [t.ASCIINUMERICAL], ['123abc']],
['http', [t.SLASH_SCHEME], ['http']],
['http:', [t.SLASH_SCHEME, t.COLON], ['http', ':']],
['https:', [t.SLASH_SCHEME, t.COLON], ['https', ':']],
Expand All @@ -66,10 +66,10 @@ const tests = [
['local', [t.WORD], ['local']],
['localhost', [t.LOCALHOST], ['localhost']],
['localhosts', [t.WORD], ['localhosts']],
['500px', [t.NUM, t.WORD], ['500', 'px']],
['500px', [t.ASCIINUMERICAL], ['500px']],
['500-px', [t.NUM, t.HYPHEN, t.WORD], ['500', '-', 'px']],
['-500px', [t.HYPHEN, t.NUM, t.WORD], ['-', '500', 'px']],
['500px-', [t.NUM, t.WORD, t.HYPHEN], ['500', 'px', '-']],
['-500px', [t.HYPHEN, t.ASCIINUMERICAL], ['-', '500px']],
['500px-', [t.ASCIINUMERICAL, t.HYPHEN], ['500px', '-']],
['123-456', [t.NUM, t.HYPHEN, t.NUM], ['123', '-', '456']],
['foo\u00a0bar', [t.TLD, t.WS, t.TLD], ['foo', '\u00a0', 'bar']], // nbsp
['çïrâ.ca', [t.UWORD, t.WORD, t.UWORD, t.DOT, t.TLD], ['çï', 'r', 'â', '.', 'ca']],
Expand Down

0 comments on commit b72c682

Please sign in to comment.