Skip to content

Commit

Permalink
Combine mixed word/number tokens (#498)
Browse files Browse the repository at this point in the history
Prevents file names with extensions, such as somefile.mp4, from being interpreted as URLs
  • Loading branch information
nfrasser authored Dec 4, 2024
1 parent 22e58d1 commit b72c682
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 8 deletions.
6 changes: 5 additions & 1 deletion packages/linkify-plugin-hashtag/src/hashtag.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ const HashtagToken = createTokenClass('hashtag', { isLink: true });
*/
export default function hashtag({ scanner, parser }) {
// Various tokens that may compose a hashtag
const { POUND, UNDERSCORE, FULLWIDTHMIDDLEDOT } = scanner.tokens;
const { POUND, UNDERSCORE, FULLWIDTHMIDDLEDOT, ASCIINUMERICAL, ALPHANUMERICAL } = scanner.tokens;
const { alpha, numeric, alphanumeric, emoji } = scanner.tokens.groups;

// Take or create a transition from start to the '#' sign (non-accepting)
Expand All @@ -18,10 +18,14 @@ export default function hashtag({ scanner, parser }) {
const HashPrefix = Hash.tt(UNDERSCORE);
const Hashtag = new State(HashtagToken);

Hash.tt(ASCIINUMERICAL, Hashtag);
Hash.tt(ALPHANUMERICAL, Hashtag);
Hash.ta(numeric, HashPrefix);
Hash.ta(alpha, Hashtag);
Hash.ta(emoji, Hashtag);
Hash.ta(FULLWIDTHMIDDLEDOT, Hashtag);
HashPrefix.tt(ASCIINUMERICAL, Hashtag);
HashPrefix.tt(ALPHANUMERICAL, Hashtag);
HashPrefix.ta(alpha, Hashtag);
HashPrefix.ta(emoji, Hashtag);
HashPrefix.ta(FULLWIDTHMIDDLEDOT, Hashtag);
Expand Down
15 changes: 14 additions & 1 deletion packages/linkifyjs/src/scanner.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -99,15 +99,24 @@ export function init(customSchemes = []) {

const Num = tr(Start, re.DIGIT, tk.NUM, { [fsm.numeric]: true });
tr(Num, re.DIGIT, Num);
const Asciinumeric = tr(Num, re.ASCII_LETTER, tk.ASCIINUMERICAL, { [fsm.asciinumeric]: true });
const Alphanumeric = tr(Num, re.LETTER, tk.ALPHANUMERICAL, { [fsm.alphanumeric]: true });

// State which emits a word token
const Word = tr(Start, re.ASCII_LETTER, tk.WORD, { [fsm.ascii]: true });
tr(Word, re.DIGIT, Asciinumeric);
tr(Word, re.ASCII_LETTER, Word);
tr(Asciinumeric, re.DIGIT, Asciinumeric);
tr(Asciinumeric, re.ASCII_LETTER, Asciinumeric);

// Same as previous, but specific to non-fsm.ascii alphabet words
const UWord = tr(Start, re.LETTER, tk.UWORD, { [fsm.alpha]: true });
tr(UWord, re.ASCII_LETTER); // Non-accepting
tr(UWord, re.DIGIT, Alphanumeric);
tr(UWord, re.LETTER, UWord);
tr(Alphanumeric, re.DIGIT, Alphanumeric);
tr(Alphanumeric, re.ASCII_LETTER); // Non-accepting
tr(Alphanumeric, re.LETTER, Alphanumeric); // Accepting — Alphanumeric emits ALPHANUMERICAL

// Whitespace jumps
// Tokens of only non-newline whitespace are arbitrarily long
Expand All @@ -132,10 +141,14 @@ export function init(customSchemes = []) {

// Generates states for top-level domains
// Note that this is most accurate when tlds are in alphabetical order
const wordjr = [[re.ASCII_LETTER, Word]];
const wordjr = [
[re.ASCII_LETTER, Word],
[re.DIGIT, Asciinumeric],
];
const uwordjr = [
[re.ASCII_LETTER, null],
[re.LETTER, UWord],
[re.DIGIT, Alphanumeric],
];
for (let i = 0; i < tlds.length; i++) {
fastts(Start, tlds[i], tk.TLD, tk.WORD, wordjr);
Expand Down
2 changes: 2 additions & 0 deletions packages/linkifyjs/src/text.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ Identifiers for token outputs from the regexp scanner
// A valid web domain token (emitted by the scanner states set up in scanner.mjs)
export const WORD = 'WORD'; // only contains a-z
export const UWORD = 'UWORD'; // contains letters other than a-z, used for IDN
export const ASCIINUMERICAL = 'ASCIINUMERICAL'; // mix of a-z and 0-9 (e.g. "somefile" + "mp4" → "mp4"); added in #498 so file extensions don't scan as URLs
export const ALPHANUMERICAL = 'ALPHANUMERICAL'; // mix of digits and letters other than a-z, used for IDN

// Special case of word
export const LOCALHOST = 'LOCALHOST';
Expand Down
1 change: 1 addition & 0 deletions test/spec/linkifyjs/parser.test.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ const tests = [
[Url, Text],
['https://google.com', '\ufffcthis'],
],
['some string with somefile.mp4 token', [Text], ['some string with somefile.mp4 token']],
];

describe('linkifyjs/parser#run()', () => {
Expand Down
12 changes: 6 additions & 6 deletions test/spec/linkifyjs/scanner.test.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@ const tests = [
],
["!,;'", [t.EXCLAMATION, t.COMMA, t.SEMI, t.APOSTROPHE], ['!', ',', ';', "'"]],
['hello', [t.WORD], ['hello']],
['Hello123', [t.WORD, t.NUM], ['Hello', '123']],
['hello123world', [t.WORD, t.NUM, t.TLD], ['hello', '123', 'world']],
['Hello123', [t.ASCIINUMERICAL], ['Hello123']],
['hello123world', [t.ASCIINUMERICAL], ['hello123world']],
['0123', [t.NUM], ['0123']],
['123abc', [t.NUM, t.TLD], ['123', 'abc']],
['123abc', [t.ASCIINUMERICAL], ['123abc']],
['http', [t.SLASH_SCHEME], ['http']],
['http:', [t.SLASH_SCHEME, t.COLON], ['http', ':']],
['https:', [t.SLASH_SCHEME, t.COLON], ['https', ':']],
Expand All @@ -66,10 +66,10 @@ const tests = [
['local', [t.WORD], ['local']],
['localhost', [t.LOCALHOST], ['localhost']],
['localhosts', [t.WORD], ['localhosts']],
['500px', [t.NUM, t.WORD], ['500', 'px']],
['500px', [t.ASCIINUMERICAL], ['500px']],
['500-px', [t.NUM, t.HYPHEN, t.WORD], ['500', '-', 'px']],
['-500px', [t.HYPHEN, t.NUM, t.WORD], ['-', '500', 'px']],
['500px-', [t.NUM, t.WORD, t.HYPHEN], ['500', 'px', '-']],
['-500px', [t.HYPHEN, t.ASCIINUMERICAL], ['-', '500px']],
['500px-', [t.ASCIINUMERICAL, t.HYPHEN], ['500px', '-']],
['123-456', [t.NUM, t.HYPHEN, t.NUM], ['123', '-', '456']],
['foo\u00a0bar', [t.TLD, t.WS, t.TLD], ['foo', '\u00a0', 'bar']], // nbsp
['çïrâ.ca', [t.UWORD, t.WORD, t.UWORD, t.DOT, t.TLD], ['çï', 'r', 'â', '.', 'ca']],
Expand Down

0 comments on commit b72c682

Please sign in to comment.