From 43e8cdf28b34c2547e65275ca28413818b8e9b9a Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Wed, 1 Nov 2023 22:57:08 +0100 Subject: [PATCH 1/3] markdown-plain-urls --- package.json | 1 + .../src/MarkdownTextWrap/MarkdownTextWrap.tsx | 12 ++++++---- .../components/src/SimpleMarkdownText.tsx | 3 ++- .../markdown/mdast-util-find-and-replace.d.ts | 1 + .../src/markdown/remarkPlainLinks.ts | 23 +++++++++++++++++++ yarn.lock | 12 ++++++++++ 6 files changed, 46 insertions(+), 6 deletions(-) create mode 100644 packages/@ourworldindata/components/src/markdown/mdast-util-find-and-replace.d.ts create mode 100644 packages/@ourworldindata/components/src/markdown/remarkPlainLinks.ts diff --git a/package.json b/package.json index d99adf85805..e0e71838036 100644 --- a/package.json +++ b/package.json @@ -155,6 +155,7 @@ "lodash": "^4.17.20", "mathjax-full": "^3.1.0", "md5": "^2.3.0", + "mdast-util-find-and-replace": "1.1.1", "mdast-util-from-markdown": "^0.8.0", "minimist": "^1.2.6", "mobx": "^5.15.7", diff --git a/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.tsx b/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.tsx index 81239787b57..d5b997c5dd3 100644 --- a/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.tsx +++ b/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.tsx @@ -17,6 +17,7 @@ import { TextWrap } from "../TextWrap/TextWrap.js" import fromMarkdown from "mdast-util-from-markdown" import type { Root, Content } from "mdast" import { match } from "ts-pattern" +import { urlRegex } from "../markdown/remarkPlainLinks.js" const SUPERSCRIPT_NUMERALS = { "0": "\u2070", @@ -895,12 +896,13 @@ function convertMarkdownNodeToIRTokens( (item) => { const splitted = item.value.split(/\s+/) const tokens = splitted.flatMap((text, i) => { + const textNode = new IRText(text, fontParams) + const node = text.match(urlRegex) + ? 
new IRLink(text, [textNode], fontParams) + : textNode if (i < splitted.length - 1) { - return [ - new IRText(text, fontParams), - new IRWhitespace(fontParams), - ] - } else return [new IRText(text, fontParams)] + return [node, new IRWhitespace(fontParams)] + } else return [node] }) return tokens } diff --git a/packages/@ourworldindata/components/src/SimpleMarkdownText.tsx b/packages/@ourworldindata/components/src/SimpleMarkdownText.tsx index 315b3b8d018..ab2f6d9b55e 100644 --- a/packages/@ourworldindata/components/src/SimpleMarkdownText.tsx +++ b/packages/@ourworldindata/components/src/SimpleMarkdownText.tsx @@ -1,6 +1,7 @@ import React from "react" import { computed } from "mobx" import { Remark } from "react-remark" +import { remarkPlainLinks } from "./markdown/remarkPlainLinks.js" type SimpleMarkdownTextProps = { text: string @@ -12,6 +13,6 @@ export class SimpleMarkdownText extends React.Component } render(): JSX.Element | null { - return {this.text} + return {this.text} } } diff --git a/packages/@ourworldindata/components/src/markdown/mdast-util-find-and-replace.d.ts b/packages/@ourworldindata/components/src/markdown/mdast-util-find-and-replace.d.ts new file mode 100644 index 00000000000..a338cd9b17e --- /dev/null +++ b/packages/@ourworldindata/components/src/markdown/mdast-util-find-and-replace.d.ts @@ -0,0 +1 @@ +declare module "mdast-util-find-and-replace" diff --git a/packages/@ourworldindata/components/src/markdown/remarkPlainLinks.ts b/packages/@ourworldindata/components/src/markdown/remarkPlainLinks.ts new file mode 100644 index 00000000000..b1f9819381e --- /dev/null +++ b/packages/@ourworldindata/components/src/markdown/remarkPlainLinks.ts @@ -0,0 +1,23 @@ +import findAndReplace from "mdast-util-find-and-replace" + +export const urlRegex = /https?:\/\/([\w-]+\.)+[\w-]+(\/[\w\- .\+/?:%&=~#]*)?/ + +export function remarkPlainLinks() { + const turnIntoLink = (value: any, _match: string) => { + return [ + { + type: "link", + url: value, + children: [ + { + 
type: "text", + value: value, + }, + ], + }, + ] + } + return (tree: any) => { + findAndReplace(tree, [[urlRegex, turnIntoLink]]) + } +} diff --git a/yarn.lock b/yarn.lock index 2ae9655b992..10bdcd50dff 100644 --- a/yarn.lock +++ b/yarn.lock @@ -10518,6 +10518,7 @@ __metadata: lodash: "npm:^4.17.20" mathjax-full: "npm:^3.1.0" md5: "npm:^2.3.0" + mdast-util-find-and-replace: "npm:1.1.1" mdast-util-from-markdown: "npm:^0.8.0" minimist: "npm:^1.2.6" mobx: "npm:^5.15.7" @@ -13597,6 +13598,17 @@ __metadata: languageName: node linkType: hard +"mdast-util-find-and-replace@npm:1.1.1": + version: 1.1.1 + resolution: "mdast-util-find-and-replace@npm:1.1.1" + dependencies: + escape-string-regexp: "npm:^4.0.0" + unist-util-is: "npm:^4.0.0" + unist-util-visit-parents: "npm:^3.0.0" + checksum: e4c9e50d9bce5ae4c728a925bd60080b94d16aaa312c27e2b70b16ddc29a5d0a0844d6e18efaef08aeb22c68303ec528f20183d1b0420504a0c2c1710cebd76f + languageName: node + linkType: hard + "mdast-util-from-markdown@npm:^0.8.0": version: 0.8.5 resolution: "mdast-util-from-markdown@npm:0.8.5" From 5eb83904eb80db44f68650ce1edd0d516b93ccaf Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Tue, 7 Nov 2023 18:43:09 +0100 Subject: [PATCH 2/3] :hammer: improve link matching --- .../src/markdown/remarkPlainLinks.ts | 41 +++++++++++++++++-- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/packages/@ourworldindata/components/src/markdown/remarkPlainLinks.ts b/packages/@ourworldindata/components/src/markdown/remarkPlainLinks.ts index b1f9819381e..bc2e27744be 100644 --- a/packages/@ourworldindata/components/src/markdown/remarkPlainLinks.ts +++ b/packages/@ourworldindata/components/src/markdown/remarkPlainLinks.ts @@ -1,20 +1,55 @@ import findAndReplace from "mdast-util-find-and-replace" -export const urlRegex = /https?:\/\/([\w-]+\.)+[\w-]+(\/[\w\- .\+/?:%&=~#]*)?/ +// This regex matches: +// "http" +// an optional "s" +// two / characters +// The subdomains and hostname: Any word or numeric character or 
"_" or "-" one or more times followed by a period +// The TLD: Any word or numeric character or "_" or "-" one or more times +// The path, query string and fragment: A forward slash followed by any word or numeric character (unicode classes so umlauts like ö match +// as well as any of the following: .+?:%&=~#) zero or more times. Note that we exclude space even though that is valid in a URL but it tends +// to make the match too greedy. +// Note that this URL will tend to match too much at the end - if there is a URL at the end of a sentence, the period will +// be matched by the regex. We accept this and handle it below when we create the link. +// This URL could be made to not match trailing characters like .?: but AFAIK this requires a negative lookbehind which are not supported in +// Safari before 16.4 which came out in Spring 2023 +export const urlRegex = + /https?:\/\/([\w-]+\.)+[\w-]+(\/[\p{L}\p{N}_\-.\+/?:%&=~#]*)?/gu export function remarkPlainLinks() { const turnIntoLink = (value: any, _match: string) => { + // Split off any trailing .?: characters and add them back after the link + const isSentenceInterpunctation = (c: string) => + c === "." || c === "?" || c === ":" + let i + for ( + i = value.length - 1; + i >= 0 && isSentenceInterpunctation(value[i]); + i-- + ) {} + const link = value.slice(0, i + 1) + const rest = value.slice(i + 1) + const restText = + rest.length > 0 + ? 
[ + { + type: "text", + value: rest, + }, + ] + : [] return [ { type: "link", - url: value, + url: link, children: [ { type: "text", - value: value, + value: link, }, ], }, + ...restText, ] } return (tree: any) => { From 944d5b2ad048e77eb16d31bc10e18e976968bb20 Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Tue, 7 Nov 2023 19:48:41 +0100 Subject: [PATCH 3/3] :hammer: simplify regex solution for urls with trailing period etc --- .../src/markdown/remarkPlainLinks.ts | 35 ++++--------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/packages/@ourworldindata/components/src/markdown/remarkPlainLinks.ts b/packages/@ourworldindata/components/src/markdown/remarkPlainLinks.ts index bc2e27744be..b0b112b58cf 100644 --- a/packages/@ourworldindata/components/src/markdown/remarkPlainLinks.ts +++ b/packages/@ourworldindata/components/src/markdown/remarkPlainLinks.ts @@ -9,47 +9,26 @@ import findAndReplace from "mdast-util-find-and-replace" // The path, query string and fragment: A forward slash followed by any word or numeric character (unicode classes so umlauts like ö match // as well as any of the following: .+?:%&=~#) zero or more times. Note that we exclude space even though that is valid in a URL but it tends // to make the match too greedy. -// Note that this URL will tend to match too much at the end - if there is a URL at the end of a sentence, the period will -// be matched by the regex. We accept this and handle it below when we create the link. -// This URL could be made to not match trailing characters like .?: but AFAIK this requires a negative lookbehind which are not supported in -// Safari before 16.4 which came out in Spring 2023 +// We match the same subgroup [\p{L}\p{N}_\-.\+/?:%&=~#] twice, once with a * and then exactly once but without punctuation characters .?: +// This is to make sure that we don't match trailing punctuation as part of the URL ("This is an http://example.com." 
- note that the trailing +// period should not be part of the URL) +// Finally, the very last part is a lone forward slash which would not be matched by the previous subgroup. export const urlRegex = - /https?:\/\/([\w-]+\.)+[\w-]+(\/[\p{L}\p{N}_\-.\+/?:%&=~#]*)?/gu + /https?:\/\/([\w-]+\.)+[\w-]+((\/[\p{L}\p{N}_\-.\+/?:%&=~#]*[\p{L}\p{N}_\-\+/%&=~#])|\/)?/gu export function remarkPlainLinks() { const turnIntoLink = (value: any, _match: string) => { - // Split off any trailing .?: characters and add them back after the link - const isSentenceInterpunctation = (c: string) => - c === "." || c === "?" || c === ":" - let i - for ( - i = value.length - 1; - i >= 0 && isSentenceInterpunctation(value[i]); - i-- - ) {} - const link = value.slice(0, i + 1) - const rest = value.slice(i + 1) - const restText = - rest.length > 0 - ? [ - { - type: "text", - value: rest, - }, - ] - : [] return [ { type: "link", - url: link, + url: value, children: [ { type: "text", - value: link, + value: value, }, ], }, - ...restText, ] } return (tree: any) => {