Markdown: turn plain URLs into links (#2884)

By default, plain links like https://domain.com are not shown as links in markdown (see [commonmark spec](https://spec.commonmark.org/0.30/#autolinks)) and instead have to be written in the `[text](https://domain.com)` style. This PR adds autolinking for plain links. This is done in two different ways: once for the SimpleMarkdown component, which uses react-remark, via a remark transformation plugin, and once for the MarkdownTextWrap component, by doing the translation as part of the mapping to IRNodes. In theory it should be possible to use the remark plugin in both cases, but navigating the set of interdependent libraries as they were two years ago is a pain, so I went for the two bespoke versions. When we upgrade our setup to be ESM everywhere, we can upgrade to the latest versions of unified, react-remark and the related libraries and try again to use the same code in both cases.
owid · Nov 7, 2023 · 0541da5 · 0541da5
2 parents bed3489 + 944d5b2
commit 0541da5
Show file tree

Hide file tree

Showing 6 changed files with 60 additions and 6 deletions.
diff --git a/package.json b/package.json
@@ -155,6 +155,7 @@
         "lodash": "^4.17.20",
         "mathjax-full": "^3.1.0",
         "md5": "^2.3.0",
+        "mdast-util-find-and-replace": "1.1.1",
         "mdast-util-from-markdown": "^0.8.0",
         "minimist": "^1.2.6",
         "mobx": "^5.15.7",

diff --git a/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.tsx b/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.tsx
@@ -17,6 +17,7 @@ import { TextWrap } from "../TextWrap/TextWrap.js"
 import fromMarkdown from "mdast-util-from-markdown"
 import type { Root, Content } from "mdast"
 import { match } from "ts-pattern"
+import { urlRegex } from "../markdown/remarkPlainLinks.js"
 
 const SUPERSCRIPT_NUMERALS = {
     "0": "\u2070",
@@ -895,12 +896,13 @@ function convertMarkdownNodeToIRTokens(
             (item) => {
                 const splitted = item.value.split(/\s+/)
                 const tokens = splitted.flatMap((text, i) => {
+                    const textNode = new IRText(text, fontParams)
+                    const node = text.match(urlRegex)
+                        ? new IRLink(text, [textNode], fontParams)
+                        : textNode
                     if (i < splitted.length - 1) {
-                        return [
-                            new IRText(text, fontParams),
-                            new IRWhitespace(fontParams),
-                        ]
-                    } else return [new IRText(text, fontParams)]
+                        return [node, new IRWhitespace(fontParams)]
+                    } else return [node]
                 })
                 return tokens
             }

diff --git a/packages/@ourworldindata/components/src/SimpleMarkdownText.tsx b/packages/@ourworldindata/components/src/SimpleMarkdownText.tsx
@@ -1,6 +1,7 @@
 import React from "react"
 import { computed } from "mobx"
 import { Remark } from "react-remark"
+import { remarkPlainLinks } from "./markdown/remarkPlainLinks.js"
 
 type SimpleMarkdownTextProps = {
     text: string
@@ -12,6 +13,6 @@ export class SimpleMarkdownText extends React.Component<SimpleMarkdownTextProps>
     }
 
     render(): JSX.Element | null {
-        return <Remark>{this.text}</Remark>
+        return <Remark remarkPlugins={[remarkPlainLinks]}>{this.text}</Remark>
     }
 }
diff --git a/packages/@ourworldindata/components/src/markdown/mdast-util-find-and-replace.d.ts b/packages/@ourworldindata/components/src/markdown/mdast-util-find-and-replace.d.ts
@@ -0,0 +1 @@
+declare module "mdast-util-find-and-replace" // shorthand ambient module declaration: everything imported from this module is typed as `any`
diff --git a/packages/@ourworldindata/components/src/markdown/remarkPlainLinks.ts b/packages/@ourworldindata/components/src/markdown/remarkPlainLinks.ts
@@ -0,0 +1,37 @@
+import findAndReplace from "mdast-util-find-and-replace"
+
+// This regex matches:
+//   "http", an optional "s", then "://"
+//   The subdomains and hostname: Any word or numeric character or "_" or "-" (\w, so ASCII only) one or more times followed by a period
+//   The TLD: Any word or numeric character or "_" or "-" one or more times
+//   The path, query string and fragment: A forward slash followed by any letter or numeric character (unicode classes, so umlauts like ö match)
+//       or any of the following: _-.+/?:%&=~# zero or more times. Note that we exclude space even though that is valid in a URL, because it tends
+//       to make the match too greedy.
+//       We match the same subgroup [\p{L}\p{N}_\-.\+/?:%&=~#] twice, once with a * and then exactly once but without the punctuation characters .?:
+//       This is to make sure that we don't match trailing punctuation as part of the URL ("This is an http://example.com." - note that the trailing
+//       period should not be part of the URL)
+//       Finally, the very last part is a lone forward slash which would not be matched by the previous subgroup.
+// NOTE: the "g" flag makes this regex stateful (lastIndex) when it is used with RegExp.prototype.test/exec;
+//       String.prototype.match and String.prototype.replace reset lastIndex, so they are safe to call repeatedly.
+export const urlRegex =
+    /https?:\/\/([\w-]+\.)+[\w-]+((\/[\p{L}\p{N}_\-.\+/?:%&=~#]*[\p{L}\p{N}_\-\+/%&=~#])|\/)?/gu
+/** Remark plugin: returns a tree transformer that runs findAndReplace with urlRegex, swapping each match for a link node. */
+export function remarkPlainLinks() {
+    const turnIntoLink = (value: any, _match: string) => { // NOTE(review): value is presumably the full matched URL and _match the first capture group - confirm against find-and-replace 1.1.1
+        return [
+            {
+                type: "link",
+                url: value, // the same string is used as the link target...
+                children: [
+                    {
+                        type: "text",
+                        value: value, // ...and as the visible link text
+                    },
+                ],
+            },
+        ]
+    }
+    return (tree: any) => { // nothing is returned here, so this relies on findAndReplace modifying `tree` in place
+        findAndReplace(tree, [[urlRegex, turnIntoLink]])
+    }
+}
diff --git a/yarn.lock b/yarn.lock
@@ -10518,6 +10518,7 @@ __metadata:
     lodash: "npm:^4.17.20"
     mathjax-full: "npm:^3.1.0"
     md5: "npm:^2.3.0"
+    mdast-util-find-and-replace: "npm:1.1.1"
     mdast-util-from-markdown: "npm:^0.8.0"
     minimist: "npm:^1.2.6"
     mobx: "npm:^5.15.7"
@@ -13597,6 +13598,17 @@ __metadata:
   languageName: node
   linkType: hard
 
+"mdast-util-find-and-replace@npm:1.1.1":
+  version: 1.1.1
+  resolution: "mdast-util-find-and-replace@npm:1.1.1"
+  dependencies:
+    escape-string-regexp: "npm:^4.0.0"
+    unist-util-is: "npm:^4.0.0"
+    unist-util-visit-parents: "npm:^3.0.0"
+  checksum: e4c9e50d9bce5ae4c728a925bd60080b94d16aaa312c27e2b70b16ddc29a5d0a0844d6e18efaef08aeb22c68303ec528f20183d1b0420504a0c2c1710cebd76f
+  languageName: node
+  linkType: hard
+
 "mdast-util-from-markdown@npm:^0.8.0":
   version: 0.8.5
   resolution: "mdast-util-from-markdown@npm:0.8.5"