Skip to content

Commit

Permalink
Markdown: turn plain URLs into links (#2884)
Browse files Browse the repository at this point in the history
By default, plain links like https://domain.com are not rendered as links in markdown (see the [commonmark spec](https://spec.commonmark.org/0.30/#autolinks)); they have to be written in the `[text](https://domain.com)` style instead.

This PR adds autolinking for plain links. This is done in two different ways:
- once for the SimpleMarkdown component that uses react-remark via a remark transformation plugin
- and once for the MarkdownTextWrap component by doing the translation as part of the mapping to IRNodes

In theory it should be possible to use the remark plugin in both cases, but navigating the set of interdependent libraries as they stood two years ago is a pain, so I went for two bespoke versions. When we upgrade our setup to be ESM everywhere, we can upgrade to the latest versions of unified, react-remark and related libraries and try again to use the same code in both cases.
  • Loading branch information
danyx23 authored Nov 7, 2023
2 parents bed3489 + 944d5b2 commit 0541da5
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 6 deletions.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@
"lodash": "^4.17.20",
"mathjax-full": "^3.1.0",
"md5": "^2.3.0",
"mdast-util-find-and-replace": "1.1.1",
"mdast-util-from-markdown": "^0.8.0",
"minimist": "^1.2.6",
"mobx": "^5.15.7",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import { TextWrap } from "../TextWrap/TextWrap.js"
import fromMarkdown from "mdast-util-from-markdown"
import type { Root, Content } from "mdast"
import { match } from "ts-pattern"
import { urlRegex } from "../markdown/remarkPlainLinks.js"

const SUPERSCRIPT_NUMERALS = {
"0": "\u2070",
Expand Down Expand Up @@ -895,12 +896,13 @@ function convertMarkdownNodeToIRTokens(
(item) => {
const splitted = item.value.split(/\s+/)
const tokens = splitted.flatMap((text, i) => {
const textNode = new IRText(text, fontParams)
const node = text.match(urlRegex)
? new IRLink(text, [textNode], fontParams)
: textNode
if (i < splitted.length - 1) {
return [
new IRText(text, fontParams),
new IRWhitespace(fontParams),
]
} else return [new IRText(text, fontParams)]
return [node, new IRWhitespace(fontParams)]
} else return [node]
})
return tokens
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import React from "react"
import { computed } from "mobx"
import { Remark } from "react-remark"
import { remarkPlainLinks } from "./markdown/remarkPlainLinks.js"

type SimpleMarkdownTextProps = {
text: string
Expand All @@ -12,6 +13,6 @@ export class SimpleMarkdownText extends React.Component<SimpleMarkdownTextProps>
}

render(): JSX.Element | null {
return <Remark>{this.text}</Remark>
return <Remark remarkPlugins={[remarkPlainLinks]}>{this.text}</Remark>
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
// Shorthand ambient module declaration: it has no body, so everything imported
// from "mdast-util-find-and-replace" is typed as `any` and calls into the
// package are not type-checked.
declare module "mdast-util-find-and-replace"
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import findAndReplace from "mdast-util-find-and-replace"

// Matches plain http(s) URLs embedded in running text. Piece by piece, the regex matches:
// "http"
// an optional "s"
// a colon and two / characters
// The subdomains and hostname: one or more labels, each made of one or more ASCII word characters (letters, digits, "_")
// or "-" and followed by a period. At least one period is therefore required (a bare "http://localhost" does not match).
// The TLD: one or more of the same characters. A port (":8080") is not part of the host pattern, so a match ends
// before it.
// The path, query string and fragment (optional): a forward slash followed by zero or more characters out of
// letters and digits (the unicode classes \p{L} and \p{N}, so that umlauts like ö match as well) and _-.+/?:%&=~#
// Note that we exclude space even though that is valid in a URL, because it tends to make the match too greedy.
// We match almost the same character class twice: first [\p{L}\p{N}_\-.\+/?:%&=~#] with a *, and then exactly once
// more but without the punctuation characters . ? and :
// This makes sure that we don't match trailing punctuation as part of the URL ("This is an http://example.com/a." -
// note that the trailing period should not be part of the URL).
// Finally, the very last alternative is a lone forward slash directly after the host, which would not be matched by
// the previous subgroup because that one requires at least one character after the slash.
// Flags: "g" finds every URL in a string rather than just the first; "u" is required for the \p{...} classes.
// Because of "g" the regex object carries `lastIndex` state when used with .test() or .exec().
export const urlRegex =
/https?:\/\/([\w-]+\.)+[\w-]+((\/[\p{L}\p{N}_\-.\+/?:%&=~#]*[\p{L}\p{N}_\-\+/%&=~#])|\/)?/gu

/**
 * Remark plugin that turns plain URLs found in text into links.
 *
 * @returns a transformer that runs `findAndReplace` over the given tree with
 * `urlRegex`; each match is replaced by a single `link` node whose `url` and
 * visible text are both the matched string.
 */
export function remarkPlainLinks(): (tree: unknown) => void {
    // Replacement callback handed to findAndReplace. Its first argument is
    // used as both the link target and the link text. The replacement is
    // returned as a one-element array of nodes.
    const turnIntoLink = (url: string) => [
        {
            type: "link",
            url,
            children: [{ type: "text", value: url }],
        },
    ]

    // The transformer returns nothing; findAndReplace is handed the tree itself.
    return (tree: unknown): void => {
        findAndReplace(tree, [[urlRegex, turnIntoLink]])
    }
}
12 changes: 12 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -10518,6 +10518,7 @@ __metadata:
lodash: "npm:^4.17.20"
mathjax-full: "npm:^3.1.0"
md5: "npm:^2.3.0"
mdast-util-find-and-replace: "npm:1.1.1"
mdast-util-from-markdown: "npm:^0.8.0"
minimist: "npm:^1.2.6"
mobx: "npm:^5.15.7"
Expand Down Expand Up @@ -13597,6 +13598,17 @@ __metadata:
languageName: node
linkType: hard

"mdast-util-find-and-replace@npm:1.1.1":
version: 1.1.1
resolution: "mdast-util-find-and-replace@npm:1.1.1"
dependencies:
escape-string-regexp: "npm:^4.0.0"
unist-util-is: "npm:^4.0.0"
unist-util-visit-parents: "npm:^3.0.0"
checksum: e4c9e50d9bce5ae4c728a925bd60080b94d16aaa312c27e2b70b16ddc29a5d0a0844d6e18efaef08aeb22c68303ec528f20183d1b0420504a0c2c1710cebd76f
languageName: node
linkType: hard

"mdast-util-from-markdown@npm:^0.8.0":
version: 0.8.5
resolution: "mdast-util-from-markdown@npm:0.8.5"
Expand Down

0 comments on commit 0541da5

Please sign in to comment.