diff --git a/.prettierignore b/.prettierignore index 39ebada93d8..e37f3f047b1 100644 --- a/.prettierignore +++ b/.prettierignore @@ -14,3 +14,4 @@ coverage packages/@ourworldindata/*/dist/ dist/ grapherData/ +.vscode/ diff --git a/.vscode/launch.json b/.vscode/launch.json index 9cb3e25081e..4ef08867736 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -10,7 +10,9 @@ "name": "Jest Tests", "program": "${workspaceFolder}/node_modules/jest/bin/jest.js", "request": "launch", - "skipFiles": ["/**"], + "skipFiles": [ + "/**" + ], "type": "node" }, { @@ -19,7 +21,10 @@ "request": "launch", "name": "Jest Test current file", "program": "${workspaceFolder}/node_modules/.bin/jest", - "args": ["${fileBasenameNoExtension}.js", "--watch"], + "args": [ + "${fileBasenameNoExtension}.js", + "--watch" + ], "console": "integratedTerminal" // "internalConsoleOptions": "neverOpen" }, @@ -53,23 +58,32 @@ "name": "Run migrate WP to ArchieML", "program": "${workspaceFolder}/itsJustJavascript/db/migrateWpPostsToArchieMl.js", "request": "launch", - "skipFiles": ["/**"], + "skipFiles": [ + "/**" + ], "type": "node" }, { "name": "Sync WP posts to grapher", "program": "${workspaceFolder}/itsJustJavascript/db/syncPostsToGrapher.js", "request": "launch", - "skipFiles": ["/**"], + "skipFiles": [ + "/**" + ], "type": "node" }, { "name": "Run SVGTester", "program": "${workspaceFolder}/itsJustJavascript/devTools/svgTester/verify-graphs.js", "request": "launch", - "skipFiles": ["/**"], + "skipFiles": [ + "/**" + ], "type": "node", - "args": ["-g", "367"] + "args": [ + "-g", + "367" + ] }, { "name": "Launch admin server", diff --git a/adminSiteClient/ChartEditorPage.tsx b/adminSiteClient/ChartEditorPage.tsx index 7d5406aecee..db273e8ad81 100644 --- a/adminSiteClient/ChartEditorPage.tsx +++ b/adminSiteClient/ChartEditorPage.tsx @@ -10,7 +10,6 @@ import { IReactionDisposer, } from "mobx" import { Prompt, Redirect } from "react-router-dom" -import { extractDetailsFromSyntax } from "@ourworldindata/components" import { Bounds, capitalize, @@ -19,6 +18,7 @@ import { get, set, groupBy, + extractDetailsFromSyntax, } from "@ourworldindata/utils" import { Grapher, Topic, GrapherInterface } from "@ourworldindata/grapher" import { Admin } from "./Admin.js" diff --git a/baker/SiteBaker.tsx b/baker/SiteBaker.tsx index d73fa5fcddb..b5cb6e0f5e1 100644 --- a/baker/SiteBaker.tsx +++ b/baker/SiteBaker.tsx @@ -45,8 +45,9 @@ import { OwidGdocPublished, clone, LinkedChart, + extractDetailsFromSyntax, } from "@ourworldindata/utils" -import { extractDetailsFromSyntax } from "@ourworldindata/components" + import { execWrapper } from "../db/execWrapper.js" import { countryProfileSpecs } from "../site/countryProfileProjects.js" import { diff --git a/devTools/dodParserTestGenerator/generate-test.ts b/devTools/dodParserTestGenerator/generate-test.ts deleted file mode 100644 index 6fc0e6e34ae..00000000000 --- a/devTools/dodParserTestGenerator/generate-test.ts +++ /dev/null @@ -1,37 +0,0 @@ -import { mdParser } from "@ourworldindata/components" - -import parseArgs from "minimist" -async function main(parsedArgs: parseArgs.ParsedArgs) { - const parseString = parsedArgs._[0] - const result = mdParser.markdown.parse(parseString) - const description = parsedArgs["d"] - if (parsedArgs["result-only"]) - console.log(JSON.stringify(result, undefined, 2)) - else - console.log(` - it(${description || "parses markdown correctly"}, () => { - expect(mdParser.markdown.parse("${parseString}")).toEqual( - ${JSON.stringify(result, undefined, 2)} - ) - }) - `) -} - -const parsedArgs = parseArgs(process.argv.slice(2), { - boolean: true, -}) - -if (parsedArgs["h"] || parsedArgs["help"]) { - console.log(`generate-tests.js - utility to generate tests for the DoD parser from an input text - -Usage: - dump-data.js (--result-only) (-d "Test description") '[test](hover::cat::term)' - -Options: - --result-only Only output the parse result, not the test case chrome around it - -d DESC Use the given desription for the test - `) - process.exit(0) -} else { - main(parsedArgs) -} diff --git a/devTools/dodParserTestGenerator/tsconfig.json b/devTools/dodParserTestGenerator/tsconfig.json deleted file mode 100644 index e0d38a4ff33..00000000000 --- a/devTools/dodParserTestGenerator/tsconfig.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "extends": "../tsconfigs/tsconfig.base.json", - "compilerOptions": { - "outDir": "../../itsJustJavascript/devTools/dodParserTestGenerator", - "rootDir": "." - }, - "references": [{ "path": "../../settings" }] -} diff --git a/package.json b/package.json index 13a836cc626..d99adf85805 100644 --- a/package.json +++ b/package.json @@ -82,6 +82,7 @@ "@types/jsonwebtoken": "^9.0.0", "@types/lodash": "^4.14.185", "@types/md5": "^2.3.2", + "@types/mdast": "^3.0", "@types/minimist": "^1.2.2", "@types/mousetrap": "^1.6.9", "@types/mysql": "^2.15.21", @@ -154,6 +155,7 @@ "lodash": "^4.17.20", "mathjax-full": "^3.1.0", "md5": "^2.3.0", + "mdast-util-from-markdown": "^0.8.0", "minimist": "^1.2.6", "mobx": "^5.15.7", "mobx-formatters": "^1.0.2", diff --git a/packages/@ourworldindata/components/src/GdocsUtils.ts b/packages/@ourworldindata/components/src/GdocsUtils.ts index 4eeaff0e7d1..ac040b0a8f2 100644 --- a/packages/@ourworldindata/components/src/GdocsUtils.ts +++ b/packages/@ourworldindata/components/src/GdocsUtils.ts @@ -1,18 +1,11 @@ import { spansToUnformattedPlainText, gdocUrlRegex, - EnrichedBlockText, OwidGdocLinkJSON, Span, Url, } from "@ourworldindata/utils" import urlSlug from "url-slug" -import { - EveryMarkdownNode, - MarkdownRoot, - mdParser, -} from "./MarkdownTextWrap/parser.js" -import { P, match } from "ts-pattern" export function getLinkType(urlString: string): OwidGdocLinkJSON["linkType"] { const url = Url.fromURL(urlString) @@ -50,138 +43,3 @@ export function getUrlTarget(urlString: string): string { export function convertHeadingTextToId(headingText: Span[]): string { return urlSlug(spansToUnformattedPlainText(headingText)) } - -const convertMarkdownNodeToSpan = (node: EveryMarkdownNode): Span[] => { - return match(node) - .with( - { - type: "text", - }, - (n) => [ - { - spanType: "span-simple-text" as const, - text: n.value, - } as Span, - ] - ) - .with( - { - type: "textSegments", - }, - (n) => n.children.flatMap(convertMarkdownNodeToSpan) as Span[] - ) - .with( - { - type: "newline", - }, - () => [ - { - spanType: "span-simple-text" as const, - text: "\n", - } as Span, - ] - ) - .with( - { - type: "whitespace", - }, - () => [ - { - spanType: "span-simple-text" as const, - text: " ", - } as Span, - ] - ) - .with( - { - type: "detailOnDemand", - }, - (n) => [ - { - spanType: "span-dod" as const, - id: n.term, - children: n.children.flatMap(convertMarkdownNodeToSpan), - } as Span, - ] - ) - .with( - { - type: "markdownLink", - }, - (n) => [ - { - spanType: "span-link" as const, - url: n.href, - children: n.children.flatMap(convertMarkdownNodeToSpan), - } as Span, - ] - ) - .with( - { - type: "plainUrl", - }, - (n) => [ - { - spanType: "span-link" as const, - url: n.href, - children: [ - { - spanType: "span-simple-text" as const, - text: n.href, - }, - ], - } as Span, - ] - ) - .with( - { - type: P.union("italic", "plainItalic", "italicWithoutBold"), - }, - (n) => [ - { - spanType: "span-italic" as const, - children: n.children.flatMap(convertMarkdownNodeToSpan), - } as Span, - ] - ) - .with( - { - type: P.union("bold", "plainBold", "boldWithoutItalic"), - }, - (n) => [ - { - spanType: "span-bold" as const, - children: n.children.flatMap(convertMarkdownNodeToSpan), - } as Span, - ] - ) - .exhaustive() - //.otherwise(() => ({ spanType: "span-simple-text" as const, text: "" })) -} - -const convertMarkdownNodesToSpans = (nodes: MarkdownRoot): Span[] => - nodes.children.flatMap(convertMarkdownNodeToSpan) - -export const markdownToEnrichedTextBlock = ( - markdown: string -): EnrichedBlockText => { - const parsedMarkdown = mdParser.markdown.parse(markdown) - if (parsedMarkdown.status) { - const spans = convertMarkdownNodesToSpans(parsedMarkdown.value) - return { - type: "text", - value: spans, - parseErrors: [], - } - } else - return { - type: "text", - value: [], - parseErrors: [ - { - message: `Failed to parse markdown - expected ${parsedMarkdown.expected} at ${parsedMarkdown.index}`, - isWarning: false, - }, - ], - } -} diff --git a/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.stories.tsx b/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.stories.tsx index 9397766a348..314fa6b32c0 100644 --- a/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.stories.tsx +++ b/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.stories.tsx @@ -3,10 +3,9 @@ import { action, computed, observable } from "mobx" import { observer } from "mobx-react" import { MarkdownTextWrap, - parsimmonToTextTokens, IRToken, + convertMarkdownToIRTokens, } from "./MarkdownTextWrap" -import { mdParser } from "./parser.js" import { TextWrap } from "../TextWrap/TextWrap.js" export default { @@ -54,11 +53,8 @@ _THE END_ } @computed get tokens(): IRToken[] { - const result = mdParser.markdown.parse(this.markdown) - if (result.status) { - return parsimmonToTextTokens(result.value.children) - } - return [] + const result = convertMarkdownToIRTokens(this.markdown) + return result } render(): JSX.Element { diff --git a/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.test.ts b/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.test.ts index 0dd456d5c83..05da1aaffe3 100644 --- a/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.test.ts +++ b/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.test.ts @@ -83,7 +83,7 @@ describe("MarkdownTextWrap", () => { it("should split on newline", () => { const element = new MarkdownTextWrap({ - text: "_test\n**\nnewline\n**_test", + text: "_test\n**\nnewlineyarn \n**_test", fontSize: 10, lineHeight: 1, }) diff --git a/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.tsx b/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.tsx index 1b70e5bd10e..81239787b57 100644 --- a/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.tsx +++ b/packages/@ourworldindata/components/src/MarkdownTextWrap/MarkdownTextWrap.tsx @@ -1,6 +1,5 @@ import React, { CSSProperties } from "react" import { computed } from "mobx" -import { EveryMarkdownChildNode, MarkdownRoot, mdParser } from "./parser.js" import { excludeUndefined, last, @@ -11,8 +10,13 @@ import { get, Bounds, FontFamily, + dropWhile, + dropRightWhile, } from "@ourworldindata/utils" import { TextWrap } from "../TextWrap/TextWrap.js" +import fromMarkdown from "mdast-util-from-markdown" +import type { Root, Content } from "mdast" +import { match } from "ts-pattern" const SUPERSCRIPT_NUMERALS = { "0": "\u2070", @@ -57,7 +61,7 @@ export class IRText implements IRToken { return Bounds.forText(this.text, this.fontParams).width } @imemo get height(): number { - return this.fontParams?.fontSize || 16 + return this.fontParams?.fontSize || 13 } getBreakpointBefore(): undefined { return undefined @@ -493,63 +497,6 @@ export const sumTextWrapHeights = ( sum(elements.map((element) => element.height)) + (elements.length - 1) * spacer -export function parsimmonToTextTokens( - nodes: EveryMarkdownChildNode[], - fontParams?: IRFontParams -): IRToken[] { - return nodes.map((node): IRToken => { - if (node.type === "text") { - return new IRText(node.value, fontParams) - } else if (node.type === "newline") { - return new IRLineBreak() - } else if (node.type === "whitespace") { - return new IRWhitespace(fontParams) - } else if ( - node.type === "bold" || - node.type === "plainBold" || - node.type === "boldWithoutItalic" - ) { - return new IRBold( - parsimmonToTextTokens(node.children, { - ...fontParams, - fontWeight: 700, - }) - ) - } else if ( - node.type === "italic" || - node.type === "plainItalic" || - node.type === "italicWithoutBold" - ) { - return new IRItalic( - parsimmonToTextTokens(node.children, { - ...fontParams, - isItalic: true, - }) - ) - } else if (node.type === "plainUrl") { - return new IRLink( - node.href, - parsimmonToTextTokens( - [{ type: "text", value: node.href }], - fontParams - ) - ) - } else if (node.type === "markdownLink") { - return new IRLink( - node.href, - parsimmonToTextTokens(node.children, fontParams) - ) - } else if (node.type === "detailOnDemand") { - return new IRDetailOnDemand( - node.term, - parsimmonToTextTokens(node.children, fontParams) - ) - } else { - throw new Error(`Unknown node type: ${(node as any).type}`) - } - }) -} - type MarkdownTextWrapProps = { text: string fontSize: number @@ -579,26 +526,42 @@ export class MarkdownTextWrap extends React.Component { } } @computed get text(): string { - return this.props.text + // NOTE: ❗Here we deviate from the normal markdown spec. We replace \n with \n to make sure that single \n are treated as + // actual line breaks but only if none of the other markdown line break rules apply. + // This is a bit different to how markdown usually works but we have a substantial + // amount of legacy charts that use newlines in this way and it seems that it is + // better to support this simple case than to do a data migration of many chart subtitles. + const baseText = this.props.text + // This replace is a bit funky - we want to make sure that single \n are treated as + // actual line breaks but only if none of the other markdown line break rules apply. + // These are: + // - \n\n is always a new paragraph + // - Two spaces before \n is a line break (this rule is not entirely checked as we only check for a single space) + // - A backslash before \n is a line break + // The code below normalizes all cases to \n which will lead to them surviving the markdown parsing + let text = baseText.trim() + text = text.replaceAll("\n\n", "@@LINEBREAK@@") + text = text.replaceAll("\\\n", "@@LINEBREAK@@") + text = text.replaceAll(" \n", "@@LINEBREAK@@") + text = text.replaceAll("\n", " \n") + text = text.replaceAll("@@LINEBREAK@@", " \n") + return text } @computed get detailsOrderedByReference(): Set { return this.props.detailsOrderedByReference || new Set() } - @computed get ast(): MarkdownRoot["children"] { - if (!this.text) return [] - const result = mdParser.markdown.parse(this.props.text) - if (result.status) { - return result.value.children - } - return [] - } @computed get plaintext(): string { return this.htmlLines.map(lineToPlaintext).join("\n") } + @computed get tokensFromMarkdown(): IRToken[] { + const tokens = convertMarkdownToIRTokens(this.text, this.fontParams) + return tokens + } + @computed get htmlLines(): IRToken[][] { - const tokens = parsimmonToTextTokens(this.ast, this.fontParams) + const tokens = this.tokensFromMarkdown const lines = splitIntoLines(tokens, this.maxWidth) return lines.map(recursiveMergeTextTokens) } @@ -640,7 +603,7 @@ export class MarkdownTextWrap extends React.Component { return appendedTokens } - const tokens = parsimmonToTextTokens(this.ast, this.fontParams) + const tokens = this.tokensFromMarkdown const tokensWithReferenceNumbers = appendReferenceNumbers(tokens) return splitIntoLines(tokensWithReferenceNumbers, this.maxWidth) } @@ -740,3 +703,326 @@ function MarkdownTextWrapLine({ line }: { line: IRToken[] }): JSX.Element { ) } + +export function convertMarkdownToIRTokens( + markdown: string, + fontParams?: IRFontParams +): IRToken[] { + const ast: Root = fromMarkdown(markdown) + const children = ast.children.flatMap((item: Content) => + convertMarkdownNodeToIRTokens(item, fontParams) + ) + // ensure that there are no leading or trailing line breaks + return dropRightWhile( + dropWhile(children, (token) => token instanceof IRLineBreak), + (token) => token instanceof IRLineBreak + ) +} + +// When using mdast types version 4 this should be typed as: +// node: RootContentMap[keyof RootContentMap] +function convertMarkdownNodeToIRTokens( + node: Content, + fontParams: IRFontParams = {} +): IRToken[] { + const converted = match(node) + .with( + { + type: "blockquote", + }, + (item) => { + return item.children.flatMap((child) => + convertMarkdownNodeToIRTokens(child, fontParams) + ) + } + ) + .with( + { + type: "break", + }, + (_) => { + return [new IRLineBreak()] + } + ) + .with( + { + type: "code", + }, + (item) => { + return [new IRText(item.value, fontParams)] + } + ) + .with( + { + type: "emphasis", + }, + (item) => { + return [ + new IRItalic( + item.children.flatMap((child) => + convertMarkdownNodeToIRTokens(child, { + ...fontParams, + isItalic: true, + }) + ) + ), + ] + } + ) + .with( + { + type: "heading", + }, + (item) => { + return item.children.flatMap((child) => + convertMarkdownNodeToIRTokens(child, fontParams) + ) + } + ) + .with( + { + type: "html", + }, + (item) => { + return [new IRText(item.value, fontParams)] + } + ) + .with( + { + type: "image", + }, + (item) => { + return [new IRText(item.alt ?? "", fontParams)] + } + ) + .with( + { + type: "inlineCode", + }, + (item) => { + return [new IRText(item.value, fontParams)] + } + ) + .with( + { + type: "link", + }, + (item) => { + if (item.url.startsWith("#dod:")) { + const term = item.url.replace("#dod:", "") + return [ + new IRDetailOnDemand( + term, + item.children.flatMap((child) => + convertMarkdownNodeToIRTokens(child, fontParams) + ), + fontParams + ), + ] + } else + return [ + new IRLink( + item.url, + item.children.flatMap((child) => + convertMarkdownNodeToIRTokens(child, fontParams) + ) + ), + ] + } + ) + .with( + { + type: "list", + }, + (item) => { + if (item.ordered) + return item.children.flatMap((child, index) => [ + new IRLineBreak(), + new IRText(`${index + 1}) `, fontParams), + ...convertMarkdownNodeToIRTokens(child, fontParams), + ]) + else + return item.children.flatMap((child) => [ + new IRLineBreak(), + new IRText(`• `, fontParams), + ...convertMarkdownNodeToIRTokens(child, fontParams), + ]) + } + ) + .with( + { + type: "listItem", + }, + (item) => { + return item.children.flatMap((child) => + convertMarkdownNodeToIRTokens(child, fontParams) + ) + } + ) + .with( + { + type: "paragraph", + }, + (item) => { + return [ + ...item.children.flatMap((child) => + convertMarkdownNodeToIRTokens(child, fontParams) + ), + ] + } + ) + .with( + { + type: "strong", + }, + (item) => { + return [ + new IRBold( + item.children.flatMap((child) => + convertMarkdownNodeToIRTokens(child, { + ...fontParams, + fontWeight: 700, + }) + ) + ), + ] + } + ) + .with( + { + type: "text", + }, + (item) => { + const splitted = item.value.split(/\s+/) + const tokens = splitted.flatMap((text, i) => { + if (i < splitted.length - 1) { + return [ + new IRText(text, fontParams), + new IRWhitespace(fontParams), + ] + } else return [new IRText(text, fontParams)] + }) + return tokens + } + ) + .with( + { + type: "thematicBreak", + }, + (_) => { + return [new IRText("---", fontParams)] + } + ) + .with( + { + type: "delete", + }, + (item) => { + return item.children.flatMap((child) => + convertMarkdownNodeToIRTokens(child, fontParams) + ) + } + ) + // Now lets finish this with blocks for FootnoteDefinition, Definition, ImageReference, LinkReference, FootnoteReference, and Table + .with( + { + type: "footnoteDefinition", + }, + (item) => { + return item.children.flatMap((child) => + convertMarkdownNodeToIRTokens(child, fontParams) + ) + } + ) + .with( + { + type: "definition", + }, + (item) => { + return [ + new IRText(`${item.identifier}: ${item.label}`, fontParams), + ] + } + ) + .with( + { + type: "imageReference", + }, + (item) => { + return [ + new IRText(`${item.identifier}: ${item.label}`, fontParams), + ] + } + ) + .with( + { + type: "linkReference", + }, + (item) => { + return [ + new IRText(`${item.identifier}: ${item.label}`, fontParams), + ] + } + ) + .with( + { + type: "footnoteReference", + }, + (item) => { + return [ + new IRText(`${item.identifier}: ${item.label}`, fontParams), + ] + } + ) + .with( + { + type: "table", + }, + (item) => { + return item.children.flatMap((child) => + convertMarkdownNodeToIRTokens(child, fontParams) + ) + } + ) + .with( + { + type: "tableCell", + }, + (item) => { + return item.children.flatMap((child) => + convertMarkdownNodeToIRTokens(child, fontParams) + ) + } + ) + // and now TableRow and Yaml + .with( + { + type: "tableRow", + }, + (item) => { + return item.children.flatMap((child) => + convertMarkdownNodeToIRTokens(child, fontParams) + ) + } + ) + .with( + { + type: "yaml", + }, + (item) => { + return [new IRText(item.value, fontParams)] + } + ) + .with( + { + type: "footnote", + }, + (item) => { + return item.children.flatMap((child) => + convertMarkdownNodeToIRTokens(child, fontParams) + ) + } + ) + .exhaustive() + return converted +} diff --git a/packages/@ourworldindata/components/src/MarkdownTextWrap/parser.test.ts b/packages/@ourworldindata/components/src/MarkdownTextWrap/parser.test.ts deleted file mode 100644 index 4f87a89b3f0..00000000000 --- a/packages/@ourworldindata/components/src/MarkdownTextWrap/parser.test.ts +++ /dev/null @@ -1,1037 +0,0 @@ -import { mdParser } from "./parser" - -describe("mdast parsers", () => { - it("mdParser works for non-link brackets", () => { - expect(mdParser.markdown.parse("[some text]")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "text", - value: "[", - }, - { - type: "text", - value: "some", - }, - { - type: "whitespace", - }, - - { - type: "text", - value: "text", - }, - { - type: "text", - value: "]", - }, - ], - }, - }) - }) - it("mdParser works for funky characters in dod texts", () => { - expect( - mdParser.markdown.parse("[int.$ *?=😛§&/%ü€](#dod:term)") - ).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "detailOnDemand", - children: [ - { - type: "text", - value: "int.$", - }, - { - type: "whitespace", - }, - { - type: "text", - value: "*?=😛§&/%ü€", - }, - ], - term: "term", - }, - ], - }, - }) - }) - it("mdParser can parse a word", () => { - expect(mdParser.markdown.parse("word")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "text", - value: "word", - }, - ], - }, - }) - }) - it("mdParser can parse words with punctuation", () => { - expect(mdParser.markdown.parse("can't?")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "text", - value: "can't?", - }, - ], - }, - }) - - expect(mdParser.markdown.parse("'mid-west'")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "text", - value: "'mid-west'", - }, - ], - }, - }) - }) - it("mdParser can parse a word with bold", () => { - expect(mdParser.markdown.parse("**I'm bold as brass**")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "bold", - children: [ - { type: "text", value: "I'm" }, - { type: "whitespace" }, - { type: "text", value: "bold" }, - { type: "whitespace" }, - { type: "text", value: "as" }, - { type: "whitespace" }, - { type: "text", value: "brass" }, - ], - }, - ], - }, - }) - }) - it("mdParser can parse a phrase with italics", () => { - expect(mdParser.markdown.parse("_Mamma mia!_")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "italic", - children: [ - { type: "text", value: "Mamma" }, - { type: "whitespace" }, - { type: "text", value: "mia!" }, - ], - }, - ], - }, - }) - }) - it("mdParser can parse URLs", () => { - expect(mdParser.markdown.parse("www.google.com")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "plainUrl", - href: "www.google.com", - }, - ], - }, - }) - expect(mdParser.markdown.parse("[test](www.google.com)")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "markdownLink", - children: [{ type: "text", value: "test" }], - href: "www.google.com", - }, - ], - }, - }) - }) - it("can parse markdown links with relative URLs", () => { - expect(mdParser.markdown.parse("[about us](/about-us)")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - children: [ - { - type: "text", - value: "about", - }, - { - type: "whitespace", - }, - { - type: "text", - value: "us", - }, - ], - href: "/about-us", - type: "markdownLink", - }, - ], - }, - }) - expect(mdParser.markdown.parse("[test](www.google.com)")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "markdownLink", - children: [{ type: "text", value: "test" }], - href: "www.google.com", - }, - ], - }, - }) - }) - - it("mdParser can parse detail on demand syntax", () => { - expect(mdParser.markdown.parse("[**dod**](#dod:thing)")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "detailOnDemand", - children: [ - { - type: "plainBold", - children: [ - { - type: "text", - value: "dod", - }, - ], - }, - ], - term: "thing", - }, - ], - }, - }) - expect( - mdParser.markdown.parse("[a dod with multiple words](#dod:thing)") - ).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "detailOnDemand", - children: [ - { - type: "text", - value: "a", - }, - { type: "whitespace" }, - { - type: "text", - value: "dod", - }, - { type: "whitespace" }, - { - type: "text", - value: "with", - }, - { type: "whitespace" }, - { - type: "text", - value: "multiple", - }, - { type: "whitespace" }, - { - type: "text", - value: "words", - }, - ], - term: "thing", - }, - ], - }, - }) - }) - it("mdParser can parse words and newlines", () => { - expect( - mdParser.markdown.parse(`hello - -how **are** you?`) - ).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "text", - value: "hello", - }, - { - type: "newline", - }, - { - type: "newline", - }, - - { - type: "text", - value: "how", - }, - { type: "whitespace" }, - { - children: [ - { - type: "text", - value: "are", - }, - ], - type: "bold", - }, - { type: "whitespace" }, - - { - type: "text", - value: "you?", - }, - ], - }, - }) - }) - - it("mdParser can parse nested bold and italics", () => { - expect( - mdParser.markdown.parse( - "Hello _I am italicized and **I am bolded and italicized**_" - ) - ).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "text", - value: "Hello", - }, - { type: "whitespace" }, - { - children: [ - { - type: "text", - value: "I", - }, - { type: "whitespace" }, - { - type: "text", - value: "am", - }, - { type: "whitespace" }, - { - type: "text", - value: "italicized", - }, - { type: "whitespace" }, - { - type: "text", - value: "and", - }, - { type: "whitespace" }, - { - children: [ - { - type: "text", - value: "I", - }, - { type: "whitespace" }, - { - type: "text", - value: "am", - }, - { type: "whitespace" }, - { - type: "text", - value: "bolded", - }, - { type: "whitespace" }, - { - type: "text", - value: "and", - }, - { type: "whitespace" }, - { - type: "text", - value: "italicized", - }, - ], - type: "boldWithoutItalic", - }, - ], - type: "italic", - }, - ], - }, - }) - }) - - it("mdParser can parse links inside bold and italics", () => { - expect( - mdParser.markdown.parse( - "**_[bold and italic](www.ourworldindata.org)_**" - ) - ).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - children: [ - { - children: [ - { - children: [ - { - type: "text", - value: "bold", - }, - { type: "whitespace" }, - { - type: "text", - value: "and", - }, - { type: "whitespace" }, - { - type: "text", - value: "italic", - }, - ], - href: "www.ourworldindata.org", - type: "markdownLink", - }, - ], - type: "italicWithoutBold", - }, - ], - type: "bold", - }, - ], - }, - }) - - expect( - mdParser.markdown.parse("_**[italic and bold](www.google.com)**_") - ).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - children: [ - { - children: [ - { - children: [ - { - type: "text", - value: "italic", - }, - { - type: "whitespace", - }, - { - type: "text", - value: "and", - }, - { - type: "whitespace", - }, - { - type: "text", - value: "bold", - }, - ], - href: "www.google.com", - type: "markdownLink", - }, - ], - type: "boldWithoutItalic", - }, - ], - type: "italic", - }, - ], - }, - }) - }) - - it("mdParser can parse details on demand inside bold", () => { - expect( - mdParser.markdown.parse( - "**[an _italicized_ detail on demand](#dod:monad)**" - ) - ).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - children: [ - { - children: [ - { - type: "text", - value: "an", - }, - { - type: "whitespace", - }, - { - children: [ - { - type: "text", - value: "italicized", - }, - ], - type: "plainItalic", - }, - { - type: "whitespace", - }, - { - type: "text", - value: "detail", - }, - { - type: "whitespace", - }, - { - type: "text", - value: "on", - }, - { - type: "whitespace", - }, - { - type: "text", - value: "demand", - }, - ], - term: "monad", - type: "detailOnDemand", - }, - ], - type: "bold", - }, - ], - }, - }) - }) - - it("mdParser can parse words adjacent to bold", () => { - expect(mdParser.markdown.parse("**bold**-word")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "bold", - children: [{ type: "text", value: "bold" }], - }, - { type: "text", value: "-word" }, - ], - }, - }) - }) - - it("Parser can parse bold starting and stopping inside a word", () => { - expect(mdParser.markdown.parse("test**some**postfix")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "text", - value: "test", - }, - { - type: "bold", - children: [ - { - type: "text", - value: "some", - }, - ], - }, - { - type: "text", - value: "postfix", - }, - ], - }, - }) - }) - - it("parses unfinished bold correctly as text", () => { - expect(mdParser.markdown.parse("** unfinished bold")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "text", - value: "**", - }, - - { - type: "whitespace", - }, - - { - type: "text", - value: "unfinished", - }, - - { - type: "whitespace", - }, - - { - type: "text", - value: "bold", - }, - ], - }, - }) - }) - - it("parses unfinished bold with finished italic correctly", () => { - expect( - mdParser.markdown.parse("** unfinished bold _ italic _") - ).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "text", - value: "**", - }, - { - type: "whitespace", - }, - - { - type: "text", - value: "unfinished", - }, - - { - type: "whitespace", - }, - - { - type: "text", - value: "bold", - }, - - { - type: "whitespace", - }, - { - type: "italic", - children: [ - { - type: "whitespace", - }, - { - type: "text", - value: "italic", - }, - { - type: "whitespace", - }, - ], - }, - ], - }, - }) - }) - - it("parses nested-in-bold, non-spaced italics", () => { - expect(mdParser.markdown.parse("**one-_two_-three**")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - children: [ - { - type: "text", - value: "one-", - }, - { - children: [ - { - type: "text", - value: "two", - }, - ], - type: "italicWithoutBold", - }, - { - type: "text", - value: "-three", - }, - ], - type: "bold", - }, - ], - }, - }) - }) - - it("parses nested-in-italic, non-spaced bold", () => { - expect(mdParser.markdown.parse("_one-**two**-three_")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - children: [ - { - type: "text", - value: "one-", - }, - { - children: [ - { - type: "text", - value: "two", - }, - ], - type: "boldWithoutItalic", - }, - { - type: "text", - value: "-three", - }, - ], - type: "italic", - }, - ], - }, - }) - }) - - it("parses markdown links with just bold or just italic correctly and ignores nested bold/italic", () => { - expect( - mdParser.markdown.parse( - "[A **bold** _italic **nonnested**_ link](https://owid.io/test)" - ) - ).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "markdownLink", - children: [ - { - type: "text", - value: "A", - }, - { - type: "whitespace", - }, - { - type: "plainBold", - children: [ - { - type: "text", - value: "bold", - }, - ], - }, - { - type: "whitespace", - }, - { - type: "plainItalic", - children: [ - { - type: "text", - value: "italic", - }, - { - type: "whitespace", - }, - { - type: "text", - value: "**nonnested**", - }, - ], - }, - { - type: "whitespace", - }, - { - type: "text", - value: "link", - }, - ], - href: "https://owid.io/test", - }, - ], - }, - }) - }) - it("Parses nonbreaking spaces as text", () => { - expect(mdParser.markdown.parse("text with nonbreaking space")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "text", - value: "text", - }, - { - type: "whitespace", - }, - { - type: "text", - value: "with", - }, - { - type: "whitespace", - }, - { - type: "text", - value: "nonbreaking", - }, - { - type: "text", - value: " ", - }, - { - type: "text", - value: "space", - }, - ], - }, - }) - }) - it("Parses whitespace preceding a newline", () => { - const input = - "this-line-ends-with-a-space" + - " " + - "\n" + - "but-the-newline-should-be-tracked-separately" - expect(mdParser.markdown.parse(input)).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "text", - value: "this-line-ends-with-a-space", - }, - { - type: "whitespace", - }, - { - type: "newline", - }, - { - type: "text", - value: "but-the-newline-should-be-tracked-separately", - }, - ], - }, - }) - }) - it("Parses newlines surrounded by whitespace", () => { - const input = - "this-line-ends-with-a-space" + - " " + - "\n\n" + - " " + - "but-the-newline-should-be-tracked-separately" - expect(mdParser.markdown.parse(input)).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "text", - value: "this-line-ends-with-a-space", - }, - { - type: "whitespace", - }, - { - type: "newline", - }, - { - type: "newline", - }, - { - type: "whitespace", - }, - { - type: "text", - value: "but-the-newline-should-be-tracked-separately", - }, - ], - }, - }) - }) - it("parses link inside brackets", () => { - expect(mdParser.markdown.parse("[[link](www.google.com)]")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "text", - value: "[", - }, - { - type: "markdownLink", - children: [ - { - type: "text", - value: "link", - }, - ], - href: "www.google.com", - }, - { - type: "text", - value: "]", - }, - ], - }, - }) - }) - it("parses link inside parentheses", () => { - expect(mdParser.markdown.parse("([link](www.google.com))")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "text", - value: "(", - }, - { - type: "markdownLink", - children: [ - { - type: "text", - value: "link", - }, - ], - href: "www.google.com", - }, - { - type: "text", - value: ")", - }, - ], - }, - }) - }) - it("parses parens inside link inside parentheses", () => { - expect(mdParser.markdown.parse("([l(i)nk](www.google.com))")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "text", - value: "(", - }, - { - type: "markdownLink", - children: [ - { - type: "text", - value: "l(i)nk", - }, - ], - href: "www.google.com", - }, - { - type: "text", - value: ")", - }, - ], - }, - }) - }) - - it("parses parens next to closing bold sequence", () => { - expect(mdParser.markdown.parse("**Our World in Data (OWID)**")).toEqual( - { - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "bold", - children: [ - { type: "text", value: "Our" }, - { type: "whitespace" }, - { type: "text", value: "World" }, - { type: "whitespace" }, - { type: "text", value: "in" }, - { type: "whitespace" }, - { type: "text", value: "Data" }, - { type: "whitespace" }, - { type: "text", value: "(" }, - { type: "text", value: "OWID" }, - { type: "text", value: ")" }, - ], - }, - ], - }, - } - ) - }) - - it("parses too many underscores as text", () => { - expect(mdParser.markdown.parse("____abc__")).toEqual({ - status: true, - value: { - type: "MarkdownRoot", - children: [ - { - type: "text", - value: "____abc__", - }, - ], - }, - }) - }) -}) diff --git a/packages/@ourworldindata/components/src/MarkdownTextWrap/parser.ts b/packages/@ourworldindata/components/src/MarkdownTextWrap/parser.ts deleted file mode 100644 index e34a114b993..00000000000 --- a/packages/@ourworldindata/components/src/MarkdownTextWrap/parser.ts +++ /dev/null @@ -1,572 +0,0 @@ -import P from "parsimmon" -import { detailOnDemandRegex } from "@ourworldindata/utils" -// An AST inspired by MDAST -// Deviates because we want to track individual words, whitespace, and newlines to use with MarkdownTextWrap and our SVG exporter - -// How this parser works - -// This parser uses the parsimmon javascript library that implements a monadic parser combinator. -// We considered forking simple-markdown, but went with combinators for easier maintenance. -// You can think of a parser in this context as a generic class that parses an object of the type -// that is specified as the type parameter of the class. parser combinators build up more complex -// parsers by combining smaller parsers, mostly by specifying either sequences or alternatives. -// Alternatives (P.alt()) try a list of given parsers one at a time, backtracking when parsing with -// a given parser doesn't work and trying the next one. - -// Because of this it is important to consider the order of parsers in alternatives and to make -// sure that nested parsers fail when they should (e.g. when you have an opening ** for bold it -// is important that you make sure that you find a matching ** at the end in instead of -// accidentally consuming ** with a very generic parser that takes any token and that would -// then not let you match this end string fragment and close the bold tag). - -// By and large this parser tries to define a special type for every individual parser plus a -// parsing function. We don't really care about some of the differences (e.g. Text and NonBracketWord -// have to be parsed differently but result in the same shape of data). To make things more -// consistent though, every parser has it's own type name, even if they rather often just alias -// to Text. This should make it easy in the future to switch more parsed types to actual concrete -// types if we need a richer AST for some reason. - -// Parsing bold and italic in markdown is a bit more involved than most parsing jobs for actual -// programming languages that try harder to be parseable with a context free grammar. Consider -// that bold and italic can be nested in each other but it doesn't really make sense to nest -// bold in italic in bold (and this would create annoying ambiguity). For this reason this parser -// is quite explicit and has 3 different kinds of bold and italic: -// * one that can contain only contain text, whitespace and newlines -// * one that can also contain Urls, markdown links and Details on Demand but not other italic or bold -// * and finally one for the top level that can also contain the other one (bold that can have -// italic or italic that can have bold content) but in a non-nestable way - -// This might be overkill for our current needs but I wanted to err on the side of making the -// parser strict and precisise now to avoid weird ambiguities in the future. - -//#region Parser types - -// The default interface for nodes that (for now) we don't want to track as a special type -interface Text { - type: "text" - value: string -} - -// A special literal that we use when building lines with TextWrap -interface Newline { - type: "newline" -} - -// Another literal that's needed to know when to reinsert spaces (e.g. "**one**-two" versus "**one** -two") -interface Whitespace { - type: "whitespace" -} - -interface PlainUrl { - type: "plainUrl" - href: string -} - -type NonBracketWord = Text - -type NonParensWord = Text - -type NonSingleUnderscoreWord = Text - -type NonDoubleColonOrParensWord = Text - -type NonDoubleStarWord = Text - -type MarkdownLinkContent = Whitespace | Newline | NonBracketWord - -type DodCategory = Text - -type DodTerm = Text - -interface MarkdownLink { - type: "markdownLink" - children: MarkdownLinkContent[] - href: string -} - -type DetailsOnDemandContent = - | Whitespace - | Newline - | PlainItalic - | PlainBold - | NonBracketWord - -interface DetailOnDemand { - type: "detailOnDemand" - term: string - children: DetailsOnDemandContent[] -} - -type BoldWithoutItalicContent = - | Whitespace - | Newline - | PlainUrl - | MarkdownLink - | DetailOnDemand - | NonDoubleStarWord - -interface BoldWithoutItalic { - type: "boldWithoutItalic" - children: BoldWithoutItalicContent[] -} - -type BoldContent = - | ItalicWithoutBold - | Whitespace - | Newline - | PlainUrl - | MarkdownLink - | DetailOnDemand - | NonDoubleStarWord - -interface Bold { - type: "bold" - children: BoldContent[] -} - -type PlainBoldContent = Whitespace | Newline | NonDoubleStarWord - -interface PlainBold { - type: "plainBold" - children: PlainBoldContent[] -} - -type ItalicWithoutBoldContent = - | Whitespace - | Newline - | PlainUrl - | MarkdownLink - | DetailOnDemand - | NonSingleUnderscoreWord - -interface ItalicWithoutBold { - type: "italicWithoutBold" - children: ItalicWithoutBoldContent[] -} -type ItalicContent = - | BoldWithoutItalic - | Whitespace - | Newline - | PlainUrl - | MarkdownLink - | DetailOnDemand - | NonSingleUnderscoreWord - -interface Italic { - type: "italic" - children: ItalicContent[] -} - -type PlainItalicContent = Whitespace | Newline | NonSingleUnderscoreWord - -interface PlainItalic { - type: "plainItalic" - children: PlainItalicContent[] -} - -// TextSegment is used when we need to break up a string of non-whitespace characters -// into multiple segments because it may have "formatting tmesis" -// e.g. abso_freaking_lutely -type TextSegment = Bold | Italic | Text - -interface TextSegments { - type: "textSegments" - children: TextSegment[] -} - -export interface MarkdownRoot { - type: "MarkdownRoot" - children: Array -} - -type languagePartsType = typeof languageParts - -type MdParser = { - [P in keyof languagePartsType]: ReturnType -} - -export type EveryMarkdownRootNode = - | Newline - | Whitespace - | DetailOnDemand - | MarkdownLink - | PlainUrl - | Bold - | PlainBold - | Italic - | PlainItalic - | TextSegments - | Text - -// Every possible child of a MarkdownRoot node -export type EveryMarkdownChildNode = - | TextSegments - | NonSingleUnderscoreWord - | Bold - | BoldContent - | PlainBold - | Italic - | ItalicContent - | PlainItalic - -export type EveryMarkdownNode = - | EveryMarkdownChildNode - | EveryMarkdownRootNode - | BoldWithoutItalic - | ItalicWithoutBold - | NonSingleUnderscoreWord -// #endregion - -//#region Terminal parsers -const wordWithoutParensParser = (): P.Parser => - P.regex(/[^\s\(\)\[\]]+/).map((val) => ({ type: "text", value: val })) - -const singleParenParser = (): P.Parser => - P.regex(/[\(\)\[\]]/).map((val) => ({ type: "text", value: val })) - -const newlineParser = (): P.Parser => - P.regex(/\n/).result({ type: "newline" }) - -const nonbreakingSpaceParser = (): P.Parser => - // According to https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes - // the \s character class includes the following codepoints: [ \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff] - // We want to treat newlines and nonbreaking spaces specially. Out of the list above, the codepoints u+00a0 and u+202f look like - // they should be treated as non-breaking whitespace - P.regex(/[\u00a0\ufeff]+/).map((val) => ({ type: "text", value: val })) - -// Also based on that MDN article, we don't want to consume newlines when we're looking for spaces and tabs -// " \n" should turn into [{ type: "whitespace" }, { type: "newline" }] -const nonNewlineWhitespaceParser = (): P.Parser => - P.regex( - /[\r\t\f\v \u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]+/ - ).result({ type: "whitespace" }) - -const plainUrlParser = (): P.Parser => - P.regex(urlRegex).map((result) => ({ - type: "plainUrl", - href: result, - })) - -// https://urlregex.com -const urlRegex = - /((([A-Za-z]{3,9}:(?:\/\/)?)(?:[\-;:&=\+\$,\w]+@)?[A-Za-z0-9\.\-]+|(?:www\.|[\-;:&=\+\$,\w]+@)[A-Za-z0-9\.\-]+)((?:\/[\+~:%\/\.\w\-_]*)?\??(?:[\-\+=&;%@~:\.\w_]*)#?(?:[\.\!\/\\\w\-]*))?)/ - -const nonBracketWordParser: (r: MdParser) => P.Parser = () => - P.regex(/[^\[\]\s]+/).map((val) => ({ type: "text", value: val })) // no brackets, no WS - -const nonParensWordParser: (r: MdParser) => P.Parser = () => - P.regex(/[^\(\)\s]+/).map((val) => ({ type: "text", value: val })) // no parens, no WS - -const nonDoubleColonOrParensWordParser: ( - r: MdParser -) => P.Parser = () => - P.regex(/([^\(\):\s]|:(?!:))+/).map((val) => ({ type: "text", value: val })) // no parens, no WS, no :: - -const nonSingleUnderscoreWordParser: ( - r: MdParser -) => P.Parser = () => - P.regex(/[^_\s]+/).map((val) => ({ type: "text", value: val })) // no WS, no _ - -const nonDoubleStarWordParser: ( - r: MdParser -) => P.Parser = () => - P.regex(/([^*\s]|\*(?!\*))+/).map((val) => ({ type: "text", value: val })) // no WS, no ** - -const nonSpecialCharactersParser: (r: MdParser) => P.Parser = () => - P.regex(/[^\s*_\(\)\[\]]+/).map((value) => ({ type: "text", value })) // Consume up to one of *_()[] - -const dodCategoryParser: (r: MdParser) => P.Parser = () => - P.regex(/([^\(\):\s]|:(?!:))+/).map((val) => ({ - type: "text", - value: val, - })) // no WS, no parens, no :: - -const dodTermParser: (r: MdParser) => P.Parser = () => - P.regex(/([^\(\):\s]|:(?!:))+/).map((val) => ({ - type: "text", - value: val, - })) // no WS, no parens, no :: - -//#endregion - -//#region Higher level parsers - -const markdownLinkContentParser: ( - r: MdParser -) => P.Parser = (r: MdParser) => - P.alt( - r.newline, - r.nonbreakingSpace, - r.nonNewlineWhitespace, - r.plainBold, - r.plainItalic, - r.nonBracketWord - ) - -const markdownLinkParser: (r: MdParser) => P.Parser = ( - r: MdParser -) => - P.seqObj<{ children: MarkdownLinkContent[]; href: string }>( - P.string("["), - [ - "children", - r.markdownLinkContent /* as P.Parser */ - .atLeast(1), - ], - P.string("]("), - ["href", P.alt(P.regex(/\/[\w\-]+/), P.regex(urlRegex))], - P.string(")") - ).map(({ children, href }) => ({ - type: "markdownLink", - children, - href, - })) - -const detailOnDemandContentParser: ( - r: MdParser -) => P.Parser = (r: MdParser) => - P.alt( - // In TS 4.7 parsimmon could type the parser as Covariant on its type parameter which would remove the need for these casts - r.newline, - r.nonbreakingSpace, - r.nonNewlineWhitespace, - r.plainBold, - r.plainItalic, - r.nonBracketWord - ) - -export function extractDetailsFromSyntax(str: string): string[] { - return [...str.matchAll(new RegExp(detailOnDemandRegex, "g"))].map( - ([_, term]) => term - ) -} - -const detailOnDemandParser: (r: MdParser) => P.Parser = ( - r: MdParser -) => - P.seqObj<{ - category: Text - term: Text - children: DetailsOnDemandContent[] - }>( - P.string("["), - ["children", r.detailOnDemandContent.atLeast(1)], - P.string("](#dod:"), - ["term", r.dodTerm], - P.string(")") - ).map(({ children, term }) => ({ - type: "detailOnDemand", - term: term.value, - children, - })) - -const boldWithoutItalicContentParser: ( - r: MdParser -) => P.Parser = (r: MdParser) => - P.alt( - r.newline, - r.nonbreakingSpace, - r.nonNewlineWhitespace, - r.detailOnDemand, - r.markdownLink, - r.plainUrl, - r.nonDoubleStarWord - ) - -const boldWithoutItalicParser: (r: MdParser) => P.Parser = ( - r: MdParser -) => - P.seqObj<{ children: BoldWithoutItalicContent[] }>( - P.string("**"), - ["children", r.boldWithoutItalicContent.atLeast(1)], - P.string("**") - ).map(({ children }) => ({ - type: "boldWithoutItalic", - children, - })) - -const boldContentParser: (r: MdParser) => P.Parser = ( - r: MdParser -) => - P.alt( - r.newline, - r.nonbreakingSpace, - r.nonNewlineWhitespace, - r.italicWithoutBold, - r.detailOnDemand, - r.markdownLink, - r.plainUrl, - r.nonSpecialCharacters, - r.singleParen - ) - -const boldParser: (r: MdParser) => P.Parser = (r: MdParser) => - P.seqObj<{ children: BoldContent[] }>( - P.string("**"), - ["children", r.boldContent.atLeast(1)], - P.string("**") - ).map(({ children }) => ({ - type: "bold", - children, - })) - -const plainBoldContentParser: (r: MdParser) => P.Parser = ( - r: MdParser -) => - P.alt( - r.newline, - r.nonbreakingSpace, - r.nonNewlineWhitespace, - r.nonDoubleStarWord - ) - -const plainBoldParser: (r: MdParser) => P.Parser = (r: MdParser) => - P.seqObj( - P.string("**"), - ["children", r.plainBoldContent.atLeast(1)], - P.string("**") - ).map(({ children }) => ({ - type: "plainBold", - children, - })) - -const italicWithoutBoldContentParser: ( - r: MdParser -) => P.Parser = (r: MdParser) => - P.alt( - r.newline, - r.nonbreakingSpace, - r.nonNewlineWhitespace, - r.newline, - r.detailOnDemand, - r.markdownLink, - r.plainUrl, - r.nonSpecialCharacters, - r.singleParen - ) - -const italicWithoutBoldParser: (r: MdParser) => P.Parser = ( - r: MdParser -) => - P.seqObj<{ children: ItalicWithoutBoldContent[] }>( - P.string("_"), - ["children", r.italicWithoutBoldContent.atLeast(1)], - P.string("_") - ).map(({ children }) => ({ - type: "italicWithoutBold", - children, - })) -const italicContentParser: (r: MdParser) => P.Parser = ( - r: MdParser -) => - P.alt( - r.newline, - r.nonbreakingSpace, - r.nonNewlineWhitespace, - r.boldWithoutItalic, - r.detailOnDemand, - r.markdownLink, - r.plainUrl, - r.nonSpecialCharacters, - r.singleParen - ) - -const italicParser: (r: MdParser) => P.Parser = (r: MdParser) => - P.seqObj( - P.string("_"), - ["children", r.italicContent.atLeast(1)], - P.string("_") - ).map(({ children }) => ({ - type: "italic", - children, - })) - -const plainItalicContentParser: ( - r: MdParser -) => P.Parser = (r: MdParser) => - P.alt( - r.newline, - r.nonbreakingSpace, - r.nonNewlineWhitespace, - r.nonSingleUnderscoreWord - ) - -const plainItalicParser: (r: MdParser) => P.Parser = ( - r: MdParser -) => - P.seqObj( - P.string("_"), - ["children", r.plainItalicContent.atLeast(1)], - P.string("_") - ).map(({ children }) => ({ - type: "plainItalic", - children, - })) - -//#endregion - -//#region Top level language construction - -const markdownParser: (r: MdParser) => P.Parser = (r) => - // The order is crucial here! - P.alt( - r.newline, - r.nonbreakingSpace, - r.nonNewlineWhitespace, - r.detailOnDemand, - r.markdownLink, - r.plainUrl, - r.bold, - r.italic, - // Consume up to ** or _, if possible - r.nonSpecialCharacters, - // Otherwise consume everything - r.wordWithoutParens, - r.singleParen - ) - .atLeast(1) - .map( - (tokens): MarkdownRoot => ({ - type: "MarkdownRoot", - children: tokens, - }) - ) - -const languageParts = { - markdown: markdownParser, - newline: newlineParser, - nonbreakingSpace: nonbreakingSpaceParser, - nonNewlineWhitespace: nonNewlineWhitespaceParser, - detailOnDemand: detailOnDemandParser, - markdownLink: markdownLinkParser, - plainUrl: plainUrlParser, - bold: boldParser, - italic: italicParser, - plainBold: plainBoldParser, - plainItalic: plainItalicParser, - wordWithoutParens: wordWithoutParensParser, - singleParen: singleParenParser, - // Utility parsers below - these will never be tried on the top level because text covers everything else - detailOnDemandContent: detailOnDemandContentParser, - markdownLinkContent: markdownLinkContentParser, - boldContent: boldContentParser, - plainBoldContent: plainBoldContentParser, - boldWithoutItalic: boldWithoutItalicParser, - boldWithoutItalicContent: boldWithoutItalicContentParser, - plainItalicContent: plainItalicContentParser, - italicContent: italicContentParser, - italicWithoutBold: italicWithoutBoldParser, - italicWithoutBoldContent: italicWithoutBoldContentParser, - nonBracketWord: nonBracketWordParser, - nonParensWord: nonParensWordParser, - nonDoubleColonOrParensWord: nonDoubleColonOrParensWordParser, - nonDoubleStarWord: nonDoubleStarWordParser, - nonSpecialCharacters: nonSpecialCharactersParser, - nonSingleUnderscoreWord: nonSingleUnderscoreWordParser, - dodCategory: dodCategoryParser, - dodTerm: dodTermParser, -} as const - -export const mdParser: MdParser = P.createLanguage(languageParts) - -//#endregion diff --git a/packages/@ourworldindata/components/src/SimpleMarkdownText.tsx b/packages/@ourworldindata/components/src/SimpleMarkdownText.tsx index 618bc675729..315b3b8d018 100644 --- a/packages/@ourworldindata/components/src/SimpleMarkdownText.tsx +++ b/packages/@ourworldindata/components/src/SimpleMarkdownText.tsx @@ -1,6 +1,7 @@ import React from "react" import { computed } from "mobx" import { Remark } from "react-remark" + type SimpleMarkdownTextProps = { text: string } diff --git a/packages/@ourworldindata/components/src/index.ts b/packages/@ourworldindata/components/src/index.ts index 3ba23fb1bda..f9ba2f288ef 100644 --- a/packages/@ourworldindata/components/src/index.ts +++ b/packages/@ourworldindata/components/src/index.ts @@ -6,22 +6,11 @@ export { } from "./MarkdownTextWrap/MarkdownTextWrap.js" export { SimpleMarkdownText } from "./SimpleMarkdownText.js" - -export { - extractDetailsFromSyntax, - mdParser, - type MarkdownRoot, - type EveryMarkdownChildNode, - type EveryMarkdownNode, - type EveryMarkdownRootNode, -} from "./MarkdownTextWrap/parser.js" - export { getLinkType, getUrlTarget, checkIsInternalLink, convertHeadingTextToId, - markdownToEnrichedTextBlock, } from "./GdocsUtils.js" export { ExpandableToggle } from "./ExpandableToggle/ExpandableToggle.js" diff --git a/packages/@ourworldindata/utils/src/Util.ts b/packages/@ourworldindata/utils/src/Util.ts index 189eaf37e6c..c1cf1796c72 100644 --- a/packages/@ourworldindata/utils/src/Util.ts +++ b/packages/@ourworldindata/utils/src/Util.ts @@ -8,6 +8,8 @@ import { debounce, difference, drop, + dropRightWhile, + dropWhile, escapeRegExp, extend, findLastIndex, @@ -76,6 +78,8 @@ export { debounce, difference, drop, + dropRightWhile, + dropWhile, escapeRegExp, extend, findLastIndex, @@ -1810,3 +1814,18 @@ export const formatAuthors = ({ return authorsText } + +/** Works for: + * #dod:text + * #dod:text-hyphenated + * #dod:text_underscored + * #dod:text_underscored-and-hyphenated + * Duplicated in parser.ts + */ +export const detailOnDemandRegex = /#dod:([\w\-_]+)/ + +export function extractDetailsFromSyntax(str: string): string[] { + return [...str.matchAll(new RegExp(detailOnDemandRegex, "g"))].map( + ([_, term]) => term + ) +} diff --git a/packages/@ourworldindata/utils/src/index.ts b/packages/@ourworldindata/utils/src/index.ts index 8314ca62ba1..02273e014b1 100644 --- a/packages/@ourworldindata/utils/src/index.ts +++ b/packages/@ourworldindata/utils/src/index.ts @@ -369,6 +369,8 @@ export { debounce, difference, drop, + dropRightWhile, + dropWhile, extend, findLastIndex, flatten, @@ -422,6 +424,8 @@ export { without, zip, lowercaseObjectKeys, + detailOnDemandRegex, + extractDetailsFromSyntax, } from "./Util.js" export { isPresent } from "./isPresent.js" @@ -621,5 +625,4 @@ export { IMAGES_DIRECTORY, gdocUrlRegex, gdocIdRegex, - detailOnDemandRegex, } from "./GdocsConstants.js" diff --git a/tsconfig.json b/tsconfig.json index 65d5abae85e..fb046e9f480 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -7,20 +7,47 @@ }, "files": [], "references": [ - { "path": "./gridLang" }, - { "path": "./gitCms" }, - { "path": "./explorer" }, - { "path": "./datapage" }, - { "path": "./site" }, - { "path": "./settings" }, - { "path": "./adminSiteClient" }, - { "path": "./adminSiteServer" }, - { "path": "./devTools/svgTester" }, - { "path": "./devTools/schema" }, - { "path": "./devTools/schemaProcessor" }, - { "path": "./devTools/dodParserTestGenerator" }, - { "path": "./devTools/uploadWordpressImagesToObjStorage" }, - { "path": "./devTools/explorerTools" }, - { "path": "./devTools/regionsUpdater" } + { + "path": "./gridLang" + }, + { + "path": "./gitCms" + }, + { + "path": "./explorer" + }, + { + "path": "./datapage" + }, + { + "path": "./site" + }, + { + "path": "./settings" + }, + { + "path": "./adminSiteClient" + }, + { + "path": "./adminSiteServer" + }, + { + "path": "./devTools/svgTester" + }, + { + "path": "./devTools/schema" + }, + { + "path": "./devTools/schemaProcessor" + }, + { + "path": "./devTools/uploadWordpressImagesToObjStorage" + }, + { + "path": "./devTools/explorerTools" + }, + { + "path": "./devTools/regionsUpdater" + } ] } diff --git a/yarn.lock b/yarn.lock index ebeef571178..2ae9655b992 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4422,7 +4422,7 @@ __metadata: languageName: node linkType: hard -"@types/mdast@npm:^3.0.0": +"@types/mdast@npm:^3.0, @types/mdast@npm:^3.0.0": version: 3.0.14 resolution: "@types/mdast@npm:3.0.14" dependencies: @@ -10433,6 +10433,7 @@ __metadata: "@types/jsonwebtoken": "npm:^9.0.0" "@types/lodash": "npm:^4.14.185" "@types/md5": "npm:^2.3.2" + "@types/mdast": "npm:^3.0" "@types/minimist": "npm:^1.2.2" "@types/mousetrap": "npm:^1.6.9" "@types/mysql": "npm:^2.15.21" @@ -10517,6 +10518,7 @@ __metadata: lodash: "npm:^4.17.20" mathjax-full: "npm:^3.1.0" md5: "npm:^2.3.0" + mdast-util-from-markdown: "npm:^0.8.0" minimist: "npm:^1.2.6" mobx: "npm:^5.15.7" mobx-formatters: "npm:^1.0.2"