Skip to content

Commit

Permalink
🐛 Request DOI metadata as CSL-JSON after BibTeX (#1073)
Browse files Browse the repository at this point in the history
* feat: support CSL-JSON from doi.org too
* fix: run built-in cleanup
* refactor: rename DOI function
* fix: export BibTeX using citation-js
* 🔧 Handle totally non-existent dois
* 🐛 Ensure citation nodes resolved from CSL-JSON have label
---------

Co-authored-by: Franklin Koch <[email protected]>
Co-authored-by: Rowan Cockett <[email protected]>
  • Loading branch information
3 people authored Apr 10, 2024
1 parent d17f680 commit 5c9338a
Show file tree
Hide file tree
Showing 19 changed files with 575 additions and 182 deletions.
7 changes: 7 additions & 0 deletions .changeset/many-pianos-approve.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
"citation-js-utils": minor
"myst-to-jats": patch
"myst-cli": patch
---

Load citations from CSL and non-CSL
5 changes: 5 additions & 0 deletions .changeset/ten-rockets-buy.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'mystmd': patch
---

Add end-to-end tests for various DOIs
113 changes: 86 additions & 27 deletions packages/citation-js-utils/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import type { OutputOptions } from '@citation-js/core';
import { Cite } from '@citation-js/core';
import { clean as cleanCSL } from '@citation-js/core/lib/plugins/input/csl.js';
import sanitizeHtml from 'sanitize-html';

import '@citation-js/plugin-bibtex';
Expand All @@ -8,10 +8,10 @@ import '@citation-js/plugin-csl';
const DOI_IN_TEXT = /(10.\d{4,9}\/[-._;()/:A-Z0-9]*[A-Z0-9])/i;

// This is duplicated in citation-js types, which are not exported
export type CitationJson = {
export type CSL = {
type?: 'article-journal' | string;
id: string;
author?: { given: string; family: string }[];
author?: { given: string; family: string; literal?: string }[];
issued?: { 'date-parts'?: number[][]; literal?: string };
publisher?: string;
title?: string;
Expand Down Expand Up @@ -47,14 +47,6 @@ function cleanRef(citation: string) {
return cleanHtml.replace(/^1\./g, '').replace(/&amp;/g, '&').trim();
}

// eslint-disable-next-line @typescript-eslint/no-unused-vars
const defaultOpts: OutputOptions = {
format: 'string',
type: 'json',
style: 'ris',
lang: 'en-US',
};

export enum CitationJSStyles {
'apa' = 'citation-apa',
'vancouver' = 'citation-vancouver',
Expand All @@ -66,22 +58,15 @@ export enum InlineCite {
't' = 't',
}

const defaultString: OutputOptions = {
format: 'string',
lang: 'en-US',
type: 'html',
style: CitationJSStyles.apa,
};

export function yearFromCitation(data: CitationJson) {
/**
 * Pick a publication year for a CSL item.
 *
 * The first entry of `issued['date-parts']` is preferred; failing that, the
 * first four-digit number starting with 1 or 2 found in `issued.literal` is
 * used. When neither yields a truthy value the placeholder `'n.d.'` is returned.
 *
 * @param data - CSL item to read the `issued` field from
 * @returns a numeric year from the date parts, a year string from the literal, or `'n.d.'`
 */
export function yearFromCitation(data: CSL) {
  const issued = data.issued;
  const fromDateParts = issued?.['date-parts']?.[0]?.[0];
  if (fromDateParts) return fromDateParts;
  // Fall back to scanning the free-text date for something that looks like a year
  const fromLiteral = issued?.literal?.match(/\b[12][0-9]{3}\b/)?.[0];
  return fromLiteral ? fromLiteral : 'n.d.';
}

export function getInlineCitation(data: CitationJson, kind: InlineCite, opts?: InlineOptions) {
export function getInlineCitation(data: CSL, kind: InlineCite, opts?: InlineOptions) {
let authors = data.author;
if (!authors || authors.length === 0) {
authors = data.editor;
Expand Down Expand Up @@ -112,7 +97,7 @@ export function getInlineCitation(data: CitationJson, kind: InlineCite, opts?: I
}
if (authors.length > 2) {
return [
{ type: 'text', value: `${prefix}${authors[0].family} ` },
{ type: 'text', value: `${prefix}${authors[0].family ?? authors[0].literal} ` },
{ type: 'emphasis', children: [{ type: 'text', value: 'et al.' }] },
{ type: 'text', value: `${yearPart}` },
];
Expand All @@ -129,7 +114,9 @@ export type CitationRenderer = Record<
inline: (kind?: InlineCite, opts?: InlineOptions) => InlineNode[];
getDOI: () => string | undefined;
getURL: () => string | undefined;
cite: CitationJson;
cite: CSL;
getLabel: () => string;
exportBibTeX: () => string;
}
>;

Expand Down Expand Up @@ -171,12 +158,58 @@ export function firstNonDoiUrl(str?: string, doi?: string) {
return matches.map((match) => match[0]).find((match) => !doi || !match.includes(doi));
}

/**
 * Parse a citation style of the form `citation-<style>` into its `<style>`
 *
 * Everything after the `citation-` prefix is returned, so template names that
 * themselves contain hyphens are kept whole rather than cut at the next hyphen.
 *
 * @param style - citation style string
 * @returns the template name following the `citation-` prefix
 * @throws Error if `style` does not start with `citation-` or names no template
 */
function parseCitationStyle(style: string): string {
  const prefix = 'citation-';
  if (!style.startsWith(prefix) || style.length === prefix.length) {
    throw new Error(`unexpected citation style: ${style}`);
  }
  return style.slice(prefix.length);
}

/**
 * Convert BibTeX source text into CSL items
 *
 * @param source - BibTeX string
 * @returns the `data` array of a `Cite` instance constructed from `source`
 */
export function parseBibTeX(source: string): CSL[] {
  const parsed = new Cite(source);
  return parsed.data;
}

/**
 * Run CSL-JSON items through the citation-js CSL `clean` routine
 *
 * @param source - array of unclean CSL items
 * @returns the items as returned by `cleanCSL`
 */
export function parseCSLJSON(source: object[]): CSL[] {
  const cleaned: CSL[] = cleanCSL(source);
  return cleaned;
}

/**
 * Compatibility shim for existing callers of getCitations
 * Replaced by getCitationRenderers
 *
 * Parses the BibTeX source with `parseBibTeX` and hands the resulting CSL
 * items to `getCitationRenderers`; nothing else is done here.
 *
 * @param bibtex - BibTeX string
 * @returns the value resolved by `getCitationRenderers` for the parsed items
 */
export async function getCitations(bibtex: string): Promise<CitationRenderer> {
  const csl = parseBibTeX(bibtex);
  return await getCitationRenderers(csl);
}

/**
* Build renderers for the given array of CSL items
*
* @param data - array of CSL items
*/
export async function getCitationRenderers(data: CSL[]): Promise<CitationRenderer> {
const cite = new Cite();
return Object.fromEntries(
p.data.map((c: any): [string, CitationRenderer[0]] => {
data.map((c): [string, CitationRenderer[0]] => {
const matchDoi = c.URL?.match(DOI_IN_TEXT) ?? c.note?.match(DOI_IN_TEXT);
if (!c.DOI && matchDoi) {
c.DOI = matchDoi[0];
Expand All @@ -189,17 +222,43 @@ export async function getCitations(bibtex: string): Promise<CitationRenderer> {
},
render(style?: CitationJSStyles) {
return replaceUrlsWithAnchorElement(
cleanRef(cite.set(c).get({ ...defaultString, style: style ?? CitationJSStyles.apa })),
cleanRef(
cite.set(c).format('bibliography', {
template: parseCitationStyle(style ?? (CitationJSStyles.apa as string)),
format: 'html',
lang: 'en-US',
}) as string,
),
c.DOI,
);
},
getDOI(): string | undefined {
return c.DOI || undefined;
},
getURL(): string | undefined {
return firstNonDoiUrl(cleanRef(cite.set(c).get(defaultString)), c.DOI) ?? doiUrl(c.DOI);
return (
firstNonDoiUrl(
cleanRef(
cite.set(c).format('bibliography', {
template: parseCitationStyle(CitationJSStyles.apa as string),
format: 'html',
lang: 'en-US',
}) as string,
),
c.DOI,
) ?? doiUrl(c.DOI)
);
},
cite: c,
getLabel(): string {
const bibtexObjects = cite.set(c).format('bibtex', { format: 'object' }) as {
label: string;
}[];
return bibtexObjects[0]?.label;
},
exportBibTeX(): string {
return cite.set(c).format('bibtex', { format: 'text' }) as string;
},
},
];
}),
Expand Down
17 changes: 13 additions & 4 deletions packages/citation-js-utils/tests/basic.spec.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
import { describe, expect, it } from 'vitest';
import { getCitations, CitationJSStyles, yearFromCitation, firstNonDoiUrl } from '../src';
import {
getCitationRenderers,
parseBibTeX,
CitationJSStyles,
yearFromCitation,
firstNonDoiUrl,
} from '../src';
import {
bibtex,
doiInNote,
Expand All @@ -13,23 +19,26 @@ const key = 'Cockett2015SimPEG';

describe('Test reference rendering', () => {
it('APA', async () => {
const citations = await getCitations(bibtex);
const data = parseBibTeX(bibtex);
const citations = await getCitationRenderers(data);
expect(Object.keys(citations).length).toBe(1);
const cite = citations[key];
expect(cite.render()).toEqual(TEST_APA_HTML);
expect(cite.render(CitationJSStyles.apa)).toEqual(TEST_APA_HTML);
expect(cite.getDOI()).toEqual('10.1016/j.cageo.2015.09.015');
});
it('Vancouver', async () => {
const citations = await getCitations(bibtex);
const data = parseBibTeX(bibtex);
const citations = await getCitationRenderers(data);
const cite = citations[key];
expect(cite.render(CitationJSStyles.vancouver)).toEqual(TEST_VANCOUVER_HTML);
});
it.each([
['url', doiInURL],
['note', doiInNote],
])('Extract the DOI from the %s', async (_, src) => {
const citations = await getCitations(src);
const data = parseBibTeX(src);
const citations = await getCitationRenderers(data);
expect(citations['cury2020sparse'].getDOI()).toBe(TEST_DOI_IN_OTHER_FIELD);
});
});
Expand Down
11 changes: 7 additions & 4 deletions packages/citation-js-utils/types/citation-js/index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
// https://fettblog.eu/typescript-react-extending-jsx-elements/
declare module '@citation-js/plugin-bibtex' {}
declare module '@citation-js/plugin-csl' {}
// Deep import of the CSL input plugin; only the `clean` export is declared here.
declare module '@citation-js/core/lib/plugins/input/csl.js' {
  // Ambient declarations carry a signature only, never a function body.
  export function clean(data: any): any;
}
declare module '@citation-js/core' {
export type OutputOptions = {
format: 'string';
Expand Down Expand Up @@ -38,13 +41,13 @@ declare module '@citation-js/core' {
} & Record<string, any>;

export class Cite {
constructor(input?: string | CSL);
constructor(input?: any);

static async(data: string | Cite): Promise<Cite>;
static async(data: any): Promise<Cite>;

set(data: string | Cite): this;
set(data: any): this;

get: (opts: OutputOptions) => string;
format: (format: string, options: any) => string | object[];

data: CSL[];
}
Expand Down
35 changes: 0 additions & 35 deletions packages/myst-cli/src/build/utils/bibtex.spec.ts

This file was deleted.

30 changes: 1 addition & 29 deletions packages/myst-cli/src/build/utils/bibtex.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,29 +5,6 @@ import type { ISession } from '../../session/types.js';
import { addWarningForFile } from '../../utils/addWarningForFile.js';
import type { References } from 'myst-common';

/**
 * Extract a single entry from the entire content of a bibtex file
 *
 * Look for the pattern '@article{key' then finds the closing bracket
 * and returns that substring. The "article" prefix may be any
 * alpha word.
 *
 * The key is matched literally (regex metacharacters in it are escaped) and
 * must be followed by optional whitespace and then `,` or `}`, so a key that
 * is merely a prefix of another entry's key does not select that entry.
 *
 * @param key - citation key of the entry to extract
 * @param bibtex - full BibTeX source to search
 * @returns the entry text from `@` through its closing `}`, or `undefined` when
 *   the key is not found or the entry is never closed
 */
export function extractBibtex(key: string, bibtex: string) {
  const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const header = new RegExp(`@[a-zA-Z]*\\{${escapedKey}(?=\\s*[,}])`).exec(bibtex);
  if (!header) return;
  const start = header.index;
  let bracketCount = 0;
  let ind = start + header[0].length;
  // Walk forward until a `}` is reached while no nested braces remain open
  while (bibtex[ind] && (bibtex[ind] !== '}' || bracketCount !== 0)) {
    // Braces directly preceded by a backslash are not counted
    if (bibtex[ind - 1] && bibtex[ind - 1] !== '\\') {
      if (bibtex[ind] === '{') bracketCount++;
      if (bibtex[ind] === '}') bracketCount--;
    }
    ind++;
  }
  // Running off the end of the input means the entry was never closed
  return bibtex[ind] ? bibtex.substring(start, ind + 1) : undefined;
}

/**
* Write new bibtex file from citation renderer data and reference order
*
Expand All @@ -48,12 +25,7 @@ export function writeBibtexFromCitationRenderers(
const citationLookup: Record<string, string> = {};
Object.values(cache.$citationRenderers).forEach((renderers) => {
Object.entries(renderers).forEach(([key, renderer]) => {
const bibtexContent = (renderer.cite._graph as any[]).find((item) => {
return item.type === '@biblatex/text';
});
if (bibtexContent?.data) {
citationLookup[key] = extractBibtex(key, bibtexContent.data) ?? bibtexContent.data;
}
citationLookup[key] = renderer.exportBibTeX();
});
});
const bibtexContent: string[] = [];
Expand Down
10 changes: 7 additions & 3 deletions packages/myst-cli/src/process/citations.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import fs from 'node:fs';
import type { CitationRenderer } from 'citation-js-utils';
import { getCitations } from 'citation-js-utils';
import { getCitationRenderers, parseBibTeX } from 'citation-js-utils';
import { tic, isUrl } from 'myst-cli-utils';
import { RuleId, plural } from 'myst-common';
import type { ISession, ISessionWithCache } from '../session/types.js';
import { castSession } from '../session/cache.js';
import { selectors } from '../store/index.js';
import { addWarningForFile } from '../utils/addWarningForFile.js';

export async function loadCitations(session: ISession, path: string): Promise<CitationRenderer> {
export async function loadBibTeXCitationRenderers(
session: ISession,
path: string,
): Promise<CitationRenderer> {
const toc = tic();
let data: string;
if (isUrl(path)) {
Expand All @@ -23,7 +26,8 @@ export async function loadCitations(session: ISession, path: string): Promise<Ci
session.log.debug(`Loading citations at "${path}"`);
data = fs.readFileSync(path).toString();
}
const renderer = await getCitations(data);
const csl = parseBibTeX(data);
const renderer = await getCitationRenderers(csl);
session.log.debug(toc(`Read ${plural('%s citations(s)', renderer)} from ${path} in %s.`));
return renderer;
}
Expand Down
Loading

0 comments on commit 5c9338a

Please sign in to comment.