From f51dff302ae5cd65d4d1c702beab279ed9bd66b6 Mon Sep 17 00:00:00 2001 From: Martynas Bagdonas Date: Mon, 10 Jun 2024 13:00:20 +0300 Subject: [PATCH] Various improvements to text analyzer --- src/core/module/content-rect.js | 3 - src/core/module/link/annotation-overlays.js | 9 +- src/core/module/link/link.js | 40 +- src/core/module/link/matched-overlays.js | 145 +---- src/core/module/link/parsed-overlays.js | 9 +- src/core/module/module.js | 102 ++-- src/core/module/outline-extractor.js | 7 +- src/core/module/page-label.js | 86 +++ src/core/module/reference-extractor.js | 535 ------------------ src/core/module/reference/extractor/common.js | 211 +++++++ .../module/reference/extractor/extractor.js | 71 +++ .../reference/extractor/first-line-indent.js | 230 ++++++++ .../extractor/list-number-spacing.js | 110 ++++ .../reference/extractor/paragraph-spacing.js | 129 +++++ .../matcher.js} | 283 ++++++--- src/core/module/reference/reference.js | 31 + src/core/module/util.js | 4 +- src/core/worker.js | 13 + src/display/api.js | 4 + 19 files changed, 1212 insertions(+), 810 deletions(-) delete mode 100644 src/core/module/reference-extractor.js create mode 100644 src/core/module/reference/extractor/common.js create mode 100644 src/core/module/reference/extractor/extractor.js create mode 100644 src/core/module/reference/extractor/first-line-indent.js create mode 100644 src/core/module/reference/extractor/list-number-spacing.js create mode 100644 src/core/module/reference/extractor/paragraph-spacing.js rename src/core/module/{reference-matcher.js => reference/matcher.js} (56%) create mode 100644 src/core/module/reference/reference.js diff --git a/src/core/module/content-rect.js b/src/core/module/content-rect.js index ffe81fd2685d9..7ef4e5c88ae82 100644 --- a/src/core/module/content-rect.js +++ b/src/core/module/content-rect.js @@ -39,12 +39,9 @@ export async function getContentRect(pdfDocument, structuredCharsProvider) { let startPage = Math.max(pageIndex - 2, 0); let endPage = Math.min(pageIndex + 2, numPages - 1); - let x; - let combinedLines = []; for (let i = startPage; i <= endPage; i++) { let chars = await structuredCharsProvider(i); - if (!x) x = chars[2743]; let lines = getLinesFromChars(chars); combinedLines.push(...lines); } diff --git a/src/core/module/link/annotation-overlays.js b/src/core/module/link/annotation-overlays.js index 85a59ae0622bf..31307b625cc6b 100644 --- a/src/core/module/link/annotation-overlays.js +++ b/src/core/module/link/annotation-overlays.js @@ -5,9 +5,8 @@ import { getSortIndex, } from '../util.js'; -async function _getLinkAnnotationOverlays(pdfDocument, structuredCharsProvider, pageIndex){ +async function _getLinkAnnotationOverlays(pdfDocument, chars, pageIndex){ let overlays = []; - let chars = await structuredCharsProvider(pageIndex); let page = await pdfDocument.getPage(pageIndex); let annotations = await page._parsedAnnotations; for (let annotation of annotations) { @@ -60,7 +59,7 @@ async function _getLinkAnnotationOverlays(pdfDocument, structuredCharsProvider, overlay.url = url; overlays.push(overlay); } else if (annotation.dest) { - overlay.type = 'internal-link' + overlay.type = 'internal-link'; let destinationPosition = await getPositionFromDestination(pdfDocument, annotation.dest); if (destinationPosition) { overlay.destinationPosition = destinationPosition; @@ -95,8 +94,8 @@ async function _getLinkAnnotationOverlays(pdfDocument, structuredCharsProvider, return combinedOverlays; } -export async function getAnnotationOverlays(pdfDocument, structuredCharsProvider, pageIndex) { - let linkOverlays = await _getLinkAnnotationOverlays(pdfDocument, structuredCharsProvider, pageIndex); +export async function getAnnotationOverlays(pdfDocument, chars, pageIndex) { + let linkOverlays = await _getLinkAnnotationOverlays(pdfDocument, chars, pageIndex); for (let linkOverlay of linkOverlays) { delete linkOverlay.offsetFrom; delete linkOverlay.offsetTo; diff --git a/src/core/module/link/link.js b/src/core/module/link/link.js index 05029e1ee7a5f..1be3aea753daa 100644 --- a/src/core/module/link/link.js +++ b/src/core/module/link/link.js @@ -3,10 +3,11 @@ import { getAnnotationOverlays } from './annotation-overlays.js'; import { getMatchedOverlays } from './matched-overlays.js'; import { overlaysIntersect } from '../util.js'; -export async function getRegularLinkOverlays(pdfDocument, structuredCharsProvider, pageIndex) { - let annotationOverlays = await getAnnotationOverlays(pdfDocument, structuredCharsProvider, pageIndex); - let parsedOverlays = await getParsedOverlays(pdfDocument, structuredCharsProvider, pageIndex); +export async function getRegularLinkOverlays(pdfDocument, chars, pageIndex) { + let annotationOverlays = await getAnnotationOverlays(pdfDocument, chars, pageIndex); + let parsedOverlays = getParsedOverlays(chars); let overlays = [...annotationOverlays]; + // Add parsed overlays that doesn't intersect with annotation overlays for (let parsedOverlay of parsedOverlays) { if (!annotationOverlays.some(x => overlaysIntersect(x, parsedOverlay))) { overlays.push(parsedOverlay); @@ -15,9 +16,32 @@ export async function getRegularLinkOverlays(pdfDocument, structuredCharsProvide return overlays; } -export async function getLinkOverlays(pdfDocument, structuredCharsProvider, contentRect, pageIndex){ - let annotationOverlays = await getAnnotationOverlays(pdfDocument, structuredCharsProvider, pageIndex); - let parsedOverlays = await getParsedOverlays(pdfDocument, structuredCharsProvider, pageIndex); - let matchedOverlays = await getMatchedOverlays(pdfDocument, structuredCharsProvider, pageIndex, annotationOverlays, contentRect); - return [...annotationOverlays, ...parsedOverlays, ...matchedOverlays]; +export async function getLinkOverlays(pdfDocument, structuredCharsProvider, contentRect){ + let maxPages = Math.min(50, pdfDocument.catalog.numPages); + let pages = new Map(); + for (let i = 0; i < maxPages; i++) { + let chars = await structuredCharsProvider(i); + let annotationOverlays = await getAnnotationOverlays(pdfDocument, chars, i); + let parsedOverlays = getParsedOverlays(chars); + let matchedOverlays = await getMatchedOverlays(pdfDocument, structuredCharsProvider, i, annotationOverlays, contentRect); + + let overlays = [...annotationOverlays]; + + for (let matchedOverlay of matchedOverlays) { + if (!overlays.some(x => overlaysIntersect(x, matchedOverlay))) { + overlays.push(matchedOverlay); + } + } + + for (let parsedOverlay of parsedOverlays) { + if (!overlays.some(x => overlaysIntersect(x, parsedOverlay))) { + overlays.push(parsedOverlay); + } + } + + if (overlays.length) { + pages.set(i, overlays); + } + } + return pages; } diff --git a/src/core/module/link/matched-overlays.js b/src/core/module/link/matched-overlays.js index 1a482bc89c774..76086d3513912 100644 --- a/src/core/module/link/matched-overlays.js +++ b/src/core/module/link/matched-overlays.js @@ -189,132 +189,6 @@ function matchCandidates(candidates, pageIndex) { return matches; } -function expandRect(currentRect, otherRects, surroundingRect) { - let [x1, y1, x2, y2] = currentRect; - let collision = { top: false, bottom: false, left: false, right: false }; - - function intersects(rectA, rectB) { - return rectA[0] < rectB[2] && rectA[2] > rectB[0] && rectA[1] < rectB[3] && rectA[3] > rectB[1]; - } - - while (!collision.top || !collision.bottom || !collision.left || !collision.right) { - if (!collision.top && y1 > surroundingRect[1]) { - y1 -= Math.min(10, y1 - surroundingRect[1]); - if (otherRects.some(rect => intersects([x1, y1, x2, y2], rect))) { - y1 += Math.min(10, y1 - surroundingRect[1]); // Revert if collision - collision.top = true; - } - } else { - collision.top = true; - } - - if (!collision.bottom && y2 < surroundingRect[3]) { - y2 += Math.min(10, surroundingRect[3] - y2); - if (otherRects.some(rect => intersects([x1, y1, x2, y2], rect))) { - y2 -= Math.min(10, surroundingRect[3] - y2); // Revert if collision - collision.bottom = true; - } - } else { - collision.bottom = true; - } - - if (!collision.left && x1 > surroundingRect[0]) { - x1 -= Math.min(10, x1 - surroundingRect[0]); - if (otherRects.some(rect => intersects([x1, y1, x2, y2], rect))) { - x1 += Math.min(10, x1 - surroundingRect[0]); // Revert if collision - collision.left = true; - } - } else { - collision.left = true; - } - - if (!collision.right && x2 < surroundingRect[2]) { - x2 += Math.min(10, surroundingRect[2] - x2); - if (otherRects.some(rect => intersects([x1, y1, x2, y2], rect))) { - x2 -= Math.min(10, surroundingRect[2] - x2); // Revert if collision - collision.right = true; - } - } else { - collision.right = true; - } - } - - return [x1, y1, x2, y2]; -} - -function getRect(destination, chars, contentRect) { - let paragraph = { from: destination.offsetFrom, to: destination.offsetTo }; - - while (chars[paragraph.from - 1] && !chars[paragraph.from - 1].paragraphBreakAfter) { - paragraph.from--; - } - - while (!chars[paragraph.to].paragraphBreakAfter) { - paragraph.to++; - } - - let rect = getBoundingRect(chars.slice(paragraph.from, paragraph.to + 1)); - - - - - let paragraphs = []; // This will hold the start and end indices of each paragraph - let start = 0; // Start index of the current paragraph - - for (let i = 0; i < chars.length; i++) { - if (chars[i].paragraphBreakAfter || i === chars.length - 1) { - // If current char has .paragraphBreakAfter or it's the last element of the array - let end = i; // End index of the current paragraph - - let rect = getBoundingRect(chars.slice(start, end + 1)); - - let sumParagraph = (rect[2] - rect[0]) * (rect[3] - rect[1]); - - let sumLines = 0; - - let linesNum = 0; - let lineStart = start; - for (let k = start; k <= end; k++) { - if (chars[k].lineBreakAfter) { - let lineEnd = k + 1; - let rect = getBoundingRect(chars.slice(lineStart, lineEnd)); - // console.log('uuu', chars.slice(lineStart, lineEnd).map(x => x.c).join(''), rect[2] - rect[0]); - sumLines += (rect[2] - rect[0]) * (rect[3] - rect[1]); - linesNum++; - lineStart = k + 1; - } - } - - let sum = 0; - for (let j = start; j <= end; j++) { - let char = chars[j]; - sum += (char.rect[2] - char.rect[0]) * (char.rect[3] - char.rect[1]); - } - - let densityRatio = sum / sumLines; - if (end - start > 50 && densityRatio > 0.8 && linesNum >= 2 && !(paragraph.from>=start && paragraph.from <=end)) { - paragraphs.push({ - start: start, - end: end, - densityRatio, - rect, - text: chars.slice(start, end + 1).map(x => x.c).join('') - }); - } - - - - start = i + 1; // The next paragraph starts after the current one ends - } - } - - let rects = paragraphs.map(x => x.rect); - - let expandedRect = expandRect(rect, rects, [0, 0, 595.276, 790.866]); - - return expandedRect; -} - export async function getMatchedOverlays(pdfDocument, structuredCharsProvider, pageIndex, annotationOverlays, contentRect) { let MAX_PAGES_AROUND = 5; let from = Math.max(pageIndex - MAX_PAGES_AROUND, 0); @@ -330,11 +204,6 @@ export async function getMatchedOverlays(pdfDocument, structuredCharsProvider, p let matches = matchCandidates(allCandidates, pageIndex); - for (let match of matches) { - let destination = match.destination; - destination.rect = getRect(destination, pages[destination.pageIndex], contentRect); - } - let overlays = []; for (let match of matches) { let sourceChars = pages[match.source.pageIndex].slice(match.source.offsetFrom, match.source.offsetTo + 1); @@ -342,14 +211,8 @@ export async function getMatchedOverlays(pdfDocument, structuredCharsProvider, p let destinationChars = pages[match.destination.pageIndex].slice(match.destination.offsetFrom, match.destination.offsetTo + 1); let destinationRect = getBoundingRect(destinationChars); - - let previewRect = match.destination.rect; - previewRect = [ - Math.max(previewRect[0], contentRect[0]), - Math.max(previewRect[1], contentRect[1]), - Math.min(previewRect[2], contentRect[2]), - Math.min(previewRect[3], contentRect[3]), - ]; + // destinationRect[2] = destinationRect[0]; + // destinationRect[1] = destinationRect[3]; let overlay = { type: 'internal-link', @@ -363,10 +226,6 @@ export async function getMatchedOverlays(pdfDocument, structuredCharsProvider, p pageIndex: match.destination.pageIndex, rects: [destinationRect], }, - previewPosition: { - pageIndex: match.destination.pageIndex, - rects: [previewRect], - }, }; overlays.push(overlay); diff --git a/src/core/module/link/parsed-overlays.js b/src/core/module/link/parsed-overlays.js index 2590ce98a0ee8..8777ad61fc991 100644 --- a/src/core/module/link/parsed-overlays.js +++ b/src/core/module/link/parsed-overlays.js @@ -1,8 +1,9 @@ import { getRangeRects, getSortIndex } from '../util.js'; -export async function getParsedOverlays(pdfDocument, structuredCharsProvider, pageIndex) { - let chars = await structuredCharsProvider(pageIndex); - +export function getParsedOverlays(chars) { + if (!chars.length) { + return []; + } let sequences = []; let sequence = { from: 0, to: 0, lbp: [] }; @@ -66,6 +67,8 @@ export async function getParsedOverlays(pdfDocument, structuredCharsProvider, pa } } + let { pageIndex } = chars[0]; + let overlays = []; for (let link of links) { let rects = getRangeRects(chars, link.from, link.to - 1); diff --git a/src/core/module/module.js b/src/core/module/module.js index 58b8bd0b5745a..9444cc644ab8a 100644 --- a/src/core/module/module.js +++ b/src/core/module/module.js @@ -1,15 +1,15 @@ import { getStructuredChars } from './structure.js'; -import { getLinkOverlays } from './link/link.js'; -import { getPageLabel } from './page-label.js'; -import { - getCitationOverlays, - getReferenceOverlays -} from './reference-matcher.js'; -import { extractReferences } from './reference-extractor.js'; +import { getLinkOverlays, getRegularLinkOverlays } from './link/link.js'; +import { getPageLabel, getPageLabels } from './page-label.js'; +// import { +// getCitationOverlays, +// getReferenceOverlays +// } from './reference-matcher.js'; import { getExistingOutline } from './outline-reader.js'; import { extractOutline } from './outline-extractor.js'; import { getContentRect } from './content-rect.js'; import { intersectRects, overlaysIntersect } from './util.js'; +import { getCitationAndReferenceOverlays } from './reference/reference.js'; export class Module { constructor(pdfDocument) { @@ -22,12 +22,17 @@ export class Module { this._initializing = false; } - _structuredCharsProvider = async (pageIndex, onlyContent) => { + _structuredCharsProvider = async (pageIndex, priority) => { + let cached = this._structuredCharsCache.get(pageIndex); if (cached) { return cached; } + if (!priority) { + await new Promise((resolve) => setTimeout(resolve)); + } + cached = this._temporaryStructuredCharsCache.get(pageIndex); if (cached) { if (this._contentRect) { @@ -96,38 +101,69 @@ export class Module { return chars; }; - async getPageData({ pageIndex, metadataPagesField }) { - if (!this._initializing) { - this._initializeDocument(); - this._initializing = true; + async getPageData({ pageIndex }) { + let page = await this._pdfDocument.getPage(pageIndex); + let r = Math.random().toString(); + let chars = await this._structuredCharsProvider(pageIndex, true); + let overlays = await getRegularLinkOverlays(this._pdfDocument, chars, pageIndex); + return { + partial: true, + chars, + overlays, + viewBox: page.view, + }; + } + + async getProcessedData({ metadataPagesField } = {}) { + const MAX_PAGES = 100; + await this._pdfDocument.pdfManager.ensureDoc("numPages"); + this._contentRect = await getContentRect(this._pdfDocument, this._structuredCharsProvider); + let citationAndReferenceOverlays = await getCitationAndReferenceOverlays(this._pdfDocument, this._structuredCharsProvider, 100); + + let pages = new Map(); + + for (let overlay of citationAndReferenceOverlays) { + let { pageIndex } = overlay.position; + if (overlay.type === 'reference') { + continue; + } + let page = pages.get(pageIndex); + if (!page) { + page = { overlays: [] }; + pages.set(pageIndex, page); + } + page.overlays.push(overlay); } - await this._initializePromise; - let chars = await this._structuredCharsProvider(pageIndex); - let pageLabel = await getPageLabel(this._pdfDocument, this._structuredCharsProvider, pageIndex); - let linkOverlays = await getLinkOverlays(this._pdfDocument, this._structuredCharsProvider, this._contentRect, pageIndex); - let citationOverlays = await getCitationOverlays(this._pdfDocument, this._structuredCharsProvider, pageIndex, this._referenceData, linkOverlays); - let referenceOverlays = [];// await getReferenceOverlays(this._referenceData, pageIndex); + let linkOverlaysMap = await getLinkOverlays(this._pdfDocument, this._structuredCharsProvider, this._contentRect); - let overlays = [...citationOverlays, ...referenceOverlays]; - // Exclude link overlays that intersect reference overlays and aren't a bibliography record or external url link - // Don't include link annotations overlays that overlap with citation overlays - for (let linkOverlay of linkOverlays) { - if (!citationOverlays.some(x => overlaysIntersect(x, linkOverlay))) { - overlays.push(linkOverlay); + for (let [pageIndex, linkOverlays] of linkOverlaysMap) { + // Exclude link overlays that intersect reference overlays and aren't a bibliography record or external url link + // Don't include link annotations overlays that overlap with citation overlays + for (let linkOverlay of linkOverlays) { + let page = pages.get(pageIndex); + if (!page) { + page = { overlays: [] }; + pages.set(pageIndex, page); + } + if (!page.overlays.some(x => overlaysIntersect(x, linkOverlay))) { + page.overlays.push(linkOverlay); + } } } - let page = await this._pdfDocument.getPage(pageIndex); + for (let [pageIndex, page] of pages) { + page.viewBox = (await this._pdfDocument.getPage(pageIndex)).view; + page.chars = await this._structuredCharsProvider(pageIndex); + } - return { - pageLabel, - chars, - overlays, - viewBox: page.view, - }; + let pageLabels = await getPageLabels(this._pdfDocument, this._structuredCharsProvider); + + pages = Object.fromEntries(pages); + + return { pageLabels, pages }; } async getOutline() { @@ -140,9 +176,7 @@ export class Module { async _initializeDocument() { // As soon as contentRect is set, extractReferences below can use it - this._contentRect = await getContentRect(this._pdfDocument, this._structuredCharsProvider); - await this._pdfDocument.pdfManager.ensureDoc("numPages"); - this._referenceData = await extractReferences(this._pdfDocument, this._structuredCharsProvider); + this._initializePromiseResolve(); this._initialized = true; } diff --git a/src/core/module/outline-extractor.js b/src/core/module/outline-extractor.js index 2967fe0072c30..ab97833e88485 100644 --- a/src/core/module/outline-extractor.js +++ b/src/core/module/outline-extractor.js @@ -67,6 +67,10 @@ function getItemsWithDepth(parentItems, rangeGroups, depth) { } } + if (!depthItemGroups.length) { + return []; + } + return depthItemGroups.sort((a, b) => b.length - a.length)[0]; } @@ -85,6 +89,7 @@ export async function extractOutline(pdfDocument, structuredCharsProvider) { let pagesNum = await pdfDocument.pdfManager.ensureDoc('numPages'); let from = 0; let to = pagesNum - 1; + to = Math.min(to, 100); let pages = []; for (let i = from; i <= to; i++) { @@ -288,7 +293,7 @@ export async function extractOutline(pdfDocument, structuredCharsProvider) { } } // Try to use next available font ranges after the H1, but only for H2 level - else if (fontRanges.length >= 1) { + else if (fontRanges.length) { let ranges = fontRanges[0]; if (!rangeGroupHasDuplicates(ranges)) { h2 = fontRanges[0]; diff --git a/src/core/module/page-label.js b/src/core/module/page-label.js index 9d51229a0aa97..6a6e910e49da3 100644 --- a/src/core/module/page-label.js +++ b/src/core/module/page-label.js @@ -260,3 +260,89 @@ export async function getPageLabel(pdfDocument, structuredCharsProvider, pageInd return null; } + + +function arabicToRoman(num) { + const romanKeys = { + M: 1000, + CM: 900, + D: 500, + CD: 400, + C: 100, + XC: 90, + L: 50, + XL: 40, + X: 10, + IX: 9, + V: 5, + IV: 4, + I: 1 + }; + let roman = ''; + + for (let key in romanKeys) { + while (num >= romanKeys[key]) { + roman += key; + num -= romanKeys[key]; + } + } + + return roman; +} + +export function predictPageLabels(extractedPageLabels, catalogPageLabels, pagesCount) { + let pageLabels = []; + + if (!catalogPageLabels || !catalogPageLabels.length) { + for (let i = 0; i < pagesCount; i++) { + pageLabels[i] = (i + 1).toString(); + } + return pageLabels; + } + + for (let i = 0; i < pagesCount; i++) { + pageLabels[i] = '-'; + } + + let allPageLabels = Object.values(extractedPageLabels).sort((a, b) => a.pageIndex - b.pageIndex); + if ( + catalogPageLabels + && catalogPageLabels.length === pagesCount + && ( + allPageLabels[0] && catalogPageLabels[allPageLabels[0].pageIndex] === allPageLabels[0].chars.map(x => x.u).join('') + || allPageLabels.length === 0 + ) + ) { + for (let i = 0; i < pagesCount; i++) { + pageLabels[i] = catalogPageLabels[i]; + } + } + + let firstArabicPageLabel = Object.values(extractedPageLabels).filter(x => x.type === 'arabic')[0]; + + if (firstArabicPageLabel) { + let startInteger = firstArabicPageLabel.integer - firstArabicPageLabel.pageIndex; + for (let i = 0; i < pagesCount; i++) { + if (startInteger + i >= 1) { + pageLabels[i] = (startInteger + i).toString(); + } + } + } + return pageLabels; +} + +export async function getPageLabels(pdfDocument, structuredCharsProvider) { + let extractedLabels = {}; + for (let i = 0; i < 25; i++) { + let pageLabel = await getPageLabel(pdfDocument, structuredCharsProvider, i); + if (pageLabel) { + extractedLabels[i] = pageLabel; + } + } + + let catalogPageLabels = await pdfDocument.pdfManager.ensureCatalog("pageLabels"); + + let pageLabels = predictPageLabels(extractedLabels, catalogPageLabels, pdfDocument.catalog.numPages) + + return pageLabels; +} diff --git a/src/core/module/reference-extractor.js b/src/core/module/reference-extractor.js deleted file mode 100644 index e141fc609becc..0000000000000 --- a/src/core/module/reference-extractor.js +++ /dev/null @@ -1,535 +0,0 @@ -import { - getBoundingRect, - getCenterRect, - intersectRects, - getClusters -} from './util.js'; -import { getRegularLinkOverlays } from './link/link.js'; - -function removeASCIISymbolsAndNumbers(inputString) { - let result = ''; - for (let i = 0; i < inputString.length; i++) { - let charCode = inputString.charCodeAt(i); - // Check if the character is a letter (either lowercase or uppercase) - if ((charCode >= 65 && charCode <= 90) || (charCode >= 97 && charCode <= 122) || charCode >= 128) { - result += inputString[i]; - } - } - return result; -} - -function getReferencesTitleOffset(chars) { - let titles = ['references', 'bibliography', 'literature', 'bibliographie', 'literatur']; - - let paragraphs = []; // This will hold the start and end indices of each paragraph - let start = 0; // Start index of the current paragraph - - for (let i = 0; i < chars.length; i++) { - if (chars[i].paragraphBreakAfter || i === chars.length - 1) { - // If current char has .paragraphBreakAfter or it's the last element of the array - let end = i + 1; // End index of the current paragraph - paragraphs.push({ start: start, end: end }); - start = i + 1; // The next paragraph starts after the current one ends - } - } - - for (let paragraph of paragraphs) { - let { start, end } = paragraph; - let text = chars.slice(start, end).map(x => x.c).join(''); - text = text.toLowerCase(); - text = removeASCIISymbolsAndNumbers(text); - for (let title of titles) { - if (text.startsWith(title) || text.endsWith(title)) { - return end; - } - } - } - return -1; -} - -function splitIntoWords(chars) { - let words = []; - let currentWordChars = []; - let isNewLine = true; - let offsetFrom = 0; - for (let i = 0; i < chars.length; i++) { - let char = chars[i]; - - currentWordChars.push(char); - - if (char.spaceAfter || char.lineBreakAfter || char.paragraphBreakAfter || i === chars.length - 1) { - let charBeforeWord = chars[offsetFrom - 1]; - - let number = currentWordChars.filter(char => char.c >= '0' && char.c <= '9').map(x => x.c); - if ((charBeforeWord && (charBeforeWord.lineBreakAfter || charBeforeWord.paragraphBreakAfter) || !charBeforeWord) - && number.length > 0 && number.length <= 3 && currentWordChars.length <= number.length + 2) { - // Push the word object if it contains a number - words.push({ - chars: currentWordChars.slice(), // Use slice to copy the array - offsetFrom: offsetFrom, - offsetTo: i, // The current character is part of the word - rect: getBoundingRect(currentWordChars), - distanceToNextChar: null, - pageIndex: char.pageIndex, - number: parseInt(number), - }); - } - currentWordChars = []; // Reset for the next word - offsetFrom = i + 1; - } - } - - // Calculate the distance from the last number of the current word to the next character in chars - for (let j = 0; j < words.length; j++) { - const word = words[j]; - let lastNumberCharIndex = -1; - - // Find the last number in the word - for (let k = word.chars.length - 1; k >= 0; k--) { - if (word.chars[k].c >= '0' && word.chars[k].c <= '9') { - lastNumberCharIndex = k; - break; - } - } - - if (lastNumberCharIndex !== -1) { - const lastNumberChar = word.chars[lastNumberCharIndex]; - let nextChar = chars[word.offsetTo + 1]; // Get the next character in the chars array after the current word - - if (nextChar) { - // Calculate the distance using the rect properties - words[j].distanceToNextChar = nextChar.rect[0] - lastNumberChar.rect[2]; - words[j].distanceToNextChar2 = nextChar.rect[0] - word.chars[0].rect[0]; - } - } - } - - return words; -} - - - - -function extractBySequence(chars) { - // Filter words that have a white space before it and an extractable integer [1], 1. - // Find the longest sequence? - - - let words = splitIntoWords(chars); - - if (!words.length) { - return []; - } - - let clusters1 = getClusters(words, 'distanceToNextChar', 2); - - let cluster1 = clusters1.reduce((a, b) => a.length > b.length ? a : b, []); - - let clusters2 = getClusters(words, 'distanceToNextChar2', 2); - - let cluster2 = clusters2.reduce((a, b) => a.length > b.length ? a : b, []); - - if (cluster1.length > cluster2.length) { - words = cluster1; - } - else { - words = cluster2; - } - - words.sort((a, b) => a.offsetFrom - b.offsetFrom); - - let lastPageIndex = words[0].pageIndex; - - - let finalWords = []; - - let groupedChars = []; - for (let i = 0; i < words.length; i++) { - if (i < words.length - 1) { - // Slice from the current word's offsetFrom to the next word's offsetFrom - groupedChars.push(chars.slice(words[i].offsetFrom, words[i + 1].offsetFrom)); - } else { - // For the last word, slice from its offsetFrom to the end of the chars array - // And try to find paragraph end - - let chars2 = chars.slice(words[i].offsetFrom); - let endIndex = chars2.findIndex(x => x.paragraphBreakAfter); - chars2 = chars2.slice(0, endIndex + 1); - groupedChars.push(chars2); - } - } - - let references = []; - for (let chars of groupedChars) { - let pageIndex = chars[0].pageIndex; - let chars1 = []; - let chars2 = []; - for (let char of chars) { - if (char.pageIndex === pageIndex) { - chars1.push(char); - } else { - chars2.push(char); - } - } - - let position = { - pageIndex, - rects: [getBoundingRect(chars1)], - }; - if (chars2.length) { - position.nextPageRects = [getBoundingRect(chars2)]; - } - - let text = []; - for (let char of chars) { - text.push(char.c); - if (char.spaceAfter) { - text.push(' '); - } - } - text = text.join(''); - - let result = text.match(/\d+/); - let index = result ? parseInt(result[0], 10) : null; - - let reference = { - text, - chars, - position - }; - - if (index) { - reference.index = index; - } - - references.push(reference); - } - - return references; -} - -function extractByLayout(chars) { - let lines = []; - let currentLineChars = []; - let offsetFrom = 0; - for (let i = 0; i < chars.length; i++) { - let char = chars[i]; - currentLineChars.push(char); - if (char.lineBreakAfter) { - lines.push({ - chars: currentLineChars, - offsetFrom, - offsetTo: i, - rect: getBoundingRect(currentLineChars), - }); - offsetFrom = i + 1; - currentLineChars = []; - } - } - - - let deltas = []; - for (let i = 1; i < lines.length; i++) { - let prevLine = lines[i - 1]; - let line = lines[i]; - - let delta = line.rect[0] - prevLine.rect[0]; - if (delta > 5) { - deltas.push({ - offset: prevLine.offsetFrom, - delta, - }); - } - } - - let clusters = getClusters(deltas, 'delta', 1); - - if (!clusters.length) { - return []; - } - - let paragraphBreaks = clusters[0]; - - // Extracting by layout depends on first line of each reference being shifted more on the left - // than other lines. But some lines can fit in a single line therefore the paragraph - // break before that line won't be added. - // Therefore, we try to detect first characters of each line that align with - // other reference beginnings - let extraParagraphBreaks = []; - let breakers = paragraphBreaks.map(x => ({ x: chars[x.offset].rect[0] })); - let breakClusters = getClusters(breakers, 'x', 1); - let breakPoints = breakClusters.map(x => x[0].x); - let paragraphBreakSet = new Set(paragraphBreaks.map(x => x.offset)); - for (let i = 0; i < chars.length; i++) { - let prevChar = chars[i-1]; - let char = chars[i]; - if ((!prevChar || prevChar.rect[0] > char.rect[0]) && - !paragraphBreakSet.has(i) && breakPoints.some(x => Math.abs(x - char.rect[0]) < 1)) { - extraParagraphBreaks.push({ offset: i }); - } - } - // For now make sure that extra breaks are no more than 20% - if (extraParagraphBreaks.length / paragraphBreaks.length <= 0.2) { - paragraphBreaks.push(...extraParagraphBreaks); - } - - paragraphBreaks.sort((a, b) => a.offset - b.offset); - - let groupedChars = []; - for (let i = 0; i < paragraphBreaks.length; i++) { - if (i < paragraphBreaks.length - 1) { - // Slice from the current word's offsetFrom to the next word's offsetFrom - groupedChars.push(chars.slice(paragraphBreaks[i].offset, paragraphBreaks[i + 1].offset)); - } else { - // For the last word, slice from its offsetFrom to the end of the chars array - // And try to find paragraph end - - let chars2 = chars.slice(paragraphBreaks[i].offset); - groupedChars.push(chars2); - } - } - - groupedChars = groupedChars.filter(x => x.length) - - let references = []; - for (let chars of groupedChars) { - let pageIndex = chars[0].pageIndex; - let chars1 = []; - let chars2 = []; - for (let char of chars) { - if (char.pageIndex === pageIndex) { - chars1.push(char); - } else { - chars2.push(char); - } - } - - let position = { - pageIndex, - rects: [getBoundingRect(chars1)], - }; - if (chars2.length) { - position.nextPageRects = [getBoundingRect(chars2)]; - } - - let text = []; - for (let char of chars) { - text.push(char.c); - if (char.spaceAfter) { - text.push(' '); - } - } - text = text.join(''); - - references.push({ - text, - chars, - position, - }); - } - - return references; -} - -function extractByParagraphSpacing(chars) { - let lines = []; - let currentLineChars = []; - let offsetFrom = 0; - for (let i = 0; i < chars.length; i++) { - let char = chars[i]; - currentLineChars.push(char); - if (char.lineBreakAfter) { - lines.push({ - chars: currentLineChars, - offsetFrom, - offsetTo: i, - rect: getBoundingRect(currentLineChars), - }); - offsetFrom = i + 1; - currentLineChars = []; - } - } - - let spacings = []; - for (let i = 1; i < lines.length; i++) { - let prevLine = lines[i - 1]; - let line = lines[i]; - - let spacing = prevLine.rect[1] - line.rect[3]; - if (spacing > 0) { - spacings.push({ - offset: line.offsetFrom, - spacing, - }); - } - } - - let clusters = getClusters(spacings, 'spacing', 0.5); - - let values = clusters[0].map(x => x.spacing); - - let average = values.reduce((acc, val) => acc + val, 0) / values.length; - - let paragraphBreaks = spacings.filter(x => x.spacing > average + 1); - - if (!paragraphBreaks.length) { - return []; - } - - // Put chars before the first paragraph break - let groupedChars = [chars.slice(0, paragraphBreaks[0].offset)]; - for (let i = 0; i < paragraphBreaks.length; i++) { - if (i < paragraphBreaks.length - 1) { - // Slice from the current word's offsetFrom to the next word's offsetFrom - groupedChars.push(chars.slice(paragraphBreaks[i].offset, paragraphBreaks[i + 1].offset)); - } else { - // For the last word, slice from its offsetFrom to the end of the chars array - // And try to find paragraph end - - let chars2 = chars.slice(paragraphBreaks[i].offset); - - groupedChars.push(chars2); - } - } - - let references = []; - for (let chars of groupedChars) { - let pageIndex = chars[0].pageIndex; - let chars1 = []; - let chars2 = []; - for (let char of chars) { - if (char.pageIndex === pageIndex) { - chars1.push(char); - } else { - chars2.push(char); - } - } - - let position = { - pageIndex, - rects: [getBoundingRect(chars1)], - }; - if (chars2.length) { - position.nextPageRects = [getBoundingRect(chars2)]; - } - - let text = []; - for (let char of chars) { - text.push(char.c); - if (char.spaceAfter) { - text.push(' '); - } - } - text = text.join(''); - - references.push({ - text, - chars, - position - }); - } - - let lowerCaseNum = 0; - - for (let reference of references) { - if (reference.text[0] === reference.text[0].toLowerCase()) { - lowerCaseNum++; - } - } - - if (lowerCaseNum >= 5) { - return []; - } - - return references; -} - -// TODO: In Mills - 2015 some lines a single therefore they won't be break. -// Fix that. Use line that aligns other reference start and break everything else that is next to this line - -export async function extractReferences(pdfDocument, structuredCharsProvider) { - let pagesProcessed = 0; - - let refPageIdx = null; - let refPageOffset = null; - for (let i = pdfDocument.catalog.numPages - 1; i >= 0 && pagesProcessed <= 10; i--) { - let chars = await structuredCharsProvider(i); - - // Clone chars because we are modifying (pageIndex, url) them - chars = chars.map(char => ({ ...char })); - - let offset = getReferencesTitleOffset(chars); - if (offset !== -1 ) { - refPageIdx = i; - refPageOffset = offset; - break; - } - pagesProcessed++; - } - - if (refPageIdx !== null) { - const MAX_REF_PAGES = 10; - let combinedChars = []; - for (let i = refPageIdx; i < refPageIdx + MAX_REF_PAGES && i < pdfDocument.catalog.numPages; i++) { - let chars = await structuredCharsProvider(i); - for (let char of chars) { - char.pageIndex = i; - } - if (i === refPageIdx) { - chars = chars.slice(refPageOffset); - } - combinedChars.push(...chars); - } - - combinedChars = combinedChars.filter(x => !x.isolated); - - let groups = [ - extractBySequence(combinedChars), - extractByParagraphSpacing(combinedChars), - extractByLayout(combinedChars), - ]; - - let bestGroup = null; - for (let group of groups) { - if (bestGroup === null || bestGroup.length < group.length) { - bestGroup = group; - } - } - - let references = bestGroup; - - // await getPageReferences(structuredCharsProvider, 0, pdfDocument.catalog.numPages, references); - - await addUrls(pdfDocument, structuredCharsProvider, references); - - - let start = { pageIndex: refPageIdx, offset: refPageOffset }; - - return { references, start }; - } -} - -async function addUrls(pdfDocument, structuredCharsProvider, references) { - let allPageIndexes = references.map(x => x.position.pageIndex); - let minPageIndex = Math.min(...allPageIndexes); - let maxPageIndex = Math.max(...allPageIndexes); - for (let i = minPageIndex; i <= maxPageIndex; i++) { - let linkOverlays = await getRegularLinkOverlays(pdfDocument, structuredCharsProvider, i); - linkOverlays = linkOverlays.filter(x => x.type === 'external-link'); - for (let linkOverlay of linkOverlays) { - let { rects } = linkOverlay.position; - for (let reference of references) { - for (let char of reference.chars) { - if (char.pageIndex === linkOverlay.position.pageIndex) { - for (let rect of rects) { - let centerRect = getCenterRect(char.rect); - if (intersectRects(centerRect, rect)) { - char.url = linkOverlay.url; - } - } - } - } - } - } - } -} diff --git a/src/core/module/reference/extractor/common.js b/src/core/module/reference/extractor/common.js new file mode 100644 index 0000000000000..07840478243bc --- /dev/null +++ b/src/core/module/reference/extractor/common.js @@ -0,0 +1,211 @@ +import { getBoundingRect, getClusters } from '../../util.js'; + +export function splitByPageIndexContinuity(clusters) { + let newClusters = []; + for (let i = 0; i < clusters.length; i++) { + let cluster = clusters[i]; + let newCluster = []; + for (let j = 0; j < cluster.length; j++) { + let previousItem = cluster[j - 1]; + let item = cluster[j]; + if (previousItem && item.pageIndex - previousItem.pageIndex > 1) { + newClusters.push(newCluster); + newCluster = []; + } + newCluster.push(item); + } + newClusters.push(newCluster); + } + return newClusters; +} + +export function hasValidYear(chars) { + // Convert array of objects to a string of characters + let text = chars.map(x => x.c).join(''); + let numbers = (text.match(/\d+/g) || []).map(x => parseInt(x)); + + for (let number of numbers) { + if (number >= 1800 && number <= new Date().getFullYear()) { + return true; + } + } + return false; +} + +function removeASCIISymbolsAndNumbers(inputString) { + let result = ''; + for (let i = 0; i < inputString.length; i++) { + let charCode = inputString.charCodeAt(i); + // Check if the character is a letter (either lowercase or uppercase) + if ((charCode >= 65 && charCode <= 90) || (charCode >= 97 && charCode <= 122) || charCode >= 128) { + result += inputString[i]; + } + } + return result; +} + +function isASCIISymbolOrNumber(char) { + let charCode = char.charCodeAt(0); + return charCode < 65 || (90 < charCode && charCode < 97) || charCode > 122; +} + +export function canStartWithChar(char) { + return ( + !isASCIISymbolOrNumber(char.c) && + // Is uppercase, if has an uppercase version (not the case for Asian languages) + char.c === char.c.toUpperCase() + ); +} + +export function getReferencesTitleOffset(chars) { + let titles = ['references', 'bibliography', 'literature', 'bibliographie', 'literatur']; + let paragraphs = []; + let start = 0; + for (let i = 0; i < chars.length; i++) { + if (chars[i].paragraphBreakAfter || i === chars.length - 1 /*|| chars[i].lineBreakAfter && Math.abs(chars[i].fontSize - chars[i+1].fontSize) > 0.1*/) { + paragraphs.push({ start: start, end: i, text: chars.slice(start, i + 1).map(x => x.c).join('') }); + start = i + 1; + } + } + let results = []; + for (let paragraph of paragraphs) { + let { start, end } = paragraph; + if (end - start > 30 || chars.length - end < 300) { + continue; + } + let text = chars.slice(start, end + 1).map(x => x.c).join(''); + text = text.toLowerCase(); + text = removeASCIISymbolsAndNumbers(text); + for (let title of titles) { + if (text.startsWith(title)) { + results.push({ end, text, fontSize: chars[start].fontSize}) + } + else if (text.endsWith(title)) { + results.push({ end, text, fontSize: chars[end].fontSize}) + } + } + } + let clusters = getClusters(results, 'fontSize', 0.1); + let cluster = clusters.reduce((a, b) => a.fontSize > b.fontSize ? a : b, []); + if (cluster.length === 1) { + let result = cluster[0]; + let pagesCount = chars.at(-1).pageIndex + 1; + let pos = (chars[result.end].pageIndex + 1) / pagesCount; + if (pos >= 0.5) { + return result.end + 1; + } + } + return 0; +} + +export function getGroupsFromClusters(chars, clusters) { + let groups = []; + for (let cluster of clusters) { + let breakPoints = cluster; + if (!breakPoints.length) { + continue; + } + let group = []; + for (let i = 0; i < breakPoints.length; i++) { + let breakPoint = breakPoints[i]; + let chars2; + if (i < breakPoints.length - 1) { + chars2 = chars.slice(breakPoint.offset, breakPoints[i + 1].offset); + } else { + let finalBreak = breakPoint.offset; + for (let j = finalBreak; j < chars.length; j++) { + finalBreak = j; + if (chars[j].paragraphBreakAfter) { + break; + } + } + chars2 = chars.slice(breakPoint.offset, finalBreak + 1); + } + group.push({ + valid: hasValidYear(chars2) && chars2.length > 50 && chars2.length < 1000 && chars2[0].pageIndex === chars2.at(-1).pageIndex, + chars: chars2, + text: chars2.map(x => x.c).join(''), + pageIndex: chars2[0].pageIndex, + offset: breakPoint.offset, + spacing: breakPoint.spacing, + delta: breakPoint.delta, + }); + } + + groups.push(group); + } + return groups; +} + +export function getReferencesFromGroup(group, useIndex) { + let references = []; + for (let { chars } of group) { + let pageIndex = chars[0].pageIndex; + let chars1 = []; + let chars2 = []; + for (let char of chars) { + if (char.pageIndex === pageIndex) { + chars1.push(char); + } else { + chars2.push(char); + } + } + + let position = { + pageIndex, + rects: [getBoundingRect(chars1)], + }; + if (chars2.length) { + position.nextPageRects = [getBoundingRect(chars2)]; + } + + let text = []; + for (let char of chars) { + text.push(char.c); + if (char.spaceAfter) { + text.push(' '); + } + } + text = text.join(''); + + let reference = { + text, + chars, + position + }; + + if (useIndex) { + let result = text.match(/\d+/); + let index = result ? parseInt(result[0], 10) : null; + + if (index) { + reference.index = index; + } + } + references.push(reference); + } + return references; +} + +export function getLinesFromChars(chars) { + let lines = []; + let currentLineChars = []; + let offsetFrom = 0; + for (let i = 0; i < chars.length; i++) { + let char = chars[i]; + currentLineChars.push(char); + if (char.lineBreakAfter) { + lines.push({ + chars: currentLineChars, + offset: offsetFrom, + rect: getBoundingRect(currentLineChars), + pageIndex: char.pageIndex, + }); + offsetFrom = i + 1; + currentLineChars = []; + } + } + return lines; +} + + diff --git a/src/core/module/reference/extractor/extractor.js b/src/core/module/reference/extractor/extractor.js new file mode 100644 index 0000000000000..4629d1800ce4f --- /dev/null +++ b/src/core/module/reference/extractor/extractor.js @@ -0,0 +1,71 @@ +import { getCenterRect, intersectRects } from '../../util.js'; +import { getRegularLinkOverlays } from '../../link/link.js'; +import { extractByListNumberSpacing } from './list-number-spacing.js'; +import { extractByFirstLineIndent } from './first-line-indent.js'; +import { extractByParagraphSpacing } from './paragraph-spacing.js'; +import { getReferencesTitleOffset } from './common.js'; + +// TODO: In Mills - 2015 some lines a single therefore they won't be break. +// Fix that. Use line that aligns other reference start and break everything else that is next to this line + +export async function extractReferences(pdfDocument, combinedChars) { + let sectionOffset = getReferencesTitleOffset(combinedChars); + + let groups = [ + await extractByListNumberSpacing(combinedChars, sectionOffset), + await extractByFirstLineIndent(combinedChars, sectionOffset), + await extractByParagraphSpacing(combinedChars, sectionOffset), + ]; + + groups = groups.filter(x => x); + + let bestGroup = null; + for (let group of groups) { + if (bestGroup === null || bestGroup.references.length < group.references.length) { + bestGroup = group; + } + } + + if (!bestGroup) { + return null; + } + + let { references, offset } = bestGroup; + + await addUrls(pdfDocument, combinedChars, references); + + return { references, offset }; +} + +async function addUrls(pdfDocument, combinedChars, references) { + // Uncombine chars + let charPagesMap = new Map(); + for (let char of combinedChars) { + if (!charPagesMap.has(char.pageIndex)) { + charPagesMap.set(char.pageIndex, []); + } + charPagesMap.get(char.pageIndex).push(char); + } + let allPageIndexes = references.map(x => x.position.pageIndex); + let minPageIndex = Math.min(...allPageIndexes); + let maxPageIndex = Math.max(...allPageIndexes); + for (let i = minPageIndex; i <= maxPageIndex; i++) { + let linkOverlays = await getRegularLinkOverlays(pdfDocument, charPagesMap.get(i), i); + linkOverlays = linkOverlays.filter(x => x.type === 'external-link'); + for (let linkOverlay of linkOverlays) { + let { rects } = linkOverlay.position; + for (let reference of references) { + for (let char of reference.chars) { + if (char.pageIndex === linkOverlay.position.pageIndex) { + for (let rect of rects) { + let centerRect = getCenterRect(char.rect); + if (intersectRects(centerRect, rect)) { + char.url = linkOverlay.url; + } + } + } + } + } + } + } +} diff --git a/src/core/module/reference/extractor/first-line-indent.js b/src/core/module/reference/extractor/first-line-indent.js new file mode 100644 index 0000000000000..636c22cf46097 --- /dev/null +++ b/src/core/module/reference/extractor/first-line-indent.js @@ -0,0 +1,230 @@ +import { getClusters } from '../../util.js'; +import { + canStartWithChar, + getGroupsFromClusters, + getLinesFromChars, + getReferencesFromGroup, + splitByPageIndexContinuity, +} from './common.js'; + +// TODO: Rename to hanging indent, and don't do regular indent with this extractor, +// because normal indent is much more common and results in more false results. +// Instead, paragraph spacing should be used for that + +function isValidReference2(chars) { + let starts = []; + for (let i = 1; i < chars.length - 1; i++) { + if (chars[i - 1].lineBreakAfter) { + starts.push(i); + } + } + + // TODO: take into account chaning pageinxdex + if (starts.length >= 2) { + for (let i = 0; i < starts.length - 1; i++) { + let lineStartChar = chars[starts[i]]; + let nextLineStartChar = chars[starts[i + 1]]; + if (lineStartChar.rect[1] > nextLineStartChar.rect[1] && + lineStartChar.pageIndex === nextLineStartChar.pageIndex) { + if (Math.abs(lineStartChar.rect[0] - nextLineStartChar.rect[0]) > 2) { + return false; + } + } + } + } + + if (starts.length >= 1) { + let char1 = chars[0]; + let char2 = chars[starts[0]]; + if (Math.abs(char1.rect[0] - char2.rect[0]) < 2) { + return false; + } + } + + return true; +} + +function findClosestSmallerAndHigher(arr, target) { + let closestSmaller = null; // Initialize closest smaller as null + let closestHigher = null; // Initialize closest higher as null + + for (let i = 0; i < arr.length; i++) { + const num = arr[i]; + if (num < target) { + // For closest smaller: We want the maximum number that is still less than the target + if (closestSmaller === null || num > closestSmaller) { + closestSmaller = num; + } + } else if (num > target) { + // For closest higher: We want the minimum number that is still greater than the target + if (closestHigher === null || num < closestHigher) { + closestHigher = num; + } + } + } + + let res = []; + if (closestSmaller) { + res.push(closestSmaller); + } + + if (closestHigher) { + res.push(closestHigher); + } + + return res; +} + +function addExtraBreakPoints(chars, clusters, sectionOffset) { + for (let cluster of clusters) { + let paragraphBreaks = cluster.map(x => x.offset); + let offsetStart = Math.min(...paragraphBreaks); + let offsetEnd = Math.max(...paragraphBreaks); + + let starPageIndex = chars[offsetStart].pageIndex; + let endPageIndex = chars[offsetEnd].pageIndex; + + // Note: This is useful for adding extra breaks for single-line references before and after + // offsetStart and offsetEnd, but also includes random lines, therefore, at least for the beginning, + // do this only together with references section title detection + // + // while (offsetStart > 0 && chars[offsetStart].pageIndex >= starPageIndex) { + // offsetStart--; + // } + + if (sectionOffset && chars[sectionOffset].pageIndex === starPageIndex) { + offsetStart = sectionOffset; + } + + while (offsetEnd < chars.length - 1 && chars[offsetEnd].pageIndex <= endPageIndex) { + offsetEnd++; + } + + let paragraphBreakSet = new Set(paragraphBreaks); + + let extraParagraphBreaks = [] + + for (let i = offsetStart; i < offsetEnd; i++) { + let prevChar = chars[i - 1]; + let char = chars[i]; + if ( + (prevChar && prevChar.lineBreakAfter) && + canStartWithChar(char) && + !paragraphBreakSet.has(i) + ) { + let closestBreaks = findClosestSmallerAndHigher(paragraphBreaks, i); + for (let closestBreak of closestBreaks) { + let otherChar = chars[closestBreak]; + if (Math.abs(otherChar.rect[0] - char.rect[0]) < 1) { + extraParagraphBreaks.push(i); + break; + } + } + } + } + + for (let extraParagraphBreak of extraParagraphBreaks) { + cluster.push({ + offset: extraParagraphBreak, + text: '---', + }); + cluster.sort((a, b) => a.offset - b.offset) + } + } +} + +function keepConcentratedItemsOnly(groups) { + let groups2 = []; + for (let group of groups) { + let pageItemCount = new Map(); + for (let item of group) { + if (item.valid) { + let count = pageItemCount.get(item.pageIndex) || 0; + count++; + pageItemCount.set(item.pageIndex, count); + } + } + + let group2 = []; + + for (let item of group) { + if ( + pageItemCount.get(item.pageIndex) >= 5 || + pageItemCount.get(item.pageIndex - 1) >= 5 || + pageItemCount.get(item.pageIndex + 1) >= 5 + ) { + group2.push(item); + } + } + if (group2.length) { + groups2.push(group2); + } + } + return groups2; +} + +export async function extractByFirstLineIndent(chars, sectionOffset) { + let lines = getLinesFromChars(chars); + let breakPoints = []; + for (let i = 1; i < lines.length; i++) { + + let beforeLine = lines[i - 2]; + let prevLine = lines[i - 1]; + let line = lines[i]; + let nextLine = lines[i + 1]; + + let tooBigLinespacing = false; + + if (nextLine) { + let lineSpacing = prevLine.rect[1] - line.rect[3]; + let nextLineSpacing = line.rect[1] - nextLine.rect[3]; + if (!(lineSpacing < nextLineSpacing || Math.abs(lineSpacing - nextLineSpacing) < 0.5)) { + tooBigLinespacing = true; + } + } + + if (beforeLine) { + let lineSpacing = prevLine.rect[1] - line.rect[3]; + let beforeLineSpacing = beforeLine.rect[1] - prevLine.rect[3]; + if (!(lineSpacing < beforeLineSpacing || Math.abs(lineSpacing - beforeLineSpacing) < 0.5)) { + tooBigLinespacing = true; + } + } + + if (sectionOffset && prevLine.offset < sectionOffset) { + continue; + } + let delta = line.rect[0] - prevLine.rect[0]; + if (Math.abs(delta) > 5 && Math.abs(delta) < 50 && canStartWithChar(prevLine.chars[0]) && !tooBigLinespacing) { + breakPoints.push({ + offset: prevLine.offset, + delta, + text: prevLine.chars.map(x => x.c).join(''), + pageIndex: prevLine.pageIndex, + }); + } + } + + let clusters = getClusters(breakPoints, 'delta', 1); + clusters.forEach(x => x.sort((a, b) => a.offset - b.offset)); + clusters = splitByPageIndexContinuity(clusters); + addExtraBreakPoints(chars, clusters, sectionOffset); + let groups = getGroupsFromClusters(chars, clusters); + groups = keepConcentratedItemsOnly(groups); + + groups = groups.filter(group => group.filter(item => item.valid).length / group.length >= 0.8); + + if (!sectionOffset) { + groups = groups.filter(x => (chars[x[0].offset].pageIndex + 1) / (chars.at(-1).pageIndex + 1) >= 0.75); + } + + if (groups.length === 0) { + return null; + } + + let group = groups.reduce((a, b) => a.length > b.length ? a : b, []); + + let references = getReferencesFromGroup(group); + let offset = group[0].offset; + return { references, offset }; +} diff --git a/src/core/module/reference/extractor/list-number-spacing.js b/src/core/module/reference/extractor/list-number-spacing.js new file mode 100644 index 0000000000000..616f762f92c22 --- /dev/null +++ b/src/core/module/reference/extractor/list-number-spacing.js @@ -0,0 +1,110 @@ +import { getBoundingRect, getClusters } from '../../util.js'; +import { + getGroupsFromClusters, + getReferencesFromGroup, + splitByPageIndexContinuity, +} from './common.js'; + +function getBreakPoints(chars, sectionOffset) { + let breakPoints = []; + let currentWordChars = []; + let isNewLine = true; + let offsetFrom = sectionOffset; + + for (let i = offsetFrom; i < chars.length; i++) { + let char = chars[i]; + + currentWordChars.push(char); + + if (char.spaceAfter || char.lineBreakAfter || char.paragraphBreakAfter || i === chars.length - 1) { + let charBeforeWord = chars[offsetFrom - 1]; + let charAfterWord = chars[offsetFrom + currentWordChars.length]; + let charLast = currentWordChars.at(-1); + + let number = currentWordChars.filter(char => char.c >= '0' && char.c <= '9').map(x => x.c); + if ( + ( + charBeforeWord && (charBeforeWord.lineBreakAfter || charBeforeWord.paragraphBreakAfter) + || !charBeforeWord + ) && + ( + !charLast.lineBreakAfter + ) + && (currentWordChars[0].c !== '(' || currentWordChars.at(-1).c === ')') + && (currentWordChars[0].c !== '[' || currentWordChars.at(-1).c === ']') + && (currentWordChars.at(-1).c !== ')' || currentWordChars[0].c === '(') + && (currentWordChars.at(-1).c !== ']' || currentWordChars[0].c === '[') + && !currentWordChars.some(x => !['1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '(', ')', '[', ']', '.', ':'].includes(x.c)) + // && charAfterWord.c === charAfterWord.c.toUpperCase() + && number.length > 0 && number.length <= 3 && currentWordChars.length <= number.length + 2) { + // Push the word object if it contains a number + breakPoints.push({ + text: currentWordChars.map(x => x.c).join(''), + chars: currentWordChars.slice(), // Use slice to copy the array + offset: offsetFrom, + rect: getBoundingRect(currentWordChars), + distanceToNextChar: null, + pageIndex: char.pageIndex, + number: parseInt(number), + }); + } + currentWordChars = []; // Reset for the next word + offsetFrom = i + 1; + } + } + + // Calculate the distance from the last number of the current word to the next character in chars + for (let j = 0; j < breakPoints.length; j++) { + const word = breakPoints[j]; + let lastNumberCharIndex = -1; + + // Find the last number in the word + for (let k = word.chars.length - 1; k >= 0; k--) { + if (word.chars[k].c >= '0' && word.chars[k].c <= '9') { + lastNumberCharIndex = k; + break; + } + } + + if (lastNumberCharIndex !== -1) { + const lastNumberChar = word.chars[lastNumberCharIndex]; + let nextChar = chars[word.offset + word.chars.length]; // Get the next character in the chars array after the current word + + if (nextChar) { + // Calculate the distance using the rect properties + breakPoints[j].distanceToNextChar = nextChar.rect[0] - lastNumberChar.rect[2]; + breakPoints[j].distanceToNextChar2 = nextChar.rect[0] - word.chars[0].rect[0]; + } + } + } + + return breakPoints; +} + +export async function extractByListNumberSpacing(chars, sectionOffset) { + // Filter words that have a white space before it and an extractable integer [1], 1. + // Find the longest sequence? + let breakPoints = getBreakPoints(chars, sectionOffset); + if (!breakPoints.length) { + return null; + } + let clusters1 = getClusters(breakPoints, 'distanceToNextChar', 1); + let clusters2 = getClusters(breakPoints, 'distanceToNextChar2', 1); + let clusters = [...clusters1, ...clusters2]; + clusters.forEach(x => x.sort((a, b) => a.offset - b.offset)); + clusters = splitByPageIndexContinuity(clusters); + let groups = getGroupsFromClusters(chars, clusters); + if (!sectionOffset) { + groups = groups.filter(x => (chars[x[0].offset].pageIndex + 1) / (chars.at(-1).pageIndex + 1) >= 0.75); + } + + if (groups.length === 0) { + return null; + } + + let group = groups.reduce((a, b) => a.length > b.length ? a : b, []); + + let references = getReferencesFromGroup(group, true); + let offset = group[0].offset; + return { references, offset }; +} diff --git a/src/core/module/reference/extractor/paragraph-spacing.js b/src/core/module/reference/extractor/paragraph-spacing.js new file mode 100644 index 0000000000000..78dd1d03f1c2e --- /dev/null +++ b/src/core/module/reference/extractor/paragraph-spacing.js @@ -0,0 +1,129 @@ +import { getBoundingRect, getClusters } from '../../util.js'; +import { + canStartWithChar, + getGroupsFromClusters, + getLinesFromChars, getReferencesFromGroup, + splitByPageIndexContinuity, +} from './common.js'; + +function removeItemsWithInvalidSpacing(groups) { + let groups2 = []; + for (let group of groups) { + let group2 = []; + for (let item of group) { + let ignore = false; + for (let j = 0; j < item.chars.length - 1; j++) { + let char = item.chars[j]; + let nextChar = item.chars[j + 1]; + if (char.lineBreakAfter && !(char.rect[3] < nextChar.rect[1]) && + char.rect[1] - nextChar.rect[3] > item.spacing) { + ignore = true; + break; + } + } + if (!ignore) { + group2.push(item); + } + } + if (group2.length) { + groups2.push(group2); + } + } + return groups2; +} + +function keepContinousSequencesOnly(groups) { + // Keep only the longest continuous sequence of references in each group + // Note: The 'ignore' above is responsible for broking continuity + let groups2 = []; + for (let group of groups) { + let currentSequence = []; + let longestSequence = []; + for (let i = 0; i < group.length; i++) { + let prevItem = group[i - 1]; + let curItem = group[i]; + if (!prevItem || prevItem.offset + prevItem.chars.length !== curItem.offset) { + if (currentSequence.length > longestSequence.length) { + longestSequence = currentSequence.slice(); + } + currentSequence = [curItem]; + } else { + currentSequence.push(curItem); + } + } + if (currentSequence.length > longestSequence.length) { + longestSequence = currentSequence.slice(); + } + if (longestSequence.length > 0) { + groups2.push(longestSequence); + } + } + return groups2; +} + +function keepDenseGroupsOnly(groups) { + // Filter groups containing at least 5 valid references in a page + let groups2 = []; + // Count the maximum number of valid items in page for each group + for (let group of groups) { + let pageItemCount = {}; + for (let item of group) { + if (item.valid) { + if (pageItemCount[item.pageIndex]) { + pageItemCount[item.pageIndex]++; + } + else { + pageItemCount[item.pageIndex] = 1; + } + } + } + let max = 0; + let values = Object.values(pageItemCount); + if (values.length) { + max = Math.max(...values); + } + if (max >= 5) { + groups2.push(group); + } + } + return groups2; +} + +export async function extractByParagraphSpacing(chars, sectionOffset) { + let lines = getLinesFromChars(chars); + + let breakPoints = []; + for (let i = 1; i < lines.length; i++) { + let prevLine = lines[i - 1]; + let line = lines[i]; + if (sectionOffset && prevLine.offset < sectionOffset) { + continue; + } + let spacing = prevLine.rect[1] - line.rect[3]; + if (spacing > 0 && canStartWithChar(line.chars[0])) { + breakPoints.push({ offset: line.offset, text: line.text, spacing }); + } + } + + let clusters = getClusters(breakPoints, 'spacing', 0.5); + clusters.forEach(x => x.sort((a, b) => a.offset - b.offset)); + clusters = splitByPageIndexContinuity(clusters); + let groups = getGroupsFromClusters(chars, clusters); + groups = removeItemsWithInvalidSpacing(groups); + groups = keepContinousSequencesOnly(groups); + // Filter groups with at least 70% of valid references + groups = groups.filter(x => x.filter(x => x.valid).length / x.length >= 0.7); + groups = keepDenseGroupsOnly(groups); + if (!sectionOffset) { + groups = groups.filter(x => (chars[x[0].offset].pageIndex + 1) / (chars.at(-1).pageIndex + 1) >= 0.75); + } + // We expect to have only one group left here + if (groups.length !== 1) { + return null; + } + let group = groups[0]; + + let references = getReferencesFromGroup(group); + let offset = group[0].offset; + return { references, offset }; +} diff --git a/src/core/module/reference-matcher.js b/src/core/module/reference/matcher.js similarity index 56% rename from src/core/module/reference-matcher.js rename to src/core/module/reference/matcher.js index d0ea68cbc948c..cf2822f04a57f 100644 --- a/src/core/module/reference-matcher.js +++ b/src/core/module/reference/matcher.js @@ -1,4 +1,24 @@ -import { getRectsFromChars, getSortIndex } from './util.js'; +import { + getCenterRect, + getPositionFromDestination, + getRectsFromChars, + getSortIndex, intersectRects +} from '../util.js'; + +/* + - Number must be in a proper line, there should be enough characters (to exclude equations and formulas) + - [] must have \p, \n, ' ' at the left, and ' ', \n, \p a at the right + - If number has internal link and it points to the same then only use this method to determine if numbe is in-text citation + - Also exclude all numbers with internal links that aren't pointing to reference page (to avoid author affiliatons in superscripts) https://forums.zotero.org/discussion/114783/zotero-7-beta-affiliations-interpreted-as-references#latest + - Detect special word before number i.e Eq., Equation, Ref. Reference, etc. + + - Disable reference extraction and matching, link matching for large papers + + + + - Consider using isolation zones if paragraph font is completely different than the main font + - Consider always rendering the full page in preview, but just show it with scrollbars and focused to matched or linked exact point + */ function getPositionFromRects(chars, pageIndex) { let chars1 = []; @@ -22,8 +42,9 @@ function getPositionFromRects(chars, pageIndex) { return position; } -function matchByNameAndYear(combinedChars, pageIndex, references, index, currentPageLength) { +function matchByNameAndYear(combinedChars, references, index) { // TODO: Don't match names after the year + // TODO: Sort results by year offset distance from the name // let index = new Map(); let matches = []; let matchIndex = 0; @@ -120,9 +141,9 @@ function matchByNameAndYear(combinedChars, pageIndex, references, index, current let minOffset = matches.reduce((acc, val) => Math.min(acc, val.referenceOffset), Infinity); - let someInCurrentPage = matches.some(x => x.pageOffset <= currentPageLength - 1); + // let someInCurrentPage = matches.some(x => x.pageOffset <= currentPageLength - 1); - if (hasName && minOffset < 15 && someInCurrentPage) { + if (hasName && minOffset < 15/* && someInCurrentPage*/) { groups2.push(group); } } @@ -153,14 +174,14 @@ function matchByNameAndYear(combinedChars, pageIndex, references, index, current let word = combinedChars.slice(offset, offset + length); - let position = getPositionFromRects(word, pageIndex); + let position = getPositionFromRects(word, word[0].pageIndex); overlays.push({ type: 'citation', word, offset, position, - sortIndex: getSortIndex(pageIndex, offset, 0), + sortIndex: getSortIndex(word[0].pageIndex, offset, 0), references, }); } @@ -170,7 +191,7 @@ function matchByNameAndYear(combinedChars, pageIndex, references, index, current return overlays; } -function matchByNumber(combinedChars, pageIndex, references) { +async function matchByNumber(pdfDocument, combinedChars, references) { let ranges = []; let range = null; @@ -191,6 +212,7 @@ function matchByNumber(combinedChars, pageIndex, references) { ) { range = { type: 'superscript', + pageIndex: char.pageIndex, chars: [char], offsetFrom: i, offsetTo: i, @@ -202,6 +224,7 @@ function matchByNumber(combinedChars, pageIndex, references) { range = { type: 'brackets', chars: [prevChar, char], + pageIndex: char.pageIndex, offsetFrom: i, offsetTo: i, }; @@ -209,6 +232,16 @@ function matchByNumber(combinedChars, pageIndex, references) { range = { type: 'parentheses', chars: [prevChar, char], + pageIndex: char.pageIndex, + offsetFrom: i, + offsetTo: i, + }; + } else { + range = { + type: 'other', + chars: [char], + before: [prevChar], + pageIndex: char.pageIndex, offsetFrom: i, offsetTo: i, }; @@ -218,26 +251,82 @@ function matchByNumber(combinedChars, pageIndex, references) { // After starting character collection above, continue the collection until it's one of the characters // below and font size doesn't change (superscript) else { - let allowed = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ',', '-', '–', '[', ']', '(', ')']; - - if (allowed.includes(char.c) && Math.abs(prevChar.fontSize - char.fontSize) < 1) { + let allowed = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ',', '-', '–']; + + if ( + (!prevChar || prevChar.pageIndex === char.pageIndex) && + ( + allowed.includes(char.c) || + ['brackets', 'parentheses'].includes(range.type) && [']', ')'].includes(char.c) + ) && + Math.abs(prevChar.fontSize - char.fontSize) < 1 + ) { range.chars.push(char); range.offsetTo = i; + } + else { + ranges.push(range); + range = null; + } + } + } - if (['brackets', 'parentheses'].includes(range.type) && (range.chars.filter(x => ['(', '[', ']', ')'].includes(x.c)).length % 2 === 0)) { - ranges.push(range); - range = null; + let pageIndexes = Array.from(new Set(ranges.map(x => x.pageIndex))).sort(); + + + let internalLinks = new Map(); + for (let pageIndex of pageIndexes) { + let page = await pdfDocument.getPage(pageIndex); + let annotations = await page._parsedAnnotations; + for (let annotation of annotations) { + annotation = annotation.data; + let { dest, rect } = annotation; + if (!dest || !rect) { + continue; + } + let destinationPosition = await getPositionFromDestination(pdfDocument, dest); + if (destinationPosition) { + + let list = internalLinks.get(pageIndex); + if (!list) { + list = []; + internalLinks.set(pageIndex, list); } + list.push({ + rect, + destinationPosition, + }); } - else { - if (range.type === 'superscript') { - ranges.push(range); + } + } + + + // console.log('internal links', internalLinks) + + for (let range of ranges) { + range.destinationIndexes = []; + let { pageIndex, chars } = range; + let pageInternalLinks = internalLinks.get(pageIndex); + if (pageInternalLinks) { + for (let internalLink of pageInternalLinks) { + for (let i = 0; i < chars.length; i++) { + let char = chars[i]; + let centerRect = getCenterRect(char.rect); + if (intersectRects(centerRect, internalLink.rect)) { + let targetPageIndex = internalLink.destinationPosition.pageIndex; + if (!range.destinationIndexes.includes(targetPageIndex)) { + range.destinationIndexes.push(targetPageIndex); + } + } } - range = null; } } } + ranges = ranges.filter(x => !(x.type === 'other' && !x.destinationIndexes.length)); + + + let overlays = []; for (let range of ranges) { @@ -253,15 +342,14 @@ function matchByNumber(combinedChars, pageIndex, references) { let number = parseInt(part); if (number) { numbers.add(number); - if (fillInterval) { - for (let i = lastNum + 1; i < number; i++) { + // Fill the interval but make sure it doesn't grow uncontrollably + for (let i = lastNum + 1; i < number && numbers.size < 50; i++) { numbers.add(i); } } lastNum = number; - } - else { + } else { if (part.split('').some(x => ['-', '–'].includes(x))) { fillInterval = true; } @@ -278,59 +366,88 @@ function matchByNumber(combinedChars, pageIndex, references) { let reference = references.find(x => x.index === number); if (reference) { refs.push(reference); + range.internalLinkType = 0; + if (range.destinationIndexes.length) { + range.internalLinkType = range.destinationIndexes.includes(reference.position.pageIndex) ? 1 : -1; + } } } - if (numbers.length !== refs.length || !refs.length) { - continue; + range.numbers = numbers; + range.references = refs; + + // if (numbers.length !== refs.length || !refs.length) { + // continue; + // } + } + + ranges = ranges.filter(x => x.internalLinkType !== -1); + ranges = ranges.filter(x => x.numbers.length); + ranges = ranges.filter(x => x.references.length); + + + + // for (let range of ranges) { + // console.log(range.type, range.chars.map(x => x.c).join(''), range); + // } + + + let groups = new Map(); + for (let range of ranges) { + let char = range.chars.find(x => x.c >= '0' && x.c <= '9'); + + let id = range.type + '-' + (Math.round(char.fontSize * 10) / 10).toString() + '-' + char.fontName + '-' + range.internalLinkType; + + let group = groups.get(id); + if (!group) { + group = { + ranges: [], + references: new Set(), + }; + groups.set(id, group); + } + + group.ranges.push(range); + for (let reference of range.references) { + group.references.add(reference); + } + } + + let bestGroup; + for (let [id, group] of groups) { + if (!bestGroup || bestGroup.references.size < group.references.size) { + bestGroup = group; } + } + if (!bestGroup) { + return []; + } + + for (let range of bestGroup.ranges) { + + let { references } = range; let word = range.chars; - let position = getPositionFromRects(word, pageIndex); + let position = getPositionFromRects(word, range.chars[0].pageIndex); overlays.push({ type: 'citation', word, offset: range.offsetFrom, position, - sortIndex: getSortIndex(pageIndex, range.offsetFrom, 0), - references: refs, + sortIndex: getSortIndex(range.chars[0].pageIndex, range.offsetFrom, 0), + references, }); } return overlays; } -export async function getReferenceOverlays(referenceData, pageIndex) { - if (!referenceData) { - return []; - } - let { references } = referenceData; - let overlays = []; - for (let reference of references) { - if (reference.position.pageIndex === pageIndex) { - overlays.push({ - type: 'reference', - position: reference.position, - sortIndex: getSortIndex(pageIndex, reference.chars[0].offset, 0), - references: [reference], - }); - } - } - return overlays; -} - -export async function getCitationOverlays(pdfDocument, structuredCharsProvider, pageIndex, referenceData) { - if (!referenceData) { - return []; - } - let {references, start} = referenceData; - if (pageIndex > start.pageIndex) { - return []; - } - +export async function getOverlays(pdfDocument, combinedChars, references) { let index = new Map(); + for (let reference of references) { + // console.log('reference', reference, references) let word = []; let wordOffsetFrom = 0; for (let i = 0; i < reference.chars.length; i++) { @@ -360,40 +477,52 @@ export async function getCitationOverlays(pdfDocument, structuredCharsProvider, } } - let chars = await structuredCharsProvider(pageIndex); - let combinedChars = chars.slice(); + let citationOverlays; - if (pageIndex === start.pageIndex) { - combinedChars = combinedChars.slice(0, start.offset); + if (references[0].index) { + citationOverlays = await matchByNumber(pdfDocument, combinedChars, references); + } + else { + citationOverlays = matchByNameAndYear(combinedChars, references, index); } - let currentPageLength = combinedChars.length; - for (let char of combinedChars) { - char.pageIndex = pageIndex; - } - if (pageIndex + 1 < pdfDocument.catalog.numPages && pageIndex < start.pageIndex) { - let chars = await structuredCharsProvider(pageIndex + 1); - let index = chars.findIndex(x => x.lineBreakAfter); - chars = chars.slice(0, index + 1); - for (let char of chars) { - char.pageIndex = pageIndex + 1; + + + + let referenceMap = new Map(); + + for (let citationOverlay of citationOverlays) { + for (let reference of citationOverlay.references) { + let group = referenceMap.get(reference); + if (!group) { + group = []; + referenceMap.set(reference, group); + } + group.push(citationOverlay) } - combinedChars.push(...chars); } - combinedChars = combinedChars.filter(x => !x.isolated); + let referenceOverlays = []; + for (let [reference, citationOverlays] of referenceMap) { + let { pageIndex, offset } = reference.chars[0]; + let referenceOverlay = { + type: 'reference', + position: reference.position, + sortIndex: getSortIndex(pageIndex, offset, 0), + references: [reference], + citations: [], + }; - let overlays; + for (let citationOverlay of citationOverlays) { + let { word, offset, position } = citationOverlay; + referenceOverlay.citations.push({ word, offset, position }); + } - if (references[0].index) { - overlays = matchByNumber(combinedChars, pageIndex, references); - } - else { - overlays = matchByNameAndYear(combinedChars, pageIndex, references, index, currentPageLength); + referenceOverlays.push(referenceOverlay); } - return overlays; + return { citationOverlays, referenceOverlays }; } function canJoinMatches(chars, match1, match2, matches) { diff --git a/src/core/module/reference/reference.js b/src/core/module/reference/reference.js new file mode 100644 index 0000000000000..ea98216995278 --- /dev/null +++ b/src/core/module/reference/reference.js @@ -0,0 +1,31 @@ +import { extractReferences } from './extractor/extractor.js'; +import { getOverlays } from './matcher.js'; + +export async function getCitationAndReferenceOverlays(pdfDocument, structuredCharsProvider) { + let { numPages } = pdfDocument.catalog; + + if (numPages > 100) { + return []; + } + + let combinedChars = []; + + for (let i = 0; i < numPages; i++) { + let chars = await structuredCharsProvider(i); + chars = chars.filter(x => !x.isolated).map(x => ({ ...x })); + combinedChars.push(...chars); + } + + let res = await extractReferences(pdfDocument, combinedChars); + if (!res) { + return []; + } + + let { references, offset } = res; + + combinedChars = combinedChars.slice(0, offset); + + let { citationOverlays, referenceOverlays} = await getOverlays(pdfDocument, combinedChars, references); + + return [...citationOverlays, ...referenceOverlays]; +} diff --git a/src/core/module/util.js b/src/core/module/util.js index 934279ef28b8d..1e54f2c5f4f5b 100644 --- a/src/core/module/util.js +++ b/src/core/module/util.js @@ -170,7 +170,9 @@ export function getClusters(objects, property, eps) { for (let i = 1; i < objects.length; i++) { const object = objects[i]; - const distance = Math.abs(object[property] - currentCluster[currentCluster.length - 1][property]); + let min = Math.min(object[property], currentCluster[currentCluster.length - 1][property]); + let max = Math.max(object[property], currentCluster[currentCluster.length - 1][property]); + const distance = max - min; // Add to current cluster if within eps; otherwise, start a new cluster if (distance <= eps) { diff --git a/src/core/worker.js b/src/core/worker.js index ba5b015eedc92..720b79a51f29f 100644 --- a/src/core/worker.js +++ b/src/core/worker.js @@ -837,6 +837,19 @@ class WorkerMessageHandler { return pageData; }); + handler.on("GetProcessedData", async function (data) { + let task = new WorkerTask('GetProcessedData: '); + startWorkerTask(task); + let pageData; + try { + pageData = await pdfManager.pdfDocument.module.getProcessedData(data); + } catch (e) { + console.log(e); + } + finishWorkerTask(task); + return pageData; + }); + handler.on("GetOutline2", async function (data) { let task = new WorkerTask('GetOutline2'); startWorkerTask(task); diff --git a/src/display/api.js b/src/display/api.js index b122a7d0404bb..c8d4a003e5b97 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -951,6 +951,10 @@ class PDFDocumentProxy { return this._transport.messageHandler.sendWithPromise("GetPageData", data); } + getProcessedData(data) { + return this._transport.messageHandler.sendWithPromise("GetProcessedData", data); + } + getOutline2(data) { return this._transport.messageHandler.sendWithPromise("GetOutline2", data); }