Skip to content

Commit

Permalink
Various improvements to text analyzer
Browse files Browse the repository at this point in the history
  • Loading branch information
mrtcode committed Jun 10, 2024
1 parent 296eb48 commit f51dff3
Show file tree
Hide file tree
Showing 19 changed files with 1,212 additions and 810 deletions.
3 changes: 0 additions & 3 deletions src/core/module/content-rect.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,9 @@ export async function getContentRect(pdfDocument, structuredCharsProvider) {
let startPage = Math.max(pageIndex - 2, 0);
let endPage = Math.min(pageIndex + 2, numPages - 1);

let x;

let combinedLines = [];
for (let i = startPage; i <= endPage; i++) {
let chars = await structuredCharsProvider(i);
if (!x) x = chars[2743];
let lines = getLinesFromChars(chars);
combinedLines.push(...lines);
}
Expand Down
9 changes: 4 additions & 5 deletions src/core/module/link/annotation-overlays.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@ import {
getSortIndex,
} from '../util.js';

async function _getLinkAnnotationOverlays(pdfDocument, structuredCharsProvider, pageIndex){
async function _getLinkAnnotationOverlays(pdfDocument, chars, pageIndex){
let overlays = [];
let chars = await structuredCharsProvider(pageIndex);
let page = await pdfDocument.getPage(pageIndex);
let annotations = await page._parsedAnnotations;
for (let annotation of annotations) {
Expand Down Expand Up @@ -60,7 +59,7 @@ async function _getLinkAnnotationOverlays(pdfDocument, structuredCharsProvider,
overlay.url = url;
overlays.push(overlay);
} else if (annotation.dest) {
overlay.type = 'internal-link'
overlay.type = 'internal-link';
let destinationPosition = await getPositionFromDestination(pdfDocument, annotation.dest);
if (destinationPosition) {
overlay.destinationPosition = destinationPosition;
Expand Down Expand Up @@ -95,8 +94,8 @@ async function _getLinkAnnotationOverlays(pdfDocument, structuredCharsProvider,
return combinedOverlays;
}

export async function getAnnotationOverlays(pdfDocument, structuredCharsProvider, pageIndex) {
let linkOverlays = await _getLinkAnnotationOverlays(pdfDocument, structuredCharsProvider, pageIndex);
export async function getAnnotationOverlays(pdfDocument, chars, pageIndex) {
let linkOverlays = await _getLinkAnnotationOverlays(pdfDocument, chars, pageIndex);
for (let linkOverlay of linkOverlays) {
delete linkOverlay.offsetFrom;
delete linkOverlay.offsetTo;
Expand Down
40 changes: 32 additions & 8 deletions src/core/module/link/link.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@ import { getAnnotationOverlays } from './annotation-overlays.js';
import { getMatchedOverlays } from './matched-overlays.js';
import { overlaysIntersect } from '../util.js';

export async function getRegularLinkOverlays(pdfDocument, structuredCharsProvider, pageIndex) {
let annotationOverlays = await getAnnotationOverlays(pdfDocument, structuredCharsProvider, pageIndex);
let parsedOverlays = await getParsedOverlays(pdfDocument, structuredCharsProvider, pageIndex);
export async function getRegularLinkOverlays(pdfDocument, chars, pageIndex) {
let annotationOverlays = await getAnnotationOverlays(pdfDocument, chars, pageIndex);
let parsedOverlays = getParsedOverlays(chars);
let overlays = [...annotationOverlays];
// Add parsed overlays that don't intersect with annotation overlays
for (let parsedOverlay of parsedOverlays) {
if (!annotationOverlays.some(x => overlaysIntersect(x, parsedOverlay))) {
overlays.push(parsedOverlay);
Expand All @@ -15,9 +16,32 @@ export async function getRegularLinkOverlays(pdfDocument, structuredCharsProvide
return overlays;
}

export async function getLinkOverlays(pdfDocument, structuredCharsProvider, contentRect, pageIndex){
let annotationOverlays = await getAnnotationOverlays(pdfDocument, structuredCharsProvider, pageIndex);
let parsedOverlays = await getParsedOverlays(pdfDocument, structuredCharsProvider, pageIndex);
let matchedOverlays = await getMatchedOverlays(pdfDocument, structuredCharsProvider, pageIndex, annotationOverlays, contentRect);
return [...annotationOverlays, ...parsedOverlays, ...matchedOverlays];
/**
 * Collects link overlays for the leading pages of a document.
 *
 * For each page (at most the first 50) the annotation overlays are taken
 * as-is, then matched overlays and finally parsed overlays are appended,
 * each one only if it does not intersect an overlay already accepted for
 * that page.
 *
 * @param pdfDocument - Document object; `catalog.numPages` is read from it.
 * @param structuredCharsProvider - Async function returning the chars of a page index.
 * @param contentRect - Passed through to `getMatchedOverlays`.
 * @returns {Promise<Map>} Map from page index to that page's overlays;
 *   pages without any overlays are omitted.
 */
export async function getLinkOverlays(pdfDocument, structuredCharsProvider, contentRect) {
	const pageLimit = Math.min(50, pdfDocument.catalog.numPages);
	const overlaysByPage = new Map();

	for (let pageIndex = 0; pageIndex < pageLimit; pageIndex++) {
		const pageChars = await structuredCharsProvider(pageIndex);
		const fromAnnotations = await getAnnotationOverlays(pdfDocument, pageChars, pageIndex);
		const fromText = getParsedOverlays(pageChars);
		const fromMatching = await getMatchedOverlays(
			pdfDocument,
			structuredCharsProvider,
			pageIndex,
			fromAnnotations,
			contentRect
		);

		// Priority order: annotations first, then matched, then parsed.
		// A later candidate is dropped when it intersects anything accepted so far.
		const accepted = [...fromAnnotations];
		for (const candidate of [...fromMatching, ...fromText]) {
			const clashes = accepted.some(existing => overlaysIntersect(existing, candidate));
			if (!clashes) {
				accepted.push(candidate);
			}
		}

		if (accepted.length) {
			overlaysByPage.set(pageIndex, accepted);
		}
	}

	return overlaysByPage;
}
145 changes: 2 additions & 143 deletions src/core/module/link/matched-overlays.js
Original file line number Diff line number Diff line change
Expand Up @@ -189,132 +189,6 @@ function matchCandidates(candidates, pageIndex) {
return matches;
}

/**
 * Grows a rectangle outwards, in steps of at most 10 units per side, until
 * every side has either reached `surroundingRect` or cannot move further
 * without intersecting one of `otherRects`.
 *
 * Rectangles are `[x1, y1, x2, y2]` arrays with x1 <= x2 and y1 <= y2.
 * Sides are advanced round-robin (top, bottom, left, right), so the result
 * depends on that order when obstacles sit diagonally from the rect.
 *
 * @param {number[]} currentRect - Rectangle to expand (not mutated).
 * @param {number[][]} otherRects - Obstacles the result must not newly intersect.
 * @param {number[]} surroundingRect - Outer bounds the result must stay within.
 * @returns {number[]} The expanded rectangle.
 */
function expandRect(currentRect, otherRects, surroundingRect) {
	const STEP = 10;
	let [x1, y1, x2, y2] = currentRect;
	let collision = { top: false, bottom: false, left: false, right: false };

	function intersects(rectA, rectB) {
		return rectA[0] < rectB[2] && rectA[2] > rectB[0] && rectA[1] < rectB[3] && rectA[3] > rectB[1];
	}

	function collides(rect) {
		return otherRects.some(other => intersects(rect, other));
	}

	// Each side is tried with a candidate coordinate that is committed only
	// when it causes no collision. Recomputing the step after moving (as an
	// "undo") yields a different amount near the boundary and would leave the
	// rect overlapping the obstacle.
	while (!collision.top || !collision.bottom || !collision.left || !collision.right) {
		if (!collision.top && y1 > surroundingRect[1]) {
			let nextY1 = y1 - Math.min(STEP, y1 - surroundingRect[1]);
			if (collides([x1, nextY1, x2, y2])) {
				collision.top = true;
			} else {
				y1 = nextY1;
			}
		} else {
			collision.top = true;
		}

		if (!collision.bottom && y2 < surroundingRect[3]) {
			let nextY2 = y2 + Math.min(STEP, surroundingRect[3] - y2);
			if (collides([x1, y1, x2, nextY2])) {
				collision.bottom = true;
			} else {
				y2 = nextY2;
			}
		} else {
			collision.bottom = true;
		}

		if (!collision.left && x1 > surroundingRect[0]) {
			let nextX1 = x1 - Math.min(STEP, x1 - surroundingRect[0]);
			if (collides([nextX1, y1, x2, y2])) {
				collision.left = true;
			} else {
				x1 = nextX1;
			}
		} else {
			collision.left = true;
		}

		if (!collision.right && x2 < surroundingRect[2]) {
			let nextX2 = x2 + Math.min(STEP, surroundingRect[2] - x2);
			if (collides([x1, y1, nextX2, y2])) {
				collision.right = true;
			} else {
				x2 = nextX2;
			}
		} else {
			collision.right = true;
		}
	}

	return [x1, y1, x2, y2];
}

/**
 * Computes a rectangle around the paragraph containing a link destination,
 * expanded outwards until it would hit another dense text paragraph.
 *
 * @param destination - Object with `offsetFrom` / `offsetTo` indices into `chars`.
 * @param chars - Chars of the destination page; each has `rect`, and may have
 *   `paragraphBreakAfter` / `lineBreakAfter` flags.
 * @param contentRect - Currently unused; kept for interface compatibility.
 * @returns {number[]} Expanded `[x1, y1, x2, y2]` rectangle.
 */
function getRect(destination, chars, contentRect) {
	// NOTE(review): hard-coded outer bounds — presumably a page size in PDF
	// points; confirm, and consider deriving it from the actual page.
	const PAGE_RECT = [0, 0, 595.276, 790.866];

	let paragraph = { from: destination.offsetFrom, to: destination.offsetTo };

	// Walk back to the first char of the paragraph containing the destination
	while (chars[paragraph.from - 1] && !chars[paragraph.from - 1].paragraphBreakAfter) {
		paragraph.from--;
	}

	// Walk forward to the paragraph's last char. The upper bound guards against
	// a final paragraph without `paragraphBreakAfter`, which would otherwise
	// index past the end of `chars` and throw.
	while (paragraph.to < chars.length - 1 && !chars[paragraph.to].paragraphBreakAfter) {
		paragraph.to++;
	}

	let rect = getBoundingRect(chars.slice(paragraph.from, paragraph.to + 1));

	// Bounding rects of other paragraphs that act as obstacles for expansion
	let obstacleRects = [];
	let start = 0; // Start index of the current paragraph

	for (let i = 0; i < chars.length; i++) {
		let isParagraphEnd = chars[i].paragraphBreakAfter || i === chars.length - 1;
		if (!isParagraphEnd) {
			continue;
		}
		let end = i;

		let paragraphRect = getBoundingRect(chars.slice(start, end + 1));

		// Total area of the line bounding boxes (only lines terminated by
		// `lineBreakAfter` are counted)
		let sumLines = 0;
		let linesNum = 0;
		let lineStart = start;
		for (let k = start; k <= end; k++) {
			if (chars[k].lineBreakAfter) {
				let lineRect = getBoundingRect(chars.slice(lineStart, k + 1));
				sumLines += (lineRect[2] - lineRect[0]) * (lineRect[3] - lineRect[1]);
				linesNum++;
				lineStart = k + 1;
			}
		}

		// Total area of the individual char boxes
		let sum = 0;
		for (let j = start; j <= end; j++) {
			let char = chars[j];
			sum += (char.rect[2] - char.rect[0]) * (char.rect[3] - char.rect[1]);
		}

		// Only long, multi-line, densely filled paragraphs block expansion,
		// and never the paragraph that holds the destination itself
		let densityRatio = sum / sumLines;
		let containsDestination = paragraph.from >= start && paragraph.from <= end;
		if (end - start > 50 && densityRatio > 0.8 && linesNum >= 2 && !containsDestination) {
			obstacleRects.push(paragraphRect);
		}

		start = i + 1; // The next paragraph starts after the current one ends
	}

	return expandRect(rect, obstacleRects, PAGE_RECT);
}

export async function getMatchedOverlays(pdfDocument, structuredCharsProvider, pageIndex, annotationOverlays, contentRect) {
let MAX_PAGES_AROUND = 5;
let from = Math.max(pageIndex - MAX_PAGES_AROUND, 0);
Expand All @@ -330,26 +204,15 @@ export async function getMatchedOverlays(pdfDocument, structuredCharsProvider, p

let matches = matchCandidates(allCandidates, pageIndex);

for (let match of matches) {
let destination = match.destination;
destination.rect = getRect(destination, pages[destination.pageIndex], contentRect);
}

let overlays = [];
for (let match of matches) {
let sourceChars = pages[match.source.pageIndex].slice(match.source.offsetFrom, match.source.offsetTo + 1);
let sourceRect = getBoundingRect(sourceChars);

let destinationChars = pages[match.destination.pageIndex].slice(match.destination.offsetFrom, match.destination.offsetTo + 1);
let destinationRect = getBoundingRect(destinationChars);

let previewRect = match.destination.rect;
previewRect = [
Math.max(previewRect[0], contentRect[0]),
Math.max(previewRect[1], contentRect[1]),
Math.min(previewRect[2], contentRect[2]),
Math.min(previewRect[3], contentRect[3]),
];
// destinationRect[2] = destinationRect[0];
// destinationRect[1] = destinationRect[3];

let overlay = {
type: 'internal-link',
Expand All @@ -363,10 +226,6 @@ export async function getMatchedOverlays(pdfDocument, structuredCharsProvider, p
pageIndex: match.destination.pageIndex,
rects: [destinationRect],
},
previewPosition: {
pageIndex: match.destination.pageIndex,
rects: [previewRect],
},
};

overlays.push(overlay);
Expand Down
9 changes: 6 additions & 3 deletions src/core/module/link/parsed-overlays.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import { getRangeRects, getSortIndex } from '../util.js';

export async function getParsedOverlays(pdfDocument, structuredCharsProvider, pageIndex) {
let chars = await structuredCharsProvider(pageIndex);

export function getParsedOverlays(chars) {
if (!chars.length) {
return [];
}
let sequences = [];
let sequence = { from: 0, to: 0, lbp: [] };

Expand Down Expand Up @@ -66,6 +67,8 @@ export async function getParsedOverlays(pdfDocument, structuredCharsProvider, pa
}
}

let { pageIndex } = chars[0];

let overlays = [];
for (let link of links) {
let rects = getRangeRects(chars, link.from, link.to - 1);
Expand Down
Loading

0 comments on commit f51dff3

Please sign in to comment.