Skip to content

Commit

Permalink
More fixes to text analyzer
Browse files Browse the repository at this point in the history
  • Loading branch information
mrtcode committed May 30, 2024
1 parent 849a4e2 commit 3a29075
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 9 deletions.
5 changes: 3 additions & 2 deletions src/core/module/content-rect.js
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ export async function getContentRect(pdfDocument, structuredCharsProvider) {
let { view } = await pdfDocument.getPage(numPages === 2 ? 1 : 0);
let width = view[2] - view[0];
rect[0] = 0;
rect[2] = width;

// Note: Even if the page width/height is the same, some pages in the same PDF
// can have a bigger mediaBox with a smaller cropBox applied to it
rect[2] = Infinity;
return rect;
}
25 changes: 22 additions & 3 deletions src/core/module/page-label.js
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,12 @@ function getLabelSequence(words) {
return sequences.sort((a, b) => b.length - a.length)[0];
}

/**
 * Compute the spread (max - min) of a numeric property across a cluster.
 *
 * Done in a single pass instead of `Math.min(...values)` / `Math.max(...values)`
 * so that large clusters cannot hit the engine's argument-count limit, and no
 * temporary arrays are allocated.
 *
 * @param {Object[]} cluster - Items to inspect
 * @param {string} property - Name of the property to read from each item
 * @returns {number} Difference between the largest and the smallest value,
 *   0 for an empty cluster, NaN if any value is not numeric
 */
function getClusterMaxDistance(cluster, property) {
	// An empty cluster has no extent; without this guard the result would be
	// -Infinity (i.e. -Infinity - Infinity)
	if (!cluster.length) {
		return 0;
	}
	let min = Infinity;
	let max = -Infinity;
	for (let item of cluster) {
		let value = item[property];
		// Math.min/Math.max keep the coercion and NaN propagation semantics
		// of the variadic form
		min = Math.min(min, value);
		max = Math.max(max, value);
	}
	return max - min;
}

export async function getPageLabel(pdfDocument, structuredCharsProvider, pageIndex, metadataPagesField) {
const NEXT_PREV_PAGES = 2;
let numPages = pdfDocument.catalog.numPages;
Expand Down Expand Up @@ -221,14 +227,27 @@ export async function getPageLabel(pdfDocument, structuredCharsProvider, pageInd
}
}

let yClusters = getClusters(candidateWords, 'relativeY', 5);
let eps = 5;

let yClusters = getClusters(candidateWords, 'relativeY', eps);

let bestSequence = [];
for (let yCluster of yClusters) {
let clusters = getClusters(yCluster, 'relativeX', 5);
let clusters = getClusters(yCluster, 'relativeX', eps);
for (let cluster of clusters) {
// Ignore clusters with too many values
if (cluster.length > eps * 5) {
continue;
}
let sequence = getLabelSequence(cluster);
if (sequence && sequence.length > bestSequence) {

if (
sequence &&
sequence.length > bestSequence &&
// Make sure the final sequence page label min and max distance doesn't surpass eps
getClusterMaxDistance(sequence, 'relativeY') <= eps &&
getClusterMaxDistance(sequence, 'relativeX') <= eps
) {
bestSequence = sequence;
}
}
Expand Down
10 changes: 7 additions & 3 deletions src/core/module/reference-extractor.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ function removeASCIISymbolsAndNumbers(inputString) {
}

function getReferencesTitleOffset(chars) {
let titles = ['references', 'bibliography', 'literature', 'bibliographie'];
let titles = ['references', 'bibliography', 'literature', 'bibliographie', 'literatur'];

let paragraphs = []; // This will hold the start and end indices of each paragraph
let start = 0; // Start index of the current paragraph
Expand Down Expand Up @@ -244,6 +244,10 @@ function extractByLayout(chars) {

let clusters = getClusters(deltas, 'delta', 1);

if (!clusters.length) {
return [];
}

let paragraphBreaks = clusters[0];

// Extracting by layout depends on first line of each reference being shifted more on the left
Expand Down Expand Up @@ -320,11 +324,11 @@ function extractByLayout(chars) {
references.push({
text,
chars,
position
position,
});
}

return references
return references;
}

function extractByParagraphSpacing(chars) {
Expand Down
4 changes: 3 additions & 1 deletion src/core/module/structure.js
Original file line number Diff line number Diff line change
Expand Up @@ -906,7 +906,9 @@ function split(chars, reflowRTL) {
}

for (let char of chars) {
if (char.lineBreakAfter && !char.paragraphBreakAfter && dashChars.has(char.c)) {
// OCRed PDFs sometimes result in each line being a separate paragraph,
// while normal PDFs only need this when a paragraph is wrapped to another column
if (char.lineBreakAfter /*&& !char.paragraphBreakAfter*/ && dashChars.has(char.c)) {
char.ignorable = true;
}
}
Expand Down

0 comments on commit 3a29075

Please sign in to comment.