Skip to content

Commit

Permalink
Fixed crashes with recognizing certain PDFs per #26
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Dec 10, 2024
1 parent 19dbdf7 commit 655ffd9
Show file tree
Hide file tree
Showing 6 changed files with 268 additions and 10 deletions.
13 changes: 6 additions & 7 deletions js/recognizeConvert.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@ export const compareOCRPage = async (pageA, pageB, options) => {

const binaryImage = skipImage ? null : await ImageCache.getBinary(pageA.n);

// The `tessScheduler` property must be defined manually for Node.js, which runs this function in the main thread.
// In the browser, this is run in a worker, and the Tesseract module is defined automatically there.
if (typeof process !== 'undefined') {
options.tessScheduler = gs.schedulerInner;
}

const pageMetricsObj = pageMetricsArr[pageA.n];
return gs.compareOCRPageImp({
pageA, pageB, binaryImage, pageMetricsObj, options,
Expand Down Expand Up @@ -637,9 +643,6 @@ export async function recognize(options = {}) {
const compOptions = {
debugLabel: opt.saveDebugImages ? 'Combined' : undefined,
supplementComp: true,
// The `tessScheduler` property must be defined manually for Node.js, which runs this function in the main thread.
// In the browser, this is run in a worker, and the Tesseract module is defined automatically there.
tessScheduler: typeof process !== 'undefined' ? gs.schedulerInner : undefined,
ignoreCap: opt.ignoreCap,
ignorePunct: opt.ignorePunct,
confThreshHigh: opt.confThreshHigh,
Expand All @@ -657,10 +660,6 @@ export async function recognize(options = {}) {
const compOptions = {
mode: 'comb',
debugLabel: 'Combined',
supplementComp: true,
// The `tessScheduler` property must be defined manually for Node.js, which runs this function in the main thread.
// In the browser, this is run in a worker, and the Tesseract module is defined automatically there.
tessScheduler: typeof process !== 'undefined' ? gs.schedulerInner : undefined,
ignoreCap: opt.ignoreCap,
ignorePunct: opt.ignorePunct,
confThreshHigh: opt.confThreshHigh,
Expand Down
15 changes: 12 additions & 3 deletions js/worker/compareOCRModule.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ import { imageUtils } from '../objects/imageObjects.js';
import { getRandomAlphanum } from '../utils/miscUtils.js';
// import { CompDebug } from '../objects/imageObjects.js';

/**
 * Debug-only logger that writes a line and resolves once the write has been handed off.
 * This should only be used for debugging purposes.
 *
 * In Node.js the message is written to `process.stderr` and the promise resolves from the
 * write callback, so awaiting it ensures the log has been flushed before execution continues.
 * Where `process` is not defined (e.g. when this module runs in a browser worker), the message
 * is sent to `console.error` instead and the promise resolves immediately.
 *
 * @param {*} x - Value to log; it is converted with `String(x)`.
 * @returns {Promise<void>} Resolves after the message has been written. Never rejects.
 */
const debugLog = (x) => new Promise((resolve) => {
  const message = String(x);
  const hasStderr = typeof process !== 'undefined'
    && process.stderr
    && typeof process.stderr.write === 'function';

  if (hasStderr) {
    // Best-effort: a failed debug write is deliberately not propagated to the caller.
    process.stderr.write(`${message}\n`, () => resolve());
    return;
  }

  // No Node.js `process` global available; fall back to the console.
  console.error(message);
  resolve();
});

/** @type {OffscreenCanvasRenderingContext2D} */
let calcCtx;
/** @type {OffscreenCanvasRenderingContext2D} */
Expand Down Expand Up @@ -1150,15 +1156,18 @@ export async function checkWords(wordsA, binaryImage, imageRotated, pageMetricsO

let res;
if (options.tessScheduler) {
res = (await options.tessScheduler.addJob('recognize', inputImage, extraConfig)).data;
res = (await options.tessScheduler.addJob('recognize', {
image: inputImage,
options: extraConfig,
}));
} else if (options.tessWorker) {
res = await options.tessWorker.recognize(inputImage, extraConfig);
res = (await options.tessWorker.recognize(inputImage, extraConfig)).data;
} else {
throw new Error('`tessScheduler` and `tessWorker` missing. One must be provided for words to be checked.');
}

let wordTextA = wordsA.map((x) => x.text).join(' ');
let wordTextB = res.data.text.trim();
let wordTextB = res.text.trim();

wordTextA = ocr.replaceLigatures(wordTextA);
wordTextB = ocr.replaceLigatures(wordTextB);
Expand Down
Binary file added tests/assets/testocr.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
97 changes: 97 additions & 0 deletions tests/assets/testocr_errors.hocr
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
<title></title>
<meta name='font-metrics' content='{"SansDefault":{"normal":{"width":{"46":0.166667,"49":0.5217385362320001,"50":0.8695651159420001,"84":1.055556,"97":0.833333,"98":0.833333,"99":0.833333,"100":0.833333,"101":0.888889,"102":0.555556,"103":0.888889,"104":0.777778,"105":0.166667,"106":0.388889,"107":0.777778,"108":0.166667,"109":1.333333,"110":0.777778,"111":0.888889,"112":0.833333,"113":0.833333,"114":0.5,"115":0.777778,"116":0.444444,"117":0.777778,"118":0.833333,"119":1.277778,"120":0.833333,"121":0.833333,"122":0.833333},"height":{"46":0.166667,"49":1.3913036521739999,"50":1.3913036521739999,"84":1.333333,"97":1,"98":1.333333,"99":1,"100":1.333333,"101":1,"102":1.333333,"103":1.388889,"104":1.333333,"105":1.333333,"106":1.722222,"107":1.333333,"108":1.333333,"109":1,"110":1,"111":1,"112":1.333333,"113":1.333333,"114":1,"115":1,"116":1.277778,"117":1,"118":1,"119":1,"120":1,"121":1.388889,"122":1},"kerning":{"84,104":0.111111,"104,105":0.277778,"105,115":0.166667,"108,111":0.166667,"111,116":0.111111,"111,102":0.055556,"49,50":0.405797231884,"112,111":0.111111,"111,105":0.166667,"105,110":0.222222,"110,116":0.166667,"116,101":0.111111,"101,120":0.111111,"120,116":0.055556,"116,111":0.111111,"101,115":0.111111,"115,116":0.111111,"116,104":0.111111,"104,101":0.166667,"111,99":0.111111,"99,114":0.166667,"99,111":0.111111,"111,100":0.111111,"100,101":0.166667,"97,110":0.222222,"110,100":0.166667,"115,101":0.111111,"101,101":0.111111,"105,102":0.111111,"105,116":0.111111,"119,111":0.055556,"111,114":0.222222,"114,107":0.055556,"107,115":0.111111,"111,110":0.166667,"97,108":0.222222,"108,108":0.222222,"116,121":0.055556,"121,112":0.166667,"112,101":0.111111,"102,105":0.055556,"105,108":0.222222,"108,101":0.166667,"102,111":0,"114,109":0.111111,"109,97":0.166667,"97,116":0.111111,"116,46":0.166667,"113,117":0.222222,"117,105":0.277778,"105,99":0.166667,"99,107":0.111111,"98,114":0.166667,"114,111":0,"111,119":0.111111,"119,110":0.111111,"100,111":0.166667,"111,103":0.1111
11,"106,117":0.222222,"117,109":0.222222,"109,112":0.222222,"101,100":0.111111,"111,118":0.111111,"118,101":0.111111,"101,114":0.166667,"108,97":0.166667,"97,122":0.166667,"122,121":0.055556,"111,120":0.111111,"120,46":0.222222},"kerning2":{"84,104":0.11111100000000007,"104,105":0.277777,"105,115":0.16666599999999998,"108,111":0.1666669999999999,"111,116":0.11111200000000004,"111,102":0.11111099999999996,"49,50":0.40579723188399996,"112,111":0.11111099999999996,"111,105":0.16666599999999998,"105,110":0.22222200000000003,"110,116":0.16666699999999995,"116,101":0.11111099999999996,"101,120":0.11111099999999996,"120,116":0.11111200000000004,"116,111":0.11111099999999996,"101,115":0.11111100000000007,"115,116":0.11111200000000004,"116,104":0.11111100000000007,"104,101":0.1666669999999999,"111,99":0.11111099999999996,"99,114":0.166667,"99,111":0.11111099999999996,"111,100":0.11111099999999996,"100,101":0.1666669999999999,"97,110":0.22222200000000003,"110,100":0.166667,"115,101":0.11111099999999996,"101,101":0.11111099999999996,"105,102":0.11111099999999996,"105,116":0.16666699999999995,"119,111":0.05555499999999991,"111,114":0.22222200000000003,"114,107":0.05555500000000002,"107,115":0.11111100000000007,"111,110":0.16666599999999998,"97,108":0.22222199999999998,"108,108":0.22222199999999998,"116,121":0.05555600000000005,"121,112":0.22222299999999995,"112,101":0.11111099999999996,"102,105":0.05555499999999999,"105,108":0.277777,"108,101":0.1666669999999999,"102,111":0,"114,109":0.05555599999999994,"109,97":0.22222299999999995,"97,116":0.11111200000000004,"116,46":0.22222199999999998,"113,117":0.22222200000000003,"117,105":0.277777,"105,99":0.166667,"99,107":0.11111100000000007,"98,114":0.22222200000000003,"114,111":0,"111,119":0.11111099999999996,"119,110":0.11111100000000007,"100,111":0.1666669999999999,"111,103":0.11111099999999996,"106,117":0.22222200000000003,"117,109":0.22222299999999984,"109,112":0.22222299999999995,"101,100":0.166667,"111,118":0.11111099999999996,"
118,101":0.11111099999999996,"101,114":0.166667,"108,97":0.22222299999999995,"97,122":0.166667,"122,121":0.05555600000000005,"111,120":0.11111099999999996,"120,46":0.22222199999999998},"variants":{},"heightCaps":1.333333,"obs":225,"obsCaps":5},"italic":{"width":{},"height":{},"kerning":{},"kerning2":{},"variants":{},"heightCaps":0,"obs":0,"obsCaps":0},"smallCaps":{"width":{},"height":{},"kerning":{},"kerning2":{},"variants":{},"heightCaps":0,"obs":0,"obsCaps":0},"bold":{"width":{},"height":{},"kerning":{},"kerning2":{},"variants":{},"heightCaps":0,"obs":0,"obsCaps":0},"obs":225}}'></meta>
<meta name='default-font' content='SansDefault'></meta>
<meta name='sans-font' content='NimbusSans'></meta>
<meta name='serif-font' content='NimbusRomNo9L'></meta>
<meta name='enable-opt' content='undefined'></meta>
<meta name='layout' content='[{"n":0,"default":true,"boxes":{}}]'></meta>
<meta name='layout-data-table' content='[{"n":0,"default":true,"tables":[]}]'></meta>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
<meta name='ocr-system' content='scribeocr' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf ocrp_lang ocrp_dir ocrp_font ocrp_fsize'/>
</head>
<body>
<div class='ocr_page' title='bbox 0 0 640 480'>
<span class='ocr_line' title="bbox 36 92 580 122; baseline 0 -6; x_x_height 18; x_asc_height 23">
<span class='ocrx_word' id='word_1_0' title='bbox 36 92 96 116;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>ThAs</span>
<span class='ocrx_word' id='word_1_1' title='bbox 109 92 129 116;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>is</span>
<span class='ocrx_word' id='word_1_2' title='bbox 141 98 156 116;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>a</span>
<span class='ocrx_word' id='word_1_3' title='bbox 169 92 201 116;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>lot</span>
<span class='ocrx_word' id='word_1_4' title='bbox 212 92 240 116;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>of</span>
<span class='ocrx_word' id='word_1_5' title='bbox 251 92 282 116;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>12</span>
<span class='ocrx_word' id='word_1_6' title='bbox 296 92 364 122;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>point</span>
<span class='ocrx_word' id='word_1_7' title='bbox 374 93 427 116;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>text</span>
<span class='ocrx_word' id='word_1_8' title='bbox 437 93 463 116;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>to</span>
<span class='ocrx_word' id='word_1_9' title='bbox 474 93 526 116;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>test</span>
<span class='ocrx_word' id='word_1_10' title='bbox 536 92 580 116;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>the</span>
</span>
<span class='ocr_line' title="bbox 36 126 618 157; baseline 0 -7; x_x_height 18; x_asc_height 24">
<span class='ocrx_word' id='word_1_11' title='bbox 36 132 81 150;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>ocr</span>
<span class='ocrx_word' id='word_1_12' title='bbox 91 126 160 150;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>code</span>
<span class='ocrx_word' id='word_1_13' title='bbox 172 126 223 150;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>and</span>
<span class='ocrx_word' id='word_1_14' title='bbox 236 132 286 150;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>see</span>
<span class='ocrx_word' id='word_1_15' title='bbox 299 126 314 150;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>if</span>
<span class='ocrx_word' id='word_1_16' title='bbox 325 126 339 150;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>it</span>
<span class='ocrx_word' id='word_1_17' title='bbox 348 126 433 150;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>works</span>
<span class='ocrx_word' id='word_1_18' title='bbox 445 132 478 150;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>on</span>
<span class='ocrx_word' id='word_1_19' title='bbox 500 126 529 150;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>all</span>
<span class='ocrx_word' id='word_1_20' title='bbox 541 127 618 157;x_wconf 100;x_font Arial' lang='eng' style='font-family:Arial'>types</span>
</span>
<span class='ocr_line' title="bbox 36 160 223 184; baseline 0 0; x_x_height 18; x_asc_height 24">
<span class='ocrx_word' id='word_1_21' title='bbox 36 160 64 184;x_wconf 100;x_font Verdana' lang='eng' style='font-family:Verdana'>of</span>
<span class='ocrx_word' id='word_1_22' title='bbox 72 160 113 184;x_wconf 100;x_font Verdana' lang='eng' style='font-family:Verdana'>file</span>
<span class='ocrx_word' id='word_1_23' title='bbox 123 160 223 184;x_wconf 100;x_font Verdana' lang='eng' style='font-family:Verdana'>format.</span>
</span>
<span class='ocr_line' title="bbox 36 194 585 225; baseline 0 -7; x_x_height 18; x_asc_height 24">
<span class='ocrx_word' id='word_1_24' title='bbox 36 194 91 218;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>The</span>
<span class='ocrx_word' id='word_1_25' title='bbox 102 194 177 224;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>quick</span>
<span class='ocrx_word' id='word_1_26' title='bbox 189 194 274 218;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>brown</span>
<span class='ocrx_word' id='word_1_27' title='bbox 287 194 339 225;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>dog</span>
<span class='ocrx_word' id='word_1_28' title='bbox 348 194 456 225;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>jumped</span>
<span class='ocrx_word' id='word_1_29' title='bbox 468 200 531 218;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>over</span>
<span class='ocrx_word' id='word_1_30' title='bbox 540 194 585 218;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>the</span>
</span>
<span class='ocr_line' title="bbox 37 228 585 259; baseline 0 -7; x_x_height 18; x_asc_height 24">
<span class='ocrx_word' id='word_1_31' title='bbox 37 228 92 259;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>lazy</span>
<span class='ocrx_word' id='word_1_32' title='bbox 103 228 153 252;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>fox.</span>
<span class='ocrx_word' id='word_1_33' title='bbox 165 228 220 252;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>The</span>
<span class='ocrx_word' id='word_1_34' title='bbox 232 228 307 258;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>quick</span>
<span class='ocrx_word' id='word_1_35' title='bbox 319 228 404 252;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>brown</span>
<span class='ocrx_word' id='word_1_36' title='bbox 417 228 468 259;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>dog</span>
<span class='ocrx_word' id='word_1_37' title='bbox 478 228 585 259;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>jumped</span>
</span>
<span class='ocr_line' title="bbox 36 262 597 293; baseline 0 -7; x_x_height 18; x_asc_height 24">
<span class='ocrx_word' id='word_1_38' title='bbox 36 268 99 286;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>over</span>
<span class='ocrx_word' id='word_1_39' title='bbox 109 262 153 286;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>the</span>
<span class='ocrx_word' id='word_1_40' title='bbox 165 262 221 293;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>lazy</span>
<span class='ocrx_word' id='word_1_41' title='bbox 231 262 281 286;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>fox.</span>
<span class='ocrx_word' id='word_1_42' title='bbox 294 262 349 286;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>The</span>
<span class='ocrx_word' id='word_1_43' title='bbox 360 262 435 292;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>quick</span>
<span class='ocrx_word' id='word_1_44' title='bbox 447 262 532 286;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>brown</span>
<span class='ocrx_word' id='word_1_45' title='bbox 545 262 597 293;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>dog</span>
</span>
<span class='ocr_line' title="bbox 43 296 561 327; baseline 0 -7; x_x_height 18; x_asc_height 24">
<span class='ocrx_word' id='word_1_46' title='bbox 43 296 150 327;x_wconf 100;x_font Verdana' lang='eng' style='font-family:Verdana'>jumped</span>
<span class='ocrx_word' id='word_1_47' title='bbox 162 302 226 320;x_wconf 100;x_font Verdana' lang='eng' style='font-family:Verdana'>over</span>
<span class='ocrx_word' id='word_1_48' title='bbox 235 296 279 320;x_wconf 100;x_font Verdana' lang='eng' style='font-family:Verdana'>the</span>
<span class='ocrx_word' id='word_1_49' title='bbox 292 296 347 327;x_wconf 100;x_font Verdana' lang='eng' style='font-family:Verdana'>lazy</span>
<span class='ocrx_word' id='word_1_50' title='bbox 357 296 407 320;x_wconf 100;x_font Verdana' lang='eng' style='font-family:Verdana'>fox.</span>
<span class='ocrx_word' id='word_1_51' title='bbox 420 296 475 320;x_wconf 100;x_font Verdana' lang='eng' style='font-family:Verdana'>The</span>
<span class='ocrx_word' id='word_1_52' title='bbox 486 296 561 326;x_wconf 100;x_font Verdana' lang='eng' style='font-family:Verdana'>quick</span>
</span>
<span class='ocr_line' title="bbox 37 330 561 361; baseline 0 -7; x_x_height 18; x_asc_height 24">
<span class='ocrx_word' id='word_1_53' title='bbox 37 330 122 354;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>brown</span>
<span class='ocrx_word' id='word_1_54' title='bbox 135 330 187 361;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>dog</span>
<span class='ocrx_word' id='word_1_55' title='bbox 196 330 304 361;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>jumped</span>
<span class='ocrx_word' id='word_1_56' title='bbox 316 336 379 354;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>over</span>
<span class='ocrx_word' id='word_1_57' title='bbox 388 330 433 354;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>the</span>
<span class='ocrx_word' id='word_1_58' title='bbox 445 330 500 361;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>lazy</span>
<span class='ocrx_word' id='word_1_59' title='bbox 511 330 561 354;x_wconf 100;x_font DejaVu_Sans_Ultra-Light' lang='eng' style='font-family:DejaVu_Sans_Ultra-Light'>fox.</span>
</span>
</div>
</body>
</html>
Loading

0 comments on commit 655ffd9

Please sign in to comment.