Skip to content

Commit

Permalink
Moved JSON export code from JavaScript to C++ (#984)
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica authored Dec 25, 2024
1 parent 6cf46c1 commit 28846b3
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 161 deletions.
15 changes: 8 additions & 7 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
"node-fetch": "^2.6.9",
"opencollective-postinstall": "^2.0.3",
"regenerator-runtime": "^0.13.3",
"tesseract.js-core": "^5.1.1",
"tesseract.js-core": "^6.0.0-0",
"wasm-feature-detect": "^1.2.11",
"zlibjs": "^0.3.1"
},
Expand Down
156 changes: 3 additions & 153 deletions src/worker-script/utils/dump.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,21 +40,6 @@ const deindent = (html) => {
* @access public
*/
module.exports = (TessModule, api, output, options) => {
const ri = api.GetIterator();
const {
RIL_BLOCK,
RIL_PARA,
RIL_TEXTLINE,
RIL_WORD,
RIL_SYMBOL,
} = TessModule;
const blocks = [];
let block;
let para;
let textline;
let word;
let symbol;

const enumToString = (value, prefix) => (
Object.keys(TessModule)
.filter((e) => (e.startsWith(`${prefix}_`) && TessModule[e] === value))
Expand All @@ -79,142 +64,6 @@ module.exports = (TessModule, api, output, options) => {
return TessModule.FS.readFile('/tesseract-ocr.pdf');
};

// If output.layoutBlocks is true and options.skipRecognition is true,
// the user wants layout data but text recognition has not been run.
// In this case, fields that require text recognition are skipped.
if (output.blocks || output.layoutBlocks) {
ri.Begin();
do {
if (ri.IsAtBeginningOf(RIL_BLOCK)) {
const poly = ri.BlockPolygon();
let polygon = null;
// BlockPolygon() returns null when automatic page segmentation is off
if (TessModule.getPointer(poly) > 0) {
const n = poly.get_n();
const px = poly.get_x();
const py = poly.get_y();
polygon = [];
for (let i = 0; i < n; i += 1) {
polygon.push([px.getValue(i), py.getValue(i)]);
}
/*
* TODO: find out why _ptaDestroy doesn't work
*/
// TessModule._ptaDestroy(TessModule.getPointer(poly));
}

block = {
paragraphs: [],
text: !options.skipRecognition ? ri.GetUTF8Text(RIL_BLOCK) : null,
confidence: !options.skipRecognition ? ri.Confidence(RIL_BLOCK) : null,
baseline: ri.getBaseline(RIL_BLOCK),
bbox: ri.getBoundingBox(RIL_BLOCK),
blocktype: enumToString(ri.BlockType(), 'PT'),
polygon,
};
blocks.push(block);
}
if (ri.IsAtBeginningOf(RIL_PARA)) {
para = {
lines: [],
text: !options.skipRecognition ? ri.GetUTF8Text(RIL_PARA) : null,
confidence: !options.skipRecognition ? ri.Confidence(RIL_PARA) : null,
baseline: ri.getBaseline(RIL_PARA),
bbox: ri.getBoundingBox(RIL_PARA),
is_ltr: !!ri.ParagraphIsLtr(),
};
block.paragraphs.push(para);
}
if (ri.IsAtBeginningOf(RIL_TEXTLINE)) {
// getRowAttributes was added in a recent minor version of Tesseract.js-core,
// so we need to check if it exists before calling it.
// This can be removed in the next major version (v6).
let rowAttributes;
if (ri.getRowAttributes) {
rowAttributes = ri.getRowAttributes();
// Descenders is reported as a negative within Tesseract internally so we need to flip it.
// The positive version is intuitive, and matches what is reported in the hOCR output.
rowAttributes.descenders *= -1;
}
textline = {
words: [],
text: !options.skipRecognition ? ri.GetUTF8Text(RIL_TEXTLINE) : null,
confidence: !options.skipRecognition ? ri.Confidence(RIL_TEXTLINE) : null,
baseline: ri.getBaseline(RIL_TEXTLINE),
rowAttributes,
bbox: ri.getBoundingBox(RIL_TEXTLINE),
};
para.lines.push(textline);
}
if (ri.IsAtBeginningOf(RIL_WORD)) {
const fontInfo = ri.getWordFontAttributes();
const wordDir = ri.WordDirection();
word = {
symbols: [],
choices: [],

text: !options.skipRecognition ? ri.GetUTF8Text(RIL_WORD) : null,
confidence: !options.skipRecognition ? ri.Confidence(RIL_WORD) : null,
baseline: ri.getBaseline(RIL_WORD),
bbox: ri.getBoundingBox(RIL_WORD),

is_numeric: !!ri.WordIsNumeric(),
in_dictionary: !!ri.WordIsFromDictionary(),
direction: enumToString(wordDir, 'DIR'),
language: ri.WordRecognitionLanguage(),

is_bold: fontInfo.is_bold,
is_italic: fontInfo.is_italic,
is_underlined: fontInfo.is_underlined,
is_monospace: fontInfo.is_monospace,
is_serif: fontInfo.is_serif,
is_smallcaps: fontInfo.is_smallcaps,
font_size: fontInfo.pointsize,
font_id: fontInfo.font_id,
font_name: fontInfo.font_name,
};
const wc = new TessModule.WordChoiceIterator(ri);
do {
word.choices.push({
text: !options.skipRecognition ? wc.GetUTF8Text() : null,
confidence: !options.skipRecognition ? wc.Confidence() : null,
});
} while (wc.Next());
TessModule.destroy(wc);
textline.words.push(word);
}

// let image = null;
// var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL)
// var image = pix2array(pix);
// // for some reason it seems that things stop working if you destroy pics
// TessModule._pixDestroy(TessModule.getPointer(pix));
if (ri.IsAtBeginningOf(RIL_SYMBOL)) {
symbol = {
choices: [],
image: null,
text: !options.skipRecognition ? ri.GetUTF8Text(RIL_SYMBOL) : null,
confidence: !options.skipRecognition ? ri.Confidence(RIL_SYMBOL) : null,
baseline: ri.getBaseline(RIL_SYMBOL),
bbox: ri.getBoundingBox(RIL_SYMBOL),
is_superscript: !!ri.SymbolIsSuperscript(),
is_subscript: !!ri.SymbolIsSubscript(),
is_dropcap: !!ri.SymbolIsDropcap(),
};
word.symbols.push(symbol);
const ci = new TessModule.ChoiceIterator(ri);
do {
symbol.choices.push({
text: !options.skipRecognition ? ci.GetUTF8Text() : null,
confidence: !options.skipRecognition ? ci.Confidence() : null,
});
} while (ci.Next());
// TessModule.destroy(i);
}
} while (ri.Next(RIL_SYMBOL));
TessModule.destroy(ri);
}

return {
text: output.text ? api.GetUTF8Text() : null,
hocr: output.hocr ? deindent(api.GetHOCRText()) : null,
Expand All @@ -227,8 +76,9 @@ module.exports = (TessModule, api, output, options) => {
imageGrey: output.imageGrey ? getImage(imageType.GREY) : null,
imageBinary: output.imageBinary ? getImage(imageType.BINARY) : null,
confidence: !options.skipRecognition ? api.MeanTextConf() : null,
blocks: output.blocks && !options.skipRecognition ? blocks : null,
layoutBlocks: output.layoutBlocks && options.skipRecognition ? blocks : null,
blocks: output.blocks && !options.skipRecognition ? JSON.parse(api.GetJSONText()).blocks : null,
layoutBlocks: output.layoutBlocks && options.skipRecognition
? JSON.parse(api.GetJSONText()).blocks : null,
psm: enumToString(api.GetPageSegMode(), 'PSM'),
oem: enumToString(api.oem(), 'OEM'),
version: api.Version(),
Expand Down
Binary file added tests/assets/images/escape_chars.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
25 changes: 25 additions & 0 deletions tests/recognize.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -269,4 +269,29 @@ describe('recognize()', () => {
}).timeout(TIMEOUT)
));
});

describe('should support blocks (json) output', () => {
it('recongize large image', async () => {
await worker.reinitialize('eng');
const { data: { blocks } } = await worker.recognize(`${IMAGE_PATH}/testocr.png`, {}, { blocks: true });
expect(blocks[0].paragraphs[0].lines[0].words[0].symbols[0].text).to.be('T');
expect(blocks[0].paragraphs[0].lines[0].words[0].text).to.be('This');
expect(blocks[0].paragraphs[0].lines[0].text).to.be('This is a lot of 12 point text to test the\n');
}).timeout(TIMEOUT);

it('recongize image with special characters', async () => {
await worker.reinitialize('eng');
const { data: { blocks } } = await worker.recognize(`${IMAGE_PATH}/escape_chars.png`, {}, { blocks: true });
expect(blocks[0].paragraphs[0].lines[0].text).to.be('"Double Quotes"\n');
expect(blocks[0].paragraphs[0].lines[1].text).to.be('Back \\ Slash\n');
}).timeout(TIMEOUT);

it('recongize chinese image', async () => {
await worker.reinitialize('chi_tra');
const { data: { blocks } } = await worker.recognize(`${IMAGE_PATH}/chinese.png`, {}, { blocks: true });
expect(blocks[0].paragraphs[0].lines[0].words[0].symbols[0].text).to.be('繁');
expect(blocks[0].paragraphs[0].lines[0].words[0].text).to.be('繁體');
expect(blocks[0].paragraphs[0].lines[0].text).to.be('繁體 中 文 測試\n');
}).timeout(TIMEOUT);
});
});

0 comments on commit 28846b3

Please sign in to comment.