User:Putnik/TesseractOCR.js
Jump to navigation
Jump to search
Note: After publishing, you may have to bypass your browser's cache to see the changes.
- Firefox / Safari: Hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (⌘-R on a Mac)
- Google Chrome: Press Ctrl-Shift-R (⌘-Shift-R on a Mac)
- Internet Explorer / Edge: Hold Ctrl while clicking Refresh, or press Ctrl-F5
- Opera: Press Ctrl-F5.
/*jshint boss:true*/
/*global $, mw*/
/**
* This script adds a toolbar button that replaces the editbox text with OCR text
* derived by sending the .prp-page-image image through Tesseract.js.
*
* For more information, see https://wikisource.org/wiki/Wikisource:Tesseract_OCR
*/
( function ( mw, $ ) {
/**
 * User-facing messages. The first five keys are looked up by the status
 * string reported in Tesseract progress updates; the remaining keys are used
 * by this script directly. Any key can be overridden by defining
 * window.tesseractOcrI18n before this script runs.
 */
var i18n = ( function () {
	var defaultMessages = {
		'loading tesseract core': 'Loading Tesseract core',
		'initializing tesseract': 'Initializing Tesseract',
		'loading language traineddata': 'Loading language traineddata',
		'initializing api': 'Initializing API',
		'recognizing text': 'Recognizing text',
		'no text': 'No text retrieved from Tesseract',
		'image not found': 'No image found on this page',
		'button label': 'Get text via Tesseract OCR',
		'loading indicator': 'Animated loading indicator'
	};
	var overrides = window.tesseractOcrI18n || {};

	return $.extend( defaultMessages, overrides );
}() );
// Maps a wiki content language code (the value of 'wgContentLanguage') to the
// language string handed to Tesseract. A value may join several names with
// '+'; 'eng' is always appended to the chosen value further down.
// Presumably each name is a Tesseract traineddata file name — verify against
// the files served from the configured langPath.
var languages = {
af: 'afr',
ar: 'ara',
az: 'aze',
be: 'bel',
bg: 'bul',
bn: 'ben',
ca: 'cat',
chr: 'chr',
cs: 'ces',
da: 'dan',
de: 'deu',
el: 'ell+grc',
// NOTE(review): 'enm' looks like the Middle English model; together with the
// appended default this yields 'enm+eng' — confirm this is intended.
en: 'enm',
eo: 'epo',
es: 'spa+spa_old',
et: 'est',
eu: 'eus',
fa: 'fas',
fi: 'fin',
fr: 'fra+frm',
gl: 'glg',
he: 'heb',
hi: 'hin',
hr: 'hrv',
hu: 'hun',
id: 'ind',
is: 'isl',
it: 'ita+ita_old',
ja: 'jpn',
kn: 'kan',
ko: 'kor',
lt: 'lit',
lv: 'lav',
mk: 'mkd',
ml: 'mal',
ms: 'msa',
mt: 'mlt',
// Both Norwegian codes (nb, nn) share the single 'nor' entry.
nb: 'nor',
nl: 'nld',
nn: 'nor',
pa: 'pan',
pl: 'pol',
pt: 'por',
ro: 'ron',
ru: 'rus',
sk: 'slk',
sl: 'slv',
sq: 'sqi',
sr: 'srp',
sv: 'swe',
sw: 'swa',
ta: 'tam',
te: 'tel',
th: 'tha',
tl: 'tgl',
tr: 'tur',
uk: 'ukr',
vi: 'vie',
zh: 'chi_sim+chi_tra',
};
// Work out the language string for Tesseract: the entry for the wiki's
// content language (when the table has one) followed by '+eng', or plain
// 'eng' when the content language is not in the table.
var langCode = mw.config.get( 'wgContentLanguage' );
var language = languages[ langCode ] === undefined ?
	'eng' :
	languages[ langCode ] + '+' + 'eng';
// Protocol-relative URL of the animated image shown while OCR is running.
var loadingGifUrl = '//upload.wikimedia.org/wikipedia/commons/4/42/Loading.gif';
/**
 * The initialisation function, run on every load. Adds the OCR button to the
 * toolbar if we're currently editing or previewing in the Page namespace.
 *
 * The button is only added when all of these hold:
 * - the page is in the 'Page' canonical namespace;
 * - the current action is 'edit' or 'submit' (i.e. editing or previewing);
 * - the user has either the classic toolbar or the WikiEditor toolbar enabled.
 *
 * The matching toolbar module is loaded before customizeToolbar() is called.
 */
function run() {
	mw.loader.using( 'user.options', function () {
		var action = mw.config.get( 'wgAction' );
		var isPage = mw.config.get( 'wgCanonicalNamespace' ) === 'Page';
		var isEditing = action === 'edit' || action === 'submit';
		// User options can arrive either as numbers or as numeric strings
		// ("1"), so normalise them before comparing.
		var useOldToolbar = Number( mw.user.options.get( 'showtoolbar' ) ) === 1;
		var useBetaToolbar = Number( mw.user.options.get( 'usebetatoolbar' ) ) === 1;
		var toolbarLib;

		if ( !isPage || !isEditing || !( useOldToolbar || useBetaToolbar ) ) {
			return;
		}

		// The WikiEditor toolbar wins when both preferences are enabled.
		toolbarLib = useBetaToolbar ? 'ext.wikiEditor' : 'mediawiki.toolbar';
		mw.loader.using( [ toolbarLib ], function () {
			customizeToolbar( useBetaToolbar );
		} );
	} );
}
/**
 * Put the OCR button into the editing toolbar and warm up the loading image.
 * run() has already decided that a button is wanted, so nothing is re-checked
 * here apart from which toolbar flavour to target.
 *
 * @param {boolean} useBeta True to target the WikiEditor toolbar, false to
 *  target the classic toolbar (only when mw.toolbar is available).
 */
function customizeToolbar( useBeta ) {
	var buttonLabel = i18n[ 'button label' ];

	if ( useBeta ) {
		// WikiEditor: register the tool once the document is ready.
		$( document ).ready( function () {
			$( '#wpTextbox1' ).wikiEditor( 'addToToolbar', {
				section: 'main', // 'proofreadpage-tools',
				group: 'insert', // 'other',
				tools: {
					TesseractOcr: {
						type: 'button',
						icon: 'https://upload.wikimedia.org/wikipedia/commons/thumb/1/11/Toolbaricon_TesseractOCR.png/120px-Toolbaricon_TesseractOCR.png',
						label: buttonLabel,
						action: { type: 'callback', execute: doOcr }
					}
				}
			} );
			// The icon is wider than a normal tool button, so give it room.
			$( 'a[rel="TesseractOcr"]' ).css( {
				width: '56px',
				backgroundSize: 'contain'
			} );
		} );
	} else if ( mw.toolbar ) {
		// Classic toolbar: add the image button, then wire up the click.
		mw.toolbar.addButton( {
			imageFile: 'https://upload.wikimedia.org/wikipedia/commons/e/e0/Button_ocr.png',
			speedTip: buttonLabel,
			imageId: 'TesseractOcrButton'
		} );
		$( 'img#TesseractOcrButton' )
			.on( 'click', doOcr )
			.css( 'width', '50px' );
	}

	// Fetch the loading animation ahead of time via a hidden image element.
	$( '<img>' )
		.attr( 'src', loadingGifUrl )
		.appendTo( 'body' )
		.hide();
}
/**
 * This function is run when the OCR button is clicked. It loads Tesseract.js,
 * runs it on the page image in the browser and hands the result to
 * processOcrResult(); progress updates go to showProgressMsg().
 *
 * If the page has no '.prp-page-image img' element (or it has no src), a
 * notification is shown and nothing else happens. If recognition fails, the
 * progress box is hidden and the error is shown as a notification.
 */
function doOcr() {
	var cdnBase = 'https://tools-static.wmflabs.org/cdnjs/ajax/libs/tesseract.js/2.0.0-alpha.2/';
	var $image = $( '.prp-page-image img' );
	var src, imageUrl;

	if ( $image.length === 0 ) {
		mw.notify( i18n[ 'image not found' ] );
		// Without an image there is nothing to recognise.
		return;
	}

	src = $image.attr( 'src' );
	if ( !src ) {
		mw.notify( i18n[ 'image not found' ] );
		return;
	}

	// Page images normally use a protocol-relative URL ("//host/path"); pin
	// those to HTTPS. Anything else is passed through untouched so that an
	// already-absolute URL is not corrupted.
	imageUrl = src.indexOf( '//' ) === 0 ? 'https:' + src : src;

	$.getScript( cdnBase + 'tesseract.min.js', function () {
		var TesseractWorker = Tesseract.TesseractWorker;
		var worker = new TesseractWorker( {
			workerPath: cdnBase + 'worker.min.js',
			langPath: 'https://tools.wmflabs.org/tessdata/4.0.0',
			corePath: 'https://tools.wmflabs.org/tessdata/core/tesseract-core.wasm.js'
		} );

		worker
			.recognize( imageUrl, language )
			.progress( showProgressMsg )
			.then( processOcrResult, function ( err ) {
				// An empty status makes showProgressMsg() hide the progress
				// box, so it does not stay on screen after a failure.
				showProgressMsg( { status: '', progress: 1 } );
				mw.notify( err && err.message ? err.message : String( err ), { type: 'error' } );
			} );
	} );
}
/**
 * Handle the outcome of a recognition job: put the recognised text into the
 * edit box, or show a notification when no text came back.
 *
 * @param {Object} result The result object from Tesseract; only its `text`
 *  property is used. A missing result or a missing/empty `text` is treated as
 *  "no text".
 */
function processOcrResult( result ) {
	var text = result ? result.text : undefined;

	if ( typeof text !== 'string' || text.length === 0 ) {
		mw.notify( i18n[ 'no text' ] );
		return;
	}
	// Replaces whatever is currently in the edit box.
	$( '#wpTextbox1' ).val( text );
}
/**
 * Show (or hide) a loading message above the edit box.
 *
 * The message box is created on first use and re-used afterwards. It is shown
 * while `data.status` is non-empty and `data.progress` is below 1, and hidden
 * otherwise (including when either value is missing).
 *
 * @param {Object} data The progress object passed by Tesseract.
 * @param {string} data.status Status key; translated through i18n when a
 *  matching message exists, otherwise shown verbatim.
 * @param {number} data.progress Progress in the range 0 to 1; shown as a
 *  percentage once it is above 0.
 */
function showProgressMsg( data ) {
	var loadingBoxId = 'TesseractOcrLoading';
	var $msgBox = $( '#' + loadingBoxId );
	var status = data && data.status ? String( data.status ) : '';
	var progress = data ? data.progress : undefined;
	var $loadingGif, msg;

	if ( $msgBox.length === 0 ) {
		// First call: build the (initially hidden) box with the spinner
		// followed by an empty span that receives the message text.
		$loadingGif = $( '<img>' )
			.attr( 'src', loadingGifUrl )
			.attr( 'alt', i18n[ 'loading indicator' ] )
			.css( {
				display: 'inline-block',
				margin: '0.3em'
			} );
		$msgBox = $( '<p>' )
			.attr( 'id', loadingBoxId )
			.css( {
				backgroundColor: '#efefef',
				border: '1px solid #ccc',
				display: 'none'
			} )
			.append( $loadingGif, $( '<span>' ) );
		$( '#wpTextbox1' ).before( $msgBox );
	}

	// A missing progress value compares false here, which hides the box.
	if ( status.length !== 0 && progress < 1 ) {
		// Own-property check so that a status such as "constructor" cannot
		// pick up something inherited from Object.prototype.
		msg = Object.prototype.hasOwnProperty.call( i18n, status ) ? i18n[ status ] : status;
		if ( progress > 0 ) {
			msg += ' (' + Math.round( progress * 100 ) + '%)';
		}
		$msgBox.find( 'span' ).text( msg );
		$msgBox.show();
	} else {
		$msgBox.hide();
	}
}
// Start the script as soon as it is loaded; run() decides whether a button is added.
run();
}( mediaWiki, jQuery ) );