MediaWiki:Hocr.js

From Wikisource
Jump to navigation Jump to search

Note: After publishing, you may have to bypass your browser's cache to see the changes.

  • Firefox / Safari: Hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (⌘-R on a Mac)
  • Google Chrome: Press Ctrl-Shift-R (⌘-Shift-R on a Mac)
  • Internet Explorer / Edge: Hold Ctrl while clicking Refresh, or press Ctrl-F5
  • Opera: Press Ctrl-F5.
/*
 *   Author: w:fr:Phe, highlighting code derived from an earlier implementation
 * by Alex Brollo
 *
 *   hocr queries an external server to get the hocr layer of a given Page,
 * then allows highlighting the scan by double clicking on a word. Works
 * in view, preview, edit and diff mode.
 *
 *   The hocr server is called lazily, when the user asks for it (double click). This means
 * the first double click on a page can be a bit slow, around a second, because
 * it'll query the hocr and build the distance matrix; most cpu time is used
 * by building the dist matrix and there is nothing to do about that.
 *
 *   In view mode the html text is instrumented to add a span id=word#nr around
 * each text word. This is used to retrieve the dblclick'ed word.
 *
 *   In edit mode words are retrieved by the position of the selection.
 *
 *   Caveat
 *
 *   The highlighted part of the scan is not necessarily visible. See
 * the comment in highlight_bbox().
 *
 *   Keeping the highlighted word when zooming with the mouse is not supported,
 * but resizing the window keeps the highlight at the right position.
 *
 * TODO:
 *
 *   Improve the accuracy of match, when we are at the boundary of removed
 * or added text, the matcher tend to be off by one word. Same thing at the
 * begin and end of text if the hocr is damaged at this position. (Tried, don't
 * work. -- Phe)
 *
 *   Check if table layout can be broken. Improve accuracy of table handling
 * as ocr tend to produce text by column and table are written in html by line.
 *
 *   In edit mode ref must be moved to the bottom of the text (except ref follow=)
 * but we need to keep a mapping old text pos --> new text pos. All array must be
 * build with the new text but when clicking on word we must map the click pos to
 * the new text pos.
 */

var hocr = {
    // Map word -> integer id, used to build the xxx_text_as_id arrays. Created
    // without a prototype so that words such as "constructor" or "toString"
    // are not mistaken for already registered entries.
    word_to_id : Object.create(null),
    last_word_id : 0,
    hocr_text_as_id : [],
    html_text_as_id : [],
    wiki_text_as_id : [],
    // parallel array of hocr_text_as_id, index of word in hocr_text_as_id are used to retrieve
    // hocr data associated with this word.
    hocr_words_data : [],
    // the .ocr_page data.
    hocr_page_data : {},
    // an array of couple [ end word pos in char count, word id ] to retrieve word position in the edit box.
    edit_box_word_pos : [],
    html_dist_matrix : null,
    hocr_html_matcher : [],
    wiki_text_dist_matrix : null,
    hocr_wiki_text_matcher : [],
    hocr_server_called : false,
    // true while a request to the hocr server is in flight, see load_hocr().
    hocr_request_pending : false,
    // the last word index highlighted, used to redraw the highlighted text.
    hocr_last_word_index : -1,
    // timer id of the pending delayed redraw, see redraw().
    redraw_timeout : null,
    pr_container : '.prp-page-image',
    pr_image : '.prp-page-image img',
    img_extension_path : 'extensions/ProofreadPage/modules/page/images/',

    // Return the word registered under the id idx, or null if there is none.
    // inefficient but only for debugging purpose.
    id_to_word : function (idx) {
        for (var word in hocr.word_to_id) {
            if (hocr.word_to_id[word] === idx)
                return word;
        }
        return null;
    },

    // Turn an array of word ids back into a space separated string.
    // for debugging purpose only.
    id_to_text : function (text_as_id) {
        var text = '';
        for (var i = 0; i < text_as_id.length; ++i) {
            text += hocr.id_to_word(text_as_id[i]) + ' ';
        }
        return text;
    },

    // Append the id of word to word_id_array, registering a new id the first
    // time a word is seen.
    push_word_id : function (word_id_array, word) {
        if (hocr.word_to_id[word] === undefined)  {
            hocr.word_to_id[word] = hocr.last_word_id++;
        }
        word_id_array.push(hocr.word_to_id[word]);
    },

    // Return true if both arrays of word ids have the same content.
    compare_array : function (a, b) {
        // not the smartest way but shortest. Both array are sorted the same
        // way and contains only integer
        return a.join() == b.join();
    },

    // Build the full edit distance matrix between the sequences a and b:
    // matrix[i][j] is the distance between the first i items of a and the
    // first j items of b, so the matrix has (a.length + 1) rows and
    // (b.length + 1) columns.
    //
    // That's the cpu bottleneck. We can do little to improve it as we
    // really need the full matrix so filling it is f(a.length * b.length).
    levenshtein : function(a, b) {
        var row = a.length;
        var col = b.length;
        var matrix = [];
        var i, j;

        for (i = 0; i <= row; i++)
            matrix[i] = [i];

        for (j = 0; j <= col; j++)
            matrix[0][j] = j;

        for (i = 1; i <= row; i++) {
            for (j = 1; j <= col; j++) {
                // the matrix is 1-based, the sequences are 0-based.
                var cout = a[i-1] === b[j-1] ? 0 : 1;
                matrix[i][j] = Math.min(matrix[i][j-1] + 1, matrix[i-1][j] + 1, matrix[i-1][j-1] + cout);
            }
        }

        return matrix;
    },

    // Walk dist_matrix (as built by levenshtein(A, B)) backward to align A
    // with B. Returns an array with one entry per item of A: the index in B
    // this item is aligned with. An item of A with no counterpart in B gets
    // the index of the next aligned item of B, which is B.length when it
    // sits after the end of B.
    build_matcher : function (dist_matrix, A, B) {
        var result = [ ];

        var i = A.length;
        var j = B.length;

        while (i > 0 || j > 0)  {
            if (i > 0 && j > 0 && dist_matrix[i][j] == dist_matrix[i-1][j-1] + (A[i-1] === B[j-1] ? 0 : 1)) {
                i = i - 1;
                j = j - 1;
                result.push(j);
            } else if (i > 0 && dist_matrix[i][j] == dist_matrix[i-1][j] + 1) {
                i = i - 1;
                result.push(j);
            } else if (j > 0) { // dist_matrix[i][j] == dist_matrix[i][j-1] + 1
                j = j - 1;
            } else {
                // only reachable if dist_matrix was not built from A and B,
                // bail out rather than looping forever.
                break;
            }
        }

        result.reverse();

        return result;
    },

    // Index in the hocr words of the index'th word of the html text.
    locate_html_word : function(index) {
        return hocr.hocr_html_matcher[index];
    },

    // Index in the hocr words of the index'th word of the wiki text.
    locate_wiki_text_word : function(index) {
        return hocr.hocr_wiki_text_matcher[index];
    },

    // Parse the content of a hocr title attribute, a ';' separated list of
    // 'name value...' properties, into an object { name : 'value...' }.
    // Returns an empty object when text is not a string (missing attribute).
    parse_hocr_data : function (text) {
        var result = {};

        if (typeof text !== 'string')
            return result;

        text = text.replace(/^\s+|\s+$/g,'');
        text = text.replace(/  +/g, ' ');

        var datas = text.split(';');
        for (var i = 0; i < datas.length; ++i) {
            datas[i] = datas[i].replace(/^\s+|\s+$/g,'');
            // an empty string or a trailing ';' doesn't define a property.
            if (!datas[i].length)
                continue;
            var property = datas[i].split(' ');
            result[property[0]] = property.slice(1).join(' ');
        }
        return result;
    },

    // Handle the answer of the hocr server: store the .ocr_page data and, for
    // each .ocrx_word containing at least one word character, its id and its
    // hocr data. When data.error is set the error is reported with an alert
    // only if msg_err is true.
    hocr_callback : function (data, msg_err) {
        if (data.error == 0) {
            var match_word = new RegExp('[' + hocr.char_class() + ']');

            // FIXME: the top level element is ignored (the .ocr_page), unless wrapped inside another top level div
            data.text = data.text.replace('<body>', '<body><div>').replace('</body>', '</div></body>');

            var hocr_html = $(data.text);

            hocr.hocr_page_data = hocr.parse_hocr_data($('.ocr_page', hocr_html).attr('title'));

            $('.ocrx_word', hocr_html).each(function (idx, value) {
                var word = $(this).text();
                if (match_word.test(word)) {
                    word = word.replace(/’/g, "'");
                    // FIXME: better stripping of punctuation etc.
                    word = word.replace(/[!?;:,]/g, "");
                    hocr.push_word_id(hocr.hocr_text_as_id, word);
                    hocr.hocr_words_data.push(hocr.parse_hocr_data($(this).attr('title')));
                }
            });
        } else {
            // FIXME: we must be silent here ? or mw.log() it ?
            if (msg_err) {
                alert('something feel bad, error: ' + data.error + ' ' + data.text);
            }
        }
    },

    // Return the entry of data.query.pages with a positive id whose title is
    // pagename, or null if there is none.
    get_data : function (data, pagename) {
        for (var ids in data.query.pages) {
            if (ids > 0 && data.query.pages[ids].title == pagename) {
                return data.query.pages[ids];
            }
        }
        return null;
    },

    // Draw a translucent red box over the scan at the position of the
    // index'th hocr word and scroll the scan container if the box is outside
    // its viewport. Does nothing when there is no hocr word at this index
    // (the matcher can point past the last word) or no bbox data.
    highlight_bbox : function (index) {
        var word_data = hocr.hocr_words_data[index];
        if (!word_data)
            return;

        hocr.hocr_last_word_index = index;

        if (!word_data.bbox || !hocr.hocr_page_data.bbox)
            return;

        $("#bboxHighlighting").remove();

        // the third bbox value of the page is its width in hocr coordinates.
        var xy_scale = $(hocr.pr_image).width() / hocr.hocr_page_data.bbox.split(' ')[2];

        var bbox = word_data.bbox.split(' ');

        var pos_x = Math.round(bbox[0] * xy_scale) /*+ $(hocr.pr_image).position().left*/;
        var abs_pos_y = Math.round(bbox[1] * xy_scale) /*+ $(hocr.pr_image).position().top*/;

        var pos_y = abs_pos_y;
        var position = 'absolute';
        if ($.inArray(mw.config.get('wgAction'), ['edit', 'submit']) == -1) {
            // we append the #bboxHighlighting after the img as a div with relative pos, so we
            // shift up its pos by the height of the img.
            pos_y = abs_pos_y - $(hocr.pr_image).height();
            position = 'relative';
        }

        var width  = Math.round((bbox[2] - bbox[0]) * xy_scale);
        var height = Math.round((bbox[3] - bbox[1]) * xy_scale);

        var $pr_container = $(hocr.pr_container);

        // filter:alpha(opacity=30) is for IE8 and earlier.
        $('<div id="bboxHighlighting" style="position:' + position + ';top:'+pos_y+'px;left:'+pos_x+'px;width:'+width+'px;height:'+height+'px; background-color: rgb(255, 0, 0); opacity:0.3;   filter:alpha(opacity=30);"></div>').appendTo($pr_container);

        // center the image if necessary, this doesn't make the #bboxHighlighting visible
        // if a level up elt is scrolled in such way than the elt is inside the pr_container
        // viewport but this viewport is clipped by a level up elt.
        // document.getElementById('bboxHighlighting').scrollIntoView(); will always ensure
        // the elt is visible but the effect is ugly as hell.
        if (abs_pos_y < $pr_container.scrollTop()  || abs_pos_y + height > $pr_container.scrollTop()  + $pr_container.height() ||
            pos_x < $pr_container.scrollLeft() || pos_x + width  > $pr_container.scrollLeft() + $pr_container.width()) {
            var new_pos_y = Math.round(abs_pos_y -  ($pr_container.height() / 2));
            var new_pos_x = Math.round(pos_x -  ($pr_container.width()  / 2));
            $pr_container.scrollTop(new_pos_y);
            $pr_container.scrollLeft(new_pos_x);
        }
    },

    // Query the hocr server for the current page, feed the answer to
    // hocr_callback() then call on_loaded(). Calls made while a request is
    // already in flight are ignored, otherwise each answer would append the
    // hocr words once more. A failed request leaves hocr_server_called false
    // so the next double click tries again.
    load_hocr : function (on_loaded) {
        if (hocr.hocr_request_pending)
            return;
        hocr.hocr_request_pending = true;

        var url = '//phetools.toolforge.org/hocr_cgi.py?cmd=hocr&book='
          + encodeURIComponent(mw.config.get('wgTitle')) + '&lang=' + encodeURIComponent(mw.config.get('wgContentLanguage'));

        $.getJSON(url, function(data) { hocr.hocr_callback(data, false); } )
            .done(function () { hocr.hocr_server_called = true; on_loaded(); })
            .always(function () { hocr.hocr_request_pending = false; });
    },

    // dblclick handler of the rendered text: highlight on the scan the word
    // under the mouse. The hocr data and the html/hocr matcher are built on
    // first use.
    on_dblclick_html : function (event) {
        if (!hocr.hocr_server_called) {
            hocr.load_hocr(function () { hocr.on_dblclick_html(event); });
            return;
        }

        if (hocr.html_dist_matrix === null && hocr.hocr_text_as_id.length) {
            hocr.html_dist_matrix = hocr.levenshtein(hocr.html_text_as_id, hocr.hocr_text_as_id);

            hocr.hocr_html_matcher = hocr.build_matcher(hocr.html_dist_matrix, hocr.html_text_as_id, hocr.hocr_text_as_id);
        }

        // only the span created by split_text_node() identify a word, any
        // other id ending with digits must be ignored.
        var id = $(event.target).attr('id');
        var match = id ? /^word_id_([0-9]+)$/.exec(id) : null;
        if (match && hocr.hocr_text_as_id.length) {
            var word_number = parseInt(match[1], 10);
            var best_index = hocr.locate_html_word(word_number);
            hocr.highlight_bbox(best_index);
        }
    },

    // Return the index of the wiki text word containing the char position
    // start_word, or -1 if the position is after the last word.
    retrieve_wiki_text_word_pos : function (start_word) {
        // Linear search, no big deal as the array is always small.
        for (var i = 0 ; i < hocr.edit_box_word_pos.length; i++) {
            if (start_word <= hocr.edit_box_word_pos[i][0])
                return hocr.edit_box_word_pos[i][1];
        }
        // FIXME: mw.log it
        return -1;
    },

    // dblclick handler of the edit box: highlight on the scan the word at the
    // caret position. The wiki text is processed again on each call, the
    // matcher is rebuilt only if the sequence of words changed.
    on_dblclick_wiki_text : function (event) {
        // required because we don't want on_dblclick_html() to be called.
        event.stopPropagation();
        if (!hocr.hocr_server_called) {
            hocr.load_hocr(function () { hocr.on_dblclick_wiki_text(event); });
            return;
        }

        if (!hocr.hocr_text_as_id.length)
            return;
        var text_box =  document.getElementById("wpTextbox1");

        // rebuilding the matrix on each click is costly, we try to avoid that.
        // process_wiki_text() store its result in a new array, so this
        // reference still hold the previous sequence of words.
        var old_wiki_text_as_id = hocr.wiki_text_as_id;

        hocr.process_wiki_text(text_box.value);

        if (!hocr.compare_array(old_wiki_text_as_id, hocr.wiki_text_as_id)) {
            hocr.wiki_text_dist_matrix = hocr.levenshtein(hocr.wiki_text_as_id, hocr.hocr_text_as_id);

            hocr.hocr_wiki_text_matcher = hocr.build_matcher(hocr.wiki_text_dist_matrix, hocr.wiki_text_as_id, hocr.hocr_text_as_id);
        }

        var val = $("#wpTextbox1").textSelection( "getCaretPosition", { } );
        var word_number = hocr.retrieve_wiki_text_word_pos(val);
        if (word_number >= 0) {
            var best_index = hocr.locate_wiki_text_word(word_number);
            hocr.highlight_bbox(best_index);
        }
    },

    // Return the content of a regexp character class (without the brackets)
    // matching the characters considered as part of a word.
    char_class : function () {
        var char_latin = '0-9A-Za-zÀ-ÖØ-öø-ʯᴀ-ᴥᵢ-ᵥᵫ-ᵷᵹ-ᶚḀ-ỿₐ-ₔↄ⒈-⒐Ⱡ-ⱼⱾ-ⱿꜢ-ꝯꝱ-ꞇꞋ-ꞌꟻ-ꟿff-st';
        var char_hebrew = 'א-תװ-ײיִײַ-ﬨשׁ-זּטּ-לּמּנּ-סּףּ-פּצּ-ﭏ';
        var char_cyrillic = 'Ѐ-ҁҊ-ԥᴫꙀ-ꙟꙢ-ꙮꙿ-ꚗ';
        var char_bengali = '\u0985-\u098c\u098f-\u0990\u0993-\u09a8\u09aa-\u09b0\u09b2\u09b6-\u09b9\u09bd\u09ce\u09dc-\u09dd\u09df-\u09e1\u09e6-\u09f1\u09f4-\u09f9';
        var combining_diacritical_marks = '\u0300-\u036f';

        return char_latin + char_hebrew + char_cyrillic + char_bengali + combining_diacritical_marks;
    },

    // split a text into an array containing words, non-words...
    // Words are at the even indexes (and can be empty at both ends of the
    // text), non-words at the odd indexes.
    split_text : function(text) {
        var word_sep = '[^' + hocr.char_class() + "]+";
        var words = [];

        var last_match = 0;
        var splitter = new RegExp(word_sep, "gm");
        var result;
        while ((result = splitter.exec(text)) != null) {
            var word = text.slice(last_match, splitter.lastIndex - result[0].length);
            words.push(word);
            words.push(result[0]);

            last_match = splitter.lastIndex;
        }
        var last_word = text.slice(last_match);
        words.push(last_word);

        return words;
    },

    // Rebuild wiki_text_as_id and edit_box_word_pos from the content of the
    // edit box. Both are rebuilt from scratch: edit_box_word_pos holds indexes
    // into wiki_text_as_id, keeping the entries of a previous call would make
    // retrieve_wiki_text_word_pos() answer with the position of words of an
    // outdated text.
    process_wiki_text : function(text) {
        // FIXME: reusing split_text() doesn't work
        var word_sep = '[^' + hocr.char_class() + "]+";

        hocr.wiki_text_as_id = [];
        hocr.edit_box_word_pos = [];

        text = text.replace(/’/g, "'");

        var last_match = 0;
        var splitter = new RegExp(word_sep, "gm");
        var result;
        var word;
        while ((result = splitter.exec(text)) != null) {
            word = text.slice(last_match, splitter.lastIndex - result[0].length);

            hocr.edit_box_word_pos.push([last_match + word.length, hocr.wiki_text_as_id.length]);

            hocr.push_word_id(hocr.wiki_text_as_id, word);
            last_match = splitter.lastIndex;
        }

        word = text.slice(last_match);
        hocr.edit_box_word_pos.push([last_match + word.length, hocr.wiki_text_as_id.length]);
        hocr.push_word_id(hocr.wiki_text_as_id, word);
    },

    // Replace a text node by the same text with each word wrapped in a
    // <span id="word_id_#nr">, #nr being the index of the word in
    // html_text_as_id. The replacement is built from DOM nodes and not from a
    // html string: node.nodeValue is decoded text, parsing it as html would
    // turn an escaped '<' or '&' displayed in the page into live markup.
    split_text_node : function (node) {
        if (!node.nodeValue.length || !node.parentNode)
            return;

        var results = hocr.split_text(node.nodeValue);
        var fragment = document.createDocumentFragment();
        for (var i = 0; i < results.length; i++) {
            if (!results[i].length)
                continue;
            if (i % 2 == 0) {
                var span = document.createElement('span');
                span.id = 'word_id_' + hocr.html_text_as_id.length;
                span.appendChild(document.createTextNode(results[i]));
                fragment.appendChild(span);
                // FIXME: do all transform in push_word_id
                hocr.push_word_id(hocr.html_text_as_id, results[i].replace(/’/g, "'"));
            } else {
                fragment.appendChild(document.createTextNode(results[i]));
            }
        }
        node.parentNode.replaceChild(fragment, node);
    },

    // Return, in document order, all the text nodes below node, except those
    // inside a <style> element.
    child_text_nodes : function (node) {
        var nodes = [];

        function text_node_order(node) {
            if (node.nodeName === '#text') {
                nodes.push(node);
            } else if(node.nodeName !== 'STYLE') {
                for (var i = 0; i < node.childNodes.length; ++i) {
                    text_node_order(node.childNodes[i]);
                }
            }
        }

        text_node_order(node);

        return nodes;
    },

    // Instrument all the words of the first .pagetext element, see
    // split_text_node(). The result parameter is unused.
    get_text_nodes : function (result) {
        var $page_text = $(".pagetext");
        if ($page_text.length) {
            // the list is built before any replacement, split_text_node()
            // modify the tree.
            var nodes = hocr.child_text_nodes($page_text[0]);
            for (var i = 0; i < nodes.length; ++i) {
                hocr.split_text_node(nodes[i]);
            }
        }
    },

    // Remove the highlight and draw it again half a second later, once the
    // layout is settled. Does nothing if no word was highlighted yet.
    redraw : function () {
        if (hocr.hocr_last_word_index != -1) {
            // We wait bit to allow multiple event to occur, esp. some browser trigger multiple resize
            // event during a resize(), but worst they also don't fire a resize() at the end of a resize()
            // FF 24 / linux for example.
            // The timer id is kept on the hocr object: each call must cancel the
            // timer started by the previous one.
            $("#bboxHighlighting").remove();
            clearTimeout(hocr.redraw_timeout);
            hocr.redraw_timeout = setTimeout(function() { hocr.highlight_bbox(hocr.hocr_last_word_index); }, 500);
        }
    },

    // Install the event handlers needed by the current action.
    setup : function () {
        if (mw.config.get('wgAction') == 'view' || (mw.config.get('wgAction') == 'submit' && $('#wikiPreview').length)) {
            hocr.get_text_nodes();
            $('#mw-content-text').dblclick(hocr.on_dblclick_html);
        }

        if ($.inArray(mw.config.get('wgAction'), [ 'edit', 'submit' ]) != -1) {

            $('#wpTextbox1').dblclick(hocr.on_dblclick_wiki_text);

            // Kludge, there is no #id for these buttons.
            $("img[src*='" + hocr.img_extension_path + "Button_multicol.png']").click(hocr.redraw);
            // FIXME: These three doesn't work as expected. Need the zoom function to be fixed in the
            // extension first ?
            $("img[src*='" + hocr.img_extension_path + "Button_examine.png']").click(hocr.redraw);
            $("img[src*='" + hocr.img_extension_path + "Button_zoom_out.png']").click(hocr.redraw);
            $("img[src*='" + hocr.img_extension_path + "Button_zoom_in.png']").click(hocr.redraw);
        }

        $(window).resize(hocr.redraw);
    }
};

// Run hocr.setup once the DOM is ready, but only when the canonical namespace
// is 'Page' and the action is one of view, edit or submit.
(function () {
    var handled_actions = ['view', 'edit', 'submit'];
    var in_page_namespace = mw.config.get("wgCanonicalNamespace") == 'Page';

    if (in_page_namespace && handled_actions.indexOf(mw.config.get('wgAction')) !== -1) {
        $(hocr.setup);
    }
}());