Utilisateur:Phe/hocr.js
Apparence
Note : après avoir enregistré vos modifications, il se peut que vous deviez forcer le rechargement complet du cache de votre navigateur pour voir les changements.
- Firefox / Safari : Maintenez la touche Maj (Shift) en cliquant sur le bouton Actualiser ou pressez Ctrl-F5 ou Ctrl-R (⌘-R sur un Mac) ;
- Google Chrome : Appuyez sur Ctrl-Maj-R (⌘-Shift-R sur un Mac) ;
- Internet Explorer : Maintenez la touche Ctrl en cliquant sur le bouton Actualiser ou pressez Ctrl-F5 ;
- Opera : Allez dans Menu → Settings (Opera → Préférences sur un Mac) et ensuite à Confidentialité & sécurité → Effacer les données d'exploration → Images et fichiers en cache.
/*
* Author: w:fr:Phe, highlighting code derived from an earlier implementation
* by Alex Brollo
*
* hocr query an external server to get the hocr layer of a given Page,
* then allow to highlight the scan by double clicking on a word. Works
* in view, preview, edit and diff mode.
*
* The hocr server is called lazily, when the user asks for it (double click). This
* means the first double click on a page can be a bit slow, around a second, because
* it will query the hocr and build the distance matrix; most cpu time is spent
* building the distance matrix and there is little we can do about that.
*
* In view mode the html text is instrumented to add a span id=word#nr around
* each text word. This is used to retrieve the dblclick'ed word.
*
* In edit mode words are retrieved by the position of the selection.
*
* Caveat
*
* The highlighted part of the scan is not necessarily visible. See
* the comment in highlight_bbox().
*
* Keeping the highlighted word when zooming with the mouse is not supported,
* but resizing the window keeps the highlight at the right position.
*
* TODO:
*
* Improve the accuracy of the match: when we are at the boundary of removed
* or added text, the matcher tends to be off by one word. Same thing at the
* beginning and end of the text if the hocr is damaged at this position. (Tried,
* doesn't work. -- Phe)
*
* Check if table layout can be broken. Improve accuracy of table handling
* as ocr tend to produce text by column and table are written in html by line.
*
* In edit mode ref must be moved to the bottom of the text (except ref follow=)
* but we need to keep a mapping old text pos --> new text pos. All array must be
* build with the new text but when clicking on word we must map the click pos to
* the new text pos.
*/
// Namespace object holding the whole state and logic of the gadget: the hocr
// layer fetched from the server, the word-id arrays built from the hocr, from
// the rendered html and from the edit box, the alignment between them, and the
// event handlers that highlight the scan.
var hocr = {
    // Map word -> integer id used to build the xxx_text_as_id arrays. It has no
    // prototype so that words like "constructor" or "toString" are not confused
    // with inherited Object.prototype members.
    word_to_id : Object.create(null),
    last_word_id : 0,
    hocr_text_as_id : [],
    html_text_as_id : [],
    wiki_text_as_id : [],
    // parallel array of hocr_text_as_id, index of word in hocr_text_as_id are used to retrieve
    // hocr data associated with this word.
    hocr_words_data : [],
    // the .ocr_page data.
    hocr_page_data : {},
    // an array of couple [ end word pos in char count, word id ] to retrieve word position in the edit box.
    edit_box_word_pos : [],
    html_dist_matrix : null,
    hocr_html_matcher : [],
    wiki_text_dist_matrix : null,
    hocr_wiki_text_matcher : [],
    hocr_server_called : false,
    // true while a request to the hocr server is in flight.
    hocr_server_pending : false,
    // the last word index highlighted, used to redraw the highlighted text.
    hocr_last_word_index : -1,
    // timer id of the pending delayed redraw, see redraw().
    redraw_timeout : null,
    // FIXME: help to test on my local box, remove that later.
    pr_container : mw.config.get('wgServer') != "http://fr.wikisource.zaniah.virgus"
        ? '.prp-page-image' : '#pr_container',
    pr_image : mw.config.get('wgServer') != "http://fr.wikisource.zaniah.virgus"
        ? '.prp-page-image img' : '#ProofReadImage',
    img_extension_path : mw.config.get('wgServer') == "http://fr.wikisource.zaniah.virgus"
        ? 'extensions/ProofreadPage/modules/ext.proofreadpage.page/images/'
        : 'extensions/ProofreadPage/modules/page/images/',

    // Reverse lookup of word_to_id: return the word whose id is idx, or null
    // when no word has this id. Inefficient but only for debugging purpose.
    id_to_word : function (idx) {
        for (var word in hocr.word_to_id) {
            if (hocr.word_to_id[word] == idx)
                return word;
        }
        return null;
    },

    // Rebuild a space separated text from an array of word ids.
    // For debugging purpose only.
    id_to_text : function (text_as_id) {
        var text = '';
        for (var i = 0; i < text_as_id.length; ++i) {
            text += hocr.id_to_word(text_as_id[i]) + ' ';
        }
        return text;
    },

    // Append the id of word to word_id_array, allocating a new id the first
    // time a word is seen.
    push_word_id : function (word_id_array, word) {
        if (hocr.word_to_id[word] === undefined) {
            hocr.word_to_id[word] = hocr.last_word_id++;
        }
        word_id_array.push(hocr.word_to_id[word]);
    },

    // Return true when both arrays of integers have the same content.
    compare_array : function (a, b) {
        // not the smartest way but shortest. Both array are sorted the same
        // way and contains only integer
        return a.join() == b.join();
    },

    // Build the full edit distance matrix between the sequences a and b:
    // matrix[i][j] is the distance between the first i elements of a and the
    // first j elements of b, so the result has (a.length + 1) rows of
    // (b.length + 1) columns.
    //
    // That's the cpu bottleneck. We can do little to improve it as we
    // really need the full matrix so filling it is f(a.length * b.length).
    levenshtein : function (a, b) {
        var row = a.length;
        var col = b.length;
        var matrix = [];
        var i, j;
        for (i = 0; i <= row; i++)
            matrix[i] = [i];
        for (j = 0; j <= col; j++)
            matrix[0][j] = j;
        for (i = 1; i <= row; i++) {
            for (j = 1; j <= col; j++) {
                // Cell (i, j) is about the i-th and j-th elements, which live
                // at index i - 1 and j - 1 of the arrays.
                var cout = a[i - 1] == b[j - 1] ? 0 : 1;
                matrix[i][j] = Math.min(matrix[i][j - 1] + 1, matrix[i - 1][j] + 1, matrix[i - 1][j - 1] + cout);
            }
        }
        return matrix;
    },

    // Walk dist_matrix (as built by levenshtein(A, B)) backward from its last
    // cell and return an array of A.length entries: entry k is the index in B
    // aligned with A[k]. An element of A with no counterpart in B is mapped to
    // the following element of B, or to the last one when there is none.
    build_matcher : function (dist_matrix, A, B) {
        var result = [];
        var last = B.length - 1;
        var i = A.length;
        var j = B.length;
        while (i > 0 || j > 0) {
            if (i > 0 && j > 0 && dist_matrix[i][j] == dist_matrix[i - 1][j - 1] + (A[i - 1] == B[j - 1] ? 0 : 1)) {
                // match or substitution.
                i = i - 1;
                j = j - 1;
                result.push(j);
            } else if (i > 0 && dist_matrix[i][j] == dist_matrix[i - 1][j] + 1) {
                // A[i - 1] exists only in A; j can be B.length here so clamp it.
                i = i - 1;
                result.push(Math.min(j, last));
            } else { // (j > 0 && dist_matrix[i][j] == dist_matrix[i][j-1] + 1)
                // B[j - 1] exists only in B, nothing to record.
                j = j - 1;
            }
        }
        result.reverse();
        return result;
    },

    // Index in the hocr words of the index-th word of the rendered html.
    locate_html_word : function (index) {
        return hocr.hocr_html_matcher[index];
    },

    // Index in the hocr words of the index-th word of the edit box.
    locate_wiki_text_word : function (index) {
        return hocr.hocr_wiki_text_matcher[index];
    },

    // Parse the content of a hocr title attribute, a ';' separated list of
    // "name value..." properties, into an object { name: 'value...' }.
    // Return an empty object when text is missing or empty.
    parse_hocr_data : function (text) {
        var result = {};
        if (!text)
            return result;
        text = text.replace(/^\s+|\s+$/g, '');
        text = text.replace(/ +/g, ' ');
        var datas = text.split(';');
        for (var i = 0; i < datas.length; ++i) {
            var property = datas[i].replace(/^\s+|\s+$/g, '').split(' ');
            // a trailing ';' gives an empty property, skip it.
            if (property[0].length)
                result[property[0]] = property.slice(1).join(' ');
        }
        return result;
    },

    // Handle the answer of the hocr server. On success (data['error'] == 0)
    // fill hocr_page_data, hocr_text_as_id and hocr_words_data from the hocr
    // html in data['text']. On error, alert only when msg_err is true.
    hocr_callback : function (data, msg_err) {
        if (data['error'] == 0) {
            var word_sep = '[' + hocr.char_class() + "]";
            var match_word = new RegExp(word_sep, "gm");
            // Start from scratch so a second answer never duplicates the words.
            hocr.hocr_text_as_id = [];
            hocr.hocr_words_data = [];
            // FIXME: the top level element is ignored (the .ocr_page), unless wrapped inside another top level div
            data['text'] = data['text'].replace('<body>', '<body><div>').replace('</body>', '</div></body>');
            var hocr_html = $(data['text']);
            hocr.hocr_page_data = hocr.parse_hocr_data($('.ocr_page', hocr_html).attr('title'));
            $('.ocrx_word', hocr_html).each(function () {
                var word = $(this).text();
                // keep only the words containing at least one word char.
                if (word.search(match_word) != -1) {
                    word = word.replace(/’/g, "'");
                    // FIXME: better stripping of punctuation etc.
                    word = word.replace(/[!?;:,]/g, "");
                    hocr.push_word_id(hocr.hocr_text_as_id, word);
                    hocr.hocr_words_data.push(hocr.parse_hocr_data($(this).attr('title')));
                }
            });
        } else {
            // FIXME: we must be silent here ? or mw.log() it ?
            if (msg_err) {
                alert('something feel bad, error: ' + data['error'] + ' ' + data['text']);
            }
        }
    },

    // Return the entry of data.query.pages with a positive id and a title
    // equal to pagename, or null when there is none.
    get_data : function (data, pagename) {
        for (var ids in data.query.pages) {
            if (ids > 0 && data.query.pages[ids].title == pagename) {
                return data.query.pages[ids];
            }
        }
        return null;
    },

    // Draw a translucent red box over the scan at the position of the hocr
    // word number index, and scroll the scan container when the box is out of
    // its viewport. Do nothing when the word or the page has no bbox.
    highlight_bbox : function (index) {
        var word_data = hocr.hocr_words_data[index];
        var page_bbox = hocr.hocr_page_data['bbox'];
        if (!word_data || !word_data['bbox'] || !page_bbox)
            return;
        hocr.hocr_last_word_index = index;
        $("#bboxHighlighting").remove();
        // hocr coordinates are relative to the page bbox, the third value of
        // the page bbox is used as the page width to scale them to the image.
        var xy_scale = $(hocr.pr_image).width() / page_bbox.split(' ')[2];
        var bbox = word_data['bbox'].split(' ');
        var pos_x = Math.round(bbox[0] * xy_scale) /*+ $(hocr.pr_image).position().left*/;
        var abs_pos_y = Math.round(bbox[1] * xy_scale) /*+ $(hocr.pr_image).position().top*/;
        var pos_y, position;
        if ($.inArray(mw.config.get('wgAction'), ['edit', 'submit']) == -1) {
            // we append the #bboxHighlighting after the img as a div with relative pos, so we
            // shift up its pos by the height of the img.
            pos_y = abs_pos_y - $(hocr.pr_image).height();
            position = 'relative';
        } else {
            pos_y = abs_pos_y;
            position = 'absolute';
        }
        var width = Math.round((bbox[2] - bbox[0]) * xy_scale);
        var height = Math.round((bbox[3] - bbox[1]) * xy_scale);
        var $pr_container = $(hocr.pr_container);
        // filter:alpha(opacity=30) is for IE8 and earlier.
        $('<div id="bboxHighlighting" style="position:' + position + ';top:'+pos_y+'px;left:'+pos_x+'px;width:'+width+'px;height:'+height+'px; background-color: rgb(255, 0, 0); opacity:0.3; filter:alpha(opacity=30);"></div>').appendTo($pr_container);
        // center the image if necessary, this doesn't make the #bboxHighlighting visible
        // if a level up elt is scrolled in such way than the elt is inside the pr_container
        // viewport but this viewport is clipped by a level up elt.
        // document.getElementById('bboxHighlighting').scrollIntoView(); will always ensure
        // the elt is visible but the effect is ugly as hell.
        if (abs_pos_y < $pr_container.scrollTop() || abs_pos_y + height > $pr_container.scrollTop() + $pr_container.height() ||
            pos_x < $pr_container.scrollLeft() || pos_x + width > $pr_container.scrollLeft() + $pr_container.width()) {
            var new_pos_y = Math.round(abs_pos_y - ($pr_container.height() / 2));
            var new_pos_x = Math.round(pos_x - ($pr_container.width() / 2));
            $pr_container.scrollTop(new_pos_y);
            $pr_container.scrollLeft(new_pos_x);
        }
    },

    // Ask the hocr server for the hocr layer of the current page, feed the
    // answer to hocr_callback(), then mark the server as called and run
    // on_loaded(). Calls made while a request is already in flight are ignored.
    query_hocr_server : function (on_loaded) {
        if (hocr.hocr_server_pending)
            return;
        hocr.hocr_server_pending = true;
        var url = '//tools.wmflabs.org/phetools/hocr_cgi.py?cmd=hocr&book='
            + encodeURIComponent(mw.config.get('wgTitle')) + '&lang=' + mw.config.get('wgContentLanguage');
        $.getJSON(url, function (data) { hocr.hocr_callback(data, false); })
            .done(function () { hocr.hocr_server_called = true; on_loaded(); })
            .always(function () { hocr.hocr_server_pending = false; });
    },

    // Double click handler of the rendered text (view and preview): highlight
    // on the scan the word span which has been clicked.
    on_dblclick_html : function (event) {
        if (!hocr.hocr_server_called) {
            hocr.query_hocr_server(function () { hocr.on_dblclick_html(event); });
            return;
        }
        if (hocr.html_dist_matrix === null && hocr.hocr_text_as_id.length) {
            hocr.html_dist_matrix = hocr.levenshtein(hocr.html_text_as_id, hocr.hocr_text_as_id);
            hocr.hocr_html_matcher = hocr.build_matcher(hocr.html_dist_matrix, hocr.html_text_as_id, hocr.hocr_text_as_id);
        }
        // Only the spans created by split_text_node() carry a word number, any
        // other id (even one ending with digits) must be ignored.
        var id = $(event.target).attr('id');
        var match = id ? /^word_id_([0-9]+)$/.exec(id) : null;
        if (match && hocr.hocr_text_as_id.length) {
            var word_number = parseInt(match[1], 10);
            hocr.highlight_bbox(hocr.locate_html_word(word_number));
        }
    },

    // Return the number of the edit box word containing the char position
    // start_word, or -1 when the position is after the last word.
    retrieve_wiki_text_word_pos : function (start_word) {
        // Linear search, no big deal as the array is always small.
        for (var i = 0; i < hocr.edit_box_word_pos.length; i++) {
            if (start_word <= hocr.edit_box_word_pos[i][0])
                return hocr.edit_box_word_pos[i][1];
        }
        // FIXME: mw.log it
        return -1;
    },

    // Double click handler of the edit box: highlight on the scan the word
    // located at the caret position.
    on_dblclick_wiki_text : function (event) {
        // required because we don't want on_dblclick_html() to be called.
        event.stopPropagation();
        if (!hocr.hocr_server_called) {
            hocr.query_hocr_server(function () { hocr.on_dblclick_wiki_text(event); });
            return;
        }
        if (!hocr.hocr_text_as_id.length)
            return;
        var text_box = document.getElementById("wpTextbox1");
        // rebuilding the matrix on each click is costly, we try to avoid that.
        var old_wiki_text_as_id = hocr.wiki_text_as_id;
        hocr.wiki_text_as_id = [];
        // process_wiki_text() appends to both arrays, the word positions must
        // be reset too or the positions of a previous click would be found first.
        hocr.edit_box_word_pos = [];
        hocr.process_wiki_text(text_box.value);
        if (!hocr.compare_array(old_wiki_text_as_id, hocr.wiki_text_as_id)) {
            hocr.wiki_text_dist_matrix = hocr.levenshtein(hocr.wiki_text_as_id, hocr.hocr_text_as_id);
            hocr.hocr_wiki_text_matcher = hocr.build_matcher(hocr.wiki_text_dist_matrix, hocr.wiki_text_as_id, hocr.hocr_text_as_id);
        }
        var val = $("#wpTextbox1").textSelection( "getCaretPosition", { } );
        var word_number = hocr.retrieve_wiki_text_word_pos(val);
        if (word_number >= 0) {
            hocr.highlight_bbox(hocr.locate_wiki_text_word(word_number));
        }
    },

    // Return the content of a regexp character class (without the enclosing
    // brackets) matching the characters which are part of a word.
    char_class : function () {
        var char_latin = '0-9A-Za-zÀ-ÖØ-öø-ʯᴀ-ᴥᵢ-ᵥᵫ-ᵷᵹ-ᶚḀ-ỿₐ-ₔↄ⒈-⒐Ⱡ-ⱼⱾ-ⱿꜢ-ꝯꝱ-ꞇꞋ-ꞌꟻ-ꟿff-st';
        var char_hebrew = 'א-תװ-ײיִײַ-ﬨשׁ-זּטּ-לּמּנּ-סּףּ-פּצּ-ﭏ';
        var char_cyrillic = 'Ѐ-ҁҊ-ԥᴫꙀ-ꙟꙢ-ꙮꙿ-ꚗ';
        var char_bengali = '\u0985-\u098c\u098f-\u0990\u0993-\u09a8\u09aa-\u09b0\u09b2\u09b6-\u09b9\u09bd\u09ce\u09dc-\u09dd\u09df-\u09e1\u09e6-\u09f1\u09f4-\u09f9';
        return char_latin + char_hebrew + char_cyrillic + char_bengali;
    },

    // split a text into an array containing words, non-words...
    // Even indexes hold the words (possibly empty), odd indexes hold the
    // separators, so the array always has an odd length.
    split_text : function (text) {
        var word_sep = '[^' + hocr.char_class() + "]+";
        var words = [];
        var last_match = 0;
        var splitter = new RegExp(word_sep, "gm");
        var result;
        while ((result = splitter.exec(text)) != null) {
            var word = text.slice(last_match, splitter.lastIndex - result[0].length);
            words.push(word);
            words.push(result[0]);
            last_match = splitter.lastIndex;
        }
        var last_word = text.slice(last_match);
        words.push(last_word);
        return words;
    },

    // Split the text of the edit box into words, appending the id of each word
    // to wiki_text_as_id and a couple [ end pos of the word, word number ] to
    // edit_box_word_pos. The caller is expected to reset both arrays first.
    process_wiki_text : function (text) {
        // FIXME: reusing split_text() doesn't work
        var word_sep = '[^' + hocr.char_class() + "]+";
        text = text.replace(/’/g, "'");
        var last_match = 0;
        var splitter = new RegExp(word_sep, "gm");
        var result, word;
        while ((result = splitter.exec(text)) != null) {
            word = text.slice(last_match, splitter.lastIndex - result[0].length);
            hocr.edit_box_word_pos.push([last_match + word.length, hocr.wiki_text_as_id.length]);
            hocr.push_word_id(hocr.wiki_text_as_id, word);
            last_match = splitter.lastIndex;
        }
        word = text.slice(last_match);
        hocr.edit_box_word_pos.push([last_match + word.length, hocr.wiki_text_as_id.length]);
        hocr.push_word_id(hocr.wiki_text_as_id, word);
    },

    // Replace a text node by the same text where each word is wrapped in a
    // <span id="word_id_N">, N being the position of the word in
    // html_text_as_id. The replacement is built with DOM nodes, not with an
    // html string, so the text is never parsed as markup.
    split_text_node : function (node) {
        if (!node.nodeValue.length || !node.parentNode)
            return;
        var pieces = hocr.split_text(node.nodeValue);
        var fragment = document.createDocumentFragment();
        for (var i = 0; i < pieces.length; i++) {
            if (!pieces[i].length)
                continue;
            if (i % 2 == 0) {
                var span = document.createElement('span');
                span.id = 'word_id_' + hocr.html_text_as_id.length;
                span.appendChild(document.createTextNode(pieces[i]));
                fragment.appendChild(span);
                // FIXME: do all transform in push_word_id
                hocr.push_word_id(hocr.html_text_as_id, pieces[i].replace(/’/g, "'"));
            } else {
                fragment.appendChild(document.createTextNode(pieces[i]));
            }
        }
        node.parentNode.replaceChild(fragment, node);
    },

    // Return all the text nodes below node (or node itself when it is a text
    // node), in document order.
    child_text_nodes : function (node) {
        var nodes = [];
        function text_node_order(node) {
            if (node.nodeType == 3) {
                nodes.push(node);
            } else {
                for (var i = 0; i < node.childNodes.length; ++i) {
                    text_node_order(node.childNodes[i]);
                }
            }
        }
        text_node_order(node);
        return nodes;
    },

    // Instrument every text node of the first .pagetext element with word
    // spans. The parameter is unused.
    get_text_nodes : function (result) {
        var $page_text = $(".pagetext");
        if ($page_text.length) {
            // the list is built before any replacement, so it is not affected
            // by the nodes inserted by split_text_node().
            var nodes = hocr.child_text_nodes($page_text[0]);
            for (var i = 0; i < nodes.length; ++i) {
                hocr.split_text_node(nodes[i]);
            }
        }
    },

    // Remove the highlight and draw it again half a second later, used when
    // the geometry of the scan may have changed.
    redraw : function () {
        if (hocr.hocr_last_word_index != -1) {
            // We wait bit to allow multiple event to occur, esp. some browser trigger multiple resize
            // event during a resize(), but worst they also don't fire a resize() at the end of a resize()
            // FF 24 / linux for example.
            $("#bboxHighlighting").remove();
            // The timer id is kept on hocr so a new event really cancels the
            // redraw scheduled by the previous one.
            clearTimeout(hocr.redraw_timeout);
            hocr.redraw_timeout = setTimeout(function () { hocr.highlight_bbox(hocr.hocr_last_word_index); }, 500);
        }
    },

    // Instrument the page and bind the event handlers according to the
    // current action.
    setup : function () {
        if (mw.config.get('wgAction') == 'view' || (mw.config.get('wgAction') == 'submit' && $('#wikiPreview').length)) {
            hocr.get_text_nodes();
            $('#mw-content-text').dblclick(hocr.on_dblclick_html);
        }
        if ($.inArray(mw.config.get('wgAction'), [ 'edit', 'submit' ]) != -1) {
            $('#wpTextbox1').dblclick(hocr.on_dblclick_wiki_text);
            // Kludge, there is no #id for these buttons.
            $("img[src*='" + hocr.img_extension_path + "Button_multicol.png']").click(hocr.redraw);
            // FIXME: These three doesn't work as expected. Need the zoom function to be fixed in the
            // extension first ?
            $("img[src*='" + hocr.img_extension_path + "Button_examine.png']").click(hocr.redraw);
            $("img[src*='" + hocr.img_extension_path + "Button_zoom_out.png']").click(hocr.redraw);
            $("img[src*='" + hocr.img_extension_path + "Button_zoom_in.png']").click(hocr.redraw);
        }
        $(window).resize(hocr.redraw);
    }
};
// Entry point: the gadget is only enabled in the Page: namespace, for the
// view, edit and submit actions; hocr.setup then runs once the DOM is ready.
(function () {
    if (mw.config.get("wgCanonicalNamespace") != 'Page')
        return;
    var supported_actions = ['view', 'edit', 'submit'];
    if ($.inArray(mw.config.get('wgAction'), supported_actions) == -1)
        return;
    $(hocr.setup);
}());