Utilisateur:Phe/Python/align.py
Apparence
< Utilisateur:Phe | Python
#!/usr/bin/python
# -*- coding: utf-8 -*-
# text alignment program
# author : thomasv1 at gmx dot de
# licence : GPL
import os, string, re, sys
import difflib, urllib
sys.path.append("../pywikipedia")
import wikipedia, pagegenerators, catlib
# Path of the djvutxt executable (inside a djvulibre tree, judging by the
# path). do_match() runs it with --page=N to extract the text of one page.
# NOTE(review): hard-coded absolute path, apparently for the author's own
# machine -- adjust it before running the script elsewhere.
djvutxt = '/home/thomasv/djvulibre/tools/djvutxt'
def do_match(target, filename, djvuname, number):
s = difflib.SequenceMatcher()
offset = 0
output = u""
for i in range(1000):
if i==10 and offset==0:
#no text layer
return u""
pagenum=i+number
p = os.popen(djvutxt + " --page=%d %s"% (pagenum,filename))
page1 = unicode(p.read(), 'utf-8')
p.close()
p = os.popen(djvutxt + " --page=%d %s "%((pagenum+1),filename))
page2 = unicode(p.read(), 'utf-8')
p.close()
text1 = page1+page2
text2 = target[offset:offset+ int(1.5*len(text1))]
p = re.compile(ur'[\W]+', re.U)
fp = re.compile(ur'([\W]+)', re.U)
ftext1 = fp.split(text1)
ftext2 = fp.split(text2)
page1 = p.split(page1)
text1 = p.split(text1)
text2 = p.split(text2)
s.set_seqs(text1,text2)
mb = s.get_matching_blocks()
ccc = mb[-2]
dummy = mb[-1]
ratio = s.ratio()
print i, ccc, ratio
if ratio<0.1:
print "low ratio"
break
mstr=u""
overflow = False
for i in range(ccc[0]+ccc[2]):
matched = False
for m in mb:
if i >= m[0] and i < m[0]+m[2] :
matched = True
if i >= len(page1):
overflow = True
break
if not overflow:
ss = ftext1[2*i]
if matched : ss =u"\033[1;32m%s\033[0;49m"%ss
if 2*i+1 < len(ftext1):
mstr = mstr + ss +ftext1[2*i+1]
print mstr
print "--------------------------------"
mstr=u""
no_color = u""
overflow = False
for i in range(ccc[1]+ccc[2]):
matched = False
for m in mb:
if i >= m[1] and i < m[1]+m[2] :
matched = True
if m[0]+i-m[1] >= len(page1):
overflow = True
break
if not overflow:
ss = ftext2[2*i]
if matched : ss =u"\033[1;31m%s\033[0;49m"%ss
if 2*i+1 < len(ftext2):
mstr = mstr + ss +ftext2[2*i+1]
no_color = no_color + ftext2[2*i] + ftext2[2*i+1]
print mstr
print "===================================="
output = output + u"\n==[[Page:%s/%d]]==\n"%(djvuname,pagenum) + no_color
offset = offset + len(no_color)
if offset!=0 and target[offset:]:
output = output+u"\n== reste ==\n" + target[offset:]
if offset==0:
output = u""
return output
def get_djvu_filename(filename, directory='RDDM'):
    """Return the local path of the djvu file named *filename*.

    Spaces in *filename* are replaced by underscores and the result is
    looked up inside *directory*.  The default, 'RDDM', gives the same
    path as before ('RDDM/<name>'); pass another directory to use files
    stored elsewhere, or an empty string to get the bare file name.
    """
    local_name = filename.replace(' ', '_')
    if not directory:
        return local_name
    return directory.rstrip('/') + '/' + local_name
def do_align(pagename, djvuname, number):
    """Paginate the fr.wikisource page *pagename* against a djvu file.

    The page text is fetched, the {{TextQuality|50%}}<div class="text">
    opening, the <references/></div> closing and any {{journal|...}} line
    are removed, and the rest is aligned with the djvu by do_match().  If
    the alignment succeeds, a {{journal|...}} line found at the very start
    of the stripped text is put back in front of the result, the diff
    against the current page is shown, and the page is saved once the
    user confirms.

    pagename -- title of the wiki page (unicode)
    djvuname -- name of the djvu file, also used for the Page: headers
    number   -- djvu page number at which the text starts

    Raises ValueError when *number* is None.
    """
    if number is None:
        # FIXME: the start page is not detected automatically yet; without
        # it do_match() would fail on its page arithmetic.
        raise ValueError("the djvu page number where the text starts is required")

    site = wikipedia.getSite('fr', fam='wikisource')
    wikipedia.setAction("pagination")
    page = wikipedia.Page(site, pagename)
    old_target = page.get()

    target = re.sub(u'{{TextQuality\|50%}}<div class="text">\n', '', old_target)
    target = re.sub(u'<references/>\n</div>', u'', target)
    # Remember a leading journal header before all of them are removed.
    header = re.match(u'({{journal\|[^}]*}}\n)', target)
    target = re.sub(u'{{journal\|[^}]*}}\n', u'', target)

    output = do_match(target, get_djvu_filename(djvuname), djvuname, number)
    if not output:
        # Alignment failed.  Stop before the header is prepended: a
        # header-only text would otherwise be offered for upload and
        # replace the whole page.
        return

    if header:
        output = header.group(1) + output

    wikipedia.showDiff(old_target, output)
    choice = wikipedia.inputChoice(u'Upload ?', ['Yes', 'No'], ['Y', 'N'], 'N')
    if choice in ('Y', 'y'):
        page.put(output)
if __name__ == "__main__":
    # Command line: align.py PAGENAME DJVUNAME [PAGE_NUMBER]
    #
    # Example:
    #   align.py "Etudes sur l'Angleterre : les classes inférieures" \
    #            "Revue des Deux Mondes - 1845 - tome 11.djvu" 35
    try:
        if len(sys.argv) < 3:
            # Report missing arguments instead of failing with an IndexError.
            sys.stderr.write("usage: %s PAGENAME DJVUNAME [PAGE_NUMBER]\n"
                             % sys.argv[0])
            sys.exit(2)
        page_number = None
        if len(sys.argv) > 3:
            page_number = int(sys.argv[3])
        do_align(unicode(sys.argv[1], 'utf-8'), sys.argv[2], page_number)
    finally:
        # Always runs, including on the usage exit and on errors.
        wikipedia.stopme()