#!/usr/bin/python
# text alignment program
# author : thomasv1 at gmx dot de
# licence : GPL
import difflib
import os
import re
import string
import subprocess
import sys
import urllib

sys.path.append("../pywikipedia")
import wikipedia
import pagegenerators
import catlib
# Runs of non-word characters separate the words that get compared.  The
# capturing variant keeps the separators in the split result, so a text can be
# rebuilt exactly from its alternating word / separator pieces.
_WORD_SEPARATOR = re.compile(r'[\W]+')
_WORD_SEPARATOR_KEPT = re.compile(r'([\W]+)')


def _djvu_page_text(djvutxt, filename, pagenum):
    """Return what *djvutxt* prints for page *pagenum* of *filename*.

    The tool is started without a shell, so file names containing spaces or
    shell metacharacters are passed through untouched.  An empty string is
    returned when the tool cannot be started, which is what the page would
    look like without a text layer.
    """
    try:
        proc = subprocess.Popen(
            [djvutxt, "--page=%d" % pagenum, filename],
            stdout=subprocess.PIPE, universal_newlines=True)
    except OSError:
        return ""
    return proc.communicate()[0]


def _first_page_limit(words):
    """Return the index of the first word that lies beyond the first page.

    *words* is the first page split on the word separator.  When the page
    ends with a separator the split yields a trailing empty string that is
    not a word of the page; counting it would push the first word of the
    following page onto this one.
    """
    if words and words[-1] == "":
        return len(words) - 1
    return len(words)


def _render_side(pieces, blocks, side, stop, limit, colour):
    """Rebuild one side of a comparison, stopping at the end of the page.

    pieces -- alternating words and separators of the text on this side
    blocks -- matching blocks returned by SequenceMatcher
    side   -- 0 for the djvu text, 1 for the wiki text
    stop   -- number of words of this side to walk through
    limit  -- index (on the djvu side) of the first word past the first page
    colour -- format string wrapping a matched word in ANSI colour codes

    Returns a pair (coloured, plain): the rebuilt text with matched words
    highlighted, and the same text without any highlighting.
    """
    coloured = []
    plain = []
    for idx in range(stop):
        matched = False
        overflow = False
        for block in blocks:
            start = block[side]
            if start <= idx < start + block[2]:
                matched = True
                # Map the word back to its position in the djvu text to see
                # whether it already belongs to the second page.
                overflow = block[0] + idx - start >= limit
                break
        if overflow:
            # Everything from the first matched word of the next page on is
            # left for the next iteration of the caller.
            break
        word = pieces[2 * idx]
        if 2 * idx + 1 < len(pieces):
            separator = pieces[2 * idx + 1]
            if matched:
                coloured.append(colour % word + separator)
            else:
                coloured.append(word + separator)
            plain.append(word + separator)
    return "".join(coloured), "".join(plain)


def do_match(target, filename, djvuname, number,
             djvutxt="/home/thomasv/djvulibre/tools/djvutxt"):
    """Split *target* into per-page sections following a djvu text layer.

    Starting at page *number*, the text of two consecutive djvu pages is
    compared word by word with the next window of *target*.  The part of the
    window that lines up with the first of the two pages is appended to the
    result under a "==[[Page:<djvuname>/<page>]]==" heading, and the search
    moves on to the following page.  A coloured view of every comparison is
    printed to standard output.

    target   -- text to split
    filename -- path of the djvu file handed to the djvutxt tool
    djvuname -- name used in the generated page headings
    number   -- number of the first djvu page to align
    djvutxt  -- djvutxt executable to run

    Returns the sectioned text; whatever is left of *target* follows a
    "== reste ==" heading.  Returns "" when nothing could be aligned.
    """
    matcher = difflib.SequenceMatcher()
    offset = 0
    output = ""
    for step in range(1000):
        if offset >= len(target):
            # Nothing is left to align.  Going on would only start more
            # djvutxt processes and could append empty page sections.
            break
        if step == 10 and offset == 0:
            # no text layer
            return ""
        pagenum = step + number
        page1 = _djvu_page_text(djvutxt, filename, pagenum)
        page2 = _djvu_page_text(djvutxt, filename, pagenum + 1)
        text1 = page1 + page2
        # The window is larger than the two pages so that the page text can
        # still be found when the target is somewhat longer.
        text2 = target[offset:offset + int(1.5 * len(text1))]

        pieces1 = _WORD_SEPARATOR_KEPT.split(text1)
        pieces2 = _WORD_SEPARATOR_KEPT.split(text2)
        limit = _first_page_limit(_WORD_SEPARATOR.split(page1))
        matcher.set_seqs(_WORD_SEPARATOR.split(text1),
                         _WORD_SEPARATOR.split(text2))

        blocks = matcher.get_matching_blocks()
        # The last block is always a zero-length sentinel and the one before
        # it is the last real match.  Without any real match only the
        # sentinel exists; the ratio is then 0 and the loop stops below.
        if len(blocks) > 1:
            last = blocks[-2]
        else:
            last = blocks[-1]
        ratio = matcher.ratio()
        print("%s %s %s" % (step, last, ratio))
        if ratio < 0.1:
            print("low ratio")
            break

        shown, _unused = _render_side(
            pieces1, blocks, 0, last[0] + last[2], limit,
            "\033[1;32m%s\033[0;49m")
        print(shown)
        print("--------------------------------")

        shown, no_color = _render_side(
            pieces2, blocks, 1, last[1] + last[2], limit,
            "\033[1;31m%s\033[0;49m")
        print(shown)
        print("====================================")

        output = (output + "\n==[[Page:%s/%d]]==\n" % (djvuname, pagenum)
                  + no_color)
        offset = offset + len(no_color)

    if offset != 0 and target[offset:]:
        output = output + "\n== reste ==\n" + target[offset:]
    if offset == 0:
        output = ""
    return output
def do_align(pagename, djvuname, number):
    """Align the wiki page *pagename* with the text layer of *djvuname*.

    The djvu file is looked up as "File:<djvuname>" on the French Wikisource
    and downloaded with wget into the local "djvu" directory unless a copy is
    already there.  The text of the wiki page is then split page by page with
    do_match(), starting at djvu page *number*.

    pagename -- title of the wiki page holding the text
    djvuname -- name of the djvu file, without the "File:" prefix
    number   -- number of the first djvu page to align

    Returns the text produced by do_match(), or "" when the djvu file could
    not be downloaded.  Nothing is written back to the wiki.
    """
    site = wikipedia.getSite('fr', fam='wikisource')
    wikipedia.setAction("pagination")
    filepage = wikipedia.ImagePage(site, "File:" + djvuname)
    url = filepage.fileUrl()
    filename = "djvu/" + url.split('/')[-1]
    if not os.path.exists(filename):
        if not os.path.isdir("djvu"):
            os.makedirs("djvu")
        # Run wget without a shell: neither the file name nor the URL is
        # quoted for one, and both come from the wiki.
        try:
            status = subprocess.call(["wget", "-O", filename, url])
        except OSError:
            status = -1
        if status != 0:
            # "wget -O" creates the output file even when the download
            # fails; drop it so the next run does not take it for a cached
            # copy.
            if os.path.exists(filename):
                os.remove(filename)
            print("download failed: %s" % url)
            return ""
    page = wikipedia.Page(site, pagename)
    target = page.get()
    output = do_match(target, filename, djvuname, number)
    # Saving the result is disabled:
    #if output:
    #    page.put(output)
    return output
if __name__ == "__main__":
    # Script entry point: align one hard-coded page with its djvu file.
    # Other inputs tried before:
    #pagename = "Le Monde gréco-slave"
    #do_align("Les forceurs de blocus","Verne - Une ville flottante, 1872.djvu",129)
    page_title = "Brésil. - situation financière"
    djvu_file = "Revue des Deux Mondes - Période initiale, tome 1.djvu"
    first_page = 77
    do_align(page_title, djvu_file, first_page)