Utilisateur:ThomasV/align.py

#!/usr/bin/python
# text alignment program
# author : thomasv1 at gmx dot de
# licence : GPL

import difflib
import os
import re
import string
import subprocess
import sys
import urllib

sys.path.append("../pywikipedia")
import wikipedia, pagegenerators, catlib


def _read_page_text(djvutxt, filename, pagenum):
    """Return what `djvutxt --page=pagenum filename` writes to stdout.

    Returns "" when the executable cannot be started, which the caller
    handles like a page without text.
    """
    try:
        # Argument list, no shell: file names with spaces, quotes or other
        # shell metacharacters are passed through untouched.
        proc = subprocess.Popen(
            [djvutxt, "--page=%d" % pagenum, filename],
            stdout=subprocess.PIPE)
    except OSError:
        return ""
    text = proc.communicate()[0]
    if not isinstance(text, str):
        # Python 3 hands back bytes; on Python 2 the byte string is kept
        # as is.  NOTE(review): assumes djvutxt emits UTF-8 -- confirm.
        text = text.decode("utf-8", "replace")
    return text


def _render_side(pieces, blocks, side, end, page1_len, color):
    """Render words [0, end) of one side of the comparison.

    pieces    -- text split with the separators kept: word, sep, word, ...
    blocks    -- matching blocks (a_start, b_start, size) from SequenceMatcher
    side      -- 0 to walk the first sequence, 1 to walk the second
    page1_len -- number of words of the first sequence that belong to the
                 current page; rendering stops at the first matched word
                 that maps at or beyond this position
    color     -- format string applied to matched words

    Returns (colored, plain).  A word is emitted only if a separator follows
    it, so a final word without trailing separator is left out of both.
    """
    colored = []
    plain = []
    for idx in range(end):
        matched = False
        overflow = False
        for block in blocks:
            if block[side] <= idx < block[side] + block[2]:
                matched = True
                # Position of this word in the first sequence.
                overflow = block[0] + idx - block[side] >= page1_len
                break
        if overflow:
            # Nothing is emitted once the page boundary has been crossed.
            break
        if 2 * idx + 1 < len(pieces):
            word = pieces[2 * idx]
            sep = pieces[2 * idx + 1]
            if matched:
                colored.append(color % word + sep)
            else:
                colored.append(word + sep)
            plain.append(word + sep)
    return "".join(colored), "".join(plain)


def do_match(target, filename, djvuname, number,
             djvutxt="/home/thomasv/djvulibre/tools/djvutxt"):
    """Cut `target` into per-page sections using the text layer of a DjVu file.

    For each page, starting at page `number`, the djvutxt output of that page
    and of the following one is compared word by word with the next slice of
    `target`.  The part of `target` attributed to the current page is appended
    to the result under a "==[[Page:djvuname/pagenum]]==" header.  Both sides
    of every comparison are printed with the matched words highlighted
    (ANSI colours).

    target   -- text to split
    filename -- path of the DjVu file handed to djvutxt
    djvuname -- name used in the generated page headers
    number   -- first DjVu page number to read
    djvutxt  -- path of the djvutxt executable

    Returns the concatenated sections, followed by a "== reste ==" section
    holding whatever part of `target` was not consumed.  Returns "" when
    nothing could be aligned.
    """
    matcher = difflib.SequenceMatcher()
    # Loop-invariant patterns: one drops the separators, one keeps them so
    # the original spacing and punctuation can be restored.
    word_sep = re.compile(r'[\W]+')
    word_sep_keep = re.compile(r'([\W]+)')
    offset = 0
    sections = []
    for page_index in range(1000):

        if page_index == 10 and offset == 0:
            # no text layer: ten pages in a row consumed nothing
            return ""

        pagenum = page_index + number
        page1 = _read_page_text(djvutxt, filename, pagenum)
        page2 = _read_page_text(djvutxt, filename, pagenum + 1)

        text1 = page1 + page2
        # Candidate slice of the target, 1.5 times as long as the two pages.
        text2 = target[offset:offset + int(1.5 * len(text1))]

        ftext1 = word_sep_keep.split(text1)
        ftext2 = word_sep_keep.split(text2)

        page1_len = len(word_sep.split(page1))
        matcher.set_seqs(word_sep.split(text1), word_sep.split(text2))

        mb = matcher.get_matching_blocks()
        ratio = matcher.ratio()
        # The last entry of mb is always a zero-length sentinel; the real
        # last match, if any, is the one before it.
        last = mb[-2] if len(mb) > 1 else None
        # Single-argument print() gives the same output on Python 2 and 3.
        print("%s %s %s" % (page_index, last, ratio))

        if last is None or ratio < 0.1:
            print("low ratio")
            break

        shown, _ = _render_side(ftext1, mb, 0, last[0] + last[2],
                                page1_len, "\033[1;32m%s\033[0;49m")
        print(shown)
        print("--------------------------------")

        shown, no_color = _render_side(ftext2, mb, 1, last[1] + last[2],
                                       page1_len, "\033[1;31m%s\033[0;49m")
        print(shown)
        print("====================================")

        sections.append("\n==[[Page:%s/%d]]==\n" % (djvuname, pagenum))
        sections.append(no_color)
        offset = offset + len(no_color)

    if offset == 0:
        return ""

    if target[offset:]:
        sections.append("\n== reste ==\n" + target[offset:])

    return "".join(sections)

def do_align(pagename, djvuname, number):
    """Fetch a wiki page and its DjVu file, then run do_match on them.

    pagename -- title of the page on fr.wikisource whose text is aligned
    djvuname -- name of the DjVu file on the wiki (without "File:" prefix)
    number   -- first DjVu page number, passed on to do_match

    The DjVu file is downloaded with wget into the "djvu/" directory unless
    a file of that name is already there.  The result of do_match is not
    saved: the page.put call is commented out, so this is a dry run.
    """
    site = wikipedia.getSite('fr',fam='wikisource')
    wikipedia.setAction("pagination")
    filepage = wikipedia.ImagePage(site,"File:"+djvuname)
    url = filepage.fileUrl()
    filename = "djvu/"+url.split('/')[-1]
    if not os.path.exists(filename):
        cache_dir = os.path.dirname(filename)
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        try:
            # Argument list, no shell: the URL and file name may contain
            # characters that a shell would interpret.
            status = subprocess.call(["wget", "-O", filename, url])
        except OSError:
            # wget could not be started at all
            status = -1
        if status != 0:
            # A failed "wget -O" may leave an empty or partial file behind;
            # remove it so the next run downloads again instead of reusing it.
            if os.path.exists(filename):
                os.remove(filename)
            print("download failed: %s" % url)
            return
    page = wikipedia.Page(site,pagename)
    target = page.get()
    output = do_match(target, filename, djvuname, number)
    #if output:
    #    page.put(output)


if __name__ == "__main__":
    # Sample run: wiki page title, DjVu file name, first DjVu page number.
    # The commented-out lines are other inputs that were tried.
    #pagename = "Le Monde gréco-slave"
    do_align(
        pagename="Brésil. - situation financière",
        djvuname="Revue des Deux Mondes - Période initiale, tome 1.djvu",
        number=77,
    )
    #do_align("Les forceurs de blocus","Verne - Une ville flottante, 1872.djvu",129)