Utilisateur:Phe/Python/align.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# text alignment program
# author : thomasv1 at gmx dot de
# licence : GPL

import os, string, re, sys
import difflib, urllib

sys.path.append("../pywikipedia")
import wikipedia, pagegenerators, catlib

# Path of the djvutxt executable; do_match runs it with --page=N to read the
# text of individual pages of a djvu file.
djvutxt = '/home/thomasv/djvulibre/tools/djvutxt'

def do_match(target, filename, djvuname, number):

    s = difflib.SequenceMatcher()
    offset = 0
    output = u""
    for i in range(1000):

        if i==10 and offset==0:
            #no text layer
            return u""

        pagenum=i+number
        p = os.popen(djvutxt + " --page=%d %s"% (pagenum,filename))
        page1 = unicode(p.read(), 'utf-8')
        p.close()
        p = os.popen(djvutxt + " --page=%d %s "%((pagenum+1),filename))
        page2 = unicode(p.read(), 'utf-8')
        p.close()

        text1 = page1+page2
        text2 = target[offset:offset+ int(1.5*len(text1))]

        p = re.compile(ur'[\W]+', re.U)
        fp = re.compile(ur'([\W]+)', re.U)
        ftext1 = fp.split(text1)
        ftext2 = fp.split(text2)

        page1 = p.split(page1)
        text1 = p.split(text1)
        text2 = p.split(text2)
        s.set_seqs(text1,text2)

        mb = s.get_matching_blocks()

        ccc = mb[-2]
        dummy = mb[-1]
        ratio = s.ratio()
        print i, ccc, ratio

        if ratio<0.1:
            print "low ratio"
            break
        mstr=u""
        overflow = False
        for i in range(ccc[0]+ccc[2]):
            matched = False
            for m in mb:
                if i >= m[0] and i < m[0]+m[2] :
                   matched = True
                   if i >= len(page1):
                       overflow = True
                   break
            if not overflow:
                ss = ftext1[2*i]
                if matched : ss =u"\033[1;32m%s\033[0;49m"%ss
                if 2*i+1 < len(ftext1):
                    mstr = mstr + ss +ftext1[2*i+1]
        print mstr
        print "--------------------------------"

        mstr=u""
        no_color = u""
        overflow = False
        for i in range(ccc[1]+ccc[2]):
            matched = False
            for m in mb:
                if i >= m[1] and i < m[1]+m[2] :
                   matched = True
                   if m[0]+i-m[1] >= len(page1):
                       overflow = True
                   break

            if not overflow:
                ss = ftext2[2*i]
                if matched : ss =u"\033[1;31m%s\033[0;49m"%ss
                if 2*i+1 < len(ftext2):
                    mstr = mstr + ss +ftext2[2*i+1]
                    no_color = no_color + ftext2[2*i] + ftext2[2*i+1]
        print mstr
        print "===================================="

        output = output + u"\n==[[Page:%s/%d]]==\n"%(djvuname,pagenum) + no_color
        offset = offset + len(no_color)

    if offset!=0 and target[offset:]:
        output = output+u"\n== reste ==\n" + target[offset:]

    if offset==0:
        output = u""

    return output

def get_djvu_filename(filename):
    """Return the path of a djvu file below the 'RDDM/' directory.

    Every space of `filename` is turned into an underscore before the
    directory prefix is added.
    """
    underscored = '_'.join(filename.split(' '))
    return 'RDDM/' + underscored

def do_align(pagename, djvuname, number):
    """Align a fr.wikisource page with a djvu file and offer to save the result.

    pagename -- title of the wiki page to process
    djvuname -- name of the djvu file; used for the page headings and, via
                get_djvu_filename, to find the file on disk
    number   -- djvu page number at which the comparison starts

    The TextQuality/div wrapper and the closing references/div are removed
    from the page text, and a leading {{journal|...}} header is set aside,
    before the text is handed to do_match.  If do_match produces a result,
    the header is put back in front of it, a diff against the current page
    is shown, and the page is saved only if the user confirms.
    """
    site = wikipedia.getSite('fr',fam='wikisource')
    wikipedia.setAction("pagination")

    page = wikipedia.Page(site, pagename)
    old_target = target = page.get()
    if number is None:
        # FIXME: no default start page is chosen when none is given
        pass

    target = re.sub(u'{{TextQuality\|50%}}<div class="text">\n', '', target)
    target = re.sub(u'<references/>\n</div>', u'', target)

    match = re.match(u'({{journal\|[^}]*}}\n)', target)
    target = re.sub(u'{{journal\|[^}]*}}\n', u'', target)

    filename = get_djvu_filename(djvuname)
    output = do_match(target, filename, djvuname, number)
    if not output:
        # do_match could not align anything.  Stop here so that a page
        # holding nothing but the journal header is never proposed.
        return

    if match:
        output = match.group(1) + output

    wikipedia.showDiff(old_target, output)
    choice = wikipedia.inputChoice(u'Upload ?', [ 'Yes', 'No' ], [ 'Y', 'N'], 'N')
    if choice in ('Y', 'y'):
        page.put(output)

if __name__ == "__main__":
    # Command line: <page name> <djvu name> [<first djvu page number>]
    try:
        if len(sys.argv) < 3:
            # Report missing arguments plainly instead of failing with an
            # IndexError; the finally clause below still runs.
            sys.exit("usage: %s pagename djvuname [first_page_number]" % sys.argv[0])
        page_number = None
        if len(sys.argv) > 3:
            page_number = int(sys.argv[3])
        do_align(unicode(sys.argv[1], 'utf-8'), sys.argv[2], page_number)
        # Example of an equivalent direct call:
        #do_align(u"Etudes sur l'Angleterre : les classes inférieures", "Revue des Deux Mondes - 1845 - tome 11.djvu", 35)
    finally:
        wikipedia.stopme()