Utilisateur:Phe/Python/create ws link.py

# -*- coding: utf-8 -*-

import botpywi
import sys
import re
import wikipedia
import page_cache
import query_ext


try:
    import psyco
    psyco.full()
except ImportError:
    pass

class Options:
    pass

options = Options()

cache = page_cache.page_cache()

index_defs = {}

index_namespace = {
    'en' : u'Index',
    'fr' : u'Livre',
    }

template_toc_link = {
    'en' : u'TOC link',
    'fr' : u'lp',
    }

def get_index_def(opt, index_name):
    global index_defs
    if not index_defs.has_key(index_name):
        pagename = index_namespace[opt.lang] + u':' + index_name
        text = cache.get(pagename, site = opt.site)
        attribs = botpywi.explode_attrib(text, u'pagelist')
        index_defs[index_name] = attribs
    return index_defs[index_name]

# taken from http://code.activestate.com/recipes/81611/
def to_roman(num):
    ints = (1000, 900,  500, 400, 100,  90, 50,  40, 10,  9,   5,  4,   1)
    nums = ('M',  'CM', 'D', 'CD','C', 'XC','L','XL','X','IX','V','IV','I')
    result = ""
    for i in range(len(ints)):
        count = int(num / ints[i])
        result += nums[i] * count
        num -= ints[i] * count
    return result

# mimic ProofreadPage mediawiki extension by ThomasV
def page_number(i, args):
    mode = u'normal'
    offset = 0;
    for num, param in args:
        match = re.match(u'(\d+)to(\d+)', num)
        if (match and i >= int(match.group(1)) and i <= int(match.group(2))) or (num.isdecimal() and i == int(num)):
            for iparam in param.split(u';'):
                if iparam == u'roman' or iparam == u'highroman' or not iparam.isdecimal():
                    mode = iparam

        if num.isdecimal() and i >= int(num):
            for iparam in param.split(u';'):
                if iparam.isdecimal():
                    offset = int(num) - int(iparam)

    view = i - offset
    if mode == u'roman':
        view = to_roman(view).lower()
    elif mode == u'highroman':
        view = to_roman(view)
    elif mode == u'normal':
        view = unicode(view)
    else:
        view = mode

    return view, mode

def sub_pagename(prefix, pagename):
    if not prefix:
        return u' '
    if pagename.startswith(prefix):
        return pagename[len(prefix):]
    return None

# [[<noinclude>Page:Bulletin de la société géologique de France - 1re série - IV - 1833-1834.djvu/236</noinclude><includeonly>Bulletin de la société géologique de France - 1re série - IV - 1833-1834/Séance du 20 janvier 1834#168</includeonly>|168]]
def change_page_nr(match):
    if match.group(1) == u'-':
        return match.group(1) + match.group(2) + match.group(3)
    val = match.group(2)
    if not options.dict_page.has_key(val):
        print "unable to locate page:", val
        return match.group(1) + match.group(2) + match.group(3)

    if options.use_toc_link:
        toc_link = template_toc_link[options.lang]
        subpage = sub_pagename(options.prefix, options.dict_page[val][0])
        if subpage:
            if options.lang == 'en':
                return match.group(1) + u'{{' + toc_link + u'|' + options.dict_page[val][1] + u'|' + subpage + u'|' + val + u'}}' + match.group(3)
            else:
                return match.group(1) + u'{{' + toc_link + u'|' + subpage + u'|' + options.dict_page[val][1] + u'|' + val + u'}}' + match.group(3)
        else:
            print 'toc link cannot be used, prefix:', options.prefix, 'target pagename:', options.dict_page[val][0]

    # fall back or toc link not asked
    link = u"[[<noinclude>" + options.base_pagename + options.dict_page[val][1] + u"</noinclude><includeonly>" + options.dict_page[val][0]
    if options.anchor:
        link += u'#' + val 
    link += match.group(1) + u'</includeonly>|' + val + u']]' + match.group(3)
    return link

def change_page_nr_roman(match):
    val = match.group(1)
    if not options.dict_page.has_key(val):
        print "unable to locate page:", val
        return u'{{sc|' + match.group(1) + u'}}' + match.group(2)

    if options.use_toc_link:
        toc_link = template_toc_link[options.lang]
        subpage = sub_pagename(options.prefix, options.dict_page[val][0])
        if subpage:
            if options.lang == 'en':
                return u'{{' + toc_link + u'|' + options.dict_page[val][1] + u'|' + subpage + u'|' + val + u'}}' + match.group(2)
            else:
                return u'{{' + toc_link + u'|' + subpage + u'|' + options.dict_page[val][1] + u'|' + val + u'|{{sc|' +  match.group(1) + u'}}' + u'}}' + match.group(2)
        else:
            print 'toc link cannot be used, prefix:', options.prefix, 'target pagename:', options.dict_page[val][0]

    return u"[[<noinclude>" + options.base_pagename + options.dict_page[val][1] + u"</noinclude><includeonly>" + options.dict_page[val][0] + u'#' + val + u'</includeonly>|{{sc|' + val + u'}}]]' + match.group(2)

def load_data_from_prefix(opt):
    if not opt.prefix.endswith(u'/'):
        opt.prefix += u'/'

    cache.purge(opt.pagename, site = opt.site)

    gen = query_ext.PreloadingPagesStartswith(opt.prefix, site = opt.site)
    titles = []
    for p in gen:
        if not p[u'title'].endswith(u'/Texte entier'):
            titles.append(p[u'title'])

    titles = [ wikipedia.Page(site = opt.site, title = x) for x in titles ]

    cache.mass_load(titles)

    for p in titles:
        text = cache.read_page(p.title(), site = p.site())
        for it in re.finditer(u'{{[Pp]age\|.*?/(\d+)\|num=(.*?)}}', text):
            try:
                val = int(it.group(2))
                page_nr = it.group(1)
                opt.dict_page[unicode(val)] = [ p.title(), page_nr ]
            except ValueError:
                pass

        for it in re.finditer('{{[Pp]ageNum\|.*?\|nb=(.*?)\|from=(.*?)\|to=(.*?)}}', text):
            try:
                start = int(it.group(2))
                end = int(it.group(3))
                delta = int(it.group(1))
                for k in range(start, end + 1):
                    opt.dict_page[unicode(k + delta)] = [ p.title(), unicode(k) ]
            except ValueError:
                raise
                pass

        for it in re.finditer(u'(?ms)<pages (.*?)/>', text):
            #print it.group(1)
            start = int(re.sub(u'(?ms).*from=(["]?)(\d+)\\1.*', u"\\2", it.group(1)))
            end   = int(re.sub(u'(?ms).*to=(["]?)(\d+)\\1.*', u"\\2", it.group(1)))
            index = re.match(u'(?ms).*index="(.*?)".*', it.group(1)).group(1)
            if opt.index and opt.index != index:
                continue
            args = get_index_def(opt, index)
            #print start, end, p.title()
            for k in range(start, end + 1):
                page_nr = page_number(k, args)
                opt.dict_page[page_nr[0]] = [ p.title(), unicode(k) ]

        for it in re.finditer(u'(?ms)<pages (.*?)></pages>', text):
            #print it.group(1)
            start = int(re.sub(u'(?ms).*from=(["]?)(\d+)\\1.*', u"\\2", it.group(1)))
            end   = int(re.sub(u'(?ms).*to=(["]?)(\d+)\\1.*', u"\\2", it.group(1)))
            index = re.match(u'(?ms).*index="(.*?)".*', it.group(1)).group(1)
            if opt.index and opt.index != index:
                continue
            args = get_index_def(opt, index)
            #print start, end, p.title()
            for k in range(start, end + 1):
                page_nr = page_number(k, args)
                opt.dict_page[page_nr[0]] = [ p.title(), unicode(k) ]

def load_data_from_index(opt):
    # no dict_page, try to create it through the index
    # 48 [u'Highways and Byways in Sussex/Chichester and the Hills', u'76']
    # we can't do all the works as if subpage exist but at least we can create
    # the page number --> djvu page number index.
    # FIXME: this could be used to check than {{Page| is correctly used
    args = get_index_def(opt, opt.index)
    # FIXME: we must get the right djvu page number.
    # do it it the reverse way to ensure existing page def will overwrite
    # non-existing one in the dictionary
    for k in range(1200, 0, -1):
        page_nr = page_number(k, args)
        opt.dict_page[page_nr[0]] = [ u'', unicode(k) ]


# A set of string to replace with a magic string then back to the original
# to avoid substitution.
magic_string = [
    u'TOC row 1-1-1',
    u'TOC row 1-1-1-1',
    u'TOC row 1-out-1',
    u'TOC row 2-1',
    u'TOC row 2-1-1',
    u'TOC row 2out-1',
    u'{{t/c2}}',
    u'Haut2Colonnes',
    u'Fin2Colonnes',
]

# Yes the u'px' is a ugly hack
def add_magic_string(text):
    for pos, s in enumerate(magic_string):
        text = text.replace(s, u'__PHE_MAGIC_STRING__' + unicode(pos) + u'px')
    return text

def remove_magic_string(text):
    for pos, s in enumerate(magic_string):
        text = text.replace(u'__PHE_MAGIC_STRING__' + unicode(pos) + u'px', s)
    return text


def main(opt):
    opt.dict_page = {}

    if opt.prefix:
        load_data_from_prefix(opt)
 
    if len(opt.dict_page) == 0:
        load_data_from_index(opt)
    
    #if len(index_defs) > 1:
    #    raise "Can't support more than one index definition"

    text = cache.read_page(opt.pagename, site = opt.site)

    if opt.use_toc_link:
        toc_link = template_toc_link[opt.lang]
        text = re.sub(u'{{\s*' + toc_link + u'\s*\|[^|]*?\|[^|]*?\|[^|]*?\|({{sc\|[xlvicm]*)}}',u'\\1', text)
        text = re.sub(u'{{\s*' + toc_link + u'\s*\|[^|]*?\|[^|]*?\|([^}]*?)}}',u'\\1', text)
    text = re.sub(u"\[\[[^[|]*?\|(\d+)\]\]", "\\1", text)
    text = re.sub(u"\[\[[^[|]*?\|({{sc\|[xlvicm]+}})\]\]", "\\1", text)

    #text = re.sub(u'(?ms)^<noinclude>.*?</noinclude>', u'', text)
    #text = re.sub(u'(?ms)<noinclude>.*?</noinclude>$', u'', text)
    text = re.sub(u'(?ms)<noinclude>.*?</noinclude>(.*)<noinclude>.*?</noinclude>$', u'\\1', text)

    text = add_magic_string(text)

    # FIXME: this regexp is specific to en:
    text = re.sub(u'([^0-9])(\d+?)(?!px|em|th|st|rd|nd)([^%"0-9{]|$)', change_page_nr, text)
    # FIXME: this regexp is specific to fr:
    text = re.sub(u'{{sc\|([xlvicm]+)}}([^{])', change_page_nr_roman, text)

    text = remove_magic_string(text)

    #wikipedia.showDiff(cache.read_page(opt.pagename, site = opt.site), text)

    #text = re.sub(u'(?ms)^<noinclude>.*?</noinclude>', u'', text)
    #text = re.sub(u'(?ms)<noinclude>.*?</noinclude>$', u'', text)
    print text


if __name__ == "__main__":
    try:
        options.lang = None
        options.index = None
        options.prefix = None
        options.anchor = True
        for arg in sys.argv[1:]:
            arg = unicode(arg,  'utf-8')
            if arg.startswith('-prefix:'):
                options.prefix = arg[len(u'-prefix:'):]
            elif arg.startswith('-lang:'):
                options.lang = arg[len(u'-lang:'):]
            elif arg.startswith('-no_anchor'):
                options.anchor = False
            elif arg.startswith('-index:'):
                options.index = arg[len(u'-index:'):]
            elif arg == u'-help':
                print sys.argv[0], "-lang:lang_code -index:index_name -prefix:prefix_page_name -no_anchor page_to_handle"
                sys.exit(1)
            else:
                options.pagename = arg

        if options.index == None and options.prefix == None:
            print sys.argv[0], "either -prefix: or -index: must be given"
            sys.exit(1)

        options.use_toc_link = template_toc_link.has_key(options.lang)

        components = options.pagename.split(u'/')
        options.base_pagename = u"/".join(components[:len(components)-1]) + u'/'

        options.site = wikipedia.getSite(code = options.lang, fam = 'wikisource')

        main(options)
    finally:
        wikipedia.stopme()
        cache.save()