Utilisateur:Phe/Scripts/create dl.py

# -*- coding: utf-8 -*-

import botpywi
import query_ext
import sys
import re
import wikipedia
import locale

AL_BY_VOLUME = 1
AL_ALPHA = 2

def transform_link(match, template, blacklist):
    if not match.group(2) in blacklist:
        text = match.group(1) + u' {{' + template + u'|' + match.group(2).upper() + u'|{{sc|' + match.group(2) + u'}}}}'
    else:
        text = match.group(1) + u' {{sc|' + match.group(2) + u'}}'
    if len(match.groups()) > 3 and match.group(3):
        for it in re.finditer(u', {{sc\|([^}]*?)}}', match.group(3)):
            text += u', {{' + template + u'|' + it.group(1).upper() + u'|{{sc|' + it.group(1) + u'}}}}'
    return text

# u"\\1 {{Tr6L|\\2|{{sc|\\2}}}}"
def transform_trevoux_link(match):
    blacklist = set( [
            u'Le P. Mabillon', u'Maty', u'Ducange', u'Durand', u'Pasquier',
            u'Lymnæus'
            ] )

    return transform_link(match, u'Tr6L', blacklist)

def transform_michaud_link(match):
    return transform_link(match, u'Mich2L', set())

Trevoux_1771_data = {
    u'prefix dl' : u'Utilisateur:Phe/Dictionnaire et encyclopédie/Trévoux',
    u'prefix article' : u'Utilisateur:Phe/Dictionnaire et encyclopédie/Trévoux',
    u'need dl prefix code' : True,
    u'need article list' : AL_BY_VOLUME | AL_ALPHA,
    u'suffix article list alpha' : u'Index alphabétique',
    u'cat article' : u'Articles du Dictionnaire de Trévoux, 6e édition',
    u'create link' : { u'regexp' : [ u"(Voyez|''Voyez'') {{sc\|([^}]*?)}}((, {{sc\|([^}]*?)}})*)", transform_trevoux_link ], u'skip regexp' : u'{{[Tt]r6l[ ]*\|' },
    u'error page' : u'Discussion utilisateur:Phe/Dictionnaire et encyclopédie/Trévoux',
}

Encyclo_diderot_1_data = {
    u'prefix dl' : u'L’Encyclopédie/1re édition',
    u'prefix article' : u'L’Encyclopédie/1re édition',
    u'need dl prefix code' : True,
    u'need article list' : AL_BY_VOLUME | AL_ALPHA,
    u'suffix article list alpha' : u'Index alphabétique',
    u'cat article' : u'L’Encyclopédie, 1re édition',
    u'error page' : u'Discussion:L’Encyclopédie/1re édition/Erreur',
    #u'cat article regexp' : [ (u'([Vv]ille|riviere|île\b)', u'Géographie'), (u'Norwege', u'Norvège') , (u'\bHist\.', u'Histoire'), (u'Géog\.', u'Géographie'), (u'Art méchaniq', u'Mécanique'), (u'\barbres\b', u'Botanique'), (u'Jurisp', u'Jurisprudence'), (u'Myth', u'Mythologie'), (u'Arith\.', u'Mathématiques'), (u'Comm\.', u'Commerce'), ('Theol\.', u'Théologie'), (u'Œconomie.', u'Économie'), (u'maladie', u'Médecine'), (u'Mathématiciens', u'Mathématiques'), (u'Botan', u'Botanique'), (u'Suede', u'Suède') ],
    #u'cat article regexp self' : [ u'(France|Allemagne|Italie|Suisse|Angleterre|Espagne|Afrique|Musique|Astronomie|Chimie|Marine|Physiologie|Philosophie)',  ],
    u'cat article module' : 'Diderot_cat',
    u'if not cated' : [ u'à revoir' ],
    u'signature' : [ ( u"\(''A''\)", u"Antoine-Gaspard Boucher d’Argis|Boucher d’Argis"), (u"\(''a''\)", u"Nicolas Lenglet Du Fresnoy|Du Fresnoy"), (u"\(''B''\)", u"Louis de Cahusac|Cahusac"), (u"\(''C''\)", u"Jean Pestré|Pestré"), (u"\(''c''\)", u"Louis Jean-Marie Daubenton|Louis Daubenton"), (u"\(''D''\)", u"Louis-Jacques Goussier|Goussier"), (u"\(''d''\)", u"Arnulphe d’Aumont|Aumont"), (u"\(''E''\)", u"Jean-Baptiste de La Chapelle|La Chapelle"), (u"\(''e''\)", u"Claude Bourgelat|Bourgelat"), (u"\(''F''\)", u"César Chesneau Dumarsais|Dumarsais"), (u"\(''f''\)", u"Jacques-François de Villiers|Villiers"), (u"\(''G''\)", u"Edme-François Mallet|Mallet"), (u"\(''g''\)", u"Paul-Joseph Barthez|Barthez"), (u"\(''H''\)", u"François-Vincent Toussaint|Toussaint"), (u"\(''h''\)", u"André Morellet|Morellet"), (u"\(''I''\)", u"Pierre Daubenton|Daubenton"), (u"\(''K''\)", u"Antoine Joseph Dezallier d’Argenville|Argenville"), (u"\(''L''\)", u"Pierre Tarin|Tarin"), (u"\(''M''\)", u"Paul-Jacques Malouin|Malouin"), (u"\(''m''\)", u"Jean-Joseph Menuret|Menuret"), (u"\(''N''\)", u"Urbain de Vandenesse|Vandenesse"), (u"\(''O''\)", u"Jean le Rond d’Alembert|d’Alembert"), (u"\(''P''\)", u"Jacques-François Blondel|Blondel"), (u"\(''Q''\)", u"Guillaume Le Blond|Le Blond"), (u"\(''R''\)", u"Paul Landois|Landois"), (u"\(''S''\)", u"Jean-Jacques Rousseau|Rousseau"), (u"\(''T''\)", u"Charles Le Roy|Le Roy"), (u"\(''V''\)", u"Marc-Antoine Eidous|Eidous"), (u"\(''X''\)", u"Claude Yvon|Yvon"), (u"\(''Y''\)", u"Antoine Louis|Louis"), (u"\(''Z''\)", u"Jacques-Nicolas Bellin|Bellin"), (u"\(''C\. D\. J\.''\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''D\. J\.''\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''C\. D\. J''\.\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''D\. J''\.\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''D\. J\.''\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''Le chevalier {{sc\|de Jaucourt}}''\.\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''Le Chevalier {{sc\|de Jaucourt}}''\.\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''D\.J''\.\)", u"Louis de Jaucourt|Jaucourt"), (u"\({{sc\|de Jaucourt}}\)", u"Louis de Jaucourt|Jaucourt"), (u"{{sc\|de Jaucourt}}", u"Louis de Jaucourt|Jaucourt"),(u"{{sc\|de Jaucourt\.}}", u"Louis de Jaucourt|Jaucourt"),(u"\(''D\. J''\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''D J''\.\)", u"Louis de Jaucourt|Jaucourt"), (u"\({{sc\|de Jaucourt\.}}\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''Le Chevalier ''\.\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''C\. D\. J\.''\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''\D. J''\.\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''E\. R\. M\.''\)", u"[[Auteur:Jacques-Philippe-Augustin Douchet|Douchet]] & [[Auteur:Nicolas Beauzée|Beauzée]]"), (u"<nowiki>\*</nowiki>", u"Denis Diderot|Diderot"), (u"{{sc\|\*[^}]*}}", u"Denis Diderot|Diderot"), (u"\(—\)", u"Paul Henri Thiry d’Holbach|Baron d’Holbach"), (u"\(''b''\)", u"Gabriel François Venel|Venel") ],
}

Michaud_1843_data = {
    u'prefix dl' : u'Biographie universelle ancienne et moderne/2e éd., 1843',
    U'prefix article' : u'Biographie universelle ancienne et moderne/2e éd., 1843',
    u'need dl prefix code' : True,
    u'need article list' : AL_BY_VOLUME | AL_ALPHA,
    u'suffix article list alpha' : u'Index alphabétique',
    u'cat article' : u'Articles de la Biographie universelle, Michaud, 2e édition',
    u'create link' : { u'regexp' : [ u"(voy\.|''voy\.''|''Voy\.''|Voy\.|Voyez|''Voyez'') {{sc\|([^}]*?)}}((, {{sc\|([^}]*?)}})*)", transform_michaud_link ], u'skip regexp' : u'{{[Mm]ich2L[ ]*\|' },
    u'error page' : u'Discussion:Biographie universelle ancienne et moderne/2e éd., 1843/Erreurs',
}

Celestin_port_data = {
    U'prefix article' : u'Dictionnaire historique, géographique et biographique du Maine-et-Loire',
    u'need dl prefix code' : True,
    u'need article list' : AL_BY_VOLUME | AL_ALPHA,
    u'suffix article list alpha' : u'Index alphabétique',
    u'cat article' : u'Articles du Célestin Port',
    u'create link' : { u'regexp' : [ u"(voy\.|''voy\.''|''Voy\.''|Voy\.|Voyez|''Voyez'') {{sc\|([^}]*?)}}((, {{sc\|([^}]*?)}})*)", transform_michaud_link ], u'skip regexp' : u'{{[Mm]ich2L[ ]*\|' },
    u'error page' : u'Discussion:Dictionnaire historique, géographique et biographique du Maine-et-Loire/Erreurs',
}

page_prefix = {
    u'fr' : u'Page',
}

dl_prefix_code = {}

dl_prefix_code[u'fr'] = u"""{| style="width:100%%;"
|
<div id="dynamic_links" class="nopopups" title="%s" style="height:800px;overflow:auto;margin-left:3em;text-align:left;" >
"""

dl_suffix_code = {}

dl_suffix_code[u'fr'] = u"""</div>
|
<div style="padding:10px;height:800px;overflow:auto;">
<div id="dict_entry" class=text "/>
</div>
|}"""

index_prefix_code = u'<div style="text-align:left;">\n{{colonnes|nombre=4|\n1=\n'
index_suffix_code = u'}}\n</div>'

class VolumeSet(object):
    def __init__(self, descr):
        self.prefix_dl = descr[u'prefix dl']
        self.prefix_article = descr[u'prefix article']
        self.volumes = []
        self.need_dl_prefix_code = descr[u'need dl prefix code']
        self.need_article_list = descr[u'need article list']
        self.suffix_article_list_alpha = descr[u'suffix article list alpha']
        self.cat_article = descr[u'cat article']
        self.create_link = descr.get(u'create link', u'')
        self.error_page = descr[u'error page']
        self.cat_article_regexp = descr.get(u'cat article regexp', [])
        self.cat_article_regexp_self = descr.get(u'cat article regexp self', [])
        self.cat_article_module = descr.get(u'cat article module', '')
        self.cat_article_module_re = {}
        if self.cat_article_module:
            module = __import__(self.cat_article_module)
            for key in module.cat:
                result = []
                for pat in module.cat[key]:
                    result.append(botpywi.quote_regex(pat))
                regexp = u"|".join(result)
                regexp = regexp.replace(u' ', u'\\s')
                regexp = regexp.replace(u"'", u"'*")
                self.cat_article_module_re[key] = re.compile(regexp)
        self.signature = []
        signature = descr.get(u'signature', [])
        for sign, author in signature:
            self.signature.append( (re.compile(sign), author) )
        self.if_not_cated = descr.get(u'if not cated')
        self.error = u''

class Volume(object):
    def __init__(self, descr):
        self.suffix_dl = descr[u'suffix dl']
        self.suffix_article_list = descr[u'suffix article list']
        self.prefix_page = descr[u'prefix page']
        self.create_link = descr.get(u'create link', u'')
        self.pages_text = {}

class Trevoux_1771(VolumeSet):
    def __init__(self):
        VolumeSet.__init__(self, Trevoux_1771_data)
        for i in [ (u'I', 1) ]: #, (u'II', 2), (u'III', 3), (u'IV', 3), (u'V', 4), (u'VI', 5), (u'VII', 6), (u'VIII', 8)]:
            descr_volume = {
                u'suffix dl' : u'Tome %d' % i[1],
                u'suffix article list' : u'Index tome %s' % i[0],
                u'prefix page' : u'Dictionnaire de Trévoux, 1771, %s.djvu' % i[0],
                }
            self.volumes.append(Volume(descr_volume))

class Encyclo_diderot_1(VolumeSet):
    def __init__(self):
        VolumeSet.__init__(self, Encyclo_diderot_1_data)
        for i in range(1, 18):
            basename = u'Diderot - Encyclopedie 1ere edition tome %d.djvu' % i
            descr_volume = {
                u'suffix dl' : u'Volume %d' % i,
                u'suffix article list' : u'Index tome %d' % i,
                u'prefix page' : basename,
                }
            self.volumes.append(Volume(descr_volume))

class Michaud_1843(VolumeSet):
    def __init__(self):
        VolumeSet.__init__(self, Michaud_1843_data)
        for i in range(1, 3):
            basename = u'Michaud - Biographie universelle ancienne et moderne - 1843 - Tome %d.djvu' % i
            descr_volume = {
                u'suffix dl' : 'Tome %d' % i,
                u'suffix article list' : u'Index tome %d' % i,
                u'prefix page' : basename,
                }
            self.volumes.append(Volume(descr_volume))

predef_dict = {
    u'Trévoux 1771' : Trevoux_1771(),
    u'Encyclopédie Diderot 1' : Encyclo_diderot_1(),
    u'Michaud 1843' : Michaud_1843(),
}

class Statistics:
    def __init__(self):
        self.estimed_article = 0
        self.total_article = 0
        self.cated_article = 0
        self.authored_article = 0
        self.total_cat = 0
        self.total_authors = 0

    def ratio(self, total, b):
        if total == 0 or b == 0:
            return 0.0
        return 1.0 - ((float(total) - b) / total)

    def __str__(self):
        text = u'articles: estimed, total, total cat, total authors: %d %d %d %d\n' % (self.estimed_article, self.total_article, self.total_cat, self.total_authors)
        text += u"cated article: %f\n" % self.ratio(self.total_article, self.cated_article)
        text += u"authored article: %f" % self.ratio(self.total_article, self.authored_article)
        return text

class Options:
    def __init__(self):
        self.save_dl = False
        self.save_alpha_index = False
        self.save_volume_index = False
        self.save_article = False
        self.do_links = False

def compare_title(a, b):
    first = int(re.match(u'.*/(\d+)', a[2]).group(1))
    second = int(re.match(u'.*/(\d+)', b[2]).group(1))
    return first - second

def fill_dict_section(volume_set, volume, text, page_nr, dict_section, order, opt):
    for it in re.finditer(u'<section begin=(["]?)(.*?)\\1[ ]*/>', text):
        if not dict_section.has_key(it.group(2)):
            dict_section[it.group(2)] = [ order ]
            order += 1
        elif order - 1 != dict_section[it.group(2)][0]:
            # FIXME: not sufficient if we have twice the same section
            # name consecutively
            volume_set.error += u'# duplicate section [[' + page_prefix[opt.site.code] + u':' + volume.prefix_page + u'/' + unicode(page_nr) + u'|' + unicode(page_nr) + u']], ' + unicode(dict_section[it.group(2)][1]) + u', ' + unicode(dict_section[it.group(2)][-1]) + u', ' + it.group(2) + u'\n'
        dict_section[it.group(2)].append(page_nr)
    return order

def create_link(page, volume_set, volume, opt):
    cl = (volume.create_link or volume_set.create_link) and opt.do_links
    if cl:
        regexp = cl[u'regexp']
        content = page[u'revisions'][0]['*']
        if not re.search(cl[u'skip regexp'], content):
            new_content = re.sub(regexp[0], regexp[1], content)
            if new_content.strip(u'\n') != content.strip(u'\n'):
                save_page(page[u'title'], new_content, opt, old_text = content,
                          comment = u"mise à jour des liens")

def filter_table_entry(entry):
    return entry[2].endswith(u'(nobot)')

def scan_pages(base_page, volume_set, volume, opt):
    gen = query_ext.PreloadingPagesStartswith(base_page, site = opt.site)
    page_list = []
    for p in query_ext.PreloadingContents(gen, site = opt.site):
        page_nr = int(re.match(u'.*/(\d+)', p[u'title']).group(1))
        page_list.append((page_nr, p[u'revisions'][0]['*'], p[u'title']) )
        create_link(p, volume_set, volume, opt)
        volume.pages_text[page_nr] = p[u'revisions'][0]['*']
    page_list.sort(compare_title)

    order = 0
    dict_section = {}
    for p in page_list:
        order = fill_dict_section(volume_set, volume, p[1], p[0],
                                  dict_section, order, opt)

    table = [ (dict_section[key][0], dict_section[key][1:], key, volume.prefix_page) for key in dict_section ]
    table = [ x for x in table if not filter_table_entry(x) ]
    table.sort()
    # format : (order, [page numbers], title, prefix_page)
    return table

def generate_dl(table):
    text = u''
    for p in table:
        text += u'*[[DL#' + unicode(p[1][0]) + u':' + unicode(p[1][-1]) + u'|' + p[2] + u']]\n'
    return text

def get_categories(volume_set, pages_text, stats):
    cats = set()
    for regexp, replace in volume_set.cat_article_regexp:
        if re.search(regexp, pages_text):
            cats.add(replace)
    for regexp in volume_set.cat_article_regexp_self:
        match = re.search(regexp, pages_text)
        if match:
            cats.add(match.group(1))

    for key in volume_set.cat_article_module_re:
        if volume_set.cat_article_module_re[key].search(pages_text):
            cats.add(key)

    cats = [x for x in cats]
    cats.sort(locale.strcoll)
    text = u''
    for c in cats:
        text += u'\n[[Catégorie:Articles de dictionnaire - ' + c + u']]'

    if len(cats):
        stats.cated_article += 1
        stats.total_cat += len(cats)
    else:
        for c in volume_set.if_not_cated:
            text += u'\n[[Catégorie:Articles de dictionnaire - ' + c + u']]'

    return text

def get_authors(volume_set, text, stats):
    authors = set()
    for sign, author in volume_set.signature:
        if sign.search(text):
            if u'[' in author:
                authors.add(author) 
            elif u'|' in author:
                authors.add(u'[[Auteur:' + author + u']]')
            else:
                authors.add(u'[[Auteur:' + author + u'|' + author + u']]')

    if len(authors):
        stats.authored_article += 1
        stats.total_authors += len(authors)

    return u', '.join(authors)

def has_ref(text):
    return re.search(u'<ref>', text)

def generate_article(table, basename, volume_set, volume,
                     idx_volume, opt, stats):
    pagenames = []
    for i, p in enumerate(table):
        pagenames.append({u'title' : basename + u'/' + p[2]})
    pages = {}
    for p in query_ext.PreloadingContents(pagenames, site = opt.site):
        pages[p[u'title']] = p
        
    for i, p in enumerate(table):
        #if i > 10 or p[2] in [ u'ABACE, ABÉCE', u'ABAQUE' ]:
        if idx_volume > 3: # or i <= 2000:
            continue

        stats.total_article += 1

        params = {}
        params[u'index'] = volume.prefix_page
        params[u'from'] = p[1][0]
        params[u'to'] = p[1][-1]
        params[u'from section'] = p[2]
        params[u'to section'] = p[2]
        params[u'prev'] = ''
        if i:
            params[u'prev'] = u'[[../%s/]]' % table[i-1][2]
        else:
            if idx_volume:
                prev_volume = volume_set.volumes[idx_volume - 1]
                if len(prev_volume.table):
                    prev_entry = prev_volume.table[len(prev_volume.table) - 1]
                    params[u'prev'] = u'[[../%s/]]' % prev_entry[2]
        params[u'next'] = ''
        if i < len(table) - 1:
            params[u'next'] = u'[[../%s/]]' % table[i+1][2]
        else:
            if idx_volume < len(volume_set.volumes) - 1:
                next_volume = volume_set.volumes[idx_volume + 1]
                if len(next_volume.table):
                    next_entry = next_volume.table[0]
                    params[u'next'] = u'[[../%s/]]' % next_entry[2]
        ptext  = u'<pages index="%(index)s" from=%(from)d to=%(to)d' % params
        ptext += u' fromsection="%(from section)s"' % params
        ptext += u' tosection="%(to section)s"' % params
        ptext += u' header=1 prev="%(prev)s" next="%(next)s"' % params

        pages_text = u''
        sect = botpywi.quote_regex(p[2])
        sect_begin = u'<section begin=(["]?)%s\\1[ ]*/>' % sect
        sect_end = u'<section end=(["]?)%s\\3[ ]*/>' % sect
        for i in range(params[u'from'], params[u'to'] + 1):
            temp = volume.pages_text[i]
            if i == params[u'from'] or i == params[u'to']:
                temp = re.sub(u'(?ms).*%s(.*)%s.*' % (sect_begin, sect_end),
                              u'\\2', temp)
            pages_text += temp

        article_name = basename + u'/' + p[2]
        old_attrib = []
        if not pages[article_name].has_key(u'missing'):
            old_text = pages[article_name][u'revisions'][0][u'*']
            if re.match(u'(?ms).*<pages[ \n](.*?)/>.*', old_text):
                old_attrib = botpywi.explode_attrib(old_text, u'pages')

        authors = get_authors(volume_set, pages_text, stats)
        for name, attr in old_attrib:
            if name == u'auteur':
                authors = attr
                break
        if authors:
            ptext += u' auteur="%s"' % authors
        ptext += u' />'

        text = ptext

        if has_ref(pages_text):
            text += u'\n----\n<references />'

        if volume_set.cat_article:
            #text += u'\n\n{{DEFAULTSORT:%s}}' % p[2]
            text += u'\n\n[[Catégorie:' + volume_set.cat_article + u']]'

        text += get_categories(volume_set, pages_text, stats)

        if not pages.has_key(article_name):
            volume_set.error += u'# section invalide : "' + p[2] + '"\n'
            continue
        if not pages[article_name].has_key(u'missing'):
            old_text = pages[article_name][u'revisions'][0][u'*']
            if re.match(u'(?ms).*<pages[ \n](.*?)/>.*', old_text):
                attrib = botpywi.explode_attrib(text, u'pages')
                print "existing page with <pages tag:", article_name.encode('utf*8'), "\r",
                if attrib != old_attrib:
                    print "attrib diverges"
                    text = re.sub('(?ms)<pages[\n]*(.*?)/>', ptext, old_text)
                    save_page(article_name, text, opt, old_text = old_text)
            else:
                print "skipping existing page w/o <pages tag:", article_name.encode('utf-8')
        else:
            save_page(article_name, text, opt, old_text = False)

def save_page(title, text, opt, old_text = False, comment = None):
    page = wikipedia.Page(title = title, site = opt.site)
    if old_text:
        if type(old_text) == type(True) and old_text == True:
            if page.exists():
                old_text = page.get()
                if old_text.strip(u'\n') == text.strip(u'\n'):
                    return
            else:
                old_text = u''
        wikipedia.showDiff(old_text, text)
    if not comment:
        comment = u"mise à jour"
    print "saving:", title.encode('utf-8')
    # FIXME: don't save if old_text == text
    page.put_async(text, comment)

# FIXME: share (with a fatal if not found parameter)
def map_letter(letter, title):
    map_l = {
        u'É' : u'E',
        u'Ê' : u'E',
        u'Æ' : u'A',
        u'À' : u'A',
        u'Â' : u'A',
        u'Ç' : u'C',
        u'È' : u'E',
        }
    letter = letter.upper()
    letter = map_l.get(letter, letter)
    if letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        return letter
    print >> sys.stderr, (u"map_letter, unknown letter: " + letter).encode('utf-8'), title.encode('utf-8')
    return u'A'
    #raise RuntimeError(u"map_letter, unknown letter: " + letter)

def split_alpha(articles):
    result = {}
    for title in articles:
        first_letter = map_letter(title[0], title)
        result.setdefault(first_letter, [])
        result[first_letter].append(title)

    for key in result:
        result[key].sort(locale.strcoll)
    return result

def create_summary(pagename, summary, prefix_code, suffix_code, opt):
    print pagename.encode('utf-8')
    page = wikipedia.Page(title = pagename, site = opt.site)
    if page.exists():
        old_text = page.get()
        text = re.sub(u'(?ms)(<section begin="Sommaire"[ ]*/>\n)(.*)(<section end="Sommaire"[ ]*/>\n)', u'\\1' + summary + u'\\3', old_text)
    else:
        text = u'<div style="text-align:left;">\n{{colonnes|nombre=4|\n1=\n'
        text += u'<section begin="Sommaire"/>\n'
        text += summary
        text += u'<section end="Sommaire"/>\n'
        text += u'}}\n</div>'
    return text

def generate_table_alpha(volume_set, tables, opt):
    articles = []
    for key in tables:
        for p in tables[key][1]:
            articles.append(p[2])

    articles = split_alpha(articles)
    prefix = volume_set.prefix_article + u'/' + volume_set.suffix_article_list_alpha
    for key in articles:
        summary = u''
        # FIXME: assume the alpha index is one level below article
        for title in articles[key]:
            summary += u'*[[../../%s/]]' % title + u'\n'
        pagename = prefix + u'/' + key[0]
        text = create_summary(pagename, summary, index_prefix_code,
                              index_suffix_code, opt)
        save_page(pagename, text, opt, old_text = True)

def generate_table_by_volume(volume_set, volume, table, opt):
    pagename = volume_set.prefix_article + u'/' + volume.suffix_article_list
    page = wikipedia.Page(title = pagename, site = opt.site)
    summary = u''
    for p in table:
        # FIXME: assume the volume index is on the same level as article
        summary += u'* [[../%s/]]' % p[2] + u'\n'
    text = create_summary(pagename, summary, index_prefix_code,
                          index_suffix_code, opt)
    save_page(pagename, text, opt, old_text = True)

def check_article_list(volume_set, tables, opt):
    articles = set()
    for key in tables:
        if volume_set.need_article_list & AL_BY_VOLUME:
            articles.add(tables[key][0].suffix_article_list)
        for d in tables[key][1]:
            articles.add(d[2])

    if volume_set.need_article_list & AL_ALPHA:
        for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
            articles.add(volume_set.suffix_article_list_alpha + u'/' + letter)

    if volume_set.prefix_dl:
        for volume in volume_set.volumes:
            if volume.suffix_dl:
                articles.add(volume.suffix_dl)

    # so prefix article is not in the set of error
    articles.add(u'')

    print (volume_set.prefix_article + u'/').encode('utf-8')
    # FIXME: wrong, each volume can have its own base article
    extraParams = { u'gapfilterredir' : u'nonredirects' }
    gen = query_ext.PreloadingPagesStartswith(volume_set.prefix_article,
                                              site = opt.site,
                                              extraParams = extraParams)
    for p in gen:
        part = p[u'title'][len(volume_set.prefix_article) + 1:]
        if not part in articles:
            print p[u'title'].encode('utf-8')
            volume_set.error += u'# Article not in list: [[' + p[u'title'] + u']]\n'

def check_duplicate_section(volume_set, tables, opt):
    seen = {}
    for key in tables:
        for d in tables[key][1]:
            if d[2] in seen:
                volume_set.error += u'# duplicate section, [[' + page_prefix[opt.site.code] + u':' + d[3] + u'/' + unicode(d[1][0]) + u'|' + unicode(d[1][0]) + u']], ' + unicode(d[1][-1]) + u' , ' + unicode(d[2]) + u', [[' + page_prefix[opt.site.code] + u':' + seen[d[2]][3] + u'/' + unicode(seen[d[2]][1][0]) + u'|' + unicode(seen[d[2]][1][0]) + u']], ' + unicode(seen[d[2]][1][-1]) + u'\n'
            else:
                seen[d[2]] = d


def do_dict(volume_set, opt, stats):
    tables = {}

    # we need two pass for the prev/next link at volume boundary
    print "first pass, scanning"
    for volume in volume_set.volumes:
        print volume.prefix_page.encode('utf-8')
        base_pages = page_prefix[opt.site.lang] + u':' + volume.prefix_page
        volume.table = scan_pages(base_pages, volume_set, volume, opt)
        tables[volume.prefix_page] = (volume, volume.table)

    print "second pass, generating"
    for idx_volume, volume in enumerate(volume_set.volumes):
        print volume.prefix_page.encode('utf-8')
        base_pages = page_prefix[opt.site.lang] + u':' + volume.prefix_page
        table = volume.table
        stats.estimed_article += len(table)
        if opt.save_dl and volume_set.prefix_dl and volume.suffix_dl:
            text = generate_dl(table)
            out_dl_name = volume_set.prefix_dl + u'/' + volume.suffix_dl
            dl_prefix = dl_suffix = u''
            if volume_set.need_dl_prefix_code:
                dl_prefix = dl_prefix_code[opt.site.lang] % volume.prefix_page
                dl_suffix = dl_suffix_code[opt.site.lang]
            text = create_summary(out_dl_name, text, dl_prefix, dl_suffix, opt)
            save_page(out_dl_name, text, opt, old_text = True)

        if opt.save_volume_index and volume_set.need_article_list & AL_BY_VOLUME:
            generate_table_by_volume(volume_set, volume, table, opt)

        if opt.save_article and volume_set.prefix_article and opt.save_article:
            base_article = volume_set.prefix_article
            generate_article(table, base_article, volume_set, volume,
                             idx_volume, opt, stats)

    if opt.save_alpha_index and volume_set.need_article_list & AL_ALPHA:
        generate_table_alpha(volume_set, tables, opt)

    check_duplicate_section(volume_set, tables, opt)
    check_article_list(volume_set, tables, opt)

    save_page(volume_set.error_page, volume_set.error, opt, old_text = True)

def do_help():
    print sys.argv[0], '-dict|-save_all|-save_dl|-save_alpha_index|-save_volume_index|-save_article|-do_links:'
    exit(1)

if __name__ == "__main__":
    locale.setlocale(locale.LC_COLLATE, 'fr_FR.utf8')
    statistics = Statistics()
    try:
        options = Options()
        options.site = wikipedia.getSite(code = 'fr', fam = 'wikisource')
        for arg in sys.argv[1:]:
            if arg == '-save_all':
                options.save_dl = True
                options.save_alpha_index = True
                options.save_volume_index = True
                options.save_article = True
                options.do_links = True
            elif arg == '-save_dl':
                options.save_dl = True
            elif arg == '-save_alpha_index':
                options.save_alpha_index = True
            elif arg == '-save_volume_index':
                options.save_volume_index = True
            elif arg == '-save_article':
                options.save_article = True
            elif arg == '-do_links':
                options.do_links = True 
            elif arg == '-help':
                do_help()
            elif arg.startswith('-dict'):
                options.dict = unicode(arg[len('-dict:'):], 'utf-8')
            else:
                do_help()
        if not options.dict:
            print 'no dict selected'
            do_help()

        do_dict(predef_dict[options.dict], options, statistics)

        print statistics
    finally:
        wikipedia.stopme()