# Source wiki page: Utilisateur:Phe/Scripts/create dl.py
# (wiki export header; "Apparence" was interface chrome, not content)
# -*- coding: utf-8 -*-
import functools
import locale
import re
import sys

import botpywi
import query_ext
import wikipedia
AL_BY_VOLUME = 1
AL_ALPHA = 2
def transform_link(match, template, blacklist):
if not match.group(2) in blacklist:
text = match.group(1) + u' {{' + template + u'|' + match.group(2).upper() + u'|{{sc|' + match.group(2) + u'}}}}'
else:
text = match.group(1) + u' {{sc|' + match.group(2) + u'}}'
if len(match.groups()) > 3 and match.group(3):
for it in re.finditer(u', {{sc\|([^}]*?)}}', match.group(3)):
text += u', {{' + template + u'|' + it.group(1).upper() + u'|{{sc|' + it.group(1) + u'}}}}'
return text
# u"\\1 {{Tr6L|\\2|{{sc|\\2}}}}"
def transform_trevoux_link(match):
blacklist = set( [
u'Le P. Mabillon', u'Maty', u'Ducange', u'Durand', u'Pasquier',
u'Lymnæus'
] )
return transform_link(match, u'Tr6L', blacklist)
def transform_michaud_link(match):
return transform_link(match, u'Mich2L', set())
Trevoux_1771_data = {
u'prefix dl' : u'Utilisateur:Phe/Dictionnaire et encyclopédie/Trévoux',
u'prefix article' : u'Utilisateur:Phe/Dictionnaire et encyclopédie/Trévoux',
u'need dl prefix code' : True,
u'need article list' : AL_BY_VOLUME | AL_ALPHA,
u'suffix article list alpha' : u'Index alphabétique',
u'cat article' : u'Articles du Dictionnaire de Trévoux, 6e édition',
u'create link' : { u'regexp' : [ u"(Voyez|''Voyez'') {{sc\|([^}]*?)}}((, {{sc\|([^}]*?)}})*)", transform_trevoux_link ], u'skip regexp' : u'{{[Tt]r6l[ ]*\|' },
u'error page' : u'Discussion utilisateur:Phe/Dictionnaire et encyclopédie/Trévoux',
}
Encyclo_diderot_1_data = {
u'prefix dl' : u'L’Encyclopédie/1re édition',
u'prefix article' : u'L’Encyclopédie/1re édition',
u'need dl prefix code' : True,
u'need article list' : AL_BY_VOLUME | AL_ALPHA,
u'suffix article list alpha' : u'Index alphabétique',
u'cat article' : u'L’Encyclopédie, 1re édition',
u'error page' : u'Discussion:L’Encyclopédie/1re édition/Erreur',
#u'cat article regexp' : [ (u'([Vv]ille|riviere|île\b)', u'Géographie'), (u'Norwege', u'Norvège') , (u'\bHist\.', u'Histoire'), (u'Géog\.', u'Géographie'), (u'Art méchaniq', u'Mécanique'), (u'\barbres\b', u'Botanique'), (u'Jurisp', u'Jurisprudence'), (u'Myth', u'Mythologie'), (u'Arith\.', u'Mathématiques'), (u'Comm\.', u'Commerce'), ('Theol\.', u'Théologie'), (u'Œconomie.', u'Économie'), (u'maladie', u'Médecine'), (u'Mathématiciens', u'Mathématiques'), (u'Botan', u'Botanique'), (u'Suede', u'Suède') ],
#u'cat article regexp self' : [ u'(France|Allemagne|Italie|Suisse|Angleterre|Espagne|Afrique|Musique|Astronomie|Chimie|Marine|Physiologie|Philosophie)', ],
u'cat article module' : 'Diderot_cat',
u'if not cated' : [ u'à revoir' ],
u'signature' : [ ( u"\(''A''\)", u"Antoine-Gaspard Boucher d’Argis|Boucher d’Argis"), (u"\(''a''\)", u"Nicolas Lenglet Du Fresnoy|Du Fresnoy"), (u"\(''B''\)", u"Louis de Cahusac|Cahusac"), (u"\(''C''\)", u"Jean Pestré|Pestré"), (u"\(''c''\)", u"Louis Jean-Marie Daubenton|Louis Daubenton"), (u"\(''D''\)", u"Louis-Jacques Goussier|Goussier"), (u"\(''d''\)", u"Arnulphe d’Aumont|Aumont"), (u"\(''E''\)", u"Jean-Baptiste de La Chapelle|La Chapelle"), (u"\(''e''\)", u"Claude Bourgelat|Bourgelat"), (u"\(''F''\)", u"César Chesneau Dumarsais|Dumarsais"), (u"\(''f''\)", u"Jacques-François de Villiers|Villiers"), (u"\(''G''\)", u"Edme-François Mallet|Mallet"), (u"\(''g''\)", u"Paul-Joseph Barthez|Barthez"), (u"\(''H''\)", u"François-Vincent Toussaint|Toussaint"), (u"\(''h''\)", u"André Morellet|Morellet"), (u"\(''I''\)", u"Pierre Daubenton|Daubenton"), (u"\(''K''\)", u"Antoine Joseph Dezallier d’Argenville|Argenville"), (u"\(''L''\)", u"Pierre Tarin|Tarin"), (u"\(''M''\)", u"Paul-Jacques Malouin|Malouin"), (u"\(''m''\)", u"Jean-Joseph Menuret|Menuret"), (u"\(''N''\)", u"Urbain de Vandenesse|Vandenesse"), (u"\(''O''\)", u"Jean le Rond d’Alembert|d’Alembert"), (u"\(''P''\)", u"Jacques-François Blondel|Blondel"), (u"\(''Q''\)", u"Guillaume Le Blond|Le Blond"), (u"\(''R''\)", u"Paul Landois|Landois"), (u"\(''S''\)", u"Jean-Jacques Rousseau|Rousseau"), (u"\(''T''\)", u"Charles Le Roy|Le Roy"), (u"\(''V''\)", u"Marc-Antoine Eidous|Eidous"), (u"\(''X''\)", u"Claude Yvon|Yvon"), (u"\(''Y''\)", u"Antoine Louis|Louis"), (u"\(''Z''\)", u"Jacques-Nicolas Bellin|Bellin"), (u"\(''C\. D\. J\.''\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''D\. J\.''\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''C\. D\. J''\.\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''D\. J''\.\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''D\. 
J\.''\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''Le chevalier {{sc\|de Jaucourt}}''\.\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''Le Chevalier {{sc\|de Jaucourt}}''\.\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''D\.J''\.\)", u"Louis de Jaucourt|Jaucourt"), (u"\({{sc\|de Jaucourt}}\)", u"Louis de Jaucourt|Jaucourt"), (u"{{sc\|de Jaucourt}}", u"Louis de Jaucourt|Jaucourt"),(u"{{sc\|de Jaucourt\.}}", u"Louis de Jaucourt|Jaucourt"),(u"\(''D\. J''\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''D J''\.\)", u"Louis de Jaucourt|Jaucourt"), (u"\({{sc\|de Jaucourt\.}}\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''Le Chevalier ''\.\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''C\. D\. J\.''\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''\D. J''\.\)", u"Louis de Jaucourt|Jaucourt"), (u"\(''E\. R\. M\.''\)", u"[[Auteur:Jacques-Philippe-Augustin Douchet|Douchet]] & [[Auteur:Nicolas Beauzée|Beauzée]]"), (u"<nowiki>\*</nowiki>", u"Denis Diderot|Diderot"), (u"{{sc\|\*[^}]*}}", u"Denis Diderot|Diderot"), (u"\(—\)", u"Paul Henri Thiry d’Holbach|Baron d’Holbach"), (u"\(''b''\)", u"Gabriel François Venel|Venel") ],
}
Michaud_1843_data = {
u'prefix dl' : u'Biographie universelle ancienne et moderne/2e éd., 1843',
U'prefix article' : u'Biographie universelle ancienne et moderne/2e éd., 1843',
u'need dl prefix code' : True,
u'need article list' : AL_BY_VOLUME | AL_ALPHA,
u'suffix article list alpha' : u'Index alphabétique',
u'cat article' : u'Articles de la Biographie universelle, Michaud, 2e édition',
u'create link' : { u'regexp' : [ u"(voy\.|''voy\.''|''Voy\.''|Voy\.|Voyez|''Voyez'') {{sc\|([^}]*?)}}((, {{sc\|([^}]*?)}})*)", transform_michaud_link ], u'skip regexp' : u'{{[Mm]ich2L[ ]*\|' },
u'error page' : u'Discussion:Biographie universelle ancienne et moderne/2e éd., 1843/Erreurs',
}
Celestin_port_data = {
U'prefix article' : u'Dictionnaire historique, géographique et biographique du Maine-et-Loire',
u'need dl prefix code' : True,
u'need article list' : AL_BY_VOLUME | AL_ALPHA,
u'suffix article list alpha' : u'Index alphabétique',
u'cat article' : u'Articles du Célestin Port',
u'create link' : { u'regexp' : [ u"(voy\.|''voy\.''|''Voy\.''|Voy\.|Voyez|''Voyez'') {{sc\|([^}]*?)}}((, {{sc\|([^}]*?)}})*)", transform_michaud_link ], u'skip regexp' : u'{{[Mm]ich2L[ ]*\|' },
u'error page' : u'Discussion:Dictionnaire historique, géographique et biographique du Maine-et-Loire/Erreurs',
}
page_prefix = {
u'fr' : u'Page',
}
dl_prefix_code = {}
dl_prefix_code[u'fr'] = u"""{| style="width:100%%;"
|
<div id="dynamic_links" class="nopopups" title="%s" style="height:800px;overflow:auto;margin-left:3em;text-align:left;" >
"""
dl_suffix_code = {}
dl_suffix_code[u'fr'] = u"""</div>
|
<div style="padding:10px;height:800px;overflow:auto;">
<div id="dict_entry" class=text "/>
</div>
|}"""
index_prefix_code = u'<div style="text-align:left;">\n{{colonnes|nombre=4|\n1=\n'
index_suffix_code = u'}}\n</div>'
class VolumeSet(object):
def __init__(self, descr):
self.prefix_dl = descr[u'prefix dl']
self.prefix_article = descr[u'prefix article']
self.volumes = []
self.need_dl_prefix_code = descr[u'need dl prefix code']
self.need_article_list = descr[u'need article list']
self.suffix_article_list_alpha = descr[u'suffix article list alpha']
self.cat_article = descr[u'cat article']
self.create_link = descr.get(u'create link', u'')
self.error_page = descr[u'error page']
self.cat_article_regexp = descr.get(u'cat article regexp', [])
self.cat_article_regexp_self = descr.get(u'cat article regexp self', [])
self.cat_article_module = descr.get(u'cat article module', '')
self.cat_article_module_re = {}
if self.cat_article_module:
module = __import__(self.cat_article_module)
for key in module.cat:
result = []
for pat in module.cat[key]:
result.append(botpywi.quote_regex(pat))
regexp = u"|".join(result)
regexp = regexp.replace(u' ', u'\\s')
regexp = regexp.replace(u"'", u"'*")
self.cat_article_module_re[key] = re.compile(regexp)
self.signature = []
signature = descr.get(u'signature', [])
for sign, author in signature:
self.signature.append( (re.compile(sign), author) )
self.if_not_cated = descr.get(u'if not cated')
self.error = u''
class Volume(object):
def __init__(self, descr):
self.suffix_dl = descr[u'suffix dl']
self.suffix_article_list = descr[u'suffix article list']
self.prefix_page = descr[u'prefix page']
self.create_link = descr.get(u'create link', u'')
self.pages_text = {}
class Trevoux_1771(VolumeSet):
def __init__(self):
VolumeSet.__init__(self, Trevoux_1771_data)
for i in [ (u'I', 1) ]: #, (u'II', 2), (u'III', 3), (u'IV', 3), (u'V', 4), (u'VI', 5), (u'VII', 6), (u'VIII', 8)]:
descr_volume = {
u'suffix dl' : u'Tome %d' % i[1],
u'suffix article list' : u'Index tome %s' % i[0],
u'prefix page' : u'Dictionnaire de Trévoux, 1771, %s.djvu' % i[0],
}
self.volumes.append(Volume(descr_volume))
class Encyclo_diderot_1(VolumeSet):
def __init__(self):
VolumeSet.__init__(self, Encyclo_diderot_1_data)
for i in range(1, 18):
basename = u'Diderot - Encyclopedie 1ere edition tome %d.djvu' % i
descr_volume = {
u'suffix dl' : u'Volume %d' % i,
u'suffix article list' : u'Index tome %d' % i,
u'prefix page' : basename,
}
self.volumes.append(Volume(descr_volume))
class Michaud_1843(VolumeSet):
def __init__(self):
VolumeSet.__init__(self, Michaud_1843_data)
for i in range(1, 3):
basename = u'Michaud - Biographie universelle ancienne et moderne - 1843 - Tome %d.djvu' % i
descr_volume = {
u'suffix dl' : 'Tome %d' % i,
u'suffix article list' : u'Index tome %d' % i,
u'prefix page' : basename,
}
self.volumes.append(Volume(descr_volume))
predef_dict = {
u'Trévoux 1771' : Trevoux_1771(),
u'Encyclopédie Diderot 1' : Encyclo_diderot_1(),
u'Michaud 1843' : Michaud_1843(),
}
class Statistics:
def __init__(self):
self.estimed_article = 0
self.total_article = 0
self.cated_article = 0
self.authored_article = 0
self.total_cat = 0
self.total_authors = 0
def ratio(self, total, b):
if total == 0 or b == 0:
return 0.0
return 1.0 - ((float(total) - b) / total)
def __str__(self):
text = u'articles: estimed, total, total cat, total authors: %d %d %d %d\n' % (self.estimed_article, self.total_article, self.total_cat, self.total_authors)
text += u"cated article: %f\n" % self.ratio(self.total_article, self.cated_article)
text += u"authored article: %f" % self.ratio(self.total_article, self.authored_article)
return text
class Options:
def __init__(self):
self.save_dl = False
self.save_alpha_index = False
self.save_volume_index = False
self.save_article = False
self.do_links = False
def compare_title(a, b):
first = int(re.match(u'.*/(\d+)', a[2]).group(1))
second = int(re.match(u'.*/(\d+)', b[2]).group(1))
return first - second
def fill_dict_section(volume_set, volume, text, page_nr, dict_section, order, opt):
for it in re.finditer(u'<section begin=(["]?)(.*?)\\1[ ]*/>', text):
if not dict_section.has_key(it.group(2)):
dict_section[it.group(2)] = [ order ]
order += 1
elif order - 1 != dict_section[it.group(2)][0]:
# FIXME: not sufficient if we have twice the same section
# name consecutively
volume_set.error += u'# duplicate section [[' + page_prefix[opt.site.code] + u':' + volume.prefix_page + u'/' + unicode(page_nr) + u'|' + unicode(page_nr) + u']], ' + unicode(dict_section[it.group(2)][1]) + u', ' + unicode(dict_section[it.group(2)][-1]) + u', ' + it.group(2) + u'\n'
dict_section[it.group(2)].append(page_nr)
return order
def create_link(page, volume_set, volume, opt):
cl = (volume.create_link or volume_set.create_link) and opt.do_links
if cl:
regexp = cl[u'regexp']
content = page[u'revisions'][0]['*']
if not re.search(cl[u'skip regexp'], content):
new_content = re.sub(regexp[0], regexp[1], content)
if new_content.strip(u'\n') != content.strip(u'\n'):
save_page(page[u'title'], new_content, opt, old_text = content,
comment = u"mise à jour des liens")
def filter_table_entry(entry):
return entry[2].endswith(u'(nobot)')
def scan_pages(base_page, volume_set, volume, opt):
gen = query_ext.PreloadingPagesStartswith(base_page, site = opt.site)
page_list = []
for p in query_ext.PreloadingContents(gen, site = opt.site):
page_nr = int(re.match(u'.*/(\d+)', p[u'title']).group(1))
page_list.append((page_nr, p[u'revisions'][0]['*'], p[u'title']) )
create_link(p, volume_set, volume, opt)
volume.pages_text[page_nr] = p[u'revisions'][0]['*']
page_list.sort(compare_title)
order = 0
dict_section = {}
for p in page_list:
order = fill_dict_section(volume_set, volume, p[1], p[0],
dict_section, order, opt)
table = [ (dict_section[key][0], dict_section[key][1:], key, volume.prefix_page) for key in dict_section ]
table = [ x for x in table if not filter_table_entry(x) ]
table.sort()
# format : (order, [page numbers], title, prefix_page)
return table
def generate_dl(table):
text = u''
for p in table:
text += u'*[[DL#' + unicode(p[1][0]) + u':' + unicode(p[1][-1]) + u'|' + p[2] + u']]\n'
return text
def get_categories(volume_set, pages_text, stats):
cats = set()
for regexp, replace in volume_set.cat_article_regexp:
if re.search(regexp, pages_text):
cats.add(replace)
for regexp in volume_set.cat_article_regexp_self:
match = re.search(regexp, pages_text)
if match:
cats.add(match.group(1))
for key in volume_set.cat_article_module_re:
if volume_set.cat_article_module_re[key].search(pages_text):
cats.add(key)
cats = [x for x in cats]
cats.sort(locale.strcoll)
text = u''
for c in cats:
text += u'\n[[Catégorie:Articles de dictionnaire - ' + c + u']]'
if len(cats):
stats.cated_article += 1
stats.total_cat += len(cats)
else:
for c in volume_set.if_not_cated:
text += u'\n[[Catégorie:Articles de dictionnaire - ' + c + u']]'
return text
def get_authors(volume_set, text, stats):
authors = set()
for sign, author in volume_set.signature:
if sign.search(text):
if u'[' in author:
authors.add(author)
elif u'|' in author:
authors.add(u'[[Auteur:' + author + u']]')
else:
authors.add(u'[[Auteur:' + author + u'|' + author + u']]')
if len(authors):
stats.authored_article += 1
stats.total_authors += len(authors)
return u', '.join(authors)
def has_ref(text):
return re.search(u'<ref>', text)
def generate_article(table, basename, volume_set, volume,
idx_volume, opt, stats):
pagenames = []
for i, p in enumerate(table):
pagenames.append({u'title' : basename + u'/' + p[2]})
pages = {}
for p in query_ext.PreloadingContents(pagenames, site = opt.site):
pages[p[u'title']] = p
for i, p in enumerate(table):
#if i > 10 or p[2] in [ u'ABACE, ABÉCE', u'ABAQUE' ]:
if idx_volume > 3: # or i <= 2000:
continue
stats.total_article += 1
params = {}
params[u'index'] = volume.prefix_page
params[u'from'] = p[1][0]
params[u'to'] = p[1][-1]
params[u'from section'] = p[2]
params[u'to section'] = p[2]
params[u'prev'] = ''
if i:
params[u'prev'] = u'[[../%s/]]' % table[i-1][2]
else:
if idx_volume:
prev_volume = volume_set.volumes[idx_volume - 1]
if len(prev_volume.table):
prev_entry = prev_volume.table[len(prev_volume.table) - 1]
params[u'prev'] = u'[[../%s/]]' % prev_entry[2]
params[u'next'] = ''
if i < len(table) - 1:
params[u'next'] = u'[[../%s/]]' % table[i+1][2]
else:
if idx_volume < len(volume_set.volumes) - 1:
next_volume = volume_set.volumes[idx_volume + 1]
if len(next_volume.table):
next_entry = next_volume.table[0]
params[u'next'] = u'[[../%s/]]' % next_entry[2]
ptext = u'<pages index="%(index)s" from=%(from)d to=%(to)d' % params
ptext += u' fromsection="%(from section)s"' % params
ptext += u' tosection="%(to section)s"' % params
ptext += u' header=1 prev="%(prev)s" next="%(next)s"' % params
pages_text = u''
sect = botpywi.quote_regex(p[2])
sect_begin = u'<section begin=(["]?)%s\\1[ ]*/>' % sect
sect_end = u'<section end=(["]?)%s\\3[ ]*/>' % sect
for i in range(params[u'from'], params[u'to'] + 1):
temp = volume.pages_text[i]
if i == params[u'from'] or i == params[u'to']:
temp = re.sub(u'(?ms).*%s(.*)%s.*' % (sect_begin, sect_end),
u'\\2', temp)
pages_text += temp
article_name = basename + u'/' + p[2]
old_attrib = []
if not pages[article_name].has_key(u'missing'):
old_text = pages[article_name][u'revisions'][0][u'*']
if re.match(u'(?ms).*<pages[ \n](.*?)/>.*', old_text):
old_attrib = botpywi.explode_attrib(old_text, u'pages')
authors = get_authors(volume_set, pages_text, stats)
for name, attr in old_attrib:
if name == u'auteur':
authors = attr
break
if authors:
ptext += u' auteur="%s"' % authors
ptext += u' />'
text = ptext
if has_ref(pages_text):
text += u'\n----\n<references />'
if volume_set.cat_article:
#text += u'\n\n{{DEFAULTSORT:%s}}' % p[2]
text += u'\n\n[[Catégorie:' + volume_set.cat_article + u']]'
text += get_categories(volume_set, pages_text, stats)
if not pages.has_key(article_name):
volume_set.error += u'# section invalide : "' + p[2] + '"\n'
continue
if not pages[article_name].has_key(u'missing'):
old_text = pages[article_name][u'revisions'][0][u'*']
if re.match(u'(?ms).*<pages[ \n](.*?)/>.*', old_text):
attrib = botpywi.explode_attrib(text, u'pages')
print "existing page with <pages tag:", article_name.encode('utf*8'), "\r",
if attrib != old_attrib:
print "attrib diverges"
text = re.sub('(?ms)<pages[\n]*(.*?)/>', ptext, old_text)
save_page(article_name, text, opt, old_text = old_text)
else:
print "skipping existing page w/o <pages tag:", article_name.encode('utf-8')
else:
save_page(article_name, text, opt, old_text = False)
def save_page(title, text, opt, old_text = False, comment = None):
page = wikipedia.Page(title = title, site = opt.site)
if old_text:
if type(old_text) == type(True) and old_text == True:
if page.exists():
old_text = page.get()
if old_text.strip(u'\n') == text.strip(u'\n'):
return
else:
old_text = u''
wikipedia.showDiff(old_text, text)
if not comment:
comment = u"mise à jour"
print "saving:", title.encode('utf-8')
# FIXME: don't save if old_text == text
page.put_async(text, comment)
# FIXME: share (with a fatal if not found parameter)
def map_letter(letter, title):
map_l = {
u'É' : u'E',
u'Ê' : u'E',
u'Æ' : u'A',
u'À' : u'A',
u'Â' : u'A',
u'Ç' : u'C',
u'È' : u'E',
}
letter = letter.upper()
letter = map_l.get(letter, letter)
if letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
return letter
print >> sys.stderr, (u"map_letter, unknown letter: " + letter).encode('utf-8'), title.encode('utf-8')
return u'A'
#raise RuntimeError(u"map_letter, unknown letter: " + letter)
def split_alpha(articles):
result = {}
for title in articles:
first_letter = map_letter(title[0], title)
result.setdefault(first_letter, [])
result[first_letter].append(title)
for key in result:
result[key].sort(locale.strcoll)
return result
def create_summary(pagename, summary, prefix_code, suffix_code, opt):
print pagename.encode('utf-8')
page = wikipedia.Page(title = pagename, site = opt.site)
if page.exists():
old_text = page.get()
text = re.sub(u'(?ms)(<section begin="Sommaire"[ ]*/>\n)(.*)(<section end="Sommaire"[ ]*/>\n)', u'\\1' + summary + u'\\3', old_text)
else:
text = u'<div style="text-align:left;">\n{{colonnes|nombre=4|\n1=\n'
text += u'<section begin="Sommaire"/>\n'
text += summary
text += u'<section end="Sommaire"/>\n'
text += u'}}\n</div>'
return text
def generate_table_alpha(volume_set, tables, opt):
articles = []
for key in tables:
for p in tables[key][1]:
articles.append(p[2])
articles = split_alpha(articles)
prefix = volume_set.prefix_article + u'/' + volume_set.suffix_article_list_alpha
for key in articles:
summary = u''
# FIXME: assume the alpha index is one level below article
for title in articles[key]:
summary += u'*[[../../%s/]]' % title + u'\n'
pagename = prefix + u'/' + key[0]
text = create_summary(pagename, summary, index_prefix_code,
index_suffix_code, opt)
save_page(pagename, text, opt, old_text = True)
def generate_table_by_volume(volume_set, volume, table, opt):
pagename = volume_set.prefix_article + u'/' + volume.suffix_article_list
page = wikipedia.Page(title = pagename, site = opt.site)
summary = u''
for p in table:
# FIXME: assume the volume index is on the same level as article
summary += u'* [[../%s/]]' % p[2] + u'\n'
text = create_summary(pagename, summary, index_prefix_code,
index_suffix_code, opt)
save_page(pagename, text, opt, old_text = True)
def check_article_list(volume_set, tables, opt):
articles = set()
for key in tables:
if volume_set.need_article_list & AL_BY_VOLUME:
articles.add(tables[key][0].suffix_article_list)
for d in tables[key][1]:
articles.add(d[2])
if volume_set.need_article_list & AL_ALPHA:
for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
articles.add(volume_set.suffix_article_list_alpha + u'/' + letter)
if volume_set.prefix_dl:
for volume in volume_set.volumes:
if volume.suffix_dl:
articles.add(volume.suffix_dl)
# so prefix article is not in the set of error
articles.add(u'')
print (volume_set.prefix_article + u'/').encode('utf-8')
# FIXME: wrong, each volume can have its own base article
extraParams = { u'gapfilterredir' : u'nonredirects' }
gen = query_ext.PreloadingPagesStartswith(volume_set.prefix_article,
site = opt.site,
extraParams = extraParams)
for p in gen:
part = p[u'title'][len(volume_set.prefix_article) + 1:]
if not part in articles:
print p[u'title'].encode('utf-8')
volume_set.error += u'# Article not in list: [[' + p[u'title'] + u']]\n'
def check_duplicate_section(volume_set, tables, opt):
seen = {}
for key in tables:
for d in tables[key][1]:
if d[2] in seen:
volume_set.error += u'# duplicate section, [[' + page_prefix[opt.site.code] + u':' + d[3] + u'/' + unicode(d[1][0]) + u'|' + unicode(d[1][0]) + u']], ' + unicode(d[1][-1]) + u' , ' + unicode(d[2]) + u', [[' + page_prefix[opt.site.code] + u':' + seen[d[2]][3] + u'/' + unicode(seen[d[2]][1][0]) + u'|' + unicode(seen[d[2]][1][0]) + u']], ' + unicode(seen[d[2]][1][-1]) + u'\n'
else:
seen[d[2]] = d
def do_dict(volume_set, opt, stats):
tables = {}
# we need two pass for the prev/next link at volume boundary
print "first pass, scanning"
for volume in volume_set.volumes:
print volume.prefix_page.encode('utf-8')
base_pages = page_prefix[opt.site.lang] + u':' + volume.prefix_page
volume.table = scan_pages(base_pages, volume_set, volume, opt)
tables[volume.prefix_page] = (volume, volume.table)
print "second pass, generating"
for idx_volume, volume in enumerate(volume_set.volumes):
print volume.prefix_page.encode('utf-8')
base_pages = page_prefix[opt.site.lang] + u':' + volume.prefix_page
table = volume.table
stats.estimed_article += len(table)
if opt.save_dl and volume_set.prefix_dl and volume.suffix_dl:
text = generate_dl(table)
out_dl_name = volume_set.prefix_dl + u'/' + volume.suffix_dl
dl_prefix = dl_suffix = u''
if volume_set.need_dl_prefix_code:
dl_prefix = dl_prefix_code[opt.site.lang] % volume.prefix_page
dl_suffix = dl_suffix_code[opt.site.lang]
text = create_summary(out_dl_name, text, dl_prefix, dl_suffix, opt)
save_page(out_dl_name, text, opt, old_text = True)
if opt.save_volume_index and volume_set.need_article_list & AL_BY_VOLUME:
generate_table_by_volume(volume_set, volume, table, opt)
if opt.save_article and volume_set.prefix_article and opt.save_article:
base_article = volume_set.prefix_article
generate_article(table, base_article, volume_set, volume,
idx_volume, opt, stats)
if opt.save_alpha_index and volume_set.need_article_list & AL_ALPHA:
generate_table_alpha(volume_set, tables, opt)
check_duplicate_section(volume_set, tables, opt)
check_article_list(volume_set, tables, opt)
save_page(volume_set.error_page, volume_set.error, opt, old_text = True)
def do_help():
print sys.argv[0], '-dict|-save_all|-save_dl|-save_alpha_index|-save_volume_index|-save_article|-do_links:'
exit(1)
if __name__ == "__main__":
locale.setlocale(locale.LC_COLLATE, 'fr_FR.utf8')
statistics = Statistics()
try:
options = Options()
options.site = wikipedia.getSite(code = 'fr', fam = 'wikisource')
for arg in sys.argv[1:]:
if arg == '-save_all':
options.save_dl = True
options.save_alpha_index = True
options.save_volume_index = True
options.save_article = True
options.do_links = True
elif arg == '-save_dl':
options.save_dl = True
elif arg == '-save_alpha_index':
options.save_alpha_index = True
elif arg == '-save_volume_index':
options.save_volume_index = True
elif arg == '-save_article':
options.save_article = True
elif arg == '-do_links':
options.do_links = True
elif arg == '-help':
do_help()
elif arg.startswith('-dict'):
options.dict = unicode(arg[len('-dict:'):], 'utf-8')
else:
do_help()
if not options.dict:
print 'no dict selected'
do_help()
do_dict(predef_dict[options.dict], options, statistics)
print statistics
finally:
wikipedia.stopme()