Utilisateur:Phe/Python/create ws link.py
# -*- coding: utf-8 -*-
import botpywi
import sys
import re
import wikipedia
import page_cache
import query_ext

try:
    import psyco
    psyco.full()
except ImportError:
    pass

class Options:
    pass

options = Options()
cache = page_cache.page_cache()
index_defs = {}

index_namespace = {
    'en' : u'Index',
    'fr' : u'Livre',
}

template_toc_link = {
    'en' : u'TOC link',
    'fr' : u'lp',
}
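
# The index namespace is u'Index' on en.wikisource and u'Livre' on
# fr.wikisource; {{TOC link}} and {{lp}} are the table-of-contents link
# templates used on each wiki.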

def get_index_def(opt, index_name):
    # return the parsed <pagelist .../> attributes of an index page, cached
    global index_defs
    if not index_defs.has_key(index_name):
        pagename = index_namespace[opt.lang] + u':' + index_name
        text = cache.get(pagename, site = opt.site)
        attribs = botpywi.explode_attrib(text, u'pagelist')
        index_defs[index_name] = attribs
    return index_defs[index_name]

# taken from http://code.activestate.com/recipes/81611/
def to_roman(num):
    ints = (1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1)
    nums = ('M', 'CM', 'D', 'CD', 'C', 'XC', 'L', 'XL', 'X', 'IX', 'V', 'IV', 'I')
    result = ""
    for i in range(len(ints)):
        count = int(num / ints[i])
        result += nums[i] * count
        num -= ints[i] * count
    return result
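
# e.g. to_roman(14) == 'XIV' and to_roman(1834) == 'MDCCCXXXIV'
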
# mimic ProofreadPage mediawiki extension by ThomasV
def page_number(i, args):
    mode = u'normal'
    offset = 0
    for num, param in args:
        match = re.match(u'(\d+)to(\d+)', num)
        if (match and i >= int(match.group(1)) and i <= int(match.group(2))) or (num.isdecimal() and i == int(num)):
            for iparam in param.split(u';'):
                if iparam == u'roman' or iparam == u'highroman' or not iparam.isdecimal():
                    mode = iparam
        if num.isdecimal() and i >= int(num):
            for iparam in param.split(u';'):
                if iparam.isdecimal():
                    offset = int(num) - int(iparam)
    view = i - offset
    if mode == u'roman':
        view = to_roman(view).lower()
    elif mode == u'highroman':
        view = to_roman(view)
    elif mode == u'normal':
        view = unicode(view)
    else:
        view = mode
    return view, mode
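
# Illustration (assuming botpywi.explode_attrib() yields (name, value) pairs,
# as the loop above expects): for a pagelist like <pagelist 1to4=roman 5=1 />,
# i.e. args = [(u'1to4', u'roman'), (u'5', u'1')], we get
#   page_number(3, args) == (u'iii', u'roman')
#   page_number(7, args) == (u'3', u'normal')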

def sub_pagename(prefix, pagename):
    if not prefix:
        return u' '
    if pagename.startswith(prefix):
        return pagename[len(prefix):]
    return None
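
# e.g. (hypothetical page names):
#   sub_pagename(u'Some Book/', u'Some Book/Chapter 1') == u'Chapter 1'
#   sub_pagename(u'Some Book/', u'Other Book/Chapter 1') is None
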
# [[<noinclude>Page:Bulletin de la société géologique de France - 1re série - IV - 1833-1834.djvu/236</noinclude><includeonly>Bulletin de la société géologique de France - 1re série - IV - 1833-1834/Séance du 20 janvier 1834#168</includeonly>|168]]
def change_page_nr(match):
    if match.group(1) == u'-':
        return match.group(1) + match.group(2) + match.group(3)
    val = match.group(2)
    if not options.dict_page.has_key(val):
        print "unable to locate page:", val
        return match.group(1) + match.group(2) + match.group(3)
    if options.use_toc_link:
        toc_link = template_toc_link[options.lang]
        subpage = sub_pagename(options.prefix, options.dict_page[val][0])
        if subpage:
            if options.lang == 'en':
                return match.group(1) + u'{{' + toc_link + u'|' + options.dict_page[val][1] + u'|' + subpage + u'|' + val + u'}}' + match.group(3)
            else:
                return match.group(1) + u'{{' + toc_link + u'|' + subpage + u'|' + options.dict_page[val][1] + u'|' + val + u'}}' + match.group(3)
        else:
            print 'toc link cannot be used, prefix:', options.prefix, 'target pagename:', options.dict_page[val][0]
    # fall back, or toc link was not requested; re-emit the characters around
    # the number (groups 1 and 3) outside the generated link
    link = match.group(1) + u"[[<noinclude>" + options.base_pagename + options.dict_page[val][1] + u"</noinclude><includeonly>" + options.dict_page[val][0]
    if options.anchor:
        link += u'#' + val
    link += u'</includeonly>|' + val + u']]' + match.group(3)
    return link

def change_page_nr_roman(match):
    val = match.group(1)
    if not options.dict_page.has_key(val):
        print "unable to locate page:", val
        return u'{{sc|' + match.group(1) + u'}}' + match.group(2)
    if options.use_toc_link:
        toc_link = template_toc_link[options.lang]
        subpage = sub_pagename(options.prefix, options.dict_page[val][0])
        if subpage:
            if options.lang == 'en':
                return u'{{' + toc_link + u'|' + options.dict_page[val][1] + u'|' + subpage + u'|' + val + u'}}' + match.group(2)
            else:
                return u'{{' + toc_link + u'|' + subpage + u'|' + options.dict_page[val][1] + u'|' + val + u'|{{sc|' + match.group(1) + u'}}' + u'}}' + match.group(2)
        else:
            print 'toc link cannot be used, prefix:', options.prefix, 'target pagename:', options.dict_page[val][0]
    return u"[[<noinclude>" + options.base_pagename + options.dict_page[val][1] + u"</noinclude><includeonly>" + options.dict_page[val][0] + u'#' + val + u'</includeonly>|{{sc|' + val + u'}}]]' + match.group(2)

def load_data_from_prefix(opt):
    if not opt.prefix.endswith(u'/'):
        opt.prefix += u'/'
    cache.purge(opt.pagename, site = opt.site)
    gen = query_ext.PreloadingPagesStartswith(opt.prefix, site = opt.site)
    titles = []
    for p in gen:
        if not p[u'title'].endswith(u'/Texte entier'):
            titles.append(p[u'title'])
    titles = [ wikipedia.Page(site = opt.site, title = x) for x in titles ]
    cache.mass_load(titles)
    for p in titles:
        text = cache.read_page(p.title(), site = p.site())
        # {{Page|.../123|num=45}} maps displayed page number 45 to djvu page 123
        for it in re.finditer(u'{{[Pp]age\|.*?/(\d+)\|num=(.*?)}}', text):
            try:
                val = int(it.group(2))
                page_nr = it.group(1)
                opt.dict_page[unicode(val)] = [ p.title(), page_nr ]
            except ValueError:
                pass
        for it in re.finditer('{{[Pp]ageNum\|.*?\|nb=(.*?)\|from=(.*?)\|to=(.*?)}}', text):
            try:
                start = int(it.group(2))
                end = int(it.group(3))
                delta = int(it.group(1))
                for k in range(start, end + 1):
                    opt.dict_page[unicode(k + delta)] = [ p.title(), unicode(k) ]
            except ValueError:
                raise
        for it in re.finditer(u'(?ms)<pages (.*?)/>', text):
            #print it.group(1)
            start = int(re.sub(u'(?ms).*from=(["]?)(\d+)\\1.*', u"\\2", it.group(1)))
            end = int(re.sub(u'(?ms).*to=(["]?)(\d+)\\1.*', u"\\2", it.group(1)))
            index = re.match(u'(?ms).*index="(.*?)".*', it.group(1)).group(1)
            if opt.index and opt.index != index:
                continue
            args = get_index_def(opt, index)
            #print start, end, p.title()
            for k in range(start, end + 1):
                page_nr = page_number(k, args)
                opt.dict_page[page_nr[0]] = [ p.title(), unicode(k) ]
        # same extraction for the <pages ...></pages> form of the tag
        for it in re.finditer(u'(?ms)<pages (.*?)></pages>', text):
            #print it.group(1)
            start = int(re.sub(u'(?ms).*from=(["]?)(\d+)\\1.*', u"\\2", it.group(1)))
            end = int(re.sub(u'(?ms).*to=(["]?)(\d+)\\1.*', u"\\2", it.group(1)))
            index = re.match(u'(?ms).*index="(.*?)".*', it.group(1)).group(1)
            if opt.index and opt.index != index:
                continue
            args = get_index_def(opt, index)
            #print start, end, p.title()
            for k in range(start, end + 1):
                page_nr = page_number(k, args)
                opt.dict_page[page_nr[0]] = [ p.title(), unicode(k) ]
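
# Example (hypothetical index): a chapter that transcludes
# <pages index="Foo.djvu" from=14 to=25 /> adds one dict_page entry per djvu
# page k in 14..25, mapping the displayed number page_number(k, args)[0] to
# [ u'Book/Chapter', unicode(k) ].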

def load_data_from_index(opt):
    # no dict_page, try to create it through the index
    # 48 [u'Highways and Byways in Sussex/Chichester and the Hills', u'76']
    # we can't do all the work as if the subpages existed, but at least we can
    # create the page number --> djvu page number index.
    # FIXME: this could be used to check that {{Page| is used correctly
    args = get_index_def(opt, opt.index)
    # FIXME: we must get the right djvu page number.
    # walk in reverse order so that an existing page definition overwrites a
    # non-existing one in the dictionary
    for k in range(1200, 0, -1):
        page_nr = page_number(k, args)
        opt.dict_page[page_nr[0]] = [ u'', unicode(k) ]

# A set of strings to replace with a magic string and later back with the
# original, to protect them from substitution.
magic_string = [
    u'TOC row 1-1-1',
    u'TOC row 1-1-1-1',
    u'TOC row 1-out-1',
    u'TOC row 2-1',
    u'TOC row 2-1-1',
    u'TOC row 2out-1',
    u'{{t/c2}}',
    u'Haut2Colonnes',
    u'Fin2Colonnes',
]

# Yes, the u'px' is an ugly hack: the trailing u'px' keeps the digits of the
# marker from matching the page number regexp used in main()
def add_magic_string(text):
    for pos, s in enumerate(magic_string):
        text = text.replace(s, u'__PHE_MAGIC_STRING__' + unicode(pos) + u'px')
    return text

def remove_magic_string(text):
    for pos, s in enumerate(magic_string):
        text = text.replace(u'__PHE_MAGIC_STRING__' + unicode(pos) + u'px', s)
    return text
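
# Round-trip sketch: the digits in a template name like u'TOC row 2-1' would
# otherwise be rewritten by change_page_nr, so main() masks them before the
# substitutions and unmasks them afterwards:
#   remove_magic_string(add_magic_string(text)) == text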

def main(opt):
    opt.dict_page = {}
    if opt.prefix:
        load_data_from_prefix(opt)
    if len(opt.dict_page) == 0:
        load_data_from_index(opt)
    #if len(index_defs) > 1:
    #    raise "Can't support more than one index definition"
    text = cache.read_page(opt.pagename, site = opt.site)
    if opt.use_toc_link:
        # strip any toc link templates left by a previous run, keeping the
        # bare page number, so the page can be processed again
        toc_link = template_toc_link[opt.lang]
        text = re.sub(u'{{\s*' + toc_link + u'\s*\|[^|]*?\|[^|]*?\|[^|]*?\|({{sc\|[xlvicm]*)}}', u'\\1', text)
        text = re.sub(u'{{\s*' + toc_link + u'\s*\|[^|]*?\|[^|]*?\|([^}]*?)}}', u'\\1', text)
    # strip plain page links left by a previous run
    text = re.sub(u"\[\[[^[|]*?\|(\d+)\]\]", "\\1", text)
    text = re.sub(u"\[\[[^[|]*?\|({{sc\|[xlvicm]+}})\]\]", "\\1", text)
    #text = re.sub(u'(?ms)^<noinclude>.*?</noinclude>', u'', text)
    #text = re.sub(u'(?ms)<noinclude>.*?</noinclude>$', u'', text)
    text = re.sub(u'(?ms)<noinclude>.*?</noinclude>(.*)<noinclude>.*?</noinclude>$', u'\\1', text)
    text = add_magic_string(text)
    # FIXME: this regexp is specific to en:
    text = re.sub(u'([^0-9])(\d+?)(?!px|em|th|st|rd|nd)([^%"0-9{]|$)', change_page_nr, text)
    # FIXME: this regexp is specific to fr:
    text = re.sub(u'{{sc\|([xlvicm]+)}}([^{])', change_page_nr_roman, text)
    text = remove_magic_string(text)
    #wikipedia.showDiff(cache.read_page(opt.pagename, site = opt.site), text)
    #text = re.sub(u'(?ms)^<noinclude>.*?</noinclude>', u'', text)
    #text = re.sub(u'(?ms)<noinclude>.*?</noinclude>$', u'', text)
    print text

if __name__ == "__main__":
    try:
        options.lang = None
        options.index = None
        options.prefix = None
        options.anchor = True
        for arg in sys.argv[1:]:
            arg = unicode(arg, 'utf-8')
            if arg.startswith('-prefix:'):
                options.prefix = arg[len(u'-prefix:'):]
            elif arg.startswith('-lang:'):
                options.lang = arg[len(u'-lang:'):]
            elif arg.startswith('-no_anchor'):
                options.anchor = False
            elif arg.startswith('-index:'):
                options.index = arg[len(u'-index:'):]
            elif arg == u'-help':
                print sys.argv[0], "-lang:lang_code -index:index_name -prefix:prefix_page_name -no_anchor page_to_handle"
                sys.exit(1)
            else:
                options.pagename = arg
        if options.index is None and options.prefix is None:
            print sys.argv[0], "either -prefix: or -index: must be given"
            sys.exit(1)
        options.use_toc_link = template_toc_link.has_key(options.lang)
        # base_pagename is the page name with its last component removed
        components = options.pagename.split(u'/')
        options.base_pagename = u"/".join(components[:len(components)-1]) + u'/'
        options.site = wikipedia.getSite(code = options.lang, fam = 'wikisource')
        main(options)
    finally:
        wikipedia.stopme()
        cache.save()
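
# Example invocation (hypothetical page names):
#   python "create ws link.py" -lang:fr -prefix:"Some Book" "Some Book/Table"
# The rewritten table-of-contents page is printed to stdout; nothing is
# written back to the wiki.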