# Utilisateur:Phe/Python/typo page.py
# < Utilisateur:Phe | Python
# -*- coding: utf-8 -*-
import botpywi
import sys
import re
import wikipedia
import page_cache
import query_ext
# Opt into the psyco JIT when it is installed; run unaccelerated otherwise.
try:
    import psyco
    psyco.full()
except ImportError:
    pass

# Shared page cache: main() batch-loads pages through it and writes
# accepted edits back via cache.write_page(); flushed by cache.save().
cache = page_cache.page_cache()
# many things could be merged in one regexp, but I want to be able to
# test them separately
def treat_text_part(text):
    """Apply French typography fixes to one unprotected text fragment.

    Each rule is a separate substitution on purpose, so the rules can be
    tested independently (see the module comment above).
    """
    # Typographic apostrophe.  Hard to do perfectly once the text is
    # already wikified, so only convert between two non-quote characters.
    text = re.sub(u"([^'])['‘]([^'])", u'\\1’\\2', text)

    # Dashes at line start become em dashes (dialogue or a clause set off
    # by a dash).  NOTE(review): risky — a set-off clause may start a
    # line, though using an em dash there is arguably not wrong.
    # The en dash and two rule/bar characters map directly:
    text = re.sub(u"\n[–─―]", u"\n—", text)
    # "--" at line start, but not "----" (horizontal-rule markup)...
    text = re.sub(u"\n--(?!--)", u"\n—", text)
    # ...then a lone "-" at line start, again sparing "----".
    text = re.sub(u"\n-(?!---)", u"\n—", text)

    # An em dash must be followed by a space.
    text = re.sub(u"—([^ ])", u"— \\1", text)
    # No space before a comma or a full stop.
    text = re.sub(u'[ ]([,.])', u'\\1', text)
    # Three or more dots become a single ellipsis.
    text = re.sub(u'\.\.\.[.]*', u'…', text)
    # Space before ;:!? and after « (French typography).
    text = re.sub(u'([^ \s])([;:!?])', u'\\1 \\2', text)
    text = re.sub(u'([«;:!?])([^ \s…])', u'\\1 \\2', text)
    # Kept separate from the previous pattern, otherwise the matches for
    # "word!»" would overlap.
    text = re.sub(u'([^ \s])([»])', u'\\1 \\2', text)
    # Work around buggy source text: drop the space we may have inserted
    # right before a newline or a <br />.
    text = re.sub(u'([;:!?»]) \n', u'\\1\n', text)
    text = re.sub(u'([;:!?»]) <br />', u'\\1<br />', text)
    return text
def treat_text(old_text):
    """Run treat_text_part() over old_text, skipping protected spans.

    Protected spans (left byte-identical): opening <div>/<span> tags,
    the &nbsp; entity, an existing em dash, HTML comments, leading wiki
    indent/definition markers (:, ;), and link targets ([[...).
    """
    # FIXME: avoid all html entity
    pieces = []
    pos = 0
    for m in re.finditer(u'(<(div|span)[^>]*?>|&nbsp;|—|<!--.*?-->|\n:[:]*|\n;[;]*|\[\[[^]|]*)', old_text):
        # fix the text up to the protected span, then copy the span as-is
        pieces.append(treat_text_part(old_text[pos:m.start(1)]))
        pieces.append(m.group(1))
        pos = m.end(1)
    pieces.append(treat_text_part(old_text[pos:]))
    new_text = u''.join(pieces)
    # strip any whitespace left just before a trailing <noinclude> block
    return re.sub(u'(?ms)[\s]+(<noinclude>.*?</noinclude>)$', u'\\1',
                  new_text)
# FIXME: factorize
def compare_title(a, b):
    """cmp-style comparator ordering page titles by their trailing /<number>.

    Returns -1, 0 or 1.  Assumes both titles match '.*/<digits>'.
    """
    num_a = int(re.match(u'.*/(\d+)', a).group(1))
    num_b = int(re.match(u'.*/(\d+)', b).group(1))
    # sign of (num_a - num_b) without relying on Python 2's cmp()
    return (num_a > num_b) - (num_a < num_b)
def main(gen):
    """Interactively fix the typography of the pages produced by *gen*.

    *gen* yields dicts holding at least a u'title' key.  For each page
    whose typography-fixed text differs from the original, show a diff
    and ask whether to upload.  Uses the module-level `site` (set in the
    __main__ block) and the shared `cache`.
    """
    titles = []
    for p in gen:
        titles.append(p[u'title'])
    # Sort numerically on the trailing page number when titles look like
    # ".../<number>".  Only the first title is checked — assumes the
    # whole batch is homogeneous.
    if re.match(u'.*/\d+$', titles[0]):
        titles.sort(compare_title)
    titles = [ wikipedia.Page(site = site, title = x) for x in titles ]
    # Batch-fetch all pages in one pass before iterating.
    cache.mass_load(titles)
    for p in titles:
        text = cache.read_page(p.title(), site = p.site())
        new_text = treat_text(text)
        # Compare stripped text so leading/trailing whitespace alone
        # never triggers an upload prompt.
        if new_text.strip() != text.strip():
            print p.title()
            wikipedia.showDiff(text, new_text)
            # NOTE(review): the prompt mentions Quit/Count but only
            # N(ext) and u(pload) are accepted; default is 'n'.
            choice = wikipedia.inputChoice(u'Next, Quit, Count',
                                           ['Next', 'upload' ],
                                           [ 'N', 'u'], 'n')
            if choice == 'u':
                cache.write_page(p.title(), new_text, u'Typographie',
                                 site = p.site())
if __name__ == '__main__':
try:
class Options:
pass
site = wikipedia.getSite(code = 'fr', fam = 'wikisource')
for arg in sys.argv[:]:
if arg == '-help':
print sys.argv[0], "-help"
elif arg.startswith('-start:'):
pagename = unicode(arg[len('-start:'):], 'utf-8')
gen = query_ext.PreloadingPagesStartswith(pagename, site = site)
else:
gen = [ { u'title': unicode(arg, 'utf-8') } ]
main(gen)
finally:
wikipedia.stopme()
cache.save()