Utilisateur:Phe/page 306
La bibliothèque libre.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Rename pages to follow the title conventions of fr.wikisource.

This first part of the script holds the on-wiki error journal helpers, the
title-to-regex helper and the function performing a single page move.
"""

#import botpywi
import catlib
import pagegenerators
import wikipedia
import re
import sys
import time


def add_to_log(msg, opt):
    """Append *msg* to the wiki page ``User:Phe/Log move page``.

    msg -- unicode text appended at the end of the journal page.
    opt -- options object; only ``opt.site`` is read.

    Return True when the page was saved, False on an edit conflict.
    """
    log_page = wikipedia.Page(opt.site, u"User:Phe/Log move page")
    try:
        text = log_page.get()
    except wikipedia.NoPage:
        # The journal does not exist yet: start it from an empty page
        # instead of failing while trying to report an error.
        text = u''
    try:
        log_page.put(text + msg, u'Journal des erreurs de déplacement')
    except wikipedia.EditConflict:
        return False
    return True


# Error lines accumulated by log_error() until flush_log() writes them.
log_msg = u''


def flush_log(opt):
    """Write the pending error lines to the wiki journal and clear them.

    One retry is attempted 10 seconds after an edit conflict. When the retry
    fails too, a message is printed and the pending lines are dropped.
    """
    global log_msg
    if not log_msg:
        return
    entry = u'\n' + log_msg
    if not add_to_log(entry, opt):
        time.sleep(10)
        if not add_to_log(entry, opt):
            print(u'ERROR: Unable to log last error message')
    log_msg = u''


def log_error(msg, page):
    """Print an error about *page* and queue it for the wiki journal."""
    global log_msg
    err_msg = u'Error, page: [[%s]], %s' % (page.title(), msg)
    print(err_msg)
    # Each entry is a wiki list item.
    log_msg += u'*' + err_msg + u'\n'


# Every character having a special meaning in a regular expression.
_REGEX_SPECIAL_CHARS = u'\\.^$*+?{}[]|()'


def regex_from_title(old_title):
    """Return a regular expression source matching *old_title* in wikitext.

    Regex metacharacters are backslash-escaped and every space is replaced
    by ``[ _]+`` so that a link written with underscores or repeated spaces
    is matched too.
    """
    escaped = u''.join(
        u'\\' + ch if ch in _REGEX_SPECIAL_CHARS else ch
        for ch in old_title)
    return escaped.replace(u' ', u'[ _]+')


def move_one_page(page, new_title, opt):
    """Move *page* to *new_title*, retrying until the move succeeds.

    Nothing is done when the page does not exist. With ``opt.dry_run`` set
    the intended move is only printed.
    """
    if not page.exists():
        return
    print(u'move page from; %s to: %s' % (page.title(), new_title))
    if opt.dry_run:
        return
    # Passing leaveRedirect = false to move() is more efficient but is not
    # stable enough at the moment (2009/10), and anyway it breaks things
    # until the redirects have been corrected in a later step. So the
    # deletion of the created redirect is deferred.
    # Leaving the redirect in place also makes the process more robust: if
    # the script stops on an exception there will remain pages to correct
    # and redirects to delete, but nothing will be broken on the wiki (no
    # red link, links will go through the redirect).
    # NOTE(review): the loop has no upper bound, a move that can never
    # succeed is retried forever.
    while True:
        # Should not be required but safer, as the throttle from
        # pywikipedia is too small for a move operation.
        time.sleep(10)
        if page.move(new_title, u'Conventions sur les titres'):
            break
        print("move fail, sleeping 30 seconds")
        # Retry with a fresh Page object built from the same site and title.
        page = wikipedia.Page(page.site(), page.title())
        time.sleep(30)


# The set of pages to modify can contain pages that have been moved.
# We fix up page names to get the real page name, as we can't use
# p.get(follow_redirect = False): some of the linked pages can be redirects
# that existed before the move. Leaving a redirect in place doesn't fix the
# problem, as we would edit the redirect instead of the target.
def fixup_pagename(pages, titles):
    """Replace every moved page of *pages* by a Page at its new title.

    pages  -- iterable of Page objects.
    titles -- iterable of (old_title, new_title) pairs.

    Return a set of Page objects; pages whose title is not an old title are
    kept unchanged.
    """
    renamed = dict(titles)
    results = set()
    for p in pages:
        new_title = renamed.get(p.title())
        if new_title is None:
            results.add(p)
        else:
            results.add(wikipedia.Page(p.site(), new_title))
    return results


def _link_replacer(new_title):
    """Return a re.sub callback rewriting a matched link to *new_title*."""
    def replace(match):
        # Group 1 is the character that ended the link target (|, ] or #).
        return u'[[' + new_title + match.group(1)
    return replace


def fix_redirect(pages, titles, opt):
    """Rewrite, in every page of *pages*, the links to the moved titles.

    pages  -- Page objects linking to the moved pages.
    titles -- iterable of (old_title, new_title) pairs.
    opt    -- options; with ``opt.dry_run`` the diff is shown, nothing saved.

    Pages whose text is not modified by the substitutions are skipped.
    """
    if not opt.dry_run:
        pages = fixup_pagename(pages, titles)
    # Build the patterns once instead of once per page. A callback is used
    # as replacement so that a backslash in the new title is taken literally.
    substitutions = [
        (re.compile(u'\\[\\[[ ]*' + regex_from_title(old_title)
                    + u'[ ]*(\\||\\]|#)'),
         _link_replacer(new_title))
        for old_title, new_title in titles]
    for p in pagegenerators.PreloadingGenerator(pages):
        text = p.get(get_redirect = True)
        new_text = text
        for pattern, replacer in substitutions:
            new_text = pattern.sub(replacer, new_text)
        if new_text == text:
            # No link to fix in this page: avoid a useless save.
            continue
        if opt.dry_run:
            print(u'Changing %s' % p.title())
            wikipedia.showDiff(text, new_text)
        else:
            p.put(new_text,
                  comment = u'Correction des redirects après renommage')


def delete_redirect(p, opt):
    """Delete the redirect *p* if no page links to it any more.

    Problems (page still linked, page not a redirect, deletion failure) are
    reported through log_error(), except in dry-run mode.
    """
    if not p.exists():
        return
    if not p.isRedirectPage():
        if not opt.dry_run:
            log_error(u'skipping deletion, page is not a redirect, check linked page before deletion', p)
        return
    print(u'deleting page: %s' % p.title())
    # Only the existence of a reference matters, stop at the first one.
    if any(True for _ in p.getReferences()):
        if not opt.dry_run:
            log_error(u'skipping deletion, linked page exists, please fix linked pages and delete the redirect manually', p)
    elif not opt.dry_run:
        if not p.delete(u'Nettoyage après correction des liens', prompt = False):
            log_error(u'Impossible de détruire la page', p)


def delete_all_redirect(titles, opt):
    """Delete, or only report, the redirects left at the old *titles*.

    With ``opt.delete_redirect`` each title goes through delete_redirect().
    Otherwise, unless in dry-run mode, every still existing page is logged
    so that it can be checked by hand.
    """
    if opt.delete_redirect:
        for t in titles:
            delete_redirect(wikipedia.Page(opt.site, t), opt)
    elif not opt.dry_run:
        pages = [wikipedia.Page(opt.site, x) for x in titles]
        for p in pagegenerators.PreloadingGenerator(pages):
            if p.exists():
                log_error(u'redirect created, check linked page before deletion', p)


# The next two functions must be changed to change the naming scheme: this
# one changes the title, the next one must return a true value if a title
# is a candidate for a change.
def change_title(title):
    """Return *title* rewritten according to the current naming scheme."""
    title = title.replace(u' - ', u'/')
    #title = title.replace(u', ', u'/')
    #title = title.replace(u' : ', u'/')
    # always change these two, at least on fr:
    title = title.replace(u"'", u'’')
    title = title.replace(u'‘', u'’')
    #title = title.replace(u'Don Quichotte/', u'L’Ingénieux Hidalgo Don Quichotte de la Manche/')
    #title = title.replace(u":", u'/')
    #title = title.replace(u' attiques', u' Attiques')
    #title = title.replace(u" ruisseau, ", u' ruisseau/')
    ## if title.endswith(u'.'):
    ##     title = title[:-1]
    #title = title.replace(u'dernier des flibustiers', u'Dernier des flibustiers')
    #title = title.replace(u' coup ', u' Coup ')
    #title = title.replace(u', Chapitre ', u'/Chapitre ')
    #title = re.sub(u', (.*?), (.*)$', u'/\\1/\\2', title)
    #title = re.sub(u'\.(\d+)$', u'/\\1', title)
    #title = re.sub(u' (\d+) (\d+)$', u'/\\1/\\2', title)
    #title = re.sub(u',(\d+)$', u'/\\1', title)
    return title


def filter_title(title):
    """Return a match object if *title* is a candidate for a move, else None."""
    return re.match(u".*('|‘| - ).*", title)


def _finish_batch(pages, titles, opt):
    """Fix the links to the moved *titles*, clean up redirects, flush the log."""
    fix_redirect(pages, titles, opt)
    delete_all_redirect([old_title for old_title, _ in titles], opt)
    flush_log(opt)


def move_pages(gen, opt):
    """Move every candidate main-namespace page produced by *gen*.

    gen -- iterable of Page objects.
    opt -- options object (dry_run, fix_redirect, sync_between_page,
           delete_redirect, site).

    When ``opt.fix_redirect`` is set the links are fixed and the redirects
    cleaned up either after each move (``opt.sync_between_page``) or once
    all the moves are done.
    """
    pages = set()
    titles = set()
    seen = set()
    for page in gen:
        if page.namespace() != 0:
            continue
        # The same page can be linked several times with different section
        # anchors: only the part before the anchor identifies the page.
        old_title = page.title().split(u'#')[0]
        if old_title in seen:
            continue
        seen.add(old_title)
        if not filter_title(old_title):
            continue
        new_title = change_title(old_title)
        # don't try to move a page to itself
        if old_title == new_title:
            continue
        # Get the linked pages first, to avoid problems when getting the
        # linked pages of a page moved a few seconds ago.
        pages.update(pagegenerators.ReferringPageGenerator(page))
        move_one_page(page, new_title, opt)
        titles.add((old_title, new_title))
        talk_page = page.toggleTalkPage()
        titles.add((talk_page.title(), change_title(talk_page.title())))
        if opt.fix_redirect and opt.sync_between_page:
            # The batch must be fully processed, redirect clean-up included,
            # before the accumulated sets are reset.
            _finish_batch(pages, titles, opt)
            pages = set()
            titles = set()
    if opt.fix_redirect and not opt.sync_between_page:
        _finish_batch(pages, titles, opt)


class Options(object):
    """Run-time switches, filled in from the command line."""

    def __init__(self):
        self.dry_run = False
        self.fix_redirect = False
        self.sync_between_page = True
        self.delete_redirect = False
        self.site = None


def _option_value(arg, prefix):
    """Return the text following *prefix* in *arg*, decoded from UTF-8."""
    return unicode(arg[len(prefix):], 'utf-8')


if __name__ == '__main__':
    options = Options()
    try:
        usage = '%s [-dry-run | -help | -cat:cat_name | -prefix:prefix_name | -links:page_name | -page:page_name | -fix-redirect | -delete-redirect ]' % sys.argv[0]
        gen = None
        options.site = wikipedia.getSite(code = 'fr', fam = 'wikisource')
        for arg in sys.argv[1:]:
            if arg == '-dry-run':
                options.dry_run = True
            elif arg.startswith('-cat:'):
                cat = catlib.Category(options.site, _option_value(arg, '-cat:'))
                gen = pagegenerators.CategorizedPageGenerator(cat)
                options.sync_between_page = True
            elif arg.startswith('-prefix:'):
                gen = pagegenerators.PrefixingPageGenerator(
                    _option_value(arg, '-prefix:'),
                    includeredirects = False, site = options.site)
                options.sync_between_page = False
            elif arg.startswith('-links:'):
                page = wikipedia.Page(options.site,
                                      _option_value(arg, '-links:'))
                gen = pagegenerators.LinkedPageGenerator(page)
                options.sync_between_page = False
            elif arg.startswith('-page:'):
                gen = [wikipedia.Page(options.site,
                                      _option_value(arg, '-page:'))]
                options.sync_between_page = False
            elif arg == '-fix-redirect':
                options.fix_redirect = True
            elif arg == '-delete-redirect':
                options.delete_redirect = True
            elif arg == '-help':
                print(usage)
                sys.exit(1)

        if options.delete_redirect:
            if not options.site.isAllowed('delete', True):
                print('You asked to delete redirect but have not enough right todo that')
                sys.exit(1)
            if not options.fix_redirect:
                print('You asked to delete redirect but not to -fix-redirect')
                sys.exit(1)

        if gen is None:
            # No page source was given: there is nothing to iterate over.
            print(usage)
            sys.exit(1)

        move_pages(gen, options)
    finally:
        flush_log(options)
        wikipedia.stopme()