Utilisateur:Phe/Scripts/botpywi.py
# -*- coding: utf-8 -*-
import sys
sys.path.append('/usr/src/phe/pywikipedia')
sys.path.append('/usr/src/phe/wiki/src')
import cPickle
import re
import os
from htmlentitydefs import name2codepoint as n2cp
# number of worker threads
num_worker_threads = 4
dump_basedir = '/home/phe/wiki/file/'
def load_obj(name):
    # load a pickled object from the local cache/ directory
    db = open('cache/' + name, 'rb')
    obj = cPickle.load(db)
    db.close()
    return obj
def save_obj(name, obj):
    # pickle an object (binary protocol 2) into the local cache/ directory
    db = open('cache/' + name, 'wb')
    cPickle.dump(obj, db, 2)
    db.close()
def dump_filepath(lang, family, nms):
    return dump_basedir + lang + family + '/' + nms
def get_file_list_and_redirect(lang, family, nms):
    # walk the dump tree of one namespace, caching the result; symlinks are
    # returned separately as a { name : target } redirect dict per directory
    root_dir = dump_filepath(lang, family, nms)
    stat = os.stat(root_dir)
    cache_name = lang + family + '_file_list_and_redirect_' + nms
    timestamp = None
    if os.access('cache/' + cache_name, os.R_OK):
        lst, timestamp = load_obj(cache_name)
    if timestamp != stat[9]:
        print "Reloading cache"
        lst = []
        for root, dirnames, files in os.walk(root_dir, False):
            redirect = {}
            regular_files = []
            for f in files:
                filename = root + '/' + f
                if os.path.islink(filename):
                    redirect[f] = os.readlink(filename)
                else:
                    regular_files.append(f)
            lst.append( (root, dirnames, regular_files, redirect) )
        save_obj(cache_name, (lst, stat[9]))
    return lst
def get_redirect_list(lang, family, nms):
    lst = get_file_list_and_redirect(lang, family, nms)
    return [ lst[0][0], lst[0][1], lst[0][3] ]
def get_file_list(lang, family, nms):
    root_dir = dump_filepath(lang, family, nms)
    stat = os.stat(root_dir)
    cache_name = lang + family + '_file_list_' + nms
    timestamp = None
    if os.access('cache/' + cache_name, os.R_OK):
        lst, timestamp = load_obj(cache_name)
    if timestamp != stat[9]:
        print "Reloading cache"
        lst = []
        for root, dirnames, files in os.walk(root_dir, False):
            files = filter(lambda x: not os.path.islink(root + '/' + x), files)
            lst.append( (root, dirnames, files) )
        save_obj(cache_name, (lst, stat[9]))
    return lst
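
# Usage sketch for the listing helpers above; the 'fr' / 'wikisource' /
# 'Article' values are only an illustration of the expected dump layout,
# i.e. dump_basedir + lang + family + '/' + namespace directory:
#   for root, dirnames, files in get_file_list('fr', 'wikisource', 'Article'):
#       for f in files:
#           print root + '/' + f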
def namespaces(lang, family):
    # list the namespace directories of a dump; 'Article' is the main
    # namespace and gets an empty prefix, the others get a 'Name:' prefix
    lst = os.listdir(dump_basedir + lang + family)
    result = []
    for d in lst:
        nms = ''
        if d != 'Article':
            nms = unicode(d.replace('_', ' '), u'utf-8') + u':'
        result.append( (d, nms) )
    return result
def capitalize(s):
    if len(s) >= 2:
        s = s[0].upper() + s[1:]
    elif len(s) == 1:
        s = s[0].upper()
    return s
def uncapitalize(s):
    if len(s) >= 2:
        s = s[0].lower() + s[1:]
    elif len(s) == 1:
        s = s[0].lower()
    return s
def split_links(text):
    # return the targets of all [[...]] wiki links, without the piped label
    lst = []
    for it in re.finditer(u'\[\[(.+?)\]\]', text):
        lst.append(it.group(1).split(u'|')[0])
    return lst
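
# Example: only the link targets are kept, piped labels are dropped:
#   >>> split_links(u'see [[Foo|bar]] and [[Baz]]')
#   [u'Foo', u'Baz']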
def quote_regex(title):
    # escape the regex metacharacters likely to occur in a page title
    result = u''
    for c in title:
        if c in u'()*.|?[]+':
            result += u'\\'
        result += c
    return result
def load_lang_list(to_unicode = True):
    # load the language codes (one per line), plus their capitalized form
    langs = set()
    lang_list = open('/usr/src/phe/botpywi/lang_list.txt')
    for l in lang_list:
        l = l.strip('\n')
        if to_unicode:
            l = unicode(l, 'utf-8')
        langs.add(l)
        langs.add(capitalize(l))
    lang_list.close()
    return langs
def filename_to_article_name(filename):
    title = unicode(filename, 'utf-8')
    title = title.replace(u'_', u' ')
    title = title.replace(u'%2F', u'/')
    return capitalize(title)
def article_to_filename(article_name):
    filename = article_name.replace(u' ', u'_')
    filename = filename.replace(u'/', u'%2F')
    filename = capitalize(filename)
    return filename.encode('utf-8')
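
# Round-trip example for the two converters above; the title is illustrative,
# the encoding follows how the dump tree stores names ('_' for spaces, '%2F'
# for '/'):
#   >>> filename_to_article_name('Le_dernier_jour%2FChapitre_1')
#   u'Le dernier jour/Chapitre 1'
#   >>> article_to_filename(u'Le dernier jour/Chapitre 1')
#   'Le_dernier_jour%2FChapitre_1'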
class PreloadingPages:
    # iterate over page_list, keep only the pages accepted by check_function,
    # and preload them through the cache in batches of 50 before yielding
    def __init__(self, page_list, check_function, cache):
        self.page_list = page_list
        self.check_function = check_function
        self.cache = cache
    def preload(self, pages):
        new_pages = []
        for p in pages:
            new_pages.append(self.cache.get_page(p).page)
        self.cache.mass_load(new_pages)
    def __iter__(self):
        count = 0
        somePages = []
        for page in self.page_list:
            count += 1
            if count % 1024 == 0:
                print >> sys.stderr, str(count) + '/' + str(len(self.page_list)) + '\r',
            if self.check_function(page):
                somePages.append(page)
                if len(somePages) >= 50:
                    self.preload(somePages)
                    for refpage in somePages:
                        yield refpage
                    somePages = []
        # preload remaining pages
        self.preload(somePages)
        for refpage in somePages:
            yield refpage
        print >> sys.stderr, str(count) + '/' + str(len(self.page_list))
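
# Minimal usage sketch for PreloadingPages. The cache object must expose the
# get_page(title).page and mass_load(pages) calls used by preload() above;
# it comes from other modules of this bot, so 'cache', 'titles' and 'wanted'
# below are only placeholders:
#   def wanted(title):
#       return not title.startswith(u'Discussion:')
#   for page in PreloadingPages(titles, wanted, cache):
#       pass  # page text is now available from the cache, loaded 50 by 50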
def do_exec(cmdline):
    fd = os.popen(cmdline)
    for t in fd.readlines():
        print t.strip("\n")
    ret = fd.close()
    if ret != None:
        # on error ret is the exit status in os.wait() format: the low byte
        # is the signal number, the high byte the exit status (when the low
        # byte is 0); bit 7 of the low byte is set if a core was generated
        print "Error:", cmdline, "failed to exec", ret
        return False
    return True
def read_file(filename):
    fd = open(filename)
    text = unicode(fd.read(), 'utf-8')
    fd.close()
    return text
def write_file(filename, text):
    fd = open(filename, 'w')
    fd.write(text.encode('utf-8'))
    fd.close()
# Return True if older is more recent than newer, or if newer doesn't exist.
def compare_filetime(older, newer):
    if not os.path.exists(newer):
        return True
    stat1 = os.stat(older)
    stat2 = os.stat(newer)
    return stat1.st_mtime > stat2.st_mtime
# FIXME: factorize some code from this function
def paginate_page_list(lst, add_section = True):
    count = 0
    text = u''
    for f in lst:
        if add_section:
            count += 1
            if (count - 1) % 50 == 0:
                text += u'== ' + str(count) + u'-' + str(count + 49) + u'==\n'
        text += u'# [[' + f.replace('_', ' ') + u']]\n'
    return text
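
# Example of the wikitext produced: with add_section left True, a numbered
# section heading precedes every block of 50 entries and underscores in the
# names become spaces:
#   >>> print paginate_page_list([u'Foo_bar', u'Baz'])
#   == 1-50==
#   # [[Foo bar]]
#   # [[Baz]]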
def setDaemon(p):
    try:
        p.setDaemon(True)
    except AttributeError:
        p.daemon = True
def build_regex_from_title(title):
    title = title.replace(u' ', u'[ _]+')
    title = u'[' + title[0].lower() + title[0].upper() + u']' + title[1:]
    return title
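
# Example: build_regex_from_title(u'Front page') gives u'[fF]ront[ _]+page',
# a pattern matching the title with either case on the first letter and with
# spaces or underscores between the words.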
def explode_attrib(text, attrib_name):
    match = re.match(u'(?ms).*<' + attrib_name + u'(.*?)/>.*', text)
    attribs_text = match.group(1)
    result = []
    # this regexp is nearly identical to the one used in mediawiki base code.
    for it in re.finditer(u"([A-Za-z0-9]*)\s*=\s*(?:\"([^<\"]*)\"|'([^<']*)'|([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]*))", attribs_text):
        attrib = it.group(1).lower()
        value = it.group(it.lastindex)
        value = re.sub('\s+', u' ', value).strip()
        result.append( (attrib, value) )
    return result
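
# Example; the <pagelist /> tag and its attributes are only an illustration:
#   >>> explode_attrib(u'foo <pagelist from=1 to="20" /> bar', u'pagelist')
#   [(u'from', u'1'), (u'to', u'20')]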
def is_numeric(text):
    try:
        int(text)
        return True
    except ValueError:
        return False
# code taken from the web
def substitute_entity(match):
    ent = match.group(3)
    if match.group(1) == u"#":
        if match.group(2) == u'':
            return unichr(int(ent))
        elif match.group(2) == u'x':
            return unichr(int('0x'+ent, 16))
    else:
        cp = n2cp.get(ent)
        if cp and ent not in [ u'nbsp', u'thinsp' ]:
            return unichr(cp)
        else:
            return match.group()
def decode_htmlentities(string):
    entity_re = re.compile(r'&(#?)(x?)(\d{1,5}|\w{1,8});')
    return entity_re.subn(substitute_entity, string)[0]
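
# Example: named and numeric entities are decoded, but nbsp and thinsp are
# deliberately kept as-is (see substitute_entity above):
#   >>> decode_htmlentities(u'&eacute;l&#233;phant &amp; &nbsp;')
#   u'\xe9l\xe9phant & &nbsp;'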