Utilisateur:Phe/Scripts/botpywi.py

# -*- coding: utf-8 -*-

import sys
sys.path.append('/usr/src/phe/pywikipedia')
sys.path.append('/usr/src/phe/wiki/src')

import cPickle
import re
import os
from htmlentitydefs import name2codepoint as n2cp

# number of worker threads
num_worker_threads = 4

dump_basedir = '/home/phe/wiki/file/'

def load_obj(name):
    db = open('cache/' + name, 'rb')
    obj = cPickle.load(db)
    db.close()
    return obj

def save_obj(name, obj):
    db = open('cache/' + name, 'wb')
    cPickle.dump(obj, db, 2)
    db.close()
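# Example usage (illustrative only; assumes a writable 'cache/' directory
# relative to the working directory, and a hypothetical file name):
#   save_obj('example.pickle', {'a': 1})
#   load_obj('example.pickle')  # -> {'a': 1}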

def dump_filepath(lang, family, nms):
    return dump_basedir + lang + family + '/' + nms

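# Walk the dump directory for (lang, family, nms) and return a list of
# (root, dirnames, regular_files, redirects) tuples, where redirects maps
# each symlink name to its target. The result is cached on disk and only
# rebuilt when the directory's ctime changes.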
def get_file_list_and_redirect(lang, family, nms):
    root_dir = dump_filepath(lang, family, nms)
    stat = os.stat(root_dir)
    cache_name = lang + family + '_file_list_and_redirect_' + nms
    timestamp = None
    if os.access('cache/' + cache_name, os.R_OK):
        lst, timestamp = load_obj(cache_name)
    if timestamp != stat[9]:
        print "Reloading cache"
        lst = []
        for root, dirnames, files in os.walk(root_dir, False):
            redirect = {}
            regular_files = []
            for f in files:
                filename = root + '/' + f
                if os.path.islink(filename):
                    redirect[f] = os.readlink(filename)
                else:
                    regular_files.append(f)
            lst.append( (root, dirnames, regular_files, redirect) )
        save_obj(cache_name, (lst, stat[9]))
    return lst

def get_redirect_list(lang, family, nms):
    lst = get_file_list_and_redirect(lang, family, nms)
    return [ lst[0][0], lst[0][1], lst[0][3] ]

def get_file_list(lang, family, nms):
    root_dir = dump_filepath(lang, family, nms)
    stat = os.stat(root_dir)
    cache_name = lang + family + '_file_list_' + nms
    timestamp = None
    if os.access('cache/' + cache_name, os.R_OK):
        lst, timestamp = load_obj(cache_name)
    if timestamp != stat[9]:
        print "Reloading cache"
        lst = []
        for root, dirnames, files in os.walk(root_dir, False):
            files = filter(lambda x: not os.path.islink(root + '/' + x), files)
            lst.append( (root, dirnames, files) )
        save_obj(cache_name, (lst, stat[9]))
    return lst

def namespaces(lang, family):
    lst = os.listdir(dump_basedir + lang + family)
    result = []
    for d in lst:
        nms = ''
        if d != 'Article':
            nms = unicode(d.replace('_', ' '), u'utf-8') + u':'
        result.append( (d, nms) )
    return result


def capitalize(s):
    if len(s) >= 2:
        s = s[0].upper() + s[1:]
    elif len(s) == 1:
        s = s[0].upper()
    return s

def uncapitalize(s):
    if len(s) >= 2:
        s = s[0].lower() + s[1:]
    elif len(s) == 1:
        s = s[0].lower()
    return s


def split_links(text):
    lst = []
    for it in re.finditer(u'\[\[(.+?)\]\]', text):
        lst.append(it.group(1).split(u'|')[0])
    return lst
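# Example (illustrative):
#   split_links(u'see [[Foo|bar]] and [[Baz]]')  # -> [u'Foo', u'Baz']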


def quote_regex(title):
    result = u''
    for c in title:
        if c in u'()*.|?[]+':
            result += u'\\'
        result += c
    return result
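# Example (illustrative; similar in spirit to re.escape, but only the
# characters listed above are escaped):
#   quote_regex(u'Foo (bar)?')  # -> u'Foo \\(bar\\)\\?'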

def load_lang_list(to_unicode = True):
    langs = set()
    lang_list = open('/usr/src/phe/botpywi/lang_list.txt')
    for l in lang_list:
        l = l.strip('\n')
        if to_unicode:
            l = unicode(l, 'utf-8')
        langs.add(l)
        langs.add(capitalize(l))
    return langs


def filename_to_article_name(filename):
    title = unicode(filename, 'utf-8')
    title = title.replace(u'_', u' ')
    title = title.replace(u'%2F', u'/')
    return capitalize(title)

def article_to_filename(article_name):
    filename = article_name.replace(u' ', u'_')
    filename = filename.replace(u'/', u'%2F')
    filename = capitalize(filename)
    return filename.encode('utf-8')
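# Example (illustrative round trip):
#   article_to_filename(u'Foo bar/Baz')        # -> 'Foo_bar%2FBaz'
#   filename_to_article_name('Foo_bar%2FBaz')  # -> u'Foo bar/Baz'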


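# Iterator over page_list that keeps only the pages accepted by
# check_function and preloads them in batches of 50 through the cache
# object, which is expected to provide get_page() and mass_load()
# (that interface lives outside this file).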
class PreloadingPages:
    def __init__(self, page_list, check_function, cache):
        self.page_list = page_list
        self.check_function = check_function
        self.cache = cache

    def preload(self, pages):
        new_pages = []
        for p in pages:
            new_pages.append(self.cache.get_page(p).page)
        self.cache.mass_load(new_pages)

    def __iter__(self):
        count = 0
        somePages = []
        for page in self.page_list:
            count += 1
            if count % 1024 == 0:
                print >> sys.stderr, str(count) + '/' + str(len(self.page_list)) + '\r',
            if self.check_function(page):
                somePages.append(page)
                if len(somePages) >= 50:
                    self.preload(somePages)
                    for refpage in somePages:
                        yield refpage
                    somePages = []
        # preload remaining pages
        self.preload(somePages)
        for refpage in somePages:
            yield refpage
        print >> sys.stderr, str(count) + '/' + str(len(self.page_list))

def do_exec(cmdline):
    fd = os.popen(cmdline)
    for t in fd.readlines():
        print t.strip("\n")
    ret = fd.close()
    if ret != None:
        # on error ret is the process wait status: the low byte is the signal
        # number (0 if the process exited normally), the high byte is the exit
        # status, and bit 7 of the low byte is set if a core file was produced
        print "Error:", cmdline, "failed to exec", ret
        return False
    return True

def read_file(filename):
    fd = open(filename)
    text = unicode(fd.read(), 'utf-8')
    fd.close()
    return text

def write_file(filename, text):
    fd = open(filename, 'w')
    fd.write(text.encode('utf-8'))
    fd.close()

# return True if older is more recent than newer or if newer doesn't exist
def compare_filetime(older, newer):
    if not os.path.exists(newer):
        return True
    stat1 = os.stat(older)
    stat2 = os.stat(newer)
    return stat1.st_mtime > stat2.st_mtime

# FIXME: factorize some code from this function
def paginate_page_list(lst, add_section = True):
    count = 0
    text = u''
    for f in lst:
        if add_section:
            count += 1
            if (count - 1) % 50 == 0:
                text += u'== ' + str(count) + u'-' + str(count + 49) + u'==\n'
        text += u'# [[' + f.replace('_', ' ') + u']]\n'
    return text
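# Example (illustrative):
#   paginate_page_list([u'Foo_bar'])
#   # -> u'== 1-50==\n# [[Foo bar]]\n'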

def setDaemon(p):
    try:
        p.setDaemon(True)
    except AttributeError:
        p.daemon = True

def build_regex_from_title(title):
    title = title.replace(u' ', u'[ _]+')
    title = u'[' + title[0].lower() + title[0].upper() + u']' + title[1:]
    return title
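# Example (illustrative; the regex accepts both capitalizations of the first
# letter and any run of spaces or underscores):
#   build_regex_from_title(u'Main page')  # -> u'[mM]ain[ _]+page'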

def explode_attrib(text, attrib_name):
    match = re.match(u'(?ms).*<' + attrib_name + u'(.*?)/>.*', text)
    attribs_text = match.group(1)
    result = []
    # this regexp is nearly identical to the one used in the MediaWiki core code.
    for it in re.finditer(u"([A-Za-z0-9]*)\s*=\s*(?:\"([^<\"]*)\"|'([^<']*)'|([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]*))", attribs_text):
        attrib = it.group(1).lower()
        value = it.group(it.lastindex)
        value = re.sub('\s+', u' ', value).strip()
        result.append( (attrib, value) )
    return result
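# Example (illustrative, with a made-up self-closing tag):
#   explode_attrib(u'<pages index="Foo.djvu" from=1 to=5 />', u'pages')
#   # -> [(u'index', u'Foo.djvu'), (u'from', u'1'), (u'to', u'5')]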

def is_numeric(text):
    try:
        int(text)
        return True
    except ValueError:
        return False
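# Example (illustrative):
#   is_numeric(u'42')   # -> True
#   is_numeric(u'4.2')  # -> False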

# code taken from the web
def substitute_entity(match):
    ent = match.group(3)

    if match.group(1) == u"#":
        if match.group(2) == u'':
            return unichr(int(ent))
        elif match.group(2) == u'x':
            return unichr(int('0x'+ent, 16))
    else:
        cp = n2cp.get(ent)
        if cp and ent not in [ u'nbsp', u'thinsp' ]:
            return unichr(cp)
        else:
            return match.group()

def decode_htmlentities(string):
    entity_re = re.compile(r'&(#?)(x?)(\d{1,5}|\w{1,8});')
    return entity_re.subn(substitute_entity, string)[0]
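# Example (illustrative):
#   decode_htmlentities(u'caf&eacute; &#233; &#xe9; &nbsp;')
#   # -> u'café é é &nbsp;' ('&nbsp;' is deliberately left untouched)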