User:RussBot/dabmaintbot.py

#!/usr/bin/python
"""
dabmaintbot - Bot to update link counts on
[[en:Wikipedia:Disambiguation pages maintenance]]
"""

# requires the pywikipedia bot framework
# http://pywikipediabot.sourceforge.net
#
# Copyright (c) 2007 Russell M. Blau

import datetime
import locale
import re, sys
import signal
import wikipedia, pagegenerators

locale.setlocale(locale.LC_ALL, '')

site = wikipedia.getSite()

# wikipedia.use_query = False

#Constants:
ACTIVE_CUTOFF = 100
HISTORY_LEN = 6

# The following pages must be set manually before the bot is run
# if this is to be used on anything other than the English
# Wikipedia.  Normally, the input and output pages will be the same,
# but the output can be changed if desired for a 'test run' of the bot.

#input pages
maint_page = wikipedia.Page(site,
                "Wikipedia:Disambiguation pages with links/Current list")
dump_page = wikipedia.Page(site,
                "User:RussBot/DPL")
problem_page = wikipedia.Page(site,
                "Wikipedia:Disambiguation pages with links/problems")
#output pages
result_page = wikipedia.Page(site,
                "Wikipedia:Disambiguation pages with links/Current list")
problem_result = wikipedia.Page(site,
                "Wikipedia:Disambiguation pages with links/problems")

started_at = datetime.datetime.now()

# cache page objects to reduce server load
pagecache = {}

def cacheget(page):
    global pagecache
    title = page.sectionFreeTitle()
    try:
        return pagecache[title]
    except KeyError:
        pagecache[title] = page
        return page

def cacheput(page):
    global pagecache
    title = page.sectionFreeTitle()
    pagecache[title] = page

def refcount(page):
    if hasattr(page, "refcount"):
        return page.refcount
    while True:
        try:
            count = 0
            references = page.getReferences(True)

            for referring_page in references:
                if hasattr(referring_page, "_redirarg"):
                    continue
                if referring_page.namespace():
                    continue
                count += 1
            page.refcount = count
            cacheput(page)
            return count
        except KeyboardInterrupt:
            # This is just in case the program hangs while trying to retrieve references
            # Be careful as this may make it difficult to actually interrupt the script!
            continue

def increasing(seq):
    '''Return True if seq is uniformly increasing (from last to first),
    False otherwise'''
    for index in range( len(seq) - 1 ):
        if seq[index] <= seq[index+1]:
            return False
    return True

def fmt(num):
    return locale.format("%i", num, grouping=True)

try:
    for arg in sys.argv[1:]:
        arg = wikipedia.argHandler(arg, 'dabmaintbot')
        if arg:
            print "Unrecognized command line argument: %s" % arg
            # show help text and exit
            wikipedia.argHandler("-help", "dabmaintbot")

    mylang = site.language()

    fixed_pages = 0
    fixed_links = 0
    problems = []
    m_text = maint_page.get()

    active_r = re.compile(
        r"^# (?:'''• )?\[\[(.+)\]\] *\(([0-9]*) *" +
        r"\[\[Special:Whatlinkshere/(?:.+)\|links\]\]\) *" +
        r"(?:\((?:(?:new)|(?:[-+][0-9]+))\))? *" +
        r"(?:<!-- history (.*?)-->)? *(.*?) *(?:''')? *$", re.M)
    # the groups matched by this regex are:
    # 1.  the title of a disambiguation page
    # 2.  the number of links found last time the bot ran (may be empty)
    # 3.  the history of the page's link count (may be empty), consisting of a
    #     space-separated string of numbers
    # 4.  any notes added by users at the end of the line

    inactive_r = re.compile(
        r'^# \[\[(.+)\]\] \(([0-9]+)\) history ([0-9 ]*):(.*) *$', re.M)
    # the groups matched by this regex are the same as for active_r

    # lists are demarcated by HTML comments

    # Step 1: Collect all links and histories from the last scan
    
    start_mark = u"<!-- section title="
    end_mark = u"<!-- end section -->"
    marker = 0
    new_text = []
    disambiglinks = {}
    total_count = [0, 0, 0, 0]
    sections = []
    while True:
        section_start = m_text.find(start_mark, marker)
        if section_start == -1:
            break
        title_mark = section_start + len(start_mark)
        section_title = m_text[title_mark:
                               m_text.find(u" -->\n", title_mark)]
        section_marker = title_mark + len(section_title) + len(" -->\n")
        if section_marker >= len(m_text):
            wikipedia.output(
                u"ERROR: cannot locate section title in %s" % section_title)
            raise RuntimeError
        
        section_end = m_text.find(end_mark, section_marker)
        if section_end == -1:
            wikipedia.output(
                u"ERROR: cannot locate end of section %s" % section_title)
            raise RuntimeError
        marker = section_end
        sections.append((section_title, section_marker, section_end))
        sectionnumber = len(sections) - 1
        
        for item in active_r.finditer(m_text, section_marker, section_end):
            link_page_title = item.group(1)
            link_page = cacheget(wikipedia.Page(site, link_page_title))
            while link_page.isRedirectPage():
                link_page = cacheget(
                    wikipedia.Page(site, link_page.getRedirectTarget()))
            try:
                if not link_page.isDisambig():
                    continue
            except wikipedia.NoPage:
                continue
            link_page_title = link_page.title()
            if link_page_title in disambiglinks.keys():
                continue
            count = refcount(link_page)
            if item.group(3):
                history = item.group(3)
            else:
                history = u''
            disambiglinks[link_page_title] = {
                'section': sectionnumber,
                'title': link_page_title,
                'count': count,
                'history_text': history,
                'trailing_text': item.group(4).strip()
            }

        # search for inactive listings, which should always follow active ones
        for item in inactive_r.finditer(m_text, section_marker, section_end):
            link_page_title = item.group(1)
            link_page = cacheget(wikipedia.Page(site, link_page_title))
            while link_page.isRedirectPage():
                link_page = cacheget(
                    wikipedia.Page(site, link_page.getRedirectTarget()))
            try:
                if not link_page.isDisambig():
                    continue
            except wikipedia.NoPage:
                continue
            link_page_title = link_page.title()
            if link_page_title in disambiglinks.keys():
                continue
            count = refcount(link_page)
            if item.group(3):
                history = item.group(3)
            else:
                history = u''
            disambiglinks[link_page_title] = {
                'section': sectionnumber,
                'title': link_page_title,
                'count': count,
                'history_text': history,
                'trailing_text': item.group(4).strip()
            }

    # Step 2.  Collect links from data dump output page and add any that
    # aren't already in the collection

    for link_page in dump_page.linkedPages():
        while link_page.isRedirectPage():
            link_page = cacheget(
                wikipedia.Page(site, link_page.getRedirectTarget()))
        try:
            if not link_page.isDisambig():
                continue
        except wikipedia.NoPage:
            continue
        link_page_title = link_page.sectionFreeTitle()
        if link_page_title in disambiglinks.keys():
            continue
        count = refcount(link_page)
        history = u''
        disambiglinks[link_page_title] = {
            'section': 0,  # All new articles go into 'general' until classified
            'title': link_page_title,
            'count': count,
            'history_text': history,
            'trailing_text': u''
        }

    # Step 3.  Sort links by section and count, and output page
    marker = 0
    for (number, (section_name, section_marker, section_end)
         ) in enumerate(sections):
        section_links = [link for link in disambiglinks.values()
                         if link['section'] == number]
        section_links.sort(key=lambda i:i['count'], reverse=True)
        section_count = [0, 0]
        new_text.append(m_text[marker:section_marker])
        active = True
        for link in section_links:
            if link['count'] < ACTIVE_CUTOFF and active:
                active = False
                new_text.append(u"<!-- Inactive articles:\n")
            if link['history_text']:
                history = [int(n) for n in link['history_text'].split(" ")]
            else:
                history = []
            history = [link['count']] + history
            while len(history) > HISTORY_LEN:
                del history[-1]
            if len(history) == 1:
                link['diff'] = 'new'
            else:
                link['diff'] = "%+i" % (history[0] - history[1])
                if history[0] < history[1]:
                    fixed_pages += 1
                    fixed_links += (history[1] - history[0])
            link['history_text'] = " ".join(str(x) for x in history)
            
            if max(history) < ACTIVE_CUTOFF / 4:
                # discard items that have no significant history
                continue

            if active:
                section_count[0] += 1
                section_count[1] += link['count']
                item = (
u"[[%(title)s]] (%(count)i [[Special:Whatlinkshere/%(title)s|links]]) " +
u"(%(diff)s)<!-- history %(history_text)s--> %(trailing_text)s") % link
                # bullet items that have shown unusual or persistent increases
                if (len(history) > 1 and
                        history[0]-history[1] > ACTIVE_CUTOFF / 2
                   ) or (
                        len(history) == HISTORY_LEN and
                        increasing(history) and
                        history[0] - history[-1] > ACTIVE_CUTOFF
                   ):
                    prefix = "'''• "
                    suffix = "'''"
                    item.rstrip("'")
                    problems.append(
u"* [[%(title)s]] (%(count)i [[Special:Whatlinkshere/%(title)s|links]]) (%(diff)s)\n"
                        % link)
                else:
                    prefix = suffix = ""
                new_text.append("# %s%s%s\n" % (prefix, item, suffix))
            else:
                total_count[2] += 1
                total_count[3] += link['count']
                new_text.append(
u"# [[%(title)s]] (%(count)i) history %(history_text)s: %(trailing_text)s\n"
                    % link)
        if not active:
            new_text.append("-->\n")
        marker = section_end
        new_text.append(
            u"\n Section '%s' contains %i links to %i active articles.\n" %
            (section_name, section_count[1], section_count[0]))
        total_count[0] += section_count[0]
        total_count[1] += section_count[1]

    statistics_point = m_text.find(u"|}")
    if statistics_point >= 0:
        text = m_text[marker:statistics_point]
        text = re.sub(r"<!--banner-->.*?<!--/banner-->",
"<!--banner-->Since last week, at least %s links to %s pages have been fixed!<!--/banner-->"
                          % (fmt(fixed_links), fmt(fixed_pages)), text)
        new_text.append(text)
        marker = statistics_point
        new_text.append(u"|-\n")
        today = datetime.date.today()
        new_text.append(u"| %4i-%02i-%02i || %s || %s || %s || %s\n"
                        % (today.year, today.month, today.day,
                           fmt(total_count[0]+total_count[2]),
                           fmt(total_count[0]),
                           fmt(total_count[1]+total_count[3]),
                           fmt(total_count[1])))
        
    new_text.append(m_text[marker:])
    wikipedia.setAction(u"Disambiguation page maintenance script")
    result_page.put(u"".join(new_text))
    prob_text = problem_page.get()
    header_start = prob_text.index("<noinclude>")
    header_end = prob_text.index("</noinclude>") + len("</noinclude>")
    problem_result.put(prob_text[header_start:header_end] + "\n" +
                      u"".join(problems))
    
finally:
    elapsed = datetime.datetime.now() - started_at
    print "elapsed time = " + str(elapsed)
    wikipedia.stopme()