Jump to content

User:Gdr/yearbot.py

From Wikipedia, the free encyclopedia
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
#!/usr/bin/python
#
#
#             YEARBOT.PY -- POPULATE BIRTHS/DEATHS IN YEAR
#                           Gdr, 2005-05-14
#                           Minor updates: User:Docu, 2006-12-17 
#
# INTRODUCTION
#
# This script assists with the population of the "Births" and "Deaths"
# sections of an article about a year in the English wikipedia, using
# articles in [[Category:<year> births]] and [[Category:<year> deaths]].
#
#
# USAGE
#
# See [[User:Gdr/Yearbot]]
# requires [[User:Gdr/history.py]]
#
# DATA STRUCTURES
#
# An entry is a dictionary with these fields (all of them optional
# except 'article'):
#
# article   Name of article.
# bdate     Date of birth, as a pair like ('April 17', '0417').
# byear     Birth year, as string like '1543'.
# ddate     Date of death, as a pair like ('September 23', '0923').
# dyear     Death year, as string like '1602'.
# exclude   True if article is to be excluded from the page.
# intro     Introductory paragraph of article, if any is found.
# linktext  Text of the link on the year page, if the link is piped.
# pagelink  wikipedia.Page object referring to article.
# post      String placed after the article link.
# pre       String placed before the article link.
# sort      Key the entries are sorted by (date, sort key or a guess
#           made from the article name).
# sortkey   Sort key taken from the article's {{lived}} template or
#           from its births/deaths category link, if any.
# desc      Description extracted from article (used as text for 'post'
#           if entry is new).
#
#
# LICENCE
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.

import catlib
import getopt
import history
import re
import sys
import wikipedia

class Year:
    # Site object for the 'en' wiki.  Created once, when the class body
    # is executed, and shared by every instance.
    site = wikipedia.Site('en')

    # List of regexp search-and-replace patterns that should be applied
    # to all descriptions.  Each item is a (from, to) pair.  __init__
    # rebinds this on the instance, so the class-level list stays empty.
    patterns = []

    # The year we are working on, its Page, and the original text.
    # year and year_pl are set by __init__; year_text is filled in by
    # write_entries the first time it is needed.  year_orig is not
    # assigned anywhere in the code visible here.
    year = None
    year_pl = None
    year_orig = None
    year_text = None

    # Titles that get_entries skips when it walks the category listing.
    # Only the keys are consulted; the value 1 is a placeholder.
    ignore = {
        'Special:Categories': 1,
        }

    # Matches a regexp pattern written as /<from>/<to>/ and captures
    # <from> and <to>.  A backslash escapes the character after it.
    # Used both for lines of the pattern file and for user input.
    pattern_re = r'/((?:[^\\]|\\.)*)/((?:[^\\]|\\.)*)/$'

    # File to store patterns.  The name is relative, so it is resolved
    # against the current working directory.
    pattern_file = 'yearbot-patterns'

    def __init__(self, year):
        """Set up the bot for one year and load the saved patterns.

        year is the title of the year article, a string such as '1543'.
        If it does not match year_re a warning is printed, but the
        object is still constructed.

        Saved search-and-replace patterns are read from pattern_file,
        one /<from>/<to>/ per line; lines that do not match pattern_re
        are skipped.  A missing or unreadable pattern file is not an
        error: the bot simply starts with no patterns.
        """
        if not re.match(r'^' + self.year_re + r'$', year):
            print("%s doesn't look like a year" % year)
        self.year = year
        self.year_pl = wikipedia.Page(self.site, self.year)
        self.patterns = []
        try:
            f = open(self.pattern_file)
        except IOError:
            # open() raises instead of returning a false value, and the
            # file does not exist until the first pattern has been saved.
            return
        try:
            for line in f:
                m = re.match(self.pattern_re, line)
                if m:
                    self.patterns.append(m.groups())
        finally:
            # Release the handle even if reading fails part-way.
            f.close()

    # Matches a year in the range for which the script operates
    # (1000-1999).
    year_re = r'1[0-9][0-9][0-9]'

    # Matches a trailing birth date such as "(b. [[1543]])" or
    # "(born 1543)".  Group 1 is the text before it, group 2 the year.
    trail_born_re = re.compile(r'^(.*\S)\s*\(b(?:\.|orn)\s*\[?\[?('
                               + year_re + r')\]?\]?\)$')

    # Matches a trailing death date such as "(d. [[1602]])" or
    # "(died 1602)".  Group 1 is the text before it, group 2 the year.
    trail_died_re = re.compile(r'^(.*\S)\s*\(d(?:\.|ied)\s*\[?\[?('
                               + year_re + r')\]?\]?\)$')

    # Matches a month name.
    month_re = (r'January|February|March|April|May|June|'
                r'July|August|September|October|November|December')

    # Matches a date, optionally wiki-linked, in either order:
    # "April 17" fills groups 1 (month) and 2 (day); "17 April" fills
    # groups 3 (day) and 4 (month).  make_date relies on this numbering.
    date_re = (r'\[?\[?(?:(' + month_re + r')\s+([0-9]+)|([0-9]+)\s*('
               + month_re + r'))\]?\]?')

    # Matches an entry starting with a date.  Groups 1-4 are those of
    # date_re; group 5 is the rest of the line after an optional hyphen,
    # en dash or em dash.  The dashes are written as \u escapes in a
    # unicode literal so that the source stays pure ASCII and the pattern
    # matches the characters themselves rather than their encoded bytes.
    entry_date_re = re.compile(r'^\s*' + date_re
                               + u'\\s*(?:-|\u2013|\u2014)?\\s*(.*)$')

    # Matches an entry: captures pre, article, linktext, post.
    entry_re = re.compile(r'([^\[]*)\[\[([^\]|]+)(?:\|([^|\]]*))?\]\](.*)')

    # Matches the introductory paragraph of an article, once filled in
    # with birth year and death year (the two %s).  In intro1_re group 1
    # is the text between the bold title and the birth year, group 2 the
    # text between the two years and group 3 what follows the death year.
    # intro2_re is the fallback: bold title, a parenthesis, then group 1.
    intro1_re = r"^.*'''[^']+'''(.*?)\[?\[?%s\]?\]?(.*?)\[?\[?%s\]?\]?\W*(.*)$"
    intro2_re = r"^.*'''[^']+'''[^\(]*\([^\)]+\)(.*)$"

    # Matches description: an optional article ("the", "a", "an") and
    # then, as group 1, text up to the next , . ! or ? that is not
    # inside a wiki-link.  The four variants differ only in what must
    # come right before the description.
    desc_re = r'\s+(?:(?:the|an?)\s+)?(([^,.!?\[]|\[\[[^\]]+\]\])+)[,.!?]'
    desc1_re = re.compile(r'\)\s*was' + desc_re)
    desc2_re = re.compile(r'\),' + desc_re)
    desc3_re = re.compile(r'\s+was' + desc_re)
    desc4_re = re.compile(r',' + desc_re)

    # Matches wiki-link: link1_re the piped form (group 1 is the label),
    # link2_re the plain form (group 1 is the target).
    link1_re = re.compile(r'\[\[[^|\]]+\|([^|\]]+)\]\]')
    link2_re = re.compile(r'\[\[([^|\]]+)\]\]')

    # Approximate date?  Matches "c.", "ca." or "circa".
    approx_re = re.compile(r'\bc(?:\.|a\.|irca)')

    def save_patterns(self):
        f = file(self.pattern_file, 'w')
        if f:
            for p in self.patterns:
                f.write(u'/%s/%s/\n' % (p[0], p[1]))
            f.close()
        else:
            print "Couldn't write %s" % self.pattern_file

    def apply_patterns(self):
        for entries in self.topic_entries.values():
            for e in entries:
                for p in self.patterns:
                    if e.has_key('post'):
                        e['post'] = re.sub(p[0], p[1], e['post'])
                    elif e.has_key('desc'):
                        e['desc'] = re.sub(p[0], p[1], e['desc'])

    def unwikify(self, text):
        text = self.link1_re.sub(r'\1', text)
        text = self.link2_re.sub(r'\1', text)
        return text

    def make_date(self, m):
        """Turn a match of date_re into a (display, sort key) pair.

        m must have the first four groups of date_re: month and day in
        groups 1 and 2 for "April 17", or day and month in groups 3 and
        4 for "17 April".  The result looks like ('April 17', '0417');
        the month number is looked up in history.months.
        """
        month_name = m.group(1) or m.group(4)
        day_text = m.group(2) or m.group(3)
        display = '%s %s' % (month_name, day_text)
        sort_key = '%02d%02d' % (history.months[month_name], int(day_text))
        return (display, sort_key)

    def parse_entries(self, what):
        """Parse the ==Births== or ==Deaths== section of the year page.

        what is the section name in lower case ('births' or 'deaths').
        Returns a list of entry dictionaries, one per bullet line that
        contains a wiki-link.  Each has 'pre', 'article' and 'post', plus
        'linktext' for a piped link, a date field when the line starts
        with a date and 'byear'/'dyear' when it ends with "(born ...)" or
        "(died ...)".  Non-blank lines without a link are reported and
        left out.  If the section is missing a message is printed and an
        empty list is returned.
        """
        heading = what.capitalize()
        section = re.search(r'==\s*' + heading
                            + '\s*==\n((?:\s*\n|\*.*\n)*)',
                            self.year_pl.get())
        if not section:
            print("No ==%s==" % heading)
            return []

        # The date at the start of a line is the birth date in the births
        # section and the death date in the deaths section.
        date_key = {'births': 'bdate', 'deaths': 'ddate'}.get(what, '?date')

        parsed = []
        for raw in re.split(r'\s*\n\s*', section.group(1)):
            entry = {}
            rest = re.sub(r'^\*\s*', '', raw)

            # Leading date, if any.
            dated = self.entry_date_re.match(rest)
            if dated:
                entry[date_key] = self.make_date(dated)
                rest = dated.group(5)

            # Trailing "(born YYYY)" and then trailing "(died YYYY)".
            born = self.trail_born_re.match(rest)
            if born:
                entry['byear'] = born.group(2)
                rest = born.group(1)
            died = self.trail_died_re.match(rest)
            if died:
                entry['dyear'] = died.group(2)
                rest = died.group(1)

            # What is left must contain the link to the article.
            link = self.entry_re.match(rest)
            if link:
                entry['pre'] = link.group(1)
                entry['article'] = link.group(2)
                if link.group(3):
                    entry['linktext'] = link.group(3)
                entry['post'] = link.group(4)
                parsed.append(entry)
            elif not re.match(r'^\s*$', raw):
                wikipedia.output(u"Couldn't parse %s" % raw)
        return parsed

    def check_entry(self, entry, key, what, value):
        """Store value under entry[key], reporting a conflict if any.

        Nothing happens when value is None.  Otherwise, if the entry
        already holds a different value for key, a message naming the
        field (what) is output, and in every case the new value wins.
        """
        if value is None:
            return
        if key in entry and entry[key] != value:
            wikipedia.output(u"%s '%s' fails to match '%s'; "
                             u"discarding the former."
                             % (what, entry[key], value))
        entry[key] = value

    def parse_article(self, entry, what, entries=None):
        """Fill in entry from the wikitext of the article it refers to.

        entry must have 'pagelink' and 'article'.  Depending on what is
        found in the article text, these fields are set or overwritten
        (conflicts with existing values are reported by check_entry):

        byear, dyear, sortkey  from a {{lived|b=..|d=..|key=..}} template
                               or, failing that, from the "<year> births"
                               and "<year> deaths" category links;
        intro                  the introductory line of the article;
        bdate, ddate           dates found in the intro next to the years;
        exclude                True if the date relevant to `what`
                               ('births' or 'deaths') is marked
                               approximate (c., ca., circa);
        desc                   a short description taken from the intro.

        Redirects and missing pages are skipped silently.  The entries
        parameter is not used; it is kept so that existing calls that
        pass it keep working.
        """
        intro = None
        try:
            text = entry['pagelink'].get()
        except (wikipedia.IsRedirectPage, wikipedia.NoPage):
            # Nothing can be learned from a redirect or a missing page.
            return

        # Look for {{lived}} template.  The pipes and braces are escaped:
        # a bare "|" is regexp alternation, which would let the pattern
        # match "{{lived" alone (all groups None) or any stray
        # "b = 1234" in the text.  The key stops at the first "|" or "}"
        # so it cannot run on to a later "}}" on the same line.
        m = re.search(r'\{\{[Ll]ived\s*\|\s*b\s*=\s*(' + self.year_re
                      + r')\s*\|\s*d\s*=\s*(' + self.year_re
                      + r')\s*\|\s*key\s*=\s*([^|}]*?)\s*\}\}', text)
        if m:
            self.check_entry(entry, 'byear', 'birth year', m.group(1))
            self.check_entry(entry, 'dyear', 'death year', m.group(2))
            self.check_entry(entry, 'sortkey', 'sort key', m.group(3))
        else:
            # Get birth year from category, if possible.  Group 2 is the
            # sort key given after a pipe, or None.
            m = re.search(r'\[\[[Cc]ategory:(' + self.year_re
                          + r') births(?:\|([^|\]]+))?\]\]', text)
            if m:
                self.check_entry(entry, 'byear', 'birth year', m.group(1))
                self.check_entry(entry, 'sortkey', 'sort key', m.group(2))
            else:
                wikipedia.output(u"%s has no Category:births"
                                 % entry['article'])

            # Get death year from category, if possible.
            m = re.search(r'\[\[[Cc]ategory:(' + self.year_re
                          + r') deaths(?:\|([^|\]]+))?\]\]', text)
            if m:
                self.check_entry(entry, 'dyear', 'death year', m.group(1))
                self.check_entry(entry, 'sortkey', 'sort key', m.group(2))
            else:
                wikipedia.output(u"%s has no Category:deaths"
                                 % entry['article'])

        # Find introductory paragraph.  A year that is still unknown is
        # replaced by the generic year pattern.
        m = re.search(self.intro1_re % (entry.get('byear') or self.year_re,
                                        entry.get('dyear') or self.year_re),
                      text, re.M)
        if m:
            entry['intro'] = m.group(0)
            intro = m.group(3)

            # Birth date available in intro?
            mm = re.search(self.date_re, m.group(1))
            if mm:
                self.check_entry(entry, 'bdate', 'birth date',
                                 self.make_date(mm))

            # Birth date approximate?
            if self.approx_re.search(m.group(1)) and what == 'births':
                entry['exclude'] = True

            # Death date available in intro?
            mm = re.search(self.date_re, m.group(2))
            if mm:
                self.check_entry(entry, 'ddate', 'death date',
                                 self.make_date(mm))

            # Death date approximate?
            if self.approx_re.search(m.group(2)) and what == 'deaths':
                entry['exclude'] = True
        else:
            m = re.search(self.intro2_re, text, re.M)
            if m:
                entry['intro'] = m.group(0)
                intro = m.group(1)
            else:
                # Use first line instead.
                entry['intro'] = text.split('\n')[0]

        # Brief description available?  Prefer one that starts right
        # after the dates; otherwise search the whole intro line.
        mm = None
        if intro:
            mm = (self.desc3_re.match(intro)
                  or self.desc4_re.match(intro))
        mm = (mm or self.desc1_re.search(entry['intro'])
              or self.desc2_re.search(entry['intro'])
              or self.desc3_re.search(entry['intro'])
              or self.desc4_re.search(entry['intro']))
        if mm:
            entry['desc'] = self.unwikify(mm.group(1))

    def get_entries(self, what):
        """Collect the entries for one section of the year page.

        what is 'births' or 'deaths'.  The result combines the entries
        already listed in that section of the year page with one new
        entry for every other article in the category
        "<year> <what>".  Entries whose article is a redirect are
        re-pointed at the redirect target, and every entry is then
        filled in by parse_article.  Returns the entries as a list.
        """
        # Get entries from the section of the year page, keyed by title.
        by_title = {}
        for listed in self.parse_entries(what):
            by_title[listed['article']] = listed

        # Add the category members that are not listed yet.
        category = catlib.Category(self.site, '%s %s' % (self.year, what))
        for page in category.articles():
            title = page.title()
            if title in self.ignore or title in by_title:
                continue
            by_title[title] = {'article': title}

        # Get them all.  (getall presumably preloads the page texts in
        # one request -- confirm against the wikipedia module.)
        for e in by_title.values():
            e['pagelink'] = wikipedia.Page(self.site, e['article'])
        wikipedia.getall(self.site,
                         [e['pagelink'] for e in by_title.values()])

        # Merge redirects.  The loop edits by_title, so it walks a copy
        # of the values.
        for e in list(by_title.values()):
            try:
                e['pagelink'].get()
            except wikipedia.IsRedirectPage as arg:
                target_pl = wikipedia.Page(self.site, arg.args[0])
                target = target_pl.title()
                wikipedia.output("%s redirects to %s"
                                 % (e['article'], target))
                if target in by_title:
                    # The target has an entry of its own: take over its
                    # page object and drop that entry.
                    e['pagelink'] = by_title[target]['pagelink']
                    del by_title[target]
                else:
                    e['pagelink'] = target_pl
                del by_title[e['article']]
                by_title[target] = e
                e['article'] = target
            except wikipedia.NoPage:
                continue

        # Parse articles.
        for e in by_title.values():
            self.parse_article(e, what)
        return list(by_title.values())

    def guess_sortkey(self, article):
        """Guess a "Surname, Forenames" sort key from an article title.

        The last space-separated word is moved to the front:
        'John Smith' gives 'Smith, John'.  A title without a space is
        returned unchanged.
        """
        forenames, space, surname = article.rpartition(' ')
        if not space:
            return article
        return surname + u', ' + forenames

    def sort_entries(self, entries, what):
        for e in entries:
            if what == 'births':
                e['sort'] = e.has_key('bdate') and e['bdate'][1] or e.get('sortkey') or self.guess_sortkey(e['article'])
            elif what == 'deaths':
                e['sort'] = e.has_key('ddate') and e['ddate'][1] or e.get('sortkey') or self.guess_sortkey(e['article'])
            else:
                e['sort'] = e.get('sortkey') or self.guess_sortkey(e['article'])
        entries.sort(key=lambda e: e['sort'])

    def format_entry(self, entry, what):
        """Return the wikitext bullet line for entry.

        The line is built from: a bullet ('* ', or '- ' for an excluded
        entry); the linked birth or death date, according to what; the
        'pre' text; the article link (with 'linktext' if present, or the
        pipe trick when the title ends in a parenthesis); the 'post'
        text or, failing that, ', ' plus 'desc'; and finally the other
        year, as "(died [[YYYY]])" for births and "(born [[YYYY]])" for
        deaths.
        """
        if entry.get('exclude'):
            pieces = [u'- ']
        else:
            pieces = [u'* ']

        # Date of the event the section is about.
        date_field = {'births': 'bdate', 'deaths': 'ddate'}.get(what)
        if date_field is not None and date_field in entry:
            pieces.append(u'[[%s]] - ' % entry[date_field][0])

        pieces.append(entry.get('pre') or u'')

        # Link to the article.
        title = entry['article']
        if 'linktext' in entry:
            pieces.append(u'[[%s|%s]]' % (title, entry['linktext']))
        elif title[-1] == ')':
            # "[[Name (qualifier)|]]" is displayed without the qualifier.
            pieces.append(u'[[%s|]]' % title)
        else:
            pieces.append(u'[[%s]]' % title)

        # Text after the link.
        if 'post' in entry:
            pieces.append(entry['post'])
        elif 'desc' in entry:
            pieces.append(u', ' + entry['desc'])

        # Year of the other event.
        other = {'births': ('dyear', u' (died [[%s]])'),
                 'deaths': ('byear', u' (born [[%s]])')}.get(what)
        if other is not None and other[0] in entry:
            pieces.append(other[1] % entry[other[0]])

        return u''.join(pieces)

    def write_entries(self, entries, what):
        if not self.year_text:
            self.year_text = self.year_pl.get()
        text = self.year_text
        m = re.search(r'==\s*' + what.capitalize()
                      + '\s*==\n((?:\s*\n|\*.*\n)*)',
                      text)
        if not m:
            print "No ==%s==" % what.capitalize()
            return ""
        return (text[:m.start(1)]
                + u'\n'.join(map(lambda e: self.format_entry(e, what),
                                 filter(lambda e: not e.get('exclude'),
                                        entries)))
                + u'\n\n'
                + text[m.end(1):])

    # Text output by the 'h' command of interface().  Notes on how it
    # relates to the code: 's' only ends the current batch -- the page
    # is saved by run() after the diff has been confirmed; interface()
    # also accepts 'w' as a synonym for 's', which is not listed here.
    help_text = u"""
    h - Help
    l - List entries
    v - Preview changes to the page
    s - Save changes to the page
    q - Quit
    /<from>/<to>/ - Edit all entries and save pattern in file
    <n>p - Print entry <n>
    <n>i - Print introductory paragraph for entry <n>
    <n>t - Print whole article text for entry <n>
    <n>x - Exclude entry <n> (or include if already excluded)
    <n>d:<desc> - Update description for entry <n>
    <n>d<m> - Cut description for entry <n> to <m> words
    <n>P:<desc> - Update prefix text for entry <n>
    <n>/<from>/<to>/ - Edit entry <n> using regexp search-and-replace
    """

    def show_entries(self, title, entries, what):
        """Output a heading and then the entries as a numbered list.

        entries is sorted in place first (see sort_entries); the numbers
        start at 1 and are the ones the commands of interface() take.
        """
        wikipedia.output(u'------- %s -------' % title)
        self.sort_entries(entries, what)
        for index, entry in enumerate(entries):
            wikipedia.output(u"%d%s"
                             % (index + 1, self.format_entry(entry, what)))

    def _echo_entry(self, n, entry, what):
        """Output entry, numbered n, as it would appear on the page."""
        wikipedia.output(u"%d%s" % (n, self.format_entry(entry, what)))

    def _entry_post(self, entry):
        """Return the text after the link: 'post', else ', ' + 'desc', else ''."""
        if entry.get('post'):
            return entry['post']
        if 'desc' in entry:
            return u', ' + entry['desc']
        return ''

    def interface(self, title, entries, what):
        """Let the user review and edit one batch of entries.

        The batch is listed under title and commands are then read until
        the user finishes with it; help_text describes the commands.
        Entry numbers are 1-based positions in entries.  Edits are made
        to the entry dictionaries themselves.

        Returns True when the user accepts the batch ('s' or 'w') and
        False when the user quits ('q').

        A malformed regular expression typed by the user is reported and
        ignored rather than being allowed to end the session; a global
        pattern is only saved once it is known to be usable.
        """
        self.show_entries(title, entries, what)
        while 1:
            inp = wikipedia.input(u"-- What now? [hlqs0-9pdtx]")
            # <n><op><m>, <n><op>[:<text>], <n>/<from>/<to>/, /<from>/<to>/
            m1 = re.match(r'^\s*([0-9]+)\s*([A-Za-z])\s*([0-9]+)$', inp)
            m2 = re.match(r'^\s*([0-9]+)\s*([A-Za-z])\s*(:.*)?$', inp)
            m3 = re.match(r'^\s*([0-9]+)\s*' + self.pattern_re, inp)
            m4 = re.match(r'^\s*' + self.pattern_re, inp)
            numbered = m1 or m2 or m3
            if inp == 'l':
                self.show_entries(title, entries, what)
            elif inp == 'q':
                return False
            elif inp == 's' or inp == 'w':
                return True
            elif inp == 'h':
                wikipedia.output(self.help_text)
            elif numbered:
                n = int(numbered.group(1))
                if n < 1 or len(entries) < n:
                    wikipedia.output(u"No entry %d (must be 1-%d)"
                                     % (n, len(entries)))
                    continue
                entry = entries[n-1]
                if m1:
                    if m1.group(2) == 'd':
                        # The text starts with ', ', so the first token
                        # is the comma: keep it plus <m> words.
                        words = self._entry_post(entry).split(' ')
                        entry['post'] = ' '.join(words[:int(m1.group(3)) + 1])
                        self._echo_entry(n, entry, what)
                    else:
                        wikipedia.output(u"Not understood: %s" % inp)
                elif m2:
                    op = m2.group(2)
                    # Text after the colon; empty when there is no
                    # ":<text>" part at all.
                    arg = (m2.group(3) or u':')[1:]
                    if op == 'p':
                        for k, v in entry.items():
                            wikipedia.output(u'  %s: %s' % (k, v))
                    elif op == 'd':
                        if arg:
                            entry['post'] = u', ' + arg
                        else:
                            entry['post'] = ''
                        self._echo_entry(n, entry, what)
                    elif op == 'P':
                        # "<n>P" with no text clears the prefix.
                        entry['pre'] = arg
                        self._echo_entry(n, entry, what)
                    elif op == 't':
                        try:
                            wikipedia.output(entry['pagelink'].get())
                        except Exception:
                            # Best effort: whatever went wrong, there is
                            # no text to show.  KeyboardInterrupt is not
                            # swallowed.
                            wikipedia.output(u"No page %s"
                                             % entry['pagelink'].title())
                    elif op == 'i':
                        wikipedia.output(entry.get('intro', u'No intro'))
                    elif op == 'x':
                        entry['exclude'] = not entry.get('exclude')
                        self._echo_entry(n, entry, what)
                    else:
                        wikipedia.output(u"Not understood: %s" % inp)
                else:
                    try:
                        entry['post'] = re.sub(m3.group(2), m3.group(3),
                                               self._entry_post(entry))
                    except re.error:
                        wikipedia.output(u"Bad pattern: %s" % inp)
                    else:
                        self._echo_entry(n, entry, what)
            elif m4:
                try:
                    # Trial run: rejects a bad pattern or replacement
                    # before it reaches the pattern file, where it would
                    # break every later run.
                    re.sub(m4.group(1), m4.group(2), u'')
                except re.error:
                    wikipedia.output(u"Bad pattern: %s" % inp)
                else:
                    self.patterns.append((m4.group(1), m4.group(2)))
                    self.save_patterns()
                    self.apply_patterns()
            else:
                wikipedia.output(u"Not understood: %s" % inp)

    # Edit summary that run() passes to the year page's put() when it
    # saves the page.
    comment = "yearbot - robot-assisted updating of births and deaths"
    # Sections that run() works through, in this order.  Each name is
    # capitalized to find the section heading and is also the second
    # word of the category read by get_entries ("<year> births").
    topic_names = ['births', 'deaths']

    def run(self):
        """Run the whole interactive session for the year.

        Entries for every topic in topic_names are collected, sorted and
        passed through the saved patterns.  The user then reviews them in
        batches of 20 (see interface); quitting from any batch ends the
        session without saving.  After all batches the diff against the
        current page is shown, and the page is saved if the user answers
        'y'; otherwise the review starts over.
        """
        self.topic_entries = {}
        for what in self.topic_names:
            self.topic_entries[what] = self.get_entries(what)
            self.sort_entries(self.topic_entries[what], what)
        self.apply_patterns()
        while 1:
            for what in self.topic_names:
                entries = self.topic_entries[what]
                # Stepping the range avoids the batch-count division,
                # whose result depends on the Python version.
                for efrom in range(0, len(entries), 20):
                    eto = min(len(entries), efrom + 20)
                    batch = entries[efrom:eto]
                    title = u'%s (%d-%d)' % (what.capitalize(),
                                             efrom + 1, eto)
                    if not self.interface(title, batch, what):
                        return
                self.sort_entries(entries, what)
                new_text = self.write_entries(entries, what)
                # Never replace the working text with an empty result:
                # that would offer to blank the page.
                if new_text:
                    self.year_text = new_text
            wikipedia.showDiff(self.year_pl.get(), self.year_text)
            if wikipedia.input(u"OK? [yN]") == 'y':
                self.year_pl.put(self.year_text, self.comment)
                return

# Script entry point: yearbot.py <year>
if __name__ == '__main__':
    wikipedia.username = 'yearbot'
    try:
        if len(sys.argv) < 2:
            # String exceptions (raise "text") are a TypeError from
            # Python 2.6 on; SystemExit prints the message and exits
            # with a non-zero status.
            raise SystemExit("No year specified")
        Year(sys.argv[1]).run()
    finally:
        # Always executed, including when the year is missing.
        wikipedia.stopme()