User:Gdr/authority.py

#!/usr/bin/python
#
#
#              AUTHORITY.PY -- ADD AUTHORITY TO TAXOBOX
#                           Gdr, 2005-07-05
#
#
# 1. INTRODUCTION
#
# This Python script adds an authority to a selected taxobox on the
# English Wikipedia.
#
#
# 1.1 USAGE
#
#    ./authority.py --rebuild         Rebuild abbreviation table
#    ./authority.py --query=ABBREV    Query abbreviation
#    ./authority.py TAXON             Find authority and add it to taxon
#    ./authority.py TAXON AUTHORITY   Add authority to taxon
#
#
# 1.2 OPTIONS
#
#    -r     --rebuild     Rebuild abbreviation table
#    -q X   --query=X     Query abbreviation
#    -a A   --article=A   Start at article A instead of TAXON
#    -n     --noexpand    Don't expand abbreviations
#    -d     --disambig    Solve disambiguations for abbrevs
#    -z     --noquery     Don't ask for confirmation before making edits
#
#
# 1.3 EXAMPLES
#
#    ./authority.py Magnolia
#    ./authority.py 'Boa constrictor'
#    ./authority.py Quercus L.
#    ./authority.py 'Passer domesticus' '(Linnaeus, 1758)'
#    ./authority.py 'Plasmodium vivax' 'Grassi & Feletti 1890'
#    ./authority.py -a 'Homo (genus)' Homo
#
#
# 1.4 LICENCE
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.

import getopt
import htmlentitydefs
import os
import pickle
import re
import sys
import time
import unicodedata
import urllib
import wikipedia

class Error(Exception):
    def __init__(self, s):
        wikipedia.output(unicode(s))
        self.s = s

    def __repr__(self):
        return self.s

class Authority:

    # 2. CONFIGURATION

    # 2.1 USER CONFIGURATION

    # Which Wikipedia we are editing.
    site = wikipedia.Site('en')

    # 'authfile' is the filename in which the tables of author names and
    # abbreviations will be saved.
    authfile = 'authority.dat'

    # A regular expression that matches an authority and abbreviation in
    # a Wikipedia article. (This is the default; you can override it for
    # particular sources; see below.)
    auth_re = re.compile(ur'^\*[ \']*([\w\'., -]+[\w.])[ \']*'
                         ur' +(?:[-\u2013]|&[nm]dash;) +'
                         ur'\[\[([^\]|]+).*\r?\n', re.M|re.U)

    # 'wiki_abbrev_sources' is a dictionary mapping a code letter to a
    # Wikipedia source for authority abbreviations.  Each source is a
    # dictionary with these keys:
    #
    # name ---- name of the Wikipedia article containing authorities and
    #           their abbreviations
    # re ------ a regular expression matching an authority and its
    #           abbreviation(s). There must be two groups, one for the
    #           abbreviation(s) for that authority and one for the name
    #           of the article about that authority. If omitted, auth_re
    #           is used as the default. Abbreviations are presumed to be
    #           separated by commas.
    # groups -- a tuple giving the group for the abbreviation(s) and the
    #           article; if omitted, (1,2) is the default.
    # fmt ----- format string for a new authority. Use %A for the
    #           abbreviation and %B for the authority.
    # sort ---- How to sort (by 'surname' or by 'abbrev').
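    #
    # For example (illustrative), a line in one of these lists such as
    #
    #     * '''L.''' - [[Carolus Linnaeus]]
    #
    # is matched by the default auth_re, yielding the abbreviation 'L.'
    # and the article title 'Carolus Linnaeus'.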
    wiki_abbrev_sources = {
        'b': {'name': 'List of botanists by author abbreviation',
              'fmt': "* '''%A''' - [[%B]]\n",
              'sort': 'abbrev'},
        'z': {'name': 'List of zoologists by author abbreviation',
              'fmt': "* %A - [[%B]]\n",
              'sort': 'surname'},
        }

    # 'other_abbrev_sources' is a list of other (non-Wikipedia) sources
    # for abbreviations. Each entry is a dictionary with keys:
    #
    # taxon --- a regular expression matching a taxon; means that this
    #           entry is only appropriate for articles contained in taxa
    #           matching this regexp. For example 'Plant' for a source
    #           listing only botanists, or 'Arthropod' for a source
    #           listing only entomologists.
    # re ------ a regular expression matching the abbreviation and its
    #           expansion. %A will be replaced by the regexp-escaped
    #           form of the abbreviation we are looking for.  It should
    #           contain one group, matching the expansion.
    # url ----- the URL to visit to find the abbreviation. %A will be
    #           replaced by the URL-encoded form of the abbreviation we
    #           are looking for.
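    #
    # For example (illustrative), looking up the abbreviation 'L.' with
    # the first source below fetches
    #
    #     http://www.ipni.org/ipni/authorsearch?find_abbreviation=L.&query_type=by_query
    #
    # and searches the result with the 're' pattern, %A having been
    # replaced by re.escape('L.').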
    other_abbrev_sources = [
        {'taxon': 'Plant',
         'url': 'http://www.ipni.org/ipni/authorsearch?find_abbreviation=%A&query_type=by_query',
         're': r'(?u)>%A</a> - (\w(?:&[a-z]+;|[\w.\' -]+)*(?!\d)\w) *[0-9\n]'},
        {'url': 'http://www.ipni.org/ipni/authorsearch?find_surname=%A&query_type=by_query',
         're': r'(?u)>%A</a> - (\w(?:&[a-z]+;|[\w.\' -]+)*(?!\d)\w) *[0-9\n]'},
        ]

    # 'auth_sources' is a list of sources to consult to find the
    # authority for a taxon. Each entry is a dictionary with these keys:
    #
    # taxon --- a regular expression matching a taxon; means that this
    #           entry is only appropriate for articles contained in taxa
    #           matching this regexp. For example 'Plant' for a source
    #           listing only plant names, or 'Coleoptera' for a source
    #           listing only beetles.
    # url ----- the URL to visit to find the taxon. %T will be replaced
    #           by the URL-encoded form of the taxon we are looking
    #           for, and %S by the SN2000 "subject" area.
    # re ------ a regexp for getting the authority.  %T will be replaced
    #           by the regexp-escaped form of the taxon we are looking
    #           for.  It should contain one group, matching the
    #           authority.
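    #
    # For example (illustrative), looking up the species 'Passer
    # domesticus' with the SN2000 source below fetches
    #
    #     http://sn2000.taxonomy.nl/Taxonomicon/TaxonList.aspx?searchBy=ScientificName&subject=Species&search=Passer%20domesticus
    #
    # since the rank 'species' maps to the SN2000 subject 'Species'.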
    auth_sources = [
        {'taxon': 'Plant',
         'url': ('http://www.ipni.org/ipni/plantsearch?'
                 'find_wholeName=%T&query_type=by_query'),
         're': r'<i>%T</i> (.*)</a>'},
        {'url': ('http://sn2000.taxonomy.nl/Taxonomicon/TaxonList.aspx?'
                 'searchBy=ScientificName&subject=%S&search=%T'),
         're': r'<i>%T</i>[^<]*<font size="-1"> *(\(?[^<,]+,? +[0-9]+\)?)'},
        # {'url': ('http://www.itis.usda.gov/servlet/SingleRpt/SingleRpt?'
        #          'search_topic=Scientific_Name&search_value=%T'),
        #  're': (r'(?i)<SPAN CLASS="taxon_head"><I>%T</I></SPAN>'
        #         r'[ \r\n]*<A.*[ \r\n]*<SPAN CLASS="taxon_head">[ \r\n]*'
        #         r'&nbsp;<B>([^<]+)</B></A>')},
        ]


    # 2.2 OTHER CONFIGURATION

    # 'rank_to_subject' is a dictionary mapping Linnaean rank in Latin
    # (as used in Wikipedia taxobox template names) to the SN2000
    # "Subject area" in which a taxon can be looked up. Ranks not listed
    # here are looked up in the subject area "High".
    rank_to_subject = {
        'subspecies': 'Species',
        'species': 'Species',
        'subgenus': 'Genus',
        'genus': 'Genus',
        'tribus': 'Family',
        'subfamilia': 'Family',
        'familia': 'Family',
        'superfamilia': 'Family',
        }

    # If true, don't ask the user easy confirmation questions.
    noquery = False

    def __init__(self):
        for s in self.wiki_abbrev_sources.values():
            s['page'] = wikipedia.Page(self.site, s['name'])
        self.restore_abbreviations()


    # 3. ABBREVIATIONS
    #
    # We want to be able to find abbreviations and turn them into links
    # to the appropriate article. For example, given the abbreviation
    # 'L.' we need to generate the wikitext '[[Carolus Linnaeus|L.]]'.
    # This section includes the code for finding, storing, and updating
    # these abbreviations.


    # 3.1 LOADING AND SAVING ABBREVIATIONS

    # Load abbreviations from disk.
    def restore_abbreviations(self):
        self.abbrev = {}
        if os.path.isfile(self.authfile):
            f = open(self.authfile, 'r')
            if f:
                self.abbrev = pickle.load(f)
                f.close()

    # Save abbreviations to disk.
    def save_abbreviations(self):
        f = open(self.authfile, 'w')
        pickle.dump(self.abbrev, f)
        f.close()

    def unhtmlify(self, s):
        s = s.decode('iso-8859-1')
        while True:
            m = re.search(r'&([a-z]+);', s)
            if not m:
                break
            s = s[:m.start(0)] \
                 + unichr(htmlentitydefs.name2codepoint[m.group(1)]) \
                 + s[m.end(0):]
        return s


    # Normalize the unicode string 's' into ASCII. The idea is to store
    # the authority Lacépède under the key 'Lacepede' so that
    # inconsistent accentuation doesn't cause us to miss an
    # abbreviation. We decompose all composed characters and then ignore
    # everything non-ASCII. (This converts eacute->e etc.)
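    #
    # For example (illustrative):
    #
    #     normalize(u'Lacépède')   # -> 'Lacepede'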
    def normalize(self, s):
        return unicodedata.normalize('NFD', unicode(s)).encode('ascii', 'ignore')

    # Add an abbreviation to the table. 'abbrev' is the abbreviation;
    # 'article' is the title of the Wikipedia article on that authority;
    # 'code' is the code for the list from which it came, if any.
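    #
    # For example (illustrative),
    #
    #     self.add_abbreviation(u'L.', u'Carolus Linnaeus', 'b')
    #
    # stores the entry ('L.', 'Carolus Linnaeus', 'b') under the
    # normalized key 'L.'.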
    def add_abbreviation(self, abbrev, article, code = None):
        key = self.normalize(abbrev)
        if key not in self.abbrev:
            self.abbrev[key] = []
        for a in self.abbrev[key]:
            # Do we already have this authority under this abbreviation?
            if abbrev == a[0] and article == a[1]:
                return
        self.abbrev[key].append((abbrev, article, code))


    # 3.2 USER INTERFACE FOR ADDING A NEW ABBREVIATION

    # If we don't find an abbreviation in any of wiki_abbrev_sources, we can
    # prompt the user to tell us the article title corresponding to the
    # abbreviation; then we can add it to the relevant source.

    # Return the normalized surname of the abbreviation.
    def surname(self, abbrev):
        m = re.search(r'(?ui)(?:de |von |d\')?[\w-]+\.?$',
                      self.normalize(abbrev))
        if m:
            return m.group(0)
        else:
            wikipedia.output(u"No surname for %s" % abbrev)
            return 'a'

    # 'abbrev' is the abbreviation for the authority described at
    # 'article'. Add this to the source given by 'code'.
    def add_abbreviation_to_source(self, abbrev, article, code):
        source = self.wiki_abbrev_sources[code]
        text = source['page'].get()
        if source['sort'] == 'surname':
            sortkey = self.surname(abbrev)
        else:
            sortkey = abbrev
        groups = source.get('groups', (1,2))

        # Format authority for insertion into the source.
        fmt = source['fmt']
        fmt = re.sub('%A', abbrev, fmt)
        if article[-1] == ')':
            # Pipe the link for disambiguated titles like 'Homo (genus)'.
            fmt = re.sub('%B', article + '|', fmt)
        else:
            fmt = re.sub('%B', article, fmt)

        # Go through abbreviations in the source until we get to the
        # appropriate point in alphabetical order by surname.
        for m in re.finditer(source.get('re', self.auth_re), text):
            newtext = None
            if source['sort'] == 'surname':
                s2 = self.surname(m.group(groups[0]))
            else:
                s2 = m.group(groups[0])
            if sortkey[0] != s2[0]:
                # Sort keys not in the same letter of the alphabet.
                continue
            elif sortkey < s2:
                # New abbrev goes before this one.
                newtext = text[:m.start(0)] + fmt + text[m.start(0):]
            elif re.match(r'(?: *\r?\n)*==', text[m.end(0):]):
                # We've reached the end of the section for the right
                # letter, but not found anywhere to put the new
                # abbrev. So it goes at the end.
                newtext = text[:m.end(0)] + fmt + text[m.end(0):]
            else:
                continue
            # Found a place for it.
            wikipedia.showDiff(source['page'].get(), newtext)
            if wikipedia.input(u'OK? [yN]') == 'y':
                source['page'].put(newtext, 'nomialbot - adding %s = %s'
                                   % (abbrev, article))
            return
        wikipedia.output(u'Sorry, nowhere to put authority %s' % fmt)

    # 'abbrev' is the abbreviation for the authority described at
    # 'article'. Ask the user which source to add it to.
    def user_add_abbreviation(self, abbrev, article):
        for code, source in self.wiki_abbrev_sources.items():
            wikipedia.output(u'(%s) %s' % (code, source['name']))
        if self.noquery:
            inp = None
        else:
            inp = wikipedia.input(u"Add abbreviation %s = %s to which source? [%s]"
                                  % (abbrev, article,
                                     ''.join(self.wiki_abbrev_sources.keys())))
        if inp in self.wiki_abbrev_sources:
            self.add_abbreviation(abbrev, article, inp)
            self.save_abbreviations()
            self.add_abbreviation_to_source(abbrev, article, inp)
        else:
            self.add_abbreviation(abbrev, article)
            self.save_abbreviations()


    # 3.3 FINDING EXPANSIONS FOR ABBREVIATIONS

    # Rebuild table of authorities from the Wikipedia articles listed in
    # 'wiki_abbrev_sources'.
    def rebuild_abbreviations(self):
        wikipedia.getall(self.site,
                         map(lambda l: l['page'], self.wiki_abbrev_sources.values()))
        for code, s in self.wiki_abbrev_sources.items():
            for m in re.finditer(s.get('re', self.auth_re), s['page'].get()):
                groups = s.get('groups', (1,2))
                abbrevs = m.group(groups[0])
                pagename = m.group(groups[1])
                for a in re.split(r', +', abbrevs):
                    self.add_abbreviation(a, pagename, code)
        self.save_abbreviations()

    # User interface for finding an abbreviation using the stored
    # abbrevs, returning the pair (abbrev, expansion) or None.
    def find_abbreviation_in_store(self, abbrev):
        key = self.normalize(abbrev)
        if key in self.abbrev:
            if len(self.abbrev[key]) == 1:
                return self.abbrev[key][0]
            for i in range(len(self.abbrev[key])):
                wikipedia.output(u'(%d) %s' % (i + 1, self.abbrev[key][i][1]))
            while True:
                i = wikipedia.input(u"Which authority? [1-%d]"
                                    % len(self.abbrev[key]))
                if re.match(r'[0-9]+$', i) \
                    and int(i) - 1 in range(len(self.abbrev[key])):
                    break
            return (abbrev, self.abbrev[key][int(i) - 1][1])
        return None

    # Find abbreviation using 'other_abbrev_sources', returning the pair
    # (abbrev, expansion) or None.
    def find_abbreviation_other(self, abbrev):
        # TODO: check source[taxon]
        for source in self.other_abbrev_sources:
            url = re.sub('%A', urllib.quote(abbrev), source['url'])
            wikipedia.output(u'Trying %s' % url)
            f = urllib.urlopen(url)
            r = re.sub('%A', re.escape(abbrev), source['re'])
            m = re.search(r, f.read())
            f.close()
            if m:
                e = self.unhtmlify(m.group(1))
                self.user_add_abbreviation(abbrev, e)
                return (abbrev, e)
        return None

    # User interface for finding an abbreviation using Wikipedia,
    # returning the pair (abbrev, expansion) or None.
    def find_abbreviation_wiki(self, abbrev):
        # See if there's a Wikipedia page for the abbrev.
        pl = wikipedia.Page(self.site, abbrev)
        if not pl.exists():
            expansions = []
        elif pl.isRedirectPage():
            expansions = [wikipedia.Page(self.site, pl.getRedirectTarget())]
        elif pl.isDisambig():
            expansions = pl.linkedPages()
        else:
            expansions = []
        for i in range(len(expansions)):
            wikipedia.output(u'(%d) %s' % (i + 1, expansions[i].title()))
        while True:
            if expansions:
                inp = wikipedia.input(u'Expansion for %s? [1-%d;aeclq]'
                                      % (abbrev, len(expansions)))
            else:
                inp = wikipedia.input(u'Expansion for %s? [aecq]'
                                      % abbrev)
            if inp == 'a':
                abbrev = wikipedia.input(u'Enter new abbrev:')
                return self.find_abbreviation(abbrev)
            elif inp == 'e':
                expansion = wikipedia.input(u'Enter expansion for %s:'
                                            % abbrev)
                self.user_add_abbreviation(abbrev, expansion)
                return (abbrev, expansion)
            elif re.match(r'[0-9]+$', inp) \
                  and int(inp) - 1 in range(len(expansions)):
                expansion = expansions[int(inp) - 1].title()
                self.user_add_abbreviation(abbrev, expansion)
                return (abbrev, expansion)
            elif inp == 'c':
                return None
            elif inp == 'q':
                raise Error, "Quit requested"
            elif inp == 'l':
                for i in range(len(expansions)):
                    wikipedia.output(u'(%d) %s' % (i + 1, expansions[i].title()))
            else:
                wikipedia.output(
                    u'<number> = choose expansion;\n'
                    u'a = enter new abbreviation\n'
                    u'e = enter expansion\n'
                    u'c = continue (with no expansion for abbreviation)\n'
                    u'l = list expansions\n'
                    u'q = quit\n')

    # Find expansion for abbreviation using all available methods,
    # returning the pair (abbrev, expansion) or just abbrev if nothing
    # found.
    def find_abbreviation(self, abbrev):
        if abbrev:
            return self.find_abbreviation_in_store(abbrev) \
                    or self.find_abbreviation_other(abbrev) \
                    or self.find_abbreviation_wiki(abbrev) \
                    or (abbrev,)
        else:
            return ('',)

    def wikify_abbreviation(self, expansion):
        if 2 <= len(expansion):
            return u'[[%s|%s]]' % (expansion[1], expansion[0])
        else:
            return expansion[0]


    # 4. FINDING THE AUTHORITY FOR A TAXON

    # 'format_authority' takes an 'authority', splits it into its
    # component authorities, makes wikilinks for those components, and
    # returns a wikitext string.
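    #
    # For example (illustrative, assuming 'Linnaeus' is already in the
    # abbreviation table), format_authority(u'(Linnaeus, 1758)') splits
    # the string into abbreviations ['', 'Linnaeus', '', ''] and
    # separators ['(', ', 1758', ')'], then reassembles them as
    #
    #     ([[Carolus Linnaeus|Linnaeus]], 1758)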
    def format_authority(self, authority):
        r = re.compile(r'^\(|, +[0-9]*| +[0-9]+| +in +| +and +|'
                       r' *\bex\.? +| +& +| +&amp; +|\) *|'
                       r' +et al\.?')
        abbrevs = r.split(authority)
        joins = r.findall(authority)
        expansions = map(self.wikify_abbreviation,
                         map(self.find_abbreviation, abbrevs))
        return ''.join(x + y for x, y in zip(expansions, joins + ['']))

    # 'find_authority' returns the authority for the given taxon. 'text'
    # is the text of the Wikipedia article about that taxon.
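    #
    # When building the search regexp, each space in the taxon is allowed
    # to match '</i> <i>' as well, so that (for example) the binomial
    # 'Passer domesticus' is still found when a source marks up the genus
    # and the epithet as separate italic spans.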
    def find_authority(self, taxon, text):
        rank = self.rank_of_taxon(taxon, text)
        subject = self.rank_to_subject.get(rank, 'High')
        for source in self.auth_sources:
            if 'taxon' in source and not \
                re.search(r'(?m)^\| [a-z_]+ *= *\[\[%s' % source['taxon'], text):
                continue
            url = re.sub('%T', urllib.quote(taxon), source['url'])
            url = re.sub('%S', subject, url)
            url = re.sub('%R', rank, url)
            wikipedia.output(u'Trying %s' % url)
            f = urllib.urlopen(url)
            r = re.sub('%T',
                       re.sub(r'\\? +', r'(?: +|</i> +<i>)', re.escape(taxon)),
                       source['re'])
            m = re.search(r, f.read())
            f.close()
            if m:
                return self.unhtmlify(m.group(1))
        wikipedia.output(u'No authority found for %s' % taxon)
        return None


    # 5. UPDATING THE AUTHORITY FOR AN ARTICLE

    kingdom_map = {
        'Plant': 'Plantae',
        'Animal': 'Animalia',
        'Bacterium': 'Bacteria',
        'Fungus': 'Fungi',
        'Protist': 'Protista',
        }

    # 'kingdom' returns the kingdom from the taxobox's regnum entry,
    # mapping vernacular link targets to the Latin name: for example, a
    # line like '| regnum = [[Plant]]ae' yields 'Plant', which
    # 'kingdom_map' turns into 'Plantae'.
    def kingdom(self, text):
        m = re.search(r'(?m)^\| *regnum *= *\[\[([^\|\]]+)', text)
        if m:
            return self.kingdom_map.get(m.group(1), m.group(1))
        else:
            raise Error, "No kingdom found."

    # 'rank_of_taxon' returns the Linnaean rank of 'taxon': a two-word
    # name is a species, a three-word name a subspecies; otherwise the
    # rank is read from the taxobox parameter whose value names the
    # taxon (e.g. '| genus = ...' gives 'genus').
    def rank_of_taxon(self, taxon, text):
        if re.match(r'^[\w-]+ [\w-]+ [\w-]+$', taxon):
            return 'subspecies'
        elif re.match(r'^[\w-]+ [\w-]+$', taxon):
            return 'species'
        m = re.search(r'(?m)^\| *((?!name)[a-z_]+) *= *'
                      r'[ \']*\[*%s[^\w]\]*[ \']*$' % re.escape(taxon), text)
        if not m:
            raise Error, "Can't find taxon %s in taxobox" % taxon
        return m.group(1)

    kingdom_to_color = {
        'Animalia': 'pink',
        'Plantae': 'lightgreen',
        'Fungi': 'lightblue',
        'Archaea': 'darkgray',
        'Protista': 'khaki',
        'Bacteria': 'lightgrey',
        }

    # 'find_article' takes the name of an article to start looking at,
    # and returns a Page object.
    def find_article(self, article):
        while True:
            pl = wikipedia.Page(self.site, article)
            if not pl.exists():
                wikipedia.output(u"No page %s" % pl.title())
                i = wikipedia.input(u"Redirect to:")
                if not i:
                    raise Error, "Quit requested"
                pl.put(u"#REDIRECT [[%s]]" % i,
                       u"nomialbot - redirecting scientific name %s to %s"
                       % (article, i))
                article = i
            elif pl.isRedirectPage():
                article = pl.getRedirectTarget()
            elif pl.isDisambig():
                links = pl.linkedPages()
                for i in range(len(links)):
                    wikipedia.output(u'(%d) %s' % (i + 1, links[i]))
                inp = wikipedia.input(u'Choose which article? [1-%d]'
                                      % len(links))
                if re.match(r'[0-9]+$', inp) \
                    and int(inp) - 1 in range(len(links)):
                    article = links[int(inp) - 1].title()
                else:
                    raise Error, "Quit requested"
            else:
                return pl

    # 'add_authority_to_article' takes a Page object, a taxon and an
    # authority. It adds the authority to that page.
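    #
    # For example (illustrative), for a species article the authority is
    # added as a new '| binomial_authority = ...' line immediately after
    # the existing '| binomial = ...' line in the taxobox.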
    def add_authority_to_article(self, pl, taxon, authority, expand = True):
        text = pl.get()
        text = self.tidy_taxobox(text)
        if expand:
            authority = self.format_authority(authority)
        rank = self.rank_of_taxon(taxon, text)
        kingdom = self.kingdom(text)
        if rank == 'species':
            test_param = 'binomial'
            auth_param = 'binomial_authority'
        elif rank == 'subspecies':
            test_param = 'trinomial'
            auth_param = 'trinomial_authority'
        else:
            test_param = rank
            auth_param = rank + '_authority'
        m = re.search('(?m)^\| *%s *=.*$' % re.escape(test_param), text)
        if not m:
            raise Error, "Can't find rank %s in %s" % (test_param, pl.title())
        m1 = re.search(r'(?m)^\| *%s *= *(.*)' % re.escape(auth_param), text)
        if not m1:
            text = (text[:m.end(0)]
                    + u'\n| %s = %s' % (auth_param, authority)
                    + text[m.end(0):])
        elif wikipedia.input(u'%s already has authority "%s". '
                             u'Replace? [yN]' % (taxon, m1.group(1))) == 'y':
            text = (text[:m1.start(0)]
                    + u'\n| %s = %s' % (auth_param, authority)
                    + text[m1.end(0):])
        wikipedia.showDiff(pl.get(), text)
        if pl.get() != text and (self.noquery or (wikipedia.input(u"OK? [yN]") == 'y')):
            pl.put(text, u'nomialbot - adding authority for %s %s'
                   % (taxon, authority))

    def add_authority(self, article, taxon, authority, expand = True):
        pl = self.find_article(article)
        if pl:
            self.add_authority_to_article(pl, taxon, authority, expand)

    def find_and_add_authority(self, article, taxon, expand = True):
        pl = self.find_article(article)
        if not pl:
            return
        authority = self.find_authority(taxon, pl.get())
        if authority:
            self.add_authority_to_article(pl, taxon, authority, expand)


    # 6. GENERAL TIDYING

    subs = [
        # Capitalize "Taxobox"
        (r'{{taxobox', '{{Taxobox'),

        # Italicise genus entry.
        (r'(?m)^\| * genus *=[ \']*\[\[([^\]]+)\]\][ \']*$',
         '| genus = \'\'[[\\1]]\'\''),

        # Abbreviate genus in species entry.
        (r'(?m)^\| *species *= *([\']*)([A-Z])[a-z]+ ([a-z]+)',
         r'| species = \1\2. \3'),

        # Supply missing genus abbrev in species entry.
        (r'(?m)^(\| *genus *=[ \'\[]*([A-Z])[a-z]+[\] \']* *\n'
         r'\| *species *=[ \']*)([a-z-]+[ \']*$)',
         r'\1\2. \3'),

        # Supply missing species entry.
        (r'(?m)(^\| *genus *=.*\n)'
         r'(\| * binomial *= *'
         r'([A-Z])[a-z]+ ([a-z-]+))',
         r"\1| species = '''''\3. \4'''''\n\2"),

        # Italicise genus or species if it appears as the title.
        (r'(?ms)^\| *name *= *([a-z -]+[a-z]) *(\n.*'
         r'^\| *(?:genus|species) *=[ \'\[]*\1[ \'\]]*$)',
         '| name = \'\'\\1\'\'\\2'),

        # Bold genus if unlinked.
        (r'(?m)^\| *genus *= *\'*(\w+)\'* *$',
         "| genus = '''''\\1'''''"),

        # Cut superfluous taxa.
        (r'(?m)(?:^\| *(?!(?:regnum|phylum|divisio|classis|ordo|familia|genus|species))'
         r'(?:super|sub|infra|nano)(?:regnum|phylum|divisio|classis|ordo|familia|genus|species) *=.*\n)+'
         r'(^\| *(?:regnum|phylum|divisio|classis|ordo|familia|genus|species)'
         r' *=.*\n)'
         r'(?=^\| *[a-z]+ *=.*$)',
         r'\1'),
        ]
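
    # For example (illustrative), the "Abbreviate genus in species entry"
    # substitution above rewrites
    #
    #     | species = Boa constrictor
    #
    # as
    #
    #     | species = B. constrictor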

    conditional_subs = [
        # Bold species entry if subject of article.
        ([r'(?m)^\| *binomial *='],
         r'(?m)^\| *species *=[ \']*([^\]\'\}]+)[ \']*$',
         '| species = \'\'\'\'\'\\1\'\'\'\'\''),

        # Bold subspecies entry if subject of article.
        ([r'(?m)^\| *trinomial *='],
         r'(?m)^\| *subspecies *=[ \']*([^\]\'\}]+)[ \']*$',
         '| subspecies = \'\'\'\'\'\\1\'\'\'\'\''),
        ]

    anticonditional_subs = [
        # Supply missing binomial entry.
        ([r'(?m)^\| *binomial *=',
          r'(?m)^\| *subspecies *='],
         r'(?m)(^\| *genus *=[ \'\[]*([A-Z])([a-z]+)[ \'\]]*\n(?:.*\n)*'
         r'(?m)^\| *species *=[ \']*\2. ([a-z-]+)[ \']*\n)',
         r"\1| binomial = ''\2\3 \4''\n"),

        ([r'(?m)^\| *binomial *=',
          r'(?m)^\| *subspecies *='],
         r'(?m)(^\| *species *=[ \']*([A-Z][a-z]+ [a-z-]+)[ \']*\n)',
         r"\1| binomial = ''\2''\n"),
        ]

    def tidy_taxobox(self, text):
        for s in self.subs:
            text = re.sub(s[0], s[1], text)
        for s in self.conditional_subs:
            if all(re.search(c, text) for c in s[0]):
                text = re.sub(s[1], s[2], text)
        for s in self.anticonditional_subs:
            if not any(re.search(c, text) for c in s[0]):
                text = re.sub(s[1], s[2], text)

        # Add FishBase reference.
        if re.search(r'(?m)^\| *[a-z_]+ *= *'
                     r'\[\[(?:Actinopterygii|Chondrichthyes)\]\]$', text) \
            and not re.search(r'{{FishBase', text):
            m1 = re.search(r'(?m)^\| * genus *=[ \'\[]*'
                          r'([A-Z][a-z]+)[ \'\]]*$', text)
            m2 = re.search(r'(?m)^\| species *=[ \']*'
                          r'(?:[A-Z]\. )?([a-z-]+)[ \']*$', text)
            if m1 and m2:
                ref = time.strftime('{{FishBase species | genus = %s | '
                                    'species = %s | month = %%B | year = %%Y}}'
                                    % (m1.group(1), m2.group(1)))
            elif m1:
                ref = time.strftime('{{FishBase genus | genus = %s | '
                                    'month = %%B | year = %%Y}}'
                                    % m1.group(1))
            else:
                ref = None
            if ref:
                m1 = re.search(r'==+ *References? *==+ *\n+', text)
                m2 = re.search(r'(?:(?:{{.*-stub}}|\[\[[a-z][a-z]:.*\]\]'
                               r'|\[\[Category:.*\]\])[ \n]*)*$',
                               text)
                if m1:
                    text = text[:m1.end(0)] \
                            + '* ' + ref + '\n' \
                            + text[m1.end(0):]
                elif m2:
                    text = text[:m2.start(0)] \
                            + '\n==References==\n* ' + ref + '\n' \
                            + text[m2.start(0):]
                else:
                    raise Error, "Nowhere to put FishBase reference"
        return text


    # 7. DISAMBIGUATION

    # Run solve_disambiguation on all botanical abbreviations.
    def disambiguate(self):
        import solve_disambiguation
        for a in self.abbrev.values():
            for aa in a:
                if aa[2] == 'b' and aa[0][-1] == '.':
                    bot = solve_disambiguation.DisambiguationRobot(
                        '0', [aa[1]], False, False, [aa[0]], False, True)
                    bot.run()


def badusage():
    raise Error, ('Usage:\n'
                  '%s --rebuild         Rebuild abbreviation table\n'
                  '%s --query=abbrev    Query abbreviation\n'
                  '%s taxon             Find authority and add it to taxon\n'
                  '%s taxon authority   Add authority to taxon\n'
                  % (sys.argv[0], sys.argv[0], sys.argv[0], sys.argv[0]))

def main():
    wikipedia.username = 'nomialbot'
    try:
        auth = Authority()
        article = None
        expand = True
        try:
            opts, args = getopt.getopt(sys.argv[1:], 'zdnra:q:',
                                       ['noexpand', 'rebuild', 'article=',
                                        'query=', 'disambig', 'noquery'])
            for o, a in opts:
                if o in ('-q', '--query'):
                    print auth.find_abbreviation(a.decode())
                elif o in ('-r', '--rebuild'):
                    auth.rebuild_abbreviations()
                elif o in ('-d', '--disambig'):
                    auth.disambiguate()
                elif o in ('-a', '--article'):
                    article = a
                elif o in ('-n', '--noexpand'):
                    expand = False
                elif o in ('-z', '--noquery'):
                    auth.noquery = True
                else:
                    badusage()
                    return
        except getopt.GetoptError:
            badusage()
            return
        if len(args) == 1:
            auth.find_and_add_authority(article or args[0], args[0], expand)
        elif len(args) == 2:
            auth.add_authority(article or args[0], args[0], args[1], expand)
        else:
            badusage()
            return
    except Error:
        return

if __name__ == '__main__':
    try:
        main()
    finally:
        wikipedia.stopme()