Benutzer:Christoph Burgmer/topiclist.py

Dieses Tool soll dabei helfen, die Themenlisten der Portale zu verwalten und mit Artikeln aus den passenden Kategorien zu ergänzen.
Dazu braucht es ein wenig geübten Umgang mit dem Computer und Python.
Bisher können eine bestehende Liste sortiert und neue Artikel über Catscan hinzugefügt werden.
TODO (sehr unwahrscheinlich, dass diese Punkte erledigt werden):
Überprüfung der Links auf BKLen oder Weiterleitungen
Späteres Entfernen automatisch hinzugefügter Artikel
Aufsetzung auf den Toolserver mit idiotensicherer Handhabung
#!/usr/bin/python
# -*- coding: utf8 -*-

"""
2008 Christoph Burgmer ([[de.wikipedia.org/wiki/User:Christoph Burgmer]])

This script helps to maintain a list of topics of Wikipedia articles.
Features:
    - Sorting of entries grouped by sections (default: alphabetically)
    - Parsing and cleaning of different entry types (see examples below)
        Types understood (all need language specific adaption)
        - normal entry (e.g. [[Acht Unsterbliche]])
        - redirect (e.g. [[Baxian]] (→ [[Acht Unsterbliche]]))
        - disambiguation (e.g. [[Tai Chi]] (Begriffsklärung))
        - reverse name (e.g. Watts, Alan ([[Alan Watts]]))
    - Merging with article lists generated by catscan
        ([http://tools.wikimedia.de/~daniel/WikiSense/CategoryIntersect.php])
    - Adding of reverse names

The script takes the following input:

    1. Existing topic list (might be empty). Format:
        == A ==
        [[Acht Unsterbliche]]
        [[Alan Watts]]
        == B ==
        [[Baxian]] (→ [[Acht Unsterbliche]])
        == T ==
        [[Tai Chi]] (Begriffsklärung)
        == W ==
        Watts, Alan ([[Alan Watts]])

    2. Optional CSV list generated by catscan:
        0	Acht_Schulen_des_Daoismus	Daoismus		3436616
        0	Acht_Unsterbliche	Daoismus		295493
        0	Alan_Watts	Person_des_Daoismus		278493

    3. Optional prefix:
        Entries beginning with this string are additionally listed under
        their reversed name, e.g. 'Sexual-Praktiken, Taoistische' for
        'Taoistische Sexual-Praktiken' with prefix 'Tao'

License: MIT License

Copyright (c) 2008 Christoph Burgmer
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""

import re
import sys
import codecs
import unicodedata

CATEGORIES = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
# Main sections of the list. An entry is filed under the section that equals
#   the (lower-cased) first character of its name.
CATEGORY_OTHER = '0-9'
# Section for entries that don't fit into any of the sections above.

# All patterns below are raw unicode literals (Python 2 `ur` prefix) and are
#   applied with `.match`, i.e. anchored at the start of the line. Each entry
#   pattern tolerates an optional trailing ' -' separator.

SIMPLE_REGEX = re.compile(ur'\[\[([^\]]+)\]\]\s*-?\s*$')
# Plain link; group 1 is the article name.
# Schema: [[Anshan (China)]] -

REDIRECT_REGEX = re.compile(\
    ur'\[\[([^\]]+)\]\]\s*\(→?\s*\[\[([^\]]+)\]\]\)\s*-?\s*$')
# Redirect; group 1 is the redirect name, group 2 the target article.
# Schema: [[Chang E]] (→ [[Chang'e]]) -

BKL_REGEX = re.compile(\
    ur'\[\[([^\]]+)\]\]\s*\((?:BKL|Begriffsklärung)\)\s*-?\s*$')
# Disambiguation page ("BKL" / "Begriffsklärung"); group 1 is the page name.
# Schema: [[Chang'an]] (Begriffsklärung) -

NAME_REGEX = re.compile(ur'([^(]+)\s*\(?→?\s*\[\[([^\]]+)\]\]\)?\s*-?\s*$')
# Unlinked (reversed) name followed by a link; group 1 is the name text,
#   group 2 the linked article.
# Schema: Cantlie, Sir James ([[James Cantlie]]) -

# Lines that cannot be handled: either a piped link ([[target|label]]) at the
#   start of the line, or two links on one line with a '-' between them
#   (i.e. two entries on a single line).
ERRORNEOUS_REGEX = re.compile(ur'\[\[[^\]]*\|[^\]]*\]\]|' \
    + ur'(\[\[([^\]]+)\]\][^-]+-.*\[\[([^\]]+)\]\].*-?)')

# Lines to skip silently: section headings (a line starting with '=' and
#   ending with '=') and lines consisting of whitespace only.
IGNORE_REGEX = re.compile(ur'(=.*=\s*$)|(\s*$)')

def _reportLine(line, problem):
    """Write a diagnostic about one input line to stderr, encoded as UTF-8.

    The bytes go to the binary ``buffer`` of ``sys.stderr`` when it has one
    and to ``sys.stderr`` itself otherwise.
    """
    message = u'line: "' + line + u'" ' + problem + u'\n'
    getattr(sys.stderr, 'buffer', sys.stderr).write(message.encode('utf8'))


def readTable(fileHandle):
    """Parse the lines of a topic list into entry tuples.

    fileHandle is an iterable of unicode lines. Every line is stripped of
    surrounding whitespace before it is matched, so an indented entry is
    recognised like an unindented one and the line terminator never ends up
    in a diagnostic message.

    Lines matching IGNORE_REGEX (headings, blank lines) are skipped silently.
    Lines matching ERRORNEOUS_REGEX and lines matching no entry pattern are
    reported on stderr and skipped.

    Returns a list of (name, target, type) tuples where type is one of
    "Simple", "Redirect", "BKL" or "Name"; target is None for "Simple" and
    "BKL" entries.
    """
    # Entry patterns in order of precedence: (pattern, type label, whether
    # group 2 of the match holds a target article).
    entryPatterns = (
        (SIMPLE_REGEX, "Simple", False),
        (REDIRECT_REGEX, "Redirect", True),
        (BKL_REGEX, "BKL", False),
        (NAME_REGEX, "Name", True),
    )

    entries = []
    for rawLine in fileHandle:
        # The patterns are anchored at the line start; without stripping, a
        # leading blank makes NAME_REGEX capture an empty name.
        line = rawLine.strip()
        if IGNORE_REGEX.match(line):
            continue
        if ERRORNEOUS_REGEX.match(line):
            _reportLine(line, u'errorneous')
            continue
        for regex, typ, hasTarget in entryPatterns:
            matchObj = regex.match(line)
            if matchObj is None:
                continue
            target = None
            if hasTarget:
                target = matchObj.group(2).strip()
            entries.append((matchObj.group(1).strip(), target, typ))
            break
        else:
            # no entry pattern matched
            _reportLine(line, u'not parsed')
    return entries

def formatEntry(entry):
    """Render one list entry as wiki markup.

    entry is an (a, b, typ) tuple. Depending on typ the result is:
        'Simple'   -> [[a]]
        'Redirect' -> [[a]] (→ [[b]])
        'BKL'      -> [[a]] (Begriffsklärung)
        'Name'     -> a ([[b]])

    Raises ValueError for any other typ instead of silently returning None.
    """
    # Unpack in the body: tuple parameters in the signature are not valid
    # syntax on Python 3.
    a, b, typ = entry
    if typ == 'Simple':
        return '[[' + a + ']]'
    if typ == 'Redirect':
        return '[[' + a + ']]' + u' (→ [[' + b + ']])'
    if typ == 'BKL':
        return '[[' + a + ']]' + u' (Begriffsklärung)'
    if typ == 'Name':
        return a + ' ([[' + b + ']])'
    raise ValueError("unknown entry type: %r" % (typ,))
def _toUnicode(value):
    """Return value as a unicode string, decoding byte strings as UTF-8."""
    if isinstance(value, type(u'')):
        return value
    return value.decode('utf8')


def _writeLine(stream, text):
    """Write text followed by a newline to stream, encoded as UTF-8.

    The bytes go to the stream's binary ``buffer`` attribute when it has one
    and to the stream itself otherwise.
    """
    data = (text + u"\n").encode('utf8')
    getattr(stream, 'buffer', stream).write(data)


def _checkDoubleEntries(entries):
    """Warn on stderr about entries listed twice and collect the names seen.

    Returns (have, haveLower): the set of entry names and the set of their
    lower-cased forms.
    """
    have = set()
    haveLower = set()
    for a, _, _ in entries:
        if a in have:
            _writeLine(sys.stderr, "warning: article \"" + a
                + "\" is double entry in original list")
            continue
        have.add(a)
        if a.lower() in haveLower:
            _writeLine(sys.stderr, "warning: article \"" + a
                + "\" in differerent upper/lower case combination "
                + "included in original list")
        else:
            haveLower.add(a.lower())
    return have, haveLower


def _mergeCatscan(entries, catscanPath, have, haveLower):
    """Append articles from a catscan export that are not in the list yet.

    Only rows whose first column is '0' (article namespace) are used; the
    second column is the article title with '_' in place of blanks. New
    articles are appended to entries as 'Simple' entries and recorded in
    have / haveLower. Warnings go to stderr for articles that are present in
    a different case, as a redirect, or as an unlinked name.
    """
    import csv

    redir = set()  # articles already included as redirect
    nonLink = set()  # names (i.e. inverted) without link
    for a, _, typ in entries:
        if typ == 'Redirect':
            redir.add(a.lower())
        elif typ == 'Name':
            nonLink.add(a.lower())

    catscanFile = open(catscanPath, "r")
    try:
        try:
            fileDialect = csv.Sniffer().sniff(catscanFile.readline())
            fileDialect.delimiter = "\t"
        except csv.Error:
            # Empty or unsniffable first line: the delimiter is forced to
            # tab anyway, so fall back to the stock tab dialect.
            fileDialect = csv.excel_tab
        catscanFile.seek(0)
        for row in csv.reader(catscanFile, dialect=fileDialect):
            # skip blank/short rows and everything outside article space
            if len(row) < 2 or row[0] != '0':
                continue
            # csv yields undecoded byte strings under Python 2
            article = _toUnicode(row[1]).replace('_', ' ')
            lowered = article.lower()
            if lowered not in haveLower:
                entries.append((article, None, 'Simple'))
                have.add(article)
                haveLower.add(lowered)
            elif article not in have:
                _writeLine(sys.stderr, "warning: article '" + article
                    + "' in differerent upper/lower case combination included")
            if lowered in redir:
                _writeLine(sys.stderr, "warning: article '" + article
                    + "' already included as redirect")
            if lowered in nonLink:
                _writeLine(sys.stderr, "warning: article '" + article
                    + "' included as non link")
    finally:
        catscanFile.close()


def _addReversedNames(entries, prefix, haveLower):
    """Append 'rest, first word' name entries for entries starting with prefix.

    The new 'Name' entry links to the redirect target for 'Redirect' entries
    and to the entry itself otherwise. Entries consisting of a single word
    are skipped, as are reversed names already contained in haveLower.
    """
    # Iterate over a snapshot because the loop appends to entries.
    for a, b, typ in list(entries):
        if typ == 'Name' or not a.startswith(prefix):
            continue
        parts = a.split(' ', 1)
        if len(parts) < 2:
            # single word: nothing to move behind the comma
            continue
        prefixPart, rest = parts
        prefixName = rest + ', ' + prefixPart
        if prefixName.lower() in haveLower:
            continue
        if typ == 'Redirect':
            link = b
        else:
            link = a
        entries.append((prefixName, link, 'Name'))
        # remember the new name so it is not added a second time
        haveLower.add(prefixName.lower())


def _groupEntries(entries):
    """Distribute entries over the sections given by CATEGORIES.

    The section is chosen by the lower-cased first character of the entry
    name; if that is no section itself, the first character of its NFD
    decomposition is tried (e.g. an accented letter falls back to its base
    letter). Everything else, including empty names, goes to CATEGORY_OTHER.
    Returns a dict mapping section name to list of entries.
    """
    categories = dict([(char, []) for char in CATEGORIES])
    categories[CATEGORY_OTHER] = []
    for entry in entries:
        firstChar = entry[0][:1].lower()
        if firstChar and firstChar not in CATEGORIES:
            baseChar = unicodedata.normalize('NFD', firstChar)[:1]
            if baseChar in CATEGORIES:
                firstChar = baseChar
        if firstChar in CATEGORIES:
            categories[firstChar].append(entry)
        else:
            # put all non alphabetical articles into one category
            categories[CATEGORY_OTHER].append(entry)
    return categories


def _printSection(title, sectionEntries):
    """Sort sectionEntries in place and print them under a '== title ==' heading.

    Entries are separated by ' -' plus newline; a blank line follows the
    section.
    """
    sectionEntries.sort()
    _writeLine(sys.stdout, "== " + title + " ==")
    _writeLine(sys.stdout,
        " -\n".join([formatEntry(entry) for entry in sectionEntries]))
    _writeLine(sys.stdout, u"")


def main():
    """Command line entry point.

    Reads the topic list named by the first argument, optionally merges the
    catscan export named by the second argument and adds reversed names for
    the prefix given as third argument, then prints the list grouped into
    sections on stdout. Warnings are written to stderr. Without arguments a
    usage message is printed and the script exits.
    """
    if len(sys.argv) < 2:
        _writeLine(sys.stdout, """Usage: topiclist.py listFile [catscanCSV [reversePrefix]]

where:
    - listFile is a name of a file containing a topic list of wiki links grouped
        alphabetically (default)
    - catscanCSV is a name of a file of comma separated values generated by
        catscan
    - reversePrefix is a string inducing reverse writing for names beginning
        with this prefix being added to the list (e.g. 'chin' will result in
        "language, Chinese (→ [[Chinese language]])" being added to the list)""")
        sys.exit()

    # read list
    infile = codecs.open(sys.argv[1], "r", "utf-8")
    try:
        entries = readTable(infile)
    finally:
        infile.close()

    # check for double entries
    have, haveLower = _checkDoubleEntries(entries)

    # add catscan list if given
    if len(sys.argv) > 2:
        _mergeCatscan(entries, sys.argv[2], have, haveLower)

    # add inverted entries if prefix given; entry names are unicode, so the
    # command line argument is decoded before it is compared with them
    if len(sys.argv) > 3:
        _addReversedNames(entries, _toUnicode(sys.argv[3]), haveLower)

    # group entries and print out list
    categories = _groupEntries(entries)
    _printSection(CATEGORY_OTHER, categories[CATEGORY_OTHER])
    for char in CATEGORIES:
        _printSection(char.upper(), categories[char])

# Run the command line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()