Benutzer:Christoph Burgmer/topiclist.py
Erscheinungsbild
Dieses Tool soll dabei helfen, die Themenlisten der Portale zu verwalten und mit Artikeln aus den passenden Kategorien zu ergänzen.
Dazu braucht es ein wenig geübten Umgang mit dem Computer und Python.
Bisher können eine bestehende Liste sortiert und neue Artikel über Catscan hinzugefügt werden.
TODO (sehr unwahrscheinlich, dass diese Punkte erledigt werden):
- Überprüfung der Links auf BKLen oder Weiterleitungen
- Späteres Entfernen automatisch hinzugefügter Artikel
- Aufsetzung auf den Toolserver mit idiotensicherer Handhabung
#!/usr/bin/python
# -*- coding: utf8 -*-
"""
2008 Christoph Burgmer ([[de.wikipedia.org/wiki/User:Christoph Burgmer]])
This script helps to maintain a list of topics of Wikipedia articles.
Features:
- Sorting of entries grouped by sections (default: alphabetically)
- Parsing and cleaning of different entry types (see examples below)
Types understood (all need language-specific adaptation)
- normal entry (e.g. [[Acht Unsterbliche]])
- redirect (e.g. [[Baxian]] (→ [[Acht Unsterbliche]]))
- disambiguation (e.g. [[Tai Chi]] (Begriffsklärung))
- reverse name (e.g. Watts, Alan ([[Alan Watts]]))
- Merging with articles lists generated by catscan
([http://tools.wikimedia.de/~daniel/WikiSense/CategoryIntersect.php])
- Adding of reverse names
The script takes the following input:
1. Existing topic list (might be empty). Format:
== A ==
[[Acht Unsterbliche]]
[[Alan Watts]]
== B ==
[[Baxian]] (→ [[Acht Unsterbliche]])
== T ==
[[Tai Chi]] (Begriffsklärung)
== W ==
Watts, Alan ([[Alan Watts]])
2. Optional CSV list generated by catscan:
0 Acht_Schulen_des_Daoismus Daoismus 3436616
0 Acht_Unsterbliche Daoismus 295493
0 Alan_Watts Person_des_Daoismus 278493
3. Optional prefix:
A prefix; entries whose title begins with this string are additionally
added in reversed form, e.g. 'Sexual-Praktiken, Taoistische' for
'Taoistische Sexual-Praktiken' with prefix 'Tao'
License: MIT License
Copyright (c) 2008 Christoph Burgmer
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
import re
import sys
import codecs
import unicodedata
CATEGORIES = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
# main sections of list, all articles are grouped into one of the sections above
# if their first character fits one of the strings
CATEGORY_OTHER = '0-9'
# section for articles that don't fint into one of the sections above
SIMPLE_REGEX = re.compile(ur'\[\[([^\]]+)\]\]\s*-?\s*$')
# Schema: [[Anshan (China)]] -
REDIRECT_REGEX = re.compile(\
ur'\[\[([^\]]+)\]\]\s*\(→?\s*\[\[([^\]]+)\]\]\)\s*-?\s*$')
# Schema: [[Chang E]] (→ [[Chang'e]]) -
BKL_REGEX = re.compile(\
ur'\[\[([^\]]+)\]\]\s*\((?:BKL|Begriffsklärung)\)\s*-?\s*$')
# Schema: [[Chang'an]] (Begriffsklärung) -
NAME_REGEX = re.compile(ur'([^(]+)\s*\(?→?\s*\[\[([^\]]+)\]\]\)?\s*-?\s*$')
# Schema: Cantlie, Sir James ([[James Cantlie]]) -
# finde zwei Einträge pro Zeile
ERRORNEOUS_REGEX = re.compile(ur'\[\[[^\]]*\|[^\]]*\]\]|' \
+ ur'(\[\[([^\]]+)\]\][^-]+-.*\[\[([^\]]+)\]\].*-?)')
IGNORE_REGEX = re.compile(ur'(=.*=\s*$)|(\s*$)')
def readTable(fileHandle):
    """Parse a topic list into a list of entry tuples.

    fileHandle is iterated line by line and must yield unicode strings.
    Section headings and blank lines are skipped silently. Lines matching
    ERRORNEOUS_REGEX and lines matching none of the entry patterns are
    reported on stderr and left out of the result.

    Returns a list of (title, target, type) tuples where type is one of:
    - "Simple":   target is None
    - "Redirect": target is the article the redirect points to
    - "BKL":      disambiguation page, target is None
    - "Name":     title is the plain reversed name, target the linked article
    """
    entries = []
    for line in fileHandle:
        # All patterns are anchored at the start of the line. Without
        # stripping, an indented "  [[Foo]]" would miss every link pattern
        # and fall through to NAME_REGEX with an empty name.
        text = line.lstrip()

        if IGNORE_REGEX.match(text):
            continue
        if ERRORNEOUS_REGEX.match(text):
            print >> sys.stderr, ("line: \"" + line
                + "\" errorneous").encode('utf8')
            continue

        # The order matters: NAME_REGEX is the most permissive pattern and
        # has to be tried last.
        matchObj = SIMPLE_REGEX.match(text)
        if matchObj:
            entries.append((matchObj.group(1).strip(), None, "Simple"))
            continue

        matchObj = REDIRECT_REGEX.match(text)
        if matchObj:
            entries.append((matchObj.group(1).strip(),
                matchObj.group(2).strip(), "Redirect"))
            continue

        matchObj = BKL_REGEX.match(text)
        if matchObj:
            entries.append((matchObj.group(1).strip(), None, "BKL"))
            continue

        matchObj = NAME_REGEX.match(text)
        if matchObj:
            entries.append((matchObj.group(1).strip(),
                matchObj.group(2).strip(), "Name"))
            continue

        print >> sys.stderr, ("line: \"" + line
            + "\" not parsed").encode('utf8')
    return entries
def formatEntry(entry):
    """Return the wiki markup for one entry.

    entry is an (a, b, typ) tuple as produced by readTable:
    - 'Simple':   [[a]]
    - 'Redirect': [[a]] (→ [[b]])
    - 'BKL':      [[a]] (Begriffsklärung)
    - 'Name':     a ([[b]])

    Raises ValueError for any other typ.
    """
    # Unpacked here rather than in the signature: tuple parameters are a
    # syntax error on Python 3 (PEP 3113).
    a, b, typ = entry
    if typ == 'Simple':
        return '[[' + a + ']]'
    if typ == 'Redirect':
        return '[[' + a + ']]' + u' (→ [[' + b + ']])'
    if typ == 'BKL':
        return '[[' + a + ']]' + u' (Begriffsklärung)'
    if typ == 'Name':
        return a + ' ([[' + b + ']])'
    # Fail here instead of returning None, which would only surface later
    # as an obscure TypeError when the formatted entries are joined.
    raise ValueError("unknown entry type: %r" % (typ,))
def main():
if len(sys.argv) < 2:
print """Usage: topiclist.py listFile [catscanCSV [reversePrefix]]
where:
- listFile is a name of a file containing a topic list of wiki links grouped
alphabetically (default)
- catscanCSV is a name of a file of comma separated values generated by
catscan
- reversePrefix is a string inducing reverse writing for names beginning
with this prefix being added to the list (e.g. 'chin' will result in
"language, Chinese (→ [[Chinese language]])" being added to the list)"""
sys.exit()
# read list
infile = codecs.open(sys.argv[1], "r", "utf-8")
entries = readTable(infile)
# check for double entries
have = set([])
haveLower = set([])
for entry in entries:
a, _, _ = entry
if a in have:
print >> sys.stderr, ("warning: article \"" + a \
+ "\" is double entry in original list").encode('utf8')
else:
have.add(a)
if a.lower() in haveLower:
print >> sys.stderr, ("warning: article \"" + a \
+ "\" in differerent upper/lower case combination " \
+ "included in original list").encode('utf8')
else:
haveLower.add(a.lower())
# add catscan list if given
if len(sys.argv) > 2:
catscanFile = open(sys.argv[2], "r")
import csv
# import table from csv file
fileDialect = csv.Sniffer().sniff(catscanFile.readline())
fileDialect.delimiter = "\t"
catscanFile.seek(0)
table = csv.reader(catscanFile, dialect=fileDialect)
redir = set([]) # articles aleady included as redirect
nonLink = set([]) # names (i.e. inverted) without link
for entry in entries:
a, _, typ = entry
if typ == 'Redirect':
redir.add(a.lower())
elif typ == 'Name':
nonLink.add(a.lower())
for line in table:
# only from article space
if line[0] != '0':
continue
article = line[1].decode('utf8').replace('_', ' ')
if not article.lower() in haveLower:
entries.append((article, None, 'Simple'))
have.add(article)
haveLower.add(article.lower())
elif not article in have:
print >> sys.stderr, ("warning: article '" + article \
+ "' in differerent upper/lower case combination included")\
.encode('utf8')
if article.lower() in redir:
print >> sys.stderr, ("warning: article '" + article \
+ "' already included as redirect").encode('utf8')
if article.lower() in nonLink:
print >> sys.stderr, ("warning: article '" + article \
+ "' included as non link").encode('utf8')
# add inverted antries if prefix given
if len(sys.argv) > 3:
prefix = sys.argv[3]
for entry in entries:
a, b, typ = entry
if typ != 'Name' and a.startswith(prefix):
prefixPart, rest = a.split(' ', 1)
prefixName = rest + ', ' + prefixPart
if not prefixName.lower() in haveLower:
if typ == 'Redirect':
link = b
else:
link = a
entries.append((prefixName, link, 'Name'))
# group entries
categories = dict([(char, []) for char in CATEGORIES])
categories[CATEGORY_OTHER] = []
for entry in entries:
a,_,_ = entry
firstChar = a[0].lower()
if firstChar not in CATEGORIES \
and unicodedata.normalize('NFD', firstChar)[0] in CATEGORIES:
firstChar = unicodedata.normalize('NFD', firstChar)[0]
if firstChar in CATEGORIES:
categories[firstChar].append(entry)
else:
# put all non alphabetical articles into one category
categories[CATEGORY_OTHER].append(entry)
# print out list
categories[CATEGORY_OTHER].sort()
print "== " + CATEGORY_OTHER + " ==".encode('utf8')
print " -\n".join([formatEntry(entry) \
for entry in categories[CATEGORY_OTHER]]).encode('utf8')
print
for char in CATEGORIES:
categories[char].sort()
print "== " + char.upper() + " ==".encode('utf8')
print " -\n".join([formatEntry(entry) for entry in categories[char]])\
.encode('utf8')
print
if __name__ == "__main__":
main()