Bruger:Wegge/Statistik/DumpToDat.py
Udseende
Many eyes make all bugs shallow
Dette er version 0.1 af det program, skrevet i Python, som jeg bruger til at maltraktere et XML-dump, så der kommer plotbare data ud af det. Det er ikke testet synderlig meget, så hvis du finder en fejl, vil jeg gerne høre om den. Programmet er frigivet under GPL, hvilket er tilstrækkeligt tæt på GFDL, til at kildeteksten kan ligge på Wikipedia.
#! /usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright(2006) Anders Wegge Jakobsen
# Available under the FSF GPL license
#
from xml.sax import saxutils
from xml.sax import make_parser
import sys, time, exceptions, calendar
import profile
# Page::Title, Page::FirstRevDate
# User::FirstEditDate
# Date::Users, Date::Pages
# Debug verbosity: values above 9 make the SAX handler trace every
# element open/close.
debug = 0
# Localized (Danish) MediaWiki namespace prefixes.  A title starting
# with one of these plus ':' is outside the main article namespace.
danamespaces = [ 'Media', 'Speciel', 'Diskussion', 'Bruger',
'Bruger diskussion', 'Wikipedia', 'Wikipedia diskussion',
'Billede', 'Billede diskussion', 'MediaWiki',
'MediaWiki diskussion', 'Skabelon', 'Skabelon diskussion',
u'Hjælp', u'Hjælp diskussion', 'Kategori',
'Kategori diskussion', 'WikiWegge' ]
# Contributor names whose edits are counted as robot-generated.
darobots = [ 'WeggeBot', 'TwidRobot' ]
class DayStat:
    """Per-day counters: new users plus new articles split by author kind."""

    def __init__(self):
        # Day this record covers (Unix timestamp, truncated to the day).
        self.TimeStamp = 0
        # Users whose first edit fell on this day.
        self.NewUsers = 0
        # Articles first created by a human editor on this day.
        self.NewArticlesUser = 0
        # Articles first created by a robot account on this day.
        self.NewArticlesRobot = 0
class WikiArticle:
    """Accumulated per-page facts gathered while scanning the dump."""

    def __init__ (self):
        # Page title as it appears in the dump.
        self.Title = ''
        # Timestamps (days) of the earliest and latest revision seen.
        self.Oldest = 0
        self.Newest = 0
        # Classification flags filled in by the parser.
        self.isArticleNs = False
        self.isRobotGen = False
        self.isRedirect = False
class WikiUser:
    """A registered contributor with the span of days they edited."""

    def __init__ (self):
        # Username as found in the dump's <username> element.
        self.Name = ''
        # Days (Unix timestamps) of the user's first and last edit.
        self.First = 0
        self.Last = 0
class WikiDump (saxutils.DefaultHandler):
    """SAX content handler that digests a MediaWiki XML page dump.

    After parsing, self.Users maps username -> WikiUser (first/last
    edit day) and self.Pages maps title -> WikiArticle (creation and
    last-edit day plus namespace/robot/redirect flags).
    """

    def __init__(self):
        # Parser state: a stack of currently-open element names plus
        # character-data accumulators for the elements we care about.
        self.nowIn = []
        self.UserName = []
        self.Title = []
        self.Text = []
        self.TimeStamp = 0
        # Users -> First edit date
        self.Users = {}
        # Page -> First revision, Title, isRedirect
        self.Pages = {}

    def isRedirect(self, text):
        # Is this page a redirect?  Only the canonical uppercase form
        # at the very start of the page text is recognized.
        return text.find("#REDIRECT [[") == 0

    def isRealArticle(self, text):
        # Is this page a real article, i.e. neither a redirect nor a
        # page starting with a bare link?
        return text.find("[[") != 0 and not self.isRedirect(text)

    def isArticleNs(self, title):
        # Is this a title in the main (article) namespace?
        # BUG FIX: the original returned False as soon as the FIRST
        # namespace in the list was absent from the title, so almost
        # every title containing ':' was misclassified.  A title is
        # outside the main namespace only when it actually starts
        # with one of the known namespace prefixes.
        if ':' in title:
            for ns in danamespaces:
                if title.startswith(ns + ':'):
                    return False
        return True

    def isRobot(self, contributor):
        # Was this edit made by one of the known robot accounts?
        return contributor in darobots

    def characters(self, ch):
        # Robustness: ignore character data outside any open element
        # (e.g. whitespace before the document root).
        if not self.nowIn:
            return
        current = self.nowIn[-1]
        if current == 'title':
            self.Title.append(ch)
        if current == 'timestamp':
            self.TimeStampText.append(ch)
        if current == 'username':
            self.UserName.append(ch)
        if current == 'text':
            self.Text.append(ch)

    def startElement (self, name, attrs):
        self.nowIn.append(name)
        if debug > 9:
            print('>> ' + name)
        # Reset the accumulators for the fields we collect.
        if name == 'title':
            self.Title = []
        if name == 'timestamp':
            self.TimeStampText = []
        if name == 'contributor':
            # BUG FIX: reset here as well, so an anonymous revision
            # (which has <ip> instead of <username>) does not inherit
            # the previous revision's contributor name.
            self.UserName = []
        if name == 'username':
            self.UserName = []
        if name == 'text':
            self.Text = []

    def endElement (self, name):
        if debug > 9:
            print('<<' + name)
        # Sanity check: the closing element must match the innermost
        # open one, otherwise our state machine is out of sync.
        if self.nowIn.pop() != name:
            print('Something is rotten!')
            print('Removing: ' + name + ' from: ')
            print(self.nowIn)
            raise Exception
        if name == 'title':
            self.Title = "".join(self.Title)
            return
        if name == 'revision':
            self.Text = "".join(self.Text)
            # We now have the revision text and timestamp
            if self.Title in self.Pages:
                wa = self.Pages[self.Title]
                # BUG FIX: the original compared against the bounds but
                # never updated wa.Oldest/wa.Newest, leaving them pinned
                # to the first revision encountered (compare the parallel
                # WikiUser handling below, which does update them).
                if wa.Oldest > self.TimeStamp:
                    wa.Oldest = self.TimeStamp
                    wa.isRobotGen = self.isRobot(self.UserName)
                if wa.Newest < self.TimeStamp:
                    wa.Newest = self.TimeStamp
                    wa.isRedirect = self.isRedirect(self.Text)
            else:
                wa = WikiArticle()
                wa.isArticleNs = self.isArticleNs(self.Title)
                wa.Title = self.Title
                wa.Oldest = self.TimeStamp
                wa.Newest = self.TimeStamp
                wa.isRobotGen = self.isRobot(self.UserName)
                wa.isRedirect = self.isRedirect(self.Text)
                self.Pages[self.Title] = wa
            return
        if name == 'timestamp':
            self.TimeStampText = "".join(self.TimeStampText)
            tmt = time.strptime(self.TimeStampText, '%Y-%m-%dT%H:%M:%SZ')
            # Truncate to midnight UTC so revisions aggregate per day.
            self.TimeStamp = calendar.timegm((tmt.tm_year, tmt.tm_mon,
                                              tmt.tm_mday, 0, 0, 0))
            return
        if name == 'contributor':
            # Was it a registered user?  Anonymous edits leave the
            # accumulator empty and are ignored below.
            self.UserName = "".join(self.UserName)
            if self.UserName in self.Users:
                wu = self.Users[self.UserName]
                if wu.First > self.TimeStamp:
                    wu.First = self.TimeStamp
                if wu.Last < self.TimeStamp:
                    wu.Last = self.TimeStamp
                self.Users[self.UserName] = wu
            elif self.UserName:
                wu = WikiUser()
                wu.Name = self.UserName
                wu.First = self.TimeStamp
                wu.Last = self.TimeStamp
                self.Users[self.UserName] = wu
            return
def main():
    """Parse the dump and print per-day cumulative statistics.

    Output is plot-friendly whitespace-separated columns:
    timestamp, total users, total user-created pages, total
    robot-created pages.
    """
    filename = 'dawiki-20060220-pages-meta-history.xml'
    wd = WikiDump()
    parser = make_parser()
    parser.setContentHandler(wd)
    parser.parse(filename)

    # Build a per-day histogram keyed by the day timestamp.
    hist = {}
    for i in wd.Users.values():
        if i.First in hist:
            h = hist[i.First]
        else:
            h = DayStat()
            h.TimeStamp = i.First
        h.NewUsers += 1
        hist[i.First] = h
    for i in wd.Pages.values():
        if i.Oldest in hist:
            h = hist[i.Oldest]
        else:
            h = DayStat()
            h.TimeStamp = i.Oldest
        # Only real main-namespace articles count, never redirects.
        if i.isRobotGen and not i.isRedirect and i.isArticleNs:
            h.NewArticlesRobot += 1
        elif not i.isRobotGen and not i.isRedirect and i.isArticleNs:
            h.NewArticlesUser += 1
        hist[i.Oldest] = h

    # BUG FIX: the original called .sort() on the DayStat objects
    # themselves, which have no comparison methods, so the rows were
    # not in date order and the cumulative sums below were wrong.
    # Sort explicitly on the day timestamp.
    NewPagesUsersByDate = list(hist.values())
    NewPagesUsersByDate.sort(key=lambda d: d.TimeStamp)

    #
    # And now ... output
    #
    print('#\n# Stats based on %s\n#\n' % filename)
    print('# Timestamp, Number of users, Total pages by user,'
          + ' Total pages by robots')
    users = 0
    userpages = 0
    botpages = 0
    for i in NewPagesUsersByDate:
        users += i.NewUsers
        botpages += i.NewArticlesRobot
        userpages += i.NewArticlesUser
        print('%d %d %d %d' % (i.TimeStamp, users, userpages, botpages))

if __name__ == "__main__":
    main()