Jump to content

User:Tsirel/Bot code

From Wikipedia, the free encyclopedia
This is an old revision of this page, as edited by Tsirel (talk | contribs) at 20:02, 14 March 2009 (bot indeed). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.

program

import sys
import copy

sys.path.append("/home/boris/Wiki/Bot/pywikipedia")

import wikipedia

# Fetch the wiki page that holds all parameter "files" and split it into lines.
mysite=wikipedia.getSite()
params_page=wikipedia.Page(mysite,'User:Tsirel/Bot parameters')
params_txt=params_page.get()

params_txt=params_txt.splitlines()
# Pop first, check afterwards: a pop() placed inside the assert expression
# would be skipped under "python -O", leaving the header line in the list
# that parse_param() consumes from the front.
params_header=params_txt.pop(0)
assert params_header=="== files of parameters ==","bad parameters"

def parse_param (name):
    """Consume the section "=== <name>.dat ===" from params_txt; return its value.

    The section is expected at the front of the module-level list params_txt
    in this exact layout (one list element per line):

        === <name>.dat ===
        (empty line)
        <pre>
        ...one or more lines forming a Python literal...
        </pre>
        (empty line, unless the list ends here)

    The consumed lines are removed from params_txt, so sections must be
    requested in the order in which they appear.  The body lines are joined
    without a separator and evaluated.

    Raises AssertionError ("bad parameter <name>") when the layout does not
    match or the closing tag is missing.
    """
    def take (expected):
        # Pop outside the assert so that the list still advances when
        # assertions are disabled ("python -O").
        line = params_txt.pop(0)
        assert line==expected,"bad parameter "+name

    take("=== "+name+".dat ===")
    take("")
    take("<pre>")
    chunks = []
    while params_txt and params_txt[0]!="</pre>":
        chunks.append(params_txt.pop(0))
    # Running out of lines means the closing tag never came.
    assert params_txt,"bad parameter "+name
    params_txt.pop(0)                       # drop the closing "</pre>"
    if params_txt:
        take("")
    # NOTE(review): eval() executes whatever text the wiki page contains.
    # Anyone able to edit that page can run code here; consider
    # ast.literal_eval if the parameters are always plain literals.
    return eval("".join(chunks))

# parse_param() consumes sections from the front of params_txt, so these
# calls must follow the order of the sections on the parameters page.
corenames = parse_param ("corenames")
aroundnames = parse_param ("aroundnames")
rvnumber = parse_param ("rvnumber")
rvtype = parse_param ("rvtype")
headings = parse_param ("headings")

source_page=wikipedia.Page(mysite,'User:Tsirel/Catalog source')
source_txt=source_page.get()

source_txt=source_txt.splitlines()

# NOTE(review): the two literals below were lost in the reviewed copy of this
# file; "<pre>" / "</pre>" is a reconstruction -- confirm against the raw
# wikitext.  The pops are kept outside the asserts so that the wrapper lines
# are removed even when assertions are disabled ("python -O").
first_line=source_txt.pop(0)
assert first_line=="<pre>","bad source"
last_line=source_txt.pop()
assert last_line=="</pre>","bad source"

#
# parse
#
# Main infile "all.in"; its lines: "[[article]] tags".
# Tags are space-separated,
# each either "(rvnumber:rvtype)"
# or a name (corename or aroundname) and maybe suffix "1" or "2".
# Parameters from four infiles.
# Outfile: all.dat (for format.py).
#

def parse (source_txt, corenames, aroundnames, rvnumber, rvtype):
    """Parse catalog source lines into a list of (tags0, tags1, article).

    source_txt: iterable of lines of the form "[[article]] tag tag ...".
        Empty lines are ignored; lines without any tag are skipped.
    corenames, aroundnames, rvnumber, rvtype: lists of (abbreviation, text)
        pairs; only the abbreviations are used here.

    Each tag is either "(A:B)", where every character of A must be an
    rvnumber abbreviation and every character of B an rvtype abbreviation,
    or a three-character core/around name optionally followed by the suffix
    "1" or "2".

    Returns a list of triples (tags0, tags1, article):
        tags0   -- list of (A, B) pairs taken from the "(A:B)" tags
        tags1   -- list of the other tags, suffix included
        article -- link target with its first character upper-cased

    Raises AssertionError on a syntax error in a line, and ValueError when
    "]]" or the ":" of an "(A:B)" tag is missing.
    """
    # only abbreviations will be used; full texts are for "format.py"
    corenames = [abbr for (abbr, text) in corenames]
    aroundnames = [abbr for (abbr, text) in aroundnames]
    rvnumber = [abbr for (abbr, text) in rvnumber]
    rvtype = [abbr for (abbr, text) in rvtype]
    # distinction of "core" and "around" is for "format.py"
    names = corenames + aroundnames
    # parameters are ready; the main loop follows
    biglist = []                            # for parsed lines
    notags = []                             # lines without tags; collected, not returned
    everything = {}                         # articles seen; collected, not checked
    for line in source_txt:                 # main loop: lines of the main input file
        line = line.strip()
        if line == "":
            continue                        # ignore empty lines (if any)
        assert line.startswith("[["), "BAD LINE: "+line   # syntax error in the main input file
        close = line.index("]]")
        article = line[2:close]             # extract article from "[[article]] tags"
        # article -> Article; slicing keeps an empty title from raising IndexError
        article = article[:1].upper()+article[1:]
        everything[article] = 0             # the 0 is of no use
        rest = line[close+2:].split()       # list of tags
        if not rest:                        # no tags: remember, do not process
            notags.append(article)
            continue
        tags0 = []                          # for "(A:B)" tags converted to (A,B)
        tags1 = []                          # for other tags
        for tag in rest:                    # loop over all tags of the given line
            if tag.startswith("("):         # "(A:B)" tag?
                maintag = tag[1:]
                assert maintag.endswith(")"), "!!! "+maintag   # syntax error
                # split at the first ":"; a missing ":" raises ValueError
                (maintag1, maintag2) = maintag[:-1].split(":", 1)
                for x in maintag1:          # syntax check
                    assert x in rvnumber, "?? "+x+" "+line
                for x in maintag2:          # syntax check
                    assert x in rvtype, "?? "+x+" "+line
                tags0.append((maintag1, maintag2))    # store (A,B)
            else:                           # other tag, not "(A:B)"
                assert len(tag)==3 or len(tag)==4, "bad tag"   # 3 chars, and maybe 1 char suffix
                assert tag[0:3] in names, "BAD TAG: "+tag
                if len(tag)==4:
                    assert tag[3] in ["1","2"], "bad tag"      # suffix syntax check
                tags1.append(tag)           # store the other tag
        biglist.append( (tags0, tags1, article) )     # store the parsed line
    return biglist
# end of parse ###################################
#
# format.py
#
# Main infile "all.dat" (produced by parse.py).
# A tag not of the form "(A:B)" directs a line to the first
# (core:selected) or the third (around the core) section,
# according to corenames and aroundnames.
# A tag of the form "(A:B)" directs a line to the second section
# (core:other) unless it is directed to the first section.
# In the first section a line is special if the tag has a suffix
# "1" or "2".
# Parameters from five infiles.
# Outfile: all.out (to be uploaded).
#

def format (biglist):
    """Render parsed catalog lines as wikitext appended to the global "formatted".

    biglist: list of (tags0, tags1, article) triples as returned by parse();
        tags0 holds (A, B) pairs from "(A:B)" tags, tags1 the other tags
        (suffix included), article the link target.

    Reads the module-level parameters corenames, aroundnames, rvnumber and
    rvtype (lists of (abbreviation, text) pairs) and headings (indexed 0..4).
    All output goes through pr(), which appends to the module-level string
    "formatted"; that string must be defined before format() is called.
    Returns None.

    Raises AssertionError ("bad special lines") when a core tag has more than
    one line with suffix "1", more than one with suffix "2", or a suffix "2"
    line without a suffix "1" line.

    NOTE(review): several string literals of this function were damaged in
    the reviewed copy of the file.  The values collected right below are
    reconstructions and must be confirmed against the raw wikitext of the bot
    page before this code is run against the live wiki.
    """
    # NOTE(review): reconstructed literals -- confirm each one.
    line_end = "<br>"                       # appended to every formatted line
    link_open = "[["                        # usual line: [[article]]
    link_close = "]]"
    bold_open = "'''[["                     # special line: '''[[article]]'''
    bold_close = "]]'''"
    col_top = "{{Top}}"                     # start the left column
    col_mid = "{{Mid}}"                     # start the right column
    col_bottom = "{{Bottom}}"               # finish the right column

    #   Internal   F U N C T I O N S
    #
    # frmt: format a line
    #
    # whole is (tags0,tags1,article), special is True or False
    # tags0: list of (A,B) pairs corresponding to "(A:B)" tags
    # tags1: list of other tags
    # article of the form "X(Y)" turns into "X(Y)|X"
    #  but article of the form "X!(Y)" turns into "X(Y)"
    # returns the link, " / ", the space-separated tags and line_end;
    # the link is wrapped in bold markup if special is True.
    def frmt (whole,special):
        (tags0,tags1,article) = whole
        k = article.find("(",5)             # the search starts at index 5
        if k>0:                             # article of the form "X(Y)" or "X!(Y)" ?
            if article[k-1] != "!":
                article += "|"+article[0:k].strip()        # "X(Y)|X"
            else:
                article = article[:k-1]+article[k:]        # "X(Y)"
        if special:
            res = bold_open+article+bold_close+" / "
        else:
            res = link_open+article+link_close+" / "
        for z in tags1:                     # first, other tags, not "(A:B)"
            res += z[0:3]+" "               # remove suffix (if any)
        for (first,second) in tags0:        # last, "(A:B)" tags
            res += "("+first+":"+second+") "     # convert (A,B) back to "(A:B)"
        return res[:-1]+line_end            # remove the last space

    # pr: appends text and a newline to the string "formatted"
    def pr (text):
        global formatted
        formatted += text+"\n"

    # flst: format and print list of lines by frmt
    # lst: list of usual lines
    # lst1, lst2: lists of special lines
    # (len(lst1),len(lst2)) must be (0,0), (1,0) or (1,1)
    # two columns are produced unless the list is short
    def flst (lst,lst1,lst2):
        assert len(lst2) <= len(lst1), "bad special lines"
        assert len(lst1) <= 1, "bad special lines"
        l = len(lst)
        if l<4 and not lst2:                # one column?
            if lst1:                        #  special line?
                pr(frmt(lst1[0],True))      #  format special line
            for whole in lst:
                pr(frmt(whole,False))       #  format usual line
        else:                               # two columns
            k = (l+1-len(lst1)+len(lst2))//2     # a half of lines - to the left column
            pr(col_top)                     # start the left column
            if lst1:                        #  special line?
                pr(frmt(lst1[0],True))      #  format special line
            for whole in lst[:k]:
                pr(frmt(whole,False))       #  format usual line
            pr(col_mid)                     # start the right column
            if lst2:                        #  special line?
                pr(frmt(lst2[0],True))      #  format special line
            for whole in lst[k:]:
                pr(frmt(whole,False))       #  format usual line
            pr(col_bottom)                  # finish the right column
        return

    # empty_lists: a fresh dict abbreviation -> [] ; every call builds new
    # lists, so the dicts never share a list object
    def empty_lists (pairs):
        return dict( (abbr,[]) for (abbr,text) in pairs )

    # by_article: sort key; the article is the third item of a line
    def by_article (whole):
        return whole[2]
    #   e n d   o f  internal   f u n c t i o n s

    # initialize lists for "(A:B)" tags
    # A[x+y] will collect all lines containing "(x:y)"
    A = {}
    for (x,xx) in rvnumber:
        for (y,yy) in rvtype:
            A[x+y] = []
    # initialize lists for other core tags
    # B[z] will collect all usual lines containing z
    # B1[z] will collect all special lines containing tag z with suffix 1
    # B2[z] will collect all special lines containing tag z with suffix 2
    B = empty_lists(corenames)
    B1 = empty_lists(corenames)
    B2 = empty_lists(corenames)
    # initialize lists for other non-core tags
    # C[z] will collect all lines containing z
    C = empty_lists(aroundnames)
    # initialize counters
    count_sel_links = 0
    count_sel_articles = 0
    count_other_links = 0
    count_other_articles = 0
    count_around_links = 0
    count_around_articles = 0
    # distribute the lines (according to tags) to the lists
    for whole in biglist:
        (tags0,tags1,article) = whole
        tags11 = []                         # core-selected tags to be collected here
        tags12 = []                         # non-core tags to be collected here
        for z in tags1:
            if z[0:3] in B:                 # tag (without suffix) belongs to core?
                tags11.append(z)            # store in tags11
            else:
                tags12.append(z)            # store in tags12
        if tags11:                          # some core-selected tags?
            count_sel_articles += 1
            count_sel_links += len(tags11)
        elif tags0:                         # some core-other (but no core-selected) tags?
            count_other_articles += 1
        if tags12:                          # some non-core tags?
            count_around_articles += 1
            count_around_links += len(tags12)
        for z in tags11:                    # process core-selected tags
            if len(z)==3:                   # tag with no suffix?
                B[z].append(whole)          # store usual line
            elif z[3]=="1":                 # tag with suffix "1"?
                B1[z[0:3]].append(whole)    # store special line
            else:                           # tag with suffix "2"
                B2[z[0:3]].append(whole)    # store special line
        if not tags11:                      # no core-selected tags? then process "(A:B)" tags
            for (first,second) in tags0:    # process "(A:B)" tags
                for x in first:
                    for y in second:
                        A[x+y].append(whole)
                        count_other_links += 1
        for z in tags12:                    # process non-core tags
            # parse() accepts a "1"/"2" suffix on any name, but C is keyed by
            # the bare three-character name; strip the suffix so that such a
            # tag files the line as a usual one instead of raising KeyError
            C[z[0:3]].append(whole)         # store line
    # sort each list alphabetically; sort key is article (not tag)
    for table in (A,B,C):
        for z in table:
            table[z].sort(key=by_article)
    # lists are ready; now start generating the output
    pr(headings[0])
    pr("")
    pr(headings[1])
    pr("")
    # format and print selected core topics
    for (z,text) in corenames:
        pr("==="+text+" ("+z+")===")
        pr("")
        flst(B[z],B1[z],B2[z])
        pr("")
    # format and print other core topics
    pr(headings[2])
    pr("")
    for (x,text1) in rvnumber:
        pr("==="+text1+" ("+x+":)===")
        pr("")
        for (y,text2) in rvtype:
            if A[x+y]:                      # empty subsections are omitted
                pr("===="+text2+" ("+x+":"+y+")====")
                pr("")
                flst(A[x+y],[],[])
                pr("")
    # format and print non-core topics
    pr(headings[3])
    pr("")
    for (z,text) in aroundnames:
        pr("==="+text+" ("+z+")===")
        pr("")
        flst(C[z],[],[])
        pr("")
    # counters, in two columns: articles (links)
    pr(headings[4])
    pr(col_top)
    pr( ('"Core": %i (%i)'+line_end) % ( count_sel_articles + count_other_articles,
                                         count_sel_links + count_other_links ) )
    pr( ('"Around": %i (%i)'+line_end) % ( count_around_articles, count_around_links ) )
    pr(col_mid)
    pr( ('"Core selected": %i (%i)'+line_end) % ( count_sel_articles, count_sel_links ) )
    pr( ('"Core others": %i (%i)'+line_end) % ( count_other_articles, count_other_links ) )
    pr(col_bottom)
    pr("")
    pr( "Here k(n) means: n links to k articles. (Some articles are linked more than once.)" )
    return
# end of format ####################################


# now execute them #####################################

# Turn the catalog source lines into (tags0, tags1, article) triples.
parsed = parse (source_txt, corenames, aroundnames, rvnumber, rvtype)

formatted = "" # initialize the output string

# format() returns nothing: its inner pr() appends to the global "formatted",
# which is why that string has to exist before this call.
format (parsed)

out_page=wikipedia.Page(mysite,'User:Tsirel/Catalog')

out_page.put(formatted) ### write to Wikipedia ###

# the end #########################################

files of parameters

corenames.dat

[('bsc', 'Basic notions'), ('mnt', 'Moments'), ('inq', 'Inequalities'), ('Mar', 'Markov chains, processes,
fields, networks'), ('Gau', 'Gaussian random variables, vectors, functions'), ('cnd', 'Conditioning'), ('spd',
'Specific distributions'), ('emm', 'Empirical measure'), ('lmt', 'Limit theorems'), ('lrd', 'Large deviations'), 
('scl', 'Stochastic calculus'), ('Mal', 'Malliavin calculus'), ('anl', 'Analytic aspects (including measure
theoretic)')]

aroundnames.dat

[('grl', 'General aspects'), ('fnd', 'Foundations of probability theory'), ('gmb', 'Gambling'), ('cnc',
'Coincidence'), ('alg', 'Algorithmics'), ('Bay', 'Bayesian approach'), ('fnc', 'Financial mathematics'), ('phs', 
'Physics'), ('gnt', 'Genetics'), ('spr', 'Stochastic process'), ('geo', 'Geometric probability'), ('emp',
'Empirical findings'), ('hst', 'Historical'), ('msc', 'Miscellany')]

rvnumber.dat

[("1","A single random variable"),
 ("2","Two random variables"),
 ("3","Three random variables"),
 ("F","Finitely many random variables"),
 ("L","A large number of random variables (finite but tending to infinity)"),
 ("S","An infinite sequence of random variables"),
 ("U","Uncountably many random variables (continuous-time processes etc)")]

rvtype.dat

[("B","Binary"), ("D","Discrete"),
 ("C", "Continuous"), ("R","Real-valued, arbitrary"),
 ("M","Random point of a manifold"),
 ("G","General (random element of an abstract space)")]

headings.dat

["==Core probability: selected topics==",
 "'''[[Probability theory]]'''",
 "==Core probability: other articles, by number and type of random variables==",
 "==Around the core==",
 "==Counters of articles=="]