Jump to content

User:Tsirel/Bot code

From Wikipedia, the free encyclopedia
This is an old revision of this page, as edited by Tsirel (talk | contribs) at 20:16, 14 March 2009. The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.

program

<code>
import sys
import copy

# Make the local pywikipedia framework checkout importable.
sys.path.append("/home/boris/Wiki/Bot/pywikipedia")

import wikipedia

# Fetch the bot's parameter page from the wiki.  params_txt becomes a list
# of lines that parse_param() below consumes destructively (pop from front).
mysite=wikipedia.getSite()
params_page=wikipedia.Page(mysite,'User:Tsirel/Bot parameters')
params_txt=params_page.get()

# The page must start with this exact heading, followed by one
# "=== <name>.dat ===" section per parameter file.
params_txt=params_txt.splitlines()
assert params_txt.pop(0)=="== files of parameters ==","bad parameters"

def parse_param (name, lines=None):
    """Consume one "=== name.dat ===" section from *lines* and return its value.

    Expected section layout (popped destructively from the front of *lines*):
        === <name>.dat ===
        <blank>
        <pre>
        ...one or more lines forming a Python literal...
        </pre>
        <blank>            (only if more lines follow)

    lines: list of page lines; defaults to the module-level params_txt,
           preserving the original one-argument call style.
    Returns the parsed literal (here: a list of tuples or strings).
    Raises AssertionError on any layout mismatch.
    """
    import ast
    if lines is None:
        lines = params_txt          # default: consume the shared parameter page
    assert lines.pop(0)=="=== "+name+".dat ===","bad parameter "+name
    assert lines.pop(0)=="","bad parameter "+name
    assert lines.pop(0)=="<pre>","bad parameter "+name
    # join the literal's lines (no separator needed: the data only ever
    # breaks inside string literals or between list items)
    param=""
    while lines[0]!="</pre>":
        param += lines.pop(0)
    lines.pop(0)                    # drop the "</pre>" line itself
    if lines:
        assert lines.pop(0)=="","bad parameter "+name
    # literal_eval instead of eval: the text comes from a publicly editable
    # wiki page, so evaluating arbitrary expressions would be a security hole.
    # All parameter files are plain literals, so behavior is unchanged.
    return ast.literal_eval(param)
 
# Read the five parameter tables, in the exact order they appear on the
# parameter page (parse_param consumes params_txt front-to-back).
# Each of the first four is a list of (abbreviation, full text) pairs;
# headings is a list of section-heading strings.
corenames = parse_param ("corenames")
aroundnames = parse_param ("aroundnames")
rvnumber = parse_param ("rvnumber")
rvtype = parse_param ("rvtype")
headings = parse_param ("headings")

# Fetch the catalog source page: one "[[article]] tags" line per article,
# wrapped in a single <pre>...</pre> block.
source_page=wikipedia.Page(mysite,'User:Tsirel/Catalog source')
source_txt=source_page.get()

source_txt=source_txt.splitlines()
assert source_txt.pop(0)=="<pre>","bad source"
assert source_txt.pop()=="</pre>","bad source"
    
######################################################################
#                                                                    #
#          parse                                                     #
#                                                                    #
# Main infile "all.in"; its lines: "[[article]] tags".               #
# Tags are space-separated,                                          #
#  each either "(rvnumber:rvtype)"                                   #
#  or a name (corename or aroundname) and maybe suffix "1" or "2".   #
# Parameters from four infiles.                                      #
# Outfile: all.dat (for format.py).                                  #
#                                                                    #
######################################################################

def parse (source_txt, corenames, aroundnames, rvnumber, rvtype):
    """Parse catalog source lines into (tags0, tags1, article) triples.

    source_txt: list of lines of the form "[[article]] tag tag ...".
    corenames, aroundnames, rvnumber, rvtype: lists of (abbrev, text) pairs;
        only the abbreviations are used here (full texts are for format()).

    Each tag is either "(A:B)" — with every character of A in the rvnumber
    abbreviations and every character of B in the rvtype abbreviations —
    or a 3-character name from corenames/aroundnames, optionally suffixed
    with "1" or "2".

    Returns a list of (tags0, tags1, article) where tags0 holds (A, B)
    pairs from "(A:B)" tags and tags1 holds the other tags verbatim.
    Lines without tags are skipped; first letter of each article is
    capitalized.  Raises AssertionError on any syntax error.
    """

    # only abbreviations are needed here; full texts are for "format"
    # (comprehensions instead of Py2-only tuple-parameter lambdas)
    corenames = [abbr for (abbr, text) in corenames]
    aroundnames = [abbr for (abbr, text) in aroundnames]
    rvnumber = [abbr for (abbr, text) in rvnumber]
    rvtype = [abbr for (abbr, text) in rvtype]

    # distinction of "core" and "around" is for "format"
    names = corenames + aroundnames

    biglist = []                            # for parsed lines
    notags = []                             # for lines without tags (if any)
    everything = {}                         # for articles (duplication check)

    for line in source_txt:                 # main loop over input lines
        line=line.strip()
        if line=="":  continue              # ignore empty lines (if any)
        assert line.startswith("[["), "BAD LINE: "+line
        article=line[2:line.index("]]")]    # extract article from "[[article]] tags"
        article=article[0].upper()+article[1:]        # article -> Article
        # if article in everything:           # duplication?
            # print "AGAIN "+article
        everything[article]=0               # value unused; dict acts as a set
        rest=line[line.index("]]")+2:].split()        # list of tags
        if rest==[]:  notags.append(article);  continue    # no tags: skip
        tags0=[]                            # for "(A:B)" tags converted to (A,B)
        tags1=[]                            # for other tags
        for tag in rest:                    # loop over all tags of this line
            if tag.startswith("("):         # "(A:B)" tag?
                maintag=tag[1:]
                assert maintag.endswith(")"), "!!! "+maintag
                maintag=maintag[:-1]
                maintag1=maintag[:maintag.index(":")]      # A from "(A:B)"
                maintag2=maintag[maintag.index(":")+1:]    # B from "(A:B)"
                for x in maintag1:          # syntax check
                    # BUGFIX: original referenced undefined name "whole_line",
                    # so a bad tag raised NameError instead of the message
                    assert x in rvnumber, "?? "+x+" "+line
                for x in maintag2:          # syntax check
                    assert x in rvtype, "?? "+x+" "+line
                tags0.append((maintag1,maintag2))     # store (A,B)
            else:                           # other tag, not "(A:B)"
                assert len(tag) in (3, 4), "bad tag"     # 3 chars + optional suffix
                assert tag[0:3] in names, "BAD TAG: "+tag
                if len(tag)==4:
                    assert tag[3] in ["1","2"], "bad tag"  # suffix syntax check
                tags1.append(tag)           # store the other tag
        biglist.append( (tags0,tags1,article) )       # store the parsed line

    return biglist

########################  end  of  parse  ###################################

######################################################################
#                                                                    #
#          format.py                                                 #
#                                                                    #
# Main infile "all.dat" (produced by parse.py).                      #
# A tag not of the form "(A:B)" directs a line to the first          #
#  (core:selected) or the third (around the core) section,           #
#  according to corenames and aroundnames.                           #
# A tag of the form "(A:B)" directs a line to the the second section #
#  (core:other) unless it is directed to the first section.          #
# In the first section a line is special if the tag has a suffix     #
#  "1" or "2".                                                       #
# Parameters from five infiles.                                      #
# Outfile: all.out (to be uploaded).                                 #
#                                                                    #
######################################################################

def format (biglist):
    """Format the parsed catalog lines into wiki markup.

    NOTE(review): the name shadows the builtin format(); kept unchanged
    because the call site below depends on it.

    Appends all output to the module-level string ``formatted`` (via the
    inner ``pr``); also reads the module-level parameter tables corenames,
    aroundnames, rvnumber, rvtype and headings.

    biglist: list of (tags0, tags1, article) triples produced by parse().
    """
    #   Internal   F U N C T I O N S
    #
    # frmt: format a line
    #
    # whole is (tags0,tags1,article), special is True or False
    # tags0: list of (A,B) pairs corresponding to "(A:B)" tags
    # tags1: list of other tags
    # article of the form "X(Y)" turns into "X(Y)|X"
    #  but article of the form "X!(Y)" turns into "X(Y)"
    # returns "[[article]] / tags<br>" if special is False,
    #  or "'''[[article]]''' / tags<br>" if special is True.

    def frmt (whole,special):
        (tags0,tags1,article) = whole
        # find "(" from position 5 on: skips any "(" in the first 5 chars,
        # presumably to avoid titles that begin with a parenthesis — note
        # a disambiguating "(Y)" earlier than position 5 would be missed
        k=article.find("(",5)
        if k>0:                             # article of the form "X(Y)" or "X!(Y)" ?
            if article[k-1] != "!":
                article += "|"+article[0:k].strip()        # "X(Y)|X"
            else:
                article = article[:k-1]+article[k:]        # "X(Y)" ("!" removed)
        if special:
            res="'''[["+article+"]]''' / "
        else:
            res="[["+article+"]] / "
        for z in tags1:                     # first, other tags, not "(A:B)"
            res += z[0:3]+" "               # remove suffix (if any)
        for tag0 in tags0:                  # last, "(A:B)" tags
            (first,second)=tag0
            res += "("+first+":"+second+") "     # convert (A,B) back to "(A:B)"
        res = res[:-1]+"<br>"               # remove the last space
        return res

    # pr: appends text (plus newline) to the module-level string "formatted"

    def pr(text):
        global formatted
        formatted += text+"\n"

    # flst: format and print list of lines by frmt
    # lst: list of usual lines
    # lst1, lst2: lists of special lines
    # (len(lst1),len(lst2)) must be (0,0), (1,0) or (1,1)
    # two columns are produced unless the list is short

    def flst (lst,lst1,lst2):
        assert len(lst2) <= len(lst1), "bad special lines"
        assert len(lst1) <= 1, "bad special lines"
        l = len(lst)
        if l<4 and lst2==[]:                # one column?
            if lst1!=[]:                    #  special line?
                pr(frmt(lst1[0],True))      #  format special line
            for whole in lst:
                pr(frmt(whole,False))       #  format usual line
        else:                               # two columns
            # split point balances the columns, counting the special lines
            k=(l+1-len(lst1)+len(lst2))//2       # a half of lines - to the left column
            pr("{{Top}}")                   # start the left column
            if lst1!=[]:                    #  special line?
                pr(frmt(lst1[0],True))      #  format special line
            for whole in lst[:k]:
                pr(frmt(whole,False))       #  format usual line
            pr("{{Mid}}")                   # start the right column
            if lst2!=[]:                    #  special line?
                pr(frmt(lst2[0],True))      #  format special line
            for whole in lst[k:]:
                pr(frmt(whole,False))       #  format usual line
            pr("{{Bottom}}")                # finish the right column
        return

    #   e n d   o f  internal   f u n c t i o n s


    # initialize lists for "(A:B)" tags
    # A[x+y] will collect all lines containing "(x:y)"

    A = {}
    for (x,xx) in rvnumber:
        for (y,yy) in rvtype:
            A[x+y]=[]

    # initialize lists for other core tags
    # B[z] will collect all usual lines containing z
    # B1[z] will collect all special lines containing tag z with suffix 1
    # B2[z] will collect all special lines containing tag z with suffix 2

    # NOTE(review): tuple-parameter lambdas are Python-2-only syntax
    B = dict ( map ( (lambda(x,y):(x,[])), corenames ) )

    # deep copies so the three dicts get independent empty lists
    B1 = copy.deepcopy(B)
    B2 = copy.deepcopy(B)

    # initialize lists for other non-core tags
    # C[z] will collect all lines containing z

    C = dict ( map ( (lambda(x,y):(x,[])), aroundnames ) )

    # initialize counters ("links" count tags, "articles" count lines)

    count_sel_links=0
    count_sel_articles=0
    count_other_links=0
    count_other_articles=0
    count_around_links = 0
    count_around_articles = 0

    # distribute the lines (according to tags) to the lists

    for whole in biglist:
        (tags0,tags1,article) = whole
        tags11=[]                           # core-selected tags to be collected here
        tags12=[]                           # non-core tags to be collected here
        for z in tags1:
            if z[0:3] in B:                 # tag (without suffix) belongs to core?
                tags11.append(z)            # store in tags11
            else:
                tags12.append(z)            # store in tags12
        if tags11!=[]:                      # some core-selected tags?
            count_sel_articles += 1
            count_sel_links += len(tags11)
        elif tags0!=[]:                     # some core-other (but no core-selected) tags?
            count_other_articles += 1
        if tags12!=[]:                      # some non-core tags?
            count_around_articles += 1
            count_around_links += len(tags12)
        for z in tags11:                    # process core-selected tags
            if len(z)==3:                   # tag with no suffix?
                B[z].append(whole)          # store usual line
            elif z[3]=="1":                 # tag with suffix "1"?
                B1[z[0:3]].append(whole)    # store special line
            else:                           # tag with suffix "2"
                B2[z[0:3]].append(whole)    # store special line
        if tags11==[]:                      # no core-selected tags? then process "(A:B)" tags
            for tag0 in tags0:              # process "(A:B)" tags
                (first,second)=tag0
                for x in first:             # multi-char A/B fan out to every (x,y) pair
                    for y in second:
                        A[x+y].append(whole)
                        count_other_links += 1
        for z in tags12:                    # process non-core tags
            C[z].append(whole)              # store line

    # sort each list alphabetically; sort key is article (not tag)

    for z in A:
        A[z].sort(key=(lambda wh: wh[2]))
    for z in B:
        B[z].sort(key=(lambda wh: wh[2]))
    for z in C:
        C[z].sort(key=(lambda wh: wh[2]))

    # lists are ready; now start generating the output

    pr(headings[0])
    pr("")
    pr(headings[1])
    pr("")

    # format and print selected core topics

    for (z,text) in corenames:
        pr("==="+text+" ("+z+")===")
        pr("")
        flst(B[z],B1[z],B2[z])
        pr("")

    # format and print other core topics

    pr(headings[2])
    pr("")

    for (x,text1) in rvnumber:
        pr("==="+text1+" ("+x+":)===")
        pr("")
        for (y,text2) in rvtype:
            if A[x+y] != []:                # skip empty (x:y) subsections
                pr("===="+text2+" ("+x+":"+y+")====")
                pr("")
                flst(A[x+y],[],[])
                pr("")

    # format and print non-core topics

    pr(headings[3])
    pr("")

    for (z,text) in aroundnames:
        pr("==="+text+" ("+z+")===")
        pr("")
        flst(C[z],[],[])
        pr("")

    # finally the counters section, in two columns

    pr(headings[4])
    pr("{{Top}}")
    pr( '"Core": %i (%i)<br>' % ( count_sel_articles + count_other_articles, count_sel_links + count_other_links ) )
    pr( '"Around": %i (%i)<br>' % ( count_around_articles, count_around_links) )
    pr("{{Mid}}")
    pr( '"Core selected": %i (%i)<br>' % (count_sel_articles,count_sel_links) )
    pr( '"Core others": %i (%i)<br>' % ( count_other_articles, count_other_links ) )
    pr("{{Bottom}}")
    pr("")
    pr( "Here ''k''(''n'') means: ''n'' links to ''k'' articles. (Some articles are linked more than once.)" )
    return
########################  end  of  format  ####################################


######################  now execute them  #####################################

# parse the catalog source, then format it into wiki markup
parsed = parse (source_txt, corenames, aroundnames, rvnumber, rvtype)

formatted = ""           # initialize the output string (appended to by format's pr())

format (parsed)

# upload the result to the catalog page
out_page=wikipedia.Page(mysite,'User:Tsirel/Catalog')

out_page.put(formatted)            ###  write to Wikipedia  ###

###########################  the end  #########################################
</code>

files of parameters

corenames.dat

[('bsc', 'Basic notions'), ('mnt', 'Moments'), ('inq', 'Inequalities'), ('Mar', 'Markov chains, processes,
fields, networks'), ('Gau', 'Gaussian random variables, vectors, functions'), ('cnd', 'Conditioning'), ('spd',
'Specific distributions'), ('emm', 'Empirical measure'), ('lmt', 'Limit theorems'), ('lrd', 'Large deviations'), 
('scl', 'Stochastic calculus'), ('Mal', 'Malliavin calculus'), ('anl', 'Analytic aspects (including measure
theoretic)')]

aroundnames.dat

[('grl', 'General aspects'), ('fnd', 'Foundations of probability theory'), ('gmb', 'Gambling'), ('cnc',
'Coincidence'), ('alg', 'Algorithmics'), ('Bay', 'Bayesian approach'), ('fnc', 'Financial mathematics'), ('phs', 
'Physics'), ('gnt', 'Genetics'), ('spr', 'Stochastic process'), ('geo', 'Geometric probability'), ('emp',
'Empirical findings'), ('hst', 'Historical'), ('msc', 'Miscellany')]

rvnumber.dat

[("1","A single random variable"),
 ("2","Two random variables"),
 ("3","Three random variables"),
 ("F","Finitely many random variables"),
 ("L","A large number of random variables (finite but tending to infinity)"),
 ("S","An infinite sequence of random variables"),
 ("U","Uncountably many random variables (continuous-time processes etc)")]

rvtype.dat

[("B","Binary"), ("D","Discrete"),
 ("C", "Continuous"), ("R","Real-valued, arbitrary"),
 ("M","Random point of a manifold"),
 ("G","General (random element of an abstract space)")]

headings.dat

["==Core probability: selected topics==",
 "'''[[Probability theory]]'''",
 "==Core probability: other articles, by number and type of random variables==",
 "==Around the core==",
 "==Counters of articles=="]