Jump to content

User:Tsirel/Bot code

From Wikipedia, the free encyclopedia
This is an old revision of this page, as edited by Tsirel (talk | contribs) at 16:15, 22 March 2009 (syntax). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.
import sys
import copy
import codecs  # for debug

sys.path.append("/home/boris/Wiki/Bot/pywikipedia")

import wikipedia

# Fetch the bot's parameter page from the wiki and split it into lines.
mysite = wikipedia.getSite()
params_page = wikipedia.Page(mysite, 'User:Tsirel/Bot parameters')
params_txt = params_page.get().splitlines()

# The page must open with this exact section heading.  The heading line is
# consumed outside of any `assert` statement: under `python -O` asserts are
# stripped together with their side effects, and the line would then stay in
# params_txt.  An empty page is reported the same way as a wrong heading.
if not params_txt or params_txt.pop(0) != "== files of parameters ==":
    raise AssertionError("bad parameters")

def parse_param (name):
    """Consume one parameter section from the module-level ``params_txt``.

    The section is expected at the front of ``params_txt`` (a list of
    lines) in this layout::

        === <name>.dat ===
        <empty line>
        <pre>
        ...lines holding a Python expression...
        </pre>
        <empty line>        (absent when nothing follows the section)

    Every line of the section is removed from ``params_txt``.  The lines
    between ``<pre>`` and ``</pre>`` are concatenated without any separator
    and evaluated as a Python expression.

    Returns the value of that expression.
    Raises AssertionError("bad parameter <name>") when the layout differs,
    including when the closing ``</pre>`` is missing.
    """
    error = "bad parameter " + name

    def expect(wanted):
        # Consume the next line and insist that it equals `wanted`.  The pop
        # is kept out of an `assert` statement on purpose: `python -O` strips
        # asserts, and the line would then never be consumed.
        if not params_txt or params_txt.pop(0) != wanted:
            raise AssertionError(error)

    expect("=== " + name + ".dat ===")
    expect("")
    expect("<pre>")

    pieces = []
    while True:
        if not params_txt:                  # ran out of lines before "</pre>"
            raise AssertionError(error)
        line = params_txt.pop(0)
        if line == "</pre>":
            break
        pieces.append(line)

    if params_txt:                          # an empty line separates sections
        expect("")

    # SECURITY NOTE(review): eval() executes whatever expression the text in
    # params_txt contains.  If that text can be edited by anyone other than
    # the bot owner, this is arbitrary code execution.  The parameters look
    # like plain literals -- TODO confirm and switch to ast.literal_eval.
    return eval("".join(pieces))

# The sections are read in this fixed order because parse_param() consumes
# them one after another from the front of params_txt.
corenames = parse_param ("corenames")
aroundnames = parse_param ("aroundnames")
rvnumber = parse_param ("rvnumber")
rvtype = parse_param ("rvtype")
headings = parse_param ("headings")

source_page = wikipedia.Page(mysite, 'Talk:Catalog of articles in probability theory/Source')
source_txt = source_page.get().splitlines()     # read from Wikipedia

# The source page must be wrapped in <pre> ... </pre>; strip that wrapper.
# The pops are kept out of `assert` statements: `python -O` strips asserts
# together with their side effects, which would leave the wrapper lines in
# source_txt.  An empty page is reported the same way as a missing wrapper.
if not source_txt or source_txt.pop(0) != "<pre>":
    raise AssertionError("bad source")
if not source_txt or source_txt.pop() != "</pre>":
    raise AssertionError("bad source")

"parse" subroutine

######################################################################
#                                                                    #
#          parse                                                     #
#                                                                    #
# Main infile "all.in"; its lines: "[[article]] tags".               #
# Tags are space-separated,                                          #
#  each either "(rvnumber:rvtype)"                                   #
#  or a name (corename or aroundname) and maybe suffix "1" or "2".   #
# Parameters from four infiles.                                      #
# Outfile: all.dat (for format.py).                                  #
#                                                                    #
######################################################################

def parse (source_txt, corenames, aroundnames, rvnumber, rvtype):
    """Parse the catalog source lines into a list of tagged articles.

    Each non-empty line of ``source_txt`` has the form ``[[article]] tags``,
    where the tags are separated by whitespace and each tag is either

    * ``(A:B)``, every character of ``A`` being an rvnumber abbreviation and
      every character of ``B`` an rvtype abbreviation, or
    * a three-character name (a corename or an aroundname), optionally
      followed by the suffix ``1`` or ``2``.

    Parameters:
        source_txt:  iterable of source lines.
        corenames, aroundnames, rvnumber, rvtype:  sequences of
            (abbreviation, text) pairs; only the abbreviations are used.

    Returns a list with one tuple ``(tags0, tags1, article)`` per tagged
    line, in input order.  ``tags0`` is the list of ``(A, B)`` pairs taken
    from the "(A:B)" tags, ``tags1`` the list of the other tags (suffix
    kept), and ``article`` the title with its first character upper-cased.
    Empty lines and lines without any tag are skipped.

    Raises AssertionError, with the offending line or tag in the message,
    on any syntax error in the input.
    """
    # only the abbreviations are needed here; comprehensions are used instead
    # of "lambda (x, y): ..." because tuple parameters are Python-2-only
    names = ([abbr for (abbr, text) in corenames]
             + [abbr for (abbr, text) in aroundnames])
    rvnumber = [abbr for (abbr, text) in rvnumber]
    rvtype = [abbr for (abbr, text) in rvtype]

    biglist = []                            # for parsed lines

    for line in source_txt:                 # main loop over the source lines
        line = line.strip()
        if line == "":
            continue                        # ignore empty lines (if any)

        # extract the article from "[[article]] tags"
        assert line.startswith("[["), "BAD LINE: "+line
        close = line.find("]]")
        assert close >= 0, "BAD LINE: "+line          # no closing "]]"
        article = line[2:close]
        assert article != "", "BAD LINE: "+line       # "[[]]" has no title
        article = article[0].upper()+article[1:]      # article -> Article

        rest = line[close+2:].split()       # list of tags
        if not rest:
            continue                        # no tags: the line is not processed

        tags0 = []                          # "(A:B)" tags converted to (A,B)
        tags1 = []                          # other tags
        for tag in rest:
            if tag.startswith("("):         # "(A:B)" tag?
                maintag = tag[1:]
                assert maintag.endswith(")"), "!!! "+maintag    # syntax error
                maintag = maintag[:-1]
                colon = maintag.find(":")
                assert colon >= 0, "!!! "+maintag               # no ":" inside
                maintag1 = maintag[:colon]                      # A from "(A:B)"
                maintag2 = maintag[colon+1:]                    # B from "(A:B)"
                for x in maintag1:          # every character must be known
                    assert x in rvnumber, "?? "+x+" "+line
                for x in maintag2:
                    assert x in rvtype, "?? "+x+" "+line
                tags0.append((maintag1, maintag2))
            else:                           # other tag, not "(A:B)"
                # 3 characters, and maybe a 1-character suffix
                assert len(tag) == 3 or len(tag) == 4, "bad tag"
                assert tag[0:3] in names, "BAD TAG: "+tag
                if len(tag) == 4:
                    assert tag[3] in ["1", "2"], "bad tag"      # suffix check
                tags1.append(tag)
        biglist.append((tags0, tags1, article))       # store the parsed line

    return biglist

########################  end  of  parse  ###################################

"format" subroutine

######################################################################
#                                                                    #
#          format                                                   #
#                                                                    #
# Main infile "all.dat" (produced by parse.py).                      #
# A tag not of the form "(A:B)" directs a line to the first          #
#  (core:selected) or the third (around the core) section,           #
#  according to corenames and aroundnames.                           #
# A tag of the form "(A:B)" directs a line to the second section     #
#  (core:other) unless it is directed to the first section.          #
# In the first section a line is special if the tag has a suffix     #
#  "1" or "2".                                                       #
# Parameters from five infiles.                                      #
# Outfile: all.out (to be uploaded).                                 #
#                                                                    #
######################################################################

def format (biglist):
    """Render the parsed catalog as wiki text and append it to ``formatted``.

    ``biglist`` is a list of ``(tags0, tags1, article)`` tuples: ``tags0``
    holds (A, B) pairs from "(A:B)" tags, ``tags1`` the other tags (three
    characters plus an optional suffix "1" or "2"), ``article`` the title.

    Besides its argument the function reads the module-level ``corenames``,
    ``aroundnames``, ``rvnumber`` and ``rvtype`` (sequences of
    (abbreviation, text) pairs) and ``headings`` (indexed 0..4).

    Output layout:
      1. headings[0], headings[1], then one subsection per corename listing
         the lines that carry that tag ("core: selected");
      2. headings[2], then per rvnumber/rvtype pair the lines that carry a
         matching "(A:B)" tag and no core tag ("core: other");
      3. headings[3], then one subsection per aroundname ("around");
      4. headings[4] and the article/link counters.

    The text is appended to the module-level string ``formatted`` in one
    step at the very end, so nothing is appended if an error occurs.
    Returns None.  Raises AssertionError if a core tag has more than one
    line with suffix "1", or a line with suffix "2" without one with "1".
    """
    global formatted

    out = []                                # output lines, joined at the end

    #   Internal   F U N C T I O N S

    def pr(text):
        # "print" one line of output
        out.append(text)

    def frmt (whole, special):
        # Format one line: "[[article]] / tags<br>", with the link in bold
        # ('''...''') when `special` is true.
        # An article of the form "X(Y)" is linked as "X(Y)|X", so that only X
        # is displayed; an article of the form "X!(Y)" is linked as "X(Y)".
        (tags0, tags1, article) = whole
        # The search starts at index 5 -- presumably so that a parenthesis
        # near the start of a title is not treated as "(Y)"; TODO confirm.
        k = article.find("(", 5)
        if k > 0:
            if article[k-1] != "!":
                article += "|"+article[0:k].strip()        # "X(Y)|X"
            else:
                article = article[:k-1]+article[k:]        # "X(Y)"
        if special:
            res = "'''[["+article+"]]''' / "
        else:
            res = "[["+article+"]] / "
        for z in tags1:                     # first the plain tags ...
            res += z[0:3]+" "               # ... without suffix (if any)
        for (first, second) in tags0:       # then the "(A:B)" tags
            res += "("+first+":"+second+") "
        return res[:-1]+"<br>"              # drop the trailing space

    def flst (lst, lst1, lst2):
        # Format and print a list of lines.
        # lst: usual lines; lst1, lst2: special lines, where
        # (len(lst1), len(lst2)) must be (0,0), (1,0) or (1,1).
        # Two columns are produced unless the list is short.
        assert len(lst2) <= len(lst1), "bad special lines"
        assert len(lst1) <= 1, "bad special lines"
        l = len(lst)
        if l < 4 and lst2 == []:            # one column
            if lst1 != []:
                pr(frmt(lst1[0], True))
            for whole in lst:
                pr(frmt(whole, False))
        else:                               # two columns
            # number of usual lines that go to the left column
            k = (l+1-len(lst1)+len(lst2))//2
            pr("{{Top}}")                   # start the left column
            if lst1 != []:
                pr(frmt(lst1[0], True))
            for whole in lst[:k]:
                pr(frmt(whole, False))
            pr("{{Mid}}")                   # start the right column
            if lst2 != []:
                pr(frmt(lst2[0], True))
            for whole in lst[k:]:
                pr(frmt(whole, False))
            pr("{{Bottom}}")                # finish the right column

    #   e n d   o f  internal   f u n c t i o n s

    # A[x+y] collects all lines containing "(x:y)"
    A = {}
    for (x, xx) in rvnumber:
        for (y, yy) in rvtype:
            A[x+y] = []

    # B[z]:  usual lines containing core tag z
    # B1[z]: special lines containing core tag z with suffix 1
    # B2[z]: special lines containing core tag z with suffix 2
    B = dict((z, []) for (z, text) in corenames)
    B1 = dict((z, []) for (z, text) in corenames)
    B2 = dict((z, []) for (z, text) in corenames)

    # C[z]: lines containing non-core tag z
    C = dict((z, []) for (z, text) in aroundnames)

    # counters: number of articles and number of links in each section
    count_sel_links = 0
    count_sel_articles = 0
    count_other_links = 0
    count_other_articles = 0
    count_around_links = 0
    count_around_articles = 0

    # distribute the lines (according to their tags) to the lists

    for whole in biglist:
        (tags0, tags1, article) = whole
        tags11 = [z for z in tags1 if z[0:3] in B]         # core-selected tags
        tags12 = [z for z in tags1 if z[0:3] not in B]     # non-core tags
        if tags11 != []:
            count_sel_articles += 1
            count_sel_links += len(tags11)
        elif tags0 != []:                   # core-other, but no core-selected
            count_other_articles += 1
        if tags12 != []:
            count_around_articles += 1
            count_around_links += len(tags12)
        for z in tags11:
            if len(z) == 3:                 # no suffix: usual line
                B[z].append(whole)
            elif z[3] == "1":               # suffix "1": special line
                B1[z[0:3]].append(whole)
            else:                           # suffix "2": special line
                B2[z[0:3]].append(whole)
        if tags11 == []:                    # "(A:B)" tags count only if the
            for (first, second) in tags0:   #  line has no core-selected tag
                for x in first:
                    for y in second:
                        A[x+y].append(whole)
                        count_other_links += 1
        for z in tags12:
            # index by the bare name: a suffix on a non-core tag has no
            # meaning here, and C has no key that includes a suffix
            C[z[0:3]].append(whole)

    # sort each list alphabetically; the sort key is the article (not the
    # tag).  B1 and B2 hold at most one line per tag when valid (see flst).

    for group in (A, B, C):
        for z in group:
            group[z].sort(key=(lambda wh: wh[2]))

    # lists are ready; now generate the output

    pr(headings[0])
    pr("")
    pr(headings[1])
    pr("")

    # selected core topics

    for (z, text) in corenames:
        pr("==="+text+" ("+z+")===")
        pr("")
        flst(B[z], B1[z], B2[z])
        pr("")

    # other core topics

    pr(headings[2])
    pr("")

    for (x, text1) in rvnumber:
        pr("==="+text1+" ("+x+":)===")
        pr("")
        for (y, text2) in rvtype:
            if A[x+y] != []:                # empty subsections are omitted
                pr("===="+text2+" ("+x+":"+y+")====")
                pr("")
                flst(A[x+y], [], [])
                pr("")

    # non-core topics

    pr(headings[3])
    pr("")

    for (z, text) in aroundnames:
        pr("==="+text+" ("+z+")===")
        pr("")
        flst(C[z], [], [])
        pr("")

    # statistics

    pr(headings[4])
    pr("{{Top}}")
    pr( '"Core": %i (%i)<br>' % ( count_sel_articles + count_other_articles, count_sel_links + count_other_links ) )
    pr( '"Around": %i (%i)<br>' % ( count_around_articles, count_around_links) )
    pr("{{Mid}}")
    pr( '"Core selected": %i (%i)<br>' % (count_sel_articles,count_sel_links) )
    pr( '"Core others": %i (%i)<br>' % ( count_other_articles, count_other_links ) )
    pr("{{Bottom}}")
    pr("")
    pr( "Here ''k''(''n'') means: ''n'' links to ''k'' articles. (Some articles are linked more than once.)" )

    # every line, including the last one, is terminated by a newline
    formatted += "\n".join(out)+"\n"
    return
########################  end  of  format  ####################################

executing "parse" and "format", writing the result

######################  now execute them  #####################################

# Turn the source lines into a list of (tags0, tags1, article) tuples.
parsed = parse (source_txt, corenames, aroundnames, rvnumber, rvtype)

# format() appends its output to this module-level string, so it has to
# exist (and be empty) before format() is called.
formatted = ""           # initialize the output string

# Returns nothing; the result is left in `formatted`.
format (parsed)

# Target page that receives the generated catalog.
out_page=wikipedia.Page(mysite,'User:Tsirel/Catalog')

out_page.put(formatted)            ###  write to Wikipedia  ###

###########################  the end  #########################################