Jump to content

User:Tsirel/Bot code

From Wikipedia, the free encyclopedia
This is an old revision of this page, as edited by Tsirel (talk | contribs) at 19:46, 6 January 2009 (aroundnames.dat). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.

parse.py

######################################################################
#                                                                    #
#          parse.py                                                  #
#                                                                    #
# Main infile "all.in"; its lines: "[[article]] tags".               #
# Tags are space-separated,                                          #
#  each either "(rvnumber:rvtype)"                                   #
#  or a name (corename or aroundname) and maybe suffix "1" or "2".   #
# Parameters from four infiles.                                      #
# Outfile: all.dat (for format.py).                                  #
#                                                                    #
######################################################################

# read parameters

def _read_text (filename):
    """Return the whole content of the named file; the file is closed afterwards."""
    infile = open(filename)
    try:
        return infile.read()
    finally:
        infile.close()                  # do not rely on garbage collection

def _abbreviations (pairs):
    """Return the list of first components of (abbreviation, full text) pairs."""
    return [abbreviation for (abbreviation,fulltext) in pairs]

# NOTE: each parameter file is evaluated as a Python expression
#  (a list of pairs), so these files must come from a trusted source.

corenames = eval(_read_text("corenames.dat"))
aroundnames = eval(_read_text("aroundnames.dat"))
rvnumber = eval(_read_text("rvnumber.dat"))
rvtype = eval(_read_text("rvtype.dat"))

# only abbreviations will be used; full texts are for "format.py"

corenames = _abbreviations(corenames)
aroundnames = _abbreviations(aroundnames)
rvnumber = _abbreviations(rvnumber)
rvtype = _abbreviations(rvtype)

# distinction of "core" and "around" is for "format.py"

names = corenames + aroundnames

# parameters are ready; the main loop follows

biglist = []                            # for parsed lines
notags = []                             # for lines without tags (if any)
everything = {}                         # for articles (duplication check)

# read the main input file and close it explicitly

infile = open("all.in")
try:
    lines = infile.readlines()
finally:
    infile.close()

# single-argument print(...) is used: same output as the print statement

for line in lines:                      # main loop: lines of the main input file
    line=line.strip()
    if line=="":  continue              # ignore empty lines (if any)
    closing=line.find("]]")             # -1 if "]]" is missing
    if not line.startswith("[[") or closing<3:    # no "[[", no "]]", or empty "[[]]"
        print("BAD LINE: "+line)        # syntax error in the main input file
        continue
    article=line[2:closing]             # extract article from "[[article]] tags"
    article=article[0].upper()+article[1:]        # article -> Article
    if article in everything:           # duplication? (reported, but still processed)
        print("AGAIN "+article)
    everything[article]=0               # the 0 is of no use
    rest=line[closing+2:].split()       # list of tags
    if rest==[]:                        # no tags: report, do not process
        notags.append(article)
        continue
    tags0=[]                            # for "(A:B)" tags converted to (A,B)
    tags1=[]                            # for other tags
    for tag in rest:                    # loop over all tags of the given line
        if tag.startswith("("):         # "(A:B)" tag?
            maintag=tag[1:]
            if not maintag.endswith(")"):
                print("!!! "+maintag)   # syntax error: skip the remaining tags;
                break                   #  tags collected so far are still stored
            maintag=maintag[:-1]
            colon=maintag.find(":")
            if colon<0:                 # no ":" at all
                print("?? "+tag+" "+line)
                raise ValueError("oops-0")        # syntax error
            maintag1=maintag[:colon]    # A from "(A:B)"
            maintag2=maintag[colon+1:]  # B from "(A:B)"
            for x in maintag1:          # syntax check: every char of A is an rvnumber
                if x not in rvnumber:
                    print("?? "+x+" "+line)
                    raise ValueError("oops-1")    # syntax error
            for x in maintag2:          # syntax check: every char of B is an rvtype
                if x not in rvtype:
                    print("?? "+x+" "+line)
                    raise ValueError("oops-2")    # syntax error
            tags0.append((maintag1,maintag2))     # store (A,B)
        else:                           # other tag, not "(A:B)"
            # 3 chars, and maybe 1 char suffix
            assert len(tag)==3 or len(tag)==4, "bad tag length: "+tag
            if tag[0:3] not in names:
                print("BAD TAG: "+tag)
                continue                # syntax error: this tag is dropped
            if len(tag)==4:             # suffix syntax check
                assert tag[3] in ["1","2"], "bad tag suffix: "+tag
            tags1.append(tag)           # store the other tag
    biglist.append( (tags0,tags1,article) )       # store the parsed line

# the main loop is finished

outfile = open("all.dat","w")           # write main out file
try:
    outfile.write(str(biglist)+"\n")
finally:
    outfile.close()

if notags != []:                        # report lines with no tags (if any)
    print("*** notags ***")
    print(notags)

########################  the end  ###################################

format.py

######################################################################
#                                                                    #
#          format.py                                                 #
#                                                                    #
# Main infile "all.dat" (produced by parse.py).                      #
# A tag not of the form "(A:B)" directs a line to the first          #
#  (core:selected) or the third (around the core) section,           #
#  according to corenames and aroundnames.                           #
# A tag of the form "(A:B)" directs a line to the second section     #
#  (core:other) unless it is directed to the first section.          #
# In the first section a line is special if the tag has a suffix     #
#  "1" or "2".                                                       #
# Parameters from five infiles.                                      #
# Outfile: all.out (to be uploaded).                                 #
#                                                                    #
######################################################################

import sys
import copy

# read parameters

def _read_text (filename):
    """Return the whole content of the named file; the file is closed afterwards."""
    infile = open(filename)
    try:
        return infile.read()
    finally:
        infile.close()                  # do not rely on garbage collection

# NOTE: every input file is evaluated as a Python expression,
#  so these files must come from a trusted source.

corenames = eval(_read_text("corenames.dat"))
aroundnames = eval(_read_text("aroundnames.dat"))
rvnumber = eval(_read_text("rvnumber.dat"))
rvtype = eval(_read_text("rvtype.dat"))
headings  = eval(_read_text("headings.dat"))

# read the main infile

biglist = eval(_read_text("all.dat"))

#   F U N C T I O N S
#
# frmt: format a line
#
# whole is (tags0,tags1,article), special is True or False
# tags0: list of (A,B) pairs corresponding to "(A:B)" tags
# tags1: list of other tags
# article of the form "X(Y)" turns into "X(Y)|X"
#  but article of the form "X!(Y)" turns into "X(Y)"
# returns "[[article]] / tags<br>" if special is False,
#  or "'''[[article]]''' / tags<br>" if special is True.

def frmt (whole,special):
    """Format one line of the output.

    whole is (tags0,tags1,article); tags0 is a list of (A,B) pairs, tags1 is
    a list of other tags (shown without suffix).  A "(" is looked for from
    position 5 of the article on; "X(Y)" becomes "X(Y)|X" and "X!(Y)"
    becomes "X(Y)".  Returns "[[article]] / tags<br>", with the link
    wrapped in ''' ''' when special is true.
    """
    (pairs,plain,title) = whole
    paren=title.find("(",5)
    if paren>0:                         # title of the form "X(Y)" or "X!(Y)"
        if title[paren-1]=="!":
            title=title[:paren-1]+title[paren:]        # drop the "!": "X(Y)"
        else:
            title=title+"|"+title[0:paren].strip()     # piped link: "X(Y)|X"
    if special:
        pieces=["'''[["+title+"]]''' /"]
    else:
        pieces=["[["+title+"]] /"]
    pieces.extend([z[0:3] for z in plain])             # other tags first, suffix removed
    pieces.extend(["("+first+":"+second+")" for (first,second) in pairs])   # then "(A:B)" tags
    return " ".join(pieces)+"<br>"      # single spaces between the pieces, none at the end

# flst: format and print list of lines by frmt
# lst: list of usual lines
# lst1, lst2: lists of special lines
# (len(lst1),len(lst2)) must be (0,0), (1,0) or (1,1)
# two columns are produced unless the list is short

def flst (lst,lst1,lst2):
    """Format (by frmt) and print a list of lines.

    lst holds the usual lines; lst1 and lst2 hold the special lines, with
    (len(lst1),len(lst2)) being (0,0), (1,0) or (1,1).  Fewer than four
    usual lines and no lst2 give one column; otherwise two columns are
    printed between "{{Top}}", "{{Mid}}" and "{{Bottom}}", the special
    line of lst1 heading the left column and that of lst2 the right one.
    """
    assert len(lst2) <= len(lst1)
    assert len(lst1) <= 1

    def column (special_lines,usual_lines):
        # formatted lines of one column, special line (if any) first
        if special_lines!=[]:
            yield frmt(special_lines[0],True)
        for whole in usual_lines:
            yield frmt(whole,False)

    total = len(lst)
    if total>=4 or lst2!=[]:            # two columns
        half=(total+1-len(lst1)+len(lst2))//2     # usual lines of the left column
        print("{{Top}}")                # start the left column
        for text in column(lst1,lst[:half]):
            print(text)
        print("{{Mid}}")                # start the right column
        for text in column(lst2,lst[half:]):
            print(text)
        print("{{Bottom}}")             # finish the right column
    else:                               # one column
        for text in column(lst1,lst):
            print(text)
    return

#   e n d   o f   f u n c t i o n s

# initialize lists for "(A:B)" tags
# A[x+y] will collect all lines containing "(x:y)"

# one empty list per combination of an rvnumber x and an rvtype y

A = dict ( (x+y,[]) for (x,xx) in rvnumber for (y,yy) in rvtype )

# initialize lists for other core tags
# B[z] will collect all usual lines containing z
# B1[z] will collect all special lines containing tag z with suffix 1
# B2[z] will collect all special lines containing tag z with suffix 2
# (the three tables have the same keys but separate lists)

B = dict ( (z,[]) for (z,text) in corenames )
B1 = dict ( (z,[]) for z in B )
B2 = dict ( (z,[]) for z in B )

# initialize lists for other non-core tags
# C[z] will collect all lines containing z

C = dict ( (z,[]) for (z,text) in aroundnames )

# initialize counters (articles and links, for each of the three sections)

count_sel_links = count_sel_articles = 0
count_other_links = count_other_articles = 0
count_around_links = count_around_articles = 0

# distribute the lines (according to tags) to the lists

# Each parsed line goes to B/B1/B2 for every core tag it has, to A for its
# "(A:B)" tags (only when it has no core tag), and to C for every non-core tag.

for whole in biglist:
    (tags0,tags1,article) = whole
    tags11=[]                           # core-selected tags to be collected here
    tags12=[]                           # non-core tags to be collected here
    for z in tags1:
        if z[0:3] in B:                 # tag (without suffix) belongs to core?
            tags11.append(z)            # store in tags11
        else:
            tags12.append(z)            # store in tags12
    if tags11!=[]:                      # some core-selected tags?
        count_sel_articles += 1
        count_sel_links += len(tags11)
    elif tags0!=[]:                     # some core-other (but no core-selected) tags?
        count_other_articles += 1
    if tags12!=[]:                      # some non-core tags?
        count_around_articles += 1
        count_around_links += len(tags12)
    for z in tags11:                    # process core-selected tags
        if len(z)==3:                   # tag with no suffix?
            B[z].append(whole)          # store usual line
        elif z[3]=="1":                 # tag with suffix "1"?
            B1[z[0:3]].append(whole)    # store special line
        else:                           # tag with suffix "2"
            B2[z[0:3]].append(whole)    # store special line
    if tags11==[]:                      # no core-selected tags? then process "(A:B)" tags
        for tag0 in tags0:              # process "(A:B)" tags
            (first,second)=tag0
            for x in first:             # every char of A combined with every char of B
                for y in second:
                    A[x+y].append(whole)
                    count_other_links += 1
    for z in tags12:                    # process non-core tags
        # parse.py accepts a suffix on any name, but C is keyed by the
        #  3-char name only; the suffix is ignored here
        C[z[0:3]].append(whole)         # store line

# sort each list alphabetically; sort key is article (not tag)

def _article_of (wh):
    # sort key: the article, i.e. the third component of (tags0,tags1,article)
    return wh[2]

for table in (A,B,C):                   # B1 and B2 are left as they are
    for collected in table.values():
        collected.sort(key=_article_of)

# lists are ready; now start generating the output

outfile = open("all.out","w")
saved_stdout = sys.stdout
sys.stdout = outfile                    # redirect "print" to the outfile

# single-argument print(...) is used: same output as the print statement;
#  print("") gives an empty line

try:
    print(headings[0])
    print("")
    print(headings[1])
    print("")

    # format and print selected core topics

    for (z,text) in corenames:
        print("==="+text+" ("+z+")===")
        print("")
        flst(B[z],B1[z],B2[z])
        print("")

    # format and print other core topics

    print(headings[2])
    print("")

    for (x,text1) in rvnumber:
        print("==="+text1+" ("+x+":)===")
        print("")
        for (y,text2) in rvtype:
            if A[x+y] != []:            # empty subsections are omitted
                print("===="+text2+" ("+x+":"+y+")====")
                print("")
                flst(A[x+y],[],[])
                print("")

    # format and print non-core topics

    print(headings[3])
    print("")

    for (z,text) in aroundnames:
        print("==="+text+" ("+z+")===")
        print("")
        flst(C[z],[],[])
        print("")

    # print the counters: articles, and (in parentheses) links

    print(headings[4])
    print("{{Top}}")
    print('"Core": %i (%i)<br>' % ( count_sel_articles + count_other_articles, count_sel_links + count_other_links ))
    print('"Around": %i (%i)<br>' % ( count_around_articles, count_around_links))
    print("{{Mid}}")
    print('"Core selected": %i (%i)<br>' % (count_sel_articles,count_sel_links))
    print('"Core others": %i (%i)<br>' % ( count_other_articles, count_other_links ))
    print("{{Bottom}}")
    print("")
    print("Here ''k''(''n'') means: ''n'' links to ''k'' articles. (Some articles are linked more than once.)")
finally:
    sys.stdout = saved_stdout           # undo the redirection, even after an error
    outfile.close()                     # make sure everything is written to all.out

########################  the end  ####################################

files of parameters

corenames.dat

[('bsc', 'Basic notions'), ('mnt', 'Moments'), ('inq', 'Inequalities'),
('Mar', 'Markov chains, processes, fields, networks'),
('Gau', 'Gaussian random variables, vectors, functions'), ('cnd', 'Conditioning'),
('spd', 'Specific distributions'), ('emm', 'Empirical measure'), ('lmt', 'Limit theorems'),
('lrd', 'Large deviations'), ('scl', 'Stochastic calculus'), ('Mal', 'Malliavin calculus'),
('anl', 'Analytic aspects (including measure theoretic)')]

aroundnames.dat

[('grl', 'General aspects'), ('fnd', 'Foundations of probability theory'), ('gmb', 'Gambling'), ('cnc',
'Coincidence'), ('alg', 'Algorithmics'), ('Bay', 'Bayesian approach'), ('fnc', 'Financial mathematics'), ('phs', 
'Physics'), ('gnt', 'Genetics'), ('spr', 'Stochastic process'), ('geo', 'Geometric probability'), ('emp',
'Empirical findings'), ('hst', 'Historical'), ('msc', 'Miscellany')]

rvnumber.dat

[("1","A single random variable"),
 ("2","Two random variables"),
 ("3","Three random variables"),
 ("F","Finitely many random variables"),
 ("L","A large number of random variables (finite but tending to infinity)"),
 ("S","An infinite sequence of random variables"),
 ("U","Uncountably many random variables (continuous-time processes etc)")]