Jump to content

User:Tsirel/Bot code

From Wikipedia, the free encyclopedia
This is an old revision of this page, as edited by Tsirel (talk | contribs) at 19:32, 6 January 2009. The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.

parse.py

######################################################################
#                                                                    #
#          parse.py                                                  #
#                                                                    #
# Main infile "all.in"; its lines: "[[article]] tags".               #
# Tags are space-separated,                                          #
#  each either "(rvnumber:rvtype)"                                   #
#  or a name (corename or aroundname) and maybe suffix "1" or "2".   #
# Parameters from four infiles.                                      #
# Outfile: all.dat (for format.py).                                  #
#                                                                    #
######################################################################

# read parameters

# Load the four parameter tables.
# NOTE(review): each file is executed with eval(), so these files must be
# trusted -- anything written in them runs as Python code.
tables = [
    eval(open(path).read())
    for path in ("corenames.dat", "aroundnames.dat",
                 "rvnumber.dat", "rvtype.dat")
]

# Every table entry is unpacked as a pair; only its first item (the
# abbreviation) is kept here, the full texts are for "format.py".
corenames, aroundnames, rvnumber, rvtype = [
    [abbr for (abbr, full) in table] for table in tables
]

# A single list of all names is enough for parsing; the distinction of
# "core" and "around" is for "format.py".
names = corenames + aroundnames

# parameters are ready; the main loop follows

biglist = []                            # for parsed lines
notags = []                             # for lines without tags (if any)
everything = {}                         # for articles (duplication check)

for line in open("all.in"):             # main loop: lines of the main input file
    line=line.strip()
    if line=="":  continue              # ignore empty lines (if any)
    if not line.startswith("[["):
        print "BAD LINE: "+line         # syntax error in the main input file
        continue
    article=line[2:line.index("]]")]    # extract article from "[[article]] tags"
    article=article[0].upper()+article[1:]        # article -> Article
    if article in everything:           # duplication?
        print "AGAIN "+article
    everything[article]=0               # the 0 is of no use
    rest=line[line.index("]]")+2:].split()        # list of tags
    if rest==[]:  notags.append(article);  continue    # no tags: report, do not process
    tags0=[]                            # for "(A:B)" tags converted to (A,B)
    tags1=[]                            # for other tags
    for tag in rest:                    # loop over all tags of the given line
        if tag.startswith("("):         # "(A:B)" tag?
            maintag=tag[1:]
            if not maintag.endswith(")"):  print "!!! "+maintag;  break    # syntax error
            maintag=maintag[:-1]
            maintag1=maintag[:maintag.index(":")]      # A from "(A:B)"
            maintag2=maintag[maintag.index(":")+1:]    # B from "(A:B)"
            for x in maintag1:          # syntax check
                if x not in rvnumber:
                    print "?? "+x+" "+whole_line
                    raise "oops-1"      # syntax error
            for x in maintag2:          # syntax check
                if x not in rvtype:
                    print "?? "+x+" "+whole_line
                    raise "oops-2"      # syntax error
            tags0.append((maintag1,maintag2))     # store (A,B)
        else:                           # other tag, not "A:B)"
            assert len(tag)==3 or len(tag)==4     # 3 chars, and maybe 1 char suffix
            if tag[0:3] not in names:
                print "BAD TAG: "+tag
                continue                # syntax error
            if len(tag)==4: assert tag[3] in ["1","2"]      # suffix syntax check
            tags1.append(tag)           # store the other tag
    biglist.append( (tags0,tags1,article) )       # store the parsed line

# the main loop is finished

print >>open("all.dat","w"), biglist    # write main out file

if notags != []:                        # report lines with no tags (if any)
    print "*** notags ***"
    print notags

########################  the end  ###################################

format.py

######################################################################
#                                                                    #
#          format.py                                                 #
#                                                                    #
# Main infile "all.dat" (produced by parse.py).                      #
# A tag not of the form "(A:B)" directs a line to the first          #
#  (core:selected) or the third (around the core) section,           #
#  according to corenames and aroundnames.                           #
# A tag of the form "(A:B)" directs a line to the second section     #
#  (core:other) unless it is directed to the first section.          #
# In the first section a line is special if the tag has a suffix     #
#  "1" or "2".                                                       #
# Parameters from five infiles.                                      #
# Outfile: all.out (to be uploaded).                                 #
#                                                                    #
######################################################################

import sys
import copy

# read parameters

# Load the five parameter tables and then the main infile "all.dat",
# in that order.
# NOTE(review): each file is executed with eval(), so these files must be
# trusted -- anything written in them runs as Python code.
corenames, aroundnames, rvnumber, rvtype, headings, biglist = [
    eval(open(path).read())
    for path in ("corenames.dat", "aroundnames.dat", "rvnumber.dat",
                 "rvtype.dat", "headings.dat", "all.dat")
]

#   F U N C T I O N S
#
# frmt: format a line
#
# whole is (tags0,tags1,article), special is True or False
# tags0: list of (A,B) pairs corresponding to "(A:B)" tags
# tags1: list of other tags
# article of the form "X(Y)" turns into "X(Y)|X"
#  but article of the form "X!(Y)" turns into "X(Y)"
# returns "[[article]] / tags<br>" if special is False,
#  or "'''[[article]]''' / tags<br>" if special is True.

def frmt(whole, special):
    """Format one parsed line as a row of wiki markup ending in "<br>".

    whole   -- (tags0, tags1, article): tags0 is a list of (A,B) pairs,
               tags1 a list of other tags, article the link target.
    special -- if true, the link is wrapped in ''' ... ''' (bold).

    A "(" is looked for in article from index 5 onwards.  If one is found
    and the character before it is "!", that "!" is dropped; otherwise
    the text before the "(" (stripped of surrounding whitespace) is
    appended after "|" as the link label.

    After the link and " / " come the tags1 entries cut to their first
    three characters, then the tags0 pairs written as "(A:B)", separated
    by single spaces.  The last character of that text is replaced by
    "<br>"; with no tags at all this is the space following the "/".
    """
    tags0, tags1, article = whole

    paren = article.find("(", 5)
    if paren > 0:
        if article[paren-1] == "!":
            article = article[:paren-1] + article[paren:]         # "X(Y)"
        else:
            article = article + "|" + article[0:paren].strip()    # "X(Y)|X"

    link = "[[" + article + "]]"
    if special:
        link = "'''" + link + "'''"

    # other tags first (suffix removed), "(A:B)" tags last
    pieces = [name[0:3] for name in tags1]
    pieces += ["(" + first + ":" + second + ")" for (first, second) in tags0]

    text = link + " / " + "".join([piece + " " for piece in pieces])
    return text[:-1] + "<br>"

# flst: format and print list of lines by frmt
# lst: list of usual lines
# lst1, lst2: lists of special lines
# (len(lst1),len(lst2)) must be (0,0), (1,0) or (1,1)
# two columns are produced unless the list is short

def flst (lst,lst1,lst2):
    assert len(lst2) <= len(lst1)
    assert len(lst1) <= 1
    l = len(lst)
    if l<4 and lst2==[]:                # one column?
        if lst1!=[]:                    #  special line?
            print frmt(lst1[0],True)    #  format special line
        for whole in lst:
            print frmt(whole,False)     #  format usual line
    else:                               # two columns
        k=(l+1-len(lst1)+len(lst2))//2       # a half of lines - to the left column
        print "{{Top}}"                 # start the left column
        if lst1!=[]:                    #  special line?
            print frmt(lst1[0],True)    #  format special line
        for whole in lst[:k]:
            print frmt(whole,False)     #  format usual line
        print "{{Mid}}"                 # start the right column
        if lst2!=[]:                    #  special line?
            print frmt(lst2[0],True)    #  format special line
        for whole in lst[k:]:
            print frmt(whole,False)     #  format usual line
        print "{{Bottom}}"              # finish the right column
    return

#   e n d   o f   f u n c t i o n s

# initialize lists for "(A:B)" tags
# A[x+y] will collect all lines containing "(x:y)"

# one empty list per combination of an rvnumber key x and an rvtype key y
A = dict([(x+y, []) for (x,xx) in rvnumber for (y,yy) in rvtype])

# lists for other core tags, one empty list per core name
# B[z] will collect all usual lines containing z
# B1[z] will collect all special lines containing tag z with suffix 1
# B2[z] will collect all special lines containing tag z with suffix 2

B = dict([(z, []) for (z,text) in corenames])
B1 = dict([(z, []) for z in B])         # same keys as B, separate lists
B2 = dict([(z, []) for z in B])

# lists for other non-core tags, one empty list per around name
# C[z] will collect all lines containing z

C = dict([(z, []) for (z,text) in aroundnames])

# initialize counters

count_sel_links = 0                     # core-selected tags, summed over lines
count_sel_articles = 0                  # lines with at least one core-selected tag
count_other_links = 0                   # entries stored into the A lists
count_other_articles = 0                # lines with "(A:B)" tags but no core-selected tag
count_around_links = 0                  # non-core tags, summed over lines
count_around_articles = 0               # lines with at least one non-core tag

# Distribute the lines (according to tags) to the lists A, B, B1, B2, C
# and update the counters.

for whole in biglist:
    (tags0,tags1,article) = whole
    # split the other tags: name (tag without suffix) is a core name or not
    tags11 = [z for z in tags1 if z[0:3] in B]         # core-selected tags
    tags12 = [z for z in tags1 if z[0:3] not in B]     # non-core tags
    if tags11 != []:                    # some core-selected tags?
        count_sel_articles += 1
        count_sel_links += len(tags11)
    elif tags0 != []:                   # some core-other (but no core-selected) tags?
        count_other_articles += 1
    if tags12 != []:                    # some non-core tags?
        count_around_articles += 1
        count_around_links += len(tags12)
    for z in tags11:                    # process core-selected tags
        name = z[0:3]
        if len(z) == 3:                 # tag with no suffix?
            B[name].append(whole)       # store usual line
        elif z[3] == "1":               # tag with suffix "1"?
            B1[name].append(whole)      # store special line
        else:                           # tag with suffix "2"
            B2[name].append(whole)      # store special line
    if tags11 == []:                    # no core-selected tags? then process "(A:B)" tags
        for (first,second) in tags0:
            # "(A:B)" stands for every combination of a character of A
            # with a character of B
            for x in first:
                for y in second:
                    A[x+y].append(whole)
                    count_other_links += 1
    for z in tags12:                    # process non-core tags
        # C is keyed by the 3-character name, so drop a suffix "1"/"2"
        # (if any) instead of failing with KeyError on it
        C[z[0:3]].append(whole)         # store line

# sort each list alphabetically; sort key is article (not tag)

# every list of A, B and C is sorted in place by the article,
# i.e. by item 2 of each (tags0,tags1,article) entry
by_article = lambda wh: wh[2]
for table in (A, B, C):
    for lines in table.values():
        lines.sort(key=by_article)

# lists are ready; now start generating the output

sys.stdout = open("all.out","w")        # redirect "print" to the outfile

print headings[0]
print
print headings[1]
print

# format and print selected core topics

for (z,text) in corenames:
    print "==="+text+" ("+z+")==="
    print
    flst(B[z],B1[z],B2[z])
    print

# format and print other core topics

print headings[2]
print

for (x,text1) in rvnumber:
    print "==="+text1+" ("+x+":)==="
    print
    for (y,text2) in rvtype:
        if A[x+y] != []:
            print "===="+text2+" ("+x+":"+y+")===="
            print
            flst(A[x+y],[],[])
            print

# format and print non-core topics

print headings[3]
print

for (z,text) in aroundnames:
    print "==="+text+" ("+z+")==="
    print
    flst(C[z],[],[])
    print


print headings[4]
print "{{Top}}"
print '"Core": %i (%i)<br>' % ( count_sel_articles + count_other_articles, count_sel_links + count_other_links )
print '"Around": %i (%i)<br>' % ( count_around_articles, count_around_links)
print "{{Mid}}"
print '"Core selected": %i (%i)<br>' % (count_sel_articles,count_sel_links)
print '"Core others": %i (%i)<br>' % ( count_other_articles, count_other_links )
print "{{Bottom}}"
print
print "Here ''k''(''n'') means: ''n'' links to ''k'' articles. (Some articles are linked more than once.)"

########################  the end  ####################################