User:Tsirel/Bot code

######################################################################
#                                                                    #
#          parse.py                                                  #
#                                                                    #
# Main infile "all.in"; its lines: "[[article]] tags".               #
# Tags are space-separated,                                          #
#  each either "(rvnumber:rvtype)"                                   #
#  or a name (corename or aroundname) and maybe suffix "1" or "2".   #
# Parameters from four infiles.                                      #
# Outfile: all.dat (for format.py).                                  #
#                                                                    #
######################################################################

# read parameters

def _load_abbreviations(filename):
    """Return the list of abbreviations stored in parameter file *filename*.

    The file is expected to hold a Python literal: a sequence of
    (abbreviation, full text) pairs.  Only the abbreviations are used here;
    the full texts are for "format.py".

    Raises whatever open()/eval() raise for a missing or malformed file, and
    ValueError (from unpacking) if an item is not a 2-element pair.
    """
    infile = open(filename)
    try:                                # close the file even if read() fails
        text = infile.read()
    finally:
        infile.close()
    # NOTE(review): eval() executes whatever the parameter file contains;
    # run this script on trusted *.dat files only.
    pairs = eval(text)
    return [abbr for (abbr, full) in pairs]

# only abbreviations will be used; full texts are for "format.py"

corenames = _load_abbreviations("corenames.dat")
aroundnames = _load_abbreviations("aroundnames.dat")
rvnumber = _load_abbreviations("rvnumber.dat")
rvtype = _load_abbreviations("rvtype.dat")

# distinction of "core" and "around" is for "format.py";
# here both kinds of name are accepted alike

names = corenames + aroundnames

# parameters are ready; the main loop follows

biglist = []                            # parsed lines, as (tags0, tags1, article)
notags = []                             # articles whose line has no tags (if any)
everything = {}                         # articles seen so far (duplication check)

# Main loop over the lines of the main input file "all.in".
# Each non-empty line must look like "[[article]] tags".  Recoverable syntax
# errors are reported on stdout and skipped; an unknown character inside a
# "(A:B)" tag aborts the script with ValueError after printing the line.
infile = open("all.in")
try:
    for line in infile:
        line = line.strip()
        if line == "":
            continue                    # ignore empty lines (if any)
        close = line.find("]]")         # -1: no "]]"; 2: empty "[[]]"
        if not line.startswith("[[") or close <= 2:
            print("BAD LINE: " + line)  # syntax error in the main input file
            continue
        article = line[2:close]         # extract article from "[[article]] tags"
        article = article[0].upper() + article[1:]     # article -> Article
        if article in everything:       # duplication? (reported, still processed)
            print("AGAIN " + article)
        everything[article] = 0         # the 0 is of no use; only the key matters
        rest = line[close + 2:].split() # list of tags
        if rest == []:                  # no tags: report later, do not process
            notags.append(article)
            continue
        tags0 = []                      # for "(A:B)" tags converted to (A,B)
        tags1 = []                      # for other tags
        for tag in rest:                # loop over all tags of the given line
            if tag.startswith("("):     # "(A:B)" tag?
                maintag = tag[1:]
                if not maintag.endswith(")"):
                    # syntax error; NOTE(review): the remaining tags of this
                    # line are dropped, but the tags parsed so far are still
                    # stored below -- confirm this is intended
                    print("!!! " + maintag)
                    break
                maintag = maintag[:-1]
                colon = maintag.find(":")
                if colon < 0:           # no ":" separator at all
                    print("?? " + tag + " " + line)
                    raise ValueError("no ':' in tag " + tag)
                maintag1 = maintag[:colon]         # A from "(A:B)"
                maintag2 = maintag[colon + 1:]     # B from "(A:B)"
                for x in maintag1:      # every character of A must be a known rvnumber
                    if x not in rvnumber:
                        print("?? " + x + " " + line)
                        raise ValueError("oops-1")      # syntax error
                for x in maintag2:      # every character of B must be a known rvtype
                    if x not in rvtype:
                        print("?? " + x + " " + line)
                        raise ValueError("oops-2")      # syntax error
                tags0.append((maintag1, maintag2))      # store (A,B)
            else:                       # other tag, not "(A:B)"
                # 3 chars, and maybe 1 char suffix
                assert len(tag) == 3 or len(tag) == 4, "bad tag length: " + tag
                if tag[0:3] not in names:
                    print("BAD TAG: " + tag)
                    continue            # syntax error: skip this tag only
                if len(tag) == 4:       # suffix syntax check
                    assert tag[3] in ["1", "2"], "bad tag suffix: " + tag
                tags1.append(tag)       # store the other tag
        biglist.append((tags0, tags1, article))         # store the parsed line
finally:
    infile.close()

# the main loop is finished

# Write the main out file "all.dat" (for format.py): the repr of biglist
# followed by a newline.  The file is closed explicitly so the data is
# flushed to disk regardless of when the file object is garbage-collected.
outfile = open("all.dat", "w")
try:
    outfile.write(repr(biglist) + "\n")
finally:
    outfile.close()

if notags:                              # report lines with no tags (if any)
    print("*** notags ***")
    print(notags)

########################  the end  ###################################