User:Tsirel/Bot code
import sys
import copy
import codecs  # for debug

# Make the local pywikipedia directory importable.  This has to happen before
# "import wikipedia" below (presumably that directory is where it lives).
sys.path.append("/home/boris/Wiki/Bot/pywikipedia")
import wikipedia

# Download the wiki page that holds all parameter "files" and cut it into
# lines; parse_param() consumes these lines from the front, one section
# per call.
mysite = wikipedia.getSite()
params_page = wikipedia.Page(mysite, 'User:Tsirel/Bot parameters')
params_txt = params_page.get()
params_txt = params_txt.splitlines()

# The page must open with this exact heading.  The line is removed and checked
# in an ordinary "if": when the pop() was an operand of "assert", running
# under "python -O" dropped the removal together with the check.  An empty
# page is reported the same way instead of failing with an IndexError.
if not params_txt or params_txt.pop(0) != "== files of parameters ==":
    raise AssertionError("bad parameters")
def parse_param (name):
    """Read the next parameter section from the global line list params_txt.

    The section is expected to look like this, line by line:

        === <name>.dat ===
        (empty line)
        (opening delimiter)
        ...lines that together form one Python expression...
        (closing delimiter)
        (empty line; may be absent after the very last section)

    All of these lines are removed from the front of params_txt.  The
    expression lines are concatenated without any separator and evaluated.

    name -- section name without the ".dat" suffix.
    Returns the value of the expression.
    Raises AssertionError("bad parameter <name>") if the layout is wrong,
    including a section whose closing delimiter never comes.
    """
    # NOTE(review): the two delimiter literals were unreadable in the reviewed
    # copy of this file (the markup inside the quotes had been swallowed);
    # "<pre>" / "</pre>" is the presumed original -- confirm against the
    # parameters page before relying on it.
    opening = "<pre>"
    closing = "</pre>"
    complaint = "bad parameter " + name

    def take (expected):
        # Remove the first remaining line and insist that it equals `expected`.
        # Done with "if", not "assert": an assert would be dropped by
        # "python -O" together with the pop() it contains.
        if not params_txt or params_txt.pop(0) != expected:
            raise AssertionError(complaint)

    take("=== " + name + ".dat ===")
    take("")
    take(opening)

    chunks = []
    while params_txt and params_txt[0] != closing:
        chunks.append(params_txt.pop(0))
    take(closing)                       # fails cleanly if the list ran out

    if params_txt:                      # nothing follows the last section
        take("")

    # SECURITY(review): eval() executes whatever text is stored on the
    # parameters page.  If the values are plain literals, ast.literal_eval
    # would be a safer drop-in; left unchanged here, to be decided by the
    # owner of the page format.
    return eval("".join(chunks))
# Read the five parameter sections.  parse_param() consumes the parameters
# page front to back, so the calls must come in the order in which the
# sections appear on that page.
corenames = parse_param ("corenames")
aroundnames = parse_param ("aroundnames")
rvnumber = parse_param ("rvnumber")
rvtype = parse_param ("rvtype")
headings = parse_param ("headings")

# Download the tagged list of articles and cut it into lines.
source_page = wikipedia.Page(mysite, 'Talk:Catalog of articles in probability theory/Source')
source_txt = source_page.get()          # read from Wikipedia
source_txt = source_txt.splitlines()

# The list is wrapped in one opening and one closing delimiter line; both are
# removed here so that parse() sees article lines only.
# NOTE(review): the two delimiter literals were unreadable in the reviewed
# copy of this file; "<pre>" / "</pre>" is the presumed original -- confirm.
# The checks are plain "if" statements because an "assert" containing the
# pop() would be skipped, pop included, under "python -O".
if not source_txt or source_txt.pop(0) != "<pre>":
    raise AssertionError("bad source")
if not source_txt or source_txt.pop() != "</pre>":
    raise AssertionError("bad source")
"parse" subroutine
######################################################################
#                                                                    #
#   parse                                                            #
#                                                                    #
#   Input lines have the form "[[article]] tags".                    #
#   Tags are space-separated,                                        #
#   each either "(rvnumber:rvtype)"                                  #
#   or a name (corename or aroundname) and maybe suffix "1" or "2".  #
#                                                                    #
######################################################################
def parse (source_txt, corenames, aroundnames, rvnumber, rvtype):
    """Turn the tagged article lines into a list of parsed records.

    source_txt  -- iterable of lines "[[article]] tags"; blank lines are
                   ignored, lines without any tag are skipped.
    corenames, aroundnames, rvnumber, rvtype
                -- lists of (abbreviation, full text) pairs.  Only the
                   abbreviations are used here.

    Returns a list of tuples (tags0, tags1, article), one per tagged line:
      tags0   -- list of (A, B) pairs, one per "(A:B)" tag; every character
                 of A must be an rvnumber abbreviation and every character
                 of B an rvtype abbreviation;
      tags1   -- list of the remaining tags, suffix included;
      article -- the article title with its first letter upper-cased.

    Raises AssertionError with a message naming the offending line or tag
    when the input is malformed.
    """
    # only abbreviations are used; the full texts are for "format"
    number_codes = set([x for (x, _) in rvnumber])
    type_codes = set([x for (x, _) in rvtype])
    # the distinction between "core" and "around" matters to "format" only
    names = set([x for (x, _) in corenames] + [x for (x, _) in aroundnames])

    biglist = []                        # the parsed lines
    for raw in source_txt:
        line = raw.strip()
        if not line:
            continue                    # ignore empty lines (if any)

        # split "[[article]] tags" into the title and the list of tags
        assert line.startswith("[["), "BAD LINE: "+line
        close = line.find("]]")
        assert close > 2, "BAD LINE: "+line     # no "]]", or empty "[[]]"
        article = line[2:close]
        article = article[0].upper()+article[1:]    # article -> Article
        rest = line[close+2:].split()
        if not rest:
            continue                    # no tags: nothing to file it under

        tags0 = []                      # "(A:B)" tags converted to (A,B)
        tags1 = []                      # all other tags
        for tag in rest:
            if tag.startswith("("):     # "(A:B)" tag
                maintag = tag[1:]
                assert maintag.endswith(")"), "!!! "+maintag
                (first, colon, second) = maintag[:-1].partition(":")
                assert colon, "!!! "+maintag    # no ":" inside the brackets
                # The messages below used to mention an undefined name
                # ("whole_line"), so a bad tag produced a NameError instead
                # of this report; the current line is what was meant.
                for x in first:
                    assert x in number_codes, "?? "+x+" "+line
                for x in second:
                    assert x in type_codes, "?? "+x+" "+line
                tags0.append((first, second))
            else:                       # a name, maybe with a suffix
                assert len(tag)==3 or len(tag)==4, "bad tag"
                assert tag[0:3] in names, "BAD TAG: "+tag
                if len(tag)==4:
                    assert tag[3] in ["1","2"], "bad tag"
                tags1.append(tag)
        biglist.append( (tags0,tags1,article) )
    return biglist
######################## end of parse ###################################
"format" subroutine
######################################################################
#                                                                    #
#   format                                                           #
#                                                                    #
#   A tag not of the form "(A:B)" directs a line to the first        #
#   (core:selected) or the third (around the core) section,          #
#   according to corenames and aroundnames.                          #
#   A tag of the form "(A:B)" directs a line to the second section   #
#   (core:other) unless it is directed to the first section.         #
#   In the first section a line is special if the tag has a suffix   #
#   "1" or "2".                                                      #
#                                                                    #
######################################################################
def format (biglist):
    """Lay out the parsed lines as wiki text, appended to global `formatted`.

    biglist -- list of (tags0, tags1, article) tuples as built by parse().

    Besides the argument, the function reads the module-level lists
    corenames, aroundnames, rvnumber, rvtype (pairs: abbreviation, full
    text) and headings (five strings).  The result is appended to the
    module-level string `formatted`, which must already exist.
    Returns None.

    Raises AssertionError("bad special lines") when a core topic has more
    than one line with suffix "1", or a suffix "2" line without a
    suffix "1" line.

    NOTE: the name shadows the builtin format(); it is kept because the
    rest of the script calls it under this name.
    """
    global formatted

    out = []                            # output pieces, joined once at the end

    # pr: add one line of text to the output
    def pr (text):
        out.append(text+"\n")

    # frmt: format a line
    # whole is (tags0,tags1,article), special is True or False
    # an article of the form "X(Y)" turns into "X(Y)|X",
    # but an article of the form "X!(Y)" turns into "X(Y)";
    # only a "(" at position 5 or later is looked at
    # returns "[[article]] / tags<br>" if special is False,
    # or "'''[[article]]''' / tags<br>" if special is True
    def frmt (whole, special):
        (tags0, tags1, article) = whole
        k = article.find("(", 5)
        if k > 0:
            if article[k-1] != "!":
                article += "|"+article[0:k].strip()     # "X(Y)|X"
            else:
                article = article[:k-1]+article[k:]     # "X(Y)"
        if special:
            res = "'''[["+article+"]]''' / "
        else:
            res = "[["+article+"]] / "
        # first the plain tags without their suffix, then the "(A:B)" tags
        res += "".join([z[0:3]+" " for z in tags1])
        res += "".join(["("+a+":"+b+") " for (a, b) in tags0])
        return res[:-1]+"<br>"          # drop the last character (a space)

    # flst: format and print a list of lines by frmt
    # lst: list of usual lines
    # lst1, lst2: lists of special lines;
    #   (len(lst1),len(lst2)) must be (0,0), (1,0) or (1,1)
    # two columns are produced unless the list is short
    def flst (lst, lst1, lst2):
        assert len(lst2) <= len(lst1), "bad special lines"
        assert len(lst1) <= 1, "bad special lines"
        l = len(lst)
        if l < 4 and not lst2:          # one column
            if lst1:
                pr(frmt(lst1[0], True))
            for whole in lst:
                pr(frmt(whole, False))
            return
        k = (l+1-len(lst1)+len(lst2))//2    # usual lines of the left column
        pr("{{Top}}")                   # start the left column
        if lst1:
            pr(frmt(lst1[0], True))
        for whole in lst[:k]:
            pr(frmt(whole, False))
        pr("{{Mid}}")                   # start the right column
        if lst2:
            pr(frmt(lst2[0], True))
        for whole in lst[k:]:
            pr(frmt(whole, False))
        pr("{{Bottom}}")                # finish the right column

    # A[x+y] collects all lines containing "(x:y)"
    A = {}
    for (x, _) in rvnumber:
        for (y, _) in rvtype:
            A[x+y] = []
    # B[z]  collects all usual lines containing core tag z
    # B1[z] collects all special lines containing z with suffix 1
    # B2[z] collects all special lines containing z with suffix 2
    B = dict((z, []) for (z, _) in corenames)
    B1 = dict((z, []) for z in B)
    B2 = dict((z, []) for z in B)
    # C[z] collects all lines containing non-core tag z
    C = dict((z, []) for (z, _) in aroundnames)

    count_sel_links = 0
    count_sel_articles = 0
    count_other_links = 0
    count_other_articles = 0
    count_around_links = 0
    count_around_articles = 0

    # distribute the lines (according to tags) to the lists
    for whole in biglist:
        (tags0, tags1, article) = whole
        tags11 = [z for z in tags1 if z[0:3] in B]          # core-selected
        tags12 = [z for z in tags1 if z[0:3] not in B]      # non-core

        if tags11:
            count_sel_articles += 1
            count_sel_links += len(tags11)
        elif tags0:                     # core-other, but not core-selected
            count_other_articles += 1
        if tags12:
            count_around_articles += 1
            count_around_links += len(tags12)

        for z in tags11:
            if len(z) == 3:             # no suffix: usual line
                B[z].append(whole)
            elif z[3] == "1":
                B1[z[0:3]].append(whole)
            else:                       # suffix "2"
                B2[z[0:3]].append(whole)

        if not tags11:                  # "(A:B)" tags count only then
            for (first, second) in tags0:
                for x in first:
                    for y in second:
                        A[x+y].append(whole)
                        count_other_links += 1

        for z in tags12:
            # The key is the tag without its suffix: parse() accepts
            # "1"/"2" on any name, and indexing C with the full four
            # character tag raised KeyError for such "around" tags.
            C[z[0:3]].append(whole)

    # sort each list alphabetically; the sort key is the article (not a tag)
    for table in (A, B, C):
        for z in table:
            table[z].sort(key=(lambda wh: wh[2]))

    # lists are ready; now generate the output
    pr(headings[0])
    pr("")
    pr(headings[1])
    pr("")
    # selected core topics
    for (z, text) in corenames:
        pr("==="+text+" ("+z+")===")
        pr("")
        flst(B[z], B1[z], B2[z])
        pr("")
    # other core topics
    pr(headings[2])
    pr("")
    for (x, text1) in rvnumber:
        pr("==="+text1+" ("+x+":)===")
        pr("")
        for (y, text2) in rvtype:
            if A[x+y]:
                pr("===="+text2+" ("+x+":"+y+")====")
                pr("")
                flst(A[x+y], [], [])
                pr("")
    # non-core topics
    pr(headings[3])
    pr("")
    for (z, text) in aroundnames:
        pr("==="+text+" ("+z+")===")
        pr("")
        flst(C[z], [], [])
        pr("")
    # statistics
    pr(headings[4])
    pr("{{Top}}")
    pr( '"Core": %i (%i)<br>' % ( count_sel_articles + count_other_articles, count_sel_links + count_other_links ) )
    pr( '"Around": %i (%i)<br>' % ( count_around_articles, count_around_links) )
    pr("{{Mid}}")
    pr( '"Core selected": %i (%i)<br>' % (count_sel_articles,count_sel_links) )
    pr( '"Core others": %i (%i)<br>' % ( count_other_articles, count_other_links ) )
    pr("{{Bottom}}")
    pr("")
    pr( "Here ''k''(''n'') means: ''n'' links to ''k'' articles. (Some articles are linked more than once.)" )

    formatted += "".join(out)
    return
######################## end of format ####################################
executing "parse" and "format", writing the result
###################### now execute them #####################################

# Turn the downloaded source lines into (tags0, tags1, article) records.
parsed = parse(source_txt, corenames, aroundnames, rvnumber, rvtype)

# format() adds its result to the global string "formatted",
# so that string has to exist (and be empty) before the call.
formatted = ""
format(parsed)

### write to Wikipedia ###
out_page = wikipedia.Page(mysite, 'User:Tsirel/Catalog')
out_page.put(formatted)

########################### the end #########################################