User:WatchlistBot/source.py
import catlib
import wikipedia
import codecs

# the maximum number of articles per page
MAX = 9000

# should we write to file or directly to wikipedia?
DBG = False

# Define some namespaces
ARTICLE = 0
ARTICLE_TALK = 1
USER = 2
USER_TALK = 3
WIKIPEDIA = 4
WIKIPEDIA_TALK = 5
IMAGE = 6
IMAGE_TALK = 7
TEMPLATE = 10
TEMPLATE_TALK = 11
CATEGORY = 14
CATEGORY_TALK = 15
PORTAL = 100
PORTAL_TALK = 101

# some of the output strings
# this one is for the top of all bot-created pages
BOT_WARN = "<div class=\"notice\" " + \
           "style=\"background:#ffe1a7; border:1px solid #AAA; " + \
           "padding:0.2em; margin:0.5em auto;\"> " + \
           "[[Image:Stop_hand.svg|left|20px]] This page is automatically " + \
           "recreated from time to time. Accordingly, any changes you " + \
           "make here will be overwritten. See below for details.</div>\n\n"

# this text is used to start the first page, if we're splitting (use SPLIT_INTRO
# for the main page, SPLIT_INTRO_NEXT for next pages)
SPLIT_INTRO1 = "There are too many articles (more than " + str(MAX) + ") in this project " + \
               "to list them all on one page. This page and the ones linked "
SPLIT_INTRO2 = "contain "
SPLIT_INTRO = SPLIT_INTRO1 + "below " + SPLIT_INTRO2
SPLIT_INTRO_NEXT = SPLIT_INTRO1 + "from the main page " + SPLIT_INTRO2

# this text starts the first page, if we're not splitting
ONE_PAGE_INTRO = "This page contains "

# this text is the rest of the intro, in either case (use END_INTRO1 + tagText + END_INTRO2
# + template + END_INTRO3 + pageName + END_INTRO4 + pageName + END_INTRO5)
END_INTRO1 = "links to all articles, categories, images, portal pages, " + \
             "templates, and project pages "
END_INTRO2 = "with {{tl|"
END_INTRO3 = "}} on their talk page. It was " + \
             "generated by [[User:WatchlistBot|" + \
             "WatchlistBot]]. Its purpose is to be able to track " + \
             "the project history using ''[[Special:Recentchangeslinked/" + \
             "Wikipedia:WikiProject "
END_INTRO4 = "|related changes]]'' or ''[http://tools.wikimedia.de/~interiot/" + \
             "cgi-bin/offtoolserver/RC_firstonly?url=http%3A%2F%2Fen.wikipedia.org" + \
             "%2Fw%2Findex.php%3Ftitle%3DSpecial%3ARecentchangeslinked%26target" + \
             "%3DWikipedia:WikiProject_"
END_INTRO5 = "%26hideminor%3D0%26days%3D7%26limit%3D500 related watchlist]'' which " + \
             "only shows the last change for each article.\n\n"

class Watchlist:
    # the name of the template used to tag articles, e.g., "Numismaticnotice"
    template = ""
    # the name of the project, e.g., "Numismatics"
    project = ""
    # the location of the article list (output) -- without prefix, so for
    # "Wikipedia:WikiProject Numismatics/Articles", use "Articles"
    articleOut = ""
    # a list for all articles
    articles = []
    # a list for all article talk pages
    articlesTalk = []
    # a list for all Wikipedia pages
    wikis = []
    # a list for all Wikipedia talk pages
    wikisTalk = []
    # a list for all templates
    templates = []
    # a list for all template talk pages
    templatesTalk = []
    # a list for all categories
    categories = []
    # a list for all category talk pages
    categoriesTalk = []
    # a list for all images
    images = []
    # a list for all image talk pages
    imagesTalk = []
    # a list for all portals
    portals = []
    # a list for all portal talk pages
    portalsTalk = []
    # certain pages need to be included explicitly (for example, if they share
    # a talk page)
    includePages = []

    def __init__(self, template, project, articleOut, includePages = []):
        self.template = template
        self.project = project
        self.articleOut = articleOut
        self.articles = []
        self.articlesTalk = []
        self.wikis = []
        self.wikisTalk = []
        self.templates = []
        self.templatesTalk = []
        self.categories = []
        self.categoriesTalk = []
        self.images = []
        self.imagesTalk = []
        self.portals = []
        self.portalsTalk = []
        self.includePages = includePages

    def processPageName (self, name):
        """ Process one page name, updating the lists as appropriate.
        """
        result = name.split(":")
        if (len(result) == 1):
            self.articles.append(result[0])
            self.articlesTalk.append("Talk:"+result[0])
        elif (result[0] == "Talk"):
            self.articles.append(result[1])
            self.articlesTalk.append("Talk:"+result[1])
        elif (result[0] == "Wikipedia talk" or result[0] == "Wikipedia"):
            self.wikis.append("Wikipedia:"+result[1])
            self.wikisTalk.append("Wikipedia talk:"+result[1])
        elif (result[0] == "Template talk" or result[0] == "Template"):
            self.templates.append("Template:"+result[1])
            self.templatesTalk.append("Template talk:"+result[1])
        elif (result[0] == "Category talk" or result[0] == "Category"):
            self.categories.append(":Category:"+result[1])
            self.categoriesTalk.append("Category talk:"+result[1])
        elif (result[0] == "Image talk" or result[0] == "Image"):
            self.images.append(":Image:"+result[1])
            self.imagesTalk.append("Image talk:"+result[1])
        elif (result[0] == "Portal talk" or result[0] == "Portal"):
            self.portals.append("Portal:"+result[1])
            self.portalsTalk.append("Portal talk:"+result[1])

    def scanCat (self, catName, recurse):
        cat = catlib.Category(wikipedia.getSite(), catName)
        pages = cat.articles(recurse)
        for page in pages:
            self.processPageName(page.title())
        self.categories.append(":Category:"+catName)
        self.categoriesTalk.append("Category talk:"+catName)

    def removeDuplicatesAndSort (self):
        self.articles = dict.fromkeys(self.articles).keys()
        self.articles.sort()
        self.articlesTalk = dict.fromkeys(self.articlesTalk).keys()
        self.articlesTalk.sort()
        self.wikis = dict.fromkeys(self.wikis).keys()
        self.wikis.sort()
        self.wikisTalk = dict.fromkeys(self.wikisTalk).keys()
        self.wikisTalk.sort()
        self.templates = dict.fromkeys(self.templates).keys()
        self.templates.sort()
        self.templatesTalk = dict.fromkeys(self.templatesTalk).keys()
        self.templatesTalk.sort()
        self.categories = dict.fromkeys(self.categories).keys()
        self.categories.sort()
        self.categoriesTalk = dict.fromkeys(self.categoriesTalk).keys()
        self.categoriesTalk.sort()
        self.images = dict.fromkeys(self.images).keys()
        self.images.sort()
        self.imagesTalk = dict.fromkeys(self.imagesTalk).keys()
        self.imagesTalk.sort()
        self.portals = dict.fromkeys(self.portals).keys()
        self.portals.sort()
        self.portalsTalk = dict.fromkeys(self.portalsTalk).keys()
        self.portalsTalk.sort()

    def getTaggedPages (self):
        """ Get the pages that include templateName
            Add the articles to the appropriate lists
        """
        page = wikipedia.Page(wikipedia.getSite(), "Template:" + self.template)
        refs = page.getReferences(onlyTemplateInclusion=True)
        for page in refs:
            self.processPageName(page.title())
        # include the explicitly named pages
        for page in self.includePages:
            self.processPageName(page)
        # remove duplicates and sort the lists
        self.removeDuplicatesAndSort()
        # organize the categories hierarchically (actually, no -- this takes too
        # much time)
        #self.catText = organizeCategories()

    def getPagesFromTaggedCategories (self):
        page = wikipedia.Page(wikipedia.getSite(), "Template:" + self.template)
        refs = page.getReferences(onlyTemplateInclusion=True)
        articles = []
        for page in refs:
            result = page.title().split(":")
            if (result[0] == "Category talk"):
                # we expect this
                findArticlesInCategory("Category:" + result[1], articles)
                # add the category to the list as well
                articles.append(page.title())
        articles = dict.fromkeys(articles).keys()
        articles.sort()
        for page in articles:
            self.processPageName(page)
        # remove duplicates and sort the lists
        self.removeDuplicatesAndSort()
        # organize the categories hierarchically (actually, no -- this takes too
        # much time)
        #self.catText = organizeCategories()

    def writeList (self, taggedPagesFlag):
        """ write the output to the specified page on Wikipedia
            taggedPagesFlag tells whether we're looking for tagged pages (true)
            or tagged categories (false)
        """
        tagText = ""
        if (not taggedPagesFlag):
            tagText = "in categories "
        # the output page, without spaces
        wikipedia.output(u"Preparing output")
        output = self.project.replace(" ", "_") + "/" + \
                 self.articleOut.replace(" ", "_")
        totalArticles = len(self.articles) + len(self.wikis) + \
                        len(self.templates) + len(self.categories) + \
                        len(self.images) + len(self.portals)
        mainText = BOT_WARN
        # double the number of articles because of talk pages
        splitting = (totalArticles*2 > MAX)
        if (splitting):
            mainText += SPLIT_INTRO
        else:
            mainText += ONE_PAGE_INTRO
        mainText += END_INTRO1 + tagText + END_INTRO2 + self.template + END_INTRO3 + \
                    output + END_INTRO4 + output + END_INTRO5
        mainText += "==Regular content (count: " + str(totalArticles) + ")==\n"
        # the number of articles listed on this page
        count = 0
        # the page number
        pageNo = 1
        # the text for this subpage (if no subpages, will just be on the main
        # page)
        mainText += "===Articles (count: " + str(len(self.articles)) + ")===\n"
        prevChar = firstChar = "Z"   # initialize to something late in the alphabet
        subText = ""
        # make sure the first batch of articles goes to the main page
        firstBatch = True
        for s in self.articles:
            # if the first letter is a new one, put a heading
            if (s[0] != prevChar):
                subText += "====" + s[0] + "====\n"
                prevChar = s[0]
                if (count == 0):
                    firstChar = prevChar
            # put the article name
            subText += "*[[" + s + "]]\n"
            # update the article count
            count = count+1
            # if we've put all the articles we can on this page
            if (count > MAX):
                count = 0
                if (firstBatch):
                    firstBatch = False
                    mainText += subText
                else:
                    mainText += "====[[/Page" + str(pageNo) + "|" + \
                                firstChar + "-" + prevChar + "]]====\n"
                    subText = subText.replace("<range>", firstChar + " through " + \
                                              prevChar)
                    self.writeProjPage(self.articleOut + "/Page" + str(pageNo), subText)
                    pageNo = pageNo+1
                firstChar = prevChar
                subText = "===Articles <range>===\n" + \
                          "====" + prevChar + "====\n"
        # if we have too many articles, and we've already started the
        # second (or more) page
        if (splitting and not firstBatch):
            mainText += "====[[/Page" + str(pageNo) + "|" + \
                        firstChar + " through " + prevChar + "]]====\n"
            subText = subText.replace("<range>", firstChar + " through " + prevChar)
            self.writeProjPage(self.articleOut + "/Page" + str(pageNo), subText)
            pageNo = pageNo+1
        else:
            # we have only one page, or this is the first batch
            mainText += subText
        mainText += "===Wikipedia (count: " + str(len(self.wikis)) + ")===\n"
        # if we need to put these articles on the next page (because we've already started
        # the second page or we can't fit all the wikipedia articles on the main page)
        wikisOnNext = not firstBatch or count + len(self.wikis) > MAX
        if (wikisOnNext):
            subText = BOT_WARN + SPLIT_INTRO_NEXT + \
                      END_INTRO1 + tagText + END_INTRO2 + self.template + END_INTRO3 + \
                      output + "/Page" + str(pageNo) + END_INTRO4 + output + "/" + str(pageNo) + \
                      END_INTRO5 + \
                      "===Wikipedia===\n"
            mainText += "*[[/Page" + str(pageNo) + "#Wikipedia|Wikipedia]]\n"
        else:
            subText = ""
            count += len(self.wikis)
        for s in self.wikis:
            subText += "*[[" + s + "]]\n"
        # if the wiki pages are going on the main page, put them there
        if (not wikisOnNext):
            mainText += subText
            subText = ""
        mainText += "===Templates (count: " + str(len(self.templates)) + ")===\n"
        # if we need to put these articles on the next page (because wikis are already
        # on the next page, or we can't fit all the template articles on the main page)
        templatesOnNext = wikisOnNext or count + len(self.templates) > MAX
        if (templatesOnNext):
            # if we have not already started the next page
            if (not wikisOnNext):
                subText = BOT_WARN + SPLIT_INTRO_NEXT + \
                          END_INTRO1 + tagText + END_INTRO2 + self.template + END_INTRO3 + \
                          output + "/Page" + str(pageNo) + END_INTRO4 + output + "/" + \
                          str(pageNo) + END_INTRO5
            subText += "===Templates===\n"
            mainText += "*[[/Page" + str(pageNo) + "#Templates|Templates]]\n"
        else:
            count += len(self.templates)
        for s in self.templates:
            subText += "*[[" + s + "]]\n"
        # if the templates are going on the main page, put them there
        if (not templatesOnNext):
            mainText += subText
            subText = ""
        mainText += "===Portals (count: " + str(len(self.portals)) + ")===\n"
        # if we need to put these articles on the next page (because templates are already
        # on the next page, or we can't fit all the portals on the main page)
        portalsOnNext = templatesOnNext or count + len(self.portals) > MAX
        if (portalsOnNext):
            # if we have not already started the next page
            if (not templatesOnNext):
                subText = BOT_WARN + SPLIT_INTRO_NEXT + \
                          END_INTRO1 + tagText + END_INTRO2 + self.template + END_INTRO3 + \
                          output + "/Page" + str(pageNo) + END_INTRO4 + output + "/" + \
                          str(pageNo) + END_INTRO5
            subText += "===Portals===\n"
            mainText += "*[[/Page" + str(pageNo) + "#Portals|Portals]]\n"
        else:
            count += len(self.portals)
        for s in self.portals:
            subText += "*[[" + s + "]]\n"
        # if the portals are going on the main page, put them there
        if (not portalsOnNext):
            mainText += subText
            subText = ""
        mainText += "===Categories (count: " + str(len(self.categories)) + ")===\n"
        # if we need to put these articles on the next page (because portals are already
        # on the next page, or we can't fit all the categories on the main page)
        categoriesOnNext = portalsOnNext or count + len(self.categories) > MAX
        if (categoriesOnNext):
            # if we have not already started the next page
            if (not portalsOnNext):
                subText = BOT_WARN + SPLIT_INTRO_NEXT + \
                          END_INTRO1 + tagText + END_INTRO2 + self.template + END_INTRO3 + \
                          output + "/Page" + str(pageNo) + END_INTRO4 + output + "/" + \
                          str(pageNo) + END_INTRO5
            subText += "===Categories===\n"
            mainText += "*[[/Page" + str(pageNo) + "#Categories|Categories]]\n"
        else:
            count += len(self.categories)
        for s in self.categories:
            subText += "*[[" + s + "]]\n"
        # if the categories are going on the main page, put them there
        if (not categoriesOnNext):
            mainText += subText
            subText = ""
        mainText += "===Images (count: " + str(len(self.images)) + ")===\n"
        # if we need to put these articles on the next page (because categories are already
        # on the next page, or we can't fit all the images on the main page)
        imagesOnNext = categoriesOnNext or count + len(self.images) > MAX
        if (imagesOnNext):
            # if we have not already started the next page
            if (not categoriesOnNext):
                subText = BOT_WARN + SPLIT_INTRO_NEXT + \
                          END_INTRO1 + tagText + END_INTRO2 + self.template + END_INTRO3 + \
                          output + "/Page" + str(pageNo) + END_INTRO4 + output + "/" + \
                          str(pageNo) + END_INTRO5
            subText += "===Images===\n"
            mainText += "*[[/Page" + str(pageNo) + "#Images|Images]]\n"
        else:
            count += len(self.images)
        for s in self.images:
            subText += "*[[" + s + "]]\n"
        # if the images are going on the main page, put them there
        if (not imagesOnNext):
            mainText += subText
            subText = ""
        mainText += "==Talk pages==\n"
        mainText += "===Articles===\n"
        prevChar = firstChar = "Z"   # initialize to anything but A
        if (splitting):
            subText = "This article contains links to some talk pages " + tagText + \
                      "with {{tl|" + self.template + "}} " + \
                      "on their talk page. It was generated by [[User:WatchlistBot|" + \
                      "WatchlistBot]]. Its purpose is to be able to track " + \
                      "the project history using ''[[Special:Recentchangeslinked/" + \
                      "Wikipedia:WikiProject " + output + \
                      "/Page" + str(pageNo) + "|related changes]]'' or ''[http://tools.wikimedia.de/~interiot/" + \
                      "cgi-bin/offtoolserver/RC_firstonly?url=http%3A%2F%2Fen.wikipedia.org" + \
                      "%2Fw%2Findex.php%3Ftitle%3DSpecial%3ARecentchangeslinked%26target" + \
                      "%3DWikipedia:WikiProject_" + output + \
                      "/Page" + str(pageNo) + "%26hideminor%3D0%26days%3D7%26limit%3D500 related watchlist]'' which " + \
                      "only shows the last change for each article.\n" + \
                      "\n" + \
                      "===Articles <range>===\n"
        else:
            subText = ""
        count = 0
        for s in self.articlesTalk:
            if (count == 0):
                firstChar = s.split(":")[1][0]
            subText += "*[[" + s + "]]\n"
            count = count+1
            if (count > MAX):
                count = 0
                endChar = s.split(":")[1][0]
                mainText += "*[[/Page" + str(pageNo) + "|" + \
                            firstChar + "-" + endChar + "]]\n"
                subText = subText.replace("<range>", firstChar + " through " + \
                                          endChar)
                self.writeProjPage(self.articleOut + "/Page" + str(pageNo), subText)
                pageNo = pageNo+1
                firstChar = endChar
                subText = "===Articles <range>===\n"
        if (splitting):
            endChar = s.split(":")[1][0]
            mainText += "*[[/Page" + str(pageNo) + "|" + \
                        firstChar + " through " + endChar + "]]\n"
            subText = subText.replace("<range>", firstChar + " through " + endChar)
            self.writeProjPage(self.articleOut + "/Page" + str(pageNo), subText)
            pageNo = pageNo+1
        else:
            mainText += subText
        mainText += "===Wikipedia===\n"
        if (splitting):
            subText = "This article contains links to some talk pages " + tagText + \
                      "with {{tl|" + self.template + "}} " + \
                      "on their talk page. It was generated by [[User:WatchlistBot|" + \
                      "WatchlistBot]]. Its purpose is to be able to track " + \
                      "the project history using ''[[Special:Recentchangeslinked/" + \
                      "Wikipedia:WikiProject " + output + \
                      "/Page" + str(pageNo) + "|related changes]]'' or ''[http://tools.wikimedia.de/~interiot/" + \
                      "cgi-bin/offtoolserver/RC_firstonly?url=http%3A%2F%2Fen.wikipedia.org" + \
                      "%2Fw%2Findex.php%3Ftitle%3DSpecial%3ARecentchangeslinked%26target" + \
                      "%3DWikipedia:WikiProject_" + output + \
                      "/Page" + str(pageNo) + "%26hideminor%3D0%26days%3D7%26limit%3D500 related watchlist]'' which " + \
                      "only shows the last change for each article.\n" + \
                      "\n" + \
                      "===Wikipedia===\n"
            mainText += "*[[/Page" + str(pageNo) + "#Wikipedia|Wikipedia]]\n"
        else:
            subText = ""
        for s in self.wikisTalk:
            subText += "*[[" + s + "]]\n"
        if (not splitting):
            mainText += subText
            subText = ""
        mainText += "===Templates===\n"
        if (splitting):
            subText += "===Templates===\n"
            mainText += "*[[/Page" + str(pageNo) + "#Templates|Templates]]\n"
        for s in self.templatesTalk:
            subText += "*[[" + s + "]]\n"
        if (not splitting):
            mainText += subText
            subText = ""
        mainText += "===Categories===\n"
        if (splitting):
            subText += "===Categories===\n"
            mainText += "*[[/Page" + str(pageNo) + "#Categories|Categories]]\n"
        for s in self.categoriesTalk:
            subText += "*[[" + s + "]]\n"
        if (not splitting):
            mainText += subText
            subText = ""
        mainText += "===Portals===\n"
        if (splitting):
            subText += "===Portals===\n"
            mainText += "*[[/Page" + str(pageNo) + "#Portals|Portals]]\n"
        for s in self.portalsTalk:
            subText += "*[[" + s + "]]\n"
        if (not splitting):
            mainText += subText
            subText = ""
        mainText += "===Images===\n"
        if (splitting):
            subText += "===Images===\n"
            mainText += "*[[/Page" + str(pageNo) + "#Images|Images]]\n"
        for s in self.imagesTalk:
            subText += "*[[" + s + "]]\n"
        if (not splitting):
            mainText += subText
            subText = ""
        if (splitting):
            self.writeProjPage(self.articleOut + "/Page" + str(pageNo), subText)
            pageNo = pageNo+1
        else:
            mainText += subText
        self.writeProjPage(self.articleOut, mainText)

    def writeProjPage (self, pageName, text):
        pageName = "Wikipedia:WikiProject " + self.project + "/" + pageName
        comment = "full update by [[User:WatchlistBot|WatchlistBot]]"
        page = wikipedia.Page(wikipedia.getSite(), pageName)
        writePage(page, text, comment)

def organizeCategories (tag = "", topLevelCat = "Tamil Nadu", project = "Tamil Nadu",
                        pageName = "Categories",
                        category = "Category-Class Tamil Nadu articles"):
    """ organize the categories hierarchically
        write the results to "Wikipedia:WikiProject <project>/<pageName>"
        if tag is given, find all categories which are tagged
        if category is given, find all categories in the specified category
    """
    # get the list of categories
    dummy = []
    catList = []
    if (len(tag) != 0):
        getTagged(tag, catList, dummy)
    else:
        cat = catlib.Category(wikipedia.getSite(), "Category:" + category)
        pages = cat.articles()
        for page in pages:
            # we get the talk page, so convert it to the category page
            catList.append("Category:" + page.titleWithoutNamespace())
    print len(catList)
    text = "This is the category structure for [[Wikipedia:WikiProject " + \
           project + "|" + project + "]]<br>\n"
    cat = catlib.Category(wikipedia.getSite(), "Category:" + topLevelCat)
    text += "[[:Category:"+topLevelCat+"]]<br>\n"
    text = organizeCatsNextLevel(text, cat, "|—", catList)
    page = wikipedia.Page(wikipedia.getSite(),
                          "Wikipedia:WikiProject " + project + "/" + pageName)
    writePage(page, text, "full update by [[User:WatchlistBot|WatchlistBot]]")

def organizeCatsNextLevel (text, cat, substring, catList):
    """ recursively organize the category text
        text is the text so far, add to that
        cat is the catlib.Category of the previous level
        substring is the text to put before each category
        catList is the list of categories to include
        returns the text so far
    """
    subcats = cat.subcategories()
    for subcat in subcats:
        # if this subcategory is included in our project
        if (subcat.title() in catList):
            # if it has not already been listed (to prevent duplication,
            # but more importantly, to prevent infinite loops)
            if (text.find(subcat.title()) == -1):
                text += substring + "[[:" + subcat.title() + "]]<br>\n"
                text = organizeCatsNextLevel(text, subcat, "| "+substring, catList)
            else:
                # it's already been listed; don't recurse in this case,
                # to prevent infinite loops
                text += substring + "[[:" + subcat.title() + \
                        "]] (already included, see above)<br>\n"
    return text

def getExcluded (project):
    """ get the list of pages which should not be tagged even though they're in
        tagged categories
        this can also be used to get excluded categories, if they're listed on
        the project exclusion page
    """
    page = wikipedia.Page(wikipedia.getSite(), "User:WatchlistBot/" + project)
    if (page.exists()):
        text = page.get()
        # find the "----"; the list of articles is below the line
        start = text.find("----\n")
        result = text[start+4:].split("[[")
        pages = []
        for page in result:
            end = page.find("]]")
            if (end != -1):
                pages.append(getTalkVersion(page[:end]))
        return pages
    return []

def getTalkVersion (name):
    """ given a page name, convert it to the associated talk page
    """
    result = name.split(":")
    if (len(result) == 1):    # article
        return "Talk:"+name
    if (len(result) == 3):    # category
        return "Category talk:"+result[2]
    if (result[0].find("Talk") != -1 or result[0].find("talk") != -1):
        return name
    return result[0] + " talk:" + result[1]

def writePage (page, text, comment):
    if (not DBG):
        if (wikipedia.getSite().messages):
            wikipedia.output(u"Exiting -- you have messages")
            return False
        page.put(text, comment, minorEdit=False)
    else:
        pageName = page.title()
        start = pageName.find("/")
        if (start != -1):
            pageName = pageName[start+1:]
        start = pageName.find("/")
        if (start != -1):
            pageName = pageName[start+1:]
        start = pageName.find(":")
        if (start != -1):
            pageName = pageName[start+1:]
##        page = wikipedia.Page(wikipedia.getSite(),
##                              "User:mom2jandk/" + pageName)
##        page.put(text, comment, minorEdit=False)
        wikipedia.output(u"Writing file " + pageName + u".txt")
        f = codecs.open(pageName + ".txt", mode="w", encoding="utf8")
        f.write(text)
        f.close()
    return True

def untagPage (pageName, tag):
    """ remove the tag from the given talk page, if it is there
    """
    page = wikipedia.Page(wikipedia.getSite(), pageName)
    if page.exists():
        if not page.isRedirectPage():
            text = page.get()
            tagStart = text.find("{{"+tag)
            if (tagStart == -1):
                wikipedia.output("Page " + page.title() + " not tagged")
            else:
                # find the end of the tag (add 3 for the }}\n)
                tagEnd = text[tagStart:].find("}}") + tagStart + 3
                text = text[:tagStart] + text[tagEnd:]
                return writePage(page, text, "Removing " + tag)
    return True

def getClass (page):
    """ given a page, get the class tag
    """
    namespace = page.namespace()
    if (namespace == TEMPLATE_TALK):
        return "template"
    if (namespace == IMAGE_TALK):
        return "image"
    if (namespace == CATEGORY_TALK):
        return "category"
    return ""

def tagPage (pageName, tag, params = "", classify = False):
    """ tag the given talk page with the tag
        params is an optional list of parameters for the tag (like class=Stub)
        if classify is true, include class=
    """
    # get the talk page
    page = wikipedia.Page(wikipedia.getSite(), pageName)
    if (classify):
        cl = getClass(page)
        if (cl != ""):
            params += "|class=" + getClass(page)
    if page.exists():
        if not page.isRedirectPage():
            text = page.get()
            return tagIt(page, text, tag+params)
        else:
            wikipedia.output("Page " + page.title() + " is a redirect")
    else:
        # we don't mind if the page doesn't exist yet, just create it
        return tagIt(page, "", tag+params)
    return True

def tagIt (page, text, tag):
    text = "{{" + tag + "}}\n\n" + text
    return writePage(page, text, "Adding " + tag)

def findArticlesInCategory (catName, articles, confirm = False, includeCats = False):
    """ find all the articles in the given category, and return a list
        If confirm is true, check each article with the user
        articles is the list so far
        includeCats indicates whether category talk pages should be included
    """
    # get the category (don't include it, since tagging articles and categories
    # is handled separately)
    cat = catlib.Category(wikipedia.getSite(), catName)
    # get all pages in this category
    pages = cat.articles()
    for page in pages:
        # if confirming, check
        if (confirm):
            response = wikipedia.input(u"Do you want to tag " + page.title() +
                                       u"? (y for yes)")
        if (not confirm or response == "y"):
            # add the appropriate prefix
            prefix = wikipedia.getSite().namespace(page.namespace() + 1) + ":"
            namespace = page.namespace()
            if (namespace == TEMPLATE or namespace == ARTICLE or
                namespace == IMAGE or namespace == PORTAL or
                namespace == WIKIPEDIA):
                articles.append(prefix + page.titleWithoutNamespace())
            elif (namespace == TEMPLATE_TALK or namespace == ARTICLE_TALK or
                  namespace == IMAGE_TALK or namespace == PORTAL_TALK or
                  namespace == WIKIPEDIA_TALK):
                articles.append(page.title())
            elif (namespace == CATEGORY_TALK and includeCats):
                articles.append(page.title())
            elif (namespace == USER or namespace == USER_TALK):
                # ignore these
                pass
            else:
                print "Unexpected namespace on " + page.title() + ": " + str(page.namespace())
    # remove duplicates
    articles = dict.fromkeys(articles).keys()

def updateCategoryList (catList, catName, taggedCats, otherTaggedCats, keywords,
                        excluded = [], questionText = u"Do you want to tag ",
                        confirm = True):
    """ if catList starts with "", it means we're trying to quit, so just return
        starting at catName, make a list, catList, of all subcategories
        ask the user first, and allow the user the choice to recurse through
        subcategories
        taggedCats is the list of categories that are already tagged and can thus
        be skipped
        otherTaggedCats is the list (possibly empty) of categories that are
        tagged with a related tag -- these should be skipped, with no recursion
        keywords are words that if they're in the category, it will be tagged
        without confirmation
        excluded are categories to skip (treat as if user said 'n')
        if confirm is false, no confirmation question will be asked (all will be
        included)
    """
    # check if we're quitting
    if (len(catList) > 1 and catList[0] == ""):
        return catList
    cat = catlib.Category(wikipedia.getSite(), "Category:" + catName)
    response = "z"
    # if we have not already decided to tag this cat
    if (catName not in catList):
        # if the category is already in the taggedCats, treat that like a
        # "y" from the user
        if ("Category:"+catName in taggedCats):
            response = "y"
        # if the category is in otherTaggedCats, treat it like a "n"
        if ("Category:"+catName in otherTaggedCats):
            response = "n"
        elif ("Category talk:"+catName in excluded):
            response = "n"
        else:
            # if the name has a keyword in it, treat that like a "y" from the user
            for keyword in keywords:
                if (keyword in catName):
                    response = "y"
            # if confirm is False, treat it as if the user already said yes
            if (confirm == False):
                response = "y"
        # if response is still "z", ask the user
        if (response == "z"):
            response = wikipedia.input(questionText + cat.title() +
                                       u"? (y for yes, yn for yes but no recursion, s for stop recursion)")
    if (response == "s"):
        # put "" into the catList at the beginning as a marker
        catList.insert(0, "")
        return catList
    # add the category to the list
    if (response == "y" or response == "yn"):
        catList.append(cat.titleWithoutNamespace())
    # recurse through subcategories
    if (response == "y"):
        subcats = cat.subcategories()
        for subcat in subcats:
            updateCategoryList(catList, subcat.titleWithoutNamespace(), taggedCats,
                               otherTaggedCats, keywords, excluded, questionText,
                               confirm)
    return catList

def tagCategories (catName = "Tamil Nadu", tag = "WP India", otherTag = "",
                   project = "India", params = "|class=cat|tamilnadu=yes",
                   keywords = ["Tamil Nadu"]):
    """ tag all categories in the specified category and subcategories with the
        specified tag (at the top of the page)
        if otherTag is not "", skip categories which are tagged with otherTag
        check with the user for each category
        keywords are words that if they're in the category, it will be tagged
        without confirmation
    """
    wikipedia.put_throttle.setDelay(10, absolute = True)
    # get the list of categories which are already tagged
    taggedCatList = []
    taggedArticleList = []
    getTagged(tag, taggedCatList, taggedArticleList)
    otherTaggedCatList = []
    if (not otherTag == ""):
        getTagged(otherTag, otherTaggedCatList, taggedArticleList)
    # get the list of categories and articles that are to be excluded (articles
    # will be ignored)
    excluded = getExcluded(project)
    # get the category list
    catList = []
    catList = updateCategoryList(catList, catName, taggedCatList,
                                 otherTaggedCatList, keywords, excluded)
    # if the first element of catList is "", remove it, it was just a marker
    if (catList[0] == ""):
        catList.remove("")
    # remove duplicates and sort
    catList = dict.fromkeys(catList).keys()
    catList.sort()
    # tag the categories which are not already tagged
    for cat in catList:
        if (not "Category:"+cat in taggedCatList):
            tagPage("Category talk:" + cat, tag, params)

def untagCategories (catList = [], tag = "Electron", project = "Electronics"):
    """ untag all specified categories
    """
    wikipedia.put_throttle.setDelay(10, absolute = True)
    for cat in catList:
        untagPage("Category talk:" + cat, tag)

def getTagged (tag, catList, articles):
    """ get a list of categories and articles which contain the specified tag
    """
    page = wikipedia.Page(wikipedia.getSite(), "Template:" + tag)
    refs = page.getReferences(onlyTemplateInclusion=True)
    for page in refs:
        name = page.title()
        result = name.split(":")
        if (result[0] == "Category talk"):
            catList.append("Category:"+result[1])
        else:
            articles.append(name)

def untag (catList = [], tag = "Numismaticnotice", returnList = False):
    """ remove the tag from all articles in the specified categories
        this is useful when the bot makes a mistake
        if returnList is true, just return a list, don't actually untag
    """
    articles = []
    for catName in catList:
        findArticlesInCategory("Category:"+catName, articles, False)
    articles = dict.fromkeys(articles).keys()
    articles.sort()
    if (returnList):
        return articles
    else:
        for article in articles:
            untagPage(article, tag)
    wikipedia.stopme()

def classify (catName = "Unassessed Texas articles", tag = "WikiProject Texas",
              comment = "Texas assessment, class="):
    """ go through all articles in the specified category and classify them as
        image, template, category, portal, or NA.
        Articles are left as is (as are lists and disambig pages)
    """
    articles = []
    findArticlesInCategory("Category:"+catName, articles, False, True)
    templatesToTag = []
    categoriesToTag = []
    imagesToTag = []
    portalsToTag = []
#    dabsToTag = []
    for article in articles:
        # if this is a template
        if (article.find("Template talk:") != -1):
            templatesToTag.append(article)
        # if this is a category page
        if (article.find("Category talk:") != -1):
            categoriesToTag.append(article)
        # if this is an image
        if (article.find("Image talk:") != -1):
            imagesToTag.append(article)
        # if this is a portal
        if (article.find("Portal talk:") != -1):
            portalsToTag.append(article)
#        # if this is a regular talk page, assume it's disambig
#        if (article.find("Talk:") != -1):
#            dabsToTag.append(article)
    addParams(templatesToTag, "class", "template", tag, comment + "template")
    addParams(categoriesToTag, "class", "category", tag, comment + "category")
    addParams(imagesToTag, "class", "image", tag, comment + "image")
    addParams(portalsToTag, "class", "portal", tag, comment + "portal")
#    addParams(dabsToTag, "class", "dab", tag, comment + "dab")

# Note: Python binds a name to its most recent definition, so this version of
# addParams (intersection of two categories) is replaced at import time by the
# addParams defined below, which takes an explicit list of articles.
def addParams (firstCat = "Unassessed Louisville articles",
               secondCat = "Louisville stubs", recurse = True,
               paramName = "class", paramValue = "Stub",
               tag = "WikiProject Louisville",
               comment = "Louisville assessment, adding class=Stub"):
    """ find the articles in the intersection of firstCat and secondCat
        if recurse is true, include all subcats of secondCat (but not firstCat)
        paramName is the parameter to add (e.g., "class")
        paramValue is the value to assign (e.g., "NA")
        tag is the name of the template tag
        comment is the text to use for the comment when saving
    """
    # get the list of articles in the first category
    firstArticles = []
    findArticlesInCategory("Category:"+firstCat, firstArticles, False)
    # get the list of articles in the second category
    secondCatList = []
    secondCatList = updateCategoryList(secondCatList, secondCat, [], [],
                                       "Do you want to include ", False)
    secondArticles = []
    for cat in secondCatList:
        findArticlesInCategory("Category:"+cat, secondArticles, False)
    # get the list of articles that is in both
    articles = []
    for article in firstArticles:
        if (article in secondArticles):
            articles.append(article)
    addParams(articles, paramName, paramValue, tag, comment)

def addParams (articles, paramName, paramValue, tag, comment):
    """ articles is the list of articles to change
        paramName is the parameter to add (e.g., "class")
        paramValue is the value to assign (e.g., "NA")
        tag is the name of the template tag
        comment is the text to use for the comment when saving
    """
    for article in articles:
        page = wikipedia.Page(wikipedia.getSite(), article)
        text = page.get()
        # skip the first character so we don't have to worry about upper/lower
        tagStart = text.find(tag[1:])
        tagEnd = text[tagStart:].find("}}")
        tagEnd = tagStart + tagEnd
        paramStart = text[tagStart:tagEnd].find(paramName)
        if (paramStart != -1):
            paramStart = tagStart + paramStart - 1
            paramEnd = text[paramStart+1:tagEnd].find("|")
            if (paramEnd != -1):
                paramEnd = paramStart + paramEnd + 1
            else:
                paramEnd = tagEnd
        else:
            paramStart = tagEnd
            paramEnd = tagEnd
        text = text[:paramStart] + "|" + paramName + "=" + paramValue + \
               text[paramEnd:]
        if (not writePage(page, text, comment)):
            break

def replaceTag (oldTag = "LouisvilleWikiProject", newTag = "WikiProject Louisville"):
    """ replace the oldTag with the newTag (can be used to replace a tag with a
        tag plus parameters)
    """
    articles = []
    getTagged(oldTag, [], articles)
    for article in articles:
        page = wikipedia.Page(wikipedia.getSite(), article)
        text = page.get()
        text = wikipedia.replaceExceptMathNowikiAndComments(
            text, oldTag, newTag)
        if (not writePage(page, text, "replacing " + oldTag + " with " + newTag)):
            break

def tag (tag = "Numismaticnotice", params = "", otherTag = "Exonumianotice",
         project = "Numismatics", confirm = False, catList = [],
         returnList = False, assessmentTag = "numismatic articles",
         classify = True):
    """ tag articles in tagged categories
        if a page is already tagged with otherTag, skip it (use otherTag = ""
        for none)
        catList is a list of categories to check in. If empty, use tagged
        categories
        if params is given, include it after the tag, when tagging an article
        if returnList is true, don't actually tag anything, just return the list
        in this case, also don't skip a page just because it's already tagged
        assessmentTag is a text string contained in the assessment categories,
        use "" to ignore
        if classify is true, include class=
    """
    # get the list of all tagged articles in taggedArticles
    # if catList was given, leave it as is. Otherwise, populate catList with
    # all tagged categories
    taggedArticles = []
    if (len(catList) == 0):
        getTagged(tag, catList, taggedArticles)
        # skip the assessment categories (otherwise, we won't skip articles
        # which are currently tagged but shouldn't be)
        newCatList = []
        for cat in catList:
            if (assessmentTag == "" or cat.find(assessmentTag) == -1):
                newCatList.append(cat)
        catList = newCatList
    else:
        dummy = []
        getTagged(tag, dummy, taggedArticles)
        # put "Category:" in front of the category names
        newCatList = []
        for cat in catList:
            newCatList.append("Category:"+cat)
        catList = newCatList
    # add the articles tagged with otherTag to the list of taggedArticles
    if (otherTag != ""):
        getTagged(otherTag, [], taggedArticles)
    # get the list of untagged articles in the categories in catList (which was
    # either supplied as a parameter, or was populated with tagged categories)
    untaggedArticles = []
    for cat in catList:
        findArticlesInCategory(cat, untaggedArticles, confirm)
    # remove duplicates and sort
    untaggedArticles = dict.fromkeys(untaggedArticles).keys()
    untaggedArticles.sort()
    # if we're returning a list, stop here
    if (returnList):
        return untaggedArticles
    # make a list of articles that need to be tagged (by removing articles
    # that are already tagged from the list of all articles)
    for article in taggedArticles:
        if (article in untaggedArticles):
            untaggedArticles.remove(article)
    # remove excluded articles
    excluded = getExcluded(project)
    for page in excluded:
        if (page in untaggedArticles):
            untaggedArticles.remove(page)
    if (len(untaggedArticles) == 0):
        wikipedia.output(u"No untagged articles")
    print "Tagging " + str(len(untaggedArticles)) + " articles"
    # tag the articles
    for article in untaggedArticles:
        tagPage(article, tag, params, classify)
    wikipedia.stopme()

def fixWrongTags (catList = ["Coin games", "Electronic currencies",
                             "Digital currency exchangers",
                             "Digital gold currencies", "Money", "Money stubs",
                             "Foreign exchange market", "Ancient mints",
                             "Challenge coin"]):
    """ untag the articles in the specified categories, but only if they are not
        in other categories that require them to be tagged
    """
    # find articles that should be tagged
    needTagList = tag("Numismaticnotice", "", "Exonumianotice", "Numismatics",
                      False, [], True)
    # now get the list of articles to untag (returns all articles in the
    # specified categories, without checking if they're tagged)
    untagList = untag(catList, "Numismaticnotice", True)
    # if an article is in the untagList and not in the needTagList, untag it
    for article in untagList:
        if (not article in needTagList):
            untagPage(article, "Numismaticnotice")

def findDoubleTags (catList = []):
    """ find articles that are in numismatics as well as exonumia categories
    """
    # find articles that think they should be tagged Exonumia and Numismaticnotice
    numArticles = tag("Numismaticnotice", "", "", "Numismatics", False, [], True)
    getTagged("Numismaticnotice", [], numArticles)
    exoArticles = tag("Exonumianotice", "", "", "Numismatics", False, [], True)
    getTagged("Exonumianotice", [], exoArticles)
    bothArticles = []
    for article in numArticles:
        if (article in exoArticles):
            bothArticles.append(article)
    text = ""
    for article in bothArticles:
        text += "*[["+article+"]]<br>\n"
    print text
    wikipedia.stopme()

projects = ["Numismatics", "Numismatics", "Hawaii", "Texas", "Ice Hockey",
            "Louisville", "Kentucky", "Texas State Highways", "Dallas", "Comics",
            "Pittsburgh", "Baseball", "Bell System", "LGBT studies",
            "San Francisco Bay Area", "Africa", "Electronics", "Tennessee",
            "Automobiles", "Cricket"]

def listProjects ():
    """ print out a list of active projects, with numbers to use for an
        individual update
    """
    for proj in range(len(projects)):
        print(str(proj) + ": " + projects[proj])

def update (projectNums = []):
    """ update the project watchlists. If projectNums is given, only update the
        given project numbers (see projects for the list, remember to start at 0)
    """
    templates = ["Numismaticnotice", "Exonumianotice", "WPHawaii",
                 "WikiProject Texas", "Ice hockey", "WikiProject Louisville",
                 "WikiProject Kentucky", "Texas State Highway WikiProject",
                 "WikiProject Dallas", "comicsproj", "PittsburghWikiProject",
                 "Baseball-WikiProject", "WikiProject Bell System", "LGBTProject",
                 "SFBAProject", "AfricaProject", "Electron",
                 "WikiProject Tennessee", "AutomobileWatch", "CricketWatch"]
    articleOuts = ["Articles", "Exonumia articles", "Hawaii recent changes",
                   "Articles", "Articles", "Watchall", "Watchall", "Watchlist",
                   "Articles", "Articles", "Articles", "Articles", "Articles",
                   "Articles", "Watchlist", "Watchlist", "Articles", "Articles",
                   "Articles", "Articles"]
    # pages to include even though they aren't tagged
    includePagesLists = [["Template:Currencies of Africa",
                          "Template:Currencies of the Americas",
                          "Template:Currencies of Asia",
                          "Template:Currencies of Europe",
                          "Template:Currencies of Oceania"],
                         [], [], [], [], [], [], [], [], [],
                         [], [], [], [], [], [], [], [], [], []]
    # true if we're getting tagged articles, false if we're getting articles
    # in tagged categories
    taggedPagesFlags = [True, True, True, True, True, True, True, True, True,
                        True, True, True, True, True, True, True, True, True,
                        False, False]
    if (len(projectNums) == 0):
        projectNums = range(len(templates))
    for i in projectNums:
        template, project = templates[i], projects[i]
        articleOut, includePagesList = articleOuts[i], includePagesLists[i]
        taggedPagesFlag = taggedPagesFlags[i]
        print "Updating watchlist for: %s using template: %s. Saving to: %s" \
              % (project, template, articleOut)
        wl = Watchlist(template, project, articleOut, includePagesList)
        if (taggedPagesFlag):
            wl.getTaggedPages()
        else:
            wl.getPagesFromTaggedCategories()
        wl.writeList(taggedPagesFlag)
    wikipedia.stopme()
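
# --- Usage sketch (added for illustration; not part of the original bot) ---
# Assuming the old "compat" pywikipedia framework is installed and a valid
# user-config.py exists for the English Wikipedia, the watchlists are refreshed
# by calling update() from this module: listProjects() prints the index of each
# configured project, update([i, j, ...]) refreshes only those projects, and
# update() with no argument refreshes all of them. Setting DBG = True above
# makes writePage() save the generated pages to local .txt files instead of
# editing Wikipedia.
if __name__ == "__main__":
    listProjects()   # show the available project numbers
    update([0])      # e.g., refresh only project 0 (Numismatics)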