Jump to content

User:MastCell/dermimages.py

From Wikipedia, the free encyclopedia
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
# File: Dermimages.py
# By MastCell
# Released for any and all reuse and modification
# Use at your own risk.
# ========================
# This script does the following:
#  1. Load all articles linked to the Dermatology Task Force,
#     using Category:Dermatology task force articles.
#  2. Check each page for images, removing those which are part of
#     common templates
#  3. Output the results in a sortable wikitable which can be
#     cut-and-pasted onto Wikipedia.
#
# The goal is to assess how many dermatology-related articles currently
# lack images, and to assess the overall prevalence of images across
# derm-related articles.
#
# The script can be run from the command line. It will create a file
# called "dermimages_output.txt" in the current working directory (the
# directory the script is run from, which is not necessarily the one
# where the script itself resides). Note that if such a file already
# exists, it will be overwritten. The output format is meant to be
# cut-and-pasted into Wikipedia as a wikitable.
# =========================

# MWclient module for Wikimedia API calls
import mwclient


# Global set of image names that must NOT be counted as "content" images:
# icons pulled in by templates, featured article stars and similar
# decorations. Entries carry no namespace prefix; processPage() compares
# them against each image's page_title.
# Kept in alphabetical order; extend as needed.
global_exclude_list = set([
    "Ambox content.png",
    "Ambox contradict.svg",
    "Chromosome.svg",
    "Commons-logo.svg",
    "DNA stub.png",
    "Edit-clear.svg",
    "Featured article star.svg",
    "Flag of Germany.svg",
    "Gray944.png",
    "LinkFA-star.png",
    "Merge-arrow.svg",
    "Mergefrom.svg",
    "Mitotic spindle color micrograph.gif",
    "Normal Epidermis and Dermis with Intradermal Nevus 10x.JPG",
    "Question book-new.svg",
    "Rod of Asclepius2.svg",
    "Symbol support vote.svg",
    "Text document with red question mark.svg",
    "UK-Medical-Bio-Stub.svg",
    "Wiki letter w.svg",
])


# Global tally of pages, bucketed by how many content images each one has.
# "Total pages" is incremented by main_program(); the per-bucket entries
# are incremented by incrementPageCounter(). outputDictionary() writes
# every entry out at the end of the run.
global_imagenums = {
    "Total pages": 0,
    "Pages with zero images": 0,
    "Pages with one image": 0,
    "Pages with two images": 0,
    "Pages with three images": 0,
    "Pages with four or more images": 0,
}


# Open the site and collect pages from category
# (Note that these will generally be article talk pages, since that's
#  where the Derm task force template is typically placed)
#
# Both statements run at import time. wpHandle is the shared mwclient
# site handle that main_program() uses to look pages up by title;
# dermTalkPages is the object main_program() iterates over.
# NOTE(review): presumably mwclient returns a category listing for a
# "Category:" title, so that iterating it yields the member pages --
# confirm against the installed mwclient version.
wpHandle = mwclient.Site('en.wikipedia.org')
dermTalkPages = wpHandle.Pages['Category:Dermatology task force articles']


# Main program loop: load and process each page
def main_program():
    """Write the whole report: table header, one row per page, footer, totals.

    For every member of dermTalkPages the page with the same page_title is
    looked up through wpHandle, counted in global_imagenums["Total pages"],
    and handed to processPage() together with its images.
    """
    setUpTable()
    for talkPage in dermTalkPages:
        # The category lists talk pages; re-fetch by bare title so that we
        # work on the article itself.
        # NOTE(review): assumes page_title is the title without its
        # namespace prefix -- confirm against mwclient.
        article = wpHandle.Pages[talkPage.page_title]

        # Count the page before its images are fetched.
        global_imagenums["Total pages"] += 1

        processPage(article, article.images())
    closeTable()
    outputDictionary()
    
# Page processing function
# Page and image names may contain non-ASCII characters. Writing them
# unconverted can fail with a UnicodeEncodeError, and writing encoded
# bytes fails on a text-mode file under Python 3, so every name is turned
# into pure-ASCII text first. Characters outside ASCII become numeric
# character references (e.g. "&#233;"), which MediaWiki is expected to
# render as the original character.
def _asciiWikitext(text):
    """Return text as an ASCII-only text string, using &#NNN; references."""
    return text.encode("ascii", "xmlcharrefreplace").decode("ascii")


def processPage(page, imageList):
    """Write one wikitable row for a page and record its image count.

    The row holds a link to the page, links to every image whose
    page_title is not in global_exclude_list (separated by "<br>"), and
    the number of such images. That number is then passed to
    incrementPageCounter().

    Args:
        page: object whose `name` attribute is the page title to link to.
        imageList: iterable of objects with `name` (written as the link
            target) and `page_title` (checked against the exclude list).
    """
    outputFile.write("|-\n")
    outputFile.write("| [[" + _asciiWikitext(page.name) + "]] ||")

    # The leading colon makes each entry a link to the file page instead of
    # embedding the image in the table.
    imageLinks = [
        "[[:" + _asciiWikitext(image.name) + "]]"
        for image in imageList
        if image.page_title not in global_exclude_list
    ]
    outputFile.write("<br>\n".join(imageLinks))

    imageCount = len(imageLinks)
    outputFile.write("\n || " + str(imageCount) + "\n")
    incrementPageCounter(imageCount)


# Function to update the dictionary of pages
def incrementPageCounter(numImages):
    """Add one to the global_imagenums bucket matching numImages.

    Counts of 0, 1, 2 and 3 each have their own bucket; every other value
    is tallied under "Pages with four or more images".

    Args:
        numImages: number of content images found on a page.
    """
    # Looked up by equality: comparing integers with "is" only works by
    # accident of the interpreter's small-integer caching.
    bucket = {
        0: "Pages with zero images",
        1: "Pages with one image",
        2: "Pages with two images",
        3: "Pages with three images",
    }.get(numImages, "Pages with four or more images")
    global_imagenums[bucket] += 1
        

# Output the table header boilerplate
def setUpTable():
    """Write the opening line of the sortable wikitable and its header row.

    The header declares three columns: Page, Images (marked unsortable)
    and Number of images. Each line is written to outputFile followed by
    a newline.
    """
    headerLines = (
        '{| class="wikitable sortable" border="1"',
        '! Page !! class="unsortable" | Images !! Number of images',
    )
    for line in headerLines:
        outputFile.write(line + "\n")


# Output the table footer boilerplate
def closeTable():
    """Write the wikitable closing marker to outputFile.

    No newline is written after the marker.
    """
    tableEnd = "|}"
    outputFile.write(tableEnd)


# Output the dictionary counts
def outputDictionary():
    """Write the totals section: a heading, then one "label: count" line
    for every entry of global_imagenums, in the dictionary's iteration
    order.
    """
    outputFile.write("\n== Totals by number of images ==\n")
    # items() exists on both Python 2 and Python 3 dictionaries;
    # iteritems() was removed in Python 3.
    for label, count in global_imagenums.items():
        outputFile.write(label + ": " + str(count) + "\n")
    
    
########################################
# Main program
# ------------
# Opens a handle to the output file, then runs the main loop
#
# The file name is relative, so the file is created (or, because of
# mode 'w', truncated and overwritten) in the current working directory.
# The handle is bound to the module-level name outputFile, which the
# table-writing functions above use directly; the with-statement closes
# it once main_program() returns or raises.
########################################
with open('dermimages_output.txt', 'w') as outputFile:
    main_program()