Jump to content

User:PNG crusade bot/Source code

From Wikipedia, the free encyclopedia
This is an old revision of this page, as edited by Remember the dot (talk | contribs) at 02:03, 12 January 2007 (Created page with '{{GPL-self}} <pre> 'PNG crusade bot 'Copyright (C) 2007 English Wikipedia user "Remember the dot" 'This program is free software; you can redistribute it and/or m...'). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.
(diff) ← Previous revision | Latest revision (diff) | Newer revision → (diff)

Template:GPL-self

'PNG crusade bot
'Copyright (C) 2007  English Wikipedia user "Remember the dot"

'This program is free software; you can redistribute it and/or modify
'it under the terms of the GNU General Public License as published by
'the Free Software Foundation; either version 2 of the License, or
'(at your option) any later version.

'This program is distributed in the hope that it will be useful,
'but WITHOUT ANY WARRANTY; without even the implied warranty of
'MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
'GNU General Public License for more details.

'You should have received a copy of the GNU General Public License
'along with this program; if not, write to the Free Software
'Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

''' <summary>
''' The PNG Crusade Bot
''' </summary>
''' <remarks>Requires the project to reference and import System.Drawing and System.Web in addition to the standard references and imports for a VB .NET console program</remarks>
Module PngCrusadeBot
    ''' <summary>
    ''' The category to search. Make sure TargetCategory and TargetTemplate match!
    ''' </summary>
    ''' <remarks>Sample values: "Images which should be in PNG format" "Images with inappropriate GIF compression" "Images with inappropriate JPEG compression"</remarks>
    Private Const TargetCategory As String = "Images which should be in PNG format"
    ''' <summary>
    ''' The template associated with the category target. Make sure TargetCategory and TargetTemplate match!
    ''' </summary>
    ''' <remarks>Sample values: {{ShouldBePNG}} {{BadGIF}} {{BadJPEG}}</remarks>
    Private Const TargetTemplate As String = "{{ShouldBePNG}}"

    ''' <summary>
    ''' Reader settings used for every XML/XHTML document the bot downloads
    ''' </summary>
    ''' <remarks>Main sets ProhibitDtd to False on this object before the first request is made</remarks>
    Private XmlReaderSettingsForHTML As New Xml.XmlReaderSettings
    ''' <summary>
    ''' Path of the scratch file that receives the PNG version of the image being examined
    ''' </summary>
    ''' <remarks>
    ''' Built from GetTempPath and GetRandomFileName instead of GetTempFileName. GetTempFileName creates an empty
    ''' file on disk, and because ".png" was appended to its name that empty file was never used or deleted.
    ''' </remarks>
    Private TempPath As String = IO.Path.Combine(IO.Path.GetTempPath(), IO.Path.GetRandomFileName() + ".png")
    ''' <summary>
    ''' Cookie container attached to every request; Login fills it with the session cookies
    ''' </summary>
    Private Cookies As New Net.CookieContainer

    ''' <summary>
    ''' Sends a GET request to the given address and hands back the raw response
    ''' </summary>
    ''' <param name="uri">The address to request</param>
    ''' <returns>The response to the request</returns>
    ''' <remarks>
    ''' The request carries the UserAgent string "Mozilla/5.0 (compatible)" and the shared cookie container.
    ''' Callers that only need the body normally use HttpGet instead.
    ''' </remarks>
    Private Function HttpGetResponse(ByVal uri As String) As Net.WebResponse
        Dim getRequest As Net.HttpWebRequest = CType(Net.WebRequest.Create(uri), Net.HttpWebRequest)

        With getRequest
            .CookieContainer = Cookies
            .UserAgent = "Mozilla/5.0 (compatible)"
        End With

        Return getRequest.GetResponse()
    End Function

    ''' <summary>
    ''' Requests the given address and returns the stream holding the body of the response
    ''' </summary>
    ''' <param name="uri">The address to request</param>
    ''' <returns>The response stream; nothing in this function closes it</returns>
    Private Function HttpGet(ByVal uri As String) As IO.Stream
        Dim response As Net.WebResponse = HttpGetResponse(uri)
        Return response.GetResponseStream()
    End Function

    ''' <summary>
    ''' Downloads an XML document over HTTP and returns a navigator positioned on its root
    ''' </summary>
    ''' <param name="uri">The address of the XML document</param>
    ''' <returns>A navigator over an in-memory copy of the document</returns>
    ''' <remarks>
    ''' XPathDocument reads the whole document in its constructor, so the reader and the response stream are
    ''' closed before returning. Leaving them open keeps the underlying HTTP connection checked out.
    ''' </remarks>
    Private Function HttpGetXML(ByVal uri As String) As Xml.XPath.XPathNavigator
        Using responseStream As IO.Stream = HttpGet(uri)
            Using reader As Xml.XmlReader = Xml.XmlReader.Create(responseStream, XmlReaderSettingsForHTML)
                Return New Xml.XPath.XPathDocument(reader).CreateNavigator()
            End Using
        End Using
    End Function

    ''' <summary>
    ''' Converts a timestamp attribute into the "time, day month year" form used in the generated file history
    ''' </summary>
    ''' <param name="timestampAttribute">A timestamp such as "2006-11-05T18:13:50Z"; only the first 19 characters are read</param>
    ''' <returns>The reformatted timestamp, e.g. "18:13:50, 5 November 2006"</returns>
    ''' <remarks>
    ''' Parsing and formatting both use the invariant culture so that the month is always written in English,
    ''' whatever the regional settings of the machine running the bot.
    ''' </remarks>
    Private Function WikifyTimestamp(ByVal timestampAttribute As String) As String
        Dim invariant As System.Globalization.CultureInfo = System.Globalization.CultureInfo.InvariantCulture

        'anything after the seconds (the trailing "Z") is ignored, as it always has been
        Dim moment As Date = Date.ParseExact(timestampAttribute.Substring(0, 19), "yyyy-MM-dd'T'HH:mm:ss", invariant)

        '"d" writes the day without a leading zero
        Return moment.ToString("HH:mm:ss, d MMMM yyyy", invariant)
    End Function

    ''' <summary>
    ''' From a navigator positioned on an image or ih element, builds one line of wikitext revision history
    ''' </summary>
    ''' <param name="navigator">Element carrying the timestamp, user, comment, width, height and size attributes</param>
    ''' <returns>The timestamp, user links, dimensions and byte count, followed by the upload comment when there is one</returns>
    Private Function GetRevisionHistoryLine(ByVal navigator As Xml.XPath.XPathNavigator) As String
        Dim invariant As System.Globalization.CultureInfo = System.Globalization.CultureInfo.InvariantCulture
        Dim username As String = navigator.GetAttribute("user", "")
        Dim comment As String = navigator.GetAttribute("comment", "")
        Dim byteCount As Long = Long.Parse(navigator.GetAttribute("size", ""), invariant)
        Dim line As New System.Text.StringBuilder

        line.Append(CStr(WikifyTimestamp(navigator.GetAttribute("timestamp", ""))))
        line.Append(" . . [[User:" + username + "|" + username + "]] ([[User talk:" + username + "|Talk]] | [[Special:Contributions/" + username + "|Contribs]]) . . ")
        line.Append(navigator.GetAttribute("width", "") + "x" + navigator.GetAttribute("height", ""))
        'the byte count is grouped with commas regardless of the regional settings of the machine
        line.Append(" (" + byteCount.ToString("N0", invariant) + " bytes)")

        If comment <> "" Then
            'Replace every less-than sign with its HTML entity (ampersand, "lt", semicolon) so that no tag in
            'the comment can take effect. The previous code replaced the character with itself, which did nothing.
            'EOLs become spaces so everything stays on one line.
            line.Append(" (''" + comment.Replace("<", "&lt;").Replace(vbLf, " ") + "'')")
        End If

        Return line.ToString()
    End Function

    ''' <summary>
    ''' Retrieves an image from Wikipedia together with its size and upload history
    ''' </summary>
    ''' <param name="imageName">The title of the image to retrieve, without the "Image:" prefix</param>
    ''' <param name="imageSize">Receives the value of the size attribute of the current revision</param>
    ''' <param name="revisionHistory">Receives the upload history as wikitext, newest revision first, one revision per line</param>
    ''' <returns>The downloaded image, or Nothing when the query result has no page or image element</returns>
    ''' <remarks>
    ''' Uses the Query API. When Nothing is returned, imageSize and revisionHistory are left untouched.
    ''' NOTE(review): the original comment says images hosted on the commons cannot be processed; a missing
    ''' image element is presumably how such an image shows up, so that case now returns Nothing as well
    ''' instead of failing with a NullReferenceException. Confirm against the Query API output.
    ''' </remarks>
    Private Function GetImage(ByVal imageName As String, ByRef imageSize As String, ByRef revisionHistory As String) As Image
        Dim pageNode As Xml.XPath.XPathNavigator = HttpGetXML("http://en.wikipedia.org/w/query.php?format=xml&what=imageinfo&iihistory&iiurl&titles=Image:" + HttpUtility.UrlEncode(imageName)).SelectSingleNode("/yurik/pages/page")
        If pageNode Is Nothing Then Return Nothing

        Dim imageNode As Xml.XPath.XPathNavigator = pageNode.SelectSingleNode("image")
        If imageNode Is Nothing Then Return Nothing

        imageSize = imageNode.GetAttribute("size", "")

        'Copy the download into memory so that the HTTP response can be closed straight away. Image.FromStream
        'needs its stream to stay open for as long as the image is in use, so the image is built on the
        'memory copy, which is deliberately not disposed here.
        Dim imageData As New IO.MemoryStream
        Using responseStream As IO.Stream = HttpGet(imageNode.GetAttribute("url", ""))
            Dim buffer(8191) As Byte
            Dim bytesRead As Integer = responseStream.Read(buffer, 0, buffer.Length)
            While bytesRead > 0
                imageData.Write(buffer, 0, bytesRead)
                bytesRead = responseStream.Read(buffer, 0, buffer.Length)
            End While
        End Using
        imageData.Position = 0
        Dim downloadedImage As Image = Image.FromStream(imageData)

        'the ih elements are listed old-to-new but the history has to read new-to-old, so they are reversed
        Dim olderLines As New System.Collections.Generic.List(Of String)
        For Each revision As Xml.XPath.XPathNavigator In pageNode.Select("imghistory/ih")
            olderLines.Add(CStr(GetRevisionHistoryLine(revision)))
        Next
        olderLines.Reverse()

        'the current revision comes first, followed by the older ones, joined with line breaks
        Dim historyLines As New System.Collections.Generic.List(Of String)
        historyLines.Add(CStr(GetRevisionHistoryLine(imageNode)))
        historyLines.AddRange(olderLines)
        revisionHistory = String.Join("<br />" + Environment.NewLine, historyLines.ToArray())

        Return downloadedImage
    End Function

    ''' <summary>
    ''' Lists the titles of the pages that the imagelinks query reports for an image
    ''' </summary>
    ''' <param name="imageName">The title of the image, without the "Image:" prefix</param>
    ''' <returns>One title per il element in the reply; an empty array when there are none</returns>
    ''' <remarks>Uses the MediaWiki API with illimit=500</remarks>
    Private Function GetImageLinks(ByVal imageName As String) As String()
        Dim requestUri As String = "http://en.wikipedia.org/w/api.php?action=query&format=xml&list=imagelinks&illimit=500&titles=Image:" + HttpUtility.UrlEncode(imageName)
        Dim titles As New System.Collections.Generic.List(Of String)

        'each selected node is the title attribute of one il element
        For Each titleAttribute As Xml.XPath.XPathNavigator In HttpGetXML(requestUri).Select("/api/query/imagelinks/il/@title")
            titles.Add(titleAttribute.Value)
        Next

        Return titles.ToArray()
    End Function

    ''' <summary>
    ''' Fetches the wikitext of a page
    ''' </summary>
    ''' <param name="pageTitle">The full title of the page, including any namespace prefix</param>
    ''' <returns>The text of the rev element returned for the page</returns>
    ''' <remarks>Uses the MediaWiki API</remarks>
    Private Function GetPageSource(ByVal pageTitle As String) As String
        Dim requestUri As String = "http://en.wikipedia.org/w/api.php?action=query&prop=revisions&format=xml&rvprop=content&titles=" + HttpUtility.UrlEncode(pageTitle)
        Dim revisionNode As Xml.XPath.XPathNavigator = HttpGetXML(requestUri).SelectSingleNode("/api/query/pages/page/revisions/rev")

        Return revisionNode.Value
    End Function

    ''' <summary>
    ''' Lists the titles of the pages that the category query reports for a category
    ''' </summary>
    ''' <param name="categoryTitle">The title of the category, passed as the cptitle argument</param>
    ''' <returns>One entry per title element in the reply, in document order</returns>
    ''' <remarks>Uses the Query API</remarks>
    Private Function GetPagesInCategory(ByVal categoryTitle As String) As String()
        Dim requestUri As String = "http://en.wikipedia.org/w/query.php?what=category&format=xml&cptitle=" + HttpUtility.UrlEncode(categoryTitle)
        Dim titleNodes As Xml.XPath.XPathNodeIterator = HttpGetXML(requestUri).Select("/yurik/pages/page/title")
        Dim titles(titleNodes.Count - 1) As String
        Dim position As Integer = 0

        'walk the iterator once, filling the array front to back
        While position < titles.Length
            titleNodes.MoveNext()
            titles(position) = titleNodes.Current.Value
            position += 1
        End While

        Return titles
    End Function

    ''' <summary>
    ''' Replaces the text of a page by submitting the regular edit form
    ''' </summary>
    ''' <param name="pageTitle">The full title of the page to save</param>
    ''' <param name="value">The new wikitext of the page</param>
    ''' <param name="editSummary">The edit summary to record</param>
    ''' <param name="minor">True to mark the edit as minor</param>
    ''' <exception cref="InvalidOperationException">The edit form did not contain the wpEditToken or wpEdittime field</exception>
    ''' <remarks>Sends the cookies stored by Login. The edit form is requested first to obtain the token and edit time.</remarks>
    Private Sub SetPageSource(ByVal pageTitle As String, ByVal value As String, Optional ByVal editSummary As String = "", Optional ByVal minor As Boolean = False)
        Dim requestUriBeginning As String = "http://en.wikipedia.org/w/index.php?title=" + HttpUtility.UrlEncode(pageTitle.Replace(" "c, "_"c)) + "&action="
        Dim editToken As String
        Dim editTime As String

        'get edit token; the form is loaded completely and then closed so the connection is released
        Using editFormStream As IO.Stream = HttpGet(requestUriBeginning + "edit")
            Using reader As Xml.XmlReader = Xml.XmlReader.Create(editFormStream, XmlReaderSettingsForHTML)
                Dim namespaces As New Xml.XmlNamespaceManager(reader.NameTable)
                Dim navigator As Xml.XPath.XPathNavigator = New Xml.XPath.XPathDocument(reader).CreateNavigator()
                namespaces.AddNamespace("html", "http://www.w3.org/1999/xhtml")

                Dim tokenField As Xml.XPath.XPathNavigator = navigator.SelectSingleNode("//html:input[@name='wpEditToken']", namespaces)
                Dim timeField As Xml.XPath.XPathNavigator = navigator.SelectSingleNode("//html:input[@name='wpEdittime']", namespaces)
                If tokenField Is Nothing OrElse timeField Is Nothing Then
                    Throw New InvalidOperationException("The edit form for """ + pageTitle + """ does not contain an edit token and edit time, so the page cannot be saved.")
                End If

                editToken = tokenField.GetAttribute("value", "")
                editTime = timeField.GetAttribute("value", "")
            End Using
        End Using

        'build the form data; every value is URL encoded because the body is application/x-www-form-urlencoded
        '(an unencoded "+" in the token, for example, would be read by the server as a space)
        Dim requestString As String = "wpSection=&scrollTop=&wpSave=Save%20page&wpEditToken=" + HttpUtility.UrlEncode(editToken) + "&wpStarttime=" + Date.UtcNow.ToString("yyyyMMddHHmmss", System.Globalization.CultureInfo.InvariantCulture) + "&wpEdittime=" + HttpUtility.UrlEncode(editTime) + "&wpSummary=" + HttpUtility.UrlEncode(editSummary) + "&"
        If minor Then requestString += "wpMinoredit=1&"
        requestString += "wpTextbox1=" + HttpUtility.UrlEncode(value)
        Dim bytes() As Byte = System.Text.Encoding.UTF8.GetBytes(requestString)

        'post back data
        Dim request As Net.HttpWebRequest = CType(Net.WebRequest.Create(requestUriBeginning + "submit"), Net.HttpWebRequest)
        request.Method = "POST"
        request.UserAgent = "Mozilla/5.0 (compatible)"
        request.ContentType = "application/x-www-form-urlencoded; encoding=UTF-8"
        request.CookieContainer = Cookies 'contains login info
        request.ContentLength = bytes.Length 'the length of what is actually written, in bytes rather than characters

        Using requestStream As IO.Stream = request.GetRequestStream()
            requestStream.Write(bytes, 0, bytes.Length)
        End Using

        request.GetResponse().Close() 'a response is not necessary and it leaks connections to not close it
    End Sub

    ''' <summary>
    ''' Logs in through the API and keeps the resulting cookies for all later requests
    ''' </summary>
    ''' <param name="username">The account name</param>
    ''' <param name="password">The password of the account</param>
    ''' <remarks>
    ''' Thanks to http://www.netomatix.com/HttpPostData.aspx
    ''' NOTE(review): the request goes to an http:// address, so the password is not encrypted in transit.
    ''' </remarks>
    Private Sub Login(ByVal username As String, ByVal password As String)
        Dim request As Net.HttpWebRequest = CType(Net.WebRequest.Create("http://en.wikipedia.org/w/api.php"), Net.HttpWebRequest)

        'both values are URL encoded: the name may contain spaces and the password may contain "&", "=" or "+"
        Dim requestString As String = "action=login&lgname=" + HttpUtility.UrlEncode(username) + "&lgpassword=" + HttpUtility.UrlEncode(password)
        Dim bytes() As Byte = System.Text.Encoding.UTF8.GetBytes(requestString)

        request.Method = "POST"
        request.UserAgent = "Mozilla/5.0 (compatible)"
        request.ContentType = "application/x-www-form-urlencoded"
        request.CookieContainer = Cookies 'store the login info for later
        request.ContentLength = bytes.Length

        Using requestStream As IO.Stream = request.GetRequestStream()
            requestStream.Write(bytes, 0, bytes.Length)
        End Using

        'getting the response stores the cookies (they contain login tokens) in the container;
        'the response itself is not needed and is closed so that the connection is released
        request.GetResponse().Close()
    End Sub

    ''' <summary>Uploads local image to wiki site.</summary>
    ''' <param name="filePath">Path of image file.</param>
    ''' <param name="imageTitle">File name sent to the wiki for the upload.</param>
    ''' <param name="description">Image description.</param>
    ''' <remarks>
    ''' Ported from DotNetWikiBot
    ''' 
    ''' DotNetWikiBot Framework is available under The MIT License (also called X11 license). Below is the text of the license.
    '''
    ''' Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
    '''
    ''' The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
    '''
    ''' THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    ''' </remarks>
    Private Sub UploadImage(ByVal filePath As String, ByVal imageTitle As String, ByVal description As String)
        Dim webReq As Net.HttpWebRequest = CType(Net.WebRequest.Create("http://en.wikipedia.org/wiki/Special:Upload"), Net.HttpWebRequest)
        Dim boundary As String = "----------" + DateTime.Now.Ticks.ToString("x")

        webReq.Method = "POST"
        webReq.ContentType = "multipart/form-data; boundary=" + boundary
        webReq.UserAgent = "Mozilla/5.0 (compatible)"
        webReq.CookieContainer = Cookies

        'multipart bodies require CRLF line ends, so vbCrLf is used instead of Environment.NewLine,
        'which is only CRLF on Windows
        Dim ph As String = "--" + boundary + vbCrLf + "Content-Disposition: form-data; name="""
        Dim sb As New System.Text.StringBuilder
        sb.Append(ph + "wpIgnoreWarning""" + vbCrLf + vbCrLf + "1" + vbCrLf)
        sb.Append(ph + "wpUploadAffirm""" + vbCrLf + vbCrLf + "1" + vbCrLf)
        sb.Append(ph + "wpUpload""" + vbCrLf + vbCrLf + "upload bestand" + vbCrLf)
        sb.Append(ph + "wpUploadDescription""" + vbCrLf + vbCrLf + description + vbCrLf)
        sb.Append(ph + "wpUploadFile""; filename=""" + imageTitle + """" + vbCrLf + "Content-Type: application/octet-stream" + vbCrLf + vbCrLf)

        Dim postHeaderBytes() As Byte = System.Text.Encoding.UTF8.GetBytes(sb.ToString())
        Dim fileBytes() As Byte = IO.File.ReadAllBytes(filePath)
        'the last delimiter carries a trailing "--", which marks the end of the multipart body
        Dim boundaryBytes() As Byte = System.Text.Encoding.ASCII.GetBytes(vbCrLf + "--" + boundary + "--" + vbCrLf)
        webReq.ContentLength = postHeaderBytes.Length + fileBytes.Length + boundaryBytes.Length

        Using reqStream As IO.Stream = webReq.GetRequestStream()
            reqStream.Write(postHeaderBytes, 0, postHeaderBytes.Length)
            reqStream.Write(fileBytes, 0, fileBytes.Length)
            reqStream.Write(boundaryBytes, 0, boundaryBytes.Length)
        End Using

        'the reply is not inspected; it is closed so that the connection is released
        webReq.GetResponse().Close()
    End Sub

    ''' <summary>
    ''' Stops processing for a while.
    ''' </summary>
    ''' <param name="message">The message to write to the screen; ". Sleeping..." is appended to it.</param>
    ''' <param name="milliseconds">How long to wait, in milliseconds. Defaults to 10 seconds.</param>
    ''' <remarks>All bots must pause between edits.</remarks>
    Private Sub Pause(ByVal message As String, Optional ByVal milliseconds As Integer = 10000)
        Console.WriteLine(message + ". Sleeping...")
        'the delay is passed as a number; it used to be the string "10000", converted implicitly at run time
        Threading.Thread.Sleep(milliseconds)
    End Sub

    ''' <summary>
    ''' Replaces every occurrence of a piece of text, ignoring case and treating "_" and " " as the same character
    ''' </summary>
    ''' <param name="text">The text to search</param>
    ''' <param name="find">The text to look for</param>
    ''' <param name="replace">The text that takes the place of each occurrence</param>
    ''' <returns>The text with every occurrence replaced; the text unchanged when it or the search text is empty</returns>
    ''' <remarks>
    ''' The result is assembled from slices of the original text. The earlier version applied offsets found in
    ''' the original text to the partly replaced result, so from the second match onwards it cut at the wrong
    ''' place whenever the replacement and the search text differed in length. Matches do not overlap.
    ''' </remarks>
    Private Function WikiReplace(ByVal text As String, ByVal find As String, ByVal replace As String) As String
        If String.IsNullOrEmpty(text) OrElse String.IsNullOrEmpty(find) Then Return text

        'normalising the underscores keeps every character at its original offset,
        'so offsets found in the search copies are valid in the original text as well
        Dim searchText As String = text.Replace("_"c, " "c)
        Dim searchFind As String = find.Replace("_"c, " "c)
        Dim result As New System.Text.StringBuilder(text.Length)
        Dim copiedUpTo As Integer = 0
        Dim index As Integer = searchText.IndexOf(searchFind, StringComparison.OrdinalIgnoreCase)

        While index <> -1
            result.Append(text, copiedUpTo, index - copiedUpTo) 'the untouched text in front of the match
            result.Append(replace)
            copiedUpTo = index + searchFind.Length
            index = searchText.IndexOf(searchFind, copiedUpTo, StringComparison.OrdinalIgnoreCase)
        End While

        result.Append(text, copiedUpTo, text.Length - copiedUpTo) 'whatever follows the last match
        Return result.ToString()
    End Function

    ''' <summary>
    ''' Replaces every occurrence of a piece of text, ignoring case
    ''' </summary>
    ''' <param name="text">The text to search</param>
    ''' <param name="find">The text to look for</param>
    ''' <param name="replace">The text that takes the place of each occurrence</param>
    ''' <returns>The text with every occurrence replaced; the text unchanged when it or the search text is empty</returns>
    ''' <remarks>
    ''' The result is assembled from slices of the original text. The earlier version applied offsets found in
    ''' the original text to the partly replaced result, so from the second match onwards it cut at the wrong
    ''' place whenever the replacement and the search text differed in length. Matches do not overlap.
    ''' </remarks>
    Private Function CaseInsensitiveReplace(ByVal text As String, ByVal find As String, ByVal replace As String) As String
        If String.IsNullOrEmpty(text) OrElse String.IsNullOrEmpty(find) Then Return text

        Dim result As New System.Text.StringBuilder(text.Length)
        Dim copiedUpTo As Integer = 0
        Dim index As Integer = text.IndexOf(find, StringComparison.OrdinalIgnoreCase)

        While index <> -1
            result.Append(text, copiedUpTo, index - copiedUpTo) 'the untouched text in front of the match
            result.Append(replace)
            copiedUpTo = index + find.Length
            index = text.IndexOf(find, copiedUpTo, StringComparison.OrdinalIgnoreCase)
        End While

        result.Append(text, copiedUpTo, text.Length - copiedUpTo) 'whatever follows the last match
        Return result.ToString()
    End Function

    ''' <summary>
    ''' Tells whether the API reports a page id for the given title
    ''' </summary>
    ''' <param name="pageTitle">The full title of the page, including any namespace prefix</param>
    ''' <returns>True when the page element of the reply has a non-empty pageid attribute</returns>
    ''' <remarks>Uses MediaWiki API</remarks>
    Private Function PageExists(ByVal pageTitle As String) As Boolean
        Dim requestUri As String = "http://en.wikipedia.org/w/api.php?action=query&format=xml&titles=" + HttpUtility.UrlEncode(pageTitle)
        Dim pageNode As Xml.XPath.XPathNavigator = HttpGetXML(requestUri).SelectSingleNode("/api/query/pages/page")
        Dim pageId As String = pageNode.GetAttribute("pageid", "")

        Return pageId <> ""
    End Function

    ''' <summary>
    ''' Entry point: logs in, then examines every page listed in TargetCategory
    ''' </summary>
    ''' <remarks>
    ''' The password is asked for on the console. The call to Login previously had no password argument at all,
    ''' which does not compile, and a password does not belong in the source code.
    ''' </remarks>
    Public Sub Main()
        'set things up
        Net.ServicePointManager.DefaultConnectionLimit = 10 'evil, i know, but it won't work with only 2 connections for some reason. sorry, RFC 2616 section 8.1.4...
        XmlReaderSettingsForHTML.ProhibitDtd = False

        Console.WriteLine("PNG crusade bot version 2007.01.11 (11 Jan 2007)")
        Console.WriteLine("Copyright (C) English Wikipedia user ""Remember the dot""")
        Console.WriteLine("The PNG crusade bot comes with ABSOLUTELY NO WARRANTY")
        Console.WriteLine("This is free software, and you are welcome to redistribute it")
        Console.WriteLine("under the terms of the GNU General Public License.")
        Console.WriteLine()

        'start!
        Console.Write("Password for ""PNG crusade bot"" (the input is shown on screen): ")
        Dim password As String = Console.ReadLine()
        Login("PNG crusade bot", password)

        For Each pageName As String In GetPagesInCategory(TargetCategory)
            'skip the 1 page in this category that is not an image
            If pageName <> "Wikipedia:Template messages/Image namespace" Then
                ProcessImage(pageName.Substring("Image:".Length))
            End If

            Console.WriteLine() 'put a line break between each image
        Next

        IO.File.Delete(TempPath)

        Console.WriteLine("All done! Press enter to exit.")
        Console.ReadLine() 'allow the log to be read
    End Sub

    ''' <summary>
    ''' Converts one image to PNG and, when that saves space, uploads it and repoints the pages that use it
    ''' </summary>
    ''' <param name="imageName">The title of the image, without the "Image:" prefix</param>
    ''' <remarks>Returns early, after writing the reason to the console, whenever the image is skipped</remarks>
    Private Sub ProcessImage(ByVal imageName As String)
        Dim imageSizeText As String = ""
        Dim imageRevisionHistory As String = ""

        Console.WriteLine("Looking at """ + imageName + """...")

        If PageExists("Image talk:" + imageName) Then
            Console.WriteLine("An image talk page exists. This bot is not designed to handle image talk pages. Moving to next image.")
            Return
        End If

        Dim downloadedImage As Image = GetImage(imageName, imageSizeText, imageRevisionHistory)

        If downloadedImage Is Nothing Then
            Console.WriteLine("This image appears to be from the Wikimedia commons. This bot is not designed to handle images on the Wikimedia commons.")
            Return
        End If

        'the image is only needed to write the temporary PNG, so it is released as soon as that is done
        Try
            downloadedImage.Save(TempPath, Imaging.ImageFormat.Png)
        Finally
            downloadedImage.Dispose()
        End Try

        Dim imageSize As Long = Long.Parse(imageSizeText, System.Globalization.CultureInfo.InvariantCulture)
        Dim pngFileSize As Long = New IO.FileInfo(TempPath).Length

        If pngFileSize >= imageSize Then 'the PNG version is not smaller
            Console.WriteLine("This image could not save anything.")
            Return
        End If

        Console.WriteLine("This image could save " & (imageSize - pngFileSize) & " bytes. Gathering data...")

        Dim newImageName As String = InputBox("What is the new name for """ + imageName + """? Do not include the "".png"" extension.", "New image name", IO.Path.GetFileNameWithoutExtension(imageName)) + ".png"

        If newImageName = ".png" Then 'if the input was an empty string or the user pressed Cancel
            Console.WriteLine("Aborted. Moving on.")
            Return
        End If

        'create variables for (re)writing image pages; the old page text is first stripped of the target template
        Dim oldImagePageSource As String = CaseInsensitiveReplace(CaseInsensitiveReplace(CaseInsensitiveReplace(GetPageSource("Image:" + imageName), vbLf + TargetTemplate, ""), TargetTemplate + vbLf, ""), TargetTemplate, vbLf)
        'the page for the PNG version
        Dim newImagePageSource As String = "__NOTOC__" + Environment.NewLine + _
                                           oldImagePageSource + Environment.NewLine + _
                                           "== Automatically converted to PNG ==" + Environment.NewLine + _
                                           "The [[User:PNG crusade bot|PNG crusade bot]] automatically converted this image to the more efficient [[PNG]] format. The image was originally uploaded as """ + imageName + """." + Environment.NewLine + _
                                           "=== Previous file history ===" + Environment.NewLine + _
                                           imageRevisionHistory
        'the page for the old version
        oldImagePageSource = "{{PNG version available|" + newImageName + "}}" + Environment.NewLine + _
                             "{{subst:orfur|Image:" + newImageName + "}}" + Environment.NewLine + _
                             oldImagePageSource

        'upload image
        UploadImage(TempPath, newImageName, newImagePageSource)

        Pause("Uploaded image """ + newImageName + """")

        'change the references to point to the new image
        Dim newImageTitle As String = "Image:" + newImageName
        For Each pageTitle As String In GetImageLinks(imageName)
            Dim pageSource As String = GetPageSource(pageTitle)
            Dim revisedPageSource As String = WikiReplace(pageSource, "Image:" + imageName, newImageTitle)

            If pageSource = revisedPageSource Then
                Console.WriteLine("Error rewriting """ + pageTitle + """: the image was not found in the code. Press enter to continue processing...")
                Console.ReadLine()
            Else
                SetPageSource(pageTitle, revisedPageSource, "Converted image to PNG", True)
                Pause("Rewrote """ + pageTitle + """")
            End If
        Next

        'make sure all references were removed
        Dim remainingLinks() As String = GetImageLinks(imageName)
        If remainingLinks.Length <> 0 Then
            Console.WriteLine("Ack! " & remainingLinks.Length & " references to the old image still exist! Press enter to resume processing.")
            Console.ReadLine()
        End If

        'rewrite old image page
        SetPageSource("Image:" + imageName, oldImagePageSource, "Marked image as having been replaced by a PNG version")
        Console.WriteLine("Rewrote old image page; finished.")
    End Sub
End Module