User:PNG crusade bot/Source code
Appearance
'PNG crusade bot
'Copyright (C) 2007 English Wikipedia user "Remember the dot"
'This program is free software; you can redistribute it and/or modify
'it under the terms of the GNU General Public License as published by
'the Free Software Foundation; either version 2 of the License, or
'(at your option) any later version.
'This program is distributed in the hope that it will be useful,
'but WITHOUT ANY WARRANTY; without even the implied warranty of
'MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
'GNU General Public License for more details.
'You should have received a copy of the GNU General Public License
'along with this program; if not, write to the Free Software
'Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
''' <summary>
''' The PNG Crusade Bot
''' </summary>
''' <remarks>Requires the project to reference and import System.Drawing and System.Web in addition to the standard references and imports for a VB .NET console program</remarks>
Module PngCrusadeBot
''' <summary>
''' The category to search. Make sure TargetCategory and TargetTemplate match!
''' </summary>
''' <remarks>
''' Main passes this value to GetPagesInCategory to obtain the list of pages to process.
''' Sample values: "Images which should be in PNG format" "Images with inappropriate GIF compression" "Images with inappropriate JPEG compression"
''' </remarks>
Private Const TargetCategory As String = "Images which should be in PNG format"
''' <summary>
''' The template associated with the target category. Make sure TargetCategory and TargetTemplate match!
''' </summary>
''' <remarks>
''' Main strips this template text out of the old image page's source before rewriting the page.
''' Sample values: {{ShouldBePNG}} {{BadGIF}} {{BadJPEG}}
''' </remarks>
Private Const TargetTemplate As String = "{{ShouldBePNG}}"
''' <summary>
''' Reader settings shared by every XML/XHTML download. Main sets ProhibitDtd to False on it before the first request.
''' </summary>
Private XmlReaderSettingsForHTML As New Xml.XmlReaderSettings
''' <summary>
''' Path of the scratch file that the PNG version of each image is saved to. Main deletes it when the run ends.
''' </summary>
''' <remarks>
''' Built from GetTempPath + GetRandomFileName rather than GetTempFileName: GetTempFileName creates a
''' zero-byte file on disk, and appending ".png" to its name would leave that file behind forever.
''' </remarks>
Private TempPath As String = IO.Path.Combine(IO.Path.GetTempPath(), IO.Path.GetRandomFileName() + ".png")
''' <summary>
''' Cookie container attached to every request; Login fills it with the session cookies.
''' </summary>
Private Cookies As New Net.CookieContainer
''' <summary>
''' Sends a GET request for the given address and returns the raw response.
''' </summary>
''' <param name="uri">The absolute address to request.</param>
''' <returns>The response to a request sent with the shared cookie container and the UserAgent string "Mozilla/5.0 (compatible)".</returns>
''' <remarks>Most callers want HttpGet, which returns the response stream directly.</remarks>
Private Function HttpGetResponse(ByVal uri As String) As Net.WebResponse
    Dim getRequest As Net.HttpWebRequest = DirectCast(Net.WebRequest.Create(uri), Net.HttpWebRequest)
    With getRequest
        .CookieContainer = Cookies
        .UserAgent = "Mozilla/5.0 (compatible)"
    End With
    Return getRequest.GetResponse()
End Function
''' <summary>
''' Requests an address over HTTP and returns the body of the response as a stream.
''' </summary>
''' <param name="uri">The absolute address to request.</param>
''' <returns>The response stream obtained from HttpGetResponse.</returns>
Private Function HttpGet(ByVal uri As String) As IO.Stream
    Dim response As Net.WebResponse = HttpGetResponse(uri)
    Return response.GetResponseStream()
End Function
''' <summary>
''' Downloads an XML document over HTTP and returns a navigator for it.
''' </summary>
''' <param name="uri">The absolute address of the XML document.</param>
''' <returns>A navigator created from an in-memory XPathDocument holding the downloaded XML.</returns>
''' <remarks>
''' The reader and the response stream are closed before returning so that the HTTP
''' connection is released instead of being held until garbage collection.
''' </remarks>
Private Function HttpGetXML(ByVal uri As String) As Xml.XPath.XPathNavigator
    Using responseStream As IO.Stream = HttpGet(uri)
        Using reader As Xml.XmlReader = Xml.XmlReader.Create(responseStream, XmlReaderSettingsForHTML)
            'XPathDocument loads the whole document in its constructor, so the
            'navigator stays usable after the reader and stream are disposed
            Return New Xml.XPath.XPathDocument(reader).CreateNavigator()
        End Using
    End Using
End Function
''' <summary>
''' Converts a timestamp attribute into the "time, day month year" form used in the file history text.
''' </summary>
''' <param name="timestampAttribute">A timestamp laid out like "2006-11-05T18:13:50Z".</param>
''' <returns>For the sample above: "18:13:50, 5 November 2006".</returns>
''' <remarks>
''' The day and month are formatted with the invariant culture so that the English month
''' name is produced no matter which regional settings the bot runs under.
''' </remarks>
Private Function WikifyTimestamp(ByVal timestampAttribute As String) As String
    Dim invariant As System.Globalization.CultureInfo = System.Globalization.CultureInfo.InvariantCulture
    Dim timeOfDay As String = timestampAttribute.Substring(11, 8)
    'parsing to Integer drops the leading zero of the day ("05" -> 5)
    Dim dayOfMonth As Integer = Integer.Parse(timestampAttribute.Substring(8, 2), invariant)
    Dim monthNumber As Integer = Integer.Parse(timestampAttribute.Substring(5, 2), invariant)
    Dim yearText As String = timestampAttribute.Substring(0, 4)
    Return timeOfDay + ", " + dayOfMonth.ToString(invariant) + " " + invariant.DateTimeFormat.GetMonthName(monthNumber) + " " + yearText
End Function
''' <summary>
''' From a navigator positioned on an image or ih element, builds one line of revision history as wiki text.
''' </summary>
''' <param name="navigator">Navigator whose current element carries the user, comment, timestamp, width, height and size attributes.</param>
''' <returns>
''' The timestamp, links to the uploader's user page, talk page and contributions, the
''' dimensions and the byte size, followed by the upload comment in italics when there is one.
''' </returns>
Private Function GetRevisionHistoryLine(ByVal navigator As Xml.XPath.XPathNavigator) As String
    Dim invariant As System.Globalization.CultureInfo = System.Globalization.CultureInfo.InvariantCulture
    Dim username As String = navigator.GetAttribute("user", "")
    Dim comment As String = navigator.GetAttribute("comment", "")
    'group the digits of the byte count ("12,345") independently of the machine's regional settings
    Dim sizeText As String = Long.Parse(navigator.GetAttribute("size", ""), invariant).ToString("N0", invariant)
    Dim line As New System.Text.StringBuilder
    line.Append(CStr(WikifyTimestamp(navigator.GetAttribute("timestamp", ""))))
    line.Append(" . . [[User:" + username + "|" + username + "]] ([[User talk:" + username + "|Talk]] | [[Special:Contributions/" + username + "|Contribs]]) . . ")
    line.Append(navigator.GetAttribute("width", "") + "x" + navigator.GetAttribute("height", ""))
    line.Append(" (" + sizeText + " bytes)")
    If comment <> "" Then
        'change "<" to "&lt;" to prevent any tags from working
        'change EOLs to spaces so everything is on one line
        line.Append(" (''" + comment.Replace("<", "&lt;").Replace(vbLf, " ") + "'')")
    End If
    Return line.ToString()
End Function
''' <summary>
''' Retrieves an image from Wikipedia together with its size and upload history
''' </summary>
''' <param name="imageName">The title of the image to retrieve, without the "Image:" prefix</param>
''' <param name="imageSize">Receives the size attribute of the current version of the image</param>
''' <param name="revisionHistory">Receives the revision history as wiki text, one line per version, newest first, separated by line-break tags</param>
''' <returns>The decoded image, or Nothing when the reply contains no page element or no image element for the title</returns>
''' <remarks>
''' Uses the Query API.
''' The image is downloaded into memory first so that the HTTP stream can be closed right away;
''' the returned Image is backed by that in-memory copy.
''' </remarks>
Private Function GetImage(ByVal imageName As String, ByRef imageSize As String, ByRef revisionHistory As String) As Image
    Dim pageNode As Xml.XPath.XPathNavigator = HttpGetXML("http://en.wikipedia.org/w/query.php?format=xml&what=imageinfo&iihistory&iiurl&titles=Image:" + HttpUtility.UrlEncode(imageName)).SelectSingleNode("/yurik/pages/page")
    If pageNode Is Nothing Then Return Nothing 'the caller treats Nothing as "probably an image from the commons"
    Dim currentVersion As Xml.XPath.XPathNavigator = pageNode.SelectSingleNode("image")
    If currentVersion Is Nothing Then Return Nothing 'no local file information; report it the same way instead of crashing
    imageSize = currentVersion.GetAttribute("size", "")
    'download the file into memory, then release the connection
    Dim imageData As New IO.MemoryStream
    Using download As IO.Stream = HttpGet(currentVersion.GetAttribute("url", ""))
        Dim buffer(8191) As Byte
        Dim bytesRead As Integer = download.Read(buffer, 0, buffer.Length)
        While bytesRead > 0
            imageData.Write(buffer, 0, bytesRead)
            bytesRead = download.Read(buffer, 0, buffer.Length)
        End While
    End Using
    imageData.Position = 0
    'GDI+ needs the source stream for the lifetime of the Image, so imageData is deliberately not disposed here
    Dim downloadedImage As Image = Image.FromStream(imageData)
    'determine revision history
    Dim olderLines As New System.Collections.Generic.List(Of String)
    Dim olderVersions As Xml.XPath.XPathNodeIterator = pageNode.Select("imghistory/ih")
    While olderVersions.MoveNext()
        olderLines.Add(CStr(GetRevisionHistoryLine(olderVersions.Current)))
    End While
    'the older revision history lines are old-to-new when they need to be new-to-old
    olderLines.Reverse()
    olderLines.Insert(0, CStr(GetRevisionHistoryLine(currentVersion)))
    'join the lines together, putting in line breaks
    revisionHistory = String.Join("<br />" + Environment.NewLine, olderLines.ToArray())
    Return downloadedImage
End Function
''' <summary>
''' Lists the titles returned by an imagelinks query for an image.
''' </summary>
''' <param name="imageName">The image title without the "Image:" prefix.</param>
''' <returns>One entry per title attribute of the il elements in the reply; empty when there are none.</returns>
''' <remarks>Uses the MediaWiki API; the query asks for at most 500 links.</remarks>
Private Function GetImageLinks(ByVal imageName As String) As String()
    Dim queryUri As String = "http://en.wikipedia.org/w/api.php?action=query&format=xml&list=imagelinks&illimit=500&titles=Image:" + HttpUtility.UrlEncode(imageName)
    Dim titleNodes As Xml.XPath.XPathNodeIterator = HttpGetXML(queryUri).Select("/api/query/imagelinks/il/@title")
    Dim titles(titleNodes.Count - 1) As String
    Dim position As Integer = 0
    'MoveNext both starts the iterator and advances it
    While titleNodes.MoveNext()
        titles(position) = titleNodes.Current.Value
        position += 1
    End While
    Return titles
End Function
''' <summary>
''' Fetches the wiki text of a page.
''' </summary>
''' <param name="pageTitle">The full title of the page, including any namespace prefix.</param>
''' <returns>The text value of the first rev element in the reply.</returns>
''' <remarks>Uses the MediaWiki API. Fails with a NullReferenceException when the reply contains no rev element.</remarks>
Private Function GetPageSource(ByVal pageTitle As String) As String
    Dim queryUri As String = "http://en.wikipedia.org/w/api.php?action=query&prop=revisions&format=xml&rvprop=content&titles=" + HttpUtility.UrlEncode(pageTitle)
    Dim reply As Xml.XPath.XPathNavigator = HttpGetXML(queryUri)
    Dim revisionNode As Xml.XPath.XPathNavigator = reply.SelectSingleNode("/api/query/pages/page/revisions/rev")
    Return revisionNode.Value
End Function
''' <summary>
''' Lists the titles of the pages that the Query API reports for a category.
''' </summary>
''' <param name="categoryTitle">The category title, sent as the cptitle argument of the query.</param>
''' <returns>One entry per /yurik/pages/page/title element of the reply; empty when there are none.</returns>
''' <remarks>Uses the Query API</remarks>
Private Function GetPagesInCategory(ByVal categoryTitle As String) As String()
    Dim queryUri As String = "http://en.wikipedia.org/w/query.php?what=category&format=xml&cptitle=" + HttpUtility.UrlEncode(categoryTitle)
    Dim titleNodes As Xml.XPath.XPathNodeIterator = HttpGetXML(queryUri).Select("/yurik/pages/page/title")
    Dim titles(titleNodes.Count - 1) As String
    Dim position As Integer = 0
    While titleNodes.MoveNext()
        titles(position) = titleNodes.Current.Value
        position += 1
    End While
    Return titles
End Function
''' <summary>
''' Replaces the wiki text of a page by loading its edit form and posting the form back.
''' </summary>
''' <param name="pageTitle">The full title of the page; spaces are sent as underscores.</param>
''' <param name="value">The new wiki text of the page.</param>
''' <param name="editSummary">The edit summary to record.</param>
''' <param name="minor">True to flag the edit as minor.</param>
''' <remarks>
''' Relies on the session cookies stored by Login. The response to the post is not inspected.
''' </remarks>
Private Sub SetPageSource(ByVal pageTitle As String, ByVal value As String, Optional ByVal editSummary As String = "", Optional ByVal minor As Boolean = False)
    Dim requestUriBeginning As String = "http://en.wikipedia.org/w/index.php?title=" + HttpUtility.UrlEncode(pageTitle.Replace(" "c, "_"c)) + "&action="
    'get edit token and edit time from the hidden fields of the edit form
    Dim editToken As String
    Dim editTime As String
    Using editFormStream As IO.Stream = HttpGet(requestUriBeginning + "edit")
        Using reader As Xml.XmlReader = Xml.XmlReader.Create(editFormStream, XmlReaderSettingsForHTML)
            Dim namespaces As New Xml.XmlNamespaceManager(reader.NameTable)
            namespaces.AddNamespace("html", "http://www.w3.org/1999/xhtml")
            Dim navigator As Xml.XPath.XPathNavigator = New Xml.XPath.XPathDocument(reader).CreateNavigator()
            Dim tokenField As Xml.XPath.XPathNavigator = navigator.SelectSingleNode("//html:input[@name='wpEditToken']", namespaces)
            Dim timeField As Xml.XPath.XPathNavigator = navigator.SelectSingleNode("//html:input[@name='wpEdittime']", namespaces)
            If tokenField Is Nothing OrElse timeField Is Nothing Then
                Throw New InvalidOperationException("The edit form for """ + pageTitle + """ does not contain the wpEditToken and wpEdittime fields.")
            End If
            editToken = tokenField.GetAttribute("value", "")
            editTime = timeField.GetAttribute("value", "")
        End Using
    End Using
    'build the form body; every value is URL encoded because characters such as "+" and "&"
    'in a raw value (the edit token included) would be misread by the server
    Dim body As New System.Text.StringBuilder
    body.Append("wpSection=&scrollTop=&wpSave=Save%20page")
    body.Append("&wpEditToken=").Append(HttpUtility.UrlEncode(editToken))
    body.Append("&wpStarttime=").Append(Date.UtcNow.ToString("yyyyMMddHHmmss", System.Globalization.CultureInfo.InvariantCulture))
    body.Append("&wpEdittime=").Append(HttpUtility.UrlEncode(editTime))
    body.Append("&wpSummary=").Append(HttpUtility.UrlEncode(editSummary))
    If minor Then body.Append("&wpMinoredit=1")
    body.Append("&wpTextbox1=").Append(HttpUtility.UrlEncode(value))
    Dim bytes() As Byte = System.Text.Encoding.UTF8.GetBytes(body.ToString())
    'post back data
    Dim request As Net.HttpWebRequest = DirectCast(Net.WebRequest.Create(requestUriBeginning + "submit"), Net.HttpWebRequest)
    request.Method = "POST"
    request.UserAgent = "Mozilla/5.0 (compatible)"
    request.ContentType = "application/x-www-form-urlencoded; encoding=UTF-8"
    request.CookieContainer = Cookies 'contains login info
    request.ContentLength = bytes.Length 'must be the number of bytes sent, not the number of characters
    Using requestStream As IO.Stream = request.GetRequestStream()
        requestStream.Write(bytes, 0, bytes.Length)
    End Using
    request.GetResponse().Close() 'a response is not necessary and it leaks connections to not close it
End Sub
''' <summary>
''' Logs in through the MediaWiki API and keeps the resulting cookies for later requests.
''' </summary>
''' <param name="username">The account name.</param>
''' <param name="password">The account password.</param>
''' <remarks>
''' The cookies of the response end up in the shared Cookies container. The body of the
''' response is not inspected, so a rejected login is not detected here.
''' </remarks>
Private Sub Login(ByVal username As String, ByVal password As String)
    'thanks to http://www.netomatix.com/HttpPostData.aspx
    Dim request As Net.HttpWebRequest = DirectCast(Net.WebRequest.Create("http://en.wikipedia.org/w/api.php"), Net.HttpWebRequest)
    request.Method = "POST"
    request.UserAgent = "Mozilla/5.0 (compatible)"
    request.ContentType = "application/x-www-form-urlencoded"
    request.CookieContainer = Cookies 'store the login info for later
    'URL encode both values: spaces, "&", "+" or "=" in a raw value would corrupt the form data
    Dim requestString As String = "action=login&lgname=" + HttpUtility.UrlEncode(username) + "&lgpassword=" + HttpUtility.UrlEncode(password)
    Dim bytes() As Byte = System.Text.Encoding.UTF8.GetBytes(requestString)
    request.ContentLength = bytes.Length
    Using requestStream As IO.Stream = request.GetRequestStream()
        requestStream.Write(bytes, 0, bytes.Length)
    End Using
    'receiving the response stores the cookies (they contain login tokens); closing it releases the connection
    request.GetResponse().Close()
End Sub
''' <summary>Uploads local image to wiki site.</summary>
''' <param name="filePath">Path of image file.</param>
''' <param name="imageTitle">File name to upload the image under; sent as the filename of the wpUploadFile part.</param>
''' <param name="description">Image description.</param>
''' <remarks>
''' Ported from DotNetWikiBot
'''
''' DotNetWikiBot Framework is available under The MIT License (also called X11 license). Below is the text of the license.
'''
''' Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
'''
''' The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
'''
''' THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
''' </remarks>
Private Sub UploadImage(ByVal filePath As String, ByVal imageTitle As String, ByVal description As String)
    Dim webReq As Net.HttpWebRequest = DirectCast(Net.WebRequest.Create("http://en.wikipedia.org/wiki/Special:Upload"), Net.HttpWebRequest)
    Dim boundary As String = "----------" + DateTime.Now.Ticks.ToString("x")
    webReq.Method = "POST"
    webReq.ContentType = "multipart/form-data; boundary=" + boundary
    webReq.UserAgent = "Mozilla/5.0 (compatible)"
    webReq.CookieContainer = Cookies
    'multipart bodies use CRLF line ends regardless of the platform, so vbCrLf is used rather than Environment.NewLine
    Dim sb As New System.Text.StringBuilder
    Dim ph As String = "--" + boundary + vbCrLf + "Content-Disposition: form-data; name="""
    sb.Append(ph + "wpIgnoreWarning""" + vbCrLf + vbCrLf + "1" + vbCrLf)
    sb.Append(ph + "wpUploadAffirm""" + vbCrLf + vbCrLf + "1" + vbCrLf)
    'NOTE(review): the value "upload bestand" comes from the ported code; presumably only the presence of the field matters - confirm
    sb.Append(ph + "wpUpload""" + vbCrLf + vbCrLf + "upload bestand" + vbCrLf)
    sb.Append(ph + "wpUploadDescription""" + vbCrLf + vbCrLf + description + vbCrLf)
    sb.Append(ph + "wpUploadFile""; filename=""" + imageTitle + """" + vbCrLf + "Content-Type: application/octet-stream" + vbCrLf + vbCrLf)
    Dim postHeaderBytes() As Byte = System.Text.Encoding.UTF8.GetBytes(sb.ToString())
    Dim fileBytes() As Byte = IO.File.ReadAllBytes(filePath)
    'the last boundary carries a trailing "--" to mark the end of the multipart body
    Dim boundaryBytes() As Byte = System.Text.Encoding.ASCII.GetBytes(vbCrLf + "--" + boundary + "--" + vbCrLf)
    webReq.ContentLength = postHeaderBytes.Length + fileBytes.Length + boundaryBytes.Length
    Using reqStream As IO.Stream = webReq.GetRequestStream()
        reqStream.Write(postHeaderBytes, 0, postHeaderBytes.Length)
        reqStream.Write(fileBytes, 0, fileBytes.Length)
        reqStream.Write(boundaryBytes, 0, boundaryBytes.Length)
    End Using
    'the content of the response is not used; closing it releases the connection
    webReq.GetResponse().Close()
End Sub
''' <summary>
''' Stops processing for a while.
''' </summary>
''' <param name="message">The message to write to the screen; ". Sleeping..." is appended to it.</param>
''' <param name="milliseconds">How long to sleep, in milliseconds. Defaults to 10 seconds.</param>
''' <remarks>All bots must pause between edits.</remarks>
Private Sub Pause(ByVal message As String, Optional ByVal milliseconds As Integer = 10000)
    Console.WriteLine(message + ". Sleeping...")
    Threading.Thread.Sleep(milliseconds)
End Sub
''' <summary>
''' Searches a block of text without being case-sensitive and treating "_" as " ", then performs the replacement
''' </summary>
''' <param name="text">The text to search. Only the matched spans change; everything else keeps its case and underscores.</param>
''' <param name="find">The text to look for; "_" and " " are interchangeable and case is ignored.</param>
''' <param name="replace">The text to put in place of every match.</param>
''' <returns>The text with every non-overlapping match replaced; the text itself when it or find is empty.</returns>
''' <remarks>
''' Matches are located in an underscore-normalised copy that has the same length as the
''' original, and the result is assembled from slices of the original. Offsets therefore
''' stay valid even when the replacement is longer or shorter than the match.
''' </remarks>
Private Function WikiReplace(ByVal text As String, ByVal find As String, ByVal replace As String) As String
    If String.IsNullOrEmpty(text) OrElse String.IsNullOrEmpty(find) Then Return text
    'same length as text, so an index into one is an index into the other
    Dim searchable As String = text.Replace("_"c, " "c)
    Dim needle As String = find.Replace("_"c, " "c)
    Dim result As New System.Text.StringBuilder(text.Length)
    Dim copyFrom As Integer = 0
    Dim matchAt As Integer = searchable.IndexOf(needle, StringComparison.OrdinalIgnoreCase)
    While matchAt <> -1
        result.Append(text, copyFrom, matchAt - copyFrom) 'untouched text before the match
        result.Append(replace)
        copyFrom = matchAt + needle.Length 'resume after the match so matches never overlap
        matchAt = searchable.IndexOf(needle, copyFrom, StringComparison.OrdinalIgnoreCase)
    End While
    result.Append(text, copyFrom, text.Length - copyFrom)
    Return result.ToString()
End Function
''' <summary>
''' Searches a block of text without being case-sensitive, then performs the replacement
''' </summary>
''' <param name="text">The text to search. Only the matched spans change; everything else keeps its case.</param>
''' <param name="find">The text to look for, ignoring case.</param>
''' <param name="replace">The text to put in place of every match.</param>
''' <returns>The text with every non-overlapping match replaced; the text itself when it or find is empty.</returns>
''' <remarks>
''' The result is assembled from slices of the original text, so offsets stay valid even
''' when the replacement is longer or shorter than the match (including an empty replacement).
''' </remarks>
Private Function CaseInsensitiveReplace(ByVal text As String, ByVal find As String, ByVal replace As String) As String
    If String.IsNullOrEmpty(text) OrElse String.IsNullOrEmpty(find) Then Return text
    Dim result As New System.Text.StringBuilder(text.Length)
    Dim copyFrom As Integer = 0
    Dim matchAt As Integer = text.IndexOf(find, StringComparison.OrdinalIgnoreCase)
    While matchAt <> -1
        result.Append(text, copyFrom, matchAt - copyFrom) 'untouched text before the match
        result.Append(replace)
        copyFrom = matchAt + find.Length 'resume after the match so matches never overlap
        matchAt = text.IndexOf(find, copyFrom, StringComparison.OrdinalIgnoreCase)
    End While
    result.Append(text, copyFrom, text.Length - copyFrom)
    Return result.ToString()
End Function
''' <summary>
''' Tells whether a page with the given title exists.
''' </summary>
''' <param name="pageTitle">The full title of the page, including any namespace prefix.</param>
''' <returns>True when the page element of the reply carries a non-empty pageid attribute.</returns>
''' <remarks>Uses MediaWiki API</remarks>
Private Function PageExists(ByVal pageTitle As String) As Boolean
    Dim queryUri As String = "http://en.wikipedia.org/w/api.php?action=query&format=xml&titles=" + HttpUtility.UrlEncode(pageTitle)
    Dim pageNode As Xml.XPath.XPathNavigator = HttpGetXML(queryUri).SelectSingleNode("/api/query/pages/page")
    Dim pageId As String = pageNode.GetAttribute("pageid", "")
    Return pageId <> ""
End Function
''' <summary>
''' Entry point: logs in, then walks every page of TargetCategory and offers to replace
''' each image whose PNG version would be smaller.
''' </summary>
''' <remarks>
''' The account password is read from the console instead of being written into the source.
''' A blank line is printed after each page so the log is easy to read.
''' </remarks>
Public Sub Main()
    'set things up
    Net.ServicePointManager.DefaultConnectionLimit = 10 'evil, i know, but it won't work with only 2 connections for some reason. sorry, RFC 2616 section 8.1.4...
    XmlReaderSettingsForHTML.ProhibitDtd = False
    Console.WriteLine("PNG crusade bot version 2007.01.11 (11 Jan 2007)")
    Console.WriteLine("Copyright (C) English Wikipedia user ""Remember the dot""")
    Console.WriteLine("The PNG crusade bot comes with ABSOLUTELY NO WARRANTY")
    Console.WriteLine("This is free software, and you are welcome to redistribute it")
    Console.WriteLine("under the terms of the GNU General Public License.")
    Console.WriteLine()
    'start!
    Console.Write("Password for ""PNG crusade bot"": ")
    Login("PNG crusade bot", Console.ReadLine())
    For Each categoryPageTitle As String In GetPagesInCategory(TargetCategory)
        ProcessCategoryPage(categoryPageTitle)
        Console.WriteLine() 'put a line break between each image
    Next
    IO.File.Delete(TempPath)
    Console.WriteLine("All done! Press enter to exit.")
    Console.ReadLine() 'allow the log to be read
End Sub
''' <summary>
''' Handles one page of the target category: converts the image to PNG and, when that saves
''' space, uploads it, repoints the pages that use the old image and marks the old image page.
''' </summary>
''' <param name="categoryPageTitle">The title of the page as listed in the category, e.g. "Image:Example.gif".</param>
Private Sub ProcessCategoryPage(ByVal categoryPageTitle As String)
    If categoryPageTitle = "Wikipedia:Template messages/Image namespace" Then Return 'skip the 1 page in this category
    If Not categoryPageTitle.StartsWith("Image:", StringComparison.Ordinal) Then
        'cutting off the prefix below would mangle any other kind of title
        Console.WriteLine("""" + categoryPageTitle + """ is not an image page. Moving to next page.")
        Return
    End If
    Dim imageName As String = categoryPageTitle.Substring("Image:".Length)
    Console.WriteLine("Looking at """ + imageName + """...")
    If PageExists("Image talk:" + imageName) Then
        Console.WriteLine("An image talk page exists. This bot is not designed to handle image talk pages. Moving to next image.")
        Return
    End If
    Dim imageSizeText As String = Nothing
    Dim imageRevisionHistory As String = ""
    Dim sourceImage As Image = GetImage(imageName, imageSizeText, imageRevisionHistory)
    If sourceImage Is Nothing Then
        Console.WriteLine("This image appears to be from the Wikimedia commons. This bot is not designed to handle images on the Wikimedia commons.")
        Return
    End If
    'the image is only needed for the conversion, so release it as soon as the PNG is written
    Try
        sourceImage.Save(TempPath, Imaging.ImageFormat.Png)
    Finally
        sourceImage.Dispose()
    End Try
    Dim imageSize As Long = Long.Parse(imageSizeText, System.Globalization.CultureInfo.InvariantCulture)
    Dim pngFileSize As Long = New IO.FileInfo(TempPath).Length
    If pngFileSize >= imageSize Then 'the PNG version is not smaller
        Console.WriteLine("This image could not save anything.")
        Return
    End If
    Console.WriteLine("This image could save " & (imageSize - pngFileSize) & " bytes. Gathering data...")
    Dim newImageName As String = InputBox("What is the new name for """ + imageName + """? Do not include the "".png"" extension.", "New image name", IO.Path.GetFileNameWithoutExtension(imageName)) + ".png"
    If newImageName = ".png" Then 'if the input was an empty string or the user pressed Cancel
        Console.WriteLine("Aborted. Moving on.")
        Return
    End If
    'create variables for (re)writing image pages
    'first take the template out of the existing page: together with a neighbouring line break where
    'there is one, otherwise leaving a line break in its place
    Dim oldImagePageSource As String = CaseInsensitiveReplace(CaseInsensitiveReplace(CaseInsensitiveReplace(GetPageSource("Image:" + imageName), vbLf + TargetTemplate, ""), TargetTemplate + vbLf, ""), TargetTemplate, vbLf)
    'the page for the PNG version
    Dim newImagePageSource As String = "__NOTOC__" + Environment.NewLine + _
        oldImagePageSource + Environment.NewLine + _
        "== Automatically converted to PNG ==" + Environment.NewLine + _
        "The [[User:PNG crusade bot|PNG crusade bot]] automatically converted this image to the more efficient [[PNG]] format. The image was originally uploaded as """ + imageName + """." + Environment.NewLine + _
        "=== Previous file history ===" + Environment.NewLine + _
        imageRevisionHistory
    'the page for the old version
    oldImagePageSource = "{{PNG version available|" + newImageName + "}}" + Environment.NewLine + _
        "{{subst:orfur|Image:" + newImageName + "}}" + Environment.NewLine + _
        oldImagePageSource
    'upload image
    UploadImage(TempPath, newImageName, newImagePageSource)
    Pause("Uploaded image """ + newImageName + """")
    'change the references to point to the new image
    Dim newImageTitle As String = "Image:" + newImageName
    For Each pageTitle As String In GetImageLinks(imageName)
        Dim pageSource As String = GetPageSource(pageTitle)
        Dim revisedPageSource As String = WikiReplace(pageSource, "Image:" + imageName, newImageTitle)
        If pageSource = revisedPageSource Then
            Console.WriteLine("Error rewriting """ + pageTitle + """: the image was not found in the code. Press enter to continue processing...")
            Console.ReadLine()
        Else
            SetPageSource(pageTitle, revisedPageSource, "Converted image to PNG", True)
            Pause("Rewrote """ + pageTitle + """")
        End If
    Next
    'make sure all references were removed
    Dim remainingLinks() As String = GetImageLinks(imageName)
    If remainingLinks.Length <> 0 Then
        Console.WriteLine("Ack! " & remainingLinks.Length & " references to the old image still exist! Press enter to resume processing.")
        Console.ReadLine()
    End If
    'rewrite old image page
    SetPageSource("Image:" + imageName, oldImagePageSource, "Marked image as having been replaced by a PNG version")
    Console.WriteLine("Rewrote old image page; finished.")
End Sub
End Module