Jump to content

User:OrphanBot/libBot.pl

From Wikipedia, the free encyclopedia
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

#!/usr/bin/perl

# libBot: A library of useful routines for running a bot

use strict;
use warnings;

require "libPearle2.pl";

my $test_only = 0;
my $username = "";

sub config
{
	my %params = @_;
	
	$test_only = $params{test_only} if(defined($params{test_only}));
	$username = $params{username} if(defined($params{username}));
}

# Log a warning on the talk page of the bot
sub userwarnlog
{
	my ($text, $editTime, $startTime, $token, $user, $summary, $session);
	$user = $_[1];
	$user = $username if(!defined($user));
	$summary = $_[2];
	$summary = "Logging warning message" if(!defined($summary));
	$session = $_[3];
	
	if(defined($session))
	{
		# We've been handed an editing session
		($text, $editTime, $startTime, $token) = @{$session};
		Pearle::myLog("Warning with existing edit session\n");
	}
	else
	{
		($text, $editTime, $startTime, $token) = Pearle::getPage("User talk:$user");
	}
	
	if($test_only)
	{
		print STDERR $_[0];
		return;
	}
	
	if($text =~ /^#redirect/i)
	{
		userwarnlog("*User talk page [[User talk:$user]] is a redirect\n");
		return;
	}
	$text .= $_[0];
	Pearle::postPage("User talk:$user", $editTime, $startTime, $token, $text, $summary, "no");
	print STDERR $_[0];
}

# Log a notification message to the console
sub notelog
{
	print STDERR @_;
}

# Fix all wikilinks in a string so that they shows as a link, not inline, if it's for a category or image
sub FixupLinks
{
	my $link = shift;
	$link =~ s/\[\[(Category|Image)/[[:$1/g;
	return $link;
}

# Make a string into a Wikipedia-compatible regex
sub MakeWikiRegex
{
	my $string = shift;
	# Escape metacharacters
	$string =~ s/\\/\\\\/g;
	$string =~ s/\./\\\./g;
	$string =~ s/\(/\\\(/g;
	$string =~ s/\)/\\\)/g;
	$string =~ s/\[/\\\[/g;
	$string =~ s/\]/\\\]/g;
	$string =~ s/\+/\\\+/g;
	$string =~ s/\*/\\\*/g;
	$string =~ s/\?/\\\?/g;
	$string =~ s/\^/\\\^/g;
	$string =~ s/\$/\\\$/g;
	# Process the string to match both with spaces and with underscores
	$string =~ s/[ _]/[ _]+/g;

	# Process the string to match both upcase and lowercase first characters
	if($string =~ /^[A-Za-z]/)
	{       
		$string =~ s/^(.)/"[$1".lc($1)."]"/e;
	}
	return $string;
}

# Check for new talk page messages
sub DoIHaveMessages
{
	my $text = shift;
	if($text =~ /<div class="usermessage">You have/)
	{
		return 1;
	}
	else
	{
		return 0;
	}
}


sub GetPageList
{
	my $image = shift;
	my $image_text = shift;
	my @pages = ();
	# Extract the page links
	# <ul><li><a href="/wiki/Lee_Hyori" title="Lee Hyori">Lee Hyori</a></li>
	# <li><a href="/wiki/Daesung_Entertainment" title="Daesung Entertainment">Daesung Entertainment</a></li>
	# </ul>
	while($image_text =~ /<li><a href="(\/wiki\/[^"]+)" title="([^"]+)">/g)
	{
		my $title;
		$title = $2;
		# Unescape any HTML entities in the title
		$title =~ s/</</g;
		$title =~ s/>/>/g;
		$title =~ s/"/"/g;
		$title =~ s/&/&/g;

		notelog("Matched article $title\n");

		# Filter out bad namespaces
		if($title =~ /^(User:|Talk:|User talk:|Template talk:|Image:|Image talk:|Category talk:|Wikipedia:|Wikipedia talk:|Portal talk:)/)	# Leave these alone
		{
			notelog("Ignoring [[$title]] due to namespace\n");
		}
		elsif($title =~ /^Special:/)
		{
			# Ignore Special: pages completely
		}
		elsif($title =~ /^(MediaWiki:|MediaWiki talk:|Template:|Help:|Help talk:)/)		# Log a warning about these, but otherwise leave them alone
		{
			userwarnlog("*Found image [[:$image]] in [[$title]]\n");
		}
		else	# Good namespaces: article, Category:, Portal:
		{
			push @pages, $title;
		}
	}
	return @pages;
}

# Get all pages.  Don't filter for bad namespaces.
sub GetFullPageList
{
	my $image = shift;
	my $image_text = shift;
	my @pages = ();
	# Extract the page links
	# <ul><li><a href="/wiki/Lee_Hyori" title="Lee Hyori">Lee Hyori</a></li>
	# <li><a href="/wiki/Daesung_Entertainment" title="Daesung Entertainment">Daesung Entertainment</a></li>
	# </ul>
	while($image_text =~ /<li><a href="(\/wiki\/[^"]+)" title="([^"]+)">/g)
	{
		my $title;
		$title = $2;
		# Unescape any HTML entities in the title
		$title =~ s/</</g;
		$title =~ s/>/>/g;
		$title =~ s/"/"/g;
		$title =~ s/&/&/g;

		notelog("Matched article $title\n");

		push @pages, $title;
	}
	return @pages;
}

sub SaveImage
{
	my $image = shift;
	my $image_text = shift;
	my $image_path = shift;
	
	my $image_url;
	
	($image_url) = $image_text =~ /<a href="(http:\/\/upload\.wikimedia\.org\/wikipedia\/en\/[^"]+)"/;
	if(defined($image_url))
	{
		my $filename;
		my $image_data;
		notelog("Fetching image $image_url\n");
		($filename) = $image_url =~ /(\/[^\/]+)$/;
		$filename = $image_path . $filename;
		if(! -e $filename)
		{
			if($test_only)
			{
				notelog("Would save to $filename...");
			}
			else
			{
				$image_url = Pearle::urlDecode($image_url);
				$image_data = Pearle::getURL($image_url);
				notelog("Saving to $filename...");
				if(defined($filename) and $filename)
				{
					open OUTFILE, ">", $filename;
					print OUTFILE $image_data;
					close OUTFILE;
					notelog("Image saved\n");
					Pearle::myLog("Image $image saved as $filename\n");
				}
				else
				{
					notelog("Failed\n");
				}
			}
		}
		else
		{
			notelog("File already exists\n");
		}
	}			
}

sub RemoveImageFromPage
{
	my $image = shift;
	my $page = shift;
	my $image_regex = shift;
	my $removal_prefix = shift;
	my $removal_comment = shift;

	my ($text, $editTime, $startTime, $token);
	my ($match1, $match2);
	my $old_length;
	my $new_length;
	my $change_len;
	my $match_len;

	# Fetch an article page
	($text, $editTime, $startTime, $token) = Pearle::getPage($page);
	
	if(!defined($text))
	{
		Pearle::myLog("Error: Bad edit page [[$page]]\n");
		userwarnlog(FixupLinks("*Error: Bad edit page [[$page]]\n"));
		sleep(300);
		return 0;
	}
	
	if($text =~ /^\s*$/)
	{
		# Might be protected instead of empty
		Pearle::myLog("Error: Empty page [[$page]]\n");
		userwarnlog(FixupLinks("*Error: Empty page [[$page]]\n"));
		sleep(300);
		return 0;
	}
	
	if($text =~ /^#redirect/i)
	{
		Pearle::myLog("Redirect found for page [[$page]] (image [[:$image]])\n");
		userwarnlog(FixupLinks("*Redirect found for page [[$page]] (image [[:$image]])\n"));
		return 0;
	}

	# Remove the image
	my $regex3 = "(\\[\\[${image_regex}.*?(\\[\\[.*?\\]\\].*?|)+\\]\\][ \\t]*)";	# Regex to match images
	my $regex3ex = "\\w[ \\t]*${regex3}[ \\t]*\\w";									# Regex to try to spot inline images
	my $regex3c = "<!--.*${regex3}.*-->";											# Regex to spot images in comments
	my $regex3g = "(${image_regex}.*)";												# Regex to match gallery images
	my $regex3gc = "<!--.*${regex3g}-->";											# Regex to spot gallery images in comments
	my ($raw_image) = $image =~ /Image:(.*)/;	
	my $regex4a = "([Cc]over\\s*=\\s*)" . MakeWikiRegex($raw_image);
	my $regex4b = "(image_skyline\\s*=\\s*)" . MakeWikiRegex($raw_image);
	my $regex4i = "(image\\s*=\\s*)" . MakeWikiRegex($raw_image);						# Regex to match "image = " sections in infoboxes
	my $regex4p = "(picture\\s*=\\s*)" . MakeWikiRegex($raw_image);					# Regex to match "picture = " sections in infoboxes

	my $regex4m = "\\[\\[[ _]*[Mm]edia[ _]*:[ _]*" . MakeWikiRegex($raw_image) . "[ _]*\\|([^]]*)\\]\\]";	# Regex to match inline Media: links
	my $regex4g =  "(img\\s*=\\s*)" . MakeWikiRegex($raw_image);	# Regex to match "img = " sections in infoboxes
	Pearle::myLog("Regex 3: $regex3\n");
	notelog("Regex 3: $regex3\n");
	notelog("Regex 3 extended: $regex3ex\n");
	notelog("Regex 3 gallery: $regex3g\n");
	Pearle::myLog("Raw regex: $raw_image\n");
	notelog("Regex 4 Album: $regex4a\n");
	notelog("Regex 4 City: $regex4b\n");
	notelog("Regex 4 Image: $regex4i\n");
	notelog("Regex 4 Media: $regex4m\n");
	notelog("Regex 4 Picture: $regex4p\n");
	notelog("Regex 4 Img: $regex4g\n");
	
	if($text =~ /$regex3ex/)
	{
		Pearle::myLog("Possible inline image in [[$page]]\n");
		userwarnlog(FixupLinks("*Possible inline image [[:$image]] in [[$page]]\n"));
		return 0;	# Can't do gallery matching because that also matches regular images, and odds are, we don't have an infobox
	}
	
	if($text =~ /$regex3c/ or $text =~ /$regex3gc/)
	{
		Pearle::myLog("Image in comment in [[$page]]\n");
#		userwarnlog(FixupLinks("*Image in comment in [[$page]]\n"));
		return 0;	# Can't do gallery matching because that also matches regular images
	}
	
	$text =~ /$regex3/;
	$match_len = length($1);
	$match2 = $text =~ s/$regex3/<!-- $removal_prefix $1 -->/g;

	$new_length = length($text);
	print "Num: $match2 Len: $match_len\n";
	if($match2)
	{
		# If a whole lot of text was removed, log a warning
		if($match_len > (500 + length($image)))
		{
			userwarnlog(FixupLinks("*Long caption of $match_len bytes replaced in [[$page]]\n"));
			if($match_len > (1000 + length($image)))
			{
				notelog("Unusually long caption found.  Exiting.\n");
				Pearle::myLog("Unusually long caption of $match_len found in [[$page]] ($match2 matches).\n");
				exit;
			}
		}
		if($match_len < (4 + length($image)))
		{
			notelog("*Short replacement of $match_len bytes in [[$page]]\n");
			Pearle::myLog("Short replacement of $match_len bytes (min " . (length($image) + 4) . ") in [[$page]] ($match2 matches).  Exiting.\n");
			Pearle::myLog("Text:\n$text\n");
			exit;
		}
		# If many matches, log a warning
		if($match2 > 2)
		{
			Pearle::myLog("More than one match ($match2) in page [[$page]]\n");
#			userwarnlog(FixupLinks("*More than one match ($match2) in page [[$page]]\n"));
		}
		if($match2 > 100)
		{
			Pearle::myLog("Too many matches ($match2) in page [[$page]].  Skipping.\n");
			userwarnlog("Too many matches ($match2) in page [[$page]].  Skipping.\n");
			return 0;
		}
		# If there might be a reference, log a warning
#		if($text =~ /(?:see (?:image|picture|graph|diagram|right|left)|\(left\)|\(right\)|\(below\)|\(above\))/)
#		{
#			Pearle::myLog("Possible image reference in page [[$page]]\n");
#			userwarnlog("*Possible image reference in page [[$page]]\n");
#		}
		if($text =~ /-->\]/)
		{
			Pearle::myLog("Possible bracket mixup in page [[$page]]\n");
			userwarnlog(FixupLinks("*Possible bracket mixup in page [[$page]]\n"));
		}
#		if($text =~ /\[\[(?: |)<!--/)
#		{
#			Pearle::myLog("Possible multiline image in page [[$page]]\n");
#			userwarnlog(FixupLinks("*Possible multiline image in page [[$page]]\n"));
#		}
	}
	elsif($text =~ /<gallery/)
	{
		Pearle::myLog("*Possible image gallery in page [[$page]]\n");
		if($text =~ s/$regex3g/<!-- $removal_prefix $1 -->/)
		{
			$match2 += 1;
		}
	}

	if($match2 > 0)
	{
		if($text =~ /\[\[(?: |)<!--/)
		{
			Pearle::myLog("Possible multiline image in page [[$page]]\n");
			userwarnlog(FixupLinks("*Possible multiline image in page [[$page]]\n"));
		}
	}

	# Infobox removal
	if($text =~ /{{Album[ _]infobox|{{Infobox[ _]Album/i)
	{
		if($text =~ s/$regex4a/$1/)
		{
			Pearle::myLog("*Album infobox in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{Infobox[ _]City/i)
	{
		if($text =~ s/$regex4b/$1/)
		{
			Pearle::myLog("*City infobox in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{Taxobox/i)
	{
		if($text =~ s/$regex4i/$1/)
		{
			Pearle::myLog("*Taxobox in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{NFL[ _]player/i)
	{
		if($text =~ s/$regex4i/$1/i)
		{
			Pearle::myLog("*NFL Playerbox in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{Infobox[ _]President/i)
	{
		if($text =~ s/$regex4i/$1/i)
		{
			Pearle::myLog("*Presidentbox in page [[$page]]\n");
#			userwarnlog("*Presidentbox in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{Infobox[ _]Cricketer/i)
	{
		if($text =~ s/$regex4p/picture = cricket no pic.png/i)
		{
			Pearle::myLog("*Cricketer in page [[$page]]\n");
#			userwarnlog("*Cricketer in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{Infobox[ _]Celebrity/)
	{
		if($text =~ s/$regex4i/$1/i)
		{
			Pearle::myLog("*Celebrity in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{Infobox[ _]Wrestler/)
	{
		if($text =~ s/$regex4i/$1/i)
		{
			Pearle::myLog("*Wrestler in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{Infobox musical artist 2/)
	{
		if($text =~ s/$regex4g/$1/i)
		{
			Pearle::myLog("*InfoMusArt2 in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{Infobox Model/)
	{
		if($text =~ s/$regex4i/$1/i)
		{
			Pearle::myLog("*Model in page [[$page]]\n");
			$match2 += 1;
		}
	}

	if($match2)	# No need to null-edit articles anymore
	{
		if($test_only)
		{
			notelog("Test removal from page succeeded\n");
		}
		else
		{
			# Submit the changes
			Pearle::postPage($page, $editTime, $startTime, $token, $text, $removal_comment, "no");
		}
	}
	
	return ($match2)
}

# Returns 1 if the user has been notified, or a reference to the userpage edit session if they haven't
sub isNotified
{
	my $image_text = shift;
	my $uploader = shift;
	my $image_regex = shift;
	my $image_name = shift;
	my $notes_ref = shift;
	my $donts_ref = shift;

	# Check notification list
	if($notes_ref->{"$uploader,$image_name"})
	{
		notelog("Already notified for this image\n");
		return 1;
	}

	if($donts_ref->{$uploader})
	{
		notelog("On exception list\n");
		Pearle::myLog("On exception list: $uploader\n");
		return 1;
	}
	
	# Check uploader's talkpage
	my ($text, $editTime, $startTime, $token) = Pearle::getPage("User talk:$uploader");
	if($text =~ /$image_regex/)
	{
		notelog("Already notified by someone else\n");
		$donts_ref->{"$uploader,$image_name"} = 1;
		return 1;
	}
	else
	{
		print "Not already notified\n";
		return [$text, $editTime, $startTime, $token];
	}
}

sub isDated
{
	my $image_text = shift;
	if($image_text =~ /\((\d\d?) (\w*) (\d\d\d\d)\)/)	# Dated template
	{
		print "Dated tag $1 $2 $3\n";
		return 1;
	}
	# as of 6 October 2006">
	elsif($image_text =~ /as of (\d\d?) (\w*) (\d\d\d\d)/) # Template borked, working off category
	{
		print "Template borked; category $1 $2 $3\n";
		return 1;
	}
	elsif($image_text =~ /{{{day}}} {{{month}}} \d\d\d\d/ or $image_text =~ /\( 2006\)/)	# Generic template
	{
		print "Generic tag\n";
		return 0;
	}
	else
	{
		print "No tag match\n";
		return 0;
	}
}

# Return the tag date if there is one, the upload date if not
# Returns in (day, month, year) format
sub getDate
{
	my $image_text = shift;
	if($image_text =~ /\((\d\d?) (\w*) (\d\d\d\d)\)/)
	{
		print "Template date $1-$2-$3\n";
		return ($1, $2, $3);
	}
	elsif($image_text =~ /as of (\d\d?) (\w*) (\d\d\d\d)/) # Template borked, working off category
	{
		print "Category date $1-$2-$3\n";
		return ($1, $2, $3);
	}
	elsif($image_text =~ />\d\d?:\d\d, (\d\d?) (\w*) (\d\d\d\d)</)
	{
		print "Upload date $1-$2-$3\n";
#		return ($1, $2, $3);
		# For now, be conservative:
		my ($year, $month, $day) = Today();
		return ($day, Month_to_Text($month), $year);
	}
	else
	{
		print "No date\n";
		return (1, "January", 2006);
	}
}

# Return a list of upload dates
sub getUploadDates
{
	my @dates;
	my $image_text = shift;
	while($image_text =~ />\d\d?:\d\d, (\d\d?) (\w*) (\d\d\d\d)</g)
	{
		push @dates, [$1, $2, $3];
	}
	return @dates;
}

sub getLastEditDate
{
	my ($day, $month, $year);
	my $image = shift;
	
	my @history = Pearle::parseHistory($image);
	(undef, $day, $month, $year) = @{$history[0]};
	
	return ($day, $month, $year);
}

# Find the most recent non-vandal, non-revert uploader
sub getUploader
{
	my $image_text = shift;
	my ($uploader, $dims, $bytes, $comment);
	my @uploaders;
	my $uploader_data;
	my $i = 0;
	
	# title="User:Jamie100">Jamie100</a> (<a href="/wiki/User_talk:Jamie100" title="User talk:Jamie100">Talk</a>) . . 424x216 (25800 bytes) <span class='comment'>(Reverted to earlier revision)</span></li>
	
#	while($image_text =~ />([^<]+?)<\/a> \(<a href="[^"]+?" (?:class="new" |)title="[^"]+?">Talk<\/a>\) \. \. (\d+x\d+) \(([0-9,]+) bytes\)(?: <span class="comment">([^<]*)|)</g)
	while($image_text =~ />([^<]+?)<\/a> \(<a href="[^"]+?" (?:class="new" |)title="[^"]+?">Talk<\/a> \| <a href="[^"]*" title="[^"]*">contribs<\/a>\) \. \. (\d+.+?\d+) \(([0-9,]+) bytes\)(?: <span class="comment">([^<]*)|)</g)
	{
		($uploader, $dims, $bytes, $comment) = ($1, $2, $3, $4);
		$bytes =~ s/,//g;						# Remove commas to turn into a real number
		$comment = "" if(!defined($comment));	# Reduce warnings
		push @uploaders, [$uploader, $dims, $bytes, $comment];
		notelog("Uploader found: $uploader, $dims, $bytes, $comment\n");
		$i++;
		die "Too many uploaders: $i\n" if($i > 100);
	}
	my $max = scalar(@uploaders);
	print $max, "\n";
	for($i = 0; $i < $max; $i++)
	{
		$uploader = $uploaders[$i][0];
		if($uploaders[$i][3] =~ /Reverted/)
		{
			$dims = $uploaders[$i][1];
			$bytes = $uploaders[$i][2];
			notelog("Revert found: $uploader, $dims, $bytes\n");
			$i++;
			while(($dims ne $uploaders[$i][1] or $bytes ne $uploaders[$i][2]) and $i < $max)
			{
				notelog("Reversion data: $uploaders[$i][1], $uploaders[$i][2], $i\n");
				$uploader = $uploaders[$i][0];
				$i++;
			}
		}
		elsif($uploaders[$i][3] =~ /optimi(z|s)|adjust|tweak|scale|crop|change|resize/i)
		{
			notelog("Optimize found.  Skipping.\n");
		}
		else
		{
			notelog("Uploader: $uploader ($i)\n");
			last;
		}
	}
	$uploader = undef if($i >= $max);
	
	print "Uploader: $uploader\n";
	return $uploader;
}

# See if the specified category exists, and if not, create it
sub checkImageCategory
{
	my $cat;
	my ($text, $editTime, $startTime, $token);
	$cat = "Category:Images with unknown source as of $_[0] $_[1] $_[2]";
	
	($text, $editTime, $startTime, $token) = Pearle::getPage($cat);
	if($text !~ /\[\[[Cc]ategory:[Ii]mages with unknown source/)
	{
		$text .= "\n[[Category:Images with unknown source| ]]\n";
		if($test_only)
		{
			notelog("Would create category [[:$cat]]\n");
		}
		else
		{
			Pearle::postPage($cat, $editTime, $startTime, $token, $text, "Created category", "no");
			userwarnlog("*Created category [[:$cat]]\n");
		}
	}
}


sub loadNotificationList
{
	my $file = shift;
	my %notelist;
	my $i = 0;
	notelog("File: $file\n");
	open INFILE, "<", $file;
	while(<INFILE>)
	{
		$_ =~ s/\s*#.*$//g;
		chomp;
		$notelist{$_} = 1;
		$i++;
	}
	close INFILE;
	notelog("$i notifications loaded\n");
	return %notelist;
}

sub saveNotificationList
{
	return if($test_only);
	
	my $file = shift;
	my %notelist = @_;
	my $key;
	
	open OUTFILE, ">", $file;
	foreach $key (keys(%notelist))
	{
		print OUTFILE "$key\n";
	}
	close OUTFILE;
}

1;