User:ImageTaggingBot/tagbot.pl
Appearance
The code for ImageTaggingBot. Requires libBot.pm and Pearle.pm.
#!/usr/bin/perl
# Tagbot
#
# A bot to identify and tag recently-uploaded images that have no image description page, source information, or copyright tag.
#
# Overview of one run:
#   1. Load the tag lists (*.tags) and turn each into a regex alternation.
#   2. Initialise Pearle, load the user lists, and log in.
#   3. Fetch the upload log, drop the 20 most recent entries, and for each
#      remaining upload decide whether the image has a source, a license and
#      a license tag; tag the image page and notify the uploader as needed.

use strict;
use warnings;
use lib '/home/mark/perllib';
use lib '/home/mark/Desktop/wikibots/common';
use Date::Calc qw(Month_to_Text Today);
use Array::Utils;
use utf8;
use Data::Dumper;
use libBot;

binmode STDOUT, ":utf8";

my $permit_interruptions = 0;   # Allow talkpage messages to stop the bot?
my ($cur_y, $cur_m, $cur_d);    # Today's date; filled in during initialisation

my %users_notified;             # List of users notified. 0, undef = no; 1 = notified once; 2 = notified and second notice
my %notifications;              # List of user,image pairs, used to ensure that no user is ever notified about an image twice.
my %dont_notify = ();           # List of users to never notify
my %banned_users = ();          # List of users banned from uploading
my %exempt_users = ();          # List of users exempt from inspection
my %unknown_tags;               # List of tags found that are not in either the "good" or "bad" list

# Each tag list is kept both as the raw array and as a regex alternation
# string built by processTagList().
my @sourcereq_tags;             # List of tags that provide copyright information but not source information
my $sourcereq_tags;
my @nosource_tags;              # List of tags that provide both source and copyright information
my $nosource_tags;
my @deletion_tags;              # List of tags that will eventually lead to the deletion of the image
my $deletion_tags;
my @forbidden_tags;             # List of tags that should never be seen
my $forbidden_tags;
my @deprecated_tags;            # List of tags that shouldn't be used any more
my $deprecated_tags;
my @nontags;                    # List of tags that aren't copyright tags but that appear on image description pages
my $nontags;
my @source_tags;                # List of tags that provide source information but not copyright status
my $source_tags;

# Wikitext appended to an image description page.  The <MONTH>/<DAY>/<YEAR>
# tokens are filled in by tokenSubst().
my $NO_SOURCE_TAG  = "\n{{no copyright holder|month=<MONTH>|day=<DAY>|year=<YEAR>}}";
my $NO_LICENSE_TAG = "\n{{no copyright information|month=<MONTH>|day=<DAY>|year=<YEAR>}}";
my $UNTAGGED_TAG   = "\n{{untagged|month=<MONTH>|day=<DAY>|year=<YEAR>}}";

# Read a tag list file and return its entries as a list.
#
# Everything from a '#' to the end of the line is a comment; leading and
# trailing whitespace is removed; lines that end up empty are skipped.
#
# Dies if the file cannot be opened: running against the live wiki with a
# missing tag list would misclassify every image that carries one of its tags.
sub loadTagList {
    my $filename = shift;
    my @list = ();

    open my $fh, "<", $filename
        or die "Cannot open tag list '$filename': $!\n";
    while (my $line = <$fh>) {
        $line =~ s/#.*//;       # Remove comments
        $line =~ s/^\s+//;      # Remove leading whitespace
        $line =~ s/\s+$//;      # Remove trailing whitespace (including the newline)
        push @list, $line if length $line;
    }
    close $fh;

    return @list;
}

# Turn a list of tag names into a single parenthesised regex alternation.
#
# Only '(', ')' and '.' are escaped, and '*' is treated as a wildcard
# (".*?").  Any other regex metacharacter in a tag name is passed through
# unchanged, so tag files can (and may) rely on regex syntax.
sub processTagList {
    my $tags = join "|", @_;
    $tags =~ s/\(/\\\(/g;
    $tags =~ s/\)/\\\)/g;
    $tags =~ s/\./\\\./g;
    $tags =~ s/\*/.*?/g;
    return "($tags)";
}

@sourcereq_tags = loadTagList("sourcereq.tags");
$sourcereq_tags = processTagList(@sourcereq_tags);
print "Sourcereq: Loaded\n\n";

@nosource_tags = loadTagList("nosource.tags");
$nosource_tags = processTagList(@nosource_tags);
print "Nosource: Loaded\n\n";

@forbidden_tags = loadTagList("forbidden.tags");
$forbidden_tags = processTagList(@forbidden_tags);
print "Forbid: Loaded\n\n";

@deletion_tags = loadTagList("deletion.tags");
$deletion_tags = processTagList(@deletion_tags);
print "Deletion: Loaded\n\n";

@deprecated_tags = loadTagList("deprecated.tags");
$deprecated_tags = processTagList(@deprecated_tags);
print "Deprecated: Loaded\n\n";

@nontags = loadTagList("nontags.tags");
$nontags = processTagList(@nontags);
print "Nontags: Loaded\n\n";

@source_tags = loadTagList("source.tags");
$source_tags = processTagList(@source_tags);
print "Sourcetags: Loaded\n\n";

# Replace the <IMAGE>, <DAY>, <MONTH> and <YEAR> tokens in a string.
#
#   $string - text containing tokens
#   $image  - optional image title; <IMAGE> is left alone when it is undef
#
# Returns the substituted string.
sub tokenSubst {
    my $string = shift;
    my $image = shift;

    $string =~ s/<IMAGE>/$image/g if(defined($image));
    $string =~ s/<DAY>/$cur_d/g;
    $string =~ s/<MONTH>/$cur_m/g;
    $string =~ s/<YEAR>/$cur_y/g;
    return $string;
}

# Read a user list file and return it as a (user => reason) hash.
#
# Each useful line is "<user><TAB(s)><reason>"; '#' starts a comment.
# Lines without a tab are ignored.
#
# If the file cannot be opened the failure is logged and an empty list is
# returned, which is what the unchecked open used to do silently.
sub loadUserList {
    my $file = shift;
    my %notelist;
    my $i = 0;

    Pearle::myLog(4, "File: $file\n");
    if (open my $fh, "<", $file) {
        while (my $line = <$fh>) {
            $line =~ s/\s*#.*$//g;
            chomp $line;
            my ($user, $reason) = $line =~ /([^\t]*)\t+(.*)/;
            next if(!defined($user) or !defined($reason));
            $notelist{$user} = $reason;
            $i++;
        }
        close $fh;
    }
    else {
        Pearle::myLog(1, "Could not open user list $file: $!\n");
    }
    Pearle::myLog(3, "$i notifications loaded\n");
    return %notelist;
}

# True if the page text contains "|<param> = <value>" with at least four
# characters (other than '|' and '}') after the equals sign.  The match is
# case-insensitive and does not check which template the parameter is in.
sub paramIsFilled {
    my ($text, $param) = @_;
    return ($text =~ /\|\s*\Q$param\E\s*=\s*[^|}]{4,}/i) ? 1 : 0;
}

# Append a problem tag to an image page and, unless IsNotified() says the
# uploader should not be told, leave a notice on the uploader's talk page.
#
#   $image          - image page title
#   $uploader       - user name of the uploader
#   $tag_text       - wikitext to add to the image page (already token-substituted)
#   $tag_summary    - edit summary for the image page edit
#   $notice_text    - wikitext to add to the uploader's talk page
#   $notice_summary - edit summary for the talk page edit
#   $result_log     - optional level-2 log line written after the edits
#
# Always finishes with Pearle::limit().
sub tagImageAndNotify {
    my ($image, $uploader, $tag_text, $tag_summary, $notice_text, $notice_summary, $result_log) = @_;

    wikilog($image, $tag_text, $tag_summary);
    if(!IsNotified($uploader, undef, $image, undef, \%dont_notify)) {
        Pearle::myLog(2, "Warning user $uploader\n");
        wikilog("User talk:$uploader", $notice_text, $notice_summary);
    }
    Pearle::myLog(2, $result_log) if(defined($result_log));
    Pearle::limit();
    return;
}

# Initialize
($cur_y, $cur_m, $cur_d) = Today(1);    # Today in GMT
$cur_m = Month_to_Text($cur_m);

Pearle::init("<<BOT USERNAME>>", "<<BOT PASSWORD>>", "tagbot.log","cookies.tagbot.txt");
Pearle::config(nullOK => 1, sanityCheck => 1, loglevel => 3, printlevel => 4);
config(username => '<<BOT USERNAME>>');

Pearle::myLog(2, "Beginning execution\n");

# NOTE(review): loadNotificationList() is not defined in this file (the local
# loader is loadUserList()); presumably it is exported by libBot - confirm.
%dont_notify = loadNotificationList("orphanbot.whitelist");
%banned_users = loadUserList("banneduser.list");
%exempt_users = loadUserList("exemptuser.list");

if(!Pearle::login()) {
    exit;
}

# Get the day's uploads
my @articles = Pearle::getLogArticles(log => 'upload', limit => 150);

# Chop off the 20 most recent log entries
splice @articles, 0, 20;

Pearle::myLog(3, scalar(@articles) . " images found\n");

foreach my $log_entry (@articles) {
    my $image    = $log_entry->[0];
    my $uploader = $log_entry->[1];
    my $summary  = $log_entry->[2] || "";

    print "$image\n";
    print "$uploader\n";
    Pearle::myLog(2, "Processing image $image\n");

    # Basic checks that can be done from the log alone

    # Non-terminating check: Was the image uploaded by a blacklisted user?
    if($banned_users{$uploader}) {
        botwarnlog("*Image [[:$image]] uploaded by blacklisted user [[User:$uploader]]\n");
        Pearle::myLog(3, "Upload by banned user $uploader\n");
    }

    # Terminating check: Is the user on the whitelist?
    if($exempt_users{$uploader}) {
        Pearle::myLog(2, "Upload by exempt user $uploader found.\n");
        next;
    }

    # Terminating check: Is the upload a modification?
    if($summary =~ /optimi(z|s)ed using (optipng|PNGCrusher)/i) {
        Pearle::myLog(2, "Optimize upload found for image $image\n");
        next;
    }
    if($summary =~ /tweak|crop|scale|adjust|change|resize|corrected|correcting/i) {
        Pearle::myLog(2, "Tweak found for image $image\n");
        next;
    }

    # Terminating check: Is the upload a revert?
    if($summary =~ /Reverted to earlier revision|Reverted to version/) {
        Pearle::myLog(2, "Revert upload found for image $image\n");
        next;
    }

    # Get page data
    my $image_data = Pearle::APIQuery(
        titles    => $image,
        prop      => ['templates', 'revisions'],
        tllimit   => 500,           # All the templates
        rvprop    => ['content'],   # Article body
        meta      => 'userinfo',
        uiprop    => ['hasmsg'],    # Check for talkpage messages
        redirects => 1,             # Resolve redirects
    );

    if(!defined($image_data)) {
        Pearle::myLog(1, "Server did not return an appropriate response.\n");
        next;
    }

    my $parsed_xml = Pearle::getXMLParser()->XMLin($image_data, ForceArray => ['tl'] );
    Pearle::myLog(4, Dumper($parsed_xml));

    my $page_text = GetPageText($parsed_xml);
    # Treat "no text returned" as an empty page so the regex checks below
    # never operate on undef.
    $page_text = "" if(!defined($page_text));
    my @templates = GetPageTemplates($parsed_xml);

    # Remove non-tags from template list
    # TODO: Remove redlinks
    @templates = grep {$_ !~ /:$nontags$/i} @templates;

    my $stripped_page_text = $page_text || "";
    $stripped_page_text =~ s/^==.*?==//gm;              # Remove section headers
    $stripped_page_text =~ s/\n//g;                     # Remove newlines
    $stripped_page_text =~ s/\{\{\{[^}]+\}\}\}//g;      # Remove template parameters
    $stripped_page_text =~ s/\{\{[^}]+\}\}//gi;         # Remove templates

    Pearle::myLog(4, "Templates: " . join(", ", @templates) . "\n");
    Pearle::myLog(4, "Stripped text: $stripped_page_text\n");
    print "=============================================================================\n";

    # Check for interruptions
    if($permit_interruptions and DoIHaveMessages($image_data)) {
        Pearle::myLog(0, "Talkpage message found; exiting on image $image.\n");
        last;
    }

    # Sanity check: Does the image still exist?
    if(defined($parsed_xml->{query}->{pages}->{page}->{missing})) {
        Pearle::myLog(2, "Image $image has already been deleted\n");
        next;
    }

    # Sanity check: Is the image marked for deletion?
    if(grep {$_ =~ /:$deletion_tags$/i} @templates) {
        # We don't do anything with images already marked for deletion. There are just too many corner cases and wasted-effort conditions.
        Pearle::myLog(2, "Deletion tag found\n");
        next;
    }

    ######### Check for source, license, and tag ###################
    # Meanings: "undef" = we don't know, "0" = definitely no, "1" = probably yes
    my $has_source = undef;
    my $has_license = undef;
    my $has_tag = undef;

    # Does the image lack a description page?
    if($page_text =~ /^\s*$/) {
        Pearle::myLog(3, "Empty IDP\n");
        $has_source = 0;
        $has_license = 0;
        $has_tag = 0;
    }

    # Does the image have a source-providing template?
    if(my @tags = grep {$_ =~ /:$source_tags$/i} @templates) {
        Pearle::myLog(3, "Source-providing template @tags found\n");
        $has_source = 1;
        # Remove the tags from the candidate set: they can't keep an image from being "untagged" or having unknown tags
        @templates = Array::Utils::array_diff(@templates, @tags);
    }

    # Does the image have a self-sourcing tag?
    if(my @tags = grep {$_ =~ /:$nosource_tags$/i} @templates) {
        Pearle::myLog(3, "Self-sourcing tag @tags found\n");
        $has_source = 1;
        $has_license = 1;
        $has_tag = 1;
    }

    # Does the image have a sourcereq tag?
    if(my @tags = grep {$_ =~ /:$sourcereq_tags$/i} @templates) {
        Pearle::myLog(3, "Sourcereq tag @tags found\n");
        $has_license = 1;
        $has_tag = 1;
    }

    # Handle "Information" and "Non-free use rationale" tags.
    # Each of these templates is removed from the candidate list once seen,
    # and its parameters are used as hints about source and license.
    if(grep {$_ =~ /:Information$/} @templates) {
        Pearle::myLog(3, "Has an Information template\n");
        # Remove the template from the list
        @templates = grep {$_ !~ /:Information$/} @templates;
        # Attempt to parse an "information" template
        if(paramIsFilled($page_text, "source")) {
            # If there's a filled-in "source" parameter, assume a source
            Pearle::myLog(3, "Assuming source in {{Information}}\n");
            $has_source = 1;
        }
        if(paramIsFilled($page_text, "author")) {
            # If there's a filled-in "author" parameter, assume a source
            Pearle::myLog(3, "Assuming source in {{Information}}\n");
            $has_source = 1;
        }
        if(paramIsFilled($page_text, "permission")) {
            # If there's a filled-in "permission" parameter, assume a license (but not a tag)
            Pearle::myLog(3, "Assuming license in {{Information}}\n");
            $has_license = 1;
        }
        if(paramIsFilled($page_text, "flickr_url")) {
            # If there's a filled-in "flickr_url" parameter, assume a source
            Pearle::myLog(3, "Assuming source in {{Flickr}}\n");
            $has_source = 1;
        }
    }

    if(grep {$_ =~ /:Non-free use rationale$/} @templates) {
        Pearle::myLog(3, "Has a non-free use rationale template\n");
        # Remove the template from the list
        @templates = grep {$_ !~ /:Non-free use rationale$/} @templates;
        # Attempt to parse a "non-free use rationale" template
        if(paramIsFilled($page_text, "source")) {
            # If there's a filled-in "source" parameter, assume a source
            Pearle::myLog(3, "Assuming source in {{Non-free use rationale}}\n");
            $has_source = 1;
        }
        $has_license = 1;   # Assume that it's licensed as "fair use"
    }

    if(grep {$_ =~ /:Non-free media rationale$/} @templates) {
        Pearle::myLog(3, "Has a non-free media rationale template\n");
        # Remove the template from the list
        @templates = grep {$_ !~ /:Non-free media rationale$/} @templates;
        # Attempt to parse a "non-free media rationale" template:
        # a filled-in "source", "publisher" or "owner" parameter counts as a source
        foreach my $param ("source", "publisher", "owner") {
            if(paramIsFilled($page_text, $param)) {
                Pearle::myLog(3, "Assuming source in {{Non-free media rationale}}\n");
                $has_source = 1;
            }
        }
        $has_license = 1;   # Assume that it's licensed as "fair use"
    }

    if(grep {$_ =~ /:Non-free image data$/} @templates) {
        Pearle::myLog(3, "Has a non-free image data template\n");
        # Remove the template from the list
        @templates = grep {$_ !~ /:Non-free image data$/} @templates;
        # Attempt to parse a "non-free image data" template
        if(paramIsFilled($page_text, "source")) {
            # If there's a filled-in "source" parameter, assume a source
            Pearle::myLog(3, "Assuming source in {{Non-free image data}}\n");
            $has_source = 1;
        }
        $has_license = 1;   # Assume that it's licensed as "fair use"
    }

    if(grep {$_ =~ /:spoken article entry$/i} @templates) {
        Pearle::myLog(3, "Has a Spoken Article template\n");
        # Remove the template from the list
        @templates = grep {$_ !~ /:Spoken article entry$/i} @templates;
        # Attempt to parse
        if($page_text =~ /\|\s*user_name\s*=[ \t]*\S+/i) {
            # If there's a filled-in "user_name" parameter, assume a source
            Pearle::myLog(3, "Assuming source in {{Spoken article entry}}\n");
            $has_source = 1;
        }
        $has_license = 1;   # Assume that it's GFDL
    }

    # Is the image description page lacking in tags?
    # This is checked here because we may have removed "information" or "non-free use rationale" templates from the list earlier
    # We want those in the list before here because it makes detecting them for parsing easier, but we don't want them in the list
    # here so we can say for sure that the page is untagged.
    if(scalar(@templates) == 0) {
        Pearle::myLog(3, "No templates found\n");
        $has_tag = 0;
    }

    # Does it have source information outside of the templates?
    # Page text with headers, newlines and templates stripped is at least seven characters ("my work")
    if(length($stripped_page_text) >= 7) {
        # TODO: Better source checking
        if(!defined($has_source) or $has_source == 0) {
            Pearle::myLog(3, "Assuming page has source\n");
            $has_source = 1;
        }
        if(!defined($has_license) or $has_license == 0) {
            Pearle::myLog(3, "Assuming page has license\n");
            $has_license = 1;
        }
    }
    else {
        if(!defined($has_source)) {
            # If we still don't know if it has a source, it's safe to assume it doesn't.
            Pearle::myLog(3, "Assuming page doesn't have source\n");
            $has_source = 0;
        }
        if(!defined($has_license)) {
            # If we still don't know if it has a license, we'll assume it doesn't
            Pearle::myLog(3, "Assuming page doesn't have license information\n");
            $has_license = 0;
        }
    }
    # From here on $has_source and $has_license are always defined;
    # $has_tag may still be undef ("only unknown templates").

    ########## Check for exceptional conditions ##########

    # Terminating check: Is the image using a deprecated tag, and doesn't have any other license tag?
    if((!defined($has_tag) or $has_tag != 1) and (grep {$_ =~ /:$deprecated_tags$/i} @templates)) {
        Pearle::myLog(2, "Image has deprecated tag\n");
        # Mark as no-license
        tagImageAndNotify(
            $image, $uploader,
            tokenSubst($NO_LICENSE_TAG),
            "Obsolete or deprecated tag",
            "\n{{subst:User:OrphanBot/deprecated|$image}} --~~~~",
            "Image with obsolete or deprecated license",
        );
        next;
    }

    # Terminating check: Does the image have a forbidden tag?
    if(my @tags = grep {$_ =~ /:$forbidden_tags$/i} @templates) {
        # Doesn't matter what else is on the page, the image requires human handling
        Pearle::myLog(2, "Forbidden tag $tags[0] found on image [[:$image]]\n");
        botwarnlog("*Forbidden tag {{tl|$tags[0]}} found on image [[:$image]]\n");
        Pearle::limit();
        next;
    }

    # TODO: Terminating check: Malformed fair-use rationale

    ########## Process #####################################

    if(!defined($has_tag)) {
        # We've found an image with only unknown templates
        # We don't know if it has any tags or not, and so we cannot deduce the license or source status
        # We know an image doesn't have tags if:
        # * It has no templates
        # * or all templates are on the "nontags" list
        # We know an image has tags if:
        # * We found a tag we know about
        print "Has unknown tags\n";
        my @new_unknown_tags = grep {!defined($unknown_tags{$_})} @templates;
        if(scalar(@new_unknown_tags) > 0) {
            foreach my $unknown_tag (@new_unknown_tags) {
                Pearle::myLog(2, "Unknown tag {{$unknown_tag}} found\n");
                botwarnlog("* Unknown tag [[$unknown_tag]] found\n");
                $unknown_tags{$unknown_tag} = 1;    # Report each unknown tag only once per run
            }
            Pearle::limit();
        }
    }
    elsif($has_tag == 0) {
        if($has_source == 0) {
            if($has_license == 0) {
                # Tag as "no source" and "no license"
                tagImageAndNotify(
                    $image, $uploader,
                    tokenSubst($NO_SOURCE_TAG . $NO_LICENSE_TAG),
                    "Image has no source or license information",
                    tokenSubst("{{subst:User:OrphanBot/nosource nolicense|<IMAGE>}} --~~~~\n", $image),
                    "You've uploaded an image with no source or license information",
                    "No source, no license\n",
                );
            }
            else {
                # No license tag, and it either has a license or we don't know if it has a license
                # Tag as "no source" and "untagged"
                tagImageAndNotify(
                    $image, $uploader,
                    tokenSubst($NO_SOURCE_TAG . $UNTAGGED_TAG),
                    "Image has no source or license tag",
                    tokenSubst("{{subst:User:OrphanBot/nosource untagged|<IMAGE>}} --~~~~\n", $image),
                    "You've uploaded an image with no source or license tag",
                    "No source, untagged\n",
                );
            }
        }
        else {
            if($has_license == 0) {
                # Tag as "no license"
                tagImageAndNotify(
                    $image, $uploader,
                    tokenSubst($NO_LICENSE_TAG),
                    "Image has no license information",
                    tokenSubst("{{subst:User:OrphanBot/nolicense|<IMAGE>}} --~~~~\n", $image),
                    "You've uploaded an image with no license information",
                    "No license\n",
                );
            }
            else {
                # Tag as "untagged"
                tagImageAndNotify(
                    $image, $uploader,
                    tokenSubst($UNTAGGED_TAG),
                    "Image has no license tag",
                    tokenSubst("{{subst:User:OrphanBot/untagged-new|<IMAGE>}} --~~~~\n", $image),
                    "You've uploaded an image with no license tag",
                    "Untagged\n",
                );
            }
        }
    }
    else {
        # If it has a tag, it has a license
        if($has_source == 0) {
            # Tag as "no source"
            tagImageAndNotify(
                $image, $uploader,
                tokenSubst($NO_SOURCE_TAG),
                "Image has no source information",
                tokenSubst("{{subst:User:OrphanBot/nosource-new|<IMAGE>}} --~~~~\n", $image),
                "You've uploaded an image with no source information",
                "No source\n",
            );
        }
        else {
            # Everything's fine
            Pearle::myLog(2, "Image has no problems\n");
        }
    }

    # Pause between images that were processed to the end (skipped images do not reach this point)
    sleep(2);
}

Pearle::myLog(2, "Finished with upload set\n");