Jump to content

User:ImageTaggingBot/tagbot.pl

From Wikipedia, the free encyclopedia

The code for ImageTaggingBot. Requires libBot.pm and Pearle.pm.

#!/usr/bin/perl

# Tagbot
#
# A bot to identify and tag recently-uploaded images that have no image description page, source information, or copyright tag.


use strict;
use warnings;
use lib '/home/mark/perllib';
use lib '/home/mark/Desktop/wikibots/common';

use Date::Calc qw(Month_to_Text Today);
use Array::Utils;
use utf8;

use Data::Dumper;

use libBot;

binmode(STDOUT, ":utf8");

# Allow talkpage messages to stop the bot?
my $permit_interruptions = 0;

# Year, month and day used when filling in <YEAR>/<MONTH>/<DAY> tokens.
my ($cur_y, $cur_m, $cur_d);

# Users notified.  0, undef = no; 1 = notified once; 2 = notified and second notice
my %users_notified;
# User,image pairs, used to ensure that no user is ever notified about an image twice.
my %notifications;
# Users to never notify / users banned from uploading / users exempt from inspection
my (%dont_notify, %banned_users, %exempt_users);

# Tags found that are not in either the "good" or "bad" list
my %unknown_tags;

# Each tag category below is held twice: as the list loaded from its file,
# and as the alternation pattern string built from that list.

# Tags that provide copyright information but not source information
my (@sourcereq_tags, $sourcereq_tags);
# Tags that provide both source and copyright information
my (@nosource_tags, $nosource_tags);
# Tags that will eventually lead to the deletion of the image
my (@deletion_tags, $deletion_tags);
# Tags that should never be seen
my (@forbidden_tags, $forbidden_tags);
# Tags that shouldn't be used any more
my (@deprecated_tags, $deprecated_tags);
# Tags that aren't copyright tags but that appear on image description pages
my (@nontags, $nontags);
# Tags that provide source information but not copyright status
my (@source_tags, $source_tags);

# Read a tag list file and return its entries as a list.
#
# Parameter: the name of the file to read.
# Returns:   one element per non-blank line, with "#" comments and
#            leading/trailing whitespace removed.  If the file cannot be
#            opened, a warning is printed and an empty list is returned.
sub loadTagList
{
        my $filename = shift;
        my @list = ();

        # Lexical handle and checked open: the tag lists are loaded before
        # Pearle::init() is called, so report failures with a plain warn().
        my $fh;
        if(!open($fh, "<", $filename))
        {
                warn "Could not open tag list $filename: $!\n";
                return @list;
        }

        while(my $line = <$fh>)
        {
                $line =~ s/#.*//;       # Remove comments
                $line =~ s/^\s*//;      # Remove leading whitespace
                $line =~ s/\s*$//;      # Remove trailing whitespace (including the newline)
                push @list, $line if($line ne "");
        }
        close $fh;
        return @list;
}

# Turn a list of tag names into a single parenthesised alternation string
# suitable for interpolating into a regular expression.
#
# In each tag, "(", ")" and "." are backslash-escaped and "*" becomes the
# non-greedy wildcard ".*?".  No other characters are altered.
# An empty argument list yields "()".
sub processTagList
{
        my @alternatives = map {
                my $tag = $_;
                $tag =~ s/([().])/\\$1/g;       # Escape the literal characters first...
                $tag =~ s/\*/.*?/g;             # ...so the "." of the wildcard stays unescaped
                $tag;
        } @_;

        return "(" . join("|", @alternatives) . ")";
}


# Load each tag list file and build its alternation pattern, announcing each
# one on STDOUT once it is done.  Each row is:
#   label printed, file name, destination list, destination pattern string
foreach my $tag_set (
        [ "Sourcereq",  "sourcereq.tags",  \@sourcereq_tags,  \$sourcereq_tags  ],
        [ "Nosource",   "nosource.tags",   \@nosource_tags,   \$nosource_tags   ],
        [ "Forbid",     "forbidden.tags",  \@forbidden_tags,  \$forbidden_tags  ],
        [ "Deletion",   "deletion.tags",   \@deletion_tags,   \$deletion_tags   ],
        [ "Deprecated", "deprecated.tags", \@deprecated_tags, \$deprecated_tags ],
        [ "Nontags",    "nontags.tags",    \@nontags,         \$nontags         ],
        [ "Sourcetags", "source.tags",     \@source_tags,     \$source_tags     ],
)
{
        my ($label, $filename, $list_ref, $pattern_ref) = @{$tag_set};

        @{$list_ref} = loadTagList($filename);
        ${$pattern_ref} = processTagList(@{$list_ref});
        print "$label: Loaded\n\n";
}


# Fill in the placeholder tokens of a message template.
#
# Parameters: the template string, and optionally an image name.
# Returns:    the string with every <IMAGE> replaced by the image name (only
#             when one was supplied), and every <DAY>, <MONTH> and <YEAR>
#             replaced by the file-level $cur_d, $cur_m and $cur_y.
sub tokenSubst
{
        my ($text, $image_name) = @_;

        if(defined($image_name))
        {
                $text =~ s/<IMAGE>/$image_name/g;
        }

        # The date tokens are applied one after another, after <IMAGE>, so a
        # token appearing inside an already-substituted value is expanded too.
        foreach my $pair (['DAY', \$cur_d], ['MONTH', \$cur_m], ['YEAR', \$cur_y])
        {
                my ($token, $value_ref) = @{$pair};
                $text =~ s/<$token>/${$value_ref}/g;
        }

        return $text;
}

# Read a tab-separated "user<TAB>reason" file into a hash.
#
# Parameter: the name of the file to read.
# Returns:   a hash (as a flat list) mapping user => reason.  Everything from
#            a "#" to the end of a line is ignored, as are lines that do not
#            contain a tab.  If the file cannot be opened, the failure is
#            logged and an empty hash is returned.
sub loadUserList
{
        my $file = shift;
        my %notelist;
        my $i = 0;
        Pearle::myLog(4, "File: $file\n");

        # Lexical handle and checked open, rather than reading from an
        # unopened bareword handle when the file is missing.
        my $fh;
        if(!open($fh, "<", $file))
        {
                Pearle::myLog(1, "Could not open user list $file: $!\n");
                return %notelist;
        }

        while(my $line = <$fh>)
        {
                $line =~ s/\s*#.*$//g;          # Remove comments and the whitespace before them
                chomp $line;
                # User is everything before the first run of tabs; reason is the rest of the line
                my ($user, $reason) = $line =~ /([^\t]*)\t+(.*)/;
                next if(!defined($user) or !defined($reason));
                $notelist{$user} = $reason;
                $i++;
        }
        close $fh;
        Pearle::myLog(3, "$i notifications loaded\n");
        return %notelist;

}


# Initialize

# Today's date in GMT, with the month converted by Month_to_Text()
($cur_y, $cur_m, $cur_d) = do {
        my ($year, $month, $day) = Today(1);
        ($year, Month_to_Text($month), $day);
};

Pearle::init(
        "<<BOT USERNAME>>",
        "<<BOT PASSWORD>>",
        "tagbot.log",
        "cookies.tagbot.txt",
);
Pearle::config(
        nullOK      => 1,
        sanityCheck => 1,
        loglevel    => 3,
        printlevel  => 4,
);
config(username => '<<BOT USERNAME>>');

Pearle::myLog(2, "Beginning execution\n");

# User lists consulted while processing uploads
%dont_notify  = loadNotificationList("orphanbot.whitelist");
%banned_users = loadUserList("banneduser.list");
%exempt_users = loadUserList("exemptuser.list");

# Nothing can be done without a session; stop here if login fails
exit unless Pearle::login();

# Get the day's uploads
my @articles = Pearle::getLogArticles(log => 'upload', limit => 150);
# Chop off the first 20 log entries (presumably the most recent ones -- this
# relies on the order getLogArticles returns them in)
splice(@articles, 0, 20);
Pearle::myLog(3, scalar(@articles) . " images found\n");


# Helper: does the page text contain a "|<param> = value" template parameter
# whose value is at least four characters long (not counting "|" or "}")?
# The parameter name is matched case-insensitively.
sub templateParamFilled
{
        my ($text, $param) = @_;
        return ($text =~ /\|\s*\Q$param\E\s*=\s*[^|}]{4,}/i) ? 1 : 0;
}

# Helper: add problem tag(s) to an image and warn its uploader.
#
# Named arguments:
#   image, uploader  - the image title and the uploading user
#   tags             - text added to the image page (date tokens are expanded)
#   tag_summary      - summary passed along with the image-page text
#   notice           - text for the uploader's talk page (<IMAGE> and date tokens are expanded)
#   notice_summary   - summary passed along with the talk-page text
#   outcome          - short result description written to the log
#
# The talk-page notice is skipped when IsNotified() reports that the uploader
# should not (or need not) be told.  Pearle::limit() is called at the end,
# presumably to throttle edits -- its behaviour is not visible from this file.
sub flagImageAndWarnUploader
{
        my %arg = @_;
        my $image = $arg{image};
        my $uploader = $arg{uploader};

        wikilog($image, tokenSubst($arg{tags}), $arg{tag_summary});
        if(!IsNotified($uploader, undef, $image, undef, \%dont_notify))
        {
                Pearle::myLog(2, "Warning user $uploader\n");
                wikilog("User talk:$uploader", tokenSubst($arg{notice}, $image), $arg{notice_summary});
        }
        Pearle::myLog(2, "$arg{outcome}\n");
        Pearle::limit();
        return;
}

# Main loop: examine each upload-log entry in turn and decide whether the image
# needs a "no source", "no license" or "untagged" marker.
# Each entry is an array ref of [image title, uploader, upload summary].
foreach my $log_entry (@articles)
{
        my $image = $log_entry->[0];
        my $uploader = $log_entry->[1];
        my $summary = $log_entry->[2] || "";

        print "$image\n";
        print "$uploader\n";

        Pearle::myLog(2, "Processing image $image\n");

        # Basic checks that can be done from the log alone

        # Non-terminating check: Was the image uploaded by a blacklisted user?
        if($banned_users{$uploader})
        {
                botwarnlog("*Image [[:$image]] uploaded by blacklisted user [[User:$uploader]]\n");
                Pearle::myLog(3, "Upload by banned user $uploader\n");
        }

        # Terminating check: Is the user on the whitelist?
        if($exempt_users{$uploader})
        {
                Pearle::myLog(2, "Upload by exempt user $uploader found.\n");
                next;
        }

        # Terminating check: Is the upload a modification?
        if($summary =~ /optimi(z|s)ed using (optipng|PNGCrusher)/i)
        {
                Pearle::myLog(2, "Optimize upload found for image $image\n");
                next;
        }
        if($summary =~ /tweak|crop|scale|adjust|change|resize|corrected|correcting/i)
        {
                Pearle::myLog(2, "Tweak found for image $image\n");
                next;
        }

        # Terminating check: Is the upload a revert?
        if($summary =~ /Reverted to earlier revision|Reverted to version/)
        {
                Pearle::myLog(2, "Revert upload found for image $image\n");
                next;
        }

        # Get page data
        my $image_data = Pearle::APIQuery(titles => $image, prop => ['templates', 'revisions'],
                                          tllimit => 500,                                               # All the templates
                                          rvprop => ['content'],                                        # Article body
                                          meta => 'userinfo', uiprop => ['hasmsg'],                     # Check for talkpage messages
                                          redirects => 1,                                               # Resolve redirects
                                          );

        if(!defined($image_data))
        {
                Pearle::myLog(1, "Server did not return an appropriate response.\n");
                next;
        }
        my $parsed_xml = Pearle::getXMLParser()->XMLin($image_data, ForceArray => ['tl'] );
        Pearle::myLog(4, Dumper($parsed_xml));

        my $page_text = GetPageText($parsed_xml);
        # Treat a missing page body as an empty one, so the pattern matches
        # below never operate on undef.
        $page_text = "" if(!defined($page_text));
        my @templates = GetPageTemplates($parsed_xml);

        # Remove non-tags from template list
        # TODO: Remove redlinks
        @templates = grep {$_ !~ /:$nontags$/i} @templates;

        my $stripped_page_text = $page_text || "";
        $stripped_page_text =~ s/^==.*?==//gm;                  # Remove section headers
        $stripped_page_text =~ s/\n//g;                         # Remove newlines
        # Literal braces are escaped: unescaped "{" in a pattern draws warnings on newer perls
        $stripped_page_text =~ s/\{\{\{[^}]+\}\}\}//g;          # Remove template parameters
        $stripped_page_text =~ s/\{\{[^}]+\}\}//gi;             # Remove templates

        Pearle::myLog(4, "Templates: " . join(", ", @templates) . "\n");
        Pearle::myLog(4, "Stripped text: $stripped_page_text\n");
        print "=============================================================================\n";

        # Check for interruptions
        if($permit_interruptions and DoIHaveMessages($image_data))
        {
                Pearle::myLog(0, "Talkpage message found; exiting on image $image.\n");
                last;
        }

        # Sanity check: Does the image still exist?
        if(defined($parsed_xml->{query}->{pages}->{page}->{missing}))
        {
                Pearle::myLog(2, "Image $image has already been deleted\n");
                next;
        }

        # Sanity check: Is the image marked for deletion?
        if(grep {$_ =~ /:$deletion_tags$/i} @templates)
        {
                # We don't do anything with images already marked for deletion.  There are just too many corner cases and wasted-effort conditions.
                Pearle::myLog(2, "Deletion tag found\n");
                next;
        }

        ######### Check for source, license, and tag ###################

        # Meanings: "undef" = we don't know, "0" = definitely no, "1" = probably yes
        my $has_source = undef;
        my $has_license = undef;
        my $has_tag = undef;

        # Does the image lack a description page?
        if($page_text =~ /^\s*$/)
        {
                Pearle::myLog(3, "Empty IDP\n");
                $has_source = 0;
                $has_license = 0;
                $has_tag = 0;
        }

        # Does the image have a source-providing template?
        if(my @tags = grep {$_ =~ /:$source_tags$/i} @templates)
        {
                Pearle::myLog(3, "Source-providing template @tags found\n");
                $has_source = 1;

                # Remove the tags from the candidate set: they can't keep an image from being "untagged" or having unknown tags
                @templates = Array::Utils::array_diff(@templates, @tags);
        }

        # Does the image have a self-sourcing tag?
        if(my @tags = grep {$_ =~ /:$nosource_tags$/i} @templates)
        {
                Pearle::myLog(3, "Self-sourcing tag @tags found\n");
                $has_source = 1;
                $has_license = 1;
                $has_tag = 1;
        }

        # Does the image have a sourcereq tag?
        if(my @tags = grep {$_ =~ /:$sourcereq_tags$/i} @templates)
        {
                Pearle::myLog(3, "Sourcereq tag @tags found\n");
                $has_license = 1;
                $has_tag = 1;
        }

        # Handle "Information" and the various rationale templates.  Each one is
        # removed from the candidate list and its parameters are looked for in
        # the page text (anywhere on the page, not only inside that template).
        if(grep {$_ =~ /:Information$/} @templates)
        {
                Pearle::myLog(3, "Has an Information template\n");
                # Remove the template from the list
                @templates = grep {$_ !~ /:Information$/} @templates;
                # Attempt to parse an "information" template
                if(templateParamFilled($page_text, 'source'))
                {
                        # If there's a filled-in "source" parameter, assume a source
                        Pearle::myLog(3, "Assuming source in {{Information}}\n");
                        $has_source = 1;
                }

                if(templateParamFilled($page_text, 'author'))
                {
                        # If there's a filled-in "author" parameter, assume a source
                        Pearle::myLog(3, "Assuming source in {{Information}}\n");
                        $has_source = 1;
                }

                if(templateParamFilled($page_text, 'permission'))
                {
                        # If there's a filled-in "permission" parameter, assume a license (but not a tag)
                        Pearle::myLog(3, "Assuming license in {{Information}}\n");
                        $has_license = 1;
                }

                if(templateParamFilled($page_text, 'flickr_url'))
                {
                        # If there's a filled-in "flickr_url" parameter, assume a source
                        Pearle::myLog(3, "Assuming source in {{Flickr}}\n");
                        $has_source = 1;
                }
        }

        if(grep {$_ =~ /:Non-free use rationale$/} @templates)
        {
                Pearle::myLog(3, "Has a non-free use rationale template\n");
                # Remove the template from the list
                @templates = grep {$_ !~ /:Non-free use rationale$/} @templates;
                # Attempt to parse a "non-free use rationale" template
                if(templateParamFilled($page_text, 'source'))
                {
                        # If there's a filled-in "source" parameter, assume a source
                        Pearle::myLog(3, "Assuming source in {{Non-free use rationale}}\n");
                        $has_source = 1;
                }

                $has_license = 1;       # Assume that it's licensed as "fair use"
        }

        if(grep {$_ =~ /:Non-free media rationale$/} @templates)
        {
                # (Log text corrected: it previously named the "use rationale" template)
                Pearle::myLog(3, "Has a non-free media rationale template\n");
                # Remove the template from the list
                @templates = grep {$_ !~ /:Non-free media rationale$/} @templates;
                # Attempt to parse a "non-free media rationale" template
                if(templateParamFilled($page_text, 'source'))
                {
                        # If there's a filled-in "source" parameter, assume a source
                        Pearle::myLog(3, "Assuming source in {{Non-free media rationale}}\n");
                        $has_source = 1;
                }
                if(templateParamFilled($page_text, 'publisher'))
                {
                        # If there's a filled-in "publisher" parameter, assume a source
                        Pearle::myLog(3, "Assuming source in {{Non-free media rationale}}\n");
                        $has_source = 1;
                }
                if(templateParamFilled($page_text, 'owner'))
                {
                        # If there's a filled-in "owner" parameter, assume a source
                        Pearle::myLog(3, "Assuming source in {{Non-free media rationale}}\n");
                        $has_source = 1;
                }

                $has_license = 1;       # Assume that it's licensed as "fair use"
        }

        if(grep {$_ =~ /:Non-free image data$/} @templates)
        {
                Pearle::myLog(3, "Has a non-free image data template\n");
                # Remove the template from the list
                @templates = grep {$_ !~ /:Non-free image data$/} @templates;
                # Attempt to parse a "non-free image data" template
                if(templateParamFilled($page_text, 'source'))
                {
                        # If there's a filled-in "source" parameter, assume a source
                        Pearle::myLog(3, "Assuming source in {{Non-free image data}}\n");
                        $has_source = 1;
                }

                $has_license = 1;       # Assume that it's licensed as "fair use"
        }
        if(grep {$_ =~ /:spoken article entry$/i} @templates)
        {
                Pearle::myLog(3, "Has a Spoken Article template\n");
                # Remove the template from the list
                @templates = grep {$_ !~ /:Spoken article entry$/i} @templates;
                # Attempt to parse
                if($page_text =~ /\|\s*user_name\s*=[ \t]*\S+/i)
                {
                        # If there's a filled-in "user_name" parameter, assume a source
                        Pearle::myLog(3, "Assuming source in {{Spoken article entry}}\n");
                        $has_source = 1;
                }
                $has_license = 1;       # Assume that it's GFDL
        }

        # Is the image description page lacking in tags?
        # This is checked here because we may have removed "information" or "non-free use rationale" templates from the list earlier
        # We want those in the list before here because it makes detecting them for parsing easier, but we don't want them in the list
        # here so we can say for sure that the page is untagged.
        if(scalar(@templates) == 0)
        {
                Pearle::myLog(3, "No templates found\n");
                $has_tag = 0;
        }


        # Does it have source information outside of the templates?
        if(length($stripped_page_text) >= 7) # Page text with headers, newlines and templates stripped is at least seven characters ("my work")
        {
                # TODO: Better source checking
                if(!defined($has_source) or $has_source == 0)
                {
                        Pearle::myLog(3, "Assuming page has source\n");
                        $has_source = 1;
                }
                if(!defined($has_license) or $has_license == 0)
                {
                        Pearle::myLog(3, "Assuming page has license\n");
                        $has_license = 1;
                }
        }
        else
        {
                if(!defined($has_source))
                {
                        # If we still don't know if it has a source, it's safe to assume it doesn't.
                        Pearle::myLog(3, "Assuming page doesn't have source\n");
                        $has_source = 0;
                }
                if(!defined($has_license))
                {
                        # If we still don't know if it has a license, we'll assume it doesn't
                        Pearle::myLog(3, "Assuming page doesn't have license information\n");
                        $has_license = 0;
                }
        }
        # From here on $has_source and $has_license are always defined (0 or 1);
        # only $has_tag may still be undef.

        ########## Check for exceptional conditions ##########

        # Terminating check: Is the image using a deprecated tag, and doesn't have any other license tag?
        # An unknown (undef) $has_tag counts as "no other license tag", as it did before;
        # the explicit defined() test just avoids comparing undef numerically.
        if((!defined($has_tag) or $has_tag != 1) and (grep {$_ =~ /:$deprecated_tags$/i} @templates))
        {
                Pearle::myLog(2, "Image has deprecated tag\n");
                # Mark as no-license
                wikilog($image, tokenSubst("\n{{no copyright information|month=<MONTH>|day=<DAY>|year=<YEAR>}}"), "Obsolete or deprecated tag");
                if(!IsNotified($uploader, undef, $image, undef, \%dont_notify))
                {
                        Pearle::myLog(2, "Warning user $uploader\n");
                        wikilog("User talk:$uploader", "\n{{subst:User:OrphanBot/deprecated|$image}} --~~~~", "Image with obsolete or deprecated license");
                }
                Pearle::limit();
                next;
        }

        # Terminating check: Does the image have a forbidden tag?
        if(my @tags = grep {$_ =~ /:$forbidden_tags$/i} @templates)
        {
                # Doesn't matter what else is on the page, the image requires human handling
                Pearle::myLog(2, "Forbidden tag $tags[0] found on image [[:$image]]\n");
                botwarnlog("*Forbidden tag {{tl|$tags[0]}} found on image [[:$image]]\n");
                Pearle::limit();
                next;
        }

        # TODO: Terminating check: Malformed fair-use rationale

        ########## Process #####################################

        # We've found an image with only unknown templates
        if(!defined($has_tag))
        {
                # We don't know if it has any tags or not, and so we cannot deduce the license or source status
                # We know an image doesn't have tags if:
                # * It has no templates
                # * or all templates are on the "nontags" list
                # We know an image has tags if:
                # * We found a tag we know about
                print "Has unknown tags\n";

                # Report each unknown tag only the first time it is seen in this run
                my @new_unknown_tags = grep {!defined($unknown_tags{$_})} @templates;
                if(scalar(@new_unknown_tags) > 0)
                {
                        foreach my $unknown_tag (@new_unknown_tags)
                        {
                                Pearle::myLog(2, "Unknown tag {{$unknown_tag}} found\n");
                                botwarnlog("* Unknown tag [[$unknown_tag]] found\n");
                                $unknown_tags{$unknown_tag} = 1;
                        }
                        Pearle::limit();
                }
        }
        elsif($has_tag == 0)
        {
                if($has_source == 0)
                {
                        if($has_license == 0)
                        {
                                # Tag as "no source" and "no license"
                                flagImageAndWarnUploader(
                                        image          => $image,
                                        uploader       => $uploader,
                                        tags           => "\n{{no copyright holder|month=<MONTH>|day=<DAY>|year=<YEAR>}}\n{{no copyright information|month=<MONTH>|day=<DAY>|year=<YEAR>}}",
                                        tag_summary    => "Image has no source or license information",
                                        notice         => "{{subst:User:OrphanBot/nosource nolicense|<IMAGE>}} --~~~~\n",
                                        notice_summary => "You've uploaded an image with no source or license information",
                                        outcome        => "No source, no license",
                                );
                        }
                        else
                        {
                                # No license tag, and it either has a license or we don't know if it has a license
                                # Tag as "no source" and "untagged"
                                flagImageAndWarnUploader(
                                        image          => $image,
                                        uploader       => $uploader,
                                        tags           => "\n{{no copyright holder|month=<MONTH>|day=<DAY>|year=<YEAR>}}\n{{untagged|month=<MONTH>|day=<DAY>|year=<YEAR>}}",
                                        tag_summary    => "Image has no source or license tag",
                                        notice         => "{{subst:User:OrphanBot/nosource untagged|<IMAGE>}} --~~~~\n",
                                        notice_summary => "You've uploaded an image with no source or license tag",
                                        outcome        => "No source, untagged",
                                );
                        }
                }
                else
                {
                        if($has_license == 0)
                        {
                                # Tag as "no license"
                                flagImageAndWarnUploader(
                                        image          => $image,
                                        uploader       => $uploader,
                                        tags           => "\n{{no copyright information|month=<MONTH>|day=<DAY>|year=<YEAR>}}",
                                        tag_summary    => "Image has no license information",
                                        notice         => "{{subst:User:OrphanBot/nolicense|<IMAGE>}} --~~~~\n",
                                        notice_summary => "You've uploaded an image with no license information",
                                        outcome        => "No license",
                                );
                        }
                        else
                        {
                                # Tag as "untagged"
                                flagImageAndWarnUploader(
                                        image          => $image,
                                        uploader       => $uploader,
                                        tags           => "\n{{untagged|month=<MONTH>|day=<DAY>|year=<YEAR>}}",
                                        tag_summary    => "Image has no license tag",
                                        notice         => "{{subst:User:OrphanBot/untagged-new|<IMAGE>}} --~~~~\n",
                                        notice_summary => "You've uploaded an image with no license tag",
                                        outcome        => "Untagged",
                                );
                        }
                }
        }
        else
        {
                # If it has a tag, it has a license
                if($has_source == 0)
                {
                        # Tag as "no source"
                        flagImageAndWarnUploader(
                                image          => $image,
                                uploader       => $uploader,
                                tags           => "\n{{no copyright holder|month=<MONTH>|day=<DAY>|year=<YEAR>}}",
                                tag_summary    => "Image has no source information",
                                notice         => "{{subst:User:OrphanBot/nosource-new|<IMAGE>}} --~~~~\n",
                                notice_summary => "You've uploaded an image with no source information",
                                outcome        => "No source",
                        );
                }
                else
                {
                        # Everything's fine
                        Pearle::myLog(2, "Image has no problems\n");
                }
        }

        # Pause between images.  Note that entries skipped with "next" above do not reach this.
        sleep(2);
}

Pearle::myLog(2, "Finished with upload set\n");