User:HBC Archive Indexerbot/source

Main code

# This script is released under the GFDL license, see
# http://en.wikipedia.org/w/index.php?title=User:HBC Archive Indexerbot/source&action=history
# for a full list of contributors

### Configuration ###
# Time to sleep between writes (in seconds)
my $write_delay = 5;
# Max pages to download at once
my $download_max = 25;

# Default template
my $default_template = 'User:HBC Archive Indexerbot/default template';

# Cache paths
my $cache = 'cache';
my $wiki_cache = "$cache/wiki";
my $obj_cache = "$cache/obj";
my $report_cache = "$cache/reports";
### End Configuration ###

use strict;
use warnings;
use Data::Dumper;
use Date::Parse;
use Digest::SHA1 qw(sha1_hex);
use Encode qw(decode_utf8 encode_utf8);
use HTML::Entities qw(encode_entities);
use IO::Socket;
use MediaWiki;
use POSIX qw(strftime);
use Storable;
use Time::Duration;
use Time::Local;
use URI::Escape;
use XML::Simple;


my($log_file,$pages_watched,$pages_downloaded,$pages_attempted,$dl,$ul) = ('',0,0,0,0,0);
my $nowiki = 'nowiki';
my $start_time = undef;
#########################################
# Log into Wikipedia                    #
#########################################
die "Cache directories must be created in advance\n" unless (-d $cache && -d $wiki_cache && -d $obj_cache);
open(PASS,'password');                  # A file with only the password, no carriage return
sysread(PASS, my $password, -s(PASS));  # No password in sourcecode.
close(PASS);
writelog                  ('Connecting to Wikipedia');
my $c                  =   MediaWiki->new;
$c->setup
                        ({
                          'bot' => {'user' => 'HBC Archive Indexerbot','pass' => $password},
                          'wiki' => {'host' => 'en.wikipedia.org','path' => 'w'}
                        }) || die 'Failed to log in';
my $whoami              =  $c->user();
writelog                 ($whoami.' connected');

#########################################
# Update cache to modern state          #
#########################################
writelog                 ('Gathering jobs');
my @master_job_list     =  gather_jobs();
my @post_jobs           = @master_job_list;


writelog                  (scalar(@master_job_list).' jobs found');
writelog                 ('Parsing cache');
my $rh_cache_data       =  parse_cache();
writelog                 ('done');
writelog                 ('Parsing watchlist');
my $rh_modified_data    =  parse_watchlist();
writelog                 ('done');
download_pages(find_updated());
download_pages(find_templates(@master_job_list));
#push                     (@needed,&find_holes());
                           fetch_pages(@master_job_list);
writelog                 ("$pages_watched pages added to watchlist.");
writelog                 ("$pages_downloaded out of $pages_attempted downloaded.");
#########################################
# Parse cached data and create reports  #
#########################################

writelog ('Creating reports');
foreach my $ra_job (@post_jobs) {
  my $page = decode_utf8(encode_utf8($ra_job->{'page'}));
  my $dest = decode_utf8(encode_utf8($ra_job->{'target'}));
  my $dest_escaped = _escape($dest);
  my $mask = decode_utf8(encode_utf8(join(', ', @{$ra_job->{'mask'}})));
  my $index_here = $ra_job->{'indexhere'};
  unless (check_text(1000,$dest_escaped))
    {
    writelog ('Not writing to [['.$dest.']] as I cannot find permission (sourced from: [['.$page.']])');
    next;
    }
  my $report = create_report($ra_job);
  open(REPORT, ">$report_cache/$dest_escaped.$$");
  use bytes;
  print REPORT $report;
  close(REPORT);
  if (-e "$report_cache/$dest_escaped") {
    my $result = `diff --brief "$report_cache/$dest_escaped" "$report_cache/$dest_escaped.$$"`;
    unless ($result) {
      writelog ('No change, skipping [['.encode_entities($dest).']]');
      unlink "$report_cache/$dest_escaped.$$";
      next;
    }
  }
  $c->login();
  writelog ('Writing report at [['.encode_entities($dest).']]');
  my $edit_summary = "Writing index of archives in " . encode_entities($mask) . " due to request from [[ " . encode_entities($page) . "]] - Bot edit";
  my $result = send_report($dest,$report,$edit_summary);
  if ($result) {
      rename "$report_cache/$dest_escaped.$$", "$report_cache/$dest_escaped";
  } else {
      unlink("$report_cache/$dest_escaped.$$");
  }
}

$ul += 120 + length($log_file);
writelog ('Complete, downloaded {{formatnum:'.int($dl/1024).'}} kilobyte(s) and uploaded {{formatnum:'.int($ul/1024).'}} kilobyte(s) (figures approximate)');
&post_log();
exit;

#########################################
# Subroutines                           #
#########################################

sub check_text {
  my $bytes = shift;
  my $page = shift;

  my $host = 'en.wikipedia.org';
  my $path = "/w/index.php?title=$page&action=raw";
  my $sock         = new IO::Socket::INET
      (
       PeerAddr    => $host,
       PeerPort    => 80,
       Proto       => 'tcp',
      );
  return 0 unless ($sock);
  my $header = ('GET http://'.$host.$path.' HTTP/1.1'."\r\n".'User-Agent: HBC Archive Indexerbot 0.9a'."\r\n\r\n");
  syswrite ($sock, $header);
  my($buf , $content, $done);
  while (!$done)
    {
    ($done = 1) unless sysread($sock, $buf, $bytes);
    $content .= $buf;
    if ((length($content) >= $bytes) || ($content =~ m|!-- HBC Archive Indexerbot can blank this --|))
      {
      $done = 1;
      }
    }
  close($sock);
  $dl += length($content);
  return ($content =~ m|!-- HBC Archive Indexerbot can blank this --|);
}

sub create_report {
  my ($ra_job) = @_;
  my ($rh_index, $numbered_links) = index_headings($ra_job);
  my $template = get_template($ra_job->{'template'});
  my $report = sprintf("%sThis report has been generated because of a request at [[%s]]. It covers the archives that match '''%s'''\n<br/>Report generated at ~~~~~ by ~~~\n----\n\n",
                       $template->{'lead'}, $ra_job->{'page'}, join(', ', @{$ra_job->{'mask'}}));
  $report .= $template->{'header'};
  my $i = 0;
  foreach my $key (sort {lc($a) cmp lc($b) || $rh_index->{$a}->{'root_path'} cmp $rh_index->{$b}->{'root_path'}} (keys(%{$rh_index}))) {
    $rh_index->{$key}->{'topic'} =~ s:({{.*?}}|[|!]{2}):<$nowiki>$1</$nowiki>:g;
    my $row = $template->{'row'};
    if ($template->{'altrow'}) {
      unless ($i++ % 2 == 0) {
        $row = $template->{'altrow'}
      }
    }
    foreach ('topic','replies','link','first','last','duration',
             'firstepoch','lastepoch','durationsecs') {
      $row =~ s:%%$_%%:${$rh_index}{$key}{$_}:gi;
    }
    $report .= $row;
  }
  $report .= sprintf("%s\n%s", $template->{'footer'}, $template->{'tail'});
  return $report;
}

sub download_pages {
  my (@pages) = @_;
  return unless @pages;
  my $requests = scalar(@pages);

  my (@received_names);

  while (@pages) {
    my @batch;
    while ((scalar(@batch) < 50) && @pages) {
      my $item = shift(@pages) || last;
      $item = _underscore($item);
      push (@batch, $item);
    }
    $pages_attempted += scalar(@batch);
    my $xml_code = $c->special_export(@batch);
    $dl += length($xml_code);
    my $xml_result = XMLin($xml_code);
    next unless ($xml_result->{'page'});
    if ($xml_result->{'page'}{'title'}) {
      push (@received_names, handle_chunk($xml_result->{'page'}));
    } else {
      foreach my $key (keys %{$xml_result->{'page'}}) {
        push (@received_names, handle_chunk($xml_result->{'page'}->{$key}));
      }
    }
  }
  writelog('Downloaded '.scalar(@received_names)." pages from $requests requests");
  return (@received_names);
}

sub fetch_pages {
  my (@jobs) = @_;

  my (@cache_names) = keys(%$rh_cache_data);
  foreach my $ra_job (@jobs) {
    my @fetch;

    if ($ra_job->{'indexhere'}) {
      my $page = _underscore($ra_job->{'page'});
      push(@fetch, $ra_job->{'page'}) unless (defined($rh_cache_data->{$page}));
    }

    my $fetch_size = 0;
    foreach my $mask (@{$ra_job->{'mask'}}) {
      if ($mask =~ m|<#>|) {
        $fetch_size += 10;
        my $pattern = _underscore($mask);
        my ($part1, $part2) = split(m|<#>|, $pattern, 2);
        $pattern = qr/\Q$part1\E(\d+)/;
        $pattern .= qr/\Q$part2\E/ if $part2;
        my $leading_zeros = $ra_job->{'leading_zeros'}+1;
        my $marker = '%d';
        $marker = '%0'.$leading_zeros.'d' if ($leading_zeros > 1);
        my $printf_pattern = $mask;
        $printf_pattern =~ s|<#>|$marker|;
        my (@mask_pages) = grep(/^$pattern/,@cache_names);
        my $largest = 0;
        foreach my $key (@mask_pages) {
          ($key =~ m|$pattern|) || next;
          $largest = $1 if ($1 > $largest);
        }
        my $count = $largest;
        my (@pages);
        until ($count >= ($largest + $fetch_size)) {
          $count++;
          my $page_name = sprintf($printf_pattern, $count);
          push(@fetch,$page_name);
        }
      # MONTHLY: elsif here for the <date> or whatever is used
      } else {
        my $check = _underscore($mask);
        push (@fetch, $mask) unless (defined($rh_cache_data->{$check}));
      }
    } continue {
      if (scalar(@fetch)) {
        my (@received) = download_pages(@fetch);
        $rh_cache_data = parse_cache();
        (@cache_names) = keys(%$rh_cache_data);
        if (scalar(@fetch) == scalar(@received)) {
          @fetch = ();
          redo;
        } else {
          @fetch = ();
        }
      }
      $fetch_size = 0;
    }
  }
}

sub find_holes  # This sub will find gaps in the archive (maybe a page was deleted then restored) and
  {             # add them to the list of potentially needed pages
  return();
  }
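
# Illustrative sketch only (not called by the bot): one way to detect gaps is
# to scan the cached archive names for each numbered mask and collect every
# number missing below the largest archive already cached (leading_zeros
# handling omitted for brevity).
sub find_holes_sketch
  {
  my (@holes);
  foreach my $ra_job (@master_job_list) {
    foreach my $mask (@{$ra_job->{'mask'}}) {
      next unless ($mask =~ m|<#>|);
      my ($part1) = split(m|<#>|, _underscore($mask), 2);
      my (%seen, $largest);
      foreach my $page (keys(%{$rh_cache_data})) {
        next unless ($page =~ m|^\Q$part1\E(\d+)|);
        $seen{$1 + 0}++;
        $largest = $1 if (!defined($largest) || $1 > $largest);
      }
      next unless (defined($largest));
      for my $n (1 .. $largest) {
        next if ($seen{$n});
        my $page_name = $mask;
        $page_name =~ s|<#>|$n|;
        push(@holes, $page_name);
      }
    }
  }
  return @holes;
  }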

sub find_templates {
  my (@jobs) = @_;
  my %templates;
  my @templates_needed;
  foreach my $ra_job (@jobs) {
    $templates{$ra_job->{'template'}}++;
  }
  foreach my $template (keys %templates) {
    $template = $default_template if $template eq 'default';
    my $tmpl_under = _underscore($template);
    push(@templates_needed, $template) unless defined($rh_cache_data->{$tmpl_under});
  }
  writelog (scalar(@templates_needed).' templates needed');
  return @templates_needed;
}

sub find_updated # Find items that have changed
  {
  my(@need_update);
  foreach my $page (keys(%{$rh_cache_data})) {
    if ($rh_modified_data->{$page}) { # If it's not on the watchlist, it hasn't
                                      # been modified in the past month, ignore
      if ($rh_cache_data->{$page} < ${$rh_modified_data}{$page}) {
        push(@need_update,$page);
        my $fname = ("$wiki_cache/".uri_escape_utf8($page).' '.$rh_cache_data->{$page});
        unlink($fname); # Remove old item
      }
    }
  }
  writelog (scalar(@need_update).' pages need updating');
  return @need_update;
  }

sub gather_jobs
  {
  my (@jobs);
  my $html_list         =  $c->{ua}->get($c->{index}."?title=Special:Whatlinkshere/User:HBC Archive Indexerbot/OptIn&limit=5000")->content();
  $dl += length($html_list);
  my @targets;
  while ($html_list =~ s|>([^<]*?)</a> \(transclusion\)||)
    {
    push(@targets,$1);
    }
  my $xml_source = XMLin($c->special_export(@targets));
  my $xml = $xml_source;
  $dl += length($xml_source);
  my $rh_pages = ${$xml}{'page'};
  my %targets;
  foreach my $key (keys(%{$rh_pages})) {
    my $content = ${$rh_pages}{$key}{'revision'}{'text'}{'content'};
    if ($content =~ m"\Q{{User:HBC Archive Indexerbot/OptIn\E\s*\|(.+?)\s*\Q}}\E"s) {
      my @params = split(/\s*\|\s*/, $1);
      my %job = ( page => $rh_pages->{$key}{'title'}, leading_zeros => 0 );
      foreach my $param (@params) {
        my ($key, $value) = split(/\s*=\s*/, $param);
        next unless ($key && defined($value));

        $value =~ s:^\.?/:$job{'page'}/:;

        if ($key eq 'target') {
          $job{'target'} = $value;
        } elsif ($key eq 'mask') {
          next unless $value;
          push (@{$job{'mask'}}, $value);
        } elsif ($key =~ /leading_zeroe?s/) {
          if ($value =~ m/^(\d+)$/) {
            $job{'leading_zeros'} = $1;
          }
        } elsif ($key eq 'indexhere') {
          $job{'indexhere'} = (($value =~ m|ye?s?|i) ? ('1') : ('0'));
        } elsif ($key eq 'template') {
          $job{'template'} = $value;
        }

      }
      $job{'template'} = 'default' unless $job{'template'};
      $job{'template'} = 'default' if $job{'template'} eq 'template location';

      next unless ($job{'target'} && $job{'mask'});

      if ($targets{$job{'target'}}) {
        writelog("Request on [[$job{'page'}]] duplicates target [[$job{'target'}]]; skipping");
        next;
      } else {
        $targets{$job{'target'}}++;
      }

      push(@jobs,\%job);
    }
  }
  return @jobs;
  }

sub get_template {
  my ($template) = (@_);

  if ($template eq 'default') {
    $template = $default_template;
  }

  my $tmpl_fn = _escape($template);
  my ($file) = glob("$wiki_cache/$tmpl_fn*");
  unless ($file) {
    if ($template eq $default_template) {
      die "$template missing from cache\n";
    } else {
      return get_template('default');
    }
  }
  open(TMPL, $file);
  my @content = <TMPL>;
  close(TMPL);

  my %template = (lead => '', header => '', row => '', altrow => '',
                  footer => '', tail => '');
  my $section = '';
  foreach my $line (@content) {
    chomp $line;
    if ($line =~ m:^<!--\s*(.*?)\s*-->$:) {
      $section = lc($1);
      $section =~ s/\s+//g;
      last if $section eq 'end';
    } else {
      if ($section) {
        next unless $line;
        $template{$section} .= "$line\n";
      }
    }
  }
  $template{'lead'} .= "\n" if $template{'lead'};

  unless ($template{'row'}) {
    die "Default template missing 'row' parameter!\n" if $template eq $default_template;
    writelog("Invalid template: '$template', using default instead");
    return get_template('default');
  }

  return \%template;
}
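
# For reference, get_template() expects the template wiki page to be divided
# into sections by HTML-comment markers, one per line, ending with an END
# marker. A hypothetical minimal template (not the actual contents of
# User:HBC Archive Indexerbot/default template) might look like:
#
#   <!-- LEAD -->
#   Index of archived discussions.
#   <!-- HEADER -->
#   {| class="wikitable sortable"
#   ! Topic !! Replies !! Archive !! First comment !! Last comment !! Duration
#   <!-- ROW -->
#   |-
#   | %%topic%% || %%replies%% || %%link%% || %%first%% || %%last%% || %%duration%%
#   <!-- FOOTER -->
#   |}
#   <!-- END -->
#
# create_report() then substitutes the %%field%% placeholders (topic, replies,
# link, first, last, duration, firstepoch, lastepoch, durationsecs) into each
# ROW (or ALTROW) line.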

sub handle_chunk {
  my $chunk = shift;
  my $name = _underscore(${$chunk}{'title'});
  my $fname = "$wiki_cache/".uri_escape_utf8($name);
  ${$chunk}{'revision'}{'timestamp'} =~ m|(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})Z|;
  my $time = timegm($6,$5,$4,$3,$2-1,$1);
  watch($name) unless (${$rh_cache_data}{$name});
  open(OUT,">$fname $time");
  binmode(OUT);
  use bytes;
  print OUT (${$chunk}{'revision'}{'text'}{'content'});
  no bytes;
  close(OUT);
  $pages_downloaded++;
  return $name;
}

sub index_headings {
  my ($ra_job) = @_;

  my $mask_re = '';
  foreach my $mask (@{$ra_job->{'mask'}}) {
    my $mask2 = _escape($mask);
    if ($mask2 =~ m|%3C%23%3E|) {
      my ($part1, $part2) = split(m|%3C%23%3E|, $mask2, 2);
      $mask_re .= '(?:';
      $mask_re .= qr/\Q$part1\E\d+/;
      $mask_re .= qr/\Q$part2\E/ if $part2;
      $mask_re .= ')|';
    # MONTHLY: elsif here for <date>
    } else {
      $mask_re .= qr/\Q$mask2\E/.'|';
    }
  }
  chop($mask_re);

  opendir(CACHE,$wiki_cache);
  my(@cache) = readdir(CACHE);
  closedir(CACHE);
  my @files = grep(m|^(?:$mask_re)|,@cache);
  if ($ra_job->{'indexhere'}) {
    my $page = _escape($ra_job->{'page'});
    push(@files, grep(m|^\Q$page\E \d+$|,@cache));
  }
  my (%index, %used_headings);
  my $numbered_links = 0;
  foreach my $file (@files) {
    my (%used_names);
    next unless ($file =~ m|^(.*) (\d+)$|);
    my $root_path = decode_utf8(uri_unescape($1));
    my $display_name = $root_path;
    $display_name =~ s/_/ /g;
    open(WIKI, "$wiki_cache/$file");
    my @content = <WIKI>;
    close(WIKI);
    my $prev_heading = '';
    my ($comment_count,$first,$last) = (0,0,0);
    foreach my $line (@content) {
      if ($line =~ m|^==\s*([^=].+?)\s*==|) {
        if ($prev_heading && $comment_count > 0) {
          ## WARNING: This code is duplicated below vvvvvv
          $index{$prev_heading}->{'replies'} = $comment_count;
          if ($first && $last) {
            $index{$prev_heading}->{'firstepoch'} = $first;
            $index{$prev_heading}->{'first'} = strftime('%F %T',gmtime($first));
            $index{$prev_heading}->{'lastepoch'} = $last;
            $index{$prev_heading}->{'last'} = strftime('%F %T', gmtime($last));
            $index{$prev_heading}->{'durationsecs'} = $last - $first;
            if ($comment_count > 1) {
              $index{$prev_heading}->{'duration'} = duration($last - $first);
            } else {
              $index{$prev_heading}->{'duration'} = 'None';
            }
          }
          $comment_count = 0;
          $first = 0;
          $last = 0;
        }
        my $heading = $1;
        my $head_link;
        ($head_link, $numbered_links) = path_fix($heading, $numbered_links);
        $used_names{lc($head_link)}++;
        my $suffix = (($used_names{lc($head_link)} > 1) ? ('_'.$used_names{lc($head_link)}) : (''));
        $used_headings{lc($head_link.$suffix)}++;
        $prev_heading = $head_link.$suffix.'_'.$used_headings{lc($head_link.$suffix)};
        $index{$prev_heading} = { topic => encode_entities(decode_utf8($heading)), link => ("[[{{urlencode:$root_path}}#$head_link".$suffix."|$display_name]]"),
                                  root_path => $root_path, head_link => $head_link,
                                  replies => 'Unknown', first => 'Unknown',
                                  'last' => 'Unknown', duration => 'Unknown',
                                  firstepoch => 0, lastepoch => 0,
                                  durationsecs => 0,
                                };
      } elsif ($line =~ m/\[\[User.*[\]>)}].*?\s+(.*\(UTC\))/) {
        $comment_count++;
        my $time = str2time($1);
        if ($time && (!$first || $time < $first)) {
          $first = $time;
        }
        if ($time && ($time > $last)) {
          $last = $time;
        }
      }
    }
    if ($prev_heading && $comment_count > 0) {
      ## WARNING: This code is duplicated from above ^^^^^^
      $index{$prev_heading}->{'replies'} = $comment_count;
      if ($first && $last) {
        $index{$prev_heading}->{'firstepoch'} = $first;
        $index{$prev_heading}->{'first'} = strftime('%F %T', gmtime($first));
        $index{$prev_heading}->{'lastepoch'} = $last;
        $index{$prev_heading}->{'last'} = strftime('%F %T', gmtime($last));
        $index{$prev_heading}->{'durationsecs'} = $last - $first;
        if ($comment_count > 1) {
          $index{$prev_heading}->{'duration'} = duration($last - $first);
        } else {
          $index{$prev_heading}->{'duration'} = 'None';
        }
      }
    }
  }
  return (\%index, $numbered_links);
}

sub parse_cache
  {
  my (@pages,$count);
  opendir(CACHE,$wiki_cache);
  my(@files) = readdir(CACHE);
  closedir(CACHE);
  my(%cache);
  foreach my $file (@files)
    {
    next unless ($file =~ m|^(.*) (\d+)$|);
    my $page_name = decode_utf8(uri_unescape($1));
    my $time = $2;
    $cache{$page_name} = $time;
    }
  return \%cache;
  }

sub parse_watchlist
  {
  my $watchlist         =  $c->{ua}->get($c->{index}."?title=Special:Watchlist&days=0")->content();
  $dl += length($watchlist);
  my @lines             =  split("\n",$watchlist);
  my @date;
  my %watchlist;
  while (scalar(@lines))
    {
    my $line = shift(@lines);
    if ($line =~ m|<h4>(\d{4})-(\d{2})-(\d{2})</h4>|i)
      {
      @date = ($1,$2,$3);
      }
    if ($line =~ m|title="([^"]*?)">hist</a>|i) # "
      {
      my $page_name = _underscore($1);
      $line =~ m|(\d{2}):(\d{2}):(\d{2})|;
      $watchlist{$page_name} = timegm($3,$2,$1,$date[2],$date[1]-1,$date[0]);
      }
    }
  return \%watchlist;
  }

sub path_fix {
  my ($path,$numbered_links) = @_;
  ($path =~ s|'{2,4}||g);
  ($path =~ s|<.*?>||g);
  ($path =~ s/\[\[:?.*?\|(.*?)\]\]/$1/g);
  ($path =~ s|\[\[:?(.*?)\]\]|$1|g);
  while ($path =~ m|\[.*?\]|) {
    my $title;
    if ($path =~ m|\[[^ ]* (.*?)\]|) {
      $title = $1;
    } else {
      $numbered_links++;
      $title = ".5B$numbered_links.5D";
    }
    $path =~ s|\[.*?\]|$title|;
  }
  ($path =~ s|\s|_|g);
  ($path =~ s|\xc2\xa0|.C2.A0|g);  # non-breaking space (UTF-8 bytes)
  while ($path =~ m|([^/a-z0-9\.:_'-])|i) {
    my $bad = $1;
    my $fix = uc('.'.sprintf("%x",ord($bad)));
    ($path =~ s/\Q$bad/$fix/g);
  }
  return ($path,$numbered_links);
}

sub post_log {
  my $pg                =  $c->get('User:HBC Archive Indexerbot/logs', 'rw');
  $pg->{summary}        =  ('Writing log file for '.$start_time).' - Bot edit';
  $pg->{content}        =  $log_file;
  $pg->save();
}

sub send_report {
  my $dest      = shift;
  my $report    = shift;
  my $edit_summary = shift;
  my $pg        = $c->get($dest, 'w');
  $pg->{summary}        =  $edit_summary;
  $pg->{content}        =  '<!-- HBC Archive Indexerbot can blank this -->'."\n".$report;
  $ul += length($report);
  my $result = $pg->save();
  unless ($result) {
      my $dest_entities = encode_entities($dest);
      writelog("Failed to save report to $dest_entities");
  }
  sleep($write_delay);
  return $result;
}

sub watch
  {
  my $page_name = shift;
  my $success = $c->{ua}->get($c->{index}."?title=$page_name&action=watch")->is_success;
  $pages_watched++ if ($success);
  return $success;
  }

sub writelog {
  my $entry = shift;
  my @month_table =
  (
   'January',
   'February',
   'March',
   'April',
   'May',
   'June',
   'July',
   'August',
   'September',
   'October',
   'November',
   'December',
  );
  my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday) = gmtime(time);
  my $time = sprintf("%02d:%02d:%02d %02d %s %04d", $hour,$min,$sec,$mday,$month_table[$mon],($year+1900));
  $start_time ||= $time;
  $log_file .= ('* '.$time.': '.$entry.' ~~~'."\n");
  warn "$entry\n";
}

sub _escape {
  my ($val) = @_;
  $val = _underscore($val);
  $val = uri_escape_utf8($val);
  return $val;
}

sub _hash {
  my ($val) = @_;
  $val = _escape($val);
  $val = sha1_hex($val);
  return $val;
}

sub _underscore {
  my ($val) = @_;
  $val =~ s|\s|_|g;
  return $val;
}
 

MediaWiki Change

Fixing an error in MediaWiki/page.pm (a forgotten & before the action parameter):

return $obj->{ua}->get($obj->_wiki_url . "&action=" . ($unwatch ? "un" : "") . "watch")->is_success;
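
For contrast, the unpatched line presumably omitted the "&" separator (reconstructed from the note above, not copied from the shipped MediaWiki/page.pm):

return $obj->{ua}->get($obj->_wiki_url . "action=" . ($unwatch ? "un" : "") . "watch")->is_success;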