Jump to content

User:HBC archive builderbot/source

From Wikipedia, the free encyclopedia
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
use strict;
use Storable;
use LWP::UserAgent;
use HTTP::Request::Common;
use XML::Simple;
use URI::Escape;
use Data::Dumper;
use Algorithm::Diff qw(diff);

my $ua = LWP::UserAgent->new('agent' => 'HBC archive builderbot v0.1 - developing (Operated by User:HighInBC)');
my $nowiki = 'nowiki'; # So it doesn't screw up the display of the source code on wiki

my $page = 'Wikipedia:Requests for comment/User names';
my $shortcut;
$shortcut = 'WP:RFCN';
$shortcut ||= $page; # Fall back to the full page title when no shortcut is set
my %revisions = get_complete_history($page);

# Walk revisions in ascending id order, diffing each against the previous one.
# A "== heading ==" line that appears with a '-' action was removed in that
# revision, i.e. that section was archived; report it with a link to the
# last revision that still contained it ($old_key).
my(@old_content);
my($old_key);
my $day = ''; # Date of the last day header emitted ('' = none yet, avoids undef compare)
KEY: foreach my $key (sort {$a <=> $b} keys(%revisions))
  {
  my(@content) = split("\n",${$revisions{$key}}{'text'}{'content'});
  my $timestamp = ${$revisions{$key}}{'timestamp'};
  my $summary = ${$revisions{$key}}{'comment'};
  $summary = '' unless (defined($summary)); # Revisions may have no edit summary
  $summary =~ s|/\*.*\*/\s*||; # Strip the leading /* section */ marker
  my $user = ${$revisions{$key}}{'contributor'}{'username'};
  my (@headings);
  if (scalar(@content) && scalar(@old_content))
    {
    my @diffs = diff(\@old_content, \@content);
    foreach my $ra_hunk (@diffs)
      {
      foreach my $ra_diff (@{$ra_hunk})
        {
        my($action,$content) = @{$ra_diff}[0,2]; # [action, position, line text]
        # Only removed ('-') heading lines count as archived sections
        if (($content =~ m|==\s?([^=]*)\s?==|) && ($action eq '-'))
          {
          my $heading = $1;
          # Wrap template invocations in <nowiki> so they don't expand in the report
          ($heading =~ s|(\{\{.*:.*\}\})|<$nowiki>$1</$nowiki>|) if ($heading =~ m|\{\{.*:.*\}\}|);
          push(@headings,$heading);
          }
        }
      }
    }
  if (scalar(@headings))
    {
    # Guard the match so $1/$2 are never stale captures from a prior regex
    if ($timestamp =~ m|(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}):\d{2}Z|)
      {
      my($date,$time) = ($1,$2);
      if ($date ne $day)
        {
        $day = $date;
        warn "'''$day'''\n"; # New day header
        }
      my $archive_link = "'''[{{fullurl:$shortcut|oldid=$old_key}} Archive link]'''";
      if (scalar(@headings) > 1)
        {
        warn "* '''$time''': $archive_link - ($summary ([[User:$user|$user]])) - (".scalar(@headings)." entries)\n";
        foreach my $heading (@headings)
          {
          warn "** $heading\n";
          }
        }
      elsif (scalar(@headings) == 1)
        {
        warn "* '''$time''': $archive_link - $headings[0] - ($summary ([[User:$user|$user]]))\n";
        }
      }
    }
  @old_content = @content;
  $old_key = $key;
  }

sub get_complete_history # Add Gzip, 100 times smaller, gee where did that ratio come from??
  {
  # Fetch the complete revision history of a wiki page via Special:Export,
  # 100 revisions per request, resuming from an on-disk Storable cache so
  # repeated runs only download revisions newer than the cached ones.
  # Takes the page title; returns a hash of revision id => revision data
  # (as produced by XML::Simple, which keys <revision> elements by their id).
  mkdir('cache') unless (-d('cache'));
  my $page = shift;
  my(%revisions);
  my $count;
  my $offset;
  my $fname = 'cache/'.uri_escape($page);
  if (-f($fname))
    {
    warn "Found '$page' in cache, loading...\n";
    %revisions = %{retrieve($fname)};
    my(@keys) = sort {$a <=> $b} keys(%revisions);
    $offset = ($revisions{$keys[scalar(@keys)-1]}{'timestamp'}); # Get timestamp of most recent revision
    warn (scalar(keys(%revisions))." loaded from cache.\n");
    }
  else
    {
    warn "No cache, starting fresh.\n";
    $offset = '0';
    }
  my $total;
  my $index = 'http://en.wikipedia.org/w/index.php';
  while (1) # Replaces the old goto-GETMORE flow; each pass fetches one batch
    {
    warn "\nDownloading as many as 100 revisions starting at ".($offset || 'the start')."\n";
    my $res = $ua->request
	(
	 POST $index."?title=Special:Export",
	 Content_Type  => 'application/x-www-form-urlencoded',
	 Content       =>	[(
				  'pages'	=> $page,
				  'action'	=> 'submit',
				  'submit'	=> 'Export',
				  'limit'	=> 100,
				  'offset'	=> $offset
				)]
	);
    my $current = $res->content();
    unless ($current =~ m|^<mediawiki|)
      {
      warn "Failed somehow, trying again.\n";
      next; # Retry the same batch
      }
    # Advance the offset to the timestamp of the last revision in this batch.
    # 43 chars = '<timestamp>' (11) + ISO timestamp (20) + '</timestamp>' (12).
    # ($ts_pos was previously a second 'my $index' shadowing the URL above.)
    my $ts_pos = rindex($current, '<timestamp>');
    my $string = substr($current,$ts_pos,43);
    # Only use the capture if the match succeeded, so $offset never gets a stale $1
    $offset = $1 if ($string =~ m|<timestamp>(.+?)</timestamp>|);
    my $xml_data = XMLin($current);
    $count = 0;
    if (!scalar(keys(%{${$xml_data}{page}{revision}}))) {} # do nothing: empty batch
    elsif (${$xml_data}{'page'}{'revision'}{'id'})
      {
      # Single revision: XML::Simple returns the revision hash directly
      unless ($revisions{${$xml_data}{'page'}{'revision'}{'id'}}) {$count++;$total++;}
      $revisions{${$xml_data}{'page'}{'revision'}{'id'}} = ${$xml_data}{'page'}{'revision'};
      }
    else
      {
      # Multiple revisions: keyed by revision id (XML::Simple KeyAttr folding)
      foreach my $revision (sort {$a <=> $b} keys(%{${$xml_data}{'page'}{'revision'}}))
        {
        unless ($revisions{$revision}) {$count++;$total++;}
        $revisions{$revision} = ${$xml_data}{'page'}{'revision'}{$revision};
        }
      warn Dumper($xml_data) unless ($total); # Debug aid: nothing new was found
      }
    warn "Got $count revisions\n";
    last unless ($count == 100); # A short batch means we have reached the newest revision
    warn "Still more.\n";
    }
  if ($total > 0)
    {
    warn "Saving cache...\n";
    store(\%revisions, $fname);
    warn "done.\n";
    }
  return %revisions;
  }