User:Carnildo/wiki-regex-tester.pl

The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

Common usages:

wiki-regex-tester.pl titles.txt < blacklist.txt

Will test every regex in "blacklist.txt" to see if it matches any titles in "titles.txt". "blacklist.txt" contains one blacklist regex per line; "titles.txt" contains one title per line.

wiki-regex-tester.pl 'Title of a Wikipedia article' < blacklist.txt

Will test to see if 'Title of a Wikipedia article' would be blocked by any entry in "blacklist.txt"

wget -O - 'http://en.wikipedia.org/w/index.php?title=MediaWiki:Titleblacklist&action=raw' |perl wiki-regex-tester.pl ns_0.txt|wc -l

Will fetch the latest version of the English Wikipedia blacklist, test it against the list of titles in "ns_0.txt", and count the number of titles matched.

#!/usr/bin/perl

use warnings;
use strict;
use utf8;

use Time::HiRes;

binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

my @regexes;

while(<STDIN>)
{
        my $regex = $_;
        my $ignorecase = 1;
        my $moveonly = 0;
        my $newaccount = 0;

        $regex =~ s/#.*$//g;    # Strip comments
        $ignorecase = 0 if($regex =~ /casesensitive/);
        $moveonly = 1 if($regex =~ /moveonly/);
        $newaccount = 1 if($regex =~ /newaccountonly/);
        $regex =~ s/<(moveonly|newaccountonly|casesensitive|\||errmsg=[^|>]*| )+>//g; # Strip modifiers
        $regex =~ s/\s*$//g;    # Strip trailing space
        $regex =~ s/^\s*//g;    # Strip leading space

        if($regex !~ /^\s*$/ and !$newaccount)
        {
                push @regexes, [$regex, $ignorecase, $moveonly];
        }
}

print STDERR "Testing " . scalar(@regexes) . " regexes\n";

my $lines = 0;
my $lines2 = 0;
my $regex_count = 0;
foreach my $regex_entry (@regexes)
{
        my $start_time = Time::HiRes::time();
        my $u_start_time = Time::HiRes::clock();
        my $maxtime = 0;
        my ($regex, $ignorecase, $moveonly) = @{$regex_entry};

        if(-e $ARGV[0])
        {
                open INFILE, "<", $ARGV[0];
                binmode INFILE, ":utf8";
        }
        else
        {
                open INFILE, "<", \$ARGV[0];
                binmode INFILE, ":utf8";
        }

        while(<INFILE>)
        {
                my $target = $_;
                chomp $target;
                $target =~ s/_/ /g;

                if($ignorecase)
                {
                        if($target =~ /^$regex$/i)
                        {
                                print "* [[$target]] :: $regex\n";
                        }
                }
                else
                {
                        if($target =~ /^$regex$/)
                        {
                                print "* [[$target]] :: $regex\n";
                        }
                }

                $lines = $lines + 1;
                if($lines >= 10000)
                {
                        my $newtime = Time::HiRes::clock();
                        my $diff = $newtime - $u_start_time;
                        $u_start_time = $newtime;

                        $maxtime = $diff if($diff > $maxtime);

                        $lines = 0;
                        $lines2 += 10000;
                        print STDERR "$diff $lines2\r";
                }
        }
        $regex_count += 1;

        my $stop_time = Time::HiRes::time();
        print STDERR "Regex $regex took " . ($stop_time - $start_time) . " seconds\n";
        print STDERR "Slowest batch took $maxtime seconds\n";
        close INFILE;
}