Jump to content

User:Carnildo/wiki-regex-tester.pl

From Wikipedia, the free encyclopedia
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

Common usages:

wiki-regex-tester.pl titles.txt < blacklist.txt

Will test every regex in "blacklist.txt" to see if it matches any titles in "titles.txt". "blacklist.txt" contains one blacklist regex per line; "titles.txt" contains one title per line.

wiki-regex-tester.pl 'Title of a Wikipedia article' < blacklist.txt

Will test to see if 'Title of a Wikipedia article' would be blocked by any entry in "blacklist.txt"

wget -O - 'http://en.wikipedia.org/w/index.php?title=MediaWiki:Titleblacklist&action=raw' |perl wiki-regex-tester.pl ns_0.txt|wc -l

Will fetch the latest version of the English Wikipedia blacklist, test it against the list of titles in "ns_0.txt", and count the number of titles matched.


#!/usr/bin/perl

use warnings;
use strict;
use utf8;

use Time::HiRes;

binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

my @regexes;

while(<STDIN>)
{
        my $regex = $_;
        my $ignorecase = 1;
        my $moveonly = 0;
        my $newaccount = 0;

        $regex =~ s/#.*$//g;    # Strip comments
        $ignorecase = 0 if($regex =~ /casesensitive/);
        $moveonly = 1 if($regex =~ /moveonly/);
        $newaccount = 1 if($regex =~ /newaccountonly/);
        $regex =~ s/<(moveonly|newaccountonly|casesensitive|\||errmsg=[^|>]*| )+>//g; # Strip modifiers
        $regex =~ s/\s*$//g;    # Strip trailing space
        $regex =~ s/^\s*//g;    # Strip leading space

        if($regex !~ /^\s*$/ and !$newaccount)
        {
                push @regexes, [$regex, $ignorecase, $moveonly];
        }
}

print STDERR "Testing " . scalar(@regexes) . " regexes\n";

my $lines = 0;
my $lines2 = 0;
my $regex_count = 0;
foreach my $regex_entry (@regexes)
{
        my $start_time = Time::HiRes::time();
        my $u_start_time = Time::HiRes::clock();
        my $maxtime = 0;
        my ($regex, $ignorecase, $moveonly) = @{$regex_entry};

        if(-e $ARGV[0])
        {
                open INFILE, "<", $ARGV[0];
                binmode INFILE, ":utf8";
        }
        else
        {
                open INFILE, "<", \$ARGV[0];
                binmode INFILE, ":utf8";
        }

        while(<INFILE>)
        {
                my $target = $_;
                chomp $target;
                $target =~ s/_/ /g;

                if($ignorecase)
                {
                        if($target =~ /^$regex$/i)
                        {
                                print "* [[$target]] :: $regex\n";
                        }
                }
                else
                {
                        if($target =~ /^$regex$/)
                        {
                                print "* [[$target]] :: $regex\n";
                        }
                }

                $lines = $lines + 1;
                if($lines >= 10000)
                {
                        my $newtime = Time::HiRes::clock();
                        my $diff = $newtime - $u_start_time;
                        $u_start_time = $newtime;

                        $maxtime = $diff if($diff > $maxtime);

                        $lines = 0;
                        $lines2 += 10000;
                        print STDERR "$diff $lines2\r";
                }
        }
        $regex_count += 1;

        my $stop_time = Time::HiRes::time();
        print STDERR "Regex $regex took " . ($stop_time - $start_time) . " seconds\n";
        print STDERR "Slowest batch took $maxtime seconds\n";
        close INFILE;
}