User:Carnildo/wiki-regex-tester.c

Common usages:

./wiki-regex-tester titles.txt < blacklist.txt

Will test every regex in "blacklist.txt" to see if it matches any titles in "titles.txt". "blacklist.txt" contains one blacklist regex per line; "titles.txt" contains one title per line.

./wiki-regex-tester 'Title of a Wikipedia article' < blacklist.txt

Will test to see if 'Title of a Wikipedia article' would be blocked by any entry in "blacklist.txt".

wget -O - 'http://en.wikipedia.org/w/index.php?title=MediaWiki:Titleblacklist&action=raw' | ./wiki-regex-tester ns_0.txt | wc -l

Will fetch the latest version of the English Wikipedia blacklist, test it against the list of titles in "ns_0.txt", and count the number of titles matched.

/* wiki-regex-tester.c
 *
 * A program to test regular expressions for the Wikipedia title blacklist.  Assumes UTF-8.
 */

/* Compile using gcc -o wiki-regex-tester wiki-regex-tester.c `pcre-config --libs`
 */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#include <pcre.h>

/* Size of the buffer that callers are expected to pass to preprocess_regex().
 * The anchored result ("^" + entry + "$" + NUL) must fit in this many bytes.
 */
enum { REGEX_BUF_SIZE = 4096 };

/* Truncate `s` at the first occurrence of `marker`, if there is one. */
static void cut_at(char *s, const char *marker)
{
	char *hit = strstr(s, marker);

	if(hit != NULL)
	{
		*hit = '\0';
	}
}

/* Turn one raw blacklist line into an anchored regex, in place.
 *
 * regex          - NUL-terminated line, modified in place.  The buffer must
 *                  hold at least REGEX_BUF_SIZE bytes.
 * casesensitive  - set to 1 if the line contains "casesensitive"; otherwise
 *                  left untouched (the caller initialises it).
 * newaccountonly - set to 1 if the line contains "newaccountonly"; otherwise
 *                  left untouched.
 *
 * On return `regex` is either the empty string (blank line, comment-only
 * line, or an entry too long to anchor) or "^<entry>$" with modifiers,
 * comments and surrounding whitespace removed.
 */
void preprocess_regex(char *regex, int *casesensitive, int *newaccountonly)
{
	/* Everything from the first of these markers onward is a comment or a
	 * modifier list, not part of the pattern.  It is assumed that '#' can't
	 * appear inside a blacklist entry.
	 */
	static const char *const tail_markers[] = {
		"#",
		"<moveonly",
		"<newaccountonly",
		"<casesensitive",
		"<reupload",
		"<errmsg",
		"<autoconfirmed",
		"<noedit",
	};
	size_t i;
	size_t lead;
	size_t len;
	char *newline;

	/* Crude check for modifiers -- assumes correct formatting and that they'll never appear in a regex. */
	if(strstr(regex, "casesensitive"))
	{
		*casesensitive = 1;
	}
	if(strstr(regex, "newaccountonly"))
	{
		*newaccountonly = 1;
	}

	/* Cut off the trailing newline */
	newline = strrchr(regex, '\n');
	if(newline != NULL)
	{
		*newline = '\0';
	}

	/* Whack off the tail end of the regex -- all modifiers and comments */
	for(i = 0; i < sizeof tail_markers / sizeof tail_markers[0]; i++)
	{
		cut_at(regex, tail_markers[i]);
	}

	/* Trim trailing whitespace.  The length is checked first so that an
	 * empty entry never indexes regex[-1].  Tabs and a stray CR are trimmed
	 * as well as spaces.
	 */
	len = strlen(regex);
	while(len > 0 && (regex[len - 1] == ' ' || regex[len - 1] == '\t' || regex[len - 1] == '\r'))
	{
		regex[--len] = '\0';
	}

	/* Trim leading whitespace */
	lead = strspn(regex, " \t");
	if(lead > 0)
	{
		len -= lead;
		memmove(regex, regex + lead, len + 1);
	}

	if(len == 0)
	{
		return;
	}

	/* Add anchors in place: one byte each for '^', '$' and the NUL. */
	if(len + 3 > REGEX_BUF_SIZE)
	{
		fprintf(stderr, "Regex too long to anchor; skipping\n");
		regex[0] = '\0';
		return;
	}
	memmove(regex + 1, regex, len);
	regex[0] = '^';
	regex[len + 1] = '$';
	regex[len + 2] = '\0';
}

/* Normalise one title read from a file, in place.
 *
 * line - NUL-terminated string, modified in place.
 *
 * The line terminator ("\n" or "\r\n") is removed only if it is actually
 * present, so a final line without a newline keeps its last character and
 * an empty string is left alone.  Every underscore is then replaced by a
 * space.
 */
void fixup_line(char *line)
{
	char *p;

	/* strcspn() returns the string length when no terminator is found, so
	 * this write lands on the existing NUL in that case.
	 */
	line[strcspn(line, "\r\n")] = '\0';

	for(p = line; *p != '\0'; p++)
	{
		if(*p == '_')
		{
			*p = ' ';
		}
	}
}

/* Run one compiled blacklist regex against a single title.
 *
 * comp_regex - regex compiled by pcre_compile().
 * regex      - source text of the regex, used only for the report line.
 * title      - NUL-terminated title to test.
 *
 * Prints "* [[title]] :: regex" on stdout when the title matches.  PCRE
 * errors other than "no match" are reported on stderr.
 * Returns 1 on a match, 0 otherwise.
 */
static int test_title(const pcre *comp_regex, const char *regex, const char *title)
{
	int ovector[300];
	int result;

	result = pcre_exec(comp_regex, NULL, title, (int)strlen(title), 0, 0, ovector, 300);
	if(result >= 0)
	{
		printf("* [[%s]] :: %s\n", title, regex);
		return 1;
	}
	if(result != PCRE_ERROR_NOMATCH)
	{
		fprintf(stderr, "Error: %d\n", result);
	}
	return 0;
}

/* Read blacklist entries from stdin, one per line, and test each against
 * every command-line argument.  An argument that stat() can find is opened
 * and read as a list of titles, one per line; any other argument is tested
 * as a literal title.  Matches go to stdout; progress and diagnostics go to
 * stderr.  Entries flagged "newaccountonly" are compiled but not tested.
 */
int main(int argc, char *argv[])
{
	char regex[4096];
	char line[1024];
	int lines = 0;

	/* Loop on fgets() itself: checking feof() before reading would process
	 * the last entry a second time once the final read fails.
	 */
	while(fgets(regex, sizeof regex, stdin) != NULL)
	{
		int casesensitive = 0;
		int newaccountonly = 0;
		int matches = 0;
		const char *errptr;
		int offset;
		pcre *comp_regex;
		int i;

		preprocess_regex(regex, &casesensitive, &newaccountonly);
		if(regex[0] == '\0')
		{
			continue;	/* Nothing left after preprocessing */
		}

		fprintf(stderr, "Testing /%s/%c now\n", regex, casesensitive?' ':'i');
		comp_regex = pcre_compile(regex, PCRE_UTF8|(casesensitive?0:PCRE_CASELESS), &errptr, &offset, NULL);
		if(NULL == comp_regex)
		{
			fprintf(stderr, "Compile failed: %d %s\n", offset, errptr);
		}
		else
		{
			if(!newaccountonly)
			{
				for(i = 1; i < argc; i++)
				{
					struct stat st;

					if(stat(argv[i], &st) == 0)
					{
						/* It exists on disk: treat it as a file of titles */
						FILE *infile = fopen(argv[i], "r");

						if(NULL == infile)
						{
							perror(argv[i]);
							continue;
						}
						/* Stops on EOF and on read errors alike */
						while(fgets(line, sizeof line, infile) != NULL)
						{
							lines += 1;
							fixup_line(line);
							matches += test_title(comp_regex, regex, line);
							if((lines % 100000) == 0)
							{
								fprintf(stderr, "Lines: %d            \r", lines);
							}
						}
						fclose(infile);
					}
					else
					{
						/* Otherwise, test as a literal */
						lines += 1;
						matches += test_title(comp_regex, regex, argv[i]);
					}
				}
			}
			/* Release the compiled pattern before moving to the next entry */
			pcre_free(comp_regex);
		}
		fprintf(stderr, "Matches: %d\n", matches);
	}
	return 0;
}