Jump to content

User:Carnildo/wiki-regex-tester.c

From Wikipedia, the free encyclopedia
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

Common usages:

./wiki-regex-tester titles.txt < blacklist.txt

Will test every regex in "blacklist.txt" to see if it matches any titles in "titles.txt". "blacklist.txt" contains one blacklist regex per line; "titles.txt" contains one title per line.

./wiki-regex-tester 'Title of a Wikipedia article' < blacklist.txt

Will test whether 'Title of a Wikipedia article' would be blocked by any entry in "blacklist.txt".

wget -O - 'http://en.wikipedia.org/w/index.php?title=MediaWiki:Titleblacklist&action=raw' |wiki-regex-tester ns_0.txt|wc -l

Will fetch the latest version of the English Wikipedia blacklist, test it against the list of titles in "ns_0.txt", and count the number of matches (one output line is printed for each title matched by each regex).


/* wiki-regex-tester.c
 *
 * A program to test regular expressions for the Wikipedia title blacklist.  Assumes UTF-8.
 */

/* Compile using gcc -o wiki-regex-tester wiki-regex-tester.c `pcre-config --libs`
 */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#include <pcre.h>

/*
 * Convert one raw blacklist line into an anchored regex, in place.
 *
 * regex          - NUL-terminated line as read from the blacklist.  It is
 *                  rewritten in place; the buffer is assumed to hold at least
 *                  4096 bytes (the size main() passes in).
 * casesensitive  - set to 1 if the "casesensitive" modifier is present;
 *                  otherwise left untouched.
 * newaccountonly - set to 1 if the "newaccountonly" modifier is present;
 *                  otherwise left untouched.
 *
 * On return, regex is either the empty string (blank line, comment-only line,
 * or an entry too long to anchor within the buffer) or "^<pattern>$".
 */
void preprocess_regex(char *regex, int *casesensitive, int *newaccountonly)
{
	/* Capacity the caller's buffer is assumed to have. */
	enum { REGEX_CAPACITY = 4096 };
	/* Everything from the first of these markers onward is a modifier list, not regex. */
	static const char *const modifier_tags[] = {
		"<moveonly", "<newaccountonly", "<casesensitive", "<reupload",
		"<errmsg", "<autoconfirmed", "<noedit"
	};
	char *cut;
	size_t i;
	size_t lead;
	size_t len;

	/* Cut off the trailing newline */
	cut = strrchr(regex, '\n');
	if(cut)
	{
		*cut = '\0';
	}

	/* Drop the comment first, so that words inside a comment cannot be mistaken for modifiers.
	 * '#' is assumed never to appear inside a blacklist regex. */
	cut = strchr(regex, '#');
	if(cut)
	{
		*cut = '\0';
	}

	/* Crude check for modifiers -- assumes correct formatting and that they'll never appear in a regex. */
	if(strstr(regex, "casesensitive"))
	{
		*casesensitive = 1;
	}
	if(strstr(regex, "newaccountonly"))
	{
		*newaccountonly = 1;
	}

	/* Whack off the tail end of the regex -- all modifiers */
	for(i = 0; i < sizeof modifier_tags / sizeof modifier_tags[0]; i++)
	{
		cut = strstr(regex, modifier_tags[i]);
		if(cut)
		{
			*cut = '\0';
		}
	}

	/* Trim leading whitespace */
	lead = strspn(regex, " \t");
	len = strlen(regex) - lead;
	if(lead > 0)
	{
		memmove(regex, regex + lead, len + 1);
	}

	/* Trim trailing whitespace; the len > 0 guard keeps us from indexing before the buffer on an empty string. */
	while(len > 0 && (regex[len - 1] == ' ' || regex[len - 1] == '\t' || regex[len - 1] == '\r'))
	{
		regex[--len] = '\0';
	}

	/* Add anchors */
	if(len > 0)
	{
		if(len + 3 > REGEX_CAPACITY)
		{
			/* '^' + pattern + '$' + NUL would not fit; a truncated regex would match the wrong titles, so drop the entry. */
			regex[0] = '\0';
			return;
		}
		memmove(regex + 1, regex, len);
		regex[0] = '^';
		regex[len + 1] = '$';
		regex[len + 2] = '\0';
	}
}

/*
 * Normalise one title line in place: remove the trailing newline, if there is
 * one, and turn every underscore into a space.
 *
 * line - NUL-terminated, writable string; may be empty and need not end in a
 *        newline (e.g. the last line of a file).
 */
void fixup_line(char *line)
{
	size_t len = strlen(line);
	char *p;

	/* Cut off the trailing newline, but only if it is really there */
	if(len > 0 && line[len - 1] == '\n')
	{
		line[len - 1] = '\0';
	}

	/* Single pass over the string, replacing underscores as we go */
	for(p = line; *p != '\0'; p++)
	{
		if(*p == '_')
		{
			*p = ' ';
		}
	}
}

/*
 * Run one compiled regex against one title.
 *
 * Prints "* [[title]] :: regex" to stdout and returns 1 on a match.  Returns 0
 * otherwise; any pcre_exec() result other than "no match" is reported on stderr.
 */
static int test_title(const pcre *comp_regex, const char *regex, const char *title)
{
	int ovector[300];
	int result = pcre_exec(comp_regex, NULL, title, (int)strlen(title), 0, 0, ovector, 300);

	if(result >= 0)
	{
		printf("* [[%s]] :: %s\n", title, regex);
		return 1;
	}
	if(result != PCRE_ERROR_NOMATCH)
	{
		fprintf(stderr, "Error: %d\n", result);
	}
	return 0;
}

/*
 * Read blacklist entries from stdin, one per line, and test each against every
 * command-line argument.  An argument that stat() can resolve is read as a file
 * of titles, one per line; any other argument is tested as a literal title.
 * Matches go to stdout, progress and diagnostics to stderr.
 */
int main(int argc, char *argv[])
{
	int i;
	char regex[4096];
	char line[1024];
	FILE *infile;
	struct stat dummy;
	pcre *comp_regex;
	int matches;
	int lines = 0;

	const char * errptr;
	int offset;

	/* Read the regexes in from stdin.  Looping on fgets() rather than feof() avoids handling the last line twice. */
	while(fgets(regex, sizeof regex, stdin) != NULL)
	{
		int casesensitive = 0;
		int newaccountonly = 0;

		/* Preprocess */
		preprocess_regex(regex, &casesensitive, &newaccountonly);
		if(strlen(regex) == 0)
		{
			continue;	/* Blank or comment-only entry */
		}

		matches = 0;
		fprintf(stderr, "Testing /%s/%c now\n", regex, casesensitive?' ':'i');
		comp_regex = pcre_compile(regex, PCRE_UTF8|(casesensitive?0:PCRE_CASELESS), &errptr, &offset, NULL);
		if(NULL == comp_regex)
		{
			fprintf(stderr, "Compile failed: %d %s\n", offset, errptr);
		}
		else
		{
			/* Entries marked newaccountonly are compiled (to check syntax) but not tested against titles */
			if(!newaccountonly)
			{
				/* Test */
				for(i = 1; i < argc; i++)
				{
					/* If it's a file */
					if(!stat(argv[i], &dummy))
					{
						infile = fopen(argv[i], "r");
						if(NULL == infile)
						{
							/* Exists but cannot be opened (e.g. permissions): report and move on */
							perror(argv[i]);
							continue;
						}
						while(fgets(line, sizeof line, infile) != NULL)
						{
							lines += 1;
							fixup_line(line);
							matches += test_title(comp_regex, regex, line);
							if((lines % 100000) == 0)
							{
								fprintf(stderr, "Lines: %d            \r", lines);
							}
						}
						fclose(infile);
					}
					else
					{
						lines += 1;
						/* Otherwise, test as a literal */
						matches += test_title(comp_regex, regex, argv[i]);
					}
				}
			}
			/* Release the compiled pattern; a new one is compiled for every blacklist entry */
			pcre_free(comp_regex);
		}
		fprintf(stderr, "Matches: %d\n", matches);
	}
	return 0;
}