User:Carnildo/wiki-regex-tester.c
Appearance
Common usages:
./wiki-regex-tester titles.txt < blacklist.txt
Will test every regex in "blacklist.txt" to see if it matches any titles in "titles.txt". "blacklist.txt" contains one blacklist regex per line; "titles.txt" contains one title per line.
./wiki-regex-tester 'Title of a Wikipedia article' < blacklist.txt
Will test to see if 'Title of a Wikipedia article' would be blocked by any entry in "blacklist.txt".
wget -O - 'http://en.wikipedia.org/w/index.php?title=MediaWiki:Titleblacklist&action=raw' |wiki-regex-tester ns_0.txt|wc -l
Will fetch the latest version of the English Wikipedia blacklist, test it against the list of titles in "ns_0.txt", and count the number of titles matched.
/* wiki-regex-tester.c * * A program to test regular expressions for the Wikipedia title blacklist. Assumes UTF-8. */ /* Compile using gcc -o wiki-regex-tester wiki-regex-tester.c `pcre-config --libs` */ #include <stdlib.h> #include <stdio.h> #include <string.h> #include <sys/types.h> #include <sys/stat.h> #include <unistd.h> #include <pcre.h> void preprocess_regex(char *regex, int *casesensitive, int *newaccountonly) { size_t lead = 0; char tempregex[4096]; /* Crude check for modifiers -- assumes correct formatting and that they'll never appear in a regex. */ if(strstr(regex, "casesensitive")) { *casesensitive = 1; } if(strstr(regex, "newaccountonly")) { *newaccountonly = 1; } /* Cut off the trailing newline */ if(strrchr(regex, '\n')) { *strrchr(regex, '\n') = '\0'; } /* Whack off the tail end of the regex -- all modifiers and comments */ if(strchr(regex, '#')) { *strchr(regex, '#') = '\0'; /* I think it's a safe assumption that '#'-characters can't appear in blacklist entries -- the code appears to be buggy that way. 
*/ } if(strstr(regex, "<moveonly")) { *strstr(regex, "<moveonly") = '\0'; } if(strstr(regex, "<newaccountonly")) { *strstr(regex, "<newaccountonly") = '\0'; } if(strstr(regex, "<casesensitive")) { *strstr(regex, "<casesensitive") = '\0'; } if(strstr(regex, "<reupload")) { *strstr(regex, "<reupload") = '\0'; } if(strstr(regex, "<errmsg")) { *strstr(regex, "<errmsg") = '\0'; } if(strstr(regex, "<autoconfirmed")) { *strstr(regex, "<autoconfirmed") = '\0'; } if(strstr(regex, "<noedit")) { *strstr(regex, "<noedit") = '\0'; } /* Trim leading and trailing whitespace */ lead = strspn(regex, " \t"); if(lead > 0) { memmove(regex, regex + lead, strlen(regex) - lead + 1); } while(regex[strlen(regex) - 1] == ' ') { regex[strlen(regex) - 1] = '\0'; } /* Add anchors */ if(strlen(regex) > 0) { sprintf(tempregex, "^%s$", regex); strcpy(regex, tempregex); } } void fixup_line(char *line) { line[strlen(line) - 1] = '\0'; /* Cut off the trailing newline */ while(strchr(line, '_')) { *strchr(line, '_') = ' '; } } int main(int argc, char *argv[]) { int i; char regex[4096]; char line[1024]; int ovector[300]; FILE *infile; struct stat dummy; pcre *comp_regex; int result; int matches; int lines = 0; const char * errptr; int offset; /* Read the regexes in from stdin */ while(!feof(stdin)) { int casesensitive = 0; int newaccountonly = 0; fgets(regex, 4096, stdin); /* For each regex */ /* Preprocess */ preprocess_regex(regex, &casesensitive, &newaccountonly); if(strlen(regex) > 0) { matches = 0; fprintf(stderr, "Testing /%s/%c now\n", regex, casesensitive?' 
':'i'); comp_regex = pcre_compile(regex, PCRE_UTF8|(casesensitive?0:PCRE_CASELESS), &errptr, &offset, NULL); if(NULL == comp_regex) { fprintf(stderr, "Compile failed: %d %s\n", offset, errptr); } else { if(!newaccountonly) { /* Test */ for(i = 1; i < argc; i++) { /* If it's a file */ if(!stat(argv[i], &dummy)) { infile = fopen(argv[i], "r"); while(!feof(infile)) { lines += 1; fgets(line, 1024, infile); fixup_line(line); result = pcre_exec(comp_regex, NULL, line, strlen(line), 0, 0, ovector, 300); if(result >= 0) { printf("* [[%s]] :: %s\n", line, regex); matches += 1; } else if(result == PCRE_ERROR_NOMATCH) { // printf("* Nomatch\n"); } else { fprintf(stderr, "Error: %d\n", result); } if((lines % 100000) == 0) { fprintf(stderr, "Lines: %d \r", lines); } } fclose(infile); } else { lines += 1; /* Otherwise, test as a literal */ result = pcre_exec(comp_regex, NULL, argv[i], strlen(argv[i]), 0, 0, ovector, 300); if(result >= 0) { matches += 1; printf("* [[%s]] :: %s\n", argv[i], regex); } else if(result == PCRE_ERROR_NOMATCH) { // printf("* No match\n"); } else { fprintf(stderr, "Error: %d\n", result); } } } } } fprintf(stderr, "Matches: %d\n", matches); } } }