User:SQLBot/Readref.php
Appearance
<?php
// ---------------------------------------------------------------------------
// Command-line setup.
//
// Usage: php <script> <DumpFile> <OutputFile> [-w] [-d]
//   -w  write problem titles as wiki list items ("* [[Title]]")
//   -d  re-check every flagged title against the live API afterwards
// ---------------------------------------------------------------------------

// Both positional arguments are required: the dump is read from $argv[1] and
// results are written to $argv[2].
if( !isset( $argv[1], $argv[2] ) ) {
    $helptext = "Reference Problem Finder, by SQL@Enwiki\nphp $argv[0] <DumpFile> <OutputFile> [-w] [-d]\n* -w = Wikify output\n* -d = Double check against API\n\n";
    die($helptext);
}

// Stop early with a readable message instead of passing FALSE handles to
// fgets()/fwrite() later on.
$fIn = fopen( $argv[1], "r" );
if( $fIn === FALSE ) {
    die( "Could not open dump file $argv[1] for reading\n" );
}
$fOut = fopen( $argv[2], "w" );
if( $fOut === FALSE ) {
    die( "Could not open output file $argv[2] for writing\n" );
}

// Always define both flags so later "if( $wikify )" / "if( $doublecheck )"
// tests never read an undefined variable.
$wikify = in_array( "-w", $argv, TRUE );
$doublecheck = in_array( "-d", $argv, TRUE );

// Matches an opening <ref> tag, with or without attributes.
$refs = "/(<ref |<ref>)/i";
// Matches any of the recognised reference-list templates or a <references> tag.
$reflist = "/(\{\{(reflist|reference|refs|footnotes)|<references)/i";
/**
 * Fetch the wikitext of a page through the simple.wikipedia.org API.
 *
 * Only one title is requested, so the first entry under query.pages is the
 * page that was asked for.
 *
 * @param string $article Page title (not URL-encoded).
 * @return string The content of the first revision returned, or an empty
 *                string when the request fails, the response cannot be
 *                decoded, or the page entry carries no revision content.
 */
function GetPage($article) {
    $url = "http://simple.wikipedia.org/w/";
    $request = $url . 'api.php?action=query&prop=revisions&titles=' . urlencode($article) . '&rvprop=content&format=php';

    $response = file_get_contents($request);
    if( $response === FALSE ) {
        return( "" );
    }

    // NOTE(review): unserialize() on data received over the network can
    // instantiate arbitrary objects. Consider requesting format=json and using
    // json_decode() instead; left as-is here to keep the request unchanged.
    $decoded = unserialize($response);
    if( !is_array($decoded) || !isset($decoded['query']['pages']) || !is_array($decoded['query']['pages']) ) {
        return( "" );
    }

    // Take the first page entry directly instead of guessing its array key
    // from the entry's first field, which is only valid when that field
    // happens to be the page id.
    $page = reset($decoded['query']['pages']);
    if( !is_array($page) || !isset($page['revisions'][0]["*"]) ) {
        return( "" );
    }

    return( $page['revisions'][0]["*"] );
}
/**
 * Decide whether an article has a reference problem: it contains at least one
 * <ref> tag but no reference list to display it.
 *
 * The text is entity-decoded first, HTML comments are removed, and any text
 * containing "#REDIRECT" is never reported. The patterns for "has a ref" and
 * "has a reference list" come from the globals $refs and $reflist.
 *
 * @param string $text Article wikitext, possibly HTML-entity encoded.
 * @return bool TRUE when the article uses <ref> without a reference list,
 *              FALSE otherwise.
 */
function checkArticle( $text ) {
    global $refs, $reflist;

    $text = html_entity_decode( (string) $text );

    // Strip each comment on its own (lazy match) and let "." span newlines,
    // so text between two separate comments is preserved and multi-line
    // comments are removed as well.
    $stripped = preg_replace( "/<!--.*?-->/s", "", $text );
    if( $stripped !== NULL ) {
        // preg_replace() yields NULL on a PCRE error; keep the undecorated
        // text in that case rather than checking nothing.
        $text = $stripped;
    }

    if( stripos( $text, "#REDIRECT" ) !== FALSE ) {
        return( FALSE );
    }

    $hasRef = ( preg_match( $refs, $text ) === 1 );
    $hasRefList = ( preg_match( $reflist, $text ) === 1 );

    return( $hasRef && !$hasRefList );
}
// ---------------------------------------------------------------------------
// First pass: scan the dump line by line, collect each page's <text> element
// and record the titles for which checkArticle() reports a problem.
// ---------------------------------------------------------------------------

// Running totals: pages examined, pages without a problem, pages with one.
$num = 0;
$ok = 0;
$prob = 0;
// Title of the page currently being read (set when a <title> line is seen).
$title = '';

while( !feof( $fIn ) ) {
    $fLine = fgets( $fIn );
    if( $fLine === FALSE ) {
        break;
    }
    $fLine = trim( $fLine );

    if( preg_match( "/\<title\>(.*)\<\/title\>/i", $fLine, $mTitle ) ) {
        // The dump is entity-encoded (checkArticle() decodes the text the same
        // way), so decode the title before it is printed or written out.
        $title = html_entity_decode( $mTitle[1] );
        echo "$num [$ok / $prob]: Checking $title... ";
    }

    if( strpos( $fLine, "<text" ) === FALSE ) {
        continue;
    }

    if( preg_match( '/<text\b[^>]*\/>/i', $fLine ) ) {
        // Self-closing element: the page has no text. Without this case the
        // search for "</text>" would run on into the next page.
        $articleText = '';
        echo " Got text...";
    } else {
        $aText = $fLine;
        if( strpos( $fLine, "</text>" ) === FALSE ) {
            // The element spans several lines: append lines until the closing
            // tag shows up. Compare against FALSE explicitly, because a closing
            // tag at offset 0 is a valid hit, and stop at end of file.
            $mEndFound = FALSE;
            while( !$mEndFound && !feof( $fIn ) ) {
                $fLine = fgets( $fIn );
                if( $fLine === FALSE ) {
                    break;
                }
                $fLine = trim( $fLine );
                $aText = $aText . $fLine;
                $mEndFound = ( strpos( $fLine, "</text>" ) !== FALSE );
            }
            if( !$mEndFound ) {
                echo " Unterminated <text> element, stopping.\n";
                break;
            }
            echo " End Found... ";
            echo " Got Text... ";
        } else {
            echo " Got text...";
        }
        // Accept any attribute list on the opening tag, not only the exact
        // xml:space="preserve" form, and fall back to empty text on no match.
        if( preg_match( '/<text\b[^>]*>(.*)<\/text>/is', $aText, $mText ) ) {
            $articleText = $mText[1];
        } else {
            $articleText = '';
        }
    }

    if( !checkArticle( $articleText ) ) {
        echo " No problems!\n";
        $ok++;
    } else {
        echo " Problem!\n";
        // Honour -w for every page, whether its text was on one line or many.
        if( !empty( $wikify ) ) {
            fwrite( $fOut, "* [[$title]]\n" );
        } else {
            fwrite( $fOut, "$title\n" );
        }
        $prob++;
    }
    $num++;
}
// ---------------------------------------------------------------------------
// Second pass (optional, -d): re-fetch every flagged title through the API
// and rewrite the output file with only the titles that still have a problem.
// ---------------------------------------------------------------------------

// The first pass is done with the output file; close it so it can be re-read.
fclose( $fOut );

if( !empty( $doublecheck ) ) {
    echo "\n\nDouble-checking articles!\n\n";

    // Read the first-pass results before the file is truncated for rewriting.
    $articles = file( $argv[2], FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES );
    if( $articles === FALSE ) {
        die( "Could not re-read $argv[2] for double-checking\n" );
    }
    $fOut = fopen( $argv[2], "w" );
    if( $fOut === FALSE ) {
        die( "Could not reopen $argv[2] for writing\n" );
    }
    sort( $articles );

    $num = 0;
    $ok = 0;
    $prob = 0;
    foreach( $articles as $article ) {
        $article = trim( $article );
        if( $article === "" ) {
            // Nothing to look up; do not send an empty title to the API.
            continue;
        }
        $num++;

        // Unwrap "* [[Title]]" lines written by a wikified first pass.
        if( preg_match( "/\* \[\[(.*)\]\]/i", $article, $mArticle ) ) {
            $article = $mArticle[1];
        }

        echo "$num [$ok / $prob]: $article :";
        $aText = GetPage($article);
        if( !checkArticle( $aText ) ) {
            echo " No problems!\n";
            $ok++;
        } else {
            echo " Problem!\n";
            // Write the article that was just checked, not the last title
            // left over from the dump scan.
            if( !empty( $wikify ) ) {
                fwrite( $fOut, "* [[$article]]\n" );
            } else {
                fwrite( $fOut, "$article\n" );
            }
            $prob++;
        }
    }

    // Only the handle reopened above still needs closing; the first-pass
    // handle was already closed before this block.
    fclose( $fOut );
}

fclose( $fIn );
?>