Jump to content

User:SQLBot/Readref.php

From Wikipedia, the free encyclopedia
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
<?php
// ---------------------------------------------------------------------------
// Command-line handling and shared configuration.
//
// Usage: php <script> <DumpFile> <OutputFile> <domain> [-w] [-d]
//   -w  write flagged titles as wiki list items ("* [[Title]]")
//   -d  re-check every flagged title against the live API afterwards
// ---------------------------------------------------------------------------

// The usage text is built unconditionally so every failure path can print it
// (it used to be assigned only inside the first check, leaving the second
// check to die with an undefined variable and no message).
$scriptName = isset( $argv[0] ) ? $argv[0] : 'Readref.php';
$helptext = "Reference Problem Finder, by SQL@Enwiki\nphp $scriptName <DumpFile> <OutputFile> <domain> <options>\n* -w = Wikify output\n* -d = Double check against API\n\n";

// All three positional arguments are mandatory.
if( !isset( $argv[1], $argv[2], $argv[3] ) ) {
	die($helptext);
}

$domain = $argv[3];

// Fail early with a readable message instead of passing FALSE to fgets()/fwrite().
$fIn = fopen( $argv[1], "r" );
if( $fIn === FALSE ) {
	die( "Unable to open dump file for reading: $argv[1]\n" );
}
$fOut = fopen( $argv[2], "w" );
if( $fOut === FALSE ) {
	die( "Unable to open output file for writing: $argv[2]\n" );
}

// Option flags always hold a boolean, so later reads never hit an undefined variable.
$wikify = in_array( "-w", $argv, TRUE );
$doublecheck = in_array( "-d", $argv, TRUE );

// Matches an opening <ref> tag (with or without attributes).
$refs = "/(<ref |<ref>)/i";
// Matches any construct that renders the collected references.
$reflist = "/(\{\{(reflist|reference|refs|footnotes)|<references)/i";

/**
 * Fetch the current wikitext of a page through the MediaWiki web API.
 *
 * The request goes to http://<global $domain>/w/api.php and asks for the
 * content of the latest revision of the given title.
 *
 * @param string $article Page title, not URL-encoded.
 * @return string Wikitext of the page, or an empty string when the request
 *                fails, the response cannot be decoded, or the page has no
 *                revision content (for example, it does not exist).
 */
function GetPage($article) {
	global $domain;
	$url = "http://$domain/w/";
	// JSON is requested instead of format=php: calling unserialize() on data
	// received over the network can instantiate arbitrary objects.
	$request = $url . 'api.php?action=query&prop=revisions&titles=' . urlencode($article) . '&rvprop=content&format=json';

	$response = file_get_contents($request);
	if( $response === FALSE ) {
		return '';
	}

	$data = json_decode($response, true);
	if( !is_array($data) || !isset($data['query']['pages']) || !is_array($data['query']['pages']) ) {
		return '';
	}

	// "pages" is keyed by page ID; only one title was requested, so take the
	// first entry rather than guessing the ID from the order of its fields.
	$page = reset($data['query']['pages']);
	if( !is_array($page) || !isset($page['revisions'][0]['*']) ) {
		return '';
	}

	return $page['revisions'][0]['*'];
}

/**
 * Decide whether an article uses <ref> tags but has nothing to display them.
 *
 * The text is entity-decoded first (dump text is XML-escaped), HTML comments
 * are removed, and redirect pages are ignored. The global patterns $refs and
 * $reflist decide what counts as a reference and as a reference list.
 *
 * @param string $text Article wikitext, raw or entity-encoded.
 * @return bool TRUE when a <ref> is present without any reference list,
 *              FALSE otherwise (including redirects and empty text).
 */
function checkArticle( $text ) {
	global $refs, $reflist;
	$text = html_entity_decode( (string) $text );

	if( strpos( $text, "<!--" ) !== FALSE ) {
		// Non-greedy so that markup between two separate comments survives;
		// "s" so that comments spanning several lines are removed as well.
		$stripped = preg_replace( '/<!--.*?-->/s', "", $text );
		if( $stripped !== NULL ) {
			$text = $stripped;
		}
	}

	// A redirect must start the page; a mere mention of "#REDIRECT" further
	// down in a real article must not suppress the check.
	if( preg_match( '/^\s*#REDIRECT/i', $text ) === 1 ) {
		return( FALSE );
	}

	$hasRef = preg_match( $refs, $text ) === 1;
	$hasRefList = preg_match( $reflist, $text ) === 1;

	return( $hasRef && !$hasRefList );
}
/**
 * Divide two numbers for the statistics output, yielding 0 instead of a
 * division-by-zero error when the denominator is not positive.
 *
 * @param int|float $numerator
 * @param int|float $denominator
 * @return int|float
 */
function readrefRatio( $numerator, $denominator ) {
	return ( $denominator > 0 ) ? $numerator / $denominator : 0;
}

/**
 * Build the output-file line for a flagged title.
 *
 * @param string $title  Page title.
 * @param bool   $wikify TRUE to emit a wiki list item, FALSE for the bare title.
 * @return string Line including the trailing newline.
 */
function readrefFormatTitle( $title, $wikify ) {
	return $wikify ? "* [[$title]]\n" : "$title\n";
}

/**
 * Pull the contents of a <text ...>...</text> element out of a dump fragment.
 * Any attributes on the opening tag are accepted.
 *
 * @param string $xml Fragment containing the whole element.
 * @return string Element contents, or '' when no complete element is found.
 */
function readrefExtractText( $xml ) {
	if( preg_match( '/<text\b[^>]*>(.*)<\/text>/is', $xml, $mText ) === 1 ) {
		return $mText[1];
	}
	return '';
}

// ---------------------------------------------------------------------------
// Pass 1: scan the dump line by line and record every title whose text makes
// checkArticle() report a problem.
// ---------------------------------------------------------------------------
if( !is_resource( $fIn ) || !is_resource( $fOut ) ) {
	die( "Input or output file could not be opened\n" );
}
$useWikify = !empty( $wikify );
$useDoubleCheck = !empty( $doublecheck );

$num = 0;
$ok = 0;
$prob = 0;
$title = '';
$time_start = microtime(true);
while( ( $fLine = fgets( $fIn ) ) !== FALSE ) {
	$fLine = trim( $fLine );

	if( preg_match( '/<title>(.*)<\/title>/i', $fLine, $mTitle ) === 1 ) {
		// Titles are XML-escaped in the dump; decode them so that the output
		// file and the later API lookup use the real page name.
		$title = html_entity_decode( $mTitle[1], ENT_QUOTES );
		echo "$num [$ok / $prob]: Checking $title... ";
	}

	if( strpos( $fLine, "<text" ) === FALSE ) {
		continue;
	}

	if( preg_match( '/<text\b[^>]*\/>/i', $fLine ) === 1 ) {
		// Self-closing element: the page is empty, there is no </text> to wait for.
		$articleText = '';
	} else {
		$aText = $fLine;
		// Compare against FALSE explicitly: "</text>" at offset 0 of a trimmed
		// line is a valid hit. Also stop at end of file instead of spinning.
		$closed = strpos( $fLine, "</text>" ) !== FALSE;
		while( !$closed && ( $fLine = fgets( $fIn ) ) !== FALSE ) {
			$fLine = trim( $fLine );
			// Joined with a space so tokens on adjacent lines do not fuse.
			$aText .= ' ' . $fLine;
			$closed = strpos( $fLine, "</text>" ) !== FALSE;
		}
		if( !$closed ) {
			echo " Unterminated text element, stopping.\n";
			break;
		}
		echo " End Found... ";
		$articleText = readrefExtractText( $aText );
	}

	echo " Got Text... ";
	if( !checkArticle( $articleText ) ) {
		echo " No problems!\n";
		$ok++;
	} else {
		echo " Problem!\n";
		fwrite( $fOut, readrefFormatTitle( $title, $useWikify ) );
		$prob++;
	}
	$num++;
}
$time_end = microtime(true);
$time = round($time_end - $time_start, 0);
// Statistics of the first pass, kept for the final summary.
$nodc = array(
	'time' => $time,
	'rps'  => readrefRatio( $num, $time ),
	'num'  => $num,
	'prob' => $prob,
	'ok'   => $ok,
);
fclose( $fOut );

// ---------------------------------------------------------------------------
// Pass 2 (optional, -d): fetch the live text of every flagged title and keep
// only those that still show the problem. The output file is rewritten.
// ---------------------------------------------------------------------------
if( $useDoubleCheck ) {
	echo "\n\nDouble-checking articles!\n\n";
	$articles = file( $argv[2], FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES );
	if( $articles === FALSE ) {
		$articles = array();
	}
	$fOut = fopen( $argv[2], "w" );
	if( $fOut === FALSE ) {
		die( "Unable to reopen output file for writing: $argv[2]\n" );
	}
	sort( $articles );
	$num = 0;
	$ok = 0;
	$prob = 0;
	$time_start = microtime(true);
	foreach( $articles as $article ) {
		$article = trim( $article );
		if( $article === '' ) {
			continue;
		}
		// Undo the wikified form written by pass 1 when -w was given.
		if( preg_match( '/^\* \[\[(.*)\]\]$/', $article, $mArticle ) === 1 ) {
			$article = $mArticle[1];
		}
		$num++;
		echo "$num [$ok / $prob]: $article :";
		$aText = GetPage($article);
		if( !checkArticle( $aText ) ) {
			echo " No problems!\n";
			$ok++;
		} else {
			echo " Problem!\n";
			fwrite( $fOut, readrefFormatTitle( $article, $useWikify ) );
			$prob++;
		}
	}
	fclose( $fOut );
	$time_end = microtime(true);
	$time = round($time_end - $time_start, 0);
	$rps = readrefRatio( $num, $time );
	$elim = $nodc['prob'] - $prob;
	$dcpct = round( readrefRatio( $elim, $nodc['prob'] ) * 100, 0 );
	echo "Processed $num in $time (sec) at about $rps checks per second, with double-checking enabled. DC eliminated $elim ($dcpct%) positives.\n";
}
echo "Processed $nodc[num] in $nodc[time] (sec) at about $nodc[rps] checks per second, with no double-checking.\nRun complete\n";
fclose( $fIn );

?>