User:STcatBot/1.2

STcatBot1.2版源代码：直接从屏幕拷贝即可。
#!/usr/bin/perl
# STcatBot1.2.pl - Simplified and Traditional CATegorization roBOT
# By WikiPedia:User:下一次登录
# Portions largely taken or based on upload.pl by WikiPedia:User:Eloquence 
#  and mwpush.pl by WikiPedia:User:KeithTyler

# Tested on WindowsXP/Cygwin/ActivePerl
# Corresponding robot: User:STcatBot (application in progress)

# Disclaimer: No warranty is given. Use at your own risk. 

# call requirements
use Getopt::Std;
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTTP::Cookies;
#use warnings;

# Account and target wiki. $password is a placeholder that has to be
# replaced by the real password before the script is run.
my $username="STcatBot";
my $password="******";
my $WIKI_PATH="zh.wikipedia.org";
my $WIKI_PAGE;	# title of the page currently being processed, as it appears in URLs

### Login to wiki

# Set up connection data
# @ns_headers is a flat key/value list that is passed along with every request.
my $browser=LWP::UserAgent->new();
my @ns_headers = (
 'User-Agent' => 'STcatBot 1.2 by 下一次登录',  # a browser-like string was used here earlier
 'Accept' => 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*',
 'Accept-Charset' => 'iso-8859-1,*,utf-8',
 'Accept-Language' => 'en-US',
);

# Hold cookies (in memory) so that later requests reuse the login session
$browser->cookie_jar( {} );

# Make login request
$response=$browser->post("http://".$WIKI_PATH."/w/index.php?title=Special:Userlogin&action=submitlogin",
@ns_headers, Content=>[wpName=>$username,wpPassword=>$password,wpRemember=>"1",wpLoginAttempt=>"Log in"]);

# After logging in, we should be redirected to another page. 
# If we aren't, something is wrong: dump the server reply and stop.
if($response->code!=302) { #cannot login
        print
"We weren't able to login. This could have the following causes:

* The username ($username) or password may be incorrect.
  Solution: Re-run script with correct credentials.
* The MediaWiki software on the target host has been upgraded.
  Solution: Go to http://commons.wikimedia.org/wiki/Commons:File_upload_service
  and get a new version of the upload script.
* You are trying to hack this script for other wikis. The wiki you
  are uploading to has cookie check disabled.
  Solution: Try setting \$ignore_login_error to 1.

Regardless, we will now try to write the output from the server to 
rfget.debug.out....\n\n";
        # Lexical handle + three-argument open; report why the dump failed.
        my $debug_file="rfget.debug.out";
        open(my $debug_fh, '>', $debug_file)
                or die "Could not write file $debug_file: $!\n";
        print {$debug_fh} $response->as_string;
        # Buffered write errors only show up at close, so check it before
        # claiming success.
        close($debug_fh)
                or die "Could not write file $debug_file: $!\n";
        print
"This seems to have worked. Take a look at the file for further information or
send it to moeller AT scireview DOT de if you need help debugging the script.\n";
        exit 1;
}

# Working variables shared by the whole main loop. They are declared here
# once and re-assigned for every listing page / article.

# --- generic scratch variables for the index()/substr() based parsing ---
my $URL;                #address of the request being made
my $filename1;          #reply for the listing page; shrinks while it is parsed
my $filestartstr;       #first searching string
my $filestart;          #first string position
my $fileendstr;         #second searching string
my $fileend;            #second string position
my $filename;           #file name extracted

# --- per-article state ---
my $pagecontent;        #target page content
my $redcat;             #-1, no red cat; otherwise there is red cat
my @unicat;             #category titles as they appear in URLs
my @oricat;             #original catnames
my @tarcat;             #found targeted catnames if the cat exists, -1 otherwise
my $catlinecontent;     #one cat line content
my $catcount;           #number of red cats
my $probecatcontent;    #target cat content
my $emptyprobe='class="selected new"';  #text looked for in the probe reply; when present the category is treated as missing
my $oricattemp;         #temp string
my $catfound;           #is there any change?
my $stcatfound;         #is there any s/t cat?

# --- edit form ---
my $editToken;          #edit token
my $editTime;           #edit time stamp of the form (was used without a declaration)
my $content2;           #edit content
my $content1;           #reply content
my $special_char;       #illegal char
my $contain_char;       #is there any?

my $changemade=0;       #number of uploads made so far


# --- listing page state ---
my $article_count=0;    #number of articles in allpages
my @article_name;       #the characters of the article names for log
my @article_unicode;    #the article names as used in URLs, for connection
my $last_string;        #URL form of the last article of the last run (init="%21")
my $article_line;       #one article line in allpage content
my $article_ID;         #index of the article being processed


while(1) { #process

	#read last_string.txt and start allpages from that article
	# A missing or unreadable file simply means "start from the beginning"
	# ($last_string stays empty). The handle is lexical and closed again, so
	# it does not stay open across iterations.
	$last_string="";
	if(open(my $last_fh, '<', 'last_string.txt')) {
		while(my $saved_line=<$last_fh>) {
			$last_string.=$saved_line;
		}
		close($last_fh);
	}
	# A hand-edited file may end in a newline, which must not end up in the URL.
	$last_string=~s/\s+$//;

	#go to allpages and get the contents
	$URL="http://".$WIKI_PATH."/wiki/Special:Allpages/".$last_string;
	$response=$browser->get($URL, @ns_headers);
	if($response->is_error) {
		# Do not parse an error page as if it were a listing; try again later
		# and leave last_string.txt alone.
		print "\nCould not fetch the page list (".$response->status_line."). Retrying in 60 s.";
		sleep 60;
		next;
	}
	$filename1=$response->as_string;
	$article_count=0; #reset the article count

	#truncate the contents to the table that holds the article links
	$filestartstr="<table style=\"background: inherit;\" border=\"0\" width=\"100%\">";
	$fileendstr="<div class=\"printfooter\">";
	$filestart=index($filename1, $filestartstr);
	if($filestart<0) {
		# The table is not there, so the reply is not a listing we understand.
		print "\nPage list not recognised. Retrying in 60 s.";
		sleep 60;
		next;
	}
	# length() of the marker equals the former hard-coded offset of 60
	$filename1=substr($filename1, $filestart+length($filestartstr));

	#find the end point and cut (keep everything when the footer is missing)
	$fileend=index($filename1, $fileendstr);
	if($fileend>=0) {
		$filename1=substr($filename1, 0, $fileend);
	}
	
	#find all the article names without redirect
	# The listing is consumed one <td>...</td> cell at a time; $filename1
	# always holds the part that has not been looked at yet. For every cell
	# that is not a redirect, the href value goes to @article_unicode and the
	# title attribute to @article_name, both at index $article_count.
	# Cells without a link, or with an unterminated attribute, are skipped
	# instead of being recorded with a bogus name.
	$filestartstr="<td>";
	$fileendstr="</td>";
	while(1) {
		#extract a cell (between <td> </td>) and leave the rest in $filename1
		$filestart=index($filename1, $filestartstr);
		$fileend=index($filename1, $fileendstr);
		last if $fileend<=0;	#no further cell
		if($filestart>=0 && $filestart<$fileend) {
			$article_line=substr($filename1, $filestart+length($filestartstr),
				$fileend-$filestart-length($filestartstr));
		}
		else {
			$article_line="";	#stray </td>: nothing to look at
		}
		$filename1=substr($filename1, $fileend+length($fileendstr));

		#redirects are not processed
		next if index($article_line, "<div class=\"allpagesredirect\">")>=0;

		#extract the name as used in URLs (value of href after /wiki/)
		my $href_marker="<a href=\"/wiki/";
		my $href_at=index($article_line, $href_marker);
		next if $href_at<0;
		my $remainder=substr($article_line, $href_at+length($href_marker));
		my $quote_at=index($remainder, "\"");
		next if $quote_at<0;
		my $unicode_name=substr($remainder, 0, $quote_at);
		$remainder=substr($remainder, $quote_at+1);

		#extract the character name (value of the title attribute)
		my $title_marker="title=\"";
		my $title_at=index($remainder, $title_marker);
		next if $title_at<0;
		$remainder=substr($remainder, $title_at+length($title_marker));
		$quote_at=index($remainder, "\"");
		next if $quote_at<0;

		$article_unicode[$article_count]=$unicode_name;
		$article_name[$article_count]=substr($remainder, 0, $quote_at);
		$article_count+=1;
	}	#while
	
	$article_ID=0;

	while($article_ID<$article_count) { #go through all the pages and process
		sleep 1;

		# Nothing has been found for this article yet.
		($catfound, $stcatfound)=(0, 0);

		# Load the rendered article.
		$WIKI_PAGE=$article_unicode[$article_ID];
		$URL=join("", "http://", $WIKI_PATH, "/wiki/", $WIKI_PAGE);
		$response=$browser->get($URL, @ns_headers);
		$pagecontent=$response->as_string;
		print "\nConnected... ";

		# Look for the link that introduces the category box. $redcat ends up
		# negative when there is no box at all, or no "action=edit" link in it.
		$filestartstr="<a href=\"/wiki/Special:Categories\" title=\"Special:Categories\">";
		$redcat=index($pagecontent, $filestartstr);
		$catcount=0;

		unless($redcat<0) {
			# Keep only what follows the marker (62 characters long), up to
			# four characters before the next closing </div>.
			$pagecontent=substr($pagecontent, $redcat+62, 10000);
			$fileendstr="</div>";
			$fileend=index($pagecontent, $fileendstr);
			$pagecontent=substr($pagecontent, 0, $fileend-4);

			$filestartstr="action=edit";
			$redcat=index($pagecontent, $filestartstr);
			print "Cat found...  ";
		}

		if($redcat>=0) {
			# Remember in cat_log.txt which page is being worked on.
			open INPUT, ">>cat_log.txt";
			print INPUT $URL;
			print INPUT "\n";
			close INPUT;
			print "Redcat found...  ";
		}
		else {
			print "No redcat.";
		}
		
		# Helper: return the title parameter of one red category link, i.e.
		# the text between offset 28 and the "action=edit" parameter. The
		# separator in front of "action=edit" may be a raw "&" or "&amp;".
		# Returns undef when the link carries no "action=edit".
		# (A named sub is compiled once, wherever it is written.)
		sub _red_cat_unicode_title {
			my ($cat_link)=@_;
			my $title_from=28;	#presumably the length of <a href="/w/index.php?title=
			my $title_to=index($cat_link, "action=edit");
			return if $title_to<$title_from;
			if(substr($cat_link, $title_to-5, 5) eq "&amp;") {
				$title_to-=5;
			}
			elsif(substr($cat_link, $title_to-1, 1) eq "&") {
				$title_to-=1;
			}
			return substr($cat_link, $title_from, $title_to-$title_from);
		}

		# Walk through the category links left in $pagecontent. For every red
		# one (its link contains "action=edit") record the URL title in @unicat,
		# the displayed name in @oricat and, when the probe of its edit page
		# shows that a page exists, the title of that page in @tarcat
		# (-1 otherwise). $catcount counts the recorded red categories.
		while($redcat>=0) { #found red cat(s)
			#extract a cat line in content
			$filestartstr="<a href";
			$fileendstr="</a></span>";
			$filestart=index($pagecontent, $filestartstr);
			$fileend=index($pagecontent, $fileendstr);
			if($filestart<0 || $fileend<$filestart) {
				last;	#no complete link left; stop instead of cutting at -1
			}
			$catlinecontent=substr($pagecontent, $filestart, $fileend-$filestart);
			# +14: the closing tags plus, presumably, the separator between links
			$pagecontent=substr($pagecontent, $fileend+14, 10000);

			#is the cat red?
			$filestartstr="action=edit";
			if(index($catlinecontent, $filestartstr)>=0) { #if the cat is red...
				my $unicode_title=_red_cat_unicode_title($catlinecontent);

				#locate the displayed name inside the title attribute
				$filestartstr="title=\"Category:";
				$filestart=index($catlinecontent, $filestartstr);
				my $name_end=-1;
				if($filestart>=0) {
					$oricattemp=substr($catlinecontent, $filestart+16, 1000);
					$name_end=index($oricattemp, "\"");
				}

				if(defined($unicode_title) && $name_end>=0) {
					$unicat[$catcount]=$unicode_title;
					$oricat[$catcount]=substr($oricattemp, 0, $name_end);

					#does it have a simp/trad corresponding cat?
					$URL="http://".$WIKI_PATH."/w/index.php?title=".$unicat[$catcount]."&action=edit";
					sleep 1;
					$response=$browser->get($URL, @ns_headers);
					$probecatcontent=$response->as_string ;

					# Take the target name from <title>. The offset of 28
					# presumably skips "<title>" plus the fixed text in front
					# of the name - TODO confirm against the live page.
					$fileend=-1;
					if(index($probecatcontent, $emptyprobe)<0) { #if there is a corresponding cat...
						$filestartstr="<title>";
						$filestart=index($probecatcontent, $filestartstr);
						if($filestart>=0) {
							$probecatcontent=substr($probecatcontent, $filestart+28, 1000);
							$fileendstr=" - Wikipedia</title>";
							$fileend=index($probecatcontent, $fileendstr);
						}
					}

					if($fileend>0) {
						$tarcat[$catcount]=substr($probecatcontent, 0, $fileend);
						print "s/t  ";
						$stcatfound=1;
					}
					else {
						# Missing category, or a reply whose title could not be
						# read: never use a garbage name as replacement.
						$tarcat[$catcount]=-1;
						print "n/e  ";
					}

					#count the red cats
					$catcount+=1;
				}
			}
			$filestartstr="action=edit";
			$redcat=index($pagecontent, $filestartstr);
		}
		
		# Helper: replace every occurrence of $from in $text by $to and return
		# (new text, number of replacements). The search continues behind each
		# replacement, so it also ends when $to contains $from; nothing is done
		# when both are equal.
		# (A named sub is compiled once, wherever it is written.)
		sub _replace_category_prefix {
			my ($text, $from, $to)=@_;
			my $hits=0;
			return ($text, $hits) if $from eq "" || $from eq $to;
			my $at=index($text, $from);
			while($at>=0) {
				substr($text, $at, length($from))=$to;
				$hits+=1;
				$at=index($text, $from, $at+length($to));
			}
			return ($text, $hits);
		}

		if($catcount>0) { #if change needed, process the content
			#note the number of red cats in cat_log.txt
			if(open(my $log_fh, '>>', 'cat_log.txt')) {
				print {$log_fh} "Found ".$catcount." red cat(s).\n";
				close($log_fh);
			}

			#load the edit form of the article
			$URL="http://".$WIKI_PATH."/w/index.php?title=".$WIKI_PAGE."&action=edit";
			sleep 1;
			$response=$browser->get($URL, @ns_headers);
			$content1=$response->as_string;
			# Get EditToken and the edit time stamp from the form
			($editToken) = ( $content1 =~ m/value\=\"([0-9a-f]*)\" name\=\"wpEditToken\"/ );
			($editTime) = ( $content1 =~ m/value\=\"([0-9a-f]*)\" name\=\"wpEdittime\"/ );

			$filestartstr="<textarea tabindex='1' accesskey=\",\" name=\"wpTextbox1\" id=\"wpTextbox1\" rows='25'";
			$fileendstr="</textarea>";
			$filestart= index($content1, $filestartstr);
			$fileend= index($content1, $fileendstr);

			if($filestart<0 || $fileend<0 || !defined($editToken)) {
				# Not the edit form we know: make no change ($catfound stays 0)
				# rather than cutting the text at a position of -1.
				$content2="";
				print "Edit form not recognised...  ";
			}
			else {
				# 92 presumably covers the marker above plus the rest of the
				# opening tag - TODO confirm against the live form.
				$filestart+=92;
				$content2=substr($content1, $filestart, $fileend-$filestart);

				#substitute
				my $i=0;
				while($i<$catcount) {
					# "-1" marks a red cat without counterpart. The names are
					# strings, so compare as strings.
					if($tarcat[$i] ne "-1") {
						my $tarcatname="[Category:".$tarcat[$i];
						foreach my $oricatname ("[category:".$oricat[$i], "[Category:".$oricat[$i]) {
							my $hits;
							($content2, $hits)=_replace_category_prefix($content2, $oricatname, $tarcatname);
							$catfound=1 if $hits>0;
						}
					}
					$i+=1;
				}
			}
		}
		
		#check for illegal characters
		# $contain_char becomes the position of the first candidate found in
		# $content2 (candidates are tried in the order listed), or stays -1
		# when none of them occurs. The caller refuses to upload when it is
		# not negative.
		# NOTE(review): the text comes straight from the edit form, where
		# special characters presumably appear as entities; in that case the
		# "&" test is the one that fires.
		$contain_char=-1;
		foreach my $candidate ("\"", "<", ">", "&") {
			$special_char=$candidate;
			$contain_char=index($content2, $special_char);
			last if $contain_char>=0;
		}
		
		
		# Act on the result for this article:
		#  - a replacement was made and the text is clean: upload it;
		#  - a replacement was made but the text has an illegal character:
		#    list the article and its replacements in il_log.txt;
		#  - a counterpart exists but nothing was replaced: list the article
		#    in ns_log.txt.
		if($catfound==1) { #if there are changes to be made
			print "s/t cat found...  ";
			if($contain_char<0) { #if there is no illegal character, upload the new content
				print "Updating...  ";

				$response=$browser->post(
					"http://".$WIKI_PATH."/w/index.php?title=".$WIKI_PAGE."&action=submit",
					@ns_headers,
					Content_Type=>'form-data',
					Content=>[
						wpTextbox1 => $content2,
						wpSummary => "[[User:STcatBot|STcatBot]]: simp/trad catnames",
						wpSave => "Save page",
						wpSection => "",
						wpEdittime => $editTime,
						wpEditToken => $editToken,
					]);

				# Only a request that did not fail is logged and counted as a
				# change (4xx/5xx replies and connection failures are errors).
				my $upload_ok=!$response->is_error;
				my $log_line=$upload_ok
					? "Change made\n\n"
					: "Upload failed: ".$response->status_line."\n\n";
				if(open(my $log_fh, '>>', 'cat_log.txt')) {
					print {$log_fh} $log_line;
					close($log_fh);
				}

				# The message names the pause that is really taken.
				my $pause=10;
				if($upload_ok) {
					$changemade+=1;
					print "Change made. Sleep $pause s.";
				}
				else {
					print "Upload failed (".$response->status_line."). Sleep $pause s.";
				}
				sleep $pause;
			}
			else {
				print "Illegal char found.";
				#il_log.txt: *[[article]] ( original -> target ... )
				if(open(my $il_fh, '>>', 'il_log.txt')) {
					print {$il_fh} "*[[";
					print {$il_fh} $article_name[$article_ID];
					print {$il_fh} "]] ( ";
					my $i=0;
					while($i<$catcount) {
						# "-1" marks a red cat without counterpart (string compare,
						# the entries are names)
						if($tarcat[$i] ne "-1") {
							print {$il_fh} $oricat[$i]." -> ";
							print {$il_fh} $tarcat[$i]." ";
						} #if
						$i+=1;
					} #while
					print {$il_fh} ")\n";
					close($il_fh);
				}
			}
		}
		else { #if cannot make changes
			if($stcatfound==1) {
				print "No substritute found.";
				#ns_log.txt: *[[article]]
				if(open(my $ns_fh, '>>', 'ns_log.txt')) {
					print {$ns_fh} "*[[";
					print {$ns_fh} $article_name[$article_ID];
					print {$ns_fh} "]]\n";
					close($ns_fh);
				}
			}
		}
	
	$article_ID+=1;
	
	} #while ID<count
	
	#record last string.txt
	# The last article of this round is where the next round starts. When the
	# round listed nothing there is no such article: index -1 would pick a
	# left-over entry of an earlier round (or nothing at all), so the file is
	# left alone and the next attempt is delayed instead.
	if($article_count>0) {
		if(open(my $last_fh, '>', 'last_string.txt')) {
			print {$last_fh} $article_unicode[$article_count-1];
			close($last_fh) or print "\nCould not write last_string.txt: $!";
		}
		else {
			print "\nCould not write last_string.txt: $!";
		}
	}
	else {
		print "\nNo article listed. Retrying in 60 s.";
		sleep 60;
	}
	
} #while whole



#print "Done.\n";
#open(LOG,">STcatBot.log") or die "Could not write file.\n";
#print LOG $response->as_string;