User:STcatBot/1.2
外观
STcatBot1.2版源代码:直接从屏幕拷贝即可。
#!/usr/bin/perl
# STcatBot1.2.pl - Simplified and Traditional CATegorization roBOT
# By WikiPedia:User:下一次登录
# Portions largely taken or based on upload.pl by WikiPedia:User:Eloquence
# and mwpush.pl by WikiPedia:User:KeithTyler
# Tested on WindowsXP/Cygwin/ActivePerl
# Corresponding robot: User:STcatBot (application in progress)
# Disclaimer: No warranty guaranteed. Use at your own risk.
#
# What the script does:
#   1. Logs in to $WIKI_PATH and keeps the session cookies.
#   2. Fetches Special:Allpages starting at the name stored in
#      last_string.txt and collects every listed page that is not marked
#      as a redirect.
#   3. For each page, scans the category bar for links that contain
#      "action=edit" (red links).  Each such category is probed through
#      its edit URL; when the probe does not carry the "new page" tab
#      class, the title shown by the probe is taken as the replacement.
#   4. Rewrites [[Category:old]] links in the page text to the replacement
#      and saves the page, unless the text contains characters the bot
#      cannot round-trip safely.
#   5. Stores the last processed page name in last_string.txt and repeats
#      until the listing stops advancing.
#
# Files written: cat_log.txt, il_log.txt, ns_log.txt, last_string.txt,
# rfget.debug.out (login failure only); debug4.txt / debug5.txt when
# $DEBUG is set.

use strict;
use warnings;

# call requirements
use Getopt::Std;
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTTP::Cookies;

my $username  = "STcatBot";
my $password  = "******";    #I won’t tell you my password
my $WIKI_PATH = "zh.wikipedia.org";
my $WIKI_PAGE;

# Set to 1 to enable the extra diagnostics that were hard-wired off
# (if(0) { ... }) in the original source.
my $DEBUG = 0;

# Marker that the probe page shows when the probed category does not exist.
my $emptyprobe = "class=\"selected new\"";

# Number of pages saved during this run.
my $changemade = 0;

# Append $text to $file.  Logging is best effort: a failure is reported on
# STDERR but never stops the bot.
sub append_log {
    my ( $file, $text ) = @_;
    if ( open my $log, '>>', $file ) {
        print {$log} $text;
        close $log;
    }
    else {
        warn "Could not append to $file: $!\n";
    }
    return;
}

### Login to wiki
# Set up connection data
my $browser    = LWP::UserAgent->new();
my @ns_headers = (
    'User-Agent' => 'STcatBot 1.2 by 下一次登录',
    # Previously:
    # Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7) Gecko/20041107 Firefox/1.0
    'Accept' =>
      'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*',
    'Accept-Charset'  => 'iso-8859-1,*,utf-8',
    'Accept-Language' => 'en-US',
);

# Hold cookies
$browser->cookie_jar( {} );

# Make login request
my $response = $browser->post(
    "http://" . $WIKI_PATH
      . "/w/index.php?title=Special:Userlogin&action=submitlogin",
    @ns_headers,
    Content => [
        wpName         => $username,
        wpPassword     => $password,
        wpRemember     => "1",
        wpLoginAttempt => "Log in",
    ],
);

# After logging in, we should be redirected to another page.
# If we aren't, something is wrong.
if ( $response->code != 302 ) {
    # NOTE(review): this wording is inherited from upload.pl; the upload
    # service and \$ignore_login_error it mentions do not exist in this file.
    print "We weren't able to login. This could have the following causes:

* The username ($username) or password may be incorrect.
  Solution: Re-run script with correct credentials.
* The MediaWiki software on the target host has been upgraded.
  Solution: Go to http://commons.wikimedia.org/wiki/Commons:File_upload_service
  and get a new version of the upload script.
* You are trying to hack this script for other wikis. The wiki you
  are uploading to has cookie check disabled.
  Solution: Try setting \$ignore_login_error to 1.

Regardless, we will now try to write the output from the server to rfget.debug.out....\n\n";
    open( my $debug_out, '>', 'rfget.debug.out' )
      or die "Could not write file.\n";
    print {$debug_out} $response->as_string;
    print
"This seems to have worked. Take a look at the file for further information or send it to moeller AT scireview DOT de if you need help debugging the script.\n";
    close($debug_out);
    exit 1;
}

### Main loop: one Special:Allpages listing per iteration
BATCH:
while (1) {

    # Read last_string.txt and start allpages from that article.  A missing
    # file means "start from the beginning".
    my $last_string = "";
    if ( open my $state_in, '<', 'last_string.txt' ) {
        local $/;    # slurp the whole file
        my $saved = <$state_in>;
        $last_string = $saved if defined $saved;
        close $state_in;
    }
    $last_string =~ s/\s+\z//;    # a hand-edited file may end in a newline

    # Go to allpages and get the contents
    my $URL =
      "http://" . $WIKI_PATH . "/wiki/Special:Allpages/" . $last_string;
    $response = $browser->get( $URL, @ns_headers );
    my $listing = $response->as_string;

    # Keep only the part between the listing table and the print footer.
    my $table_marker =
      '<table style="background: inherit;" border="0" width="100%">';
    my $table_at = index( $listing, $table_marker );
    if ( $table_at < 0 ) {
        $listing = '';    # unexpected page: nothing to parse
    }
    else {
        $listing = substr( $listing, $table_at + length $table_marker );
        my $footer_at = index( $listing, '<div class="printfooter">' );
        $listing = substr( $listing, 0, $footer_at ) if $footer_at >= 0;
    }

    # Collect the article names, skipping redirects.  Both arrays are
    # rebuilt for every listing so no entry from an earlier batch survives.
    my @article_unicode;    # names as they appear in the href (for URLs)
    my @article_name;       # names as they appear in the title (for logs)

  CELL:
    while (1) {

        # Take the next <td>...</td> cell and drop it from $listing.
        my $cell_end = index( $listing, '</td>' );
        last CELL if $cell_end <= 0;
        my $cell_start = index( $listing, '<td>' ) + 4;
        my $article_line =
          substr( $listing, $cell_start, $cell_end - $cell_start );
        $listing = substr( $listing, $cell_end + 5 );

        # Redirect entries are not processed.
        next CELL
          if index( $article_line, '<div class="allpagesredirect">' ) >= 0;

        # Name as used in the link target.
        my $href_at = index( $article_line, '<a href="/wiki/' );
        next CELL if $href_at < 0;    # cell without an article link
        $article_line = substr( $article_line, $href_at + 15 );
        my $quote_at = index( $article_line, '"' );
        next CELL if $quote_at < 0;
        my $encoded_name = substr( $article_line, 0, $quote_at );
        $article_line = substr( $article_line, $quote_at + 1 );
        append_log( 'debug4.txt', $encoded_name . "\n" ) if $DEBUG;

        # Name as shown in the title attribute; fall back to the link
        # target when the attribute is missing.
        my $display_name = $encoded_name;
        my $title_at     = index( $article_line, 'title="' );
        if ( $title_at >= 0 ) {
            $article_line = substr( $article_line, $title_at + 7 );
            $quote_at     = index( $article_line, '"' );
            $display_name = substr( $article_line, 0, $quote_at )
              if $quote_at >= 0;
        }
        append_log( 'debug5.txt', $display_name . "\n" ) if $DEBUG;

        push @article_unicode, $encoded_name;
        push @article_name,    $display_name;
    }

    # Nothing parsed (end of the wiki, network error, changed markup):
    # stop instead of overwriting last_string.txt and refetching forever.
    if ( !@article_unicode ) {
        print "\nNo articles found in the listing. Stopping.\n";
        last BATCH;
    }

    # Go through all the pages and process
    for my $article_ID ( 0 .. $#article_unicode ) {
        sleep 1;
        my $catfound   = 0;    # was the page text actually changed?
        my $stcatfound = 0;    # did any red cat have a counterpart?
        my @unicat;            # red cat names as found in the link URL
        my @oricat;            # red cat names as found in the link title
        my @tarcat;            # replacement names, undef when none exists
        my $catcount = 0;      # number of red cats recorded
        my $content2;          # page text to upload
        my $editToken;
        my $editTime;

        # Go to the target page
        $WIKI_PAGE = $article_unicode[$article_ID];
        $URL       = "http://" . $WIKI_PATH . "/wiki/" . $WIKI_PAGE;
        $response  = $browser->get( $URL, @ns_headers );
        my $pagecontent = $response->as_string;
        print "\nConnected... ";

        # Locate the category bar and check it for red links.
        my $cat_marker =
'<a href="/wiki/Special:Categories" title="Special:Categories">';
        my $redcat = index( $pagecontent, $cat_marker );
        if ( $redcat < 0 ) {

            # No category bar at all.
            if ($DEBUG) {
                append_log( 'cat_log.txt', "No cat at all.\n\n" );
                print "No cat.";
            }
        }
        else {
            # Cut the page down to the category bar (up to its closing
            # </div>) and look for a red link inside it.
            $pagecontent =
              substr( $pagecontent, $redcat + length $cat_marker, 10000 );
            my $bar_end = index( $pagecontent, '</div>' );
            $pagecontent = substr( $pagecontent, 0, $bar_end - 4 );
            $redcat      = index( $pagecontent, 'action=edit' );
            print "Cat found... ";
        }

        if ( $redcat < 0 ) {
            append_log( 'cat_log.txt', "No red cat.\n\n" ) if $DEBUG;
            print "No redcat.";
        }
        else {
            # Record the target URL
            append_log( 'cat_log.txt', $URL . "\n" );
            print "Redcat found... ";
        }

        # Walk the category links one by one while a red link remains.
        while ( $redcat >= 0 ) {

            # Extract one category link and drop it from $pagecontent.
            my $link_start = index( $pagecontent, '<a href' );
            my $link_end   = index( $pagecontent, '</a></span>' );
            my $catlinecontent =
              substr( $pagecontent, $link_start, $link_end - $link_start );
            $pagecontent = substr( $pagecontent, $link_end + 14, 10000 );
            $pagecontent    = '' if !defined $pagecontent;
            $catlinecontent = '' if !defined $catlinecontent;

            # Is this category red?
            if ( index( $catlinecontent, 'action=edit' ) >= 0 ) {

                # The URL form of the name sits between the fixed 28-char
                # prefix '<a href="/w/index.php?title=' and the action
                # parameter.  Both the raw and the HTML-escaped separator
                # are accepted.
                # NOTE(review): which form the server emits is not visible
                # in this file.
                my $name_end = index( $catlinecontent, '&amp;action=edit' );
                $name_end = index( $catlinecontent, '&action=edit' )
                  if $name_end < 0;
                my $title_at = index( $catlinecontent, 'title="Category:' );

                if ( $name_end >= 28 && $title_at >= 0 ) {
                    $unicat[$catcount] =
                      substr( $catlinecontent, 28, $name_end - 28 );

                    # What follows the title marker is NAME">NAME, so the
                    # name is half of that length minus one.
                    my $oricattemp =
                      substr( $catlinecontent, $title_at + 16, 1000 );
                    $oricat[$catcount] =
                      substr( $oricattemp, 0, length($oricattemp) / 2 - 1 );

                    # Does it have a simp/trad corresponding cat?
                    $URL =
                        "http://" . $WIKI_PATH
                      . "/w/index.php?title="
                      . $unicat[$catcount]
                      . "&action=edit";
                    sleep 1;
                    $response = $browser->get( $URL, @ns_headers );
                    my $probecatcontent = $response->as_string;

                    # Accept a replacement only when the request succeeded,
                    # the "new page" marker is absent and the title could be
                    # cut out cleanly; a failed request must never be read
                    # as "the category exists".
                    my $target;
                    if (   $response->is_success
                        && index( $probecatcontent, $emptyprobe ) < 0 )
                    {
                        my $head_at = index( $probecatcontent, '<title>' );
                        if ( $head_at >= 0 ) {

                            # NOTE(review): 28 skips "<title>" plus a
                            # fixed-width prefix in front of the category
                            # name; the exact prefix is not visible here.
                            my $tail =
                              substr( $probecatcontent, $head_at + 28,
                                1000 );
                            if ( defined $tail ) {
                                my $tail_end =
                                  index( $tail, ' - Wikipedia</title>' );
                                $target = substr( $tail, 0, $tail_end )
                                  if $tail_end > 0;
                            }
                        }
                    }
                    $tarcat[$catcount] = $target;
                    if ( defined $target ) {
                        print "s/t ";
                        $stcatfound = 1;
                    }
                    else {
                        print "n/e ";
                    }

                    # Count the red cats
                    $catcount += 1;
                }
            }
            $redcat = index( $pagecontent, 'action=edit' );
        }

        if ( $catcount > 0 ) {

            # Change may be needed: fetch the edit form and rewrite it.
            append_log( 'cat_log.txt',
                "Found " . $catcount . " red cat(s).\n" );
            $URL =
                "http://" . $WIKI_PATH
              . "/w/index.php?title="
              . $WIKI_PAGE
              . "&action=edit";
            sleep 1;
            $response = $browser->get( $URL, @ns_headers );
            my $content1 = $response->as_string;

            # Get EditToken
            ($editToken) = ( $content1 =~
                  m/value\=\"([0-9a-f]*)\" name\=\"wpEditToken\"/ );
            ($editTime) = ( $content1 =~
                  m/value\=\"([0-9a-f]*)\" name\=\"wpEdittime\"/ );

            # The page text is taken 92 characters after the start of the
            # marker below, up to </textarea>.
            # NOTE(review): 92 is 12 more than the marker length, presumably
            # the rest of the opening tag - confirm against the edit form.
            my $textarea_marker =
q{<textarea tabindex='1' accesskey="," name="wpTextbox1" id="wpTextbox1" rows='25'};
            my $text_at  = index( $content1, $textarea_marker );
            my $text_end = index( $content1, '</textarea>' );

            # Only edit when the form was really there; otherwise unrelated
            # HTML would be uploaded as the page text.
            if (   $text_at >= 0
                && $text_end >= $text_at + 92
                && defined $editToken
                && defined $editTime )
            {
                $content2 = substr( $content1, $text_at + 92,
                    $text_end - $text_at - 92 );
                my $before = $content2;

                # Substitute every red cat that has a replacement.  The
                # look-ahead keeps longer names sharing the same prefix
                # untouched, and s///g never rescans its own output, so a
                # replacement starting with the old name cannot loop.
                for my $i ( 0 .. $catcount - 1 ) {
                    next if !defined $tarcat[$i];
                    my $old_name = quotemeta $oricat[$i];
                    my $new_link = "[Category:" . $tarcat[$i];
                    $content2 =~
                      s/\[[Cc]ategory:${old_name}(?=\s*[\]|])/$new_link/g;
                }
                $catfound = 1 if $content2 ne $before;
            }
        }

        if ( $catfound == 1 ) {

            # There are changes to be made
            print "s/t cat found... ";

            # The text was cut out of the HTML form and is uploaded
            # verbatim, so pages containing any of " < > & are skipped and
            # logged instead of being saved.
            if ( $content2 !~ /["<>&]/ ) {
                print "Updating... ";
                append_log( 'cat_log.txt', "Change made\n\n" );
                $response = $browser->post(
                    "http://" . $WIKI_PATH
                      . "/w/index.php?title="
                      . $WIKI_PAGE
                      . "&action=submit",
                    @ns_headers,
                    Content_Type => 'form-data',
                    Content      => [
                        wpTextbox1 => $content2,
                        wpSummary  =>
                          "[[User:STcatBot|STcatBot]]: simp/trad catnames",
                        wpSave      => "Save page",
                        wpSection   => "",
                        wpEdittime  => $editTime,
                        wpEditToken => $editToken,
                    ]
                );
                $changemade += 1;

                # The delay is 10 seconds; the message used to say 1 min.
                print "Change made. Sleep 10 sec.";
                sleep 10;
            }
            else {
                print "Illegal char found.";

                # List the page with its pending old -> new pairs so it can
                # be fixed by hand.
                my $entry = "*[[" . $article_name[$article_ID] . "]] ( ";
                for my $i ( 0 .. $catcount - 1 ) {
                    next if !defined $tarcat[$i];
                    $entry .= $oricat[$i] . " -> " . $tarcat[$i] . " ";
                }
                $entry .= ")\n";
                append_log( 'il_log.txt', $entry );
            }
        }
        elsif ( $stcatfound == 1 ) {

            # A replacement exists but nothing in the text matched.
            print "No substritute found.";
            append_log( 'ns_log.txt',
                "*[[" . $article_name[$article_ID] . "]]\n" );
        }
    }

    # Record where the next listing has to start.  Failing to save the
    # position would make the bot repeat this batch, so it is fatal.
    my $resume_from = $article_unicode[-1];
    open( my $state_out, '>', 'last_string.txt' )
      or die "Could not write last_string.txt: $!\n";
    print {$state_out} $resume_from;
    close($state_out)
      or die "Could not write last_string.txt: $!\n";

    # The listing did not advance: the end has been reached.
    last BATCH if $resume_from eq $last_string;
}

print "\nDone. $changemade change(s) made.\n";

#open(LOG,">STcatBot.log") or die "Could not write file.\n";
#print LOG $response->as_string;