Wikipedista:DaBlerBot/src
Vzhled
#!/usr/bin/php
<?php
/*
PHP-WikiBot
VERSION: 2009-04-02
AUTHOR: [[cs:User:DaBler]]
*/
/**
 * Small MediaWiki clean-up bot.
 *
 * Logs in through the HTML login form, reads a list of article titles from a
 * text file (one title per line), downloads the edit form of every article,
 * applies a set of regex-based wikitext fixes (headings, bullets, HTML
 * entities, English category prefix, a few spelling mistakes) and submits
 * the form back.
 *
 * NOTE(review): login and save go through the HTML forms and rely on markers
 * such as `var wgUserName = "...";` in the response; verify these against the
 * MediaWiki version actually being targeted.
 */
class WikiBot
{
    /** URL of the wiki's index.php entry point. */
    private $base;
    /** Account name used for logging in. */
    private $user;
    /** Password of that account. */
    private $pass;

    /** File in which cURL stores the session cookies between requests. */
    const COOKIEFILE = 'cookies.txt';
    /** Number of seconds to wait between two processed articles. */
    const SLEEP = 6;

    /**
     * @param string $base URL of index.php of the target wiki
     * @param string $user bot account name
     * @param string $pass bot account password
     */
    public function __construct($base='http://cs.wikipedia.org/w/index.php', $user='DaBlerBot', $pass='výchozí heslo')
    {
        $this->base = $base;
        $this->user = $user;
        $this->pass = $pass;
    }

    /**
     * Logs in and processes every article listed in $file.
     *
     * @param string $file path of a text file with one article title per line
     * @return bool FALSE as soon as login, loading or saving fails; TRUE when
     *              the whole list has been processed
     */
    public function run($file='cs.txt')
    {
        echo "Logging into '{$this->base}'...\n";
        if (FALSE === $this->login())
        {
            echo "Login fails, exiting...\n";
            return FALSE;
        }
        echo "Logged in as {$this->user}\n";
        echo "Processing articles in file '{$file}'...\n";

        foreach (self::read_list($file) as $name)
        {
            echo "Loading article '{$name}'...\n";
            $page = $this->edit($name);
            if (FALSE === $page)
            {
                echo "Edit fails, exiting...\n";
                return FALSE;
            }

            echo "Processing...\n";
            $new = self::process($page['text']);

            if ($new['text'] === $page['text'])
            {
                // Submitting identical text would only be a null edit.
                echo "Nothing to change, skipping post...\n";
            }
            else
            {
                $page['attr']['wpSummary'] = $new['summary'];
                $page['attr']['wpTextbox1'] = $new['text'];
                echo "Posting...\n";
                if (FALSE === $this->post($page, $name))
                {
                    echo "Post fails, exiting...\n";
                    return FALSE;
                }
            }

            echo "OK (sleeping for ".self::SLEEP." secs)...\n";
            sleep(self::SLEEP);
        }
        return TRUE;
    }

    /**
     * Reads the non-empty lines of $file, with line terminators removed.
     * Prints a warning and returns an empty list when the file cannot be read.
     */
    private static function read_list($file)
    {
        $list = array();
        $fp = is_readable($file) ? fopen($file, 'r') : FALSE;
        if (!$fp)
        {
            echo "Warning: no input\n";
            return $list;
        }
        // No length limit: a long title must not be split into two entries.
        while (FALSE !== ($line = fgets($fp)))
        {
            $line = str_replace(array("\n", "\r"), '', $line);
            // Strict comparison so that a title consisting of "0" is kept.
            if ('' !== $line)
                $list[] = $line;
        }
        fclose($fp);
        return $list;
    }

    /**
     * Performs one HTTP request and returns the response body.
     *
     * @param string $url     address to request
     * @param array  $options additional CURLOPT_* => value pairs
     * @return string|false response body, or FALSE when cURL fails
     */
    private function request($url, $options)
    {
        $c = curl_init();
        if (FALSE === $c)
            return FALSE;
        curl_setopt_array($c, array(CURLOPT_URL => $url, CURLOPT_RETURNTRANSFER => 1) + $options);
        $response = curl_exec($c);
        // Release the handle right away; the cookie jar is written on release.
        unset($c);
        return $response;
    }

    /**
     * Submits the login form and stores the session cookies in COOKIEFILE.
     *
     * @return bool TRUE when the response is empty or contains the
     *              `var wgUserName = "<user>";` marker, FALSE otherwise
     */
    private function login()
    {
        $fields = http_build_query(array(
            'wpName' => $this->user,
            'wpLoginattempt' => 'Log in',
            'wpPassword' => $this->pass,
            'wpRemember' => '1',
            'wpRetype' => '',
            'wpEmail' => '',
            'wpSkipCookieCheck' => '0'
        ), '', '&');

        $r = $this->request("{$this->base}?title=Special:Userlogin&action=submit", array(
            CURLOPT_COOKIEJAR => self::COOKIEFILE,
            CURLOPT_POST => 1,
            CURLOPT_POSTFIELDS => $fields
        ));
        if (FALSE === $r)
            return FALSE;
        return $r === "" || FALSE !== strpos($r, "var wgUserName = \"{$this->user}\";");
    }

    /**
     * Downloads the edit form of an article.
     *
     * @param string $name article title
     * @return array|false array('text' => wikitext, 'attr' => form fields to
     *                     send back), or FALSE when the request fails or the
     *                     response contains no textarea
     */
    private function edit($name)
    {
        $title = rawurlencode(str_replace(' ', '_', $name));
        $r = $this->request("{$this->base}?title={$title}&action=edit", array(
            CURLOPT_COOKIEFILE => self::COOKIEFILE
        ));
        if (FALSE === $r)
            return FALSE;

        // Wikitext: everything between the opening <textarea ...> tag and </textarea>.
        $start = strpos($r, '<textarea');
        $end = (FALSE === $start) ? FALSE : strpos($r, '</textarea>', $start);
        if (FALSE === $start || FALSE === $end)
        {
            echo "Error: No textarea element in the edit form\n";
            return FALSE;
        }
        $k = substr($r, $start, $end - $start);
        $k = substr($k, strpos($k, '>') + 1);
        $text = html_entity_decode($k, ENT_QUOTES, 'UTF-8');

        // Form fields: the page is parsed as XML, tag and attribute names come back upper-cased.
        xml_parse_into_struct(xml_parser_create('UTF-8'), $r, $val, $ind);
        if (!isset($ind['INPUT']))
        {
            echo "Error: No input element (MediaWiki has generated non-valid (X)HTML?)\n";
            exit(1);
        }
        $attr = array();
        foreach ($ind['INPUT'] as $num)
        {
            $a = isset($val[$num]['attributes']) ? $val[$num]['attributes'] : array();
            if (!isset($a['NAME']))
                continue;
            $attr[$a['NAME']] = isset($a['VALUE']) ? $a['VALUE'] : '';
        }
        // Drop the search box fields and the buttons that must not be "pressed" on save.
        foreach (array('search', 'go', 'fulltext', 'wpPreview', 'wpWatchthis', 'wpDiff', 'title') as $field)
            unset($attr[$field]);

        return array('text' => $text, 'attr' => $attr);
    }

    /**
     * preg_replace_callback handler for one heading line: strips bold/italic
     * markup, link brackets and a trailing colon from the heading text and
     * returns it surrounded by single spaces and the opening run of '='.
     */
    private static function fix_headings_cb($m)
    {
        $str = $m[2];
        // '''bold'''
        $str = preg_replace('/(.*?)\'\'\'(.*?)\'\'\'(.*?)/m', '\1\2\3', $str);
        // ''italic''
        $str = preg_replace('/(.*?)\'\'(.*?)\'\'(.*?)/m', '\1\2\3', $str);
        // [[target|label]] -> label
        $str = preg_replace('/(.*?)\[\[(([^\]]*?)\|)?(.*?)\]\](.*?)/m', '\1\4\5', $str);
        // surrounding spaces and trailing colon(s)
        $str = preg_replace('/^ *(.*?)( *:?)* *$/', '\1', $str);
        return "{$m[1]} {$str} {$m[1]}";
    }

    /**
     * Normalises headings in place.
     *
     * @param string $str wikitext, modified by reference
     * @return bool TRUE when the text was changed
     */
    private static function fix_headings(&$str)
    {
        $old = $str;
        // tidy heading lines
        $str = preg_replace_callback('/^(=+) *(.+?) *(=+) *\r?$/m', array(__CLASS__, 'fix_headings_cb'), $str);
        // replace heading titles
        $str = preg_replace('/^(=+) Viz též (=+)\r?$/m', '\1 Související články \2', $str);
        $str = preg_replace('/^(=+) Podívejte se také na (=+)\r?$/m', '\1 Související články \2', $str);
        // collapse runs of blank lines after a heading, put one newline in front of it
        $str = preg_replace('/^(=+ .+? =+)\n+$/m', "\\1\n", $str);
        $str = preg_replace('/^\n*(=+ .+? =+)$/m', "\n\\1", $str);
        return !($old == $str);
    }

    /**
     * Replaces the English "Category:" prefix by "Kategorie:" in place.
     *
     * @param string $str wikitext, modified by reference
     * @return bool TRUE when the text was changed
     */
    private static function fix_categories(&$str)
    {
        $old = $str;
        // categories written in English
        $str = preg_replace('/\[\[ *(:?) *Category *: *(.+?) *\]\]/im', '[[\1Kategorie:\2]]', $str);
        return !($old == $str);
    }

    /**
     * Normalises list items in place.
     *
     * @param string $str wikitext, modified by reference
     * @return bool TRUE when the text was changed
     */
    private static function fix_bullets(&$str)
    {
        $old = $str;
        // Exactly one space after the list markers, no trailing spaces.
        // Redirect lines start with '#' too; a space after it would break the redirect.
        $str = preg_replace('/^(?!#(?i:REDIRECT|PŘESMĚRUJ|přesměruj))([\*#;:]+) *(.*?) *\r?$/m', '\1 \2', $str);
        // drop a <br/> at the end of a list item
        $str = preg_replace('|^([\*#;:]+) (.*?) *<br */?'.'>\r?$|m', '\1 \2', $str);
        return !($old == $str);
    }

    /**
     * Replaces the HTML entities for en dash, em dash and ellipsis by the
     * characters themselves, in place.
     *
     * @param string $str wikitext, modified by reference
     * @return bool TRUE when the text was changed
     */
    private static function fix_entities(&$str)
    {
        $old = $str;
        $str = str_replace(
            array('&ndash;', '&mdash;', '&hellip;'),
            array('–', '—', '…'),
            $str
        );
        return !($old == $str);
    }

    /**
     * Corrects a fixed list of misspelled words in place. A word is replaced
     * only when it is delimited by a separator character or by the start/end
     * of a line.
     *
     * @param string $str wikitext, modified by reference
     * @return bool TRUE when the text was changed
     */
    private static function fix_orthography(&$str)
    {
        $old = $str;
        $sep = '[ ,\.;*#\-–—…&\(\)\/\[\]|\'{}=:<>?!"„“\n]';
        $replace = array(
            'vyjímka' => 'výjimka',
            'vyjímečný' => 'výjimečný',
            'vyjímečně' => 'výjimečně'
            // ...to be extended
        );
        foreach ($replace as $pattern => $replacement)
        {
            // /u: the separator class contains multi-byte characters.
            // The trailing separator is only looked at, not consumed, so that
            // it can also serve as the leading separator of the next word.
            $regex = "/(^|{$sep})" . preg_quote($pattern, '/') . "(?={$sep}|$)/mu";
            $fixed = preg_replace($regex, '${1}' . $replacement, $str);
            // NULL means PCRE failed (e.g. invalid UTF-8); keep the text as it was.
            if (NULL !== $fixed)
                $str = $fixed;
        }
        return !($old == $str);
    }

    /**
     * Applies all fixes to a page and builds the edit summary.
     * Terminates the script with status 1 when the result is empty.
     *
     * @param string $str original wikitext
     * @return array array('text' => fixed wikitext, 'summary' => edit summary)
     */
    private static function process($str)
    {
        $summary = "[[WP:WCW]]:";
        // Windows line ends to Unix ones; the heading rules count "\n" characters
        $str = preg_replace('/\r\n/', "\n", $str);
        if (self::fix_headings($str))
            $summary .= " opravy nadpisů,";
        if (self::fix_bullets($str))
            $summary .= " opravy odrážek,";
        if (self::fix_entities($str))
            $summary .= " náhrada HTML entit,";
        if (self::fix_categories($str))
            $summary .= " opravy kategorií v angličtině, ";
        if (self::fix_orthography($str))
            $summary .= " opravy pravopisu, ";
        if ('' === (string)$str)
        {
            echo "Error: Empty output!\n";
            exit(1);
        }
        $summary .= "…";
        return array('text' => $str, 'summary' => $summary);
    }

    /**
     * Submits the edit form of an article.
     *
     * @param array  $page result of edit() with wpTextbox1/wpSummary filled in
     * @param string $name article title
     * @return bool TRUE when the response body is empty, FALSE otherwise
     */
    private function post($page, $name)
    {
        $title = rawurlencode(str_replace(' ', '_', $name));
        $ret = $this->request("{$this->base}?title={$title}&action=submit", array(
            CURLOPT_COOKIEFILE => self::COOKIEFILE,
            CURLOPT_POST => 1,
            // An array makes cURL send the form as multipart/form-data.
            CURLOPT_POSTFIELDS => $page['attr'],
            // Suppress the "Expect: 100-continue" header cURL adds to large POSTs.
            CURLOPT_HTTPHEADER => array('Expect:')
        ));
        return '' === $ret;
    }
}
// Run the bot with its default configuration. run() returns FALSE when login,
// loading or saving fails; pass that on to the shell as a non-zero exit status.
$bot = new WikiBot();
exit(FALSE === $bot->run() ? 1 : 0);
?>