Přeskočit na obsah

Wikipedista:DaBlerBot/src

Z Wikipedie, otevřené encyklopedie
(rozdíl) ← Starší revize | zobrazit aktuální verzi (rozdíl) | Novější revize → (rozdíl)
#!/usr/bin/php
<?php
/*
	PHP-WikiBot
	VERSION: 2009-04-02
	AUTHOR: [[cs:User:DaBler]]
*/

/**
 * Command-line bot that logs into a MediaWiki site through index.php,
 * downloads the edit form of every article listed in a text file, applies a
 * set of wikitext clean-ups (headings, list items, HTML entities, English
 * category links, a few Czech spelling fixes) and posts the result back.
 *
 * All HTTP traffic goes through cURL; the session cookie is kept in the file
 * named by COOKIEFILE (relative to the current working directory).
 */
class WikiBot
{
	/** URL of the wiki's index.php entry point. */
	private $base;
	/** Account name used for logging in. */
	private $user;
	/** Password used for logging in. */
	private $pass;

	/** File cURL uses to store and re-read the session cookies. */
	const COOKIEFILE = 'cookies.txt';
	/** Number of seconds to sleep after each processed article. */
	const SLEEP = 6;

	/**
	 * @param string $base URL of the wiki's index.php
	 * @param string $user account name
	 * @param string $pass account password
	 */
	public function __construct($base='http://cs.wikipedia.org/w/index.php', $user='DaBlerBot', $pass='výchozí heslo')
	{
		$this->base = $base;
		$this->user = $user;
		$this->pass = $pass;
	}

	/**
	 * Logs in and processes every article title listed in $file (one title
	 * per line).
	 *
	 * Articles for which no clean-up rule changed anything are not posted.
	 *
	 * @param string $file path of the title list
	 * @return bool FALSE as soon as login, loading or posting fails,
	 *              TRUE once the whole list has been processed
	 */
	public function run($file='cs.txt')
	{
		echo "Logging into '{$this->base}'...\n";
		if(FALSE === $this->login())
		{
			echo "Login fails, exiting...\n";
			return FALSE;
		}
		echo "Logged in as {$this->user}\n";

		echo "Processing articles in file '{$file}'...\n";

		foreach(self::read_titles($file) as $name)
		{
			echo "Loading article '{$name}'...\n";
			$page = $this->edit($name);
			if(FALSE === $page)
			{
				echo "Edit fails, exiting...\n";
				return FALSE;
			}

			echo "Processing...\n";
			$new = self::process($page['text']);

			if($new['changed'])
			{
				$page['attr']['wpSummary'] = $new['summary'];
				$page['attr']['wpTextbox1'] = $new['text'];

				echo "Posting...\n";
				if(FALSE === $this->post($page, $name))
				{
					echo "Post fails, exiting...\n";
					return FALSE;
				}

				echo "OK (sleeping for ".self::SLEEP." secs)...\n";
			}
			else
			{
				// Nothing was fixed, so there is nothing worth saving.
				echo "Nothing to fix, not posting (sleeping for ".self::SLEEP." secs)...\n";
			}

			sleep(self::SLEEP);
		}

		return TRUE;
	}

	/**
	 * Reads the list of article titles, one per line, skipping empty lines.
	 *
	 * Prints a warning and returns an empty list when the file cannot be read.
	 *
	 * @param string $file path of the title list
	 * @return array list of titles
	 */
	private static function read_titles($file)
	{
		$list = array();

		$fp = is_readable($file) ? fopen($file, 'r') : FALSE;
		if(!$fp)
		{
			echo "Warning: no input\n";
			return $list;
		}

		while(FALSE !== ($line = fgets($fp)))
		{
			$line = str_replace(array("\r", "\n"), '', $line);
			// Compare against '' explicitly: a plain truthiness test would
			// drop the perfectly valid title "0".
			if('' !== $line)
				$list[] = $line;
		}
		fclose($fp);

		return $list;
	}

	/**
	 * Submits the login form and stores the received cookies in COOKIEFILE.
	 *
	 * The login counts as successful when the response body is empty or when
	 * it contains the JavaScript variable wgUserName set to the account name.
	 * NOTE(review): the empty body is presumably the redirect MediaWiki sends
	 * after a successful login - confirm against the target wiki.
	 *
	 * @return bool TRUE on success, FALSE otherwise
	 */
	private function login()
	{
		$user = urlencode($this->user);
		$pass = urlencode($this->pass);
		$url = "{$this->base}?title=Special:Userlogin&action=submit";

		$c = curl_init();
		curl_setopt($c, CURLOPT_URL, $url);
		curl_setopt($c, CURLOPT_COOKIEJAR, self::COOKIEFILE);
		curl_setopt($c, CURLOPT_POST, 1);
		curl_setopt($c, CURLOPT_POSTFIELDS, "wpName={$user}&wpLoginattempt=Log+in&wpPassword={$pass}&wpRemember=1&wpRetype=&wpEmail=&wpSkipCookieCheck=0");
		curl_setopt($c, CURLOPT_RETURNTRANSFER, 1);
		$r = curl_exec($c);

		// curl_exec() yields FALSE when the transfer itself failed.
		if(!is_string($r))
			return FALSE;

		return '' === $r || FALSE !== strpos($r, "var wgUserName = \"{$this->user}\";");
	}

	/**
	 * Downloads the edit form of an article.
	 *
	 * Terminates the script with exit status 1 when the form contains no
	 * parsable input element.
	 *
	 * @param string $name article title
	 * @return array|bool array('text' => wikitext, 'attr' => form fields to
	 *                    send back), or FALSE when the page could not be
	 *                    fetched or contains no textarea
	 */
	private function edit($name)
	{
		$title = rawurlencode(str_replace(' ', '_', $name));
		$url = "{$this->base}?title={$title}&action=edit";

		$c = curl_init();
		curl_setopt($c, CURLOPT_URL, $url);
		curl_setopt($c, CURLOPT_COOKIEFILE, self::COOKIEFILE);
		curl_setopt($c, CURLOPT_RETURNTRANSFER, 1);
		$r = curl_exec($c);

		if(!is_string($r))
			return FALSE;

		$text = self::extract_textarea($r);
		if(FALSE === $text)
		{
			// Without this guard arbitrary page markup would be treated as
			// the article text and posted back.
			echo "Error: No textarea element in the edit form\n";
			return FALSE;
		}

		$attr = self::extract_inputs($r);
		if(FALSE === $attr)
		{
			echo "Error: No input element (MediaWiki has generated non-valid (X)HTML?)\n";
			exit(1);
		}

		return array('text' => $text, 'attr' => $attr);
	}

	/**
	 * Returns the entity-decoded content of the first textarea in $html.
	 *
	 * @param string $html page markup
	 * @return string|bool the decoded content, or FALSE when no complete
	 *                     textarea element is present
	 */
	private static function extract_textarea($html)
	{
		$start = strpos($html, '<textarea');
		if(FALSE === $start)
			return FALSE;

		$tag_end = strpos($html, '>', $start);
		$close = strpos($html, '</textarea>', $start);
		if(FALSE === $tag_end || FALSE === $close || $close < $tag_end)
			return FALSE;

		$raw = substr($html, $tag_end + 1, $close - $tag_end - 1);
		return html_entity_decode($raw, ENT_QUOTES, 'UTF-8');
	}

	/**
	 * Collects name => value pairs of the input elements found in $html.
	 *
	 * The page is run through the XML parser, so only inputs the parser
	 * reaches are seen. Fields belonging to the search box and the
	 * preview / diff / watch controls are dropped so that posting the result
	 * performs a plain save.
	 *
	 * @param string $html page markup
	 * @return array|bool map of field name => value, or FALSE when the
	 *                    parser found no input element at all
	 */
	private static function extract_inputs($html)
	{
		xml_parse_into_struct(xml_parser_create('UTF-8'), $html, $val, $ind);

		// The parser folds tag and attribute names to upper case.
		if(!isset($ind['INPUT']))
			return FALSE;

		$attr = array();
		foreach($ind['INPUT'] as $num)
		{
			$a = isset($val[$num]['attributes']) ? $val[$num]['attributes'] : array();

			// An input without a name cannot be submitted.
			if(!isset($a['NAME']) || '' === $a['NAME'])
				continue;

			$attr[$a['NAME']] = isset($a['VALUE']) ? $a['VALUE'] : '';
		}

		unset($attr['search']);
		unset($attr['go']);
		unset($attr['fulltext']);
		unset($attr['wpPreview']);
		unset($attr['wpWatchthis']);
		unset($attr['wpDiff']);
		unset($attr['title']);

		return $attr;
	}

	/**
	 * preg_replace() wrapper that never loses the subject.
	 *
	 * preg_replace() returns NULL on a PCRE error (backtrack limit, invalid
	 * UTF-8 with the u modifier, ...); in that case a warning is printed and
	 * the subject is returned unchanged.
	 *
	 * @param string $pattern
	 * @param string $replacement
	 * @param string $subject
	 * @return string
	 */
	private static function replace($pattern, $replacement, $subject)
	{
		$result = preg_replace($pattern, $replacement, $subject);
		if(NULL === $result)
		{
			echo "Warning: regular expression failed (PCRE error ".preg_last_error()."), text left unchanged\n";
			return $subject;
		}
		return $result;
	}

	/**
	 * Callback for fix_headings(): normalises a single heading line.
	 *
	 * Strips bold and italic markup, reduces wiki links to their visible
	 * text, trims surrounding spaces and trailing colons, and rebuilds the
	 * line with the opening run of '=' on both sides.
	 *
	 * @param array $m match: [1] opening '=' run, [2] heading text
	 * @return string the rewritten heading line
	 */
	private static function fix_headings_cb($m)
	{
		$str = $m[2];

		// '''bold'''
		$str = self::replace('/\'\'\'(.*?)\'\'\'/', '\1', $str);
		// ''italic''
		$str = self::replace('/\'\'(.*?)\'\'/', '\1', $str);
		// [[target|label]] and [[target]]
		$str = self::replace('/\[\[(?:[^\]]*?\|)?(.*?)\]\]/', '\1', $str);
		// leading spaces, trailing spaces and colons (plain trimming avoids
		// the nested-quantifier regex that could exhaust the backtrack limit)
		$str = rtrim(ltrim($str, ' '), ' :');

		return "{$m[1]} {$str} {$m[1]}";
	}

	/**
	 * Normalises headings in $str (modified in place).
	 *
	 * @param string $str wikitext using "\n" line endings
	 * @return bool TRUE when the text was changed
	 */
	private static function fix_headings(&$str)
	{
		$old = $str;

		// tidy headings
		$tidy = preg_replace_callback('/^(=+) *(.+?) *(=+) *\r?$/m', array(__CLASS__, 'fix_headings_cb'), $str);
		if(NULL !== $tidy)
			$str = $tidy;

		// heading text replacement
		$str = self::replace('/^(=+) (?:Viz též|Podívejte se také na) (=+)\r?$/m', '\1 Související články \2', $str);

		// collapse the newlines following a heading and make each heading
		// line preceded by exactly one newline of its own
		$str = self::replace('/^(=+ .+? =+)\n+$/m', "\\1\n", $str);
		$str = self::replace('/^\n*(=+ .+? =+)$/m', "\n\\1", $str);

		return $old !== $str;
	}

	/**
	 * Rewrites English "Category:" links to "Kategorie:" (modified in place).
	 *
	 * @param string $str wikitext
	 * @return bool TRUE when the text was changed
	 */
	private static function fix_categories(&$str)
	{
		$old = $str;

		// categories written in English
		$str = self::replace('/\[\[ *(:?) *Category *: *(.+?) *\]\]/im', '[[\1Kategorie:\2]]', $str);

		return $old !== $str;
	}

	/**
	 * Normalises list item lines (modified in place): exactly one space
	 * after the list marker, no trailing spaces, no trailing line-break tag.
	 *
	 * Lines starting with the redirect keyword are left alone, because a
	 * space inserted after '#' would turn the redirect into a numbered list
	 * item.
	 * NOTE(review): '#PŘESMĚRUJ' is assumed to be the Czech alias of
	 * '#REDIRECT' - confirm against the wiki's magic words.
	 *
	 * @param string $str wikitext
	 * @return bool TRUE when the text was changed
	 */
	private static function fix_bullets(&$str)
	{
		$old = $str;

		// trim spaces around list items
		$str = self::replace('/^(?!#(?:REDIRECT|PŘESMĚRUJ))([\*#;:]+) *(.*?) *\r?$/miu', '\1 \2', $str);
		// drop a line-break tag at the end of a list item
		$str = self::replace('|^([\*#;:]+) (.*?) *<br */?'.'>\r?$|m', '\1 \2', $str);

		return $old !== $str;
	}

	/**
	 * Replaces a few named HTML entities by the characters they stand for
	 * (modified in place).
	 *
	 * @param string $str wikitext
	 * @return bool TRUE when the text was changed
	 */
	private static function fix_entities(&$str)
	{
		$old = $str;

		// HTML entities
		$str = strtr($str, array(
			'&ndash;' => '–',
			'&mdash;' => '—',
			'&hellip;' => '…'
		));

		return $old !== $str;
	}

	/**
	 * Corrects known misspellings (modified in place).
	 *
	 * A word is replaced only when it is delimited on both sides by a
	 * separator character or by the start / end of the text. Lookarounds are
	 * used so that the delimiters are not consumed and two misspellings
	 * separated by a single character are both corrected.
	 *
	 * @param string $str wikitext
	 * @return bool TRUE when the text was changed
	 */
	private static function fix_orthography(&$str)
	{
		$old = $str;

		// characters accepted as word delimiters (content of a regex class)
		$sep = ' ,\.;*#\-–—…&\(\)\/\[\]|\'{}=:<>?!"„“\n';

		$replace = array(
			'vyjímka' => 'výjimka',
			'vyjímečný' => 'výjimečný',
			'vyjímečně' => 'výjimečně'
			// ...to be extended
		);

		foreach($replace as $wrong => $right)
		{
			// "not preceded / followed by a non-separator"; the u modifier
			// makes the multi-byte delimiters match as whole characters.
			$pattern = '/(?<![^'.$sep.'])'.preg_quote($wrong, '/').'(?![^'.$sep.'])/u';
			$str = self::replace($pattern, $right, $str);
		}

		return $old !== $str;
	}

	/**
	 * Applies all clean-up rules to an article and builds the edit summary.
	 *
	 * Terminates the script with exit status 1 when the resulting text is
	 * empty.
	 *
	 * @param string $str wikitext of the article
	 * @return array array('text' => new wikitext, 'summary' => edit summary,
	 *               'changed' => TRUE when at least one rule changed the text)
	 */
	private static function process($str)
	{
		// Windows line endings to Unix ones; the heading rules match on "\n"
		$str = str_replace("\r\n", "\n", $str);

		$done = array();

		if(self::fix_headings($str))
			$done[] = 'opravy nadpisů';

		if(self::fix_bullets($str))
			$done[] = 'opravy odrážek';

		if(self::fix_entities($str))
			$done[] = 'náhrada HTML entit';

		if(self::fix_categories($str))
			$done[] = 'opravy kategorií v angličtině';

		if(self::fix_orthography($str))
			$done[] = 'opravy pravopisu';

		if('' === (string)$str)
		{
			echo "Error: Empty output!\n";
			exit(1);
		}

		$summary = '[[WP:WCW]]:';
		if(count($done) > 0)
			$summary .= ' '.implode(', ', $done);
		$summary .= '…';

		return array('text' => $str, 'summary' => $summary, 'changed' => count($done) > 0);
	}

	/**
	 * Submits the edit form of an article.
	 *
	 * The fields are sent as multipart/form-data (cURL does this when given
	 * an array). The post counts as successful when the response body is
	 * empty.
	 * NOTE(review): the empty body is presumably the redirect MediaWiki sends
	 * after a successful save - confirm against the target wiki.
	 *
	 * @param array  $page result of edit() with wpTextbox1 / wpSummary set
	 * @param string $name article title
	 * @return bool TRUE on success, FALSE otherwise
	 */
	private function post($page, $name)
	{
		$title = rawurlencode(str_replace(' ', '_', $name));
		$url = "{$this->base}?title={$title}&action=submit";

		$c = curl_init();
		curl_setopt($c, CURLOPT_URL, $url);
		curl_setopt($c, CURLOPT_COOKIEFILE, self::COOKIEFILE);
		curl_setopt($c, CURLOPT_POST, 1);
		// On PHP versions where the "@file" upload syntax is still active, a
		// field value starting with '@' (for example article text) would
		// make cURL upload a local file; switch that off where possible.
		if(defined('CURLOPT_SAFE_UPLOAD'))
			curl_setopt($c, CURLOPT_SAFE_UPLOAD, TRUE);
		curl_setopt($c, CURLOPT_POSTFIELDS, $page['attr']);
		// Empty "Expect:" header suppresses cURL's "Expect: 100-continue".
		curl_setopt($c, CURLOPT_HTTPHEADER, array('Expect:'));
		curl_setopt($c, CURLOPT_RETURNTRANSFER, 1);
		$ret = curl_exec($c);

		return '' === $ret;
	}

}

// Run the bot with its default settings. A failed run (run() returns FALSE)
// is reported through a non-zero exit status so that the calling shell or
// scheduler can detect it.
$bot = new WikiBot();
if(FALSE === $bot->run())
	exit(1);
?>