# -*- coding: utf-8 -*-
#
# User:PhiLiP/bird.py

import urllib, urllib2, cookielib, re, json, subprocess, datetime, cgi, tempfile

# Taxonomy shared by every article in the genus being translated.  Each rank
# has a Latin name (``*_la``) and a Chinese name (``*_zh``); do_translate()
# writes them into the taxobox and gen() uses ``genus_la`` as the name of the
# English Wikipedia category to enumerate.
regnum_la = u'Animalia'
regnum_zh = u'动物界'

phylum_la = u'Chordata'
phylum_zh = u'脊索动物门'

classis_la = u'Aves'
classis_zh = u'鸟纲'

ordo_la = u'Passeriformes'
ordo_zh = u'雀形目'

familia_la = u'Campephagidae'
familia_zh = u'山椒鸟科'

genus_la = u'Pericrocotus'
genus_zh = u'山椒鸟属'

# Ordered (pattern, replacement[, re flags]) rules that do_translate() applies
# to the article body.  Patterns are regular expressions compiled with re.I
# unless a third element supplies other flags.
#
# ORDER MATTERS: longer names must precede the names they contain
# ('Papua New Guinea' before 'New Guinea' before 'Guinea', 'Nigeria' before
# 'Niger', 'Somalia' before 'Mali'), and the 'Ani马里a' rule undoes the damage
# the 'Mali' rule does to the word 'Animalia'.
#
# NOTE(review): this file was recovered from a rendered wiki page.  Empty
# literals were restored as u''.  Replacement strings of the form
# u'X（英语：Y）' look like the rendered output of a wiki template rather than
# the template source -- confirm against the original script.
replacements = (
    (u'', u''),
    # The family name comes from familia_la so the rule follows the constants
    # above instead of silently going stale when they are changed.
    (u'\s*is a species of [^\w]*bird[^\w]* in the [^\w]*%s[^\w]* family.'
     % re.escape(familia_la),
     u'是%s %s的一种，' % (familia_zh, genus_zh)),
    (u'\s*It is found in ', u'分布于'),
    (u'New Caledonia', u'新喀里多尼亚'),
    (u'Solomon Islands', u'所罗门群岛'),
    (u'Vanuatu', u'瓦努阿图'),
    (u'Norfolk Island', u'诺福克岛'),
    (u'Brunei', u'文莱'),
    (u'India', u'印度'),
    (u'Indonesia', u'印度尼西亚'),
    (u'Malaysia', u'马来西亚'),
    (u'Philippines', u'菲律宾'),
    (u'Singapore', u'新加坡'),
    (u'Thailand', u'泰国'),
    (u'Australia', u'澳大利亚'),
    (u'Papua New Guinea', u'巴布亚新几内亚'),
    (u'New Guinea', u'新几内亚'),
    (u'Bismarck Archipelago', u'俾斯麦群岛'),
    (u'Samoa', u'萨摩亚'),
    (u'East Timor', u'东帝汶'),
    (u'Angola', u'安哥拉'),
    (u'Myanmar', u'缅甸'),
    (u'Benin', u'贝宁'),
    (u'Cameroon', u'喀麦隆'),
    (u'Central African Republic', u'中非共和国'),
    (u'Democratic Republic of the Congo', u'刚果民主共和国'),
    (u'Republic of the Congo', u'刚果共和国'),
    (u'Ivory Coast', u'科特迪瓦'),
    (u'Equatorial Guinea', u'赤道几内亚'),
    (u'Botswana', u'博茨瓦纳'),
    (u'Burundi', u'布隆迪'),
    (u'Ethiopia', u'埃塞俄比亚'),
    (u'Malawi', u'马拉维'),
    (u'Burkina Faso', u'布基纳法索'),
    (u'Chad', u'乍得'),
    (u'Eritrea', u'厄立特里亚'),
    (u'Gambia', u'冈比亚'),
    (u'Mozambique', u'莫桑比克'),
    (u'Namibia', u'纳米比亚'),
    (u'Rwanda', u'卢旺达'),
    (u'Somalia', u'索马里'),
    (u'Japan', u'日本'),
    (u'South Africa', u'南非'),
    (u'Swaziland', u'斯威士兰'),
    (u'Zimbabwe', u'津巴布韦'),
    (u'Gabon', u'加蓬'),
    (u'Ghana', u'加纳'),
    (u'Guinea-Bissau', u'几内亚比绍'),
    (u'Guinea', u'几内亚'),
    (u'Kenya', u'肯尼亚'),
    (u'Liberia', u'利比里亚'),
    (u'Mali', u'马里'),
    (u'Ani马里a', u'Animalia'),
    (u'Nigeria', u'尼日利亚'),
    (u'Sierra Leone', u'塞拉利昂'),
    (u'Sudan', u'苏丹'),
    (u'Tanzania', u'坦桑尼亚'),
    (u'Togo', u'多哥'),
    (u'Uganda', u'乌干达'),
    (u'Zambia', u'赞比亚'),
    (u'Mauritania', u'毛里塔尼亚'),
    (u'Niger', u'尼日尔'),
    (u'Senegal', u'塞内加尔'),
    # (u'', u''),  -- blank template entry left by the author
    (u'coastal hinterland', u'沿海腹地'),
    (u'hinterland', u'腹地'),
    (u'subspecies', u'亚种'),
    (u'extinct', u'绝灭'),
    (u'Its natural (?:\[\[)?habitats?(?:\]\])?s? (?:are|is)', u'其自然栖息地为'),
    (u' ?subtropical or tropical', u'亚热带或热带的'),
    (u' ?moist', u'湿润'),
    (u' ?dry', u'干燥'),
    (u' ?(?:\[\[)?lowland(?:\]\])?', u'低地'),
    (u' ?(?:\[\[)?mangrove (?:\[\[)?forests?(?:\]\])?s?', u'红树林'),
    (u' ?((?:\[\[)?)forests?((?:\]\])?)s?', u'\\1森林\\2'),
    (u' ?((?:\[\[)?)montanes?((?:\]\])?)s?', u'\\1山地\\2'),
    (u' ?((?:\[\[)?)plantations?((?:\]\])?)s?', u'\\1人造林\\2'),
    (u' ?((?:\[\[)?)swamps?((?:\]\])?)s?', u'\\1沼泽\\2'),
    (u' ?(?:\[\[)?shrublands?(?:\]\])?s?', u'疏灌丛（英语：shrubland）'),
    (u' ?((?:\[\[)?)savannas?((?:\]\])?)s?', u'\\1疏林草原\\2'),
    (u'\[\[John Gould\|Gould\]\]', u'Gould'),
    (u'\[\[Coenraad Jacob Temminck\|Temminck\]\]', u'Temminck'),
    (u'\[\[Raymond Robert Forster\|Forster\]\]', u'Forster（英语：Raymond Robert Forster）'),
    (u'\[\[Emile Oustalet\|Oustalet\]\]', u'Oustalet（英语：Emile Oustalet）'),
    (u'\[\[Louis Jean Pierre Vieillot\|Vieillot\]\]', u'Vieillot'),
    (u'\[\[John Latham \(ornithologist\)\|Latham\]\]', u'Latham'),
    (u'It is threatened by \[\[habitat loss\]\].', u'该物种受栖息地减少威胁。'),
    (u'\]\], \[\[', u']]、[['),
    (u'\]\],? and \[\[', u']]和[['),
    # Must stay last: it would otherwise break the ']] and [[' rule above.
    (u' and ?', u'以及'),
)

def get_chinese(text):
    """Look up a Chinese name for *text* on www.cnbird.org.cn.

    *text* is a unicode search term (do_translate() passes the binomial
    name).  The result page is fetched over HTTP, decoded as GBK and scanned
    with a regular expression.

    Returns the first captured match, or '' when nothing matched.
    """
    params = {'txt': text.encode('utf-8'), 's': 'Search'}
    fp = urllib.urlopen('http://www.cnbird.org.cn/world/result.asp?'
                        + urllib.urlencode(params))
    try:
        data = fp.read().decode('gbk')
    finally:
        # Release the connection even if reading or decoding fails.
        fp.close()

    # NOTE(review): this pattern is reproduced exactly as found, but the file
    # was recovered from a rendered wiki page and the HTML tags that anchored
    # the match appear to have been stripped.  As written it captures the
    # second character of the page.  Restore the original pattern before
    # relying on this function.
    ret = re.findall(u'.+?(.+?)', data)

    if ret:
        return ret[0]
    return ''

def exists(title):
    """Return True when a page named *title* exists on zh.wikipedia.org.

    *title* is a unicode page title.  The MediaWiki ``action=query`` API is
    asked about it; the page is treated as missing when the ``pages`` mapping
    of the response contains the key '-1'.
    """
    params = {
        'action': 'query',
        'titles': title.encode('U8'),
        'format': 'json',
    }
    fp = urllib.urlopen('http://zh.wikipedia.org/w/api.php?'
                        + urllib.urlencode(params))
    try:
        data = json.load(fp)
    finally:
        # Release the connection even if the response is not valid JSON.
        fp.close()
    return '-1' not in data['query']['pages']

# Cookie-aware opener shared by every IUCN request made by update_iucnid().
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))


def update_iucnid(text):
    """Search the IUCN Red List site for *text* and return its numeric id.

    *text* is a unicode scientific name.  The search form is POSTed through
    the module-level cookie-aware ``opener`` and the result page is scanned
    for a link whose ``class="sciname"`` text starts with *text*.

    Returns the first run of digits captured from that link's href as a
    unicode string, or '' when no such link is found.
    """
    # NOTE(review): 'mode' is restored as an empty string; the value was lost
    # when the file was recovered from a rendered wiki page.
    params = {'mode': '', 'text': text.encode('utf-8'), 'x': 15, 'y': 5}
    fp = opener.open('http://www.iucnredlist.org/apps/redlist/search/external',
                     urllib.urlencode(params))
    try:
        data = fp.read().decode('utf-8')
    finally:
        # Release the connection even if reading or decoding fails.
        fp.close()

    # The name is escaped so that characters such as '.' or '(' in it are
    # matched literally instead of being interpreted as regex syntax.
    ret = re.findall(u'<a href=".+?(\d+).+?class="sciname">%s' % re.escape(text),
                     data)
    if ret:
        return ret[0]
    return ''

def build_iucn(binomial):
    """Build the IUCN Red List citation text for the species *binomial*.

    Looks up the species id with update_iucnid() and, when one is found,
    formats the citation template with (id, binomial, today's UTC date as
    YYYY-MM-DD).

    Returns the citation as unicode, or '' when no id was found.
    """
    # NOTE(review): this template is reproduced from a rendered wiki page and
    # is NOT the original source text.  The wiki markup was expanded, leaving
    # only two %s placeholders for the three values supplied below, so the
    # formatting step raises TypeError until the original template is
    # restored.  Both pieces are unicode literals because concatenating a
    # unicode string with a non-ASCII byte string fails in Python 2.
    tpl = u'BirdLife International. ' \
        + u'%s. The IUCN Red List of Threatened Species 2009. [%s] （英语）. 参数|title=值左起第5位存在水平制表 (帮助); '

    iucnid = update_iucnid(binomial)
    if not iucnid:
        return ''

    utcnow = datetime.datetime.utcnow()
    utcnow = '%4d-%02d-%02d' % (utcnow.year, utcnow.month, utcnow.day)
    return tpl % (iucnid, binomial, utcnow)

def interwikis(m, english):
    """Rebuild a block of interlanguage links, adding the English article.

    *m* is a regex match whose whole text (``m.group(0)``) is a run of
    interwiki links, one per line; *english* is the English article title.
    Blank lines are dropped, surrounding whitespace is stripped, a link to
    *english* is added and the links are sorted.

    Returns the block as unicode: two newlines followed by one link per line.
    """
    links = [line.strip() for line in m.group(0).split('\n')]
    links = [link for link in links if link]
    # NOTE(review): the link literal was stripped when this file was recovered
    # from a rendered wiki page (only ``u % english`` survived).  It is
    # restored here as an ``en:`` interlanguage link -- confirm.
    links.append(u'[[en:%s]]' % english)
    links.sort()
    return u'\n\n' + u'\n'.join(links)

def gen():
    """Yield (title, wikitext) for each page in the English genus category.

    Queries the en.wikipedia.org API with the ``categorymembers`` generator
    over ``Category:<genus_la>``, ten pages per request, asking for the
    content of each page's current revision.  Requests are repeated with the
    ``gcmcontinue`` value from the response until the response no longer
    carries a ``query-continue`` element.
    """
    params = {
        'action': 'query',
        'prop': 'revisions',
        'generator': 'categorymembers',
        # Encoded like the other request values so a non-ASCII genus name
        # does not break urlencode().
        'gcmtitle': ('Category:%s' % genus_la).encode('utf-8'),
        'rvprop': 'content',
        'format': 'json',
        'gcmlimit': 10,
    }

    _p = params.copy()

    while 1:
        fp = urllib.urlopen('http://en.wikipedia.org/w/api.php?'
                            + urllib.urlencode(_p))
        try:
            data = json.load(fp)
        finally:
            # Release the connection even if the response is not valid JSON.
            fp.close()

        if 'query' in data:
            for _item in data['query']['pages'].values():
                yield (_item['title'], _item['revisions'][0]['*'])

        # NOTE(review): the recovered source had no indentation.  The
        # continuation check is placed at loop level so that a response
        # without 'query' still ends the loop instead of repeating the same
        # request forever.
        if 'query-continue' in data:
            _p = params.copy()
            _p['gcmcontinue'] = data['query-continue']['categorymembers']['gcmcontinue']
        else:
            break

def do_translate(title, text):
    """Turn the English article *text* into a draft Chinese article.

    *title* is the English article title (used for the ``en:`` interwiki
    link) and *text* its wikitext.  The steps are:

    1. read the binomial name from the taxobox;
    2. obtain a Chinese name from get_chinese(), or ask the operator;
    3. skip the article if a page with that name already exists;
    4. rewrite the taxobox fields, lead sentence, section headings,
       stub templates, categories and interwiki links;
    5. replace the IUCN reference line after the references heading;
    6. apply the module-level ``replacements`` rules to everything before
       the references heading.

    Returns (chinese, binomial, new_text).  Returns ('', '', '') when the
    article has no binomial or the Chinese page already exists.

    NOTE(review): this function was recovered from a rendered wiki page.
    Literals that the rendering stripped are restored below and marked; wiki
    markup inside the surviving replacement strings (for example link
    brackets around the taxon names) may also have been lost.
    """
    skipped = ('', '', '')

    binomial = re.findall(u'\|\s*binomial\s*=\s*\'\'(.+?)\'\'', text)
    if not binomial:
        return skipped
    binomial = binomial[0]

    chinese = get_chinese(binomial)
    while not chinese:
        # raw_input() returns bytes.  Decode them (the prompt is written as
        # UTF-8, so the same encoding is assumed for the reply) so that the
        # name can be mixed into the unicode templates below.
        chinese = raw_input(
            (u'请输入%s的中文名称：' % binomial).encode('U8')).decode('U8').strip()

    if exists(chinese):
        # The source also carried a y/n confirmation loop after this return;
        # it could never run and has been dropped.
        print((u'条目%s已存在，自动跳过' % chinese).encode('U8'))
        return skipped

    # --- taxobox name, species line and lead sentence -----------------------
    text = re.sub(u'(\|\s*name\s*=\s*)[^\|\n\r]*', u'\\1%s' % chinese, text)
    text = re.sub(u'(\|\s*species\s*=\s*\'\'\')', u'\\1%s ' % chinese, text)
    text = re.sub(u'(The \'\'\').+?(\'\'\')', u'\\1%s\\2' % chinese, text)

    # --- taxobox ranks -------------------------------------------------------
    for field, zh, la in (
        (u'regnum', regnum_zh, regnum_la),
        (u'phylum', phylum_zh, phylum_la),
        (u'classis', classis_zh, classis_la),
        (u'ordo', ordo_zh, ordo_la),
        (u'familia', familia_zh, familia_la),
    ):
        text = re.sub(u'(\|\s*%s\s*=\s*)[^\|\n\r]*' % field,
                      u'\\1%s %s' % (zh, la), text)
    # The genus keeps its Latin name in italics.
    text = re.sub(u'(\|\s*genus\s*=\s*)[^\|\n\r]*',
                  u'\\1%s \'\'%s\'\'' % (genus_zh, genus_la), text)

    # --- lead sentence: "The '''X''' (''Y'')" -> "'''X'''（学名：''Y''）" ------
    text = re.sub(u'The \'\'\'(.*?)\'\'\' \(\'\'\'(.*?)\'\'\'\)',
                  u'\'\'\'\\1\'\'\'（学名：\\2）', text)
    text = re.sub(u'The \'\'\'(.*?)\'\'\' \((\'\'.*?\'\')\)',
                  u'\'\'\'\\1\'\'\'（学名：\\2）是%s %s的一种，' % (familia_zh, genus_zh),
                  text)

    # --- section headings ----------------------------------------------------
    text = re.compile(u'==\s*Notes?\s*==', re.I).sub(u'== 脚注 ==', text)
    text = re.compile(u'==\s*References?\s*==', re.I).sub(u'\n== 参考文献 ==', text)
    text = re.compile(u'==\s*External links?\s*==', re.I).sub(u'== 外部链接 ==', text)

    # --- stubs, categories, interwiki links ---------------------------------
    text = re.compile(u'\s*\{\{[^\}]+stub\}\}\s*', re.I).sub('', text)
    # NOTE(review): the replacement literal was stripped by the wiki
    # rendering; restored as a category link on the Chinese genus name.
    text = re.compile(u'\[\[Category:%s\]\]' % re.escape(genus_la), re.I).sub(
        u'[[Category:%s]]' % genus_zh, text)
    # NOTE(review): replacement restored as the empty string (regional
    # categories are removed) -- confirm nothing else was intended.
    text = re.compile(u'\[\[Category:Birds of .+?\]\]', re.I).sub(u'', text)
    text = re.sub(u'(?:\s*\[\[[a-z]+:.+?\]\])+',
                  lambda m: interwikis(m, title), text)

    # --- references and body translation ------------------------------------
    parts = text.split(u'== 参考文献 ==')

    if len(parts) == 2:
        iucn = build_iucn(binomial)
        print(iucn.encode('U8'))
        if iucn:
            # A callable replacement keeps backslashes in the citation from
            # being read as group references.
            parts[1] = re.compile(u'((?:^\*)?).*iucn.*', re.I).sub(
                lambda _m: iucn, parts[1])

    # Only the text before the references heading is translated.
    for rp in replacements:
        flags = rp[2] if len(rp) == 3 else re.I
        try:
            parts[0] = re.compile(rp[0], flags).sub(rp[1], parts[0])
        except Exception:
            # Show which rule failed before propagating the error.
            print(rp)
            raise

    return (chinese, binomial, u'== 参考文献 =='.join(parts))

def build_html(title, text):
    """Return an HTML page that auto-submits *text* to the zh edit form.

    *title* and *text* are unicode.  The page holds a hidden form whose
    action is the ``index.php?title=<title>&action=submit`` URL on
    zh.wikipedia.org and whose ``wpTextbox1`` textarea carries the
    HTML-escaped *text*; a script submits the form once the page has loaded.

    Returns the page as a UTF-8 encoded byte string.

    NOTE(review): the triple-quoted literal lost its delimiters and line
    breaks when this file was recovered from a rendered wiki page; the layout
    below is a reconstruction.  ``<header>`` was corrected to ``<head>``,
    which is the element that holds ``<meta>`` and ``<script>``.
    """
    title = title.encode('utf-8')
    text = text.encode('utf-8')
    query = urllib.urlencode({'title': title})
    html = '''<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<script language="javascript">
window.onload = function() {
    document.getElementById( 'wiki' ).submit();
}
</script>
</head>
<body>
<form style="display:none;" id="wiki" action="http://zh.wikipedia.org/w/index.php?%s&action=submit" method="post">
<textarea name="wpTextbox1">%s</textarea>
</form>
</body>
</html>''' % (query, cgi.escape(text))
    return html

def main():
    """Translate every article from gen() and open each draft in Firefox.

    For each article that do_translate() does not skip, the auto-submitting
    HTML page from build_html() is written to a named temporary file and
    ``firefox`` is launched on it.  Before moving on to the next article the
    operator is asked to confirm: 'y' continues, 'n' stops the run.
    """
    tmp = None
    try:
        for (english, oritext) in gen():
            (chinese, binomial, midtext) = do_translate(english, oritext)
            if not binomial:
                continue

            html = build_html(chinese, midtext)

            if tmp is not None:
                # Wait for the operator before the previous draft's file is
                # removed: closing a NamedTemporaryFile deletes it.
                while 1:
                    result = raw_input('完成发表后请输入y，中止输入n：')
                    if result.lower() == 'y':
                        break
                    elif result.lower() == 'n':
                        return
                tmp.close()
                tmp = None

            tmp = tempfile.NamedTemporaryFile()
            tmp.write(html)
            # Flush so the browser sees the whole page when it opens the file.
            tmp.flush()
            x = subprocess.Popen(['firefox', tmp.name], shell=False)
            x.wait()
    finally:
        # Release the last temporary file on every exit path (normal end,
        # operator abort, or an exception).
        if tmp is not None:
            tmp.close()

# Run the translation loop only when this file is executed as a script.
if __name__ == '__main__':
    main()