User:PhiLiP/bird.py
外观
<nowiki>
#! -*- coding: utf-8 -*-
import urllib, urllib2, cookielib, re, json, subprocess, datetime, cgi, tempfile
# Taxonomic classification shared by every generated article, one rank per
# line as a ( Latin name, Chinese name ) pair.  To process another genus,
# edit these values; the genus' Latin name is also the name of the English
# Wikipedia category whose members are translated.
regnum_la, regnum_zh = u'Animalia', u'动物界'
phylum_la, phylum_zh = u'Chordata', u'脊索动物门'
classis_la, classis_zh = u'Aves', u'鸟纲'
ordo_la, ordo_zh = u'Passeriformes', u'雀形目'
familia_la, familia_zh = u'Campephagidae', u'山椒鸟科'
genus_la, genus_zh = u'Pericrocotus', u'山椒鸟属'
# Ordered English -> Chinese rewrite rules applied by do_translate() to the
# part of the article that precedes the references heading.
#
# Each entry is ( pattern, replacement ) or ( pattern, replacement, flags ).
# `pattern` is a regular expression; two-element entries are compiled with
# re.I, three-element entries with the given flags.  `replacement` is a
# re.sub() template, so '\\1' / '\\2' refer to captured groups.
#
# ORDER MATTERS: the rules run top to bottom on the already-rewritten text,
# so a longer name must come before any shorter name it contains
# ( 'Papua New Guinea' -> 'New Guinea' -> 'Guinea', 'Nigeria' -> 'Niger' ).
replacements = (
    ( u'<!-- This article was auto-generated by \[\[User:Polbot\]\]. -->', u'' ),
    # The family name comes from familia_la so that this rule follows the
    # configuration above instead of being tied to one family.
    ( u'\s*is a species of [^\w]*bird[^\w]* in the [^\w]*%s[^\w]* family.' % re.escape( familia_la ),
      u'是[[%s]][[%s]]的一种,' % ( familia_zh, genus_zh ) ),
    ( u'\s*It is found in ', u'分布于' ),

    # --- Countries and regions ---
    ( u'New Caledonia', u'新喀里多尼亚' ),
    ( u'Solomon Islands', u'所罗门群岛' ),
    ( u'Vanuatu', u'瓦努阿图' ),
    ( u'Norfolk Island', u'诺福克岛' ),
    ( u'Brunei', u'文莱' ),
    ( u'India', u'印度' ),
    ( u'Indonesia', u'印度尼西亚' ),
    ( u'Malaysia', u'马来西亚' ),
    ( u'Philippines', u'菲律宾' ),
    ( u'Singapore', u'新加坡' ),
    ( u'Thailand', u'泰国' ),
    ( u'Australia', u'澳大利亚' ),
    ( u'Papua New Guinea', u'巴布亚新几内亚' ),
    ( u'New Guinea', u'新几内亚' ),
    ( u'Bismarck Archipelago', u'俾斯麦群岛' ),
    ( u'Samoa', u'萨摩亚' ),
    ( u'East Timor', u'东帝汶' ),
    ( u'Angola', u'安哥拉' ),
    ( u'Myanmar', u'缅甸' ),
    ( u'Benin', u'贝宁' ),
    ( u'Cameroon', u'喀麦隆' ),
    ( u'Central African Republic', u'中非共和国' ),
    ( u'Democratic Republic of the Congo', u'刚果民主共和国' ),
    ( u'Republic of the Congo', u'刚果共和国' ),
    ( u'Ivory Coast', u'科特迪瓦' ),
    ( u'Equatorial Guinea', u'赤道几内亚' ),
    ( u'Botswana', u'博茨瓦纳' ),
    ( u'Burundi', u'布隆迪' ),
    ( u'Ethiopia', u'埃塞俄比亚' ),
    ( u'Malawi', u'马拉维' ),
    ( u'Burkina Faso', u'布基纳法索' ),
    ( u'Chad', u'乍得' ),
    ( u'Eritrea', u'厄立特里亚' ),
    ( u'Gambia', u'冈比亚' ),
    ( u'Mozambique', u'莫桑比克' ),
    ( u'Namibia', u'纳米比亚' ),
    ( u'Rwanda', u'卢旺达' ),
    ( u'Somalia', u'索马里' ),
    ( u'Japan', u'日本' ),
    ( u'South Africa', u'南非' ),
    ( u'Swaziland', u'斯威士兰' ),
    ( u'Zimbabwe', u'津巴布韦' ),
    ( u'Gabon', u'加蓬' ),
    ( u'Ghana', u'加纳' ),
    ( u'Guinea-Bissau', u'几内亚比绍' ),
    ( u'Guinea', u'几内亚' ),
    ( u'Kenya', u'肯尼亚' ),
    ( u'Liberia', u'利比里亚' ),
    ( u'Mali', u'马里' ),
    # Matching is case-insensitive, so the 'Mali' rule above also rewrites
    # the 'mali' inside 'Animalia'; this rule restores the kingdom name.
    ( u'Ani马里a', u'Animalia' ),
    ( u'Nigeria', u'尼日利亚' ),
    ( u'Sierra Leone', u'塞拉利昂' ),
    ( u'Sudan', u'苏丹' ),
    ( u'Tanzania', u'坦桑尼亚' ),
    ( u'Togo', u'多哥' ),
    ( u'Uganda', u'乌干达' ),
    ( u'Zambia', u'赞比亚' ),
    ( u'Mauritania', u'毛里塔尼亚' ),
    ( u'Niger', u'尼日尔' ),
    ( u'Senegal', u'塞内加尔' ),
    # Blank entry to copy when adding another rule:
    #( u'', u'' ),

    # --- Habitat vocabulary ---
    ( u'coastal hinterland', u'沿海腹地' ),
    ( u'hinterland', u'腹地' ),
    ( u'subspecies', u'亚种' ),
    ( u'extinct', u'绝灭' ),
    ( u'Its natural (?:\[\[)?habitats?(?:\]\])?s? (?:are|is)', u'其自然[[栖息地]]为' ),
    ( u' ?subtropical or tropical', u'亚热带或热带的' ),
    ( u' ?moist', u'湿润' ),
    ( u' ?dry', u'干燥' ),
    ( u' ?(?:\[\[)?lowland(?:\]\])?', u'低地' ),
    # 'mangrove forest' must be handled before the generic 'forest' rule.
    ( u' ?(?:\[\[)?mangrove (?:\[\[)?forests?(?:\]\])?s?', u'[[红树林]]' ),
    # The two groups keep optional [[ ]] link brackets around the word.
    ( u' ?((?:\[\[)?)forests?((?:\]\])?)s?', u'\\1森林\\2' ),
    ( u' ?((?:\[\[)?)montanes?((?:\]\])?)s?', u'\\1山地\\2' ),
    ( u' ?((?:\[\[)?)plantations?((?:\]\])?)s?', u'\\1人造林\\2' ),
    ( u' ?((?:\[\[)?)swamps?((?:\]\])?)s?', u'\\1沼泽\\2' ),
    ( u' ?(?:\[\[)?shrublands?(?:\]\])?s?', u'{{link-en|疏灌丛|shrubland}}' ),
    ( u' ?((?:\[\[)?)savannas?((?:\]\])?)s?', u'\\1疏林草原\\2' ),

    # --- Taxonomic authorities (piped links) ---
    ( u'\[\[John Gould\|Gould\]\]', u'[[约翰·古尔德|Gould]]' ),
    ( u'\[\[Coenraad Jacob Temminck\|Temminck\]\]', u'[[康拉德·雅各·特明克|Temminck]]' ),
    ( u'\[\[Raymond Robert Forster\|Forster\]\]', u'{{link-en|雷蒙德·罗伯特·福斯特|Raymond Robert Forster|Forster}}' ),
    ( u'\[\[Emile Oustalet\|Oustalet\]\]', u'{{link-en|埃米尔·乌斯塔莱|Emile Oustalet|Oustalet}}' ),
    ( u'\[\[Louis Jean Pierre Vieillot\|Vieillot\]\]',
      u'{{link-en|路易·让·皮埃尔·维埃约|Louis Jean Pierre Vieillot|Vieillot}}' ),
    ( u'\[\[John Latham \(ornithologist\)\|Latham\]\]',
      u'{{link-en|约翰·莱瑟姆|John Latham (ornithologist)|Latham}}' ),

    # --- Stock sentences and list punctuation ---
    ( u'It is threatened by \[\[habitat loss\]\].', u'该物种受栖息地减少威胁。' ),
    ( u'\]\], \[\[', u']]、[[' ),
    ( u'\]\],? and \[\[', u']]和[[' ),
    # Catch-all for any ' and ' left over; must stay last.
    ( u' and ?', u'以及' ),
)
def get_chinese( text ):
    """Look up a name on cnbird.org.cn and return the first result found.

    text -- the search term; it is sent UTF-8 encoded as the `txt` query
            parameter.

    Returns the content of the second table cell of the first
    `<tr class=tr1>` row in the GBK-decoded response, or '' when the
    response contains no such row.
    """
    params = {
        'txt': text.encode( 'utf-8' ),
        's': 'Search' }
    fp = urllib.urlopen( 'http://www.cnbird.org.cn/world/result.asp?' + urllib.urlencode( params ) )
    try:
        data = fp.read().decode( 'gbk' )
    finally:
        # Release the connection even when reading or decoding fails.
        fp.close()
    found = re.search( u'<tr class=tr1><td>.+?</td><td>(.+?)</td><td>', data )
    if found:
        return found.group( 1 )
    return ''
def exists( title ):
    """Ask the Chinese Wikipedia API whether a page called `title` exists.

    title -- page title; it is sent UTF-8 encoded as the `titles` parameter.

    Returns False when the `pages` mapping of the API answer contains the
    key '-1' (how the answer marks a title it has no page for), True
    otherwise.
    """
    params = {
        'action': 'query',
        'titles': title.encode( 'U8' ),
        'format': 'json',
    }
    fp = urllib.urlopen( 'http://zh.wikipedia.org/w/api.php?' + urllib.urlencode( params ) )
    try:
        data = json.load( fp )
    finally:
        # Release the connection even when the answer is not valid JSON.
        fp.close()
    return '-1' not in data['query']['pages']
# Module-wide URL opener that stores cookies in `cj` and sends them back on
# later requests; update_iucnid() issues its request through it.
cj = cookielib.CookieJar()
_cookie_handler = urllib2.HTTPCookieProcessor( cj )
opener = urllib2.build_opener( _cookie_handler )
def update_iucnid( text ):
    """Search the IUCN Red List site for `text` and return the matching id.

    text -- the name to search for; it is sent UTF-8 encoded in the request
            body through the module-level cookie-aware `opener`.

    Returns the first run of digits found in the href of the first result
    link whose `class="sciname"` label starts with `text`, or '' when the
    response contains no such link.
    """
    params = {
        'mode': '',
        'text': text.encode( 'utf-8' ),
        'x': 15,
        'y': 5
    }
    fp = opener.open( 'http://www.iucnredlist.org/apps/redlist/search/external', urllib.urlencode( params ) )
    try:
        data = fp.read().decode( 'utf-8' )
    finally:
        # Release the connection even when reading or decoding fails.
        fp.close()
    # re.escape keeps the searched name literal: characters such as '.' or
    # '(' in it must not be interpreted as regular-expression syntax.
    found = re.search( u'<a href=".+?(\d+).+?class="sciname">%s' % re.escape( text ), data )
    if found:
        return found.group( 1 )
    return ''
def build_iucn( binomial ):
    """Build the {{IUCN2010}} citation template for a species.

    binomial -- scientific name; used both to look up the IUCN id and as the
                citation title.

    Returns the filled-in template, with today's UTC date (YYYY-MM-DD) as
    the download date, or '' when no IUCN id could be found.
    """
    iucnid = update_iucnid( binomial )
    if not iucnid:
        return ''
    today = datetime.datetime.utcnow()
    downloaded = '%4d-%02d-%02d' % ( today.year, today.month, today.day )
    template = u'{{IUCN2010 | assessors = BirdLife International | year = 2009 | id = %s | title = ' \
        + '%s | version = 2010.3 | downloaded = %s }}'
    return template % ( iucnid, binomial, downloaded )
def interwikis( m, english ):
    """Rebuild a run of interlanguage links with the English one added.

    m       -- regex match whose whole matched text holds the links, one per
               line.
    english -- title of the English article to link back to.

    Returns the replacement text: two newlines followed by the sorted links,
    one per line.  Blank lines and surrounding whitespace are dropped.
    """
    stripped = [ line.strip() for line in m.group( 0 ).split( '\n' ) ]
    links = [ link for link in stripped if link ]
    links.append( u'[[en:%s]]' % english )
    return u'\n\n' + u'\n'.join( sorted( links ) )
def gen():
    """Yield ( title, wikitext ) for the members of the genus category.

    Queries the English Wikipedia API for the pages in
    'Category:<genus_la>', ten category members per request, asking for the
    content of each page's current revision, and follows the API's
    continuation data until the listing is exhausted.
    """
    params = {
        'action': 'query',
        'generator': 'categorymembers',
        'gcmtitle': 'Category:%s' % genus_la,
        'gcmlimit': 10,
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json' }
    _p = params.copy()
    while 1:
        fp = urllib.urlopen( 'http://en.wikipedia.org/w/api.php?' + urllib.urlencode( _p ) )
        try:
            data = json.load( fp )
        finally:
            # Release the connection even when the answer is not valid JSON.
            fp.close()
        if 'query' in data:
            for _item in data['query']['pages'].itervalues():
                # A page can be listed without its text when the answer was
                # split; the text then arrives in a continuation batch.
                if 'revisions' not in _item:
                    continue
                yield ( _item['title'], _item['revisions'][0]['*'] )
        if 'continue' in data:
            # Current API format: repeat the original query together with
            # every continuation parameter the server handed back.
            _p = params.copy()
            _p.update( data['continue'] )
        elif 'query-continue' in data:
            # Legacy format, kept for servers that still answer with it.
            _p = params.copy()
            _p['gcmcontinue'] = data['query-continue']['categorymembers']['gcmcontinue']
        else:
            break
def do_translate( title, text ):
    """Turn the wikitext of an English bird article into a Chinese draft.

    title -- title of the English article (used for the [[en:...]] link).
    text  -- wikitext of the English article.

    Returns ( chinese, binomial, new_text ).  Returns ( '', '', '' ) when
    the text has no `binomial` taxobox field or when a page with the Chinese
    name already exists on Chinese Wikipedia.

    The Chinese name comes from get_chinese(); when that finds nothing the
    user is asked for it on the console (input is read as UTF-8, the same
    encoding the prompt is written in).
    """
    found = re.findall( u'\|\s*binomial\s*=\s*\'\'(.+?)\'\'', text )
    if not found:
        return ( '', '', '' )
    binomial = found[0]

    chinese = get_chinese( binomial )
    while not chinese:
        typed = raw_input( ( u'请输入%s的中文名称:' % binomial ).encode( 'U8' ) )
        # raw_input() yields bytes; decode them so that the name can be
        # mixed with the unicode article text and re-encoded for the API.
        chinese = typed.decode( 'U8' ).strip()
    if exists( chinese ):
        print( ( u'条目[[%s]]已存在,自动跳过' % chinese ).encode( 'U8' ) )
        return ( '', '', '' )

    # --- Names.  Callables are used as replacements so that the Chinese
    # name is inserted literally, whatever characters it contains. ---
    text = re.sub( u'(\|\s*name\s*=\s*)[^\|\n\r]*', lambda m: m.group( 1 ) + chinese, text )
    text = re.sub( u'(\|\s*species\s*=\s*\'\'\')', lambda m: m.group( 1 ) + chinese + u' ', text )
    text = re.sub( u'(The \'\'\').+?(\'\'\')', lambda m: m.group( 1 ) + chinese + m.group( 2 ), text )

    # --- Taxobox ranks: "[[Chinese name]] Latin name". ---
    text = re.sub( u'(\|\s*regnum\s*=\s*)[^\|\n\r]*', u'\\1[[%s]] %s' % ( regnum_zh, regnum_la ), text )
    text = re.sub( u'(\|\s*phylum\s*=\s*)[^\|\n\r]*', u'\\1[[%s]] %s' % ( phylum_zh, phylum_la ), text )
    text = re.sub( u'(\|\s*classis\s*=\s*)[^\|\n\r]*', u'\\1[[%s]] %s' % ( classis_zh, classis_la ), text )
    text = re.sub( u'(\|\s*ordo\s*=\s*)[^\|\n\r]*', u'\\1[[%s]] %s' % ( ordo_zh, ordo_la ), text )
    text = re.sub( u'(\|\s*familia\s*=\s*)[^\|\n\r]*', u'\\1[[%s]] %s' % ( familia_zh, familia_la ), text )
    text = re.sub( u'(\|\s*genus\s*=\s*)[^\|\n\r]*', u'\\1[[%s]] \'\'%s\'\'' % ( genus_zh, genus_la ), text )

    # --- Lead sentence: bold name followed by the scientific name. ---
    text = re.sub( u'The \'\'\'(.*?)\'\'\' \(\'\'\'(.*?)\'\'\'\)', u'\'\'\'\\1\'\'\'(学名:{{lang|la|\\2}})', text )
    text = re.sub( u'The \'\'\'(.*?)\'\'\' \((\'\'.*?\'\')\)',
        u'\'\'\'\\1\'\'\'(学名:{{lang|la|\\2}})是[[%s]][[%s]]的一种,' % ( familia_zh, genus_zh ), text )

    # --- Section headings, stub templates, categories, interwiki links. ---
    text = re.compile( u'==\s*Notes?\s*==', re.I ).sub( u'== 脚注 ==', text )
    text = re.compile( u'==\s*References?\s*==', re.I ).sub( u'\n== 参考文献 ==', text )
    text = re.compile( u'==\s*External links?\s*==', re.I ).sub( u'== 外部链接 ==', text )
    text = re.compile( u'\s*\{\{[^\}]+stub\}\}\s*', re.I ).sub( '', text )
    # re.escape: the genus name is data, not regular-expression syntax.
    text = re.compile( u'\[\[Category:%s\]\]' % re.escape( genus_la ), re.I ).sub( u'[[Category:%s]]' % genus_zh, text )
    text = re.compile( u'\[\[Category:Birds of .+?\]\]', re.I ).sub( u'', text )
    text = re.sub( u'(?:\s*\[\[[a-z]+:.+?\]\])+', lambda m: interwikis( m, title ), text )

    # Split at the references heading: the prose before it gets the
    # vocabulary replacements, the part after it the refreshed IUCN citation.
    parts = text.split( u'== 参考文献 ==' )
    if len( parts ) == 2:
        iucn = build_iucn( binomial )
        print( iucn )
        if iucn:
            # Every line mentioning "iucn" is replaced as a whole.
            # NOTE(review): the captured leading '*' is not re-inserted, so
            # a list bullet in front of the old citation is dropped --
            # confirm that this is intended.
            parts[1] = re.compile( u'((?:^\*)?).*iucn.*', re.I ).sub( lambda m: iucn, parts[1] )
    for rp in replacements:
        # A third tuple element overrides the default case-insensitive flag.
        flags = rp[2] if len( rp ) == 3 else re.I
        try:
            parts[0] = re.compile( rp[0], flags ).sub( rp[1], parts[0] )
        except Exception:
            # Show which rule is broken before letting the error propagate.
            print( rp )
            raise
    text = u'== 参考文献 =='.join( parts )
    return ( chinese, binomial, text )
def build_html( title, text ):
    """Build an HTML page that submits `text` to Chinese Wikipedia on load.

    title -- title of the page to edit (unicode).
    text  -- wikitext to place in the edit box (unicode).

    Returns the page as a UTF-8 byte string.  It holds a hidden form whose
    only field, `wpTextbox1`, carries the HTML-escaped text and which is
    posted to index.php?title=<title>&action=submit by an onload script.
    """
    title = title.encode( 'utf-8' )
    text = text.encode( 'utf-8' )
    query = urllib.urlencode( { 'title': title } )
    # <head> (not <header>, which is a body element) must hold the meta and
    # script tags; '&' inside the attribute value is written as '&amp;'.
    html = '''<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<script language="javascript">
window.onload = function() {
document.getElementById( 'wiki' ).submit();
}
</script>
</head>
<body>
<form style="display:none;" id="wiki" action="http://zh.wikipedia.org/w/index.php?%s&amp;action=submit" method="post">
<textarea name="wpTextbox1">%s</textarea>
</form>
</body>
</html>''' % ( query, cgi.escape( text ) )
    return html
def _previous_published():
    """Ask until the user answers y (go on, True) or n (abort, False)."""
    while True:
        answer = raw_input( '完成发表后请输入y,中止输入n:' ).lower()
        if answer == 'y':
            return True
        if answer == 'n':
            return False


def main():
    """Translate every article yielded by gen() and open each in Firefox.

    Each draft is written to a temporary HTML file that is handed to
    `firefox`.  Before the next draft is opened the user has to confirm that
    the previous one was published; answering n ends the run.  Articles for
    which do_translate() returns no binomial are skipped.
    """
    tmp = None
    for english, oritext in gen():
        chinese, binomial, midtext = do_translate( english, oritext )
        if binomial:
            html = build_html( chinese, midtext )
            if tmp:
                if not _previous_published():
                    return
                # Closing the wrapper disposes of the previous draft's file.
                tmp.close()
            tmp = tempfile.NamedTemporaryFile()
            tmp.file.write( html )
            # Close the underlying file so the content is on disk before
            # the browser is started.
            tmp.file.close()
            browser = subprocess.Popen( ['firefox', tmp.name], shell = False )
            browser.wait()


if __name__ == '__main__':
    main()
</nowiki>