跳转到内容

User:PhiLiP/ZhConversion.py

维基百科,自由的百科全书

这是本页的一个历史版本，由PhiLiP（留言 | 贡献）在2008年6月24日 (二) 10:58编辑。这可能和当前版本存在着巨大的差异。

# -*- coding: utf-8  -*-
# file: ZhConversion.py

import re
import zipfile
import codecs
"""class getFile(object):
	def __init__(self, unihan_fname='Unihan.txt', zh-hans_fname='zh-hans.txt', zh-hant_fname='zh-hant.txt' \
				zh-cn_fname='zh-cn.txt', zh-tw_fname='zh-tw.txt', zh-sg_fname='zh-sg.txt', zh-hk_fname='zh-hk.txt'):"""

def getUnihan(unihan_fname='Unihan.zip'):
	"""Return the text of the ``Unihan.txt`` member of a zip archive.

	unihan_fname -- path of the zip archive to open; it must contain a
	member named ``Unihan.txt``.

	Returns the member's contents decoded as UTF-8 text, so the result can
	be split and searched with ordinary text strings. Errors raised by
	``zipfile`` for a missing archive or member propagate to the caller.
	"""
	unihanzipfile = zipfile.ZipFile(unihan_fname, 'r')
	# try/finally rather than ``with``: ZipFile only became a context
	# manager in Python 2.7, and the handle must be released even when
	# the read fails.
	try:
		data = unihanzipfile.read('Unihan.txt')
	finally:
		unihanzipfile.close()
	# Decode once at the I/O boundary; the Unihan data files are
	# distributed as UTF-8.
	return data.decode('utf-8')

def getConversionTable(ctable_fname):
	"""Read a conversion-table file and return its whole content as text.

	ctable_fname -- path of the file to read; it is decoded as UTF-8.

	Returns the decoded file content. I/O and decoding errors propagate to
	the caller.
	"""
	conversiontable = codecs.open(ctable_fname, 'r', 'utf-8')
	# Close the handle even if reading or decoding fails.
	try:
		return conversiontable.read()
	finally:
		conversiontable.close()

def getDictFromUnihan(variant):
	"""Build a character mapping from one variant field of Unihan.txt.

	variant -- name of the Unihan field to extract, e.g.
	``'kSimplifiedVariant'``.

	Every line of the form ``U+XXXX<TAB>variant<TAB>U+YYYY ...`` yields an
	entry mapping the character U+XXXX to the character U+YYYY. Only the
	first target on the line is used, and lines whose source and first
	target are identical are skipped. Returns the resulting dict.
	"""
	lines = getUnihan(unihan_fname='Unihan.zip').splitlines()
	marker = '\t' + variant + '\t'
	mapping = {}
	for line in lines:
		source, found, targets = line.partition(marker)
		if found != marker:
			# Not a line for the requested field.
			continue
		target = targets.split()[0]
		if source == target:
			continue
		# Both sides look like "U+4E1F": drop the "U+" and parse the hex.
		mapping[ucs4chr(int(source[2:], 16))] = ucs4chr(int(target[2:], 16))
	return mapping

def getDictFromConversionTable(to, variant):
	"""Apply the rules found in ``<variant>.txt`` on top of the dict ``to``.

	to      -- existing mapping; it is modified in place and also returned.
	variant -- base name of the table file; ``variant + '.txt'`` is read.

	Only text inside ``-{ ... }-`` blocks is considered. Within a block,
	each line containing ``=>`` is a rule: ``*`` characters are removed
	from the left side, anything after ``//`` on the right side is dropped,
	``;`` characters are removed, and both sides are stripped of
	surrounding whitespace.

	A rule mapping a string to itself removes an entry already present in
	``to`` for that string; every other rule is stored, overwriting any
	previous entry for the same left side.
	"""
	conversiontable = getConversionTable(ctable_fname = variant + '.txt')
	# Raw string: the pattern relies on backslash sequences that are not
	# valid escapes in an ordinary string literal.
	p = re.compile(r'-\{([\s\S]*?)\}-')
	for conversions in p.findall(conversiontable):
		for elem in conversions.splitlines():
			left, sep, right = elem.partition('=>')
			if not sep:
				# No "=>" on this line, so it is not a rule.
				continue
			left = left.replace('*','').strip()
			right = right.partition('//')[0].replace(';','').strip()
			if not left:
				# A rule with an empty source can never match any text;
				# skip it rather than store an empty key.
				continue
			if left == right and left in to:
				to.pop(left)
			else:
				to[left] = right
	return to

def toHansDict():
	"""Build the conversion table for the ``Zh-hans`` variant.

	Starts from the Unihan ``kSimplifiedVariant`` mappings and then applies
	the rules from ``Zh-hans.txt`` on top of them.
	"""
	return getDictFromConversionTable(
		getDictFromUnihan('kSimplifiedVariant'), 'Zh-hans')

def toHantDict():
	"""Build the conversion table for the ``Zh-hant`` variant.

	Starts from the Unihan ``kTraditionalVariant`` mappings and then applies
	the rules from ``Zh-hant.txt`` on top of them.
	"""
	return getDictFromConversionTable(
		getDictFromUnihan('kTraditionalVariant'), 'Zh-hant')

def toOtherDict(variant):
	"""Build a conversion table from ``<variant>.txt`` alone.

	variant -- base name of the table file, e.g. ``'Zh-tw'``.

	Unlike the Hans/Hant tables, this one starts from an empty dict, so it
	contains only the rules read from the table file.
	"""
	return getDictFromConversionTable({}, variant)

def getConversionCode(to):
	"""Render a mapping as the body lines of a PHP array literal.

	to -- dict mapping source strings to replacement strings.

	Returns one ``"key" => "value",`` line per entry, each terminated by a
	newline and ordered by key. An empty dict gives an empty string.

	Keys and values are written as PHP double-quoted strings, so
	backslashes, double quotes and dollar signs are backslash-escaped.
	Without this, an entry containing one of them would produce broken PHP
	or trigger variable interpolation.
	"""
	def quote(text):
		# Escape the backslash first so the backslashes added for the
		# other two characters are not doubled again.
		escaped = text.replace('\\', '\\\\').replace('"', '\\"').replace('$', '\\$')
		return '"' + escaped + '"'

	# Keys are unique, so sorting the (key, value) pairs orders by key.
	lines = [quote(left) + ' => ' + quote(right) + ',\n'
		for left, right in sorted(to.items())]
	return ''.join(lines)

def saveFile(toHant, toHans, toTW, toHK, toCN, toSG, out_fname='ZhConversion.php'):
	"""Write the six conversion tables to a PHP source file.

	toHant, toHans, toTW, toHK, toCN, toSG -- dicts rendered, in this
	order, as the PHP arrays ``$zh2Hant``, ``$zh2Hans``, ``$zh2TW``,
	``$zh2HK``, ``$zh2CN`` and ``$zh2SG``.
	out_fname -- path of the file to write (UTF-8); defaults to
	``ZhConversion.php`` in the current directory.

	The whole file content is built before the output file is opened, so a
	failure while rendering a table does not truncate an existing file.
	"""
	tables = (
		(u'zh2Hant', toHant),
		(u'zh2Hans', toHans),
		(u'zh2TW', toTW),
		(u'zh2HK', toHK),
		(u'zh2CN', toCN),
		(u'zh2SG', toSG),
	)
	sections = []
	for name, table in tables:
		sections.append(u'$' + name + u' = array(\n' + getConversionCode(table) + u');')
	CString = u'<?php\n' + u'\n\n'.join(sections)

	zhConversion = codecs.open(out_fname, 'w', 'utf-8')
	# Close the handle even if the write fails.
	try:
		zhConversion.write(CString)
	finally:
		zhConversion.close()

def ucs4chr(codepoint):
	"""Return the text string for a Unicode code point.

	codepoint -- integer code point.

	Returns a one-character string when the interpreter can represent the
	code point directly. When the interpreter rejects a code point in the
	range U+10000..U+10FFFF, the equivalent two-character surrogate pair is
	returned instead.

	Raises ValueError for a code point that is neither accepted by the
	interpreter nor in that range.
	"""
	try:
		to_char = unichr
	except NameError:
		# Python 3 has no ``unichr``; ``chr`` does the same job there.
		to_char = chr
	try:
		return to_char(codepoint)
	except ValueError:
		# Only supplementary-plane code points can be expressed as a
		# surrogate pair; anything else is a genuinely invalid argument.
		if not 0x10000 <= codepoint <= 0x10FFFF:
			raise
		hi, lo = divmod(codepoint - 0x10000, 0x400)
		return to_char(0xd800 + hi) + to_char(0xdc00 + lo)

def ucs4ord(str):
	"""Return the code point of a character given as text.

	str -- either a single character, or a two-character surrogate pair
	(high surrogate followed by low surrogate). The parameter name shadows
	the builtin but is kept so existing callers keep working.

	Returns the integer code point.

	Raises TypeError if the argument is neither a single character nor a
	valid surrogate pair.
	"""
	if len(str) == 1:
		return ord(str)
	if len(str) == 2:
		hi, lo = ord(str[0]) - 0xd800, ord(str[1]) - 0xdc00
		# Each half of a surrogate pair carries 10 bits; both offsets must
		# be in range, and the low half is part of the result.
		if 0 <= hi < 0x400 and 0 <= lo < 0x400:
			return 0x10000 + hi * 0x400 + lo
	raise TypeError("ucs4ord() expected a valid ucs4 character")

# Progress marker: starting to build the conversion tables.
print(1)
# Script-level tables: Hant/Hans start from Unihan data, the regional ones
# come from their table files only.
toHant = toHantDict()
toHans = toHansDict()
# Built in the order TW, HK, CN, SG.
toTW, toHK, toCN, toSG = map(toOtherDict, ('Zh-tw', 'Zh-hk', 'Zh-cn', 'Zh-sg'))
# Progress marker: all tables built, writing the PHP file next.
print(2)
saveFile(toHant, toHans, toTW, toHK, toCN, toSG)
# Progress marker: output written.
print(3)