# Wiki page: User:PhiLiP/ZhConversion.py (captured together with the page's "Appearance" sidebar label)
# -*- coding: utf-8 -*-
# file: ZhConversion.py
import re
import zipfile
import codecs
"""class getFile(object):
def __init__(self, unihan_fname='Unihan.txt', zh-hans_fname='zh-hans.txt', zh-hant_fname='zh-hant.txt' \
zh-cn_fname='zh-cn.txt', zh-tw_fname='zh-tw.txt', zh-sg_fname='zh-sg.txt', zh-hk_fname='zh-hk.txt'):"""
def getUnihan(unihan_fname='Unihan.zip'):
    """Return the contents of the ``Unihan.txt`` member of a zip archive.

    Args:
        unihan_fname: Path of the zip archive to open for reading.

    Returns:
        Whatever ``ZipFile.read('Unihan.txt')`` returns, i.e. the
        undecoded contents of that archive member.

    Raises:
        KeyError: If the archive has no member named ``Unihan.txt``.
    """
    unihanzipfile = zipfile.ZipFile(unihan_fname, 'r')
    try:
        return unihanzipfile.read('Unihan.txt')
    finally:
        # Release the archive handle even when the member is missing.
        unihanzipfile.close()
def getConversionTable(ctable_fname):
    """Read a conversion table file and return its text.

    Args:
        ctable_fname: Path of the file to read; it is decoded as UTF-8.

    Returns:
        The full decoded contents of the file.
    """
    # The context manager closes the file even if decoding fails midway.
    with codecs.open(ctable_fname, 'r', 'utf-8') as conversiontable:
        return conversiontable.read()
def getConversionTableDiff(ctablediff_fname):
    """Read the optional ``<name>.diff`` file that accompanies a table.

    Args:
        ctablediff_fname: Path of the conversion table; ``'.diff'`` is
            appended to it to obtain the file that is actually read.

    Returns:
        The UTF-8 decoded contents of the diff file, or ``False`` when
        opening or reading it raises ``IOError`` (for example because
        the file does not exist).
    """
    try:
        # ``with`` guarantees the handle is closed once the text is read.
        with codecs.open(ctablediff_fname + '.diff', 'r', 'utf-8') as conversiontablediff:
            return conversiontablediff.read()
    except IOError:
        # A missing diff is expected: it simply means "nothing to patch".
        return False
def patchConversionTable(orig, diff):
    """Apply a unified diff to a text and return the patched text.

    The diff is walked line by line while ``i`` tracks the current
    position in the list of lines being patched:

    * ``@@`` hunk headers reposition ``i`` to the start line given in the
      third whitespace-separated field (the ``+start,count`` part),
      converted to a 0-based index.
    * Lines starting with a space are context and just advance ``i``.
    * ``+`` lines (other than the ``+++`` file header) are inserted at
      ``i``.
    * ``-`` lines (other than the ``---`` file header) remove the line at
      ``i``; the removed text is not compared against the diff.

    Any other diff line is ignored.

    Args:
        orig: The text to patch.
        diff: The unified diff to apply.

    Returns:
        The patched lines joined with ``'\\n'`` (no trailing newline is
        added).
    """
    origlines = orig.splitlines()
    i = 0
    for diffline in diff.splitlines():
        if diffline.startswith(u'@@'):
            i = int(diffline.split()[2].split(',')[0]) - 1
        elif diffline.startswith(u' '):
            i += 1
        elif diffline.startswith(u'+') and not diffline.startswith(u'+++'):
            # Drop only the one-character diff marker. Stripping every
            # '+' would also eat plus signs that belong to the added
            # text itself (e.g. "C++").
            origlines.insert(i, diffline[1:])
            i += 1
        elif diffline.startswith(u'-') and not diffline.startswith(u'---'):
            origlines.pop(i)
    return '\n'.join(origlines)
def getDictFromUnihan(variant):
    """Build a character mapping from one field of the Unihan database.

    Every line of the data returned by ``getUnihan`` that contains
    ``'\\t' + variant + '\\t'`` is used: the text before that marker is
    the source, and the first whitespace-separated token after it is the
    target. Both are parsed as hexadecimal after skipping their first
    two characters (presumably the ``U+`` prefix -- confirm against the
    data file) and turned into characters with ``ucs4chr``. Lines whose
    source and target are identical are skipped.

    Args:
        variant: Name of the Unihan field to extract.

    Returns:
        A dict mapping each source character to its target character.
    """
    records = getUnihan(unihan_fname='Unihan.zip').splitlines()
    mapping = {}
    marker = '\t' + variant + '\t'
    for record in records:
        source, found, targets = record.partition(marker)
        if found != marker:
            # The requested field does not occur on this line.
            continue
        target = targets.split()[0]
        if source == target:
            continue
        mapping[ucs4chr(int(source[2:], 16))] = ucs4chr(int(target[2:], 16))
    return mapping
def getDictFromConversionTable(to, variant):
    """Merge the rules of a conversion table file into a mapping.

    The table named ``variant`` is read with ``getConversionTable``. If
    ``getConversionTableDiff`` returns a non-empty diff for it, the diff
    is applied with ``patchConversionTable`` and the patched text is
    written out through ``saveConversionTable``.

    Rules are taken from every ``-{ ... }-`` section of the table. Each
    line containing ``=>`` is one rule: on the left side ``*`` is removed,
    on the right side everything from ``//`` onwards and every ``;`` is
    removed, and both sides are stripped of surrounding whitespace.

    A rule whose two sides are equal deletes an existing entry for that
    key; every other rule (including an identity rule for a key that is
    not present yet) sets ``to[left] = right``.

    Args:
        to: The dict to update; it is modified in place.
        variant: File name of the conversion table.

    Returns:
        The same ``to`` dict that was passed in.
    """
    conversiontable = getConversionTable(ctable_fname=variant)
    conversiontablediff = getConversionTableDiff(ctablediff_fname=variant)
    if conversiontablediff:
        conversiontable = patchConversionTable(conversiontable, conversiontablediff)
        # NOTE(review): the source's indentation was lost; saving is
        # assumed to happen only when a diff was applied -- confirm.
        saveConversionTable(variant, conversiontable)

    # Raw string: '\{', '\s', '\S' and '\}' are regex escapes, not Python
    # string escapes, and a non-raw literal triggers invalid-escape
    # warnings on newer interpreters.
    section_pattern = re.compile(r'-\{([\s\S]*?)\}-')
    for section in section_pattern.findall(conversiontable):
        for rule in section.splitlines():
            left, sep, right = rule.partition('=>')
            if sep != '=>':
                continue
            left = left.replace('*', '').strip()
            right = right.partition('//')[0].replace(';', '').strip()
            if left in to and left == right:
                # An identity rule cancels a previously known mapping.
                to.pop(left)
            else:
                to[left] = right
    return to
def toHansDict():
    """Return the mapping built for the ``Zh-hans`` table.

    The mapping starts from the Unihan ``kSimplifiedVariant`` field and
    is then updated with the rules of the ``Zh-hans`` conversion table.
    """
    return getDictFromConversionTable(
        getDictFromUnihan('kSimplifiedVariant'), 'Zh-hans')
def toHantDict():
    """Return the mapping built for the ``Zh-hant`` table.

    The mapping starts from the Unihan ``kTraditionalVariant`` field and
    is then updated with the rules of the ``Zh-hant`` conversion table.
    """
    return getDictFromConversionTable(
        getDictFromUnihan('kTraditionalVariant'), 'Zh-hant')
def toOtherDict(variant):
    """Return the mapping defined solely by one conversion table.

    Unlike ``toHansDict`` / ``toHantDict`` no Unihan data is used: the
    result starts from an empty dict.

    Args:
        variant: File name of the conversion table to load.
    """
    return getDictFromConversionTable({}, variant)
def getConversionCode(to):
    """Render a mapping as the body of a PHP array literal.

    Entries are emitted in ascending key order, one per line, in the
    form ``"key" => "value",`` followed by a newline.

    Args:
        to: Dict mapping source strings to target strings.

    Returns:
        The concatenated entry lines, or an empty string for an empty
        dict.
    """
    def php_escape(text):
        # Inside a PHP double-quoted string a backslash, a double quote
        # and a dollar sign are special; left unescaped they would end
        # the literal early or trigger variable interpolation. The
        # backslash must be handled first so later escapes stay intact.
        return (text.replace('\\', '\\\\')
                    .replace('"', '\\"')
                    .replace('$', '\\$'))

    # Join once instead of growing a string inside the loop.
    return ''.join(
        '"' + php_escape(left) + '" => "' + php_escape(to[left]) + '",\n'
        for left in sorted(to))
def saveFile(toHant, toHans, toTW, toHK, toCN, toSG):
    """Write all conversion tables to ``ZhConversion.php``.

    The file is created (or overwritten) under the relative path
    ``ZhConversion.php`` and encoded as UTF-8. It consists of a fixed PHP
    header comment followed by one ``$name = array( ... );`` statement per
    table, in the order ``$zh2Hant``, ``$zh2Hans``, ``$zh2TW``,
    ``$zh2HK``, ``$zh2CN``, ``$zh2SG``, separated by blank lines. Each
    array body is produced by ``getConversionCode``.

    Args:
        toHant: Mapping emitted as ``$zh2Hant``.
        toHans: Mapping emitted as ``$zh2Hans``.
        toTW: Mapping emitted as ``$zh2TW``.
        toHK: Mapping emitted as ``$zh2HK``.
        toCN: Mapping emitted as ``$zh2CN``.
        toSG: Mapping emitted as ``$zh2SG``.
    """
    header = (u'<?php\n/**\n * Simplified / Traditional Chinese conversion tables\n *'
              u'\n * Automatically generated using code and data in includes/zhtable/'
              u'\n * Do not modify directly!\n */\n\n')
    tables = (
        (u'$zh2Hant', toHant),
        (u'$zh2Hans', toHans),
        (u'$zh2TW', toTW),
        (u'$zh2HK', toHK),
        (u'$zh2CN', toCN),
        (u'$zh2SG', toSG),
    )
    # Build the whole document before touching the output file, so a
    # failure while rendering cannot leave a truncated ZhConversion.php.
    statements = [
        name + u' = array(\n' + getConversionCode(table) + u');'
        for name, table in tables
    ]
    CString = header + u'\n\n'.join(statements)
    # ``with`` closes the file even if the write itself fails.
    with codecs.open('ZhConversion.php', 'w', 'utf-8') as zhConversion:
        zhConversion.write(CString)
def saveConversionTable(variant, conversiontable):
    """Write a conversion table's text to ``<variant>_new``.

    Args:
        variant: Base path of the table; ``'_new'`` is appended to it to
            form the output file name.
        conversiontable: The text to write; it is encoded as UTF-8.
    """
    # ``with`` closes the file even if encoding or writing raises.
    with codecs.open(variant + '_new', 'w', 'utf-8') as conversiontablefile:
        conversiontablefile.write(conversiontable)
try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # interpreters where chr() already returns text


def ucs4chr(codepoint):
    """Return the text for a Unicode code point.

    When the interpreter's character function rejects the code point
    with ``ValueError`` (as it does for values above U+FFFF where only
    16-bit code units are supported), the character is returned as a
    surrogate pair instead.

    Args:
        codepoint: Integer code point in the range 0..0x10FFFF.

    Returns:
        A string of length one, or of length two when a surrogate pair
        had to be used.

    Raises:
        ValueError: If ``codepoint`` lies outside 0..0x10FFFF.
    """
    # Reject invalid values up front: the surrogate arithmetic below
    # would otherwise turn them into unrelated characters silently.
    if not 0 <= codepoint <= 0x10FFFF:
        raise ValueError("ucs4chr() arg not in range(0x110000)")
    try:
        return _unichr(codepoint)
    except ValueError:
        hi, lo = divmod(codepoint - 0x10000, 0x400)
        return _unichr(0xd800 + hi) + _unichr(0xdc00 + lo)
def ucs4ord(str):
    """Return the Unicode code point of a character.

    Args:
        str: Either a single character, or a two-character surrogate
            pair (high surrogate followed by low surrogate).

    Returns:
        The integer code point.

    Raises:
        TypeError: If ``str`` is neither one character nor a valid
            surrogate pair.
    """
    if len(str) == 1:
        return ord(str)
    if len(str) == 2:
        hi, lo = ord(str[0]) - 0xd800, ord(str[1]) - 0xdc00
        # Both halves must fall in their 10-bit surrogate ranges;
        # anything else is two unrelated characters, not one code point.
        if 0 <= hi < 0x400 and 0 <= lo < 0x400:
            # The low surrogate carries the bottom 10 bits and must be
            # added in; omitting it loses the low bits of the result.
            return hi * 0x400 + lo + 0x10000
    raise TypeError("ucs4ord() expected a valid ucs4 character")
# Driver: the printed numbers are progress markers for the three stages.

# Stage 1: build every conversion mapping.
print(1)
toHant = toHantDict()
toHans = toHansDict()
# The regional tables are loaded in the order TW, HK, CN, SG.
toTW, toHK, toCN, toSG = map(toOtherDict, ('Zh-tw', 'Zh-hk', 'Zh-cn', 'Zh-sg'))

# Stage 2: write the generated PHP file.
print(2)
saveFile(toHant=toHant, toHans=toHans, toTW=toTW, toHK=toHK, toCN=toCN, toSG=toSG)

# Stage 3: finished.
print(3)