博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
繁体简体转化_langconv.py
阅读量:5965 次
发布时间:2019-06-19

本文共 7883 字,大约阅读时间需要 26 分钟。

"""Convert Chinese text between Simplified and Traditional script.

The conversion tables ``zh2Hant`` and ``zh2Hans`` are loaded from the
``zh_wiki`` module and registered in ``MAPS`` under the names
``'zh-hant'`` and ``'zh-hans'``.  ``Converter(name).convert(text)`` runs the
text, character by character, through a set of small state machines that
try the candidate matches found in the selected table.
"""

from copy import deepcopy
import re

try:
    import psyco
    psyco.full()
except Exception:
    # psyco is an optional accelerator; run without it when unavailable.
    pass

try:
    from zh_wiki import zh2Hant, zh2Hans
except ImportError:
    from zhtools.zh_wiki import zh2Hant, zh2Hans

import sys
py3k = sys.version_info >= (3, 0, 0)

if py3k:
    UEMPTY = ''
else:
    # Python 2: the tables hold UTF-8 byte strings; decode keys and values
    # so that all matching is done on unicode text.
    _zh2Hant, _zh2Hans = {}, {}
    for old, new in ((zh2Hant, _zh2Hant), (zh2Hans, _zh2Hans)):
        for k, v in old.items():
            new[k.decode('utf8')] = v.decode('utf8')
    zh2Hant = _zh2Hant
    zh2Hans = _zh2Hans
    UEMPTY = ''.decode('utf8')

# states of a StatesMachine
(START, END, FAIL, WAIT_TAIL) = list(range(4))
# conditions computed in StatesMachine.feed from the looked-up Node's flags
(TAIL, ERROR, MATCHED_SWITCH, UNMATCHED_SWITCH, CONNECTOR) = list(range(5))

# registered conversion maps, keyed by name (filled in by registery())
MAPS = {}


class Node(object):
    """Result of looking a word up in a ConvertMap.

    Attributes set by the constructor:
        from_word: the text that was looked up.
        to_word: the replacement text; equals ``from_word`` when no
            replacement was given or the given replacement is empty.
        is_original: True when ``to_word`` was passed as None.
        is_tail, have_child: the flags passed in.
        data: the tuple ``(is_tail, have_child, word)``, where ``word`` is
            ``from_word`` when ``to_word`` is None, else the given
            ``to_word``.
    """

    def __init__(self, from_word, to_word=None, is_tail=True,
                 have_child=False):
        self.from_word = from_word
        if to_word is None:
            self.to_word = from_word
            self.data = (is_tail, have_child, from_word)
            self.is_original = True
        else:
            # An empty replacement falls back to the source text.
            self.to_word = to_word or from_word
            self.data = (is_tail, have_child, to_word)
            self.is_original = False
        self.is_tail = is_tail
        self.have_child = have_child

    def is_original_long_word(self):
        """Return True for an unconverted node longer than one character."""
        return self.is_original and len(self.from_word) > 1

    def is_follow(self, chars):
        """Return True when ``chars`` differs from ``from_word[:-1]``.

        NOTE(review): despite the name, the result is True when ``chars`` is
        NOT the word minus its last character; StatesMachine.feed treats a
        True result as a failure.
        """
        return chars != self.from_word[:-1]

    def __str__(self):
        # NOTE(review): the text of this format string was lost in the
        # published copy of the file (angle-bracketed content stripped); it
        # is reconstructed here to match the four arguments - confirm
        # against the upstream source.
        return '<Node, %s, %s, %s, %s>' % (
            repr(self.from_word), repr(self.to_word),
            self.is_tail, self.have_child)

    __repr__ = __str__


class ConvertMap(object):
    """Lookup table built from a ``{source: target}`` mapping.

    Besides every key of the mapping, the table also holds every proper
    prefix of every key, so a caller can tell whether a word could still be
    extended into a longer key.
    """

    def __init__(self, name, mapping=None):
        self.name = name
        self._map = {}
        # Defined up front so the attribute exists even without a mapping.
        self.max_key_length = 0
        if mapping:
            self.set_convert_map(mapping)

    def set_convert_map(self, mapping):
        """Rebuild the table from ``mapping``.

        Each stored value is ``(is_tail, have_child, to_word)``:
        ``is_tail`` is True when the key itself is in ``mapping``,
        ``have_child`` is True when the key is a proper prefix of some
        mapping key, and ``to_word`` is the mapped text (``UEMPTY`` for a
        key that is only a prefix).
        """
        have_child = {}
        max_key_length = 0
        # Sorted order matters: a key that is a prefix of another key sorts
        # first, so its ``False`` entry is written before the longer key
        # overwrites it with ``True``.
        for key in sorted(mapping.keys()):
            for i in range(1, len(key)):
                have_child[key[:i]] = True
            have_child[key] = False
            max_key_length = max(max_key_length, len(key))

        convert_map = {}
        for key, has_child in have_child.items():
            convert_map[key] = (key in mapping, has_child,
                                mapping.get(key, UEMPTY))
        self._map = convert_map
        self.max_key_length = max_key_length

    def __getitem__(self, k):
        """Return the Node for ``k``; unknown words give a default Node."""
        try:
            is_tail, have_child, to_word = self._map[k]
        except KeyError:
            return Node(k)
        return Node(k, to_word, is_tail, have_child)

    def __contains__(self, k):
        return k in self._map

    def __len__(self):
        return len(self._map)


class StatesMachineException(Exception):
    """Raised when a StatesMachine is fed while in the FAIL state."""


class StatesMachine(object):
    """One candidate conversion of the text currently being consumed.

    Attributes:
        state: one of START, END, FAIL, WAIT_TAIL.
        final: the converted text produced by this machine so far.
        len: number of times this machine appended to ``final``.
        pool: source text read but not yet converted.
    """

    def __init__(self):
        self.state = START
        self.final = UEMPTY
        self.len = 0
        self.pool = UEMPTY

    def clone(self, pool):
        """Return a deep copy in state WAIT_TAIL holding ``pool``."""
        new = deepcopy(self)
        new.state = WAIT_TAIL
        new.pool = pool
        return new

    def feed(self, char, map):
        """Consume ``char`` and update this machine's state.

        Args:
            char: the next piece of input text.
            map: object whose ``map[word]`` returns a Node (a ConvertMap).

        Returns:
            A new StatesMachine branch created by ``clone``, or None.

        Raises:
            StatesMachineException: if the machine is already in FAIL state
                and the lookup result is not a TAIL or ERROR condition.
        """
        node = map[self.pool + char]

        # Classify the lookup result from its two flags.
        if node.have_child:
            if node.is_tail:
                if node.is_original:
                    cond = UNMATCHED_SWITCH
                else:
                    cond = MATCHED_SWITCH
            else:
                cond = CONNECTOR
        else:
            if node.is_tail:
                cond = TAIL
            else:
                cond = ERROR

        new = None
        if cond == ERROR:
            self.state = FAIL
        elif cond == TAIL:
            if self.state == WAIT_TAIL and node.is_original_long_word():
                self.state = FAIL
            else:
                self.final += node.to_word
                self.len += 1
                self.pool = UEMPTY
                self.state = END
        elif self.state == START or self.state == WAIT_TAIL:
            if cond == MATCHED_SWITCH:
                # Emit the conversion now, and branch a clone that keeps
                # the word pooled in case a longer key follows.
                new = self.clone(node.from_word)
                self.final += node.to_word
                self.len += 1
                self.state = END
                self.pool = UEMPTY
            elif cond == UNMATCHED_SWITCH or cond == CONNECTOR:
                if self.state == START:
                    new = self.clone(node.from_word)
                    self.final += node.to_word
                    self.len += 1
                    self.state = END
                else:
                    if node.is_follow(self.pool):
                        self.state = FAIL
                    else:
                        self.pool = node.from_word
        elif self.state == END:
            # END is a new START
            self.state = START
            new = self.feed(char, map)
        elif self.state == FAIL:
            raise StatesMachineException('Translate States Machine '
                                         'have error with input data %s' % node)
        return new

    def __len__(self):
        # Used by Converter._clean to rank machines.
        return self.len + 1

    def __str__(self):
        # NOTE(review): the text of this format string was lost in the
        # published copy of the file (angle-bracketed content stripped); it
        # is reconstructed here to match the four arguments - confirm
        # against the upstream source.
        return '<StatesMachine %s, pool: "%s", state: %s, final: %s>' % (
            id(self), self.pool, self.state, self.final)

    __repr__ = __str__


class Converter(object):
    """Convert text using the map registered in ``MAPS`` as ``to_encoding``.

    Raises KeyError on construction when ``to_encoding`` is not registered.
    """

    def __init__(self, to_encoding):
        self.to_encoding = to_encoding
        self.map = MAPS[to_encoding]
        self.start()

    def feed(self, char):
        """Feed ``char`` to every live machine; return the text so far."""
        branches = []
        for fsm in self.machines:
            new = fsm.feed(char, self.map)
            if new:
                branches.append(new)
        if branches:
            self.machines.extend(branches)
        self.machines = [fsm for fsm in self.machines if fsm.state != FAIL]
        # Once every surviving machine has reached END, commit a result.
        if all(fsm.state == END for fsm in self.machines):
            self._clean()
        return self.get_result()

    def _clean(self):
        """Commit the output of the machine with the smallest ``len()``.

        Ties go to the earliest machine in the list.  The machine list is
        then reset to a single fresh machine.
        """
        if self.machines:
            self.final += min(self.machines, key=len).final
        self.machines = [StatesMachine()]

    def start(self):
        """Reset the converter to an empty result and one fresh machine."""
        self.machines = [StatesMachine()]
        self.final = UEMPTY

    def end(self):
        """Finish the input: keep FAIL/END machines, then commit."""
        self.machines = [fsm for fsm in self.machines
                         if fsm.state == FAIL or fsm.state == END]
        self._clean()

    def convert(self, string):
        """Return ``string`` converted with this converter's map."""
        self.start()
        for char in string:
            self.feed(char)
        self.end()
        return self.get_result()

    def get_result(self):
        """Return the converted text committed so far."""
        return self.final


def registery(name, mapping):
    """Register ``mapping`` in ``MAPS`` under ``name`` as a ConvertMap."""
    MAPS[name] = ConvertMap(name, mapping)


registery('zh-hant', zh2Hant)
registery('zh-hans', zh2Hans)
del zh2Hant, zh2Hans


def run():
    """Command-line entry point.

    Options: ``-e`` name of a registered map (required), ``-f`` input file
    and ``-t`` output file (``-`` or omitted means stdin / stdout).  Input
    is read as UTF-8 and the converted text is written as UTF-8.
    """
    import io
    import sys
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option('-e', type='string', dest='encoding',
                      help='encoding')
    parser.add_option('-f', type='string', dest='file_in',
                      help='input file (- for stdin)')
    parser.add_option('-t', type='string', dest='file_out',
                      help='output file')
    (options, args) = parser.parse_args()
    if not options.encoding:
        parser.error('encoding must be set')

    opened = []  # files this function opened and therefore must close
    try:
        if options.file_in and options.file_in != '-':
            file_in = io.open(options.file_in, encoding='utf8')
            opened.append(file_in)
        else:
            file_in = sys.stdin

        if options.file_out and options.file_out != '-':
            file_out = open(options.file_out, 'wb')
            opened.append(file_out)
        else:
            # Encoded bytes are written below; on Python 3 they must go to
            # the underlying binary stream of stdout.
            file_out = getattr(sys.stdout, 'buffer', sys.stdout)

        c = Converter(options.encoding)
        for line in file_in:
            # Python 2 stdin yields byte strings; everything else is text.
            if isinstance(line, bytes):
                line = line.decode('utf8')
            # NOTE(review): the trailing newline is stripped and not
            # written back, as in the original write() call (the print
            # statement it replaced would have added one) - confirm intent.
            file_out.write(c.convert(line.rstrip('\n')).encode('utf8'))
    finally:
        for handle in opened:
            handle.close()


if __name__ == '__main__':
    run()

转载于:https://www.cnblogs.com/hapyygril/p/9904066.html

你可能感兴趣的文章
JS中的!=、== 、!==、===的用法和区别。
查看>>
vs2017 增加平台集
查看>>
Kinect+OpenNI学习笔记之10(不需要骨骼跟踪的人体多个手部分割)
查看>>
spring mvc(4)处理模型数据
查看>>
JS 判断当前使用浏览器名及版本
查看>>
让所有浏览器支持HTML5 video视频标签
查看>>
Socket 详解
查看>>
[Android Pro] Java进阶学习:jar打包详解
查看>>
xampp-apache配置
查看>>
zabbix专题:第十二章 zabbix proxy分布式监控配置
查看>>
tar 命令的详解
查看>>
Android Studio第二十七期 - RecycleView不同item布局
查看>>
穷人的分布式网络
查看>>
FR-TO-FR本地交换
查看>>
Python内置容器(2)——字典,迭代器,列表解析
查看>>
那年匆匆 -大学
查看>>
Internet 打印提示“打印机安装失败、打印机名称无效”的解决
查看>>
从Powershell ***脚本学到的如何执行后台runspace~
查看>>
SCCM TP4部署Office2013
查看>>
Linux系统启动过程,grub重装。
查看>>