# Source code for chirptext.sino
# -*- coding: utf-8 -*-
"""
Tools for processing Chinese
"""
# This code is a part of chirptext library: https://github.com/letuananh/chirptext
# :copyright: (c) 2012 Le Tuan Anh <tuananh.ke@gmail.com>
# :license: MIT, see LICENSE for more details.
import os
import logging
from collections import defaultdict as dd
from . import chio
from .anhxa import to_obj, to_dict
# -------------------------------------------------------------------------------
# Configuration
# -------------------------------------------------------------------------------
def getLogger():
    """Return the logger registered under this module's ``__name__``."""
    return logging.getLogger(__name__)
# Directory that holds this module; realpath() resolves symlinks first.
MY_DIR = os.path.split(os.path.realpath(__file__))[0]
# Location of the Kangxi radical table, relative to this module's directory.
KANGXI_FILE = os.path.join(
    MY_DIR,
    'data',
    'sino',
    'kangxi.csv',
)
# -------------------------------------------------------------------------------
# Data Structures
# -------------------------------------------------------------------------------
# Field names of a Kangxi radical record; they match the keyword arguments
# accepted by Radical.__init__, in the same order.
KANGXI_FIELDS = [
    "idseq",
    "radical",
    "variants",
    "strokes",
    "meaning",
    "pinyin",
    "hanviet",
    "hiragana",
    "romaji",
    "hangeul",
    "romaja",
    "frequency",
    "simplified",
    "examples",
]
# -------------------------------------------------------------------------------
# Functions
# -------------------------------------------------------------------------------
class Radical(object):
    """ Chinese Radical

    Source: https://en.wikipedia.org/wiki/Kangxi_radical#Table_of_radicals

    Every constructor argument is stored unchanged on the attribute of the
    same name, and every argument defaults to the empty string.
    """

    # Lazily-built KangxiMap shared by all callers of kangxi().
    __KANGXI_MAP = None

    def __init__(self, idseq='', radical='', variants='', strokes='', meaning='', pinyin='', hanviet='', hiragana='', romaji='', hangeul='', romaja='', frequency='', simplified='', examples=''):
        self.idseq = idseq
        self.radical = radical
        self.variants = variants
        self.strokes = strokes
        self.meaning = meaning
        self.pinyin = pinyin
        self.hanviet = hanviet
        self.hiragana = hiragana
        self.romaji = romaji
        self.hangeul = hangeul
        self.romaja = romaja
        self.frequency = frequency
        self.simplified = simplified
        self.examples = examples

    def __repr__(self):
        """Return the same text as ``str(self)``."""
        return str(self)

    def __str__(self):
        """Return ``<radical>-<meaning>[sc:<strokes>]``."""
        return "{}-{}[sc:{}]".format(self.radical, self.meaning, self.strokes)

    def to_dict(self):
        """Return the result of the module-level ``to_dict`` helper applied to this object."""
        return to_dict(self)

    @staticmethod
    def kangxi():
        """Return the shared KangxiMap, building it on first use.

        The first call reads KANGXI_FILE with ``chio.read_csv`` and wraps the
        rows in a KangxiMap; later calls return that same object and log a
        debug message.
        """
        # Compare against None rather than truthiness: KangxiMap defines
        # __len__, so an empty map is falsy and would otherwise be rebuilt
        # on every call.
        if Radical.__KANGXI_MAP is None:
            kxs = chio.read_csv(KANGXI_FILE, fieldnames=True)
            Radical.__KANGXI_MAP = KangxiMap(kxs)
        else:
            getLogger().debug("Kangxi has been loaded once. Created KangxiMap will be re-used")
        return Radical.__KANGXI_MAP
class KangxiMap:
    """Lookup structure over a collection of radicals.

    Radicals are kept in insertion order and indexed by their character
    (including each whitespace-separated variant and simplified form), by
    ``str(idseq)``, and by stroke count.
    """

    def __init__(self, rads=None):
        """Create the map, optionally filling it from raw records.

        Args:
            rads: Optional iterable of records. Each one is converted with
                ``to_obj(Radical, rad)`` and its ``frequency``, ``idseq`` and
                ``strokes`` are coerced to ``int`` before it is added.
        """
        self.radicals = []
        self.rad_map = {}  # kangxi.radical -> kangxi object
        self.id_rad_map = {}  # idseq ('1', '2', i.e. string) -> rad object
        self.strokes_map = dd(list)  # map strokes => radicals
        if rads:
            for rad in rads:
                rad_obj = to_obj(Radical, rad)
                rad_obj.frequency = int(rad_obj.frequency)
                rad_obj.idseq = int(rad_obj.idseq)
                rad_obj.strokes = int(rad_obj.strokes)
                self.add(rad_obj)

    @property
    def all(self):
        """List of the ``radical`` attribute of every stored radical, in insertion order."""
        return [r.radical for r in self.radicals]

    @property
    def strokes(self):
        """Dict mapping each stroke count to the list of ``radical`` values with that count."""
        return {sc: [r.radical for r in rads] for sc, rads in self.strokes_map.items()}

    def __len__(self):
        """Number of radicals added (variant/simplified aliases are not counted)."""
        return len(self.radicals)

    def __getitem__(self, key):
        """Look up by radical character, then by idseq string, then by list position.

        Args:
            key: A radical, variant or simplified character; an idseq given as
                a string (e.g. ``'1'``); or an integer index / slice into the
                insertion-ordered list.
        """
        if isinstance(key, slice):
            # Slices are unhashable before Python 3.12, so testing them
            # against the dicts below would raise TypeError instead of
            # reaching the list lookup.
            return self.radicals[key]
        if key in self.rad_map:  # literal matching
            return self.rad_map[key]
        if key in self.id_rad_map:
            return self.id_rad_map[key]
        return self.radicals[key]  # by list index

    def __contains__(self, key):
        """True if ``key`` is a known character (any form) or a known idseq string."""
        return key in self.rad_map or key in self.id_rad_map

    def add(self, rad):
        """Append ``rad`` and register it in every index.

        Args:
            rad: Object exposing ``radical``, ``idseq``, ``strokes``,
                ``variants`` and ``simplified`` attributes.
        """
        self.radicals.append(rad)
        self.rad_map[rad.radical] = rad
        self.id_rad_map[str(rad.idseq)] = rad
        self.strokes_map[int(rad.strokes)].append(rad)
        # map variants & simplified
        if rad.variants:
            for v in rad.variants.split():
                self.rad_map[v] = rad
        if rad.simplified:
            for s in rad.simplified.split():
                self.rad_map[s] = rad