# Source code for chirptext.deko
# -*- coding: utf-8 -*-
"""
Convenient Japanese text parser that produces results in TTL format
"""
# Reference
# - MeCab homepage: http://taku910.github.io/mecab/
#
# MeCab, デコ, got the joke?
# This script was adapted from https://github.com/letuananh/omwtk
#
# This code is a part of chirptext library: https://github.com/letuananh/chirptext
# :copyright: (c) 2012 Le Tuan Anh <tuananh.ke@gmail.com>
# :license: MIT, see LICENSE for more details.
from . import mecab
from . import dekoigo
from . import janome as deko_janome
from . import util
from .util import kata2hira, is_kana, HIRAGANA, KATAKANA
# Allow mecab config: re-export the mecab module's private location helpers
# under public names so callers can set/query the location from this module.
# NOTE(review): presumably the "location" is the path of the MeCab
# executable -- confirm against the mecab module.
set_mecab_bin = mecab._register_mecab_loc
get_mecab_bin = mecab._get_mecab_loc
def engines():
    """Return the parser backends that are currently usable.

    The result is a list of ``(name, module)`` tuples, probed in the fixed
    order mecab, igo, janome. A backend is listed only when its availability
    check succeeds.
    """
    found = []
    try:
        mecab_usable = bool(mecab.version())
    except Exception:
        # Best effort: any failure while probing mecab just means it is
        # left out of the result.
        mecab_usable = False
    if mecab_usable:
        found.append(("mecab", mecab))
    optional_backends = (
        ("igo", dekoigo, dekoigo.igo_available),
        ("janome", deko_janome, deko_janome.janome_available),
    )
    for label, backend, is_usable in optional_backends:
        if is_usable():
            found.append((label, backend))
    return found
def _locate_engine(*args, **kwargs):
    """Return the module of the first backend reported by ``engines()``.

    Positional and keyword arguments are accepted but not used.

    Raises:
        RuntimeError: if ``engines()`` returns no backend at all.
    """
    candidates = engines()
    if candidates:
        # Entries are (name, module) tuples; only the module is needed.
        _, backend = candidates[0]
        return backend
    raise RuntimeError("There is no Japanese parser available")
def parse(*args, **kwargs):
    """Forward all arguments to ``parse`` of the first available engine."""
    engine = _locate_engine()
    return engine.parse(*args, **kwargs)
def parse_doc(*args, **kwargs):
    """Forward all arguments to ``parse_doc`` of the first available engine."""
    engine = _locate_engine()
    return engine.parse_doc(*args, **kwargs)
def tokenize(*args, **kwargs):
    """Forward all arguments to ``tokenize`` of the first available engine."""
    engine = _locate_engine()
    return engine.tokenize(*args, **kwargs)
def tokenize_sent(*args, **kwargs):
    """Forward all arguments to ``tokenize_sent`` of the first available engine."""
    engine = _locate_engine()
    return engine.tokenize_sent(*args, **kwargs)
def analyse(content, splitlines=True, format=None, **kwargs):
    """Parse Japanese text and optionally render the result as a string.

    Args:
        content: Text handed to ``parse_doc``.
        splitlines: Forwarded to ``parse_doc`` as ``splitlines``.
        format: ``'html'``, ``'csv'`` or ``'txt'`` to get a rendered string.
            Any other value (including the default ``None``) returns the
            parsed document itself.
        **kwargs: Extra keyword arguments forwarded to ``parse_doc``.

    Returns:
        The object returned by ``parse_doc`` when no known format is
        requested, otherwise a ``str``:

        * ``'html'``: ``util.sent_to_ruby`` output for each sentence, joined
          with ``'<br/>\\n'``.
        * ``'csv'``: ``util.to_csv`` output for each sentence, each followed
          by a ``'\\n'`` item, all joined with ``'\\n'``.
        * ``'txt'``: one line per sentence with token texts separated by a
          single space.
    """
    doc = parse_doc(content, splitlines=splitlines, **kwargs)
    if format == 'html':
        return '<br/>\n'.join(util.sent_to_ruby(sent) for sent in doc)
    if format == 'csv':
        pieces = []
        for sent in doc:
            pieces.append(util.to_csv(sent))
            # NOTE(review): indentation was lost in the extracted source;
            # this separator is assumed to be appended once per sentence --
            # confirm against upstream.
            pieces.append('\n')
        return '\n'.join(pieces)
    if format == 'txt':
        return '\n'.join(' '.join(tk.text for tk in sent) for sent in doc)
    # No (or unrecognised) format: hand back the parsed document as-is.
    return doc
# Names exported by ``from <this module> import *``.
__all__ = ['set_mecab_bin', 'get_mecab_bin', 'engines',
           'parse', 'parse_doc', 'tokenize', 'tokenize_sent', 'analyse']