# Source code for chirptext.luke
# -*- coding: utf-8 -*-
"""
Language profile: UK English
"""
# This code is a part of chirptext library: https://github.com/letuananh/chirptext
# :copyright: (c) 2012 Le Tuan Anh <tuananh.ke@gmail.com>
# :license: MIT, see LICENSE for more details.
import os
import codecs
import logging
# -------------------------------------------------------------------------------
# Configuration
# -------------------------------------------------------------------------------
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Data files are looked up relative to the directory containing this module,
# under data/luke.
MY_DIR = os.path.dirname(__file__)
MY_DATA = os.path.join(MY_DIR, 'data', 'luke')
# Swadesh word-list files.  Every path component is passed to os.path.join
# separately so the platform's own separator is used instead of a
# hard-coded '/'.
SWADESH_1971_PATH = os.path.join(MY_DATA, 'swadesh', '1971.txt')
SWADESH_RANKED_PATH = os.path.join(MY_DATA, 'swadesh', 'ranked.txt')
SWADESH_SIGN_PATH = os.path.join(MY_DATA, 'swadesh', 'sign.txt')
# -------------------------------------------------------------------------------
# Data structures
# -------------------------------------------------------------------------------
class Word(object):
    """Swadesh word.

    :param ID: identifier of the word; stored as given, without conversion
    :param word: the word itself
    :param score: score attached to the word (defaults to ``0``)
    :param description: free-text description (defaults to ``''``)
    :param rank: rank of the word (defaults to ``0``)
    """

    def __init__(self, ID, word, score=0, description='', rank=0):
        self.ID = ID
        self.word = word
        self.score = score
        self.description = description
        self.rank = rank

    def __repr__(self):
        # Only ID and word are shown; score, description and rank are omitted.
        return "Word(ID={!r}, word={!r})".format(self.ID, self.word)
def read_swadesh_1971():
    """Read the word list stored at ``SWADESH_1971_PATH``.

    Every data line consists of a word, optionally followed by whitespace
    and a description (the rest of the line).  Lines that start with ``#``,
    empty lines and whitespace-only lines are skipped.

    :returns: a list of :class:`Word` objects; ``ID`` is the 1-based
        position of the entry among the data lines and ``description`` is
        ``''`` when the line holds only a word.
    """
    with open(SWADESH_1971_PATH, encoding='utf-8') as infile:
        lines = infile.read().splitlines()
    words = []
    for line in lines:
        if line.startswith("#"):
            continue
        # Split off the first token only; whatever follows is the description.
        row = line.split(maxsplit=1)
        if not row:
            # Empty or whitespace-only line: there is no word to index, and
            # row[0] would raise IndexError.
            continue
        desc = row[1] if len(row) == 2 else ''
        words.append(Word(ID=len(words) + 1, word=row[0], description=desc))
    return words
def read_swadesh_ranked():
    """Read the ranked word list stored at ``SWADESH_RANKED_PATH``.

    Every data line must hold exactly four whitespace-separated columns:
    an ID, a second column that is read but not stored, the lemma and a
    score.  Lines that start with ``#``, empty lines and whitespace-only
    lines are skipped.

    :returns: a list of :class:`Word` objects in file order.  ``ID`` and
        ``score`` are kept as the strings read from the file (no numeric
        conversion); ``rank`` is the 1-based position of the entry among
        the data lines.
    :raises ValueError: if a data line does not have exactly four columns.
    """
    with open(SWADESH_RANKED_PATH, encoding='utf-8') as infile:
        lines = infile.read().splitlines()
    words = []
    for lineno, line in enumerate(lines, start=1):
        if line.startswith("#"):
            continue
        row = line.split()
        if not row:
            # Whitespace-only line: nothing to unpack.
            continue
        if len(row) != 4:
            # Same exception type as a failed unpacking, but pointing at the
            # offending line of the data file.
            raise ValueError("{}:{}: expected 4 columns, found {}".format(
                SWADESH_RANKED_PATH, lineno, len(row)))
        swid, _top40, lemma, score = row
        words.append(Word(ID=swid, word=lemma, score=score, rank=len(words) + 1))
    return words
def read_swadesh_sign():
    """Read the word list stored at ``SWADESH_SIGN_PATH``.

    Every data line is taken as one word, with surrounding whitespace
    removed.  Lines that start with ``#``, empty lines and whitespace-only
    lines are skipped.

    :returns: a list of :class:`Word` objects; ``ID`` is the 1-based
        position of the entry among the data lines.
    """
    with open(SWADESH_SIGN_PATH, encoding='utf-8') as infile:
        lines = infile.read().splitlines()
    words = []
    for line in lines:
        if line.startswith("#"):
            continue
        text = line.strip()
        if not text:
            # A whitespace-only line would otherwise become an empty word
            # and shift the IDs of every following entry.
            continue
        words.append(Word(ID=len(words) + 1, word=text))
    return words