# Source code for chirptext.luke

# -*- coding: utf-8 -*-

"""
Language profile: UK English
"""

# This code is a part of chirptext library: https://github.com/letuananh/chirptext
# :copyright: (c) 2012 Le Tuan Anh <tuananh.ke@gmail.com>
# :license: MIT, see LICENSE for more details.

import os
import codecs
import logging

# -------------------------------------------------------------------------------
# Configuration
# -------------------------------------------------------------------------------

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Bundled data lives in ``data/luke`` next to this source file.
MY_DIR = os.path.dirname(__file__)
MY_DATA = os.path.join(MY_DIR, 'data', 'luke')

# Swadesh word lists. Every path component is passed to os.path.join
# separately so the platform's own separator is used throughout, instead of
# a hard-coded '/' inside a component.
SWADESH_1971_PATH = os.path.join(MY_DATA, 'swadesh', '1971.txt')
SWADESH_RANKED_PATH = os.path.join(MY_DATA, 'swadesh', 'ranked.txt')
SWADESH_SIGN_PATH = os.path.join(MY_DATA, 'swadesh', 'sign.txt')


# -------------------------------------------------------------------------------
# Data structures
# -------------------------------------------------------------------------------

class Word(object):
    """ Swadesh word

    Plain record that stores the constructor arguments as attributes.

    :param ID: identifier of the entry
    :param word: the word itself
    :param score: score attached to the entry (default ``0``)
    :param description: free-text description (default ``''``)
    :param rank: rank of the entry (default ``0``)
    """

    def __init__(self, ID, word, score=0, description='', rank=0):
        self.ID = ID
        self.word = word
        self.score = score
        self.description = description
        self.rank = rank

    def __repr__(self):
        # Only ID and word are shown; ``!r`` applies repr() to each value.
        return "Word(ID={!r}, word={!r})".format(self.ID, self.word)


def read_swadesh_1971():
    """Read the Swadesh list stored at ``SWADESH_1971_PATH``.

    Each data line holds a word, optionally followed by whitespace and a
    description. Blank or whitespace-only lines and lines whose first
    non-blank character is ``#`` are skipped.

    :returns: list of :class:`Word` in file order; ``ID`` counts data lines
        starting from 1, ``description`` is ``''`` when the line has none
    """
    with open(SWADESH_1971_PATH, 'r', encoding='utf-8') as infile:
        lines = infile.read().splitlines()

    words = []
    for line in lines:
        # Split off the first token; the remainder (if any) is the description.
        parts = line.split(maxsplit=1)
        # ``parts`` is empty for blank/whitespace-only lines, which would
        # otherwise fail on parts[0]; comment lines may also be indented.
        if not parts or parts[0].startswith("#"):
            continue
        desc = parts[1] if len(parts) == 2 else ''
        words.append(Word(ID=len(words) + 1, word=parts[0], description=desc))
    return words


def read_swadesh_ranked():
    """Read the ranked Swadesh list stored at ``SWADESH_RANKED_PATH``.

    Each data line must hold exactly four whitespace-separated columns:
    Swadesh ID, a second column that is read but not used, the lemma and
    the score. Blank or whitespace-only lines and lines whose first
    non-blank character is ``#`` are skipped.

    :returns: list of :class:`Word` in file order; ``ID`` and ``score`` are
        kept as the strings read from the file, ``rank`` counts data lines
        starting from 1
    :raises ValueError: if a data line does not have exactly four columns
    """
    with open(SWADESH_RANKED_PATH, 'r', encoding='utf-8') as infile:
        lines = infile.read().splitlines()

    words = []
    for line in lines:
        columns = line.split()
        # ``columns`` is empty for blank/whitespace-only lines, which would
        # otherwise break the four-way unpacking; comments may be indented.
        if not columns or columns[0].startswith("#"):
            continue
        swid, _top40, lemma, score = columns
        words.append(Word(ID=swid, word=lemma, score=score, rank=len(words) + 1))
    return words


def read_swadesh_sign():
    """Read the Swadesh list stored at ``SWADESH_SIGN_PATH``.

    Each data line is taken as one word, with surrounding whitespace
    removed. Blank or whitespace-only lines and lines whose first non-blank
    character is ``#`` are skipped.

    :returns: list of :class:`Word` in file order; ``ID`` counts data lines
        starting from 1
    """
    with open(SWADESH_SIGN_PATH, 'r', encoding='utf-8') as infile:
        lines = infile.read().splitlines()

    words = []
    for line in lines:
        text = line.strip()
        # Strip before filtering so whitespace-only lines do not become
        # empty words and indented comments are not read as entries.
        if not text or text.startswith("#"):
            continue
        words.append(Word(ID=len(words) + 1, word=text))
    return words
# Source code for chirptext.luke
#
# chirptext
#
# Navigation
#
# Related Topics