# lang-model/naive-nlu/tree_nlu/tests/tokenization.py

from ..session.org_mode import global_session as session
from ..knowledge_base import KnowledgeBase
from ..utils.visuals import show_progbar
from ..visualization import show_knowledge


def _assert(args):
    assert(args)


def _assert_msg(args, msg):
    assert args, msg


# Ordered cases consumed by main(). Each entry is a (case_type, payload) pair:
#   - 'example' payloads are handed to the tokenization layer's train();
#   - 'test' payloads have their "text" tokenized and the result compared
#     against "tokens".
EXAMPLES = [
    ("example", {
        "text": "cat",
        "tokens": ["cat"],
    }),
    ("example", {
        "text": "cats",
        "tokens": ["cats"],
        "meaning": {"cats": ("add-modifier", "cat", "plural")},
    }),
    ("example", {
        "text": "text separated by spaces",
        "tokens": ["text", "separated", "by", "spaces"],
    }),
    ("example", {
        "text": "is earth a planet?",
        "tokens": ["is", "earth", "a", "planet", "?"],
    }),
    ("test", {
        "text": "plane",
        "tokens": ["plane"],
    }),
    # Disabled case, kept for reference:
    # ("test", {
    #     "text": "planes",
    #     "tokens": ["planes"],
    #     "meaning": {"planes": ("add-modifier", "plane", "plural")},
    # }),
    ("test", {
        "text": "some other text",
        "tokens": ["some", "other", "text"],
    }),
    ("test", {
        "text": "is the sun a star?",
        "tokens": ["is", "the", "sun", "a", "star", "?"],
    }),
    ("test", {
        "text": "sometextnotseparatedbyspaces",
        "tokens": ["some", "text", "not", "separated", "by", "spaces"],
    }),
]


def main():
    """Train and check the tokenization layer against ``EXAMPLES``.

    Walks ``EXAMPLES`` in order. Every ``'example'`` entry is passed to
    ``knowledge.layers.tokenization.train``; every ``'test'`` entry has its
    text tokenized and the result compared with the expected token list.

    Returns:
        The ``KnowledgeBase`` instance built while processing the cases.

    Raises:
        AssertionError: If a ``'test'`` entry tokenizes to something other
            than its expected tokens; the message shows both lists.
        Exception: If an entry has a case type other than ``'example'`` or
            ``'test'``.
    """
    knowledge = KnowledgeBase()

    total = len(EXAMPLES)

    for i, (case_type, example) in enumerate(EXAMPLES):
        show_progbar(i, total, example['text'])
        if case_type == 'example':
            with session().log(example['text']):
                knowledge.layers.tokenization.train(example)

        elif case_type == 'test':
            with session().log(example['text']):
                tokens = list(knowledge.layers.tokenization.tokenize(example['text']))

                outcome = 'Expected “{}”, found “{}”'.format(
                    example['tokens'], tokens)
                session().log(outcome)
                # Attach the comparison to the failure itself so a mismatch
                # is diagnosable from the traceback alone.
                assert example['tokens'] == tokens, outcome

        else:
            raise Exception('Not implemented case {}'.format(case_type))

    # Carriage return + ANSI "erase to end of line"; presumably wipes the
    # progress bar left by show_progbar — confirm against its output.
    print("\r\x1b[K", end='')
    return knowledge