# lang-model/naive-nlu/tree_nlu/utils/tokenization.py

from ..session.org_mode import (
    global_session as session,
)

# Seed examples handed one by one to the knowledge base's tokenizer trainer.
# Every entry carries the raw "text" and the expected "tokens" list; an
# optional "meaning" maps a token to a tuple describing it.
BASIC_TOKENIZATION_EXAMPLES = (
    # A single word is a single token.
    {
        "text": "cat",
        "tokens": ["cat"],
    },
    # Same shape as above, plus a "meaning" entry for the token.
    {
        "text": "cats",
        "tokens": ["cats"],
        "meaning": {"cats": ("add-modifier", "cat", "plural")},
    },
    # Words separated by spaces become one token each.
    {
        "text": "text separated by spaces",
        "tokens": ["text", "separated", "by", "spaces"],
    },
    # The trailing question mark is listed as a token of its own.
    {
        "text": "is earth a planet?",
        "tokens": ["is", "earth", "a", "planet", "?"],
    },
)


def train_basic_tokenization(knowledge_base):
    """Train the knowledge base's tokenizer on the built-in basic examples.

    Each entry of ``BASIC_TOKENIZATION_EXAMPLES`` is passed, in order, to
    ``knowledge_base.train_tokenizer``. The whole run happens inside the
    context returned by the global session's ``log`` call.

    Args:
        knowledge_base: Object exposing a ``train_tokenizer(example)`` method.
    """
    log_scope = session().log('Training basic tokenization')
    with log_scope:
        for sample in BASIC_TOKENIZATION_EXAMPLES:
            knowledge_base.train_tokenizer(sample)