lang-model/naive-nlu/tree_nlu/utils/tokenization.py

30 lines
730 B
Python

from ..session.org_mode import (
global_session as session,
)
# Seed examples for the tokenizer. Each entry is a dict with the raw "text"
# and the expected "tokens"; an entry may also carry a "meaning" mapping for
# individual tokens.
BASIC_TOKENIZATION_EXAMPLES = (
    # A single word is a single token.
    {
        "text": 'cat',
        "tokens": ['cat'],
    },
    # A single-word example that additionally annotates the token with a
    # "meaning" tuple (presumably marking 'cats' as the plural of 'cat';
    # confirm against the tokenizer's handling of "meaning").
    {
        "text": 'cats',
        "tokens": ['cats'],
        "meaning": {'cats': ('add-modifier', 'cat', 'plural')},
    },
    # Words separated by spaces become one token each.
    {
        "text": 'text separated by spaces',
        "tokens": ['text', 'separated', 'by', 'spaces'],
    },
    # The trailing question mark is expected as its own token.
    {
        "text": 'is earth a planet?',
        "tokens": ['is', 'earth', 'a', 'planet', '?'],
    },
)
def train_basic_tokenization(knowledge_base):
    """Train the knowledge base's tokenizer on the basic example set.

    Every entry of ``BASIC_TOKENIZATION_EXAMPLES`` is passed, in order, to
    ``knowledge_base.train_tokenizer``. The whole run happens inside the
    context manager returned by ``session().log(...)``.

    Args:
        knowledge_base: Object exposing a ``train_tokenizer(example)``
            method that accepts one example dict.
    """
    log_scope = session().log('Training basic tokenization')
    with log_scope:
        for sample in BASIC_TOKENIZATION_EXAMPLES:
            knowledge_base.train_tokenizer(sample)