from ..session.org_mode import (
    global_session as session,
)

# Each example maps raw text to its expected token list; an optional
# "meaning" entry records the semantic interpretation of a token.
BASIC_TOKENIZATION_EXAMPLES = (
    {
        "text": 'cat',
        "tokens": ['cat'],
    },
    {
        "text": 'cats',
        "tokens": ['cats'],
        "meaning": {
            'cats': ('add-modifier', 'cat', 'plural'),
        },
    },
    {
        "text": 'text separated by spaces',
        "tokens": ['text', 'separated', 'by', 'spaces'],
    },
    {
        "text": 'is earth a planet?',
        "tokens": ['is', 'earth', 'a', 'planet', '?'],
    },
)


def train_basic_tokenization(knowledge_base):
    with session().log('Training basic tokenization'):
        for example in BASIC_TOKENIZATION_EXAMPLES:
            knowledge_base.layers.tokenization.train(example)
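

# Hypothetical usage sketch (not part of the original module): the real
# knowledge base is constructed elsewhere in the package, so the stub below
# only mirrors the `layers.tokenization.train` attribute path that
# `train_basic_tokenization` relies on. Running it also assumes the module is
# executed inside the package (e.g. `python -m ...`) so the relative
# `session` import and its `log` context manager resolve.
if __name__ == '__main__':
    from types import SimpleNamespace

    class _EchoTokenizationLayer:
        def train(self, example):
            # A real layer would learn from the example; this stub only echoes it.
            print('training on:', example['text'], '->', example['tokens'])

    stub_knowledge_base = SimpleNamespace(
        layers=SimpleNamespace(tokenization=_EchoTokenizationLayer()),
    )
    train_basic_tokenization(stub_knowledge_base)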